Python Web Scraping - Exercise Solutions

import requests
import bs4
from urllib.parse import urljoin

# Walk every page of the quotes site, following the "next" pagination link,
# and collect the distinct author names found on each page.
URL = "http://quotes.toscrape.com/"
authors = set()

# URL is set to "" once a page has no "next" link, which ends the loop.
while URL:
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(URL, timeout=10)
    # Fail loudly on a 4xx/5xx answer instead of silently parsing an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")

    # Note that ".author" will work here as well. The longer selector is just
    # to demo the capabilities of bs4.select().
    # The set de-duplicates authors that appear on several pages;
    # `authors |= {...}` would be an equivalent spelling.
    authors.update(tag.text for tag in soup.select(".quote span small.author"))

    # # instead of using a generator expression
    # page_author_tags = soup.select(".quote span small.author")
    # for page_author_tag in page_author_tags:
    #     # or .contents will return it in a list and you can add it
    #     # with set.update(list)
    #     authors.add(page_author_tag.text)

    # Paging. bs4.select() returns a list; if the list is not empty, take the
    # first element -- from the web page format we know it will be the only one.
    next_links = soup.select("li.next a")
    if next_links:
        # urljoin resolves the link's href against the current page URL.
        URL = urljoin(URL, next_links[0]["href"])
    else:
        URL = ""

    # # paging. just an alternative using bs4.find()
    # next_item = soup.find("li", {"class": "next"})
    # if next_item:
    #     URL = urljoin(URL, next_item.find("a")["href"])
    # else:
    #     URL = ""

print(authors)

Browser Version Not Supported

Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+, Firefox 4+, Safari 5+, IE 10+

Let me try anyway!

Python Fiddle

Python Cloud IDE