"""Collect the unique author names listed on quotes.toscrape.com.

Starting from the front page, the script downloads each page, gathers the text
of every author tag on it, follows the "next" pagination link until a page has
none, and finally prints the resulting set of names.
"""
from urllib.parse import urljoin

import bs4
import requests

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

# URL of the page to fetch next; set to "" when there is no further page.
URL = "http://quotes.toscrape.com/"
# Author names seen so far, across all pages (a set, so duplicates collapse).
authors = set()

while URL:
    response = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # ending the crawl with an incomplete result.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "lxml")

    # NOTE: the shorter selector ".author" would work here as well; the longer
    # one is kept to demonstrate what bs4's select() can express.
    # `authors |= {...}` is an equivalent way to merge the page's names.
    authors.update(
        author_tag.text
        for author_tag in soup.select(".quote span small.author")
    )

    # Paging: select() returns a list. If it is non-empty, take the first
    # element; per the page layout it is expected to be the only one.
    # NOTE: soup.find("li", {"class": "next"}) followed by .find("a") is an
    # alternative way to locate the same link.
    next_links = soup.select("li.next a")
    if next_links:
        # The href is relative, so resolve it against the current page URL.
        URL = urljoin(URL, next_links[0]["href"])
    else:
        URL = ""

print(authors)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question