# Modify the crawl_web procedure to take a second parameter, max_pages,
# that limits the number of pages to crawl.  The crawl terminates after
# max_pages different pages have been crawled, or when there are no more
# pages to crawl.
# get_page provides a canned interface to the test website found at
# http://www.udacity.com/cs101x/index.html


def get_page(url):
    """Return the HTML content for *url* on the stubbed cs101x test site.

    Unknown URLs map to "" so callers can always treat the result as
    searchable text (this replaces the original bare try/except chain,
    which silently swallowed every error).
    """
    pages = {
        "http://www.udacity.com/cs101x/index.html": (
            '<html> <body> This is a test page for learning to crawl! '
            '<p> It is a good idea to '
            '<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
            'crawl</a> before you try to '
            '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
            'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
            '</p> </body> </html> '),
        "http://www.udacity.com/cs101x/crawling.html": (
            '<html> <body> I have not learned to crawl yet, but I '
            'am quite good at '
            '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
            '</body> </html>'),
        "http://www.udacity.com/cs101x/walking.html": (
            '<html> <body> I cant get enough '
            '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
            '</body> </html>'),
        "http://www.udacity.com/cs101x/flying.html": (
            '<html> <body> The magic words are Squeamish Ossifrage! '
            '</body> </html>'),
    }
    return pages.get(url, "")


def get_next_target(page):
    """Return (url, end_position) for the first '<a href=' link in *page*.

    Returns (None, 0) when the page contains no further links; end_position
    is the index of the closing quote, so slicing page[end_position:] safely
    advances past the link just found.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def union(p, q):
    """Append to list *p*, in place, every element of *q* not already in it."""
    for e in q:
        if e not in p:
            p.append(e)


def get_all_links(page):
    """Return every link URL appearing in *page*, in document order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:
            break
        links.append(url)
        page = page[endpos:]
    return links


def crawl_web(seed, max_pages):
    """Crawl the site starting from *seed*, visiting at most *max_pages* pages.

    Returns the list of crawled URLs in crawl order.  The crawl stops as
    soon as max_pages distinct pages have been crawled, or when there is
    nothing left to crawl.

    Fix: the original version crawled the ENTIRE reachable site and only
    sliced the result to max_pages afterwards, so the limit never limited
    the crawling work; the length check in the loop condition now
    terminates the crawl itself early, as the assignment requires.
    """
    tocrawl = [seed]
    crawled = []
    while tocrawl and len(crawled) < max_pages:
        # pop() takes the most recently discovered URL (depth-first order).
        last_url = tocrawl.pop()
        if last_url not in crawled:
            # Merge any not-yet-queued links from this page into tocrawl,
            # then record the page as crawled.
            union(tocrawl, get_all_links(get_page(last_url)))
            crawled.append(last_url)
    return crawled


print(crawl_web("http://www.udacity.com/cs101x/index.html", 1))
#>>> ['http://www.udacity.com/cs101x/index.html']
#print(crawl_web("http://www.udacity.com/cs101x/index.html", 3))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>>  'http://www.udacity.com/cs101x/flying.html',
#>>>  'http://www.udacity.com/cs101x/walking.html']
#print(crawl_web("http://www.udacity.com/cs101x/index.html", 500))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>>  'http://www.udacity.com/cs101x/flying.html',
#>>>  'http://www.udacity.com/cs101x/walking.html',
#>>>  'http://www.udacity.com/cs101x/crawling.html',
#>>>  'http://www.udacity.com/cs101x/kicking.html']
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question