# The web crawler we built at the end of Unit 3 has some serious
# flaws if we were going to use it in a real crawler. One problem
# is that if we start with a good seed page, it might run for an
# extremely long time (even forever, since the number of URLs on
# the web is not actually finite). This question and the following
# one explore two different ways to limit the pages that it can crawl.

# Modify the crawl_web procedure to take a second parameter,
# max_pages, that limits the number of pages to crawl. Your
# procedure should terminate the crawl after max_pages different
# pages have been crawled, or when there are no more pages to crawl.

# The following definition of get_page provides an interface to the
# website found at http://www.udacity.com/cs101x/index.html
# The function output order does not affect grading.

def get_page(url):
    # Stub that simulates fetching the four pages of the test site.
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return ('<html> <body> This is a test page for learning to crawl! '
                    '<p> It is a good idea to '
                    '<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
                    'crawl</a> before you try to '
                    '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
                    'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
                    '</p> </body> </html> ')
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return ('<html> <body> I have not learned to crawl yet, but I '
                    'am quite good at '
                    '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return ('<html> <body> I cant get enough '
                    '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return ('<html> <body> The magic words are Squeamish Ossifrage! '
                    '</body> </html>')
    except:
        return ""
    return ""

def get_next_target(page):
    # Find the next '<a href=' tag and return (url, position after it),
    # or (None, 0) if the page contains no more links.
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def union(p, q):
    # Append to p every element of q that is not already in p.
    for e in q:
        if e not in p:
            p.append(e)

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def crawl_web(seed, max_pages):
    # Stop after max_pages different pages have been crawled,
    # or when there is nothing left to crawl.
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled and len(crawled) < max_pages:
            union(tocrawl, get_all_links(get_page(page)))
            crawled.append(page)
    return crawled

# Alternative version for the following question: limit the crawl by
# depth from the seed instead of by page count. (Renamed so it does
# not shadow the max_pages version tested below.)
def crawl_web_max_depth(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled

#print(crawl_web("http://www.udacity.com/cs101x/index.html", 1))
#>>> ['http://www.udacity.com/cs101x/index.html']

print(crawl_web("http://www.udacity.com/cs101x/index.html", 3))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>>  'http://www.udacity.com/cs101x/flying.html',
#>>>  'http://www.udacity.com/cs101x/walking.html']

#print(crawl_web("http://www.udacity.com/cs101x/index.html", 500))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>>  'http://www.udacity.com/cs101x/flying.html',
#>>>  'http://www.udacity.com/cs101x/walking.html',
#>>>  'http://www.udacity.com/cs101x/crawling.html',
#>>>  'http://www.udacity.com/cs101x/kicking.html']
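
# Not part of the original exercise: a minimal sketch of the same
# max_pages idea using a set and a deque, assuming the get_page and
# get_all_links helpers above are in scope. The name crawl_web_fast
# and the breadth-first ordering are my own choices; membership tests
# on a set are O(1), versus the O(n) list scans done by union above.
from collections import deque

def crawl_web_fast(seed, max_pages):
    tocrawl = deque([seed])   # FIFO queue: breadth-first order
    seen = {seed}             # every URL ever queued
    crawled = []              # pages actually fetched, in crawl order
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.popleft()
        crawled.append(page)
        for link in get_all_links(get_page(page)):
            if link not in seen:
                seen.add(link)
                tocrawl.append(link)
    return crawled

# Because this variant crawls breadth-first, pages come back in a
# different order than crawl_web's stack-based pop(); the problem
# statement says output order does not affect grading.
#print(crawl_web_fast("http://www.udacity.com/cs101x/index.html", 3))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>>  'http://www.udacity.com/cs101x/crawling.html',
#>>>  'http://www.udacity.com/cs101x/walking.html']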