# # This question explores a different way (from the previous question) # to limit the pages that it can crawl. # ####### # THREE GOLD STARS # # Yes, we really mean it! This is really tough (but doable) unless # you have some previous experience before this course. # Modify the crawl_web procedure to take a second parameter, # max_depth, that limits the depth of the search. We can # define the depth of a page as the number of links that must # be followed to reach that page starting from the seed page, # that is, the length of the shortest path from the seed to # the page. No pages whose depth exceeds max_depth should be # included in the crawl. # # For example, if max_depth is 0, the only page that should # be crawled is the seed page. If max_depth is 1, the pages # that should be crawled are the seed page and every page that # it links to directly. If max_depth is 2, the crawl should # also include all pages that are linked to by these pages. # # Note that the pages in the crawl may be in any order. # # The following definition of get_page provides an interface # to the website found at http://www.udacity.com/cs101x/index.html # The function output order does not affect grading. def get_page(url): try: if url == "http://www.udacity.com/cs101x/index.html": return ('<html> <body> This is a test page for learning to crawl! ' '<p> It is a good idea to ' '<a href="http://www.udacity.com/cs101x/crawling.html">learn to ' 'crawl</a> before you try to ' '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> ' 'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. ' '</p> </body> </html> ') elif url == "http://www.udacity.com/cs101x/crawling.html": return ('<html> <body> I have not learned to crawl yet, but I ' 'am quite good at ' '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.' '</body> </html>') elif url == "http://www.udacity.com/cs101x/walking.html": return ('<html> <body> I cant get enough ' '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! ' '</body> </html>') elif url == "http://www.udacity.com/cs101x/flying.html": return ('<html> <body> The magic words are Squeamish Ossifrage! ' '</body> </html>') elif url == "http://top.contributors/velak.html": return ('<a href="http://top.contributors/jesyspa.html">' '<a href="http://top.contributors/forbiddenvoid.html">') elif url == "http://top.contributors/jesyspa.html": return ('<a href="http://top.contributors/elssar.html">' '<a href="http://top.contributors/kilaws.html">') elif url == "http://top.contributors/forbiddenvoid.html": return ('<a href="http://top.contributors/charlzz.html">' '<a href="http://top.contributors/johang.html">' '<a href="http://top.contributors/graemeblake.html">') elif url == "http://top.contributors/kilaws.html": return ('<a href="http://top.contributors/tomvandenbosch.html">' '<a href="http://top.contributors/mathprof.html">') elif url == "http://top.contributors/graemeblake.html": return ('<a href="http://top.contributors/dreyescat.html">' '<a href="http://top.contributors/angel.html">') elif url == "A1": return '<a href="B1"> <a href="C1"> ' elif url == "B1": return '<a href="E1">' elif url == "C1": return '<a href="D1">' elif url == "D1": return '<a href="E1"> ' elif url == "E1": return '<a href="F1"> ' except: return "" return "" def get_next_target(page): start_link = page.find('<a href=') if start_link == -1: return None, 0 start_quote = page.find('"', start_link) end_quote = page.find('"', start_quote + 1) url = page[start_quote + 1:end_quote] return url, end_quote def union(p,q): for e in q: if e not in p: p.append(e) def get_all_links(page): links = [] while True: url,endpos = get_next_target(page) if url: links.append(url) page = page[endpos:] else: break return links def crawl_web(seed,max_depth): tocrawl = [seed] crawled = [] interm_list=[] depth=0 while depth<=max_depth: while tocrawl: page = tocrawl.pop() if page not in crawled: union(interm_list, get_all_links(get_page(page))) crawled.append(page) tocrawl=interm_list depth=depth+1 interm_list=[] return crawled #print crawl_web("http://www.udacity.com/cs101x/index.html",0) #>>> ['http://www.udacity.com/cs101x/index.html'] #print crawl_web("http://www.udacity.com/cs101x/index.html",1) #>>> ['http://www.udacity.com/cs101x/index.html', #>>> 'http://www.udacity.com/cs101x/flying.html', #>>> 'http://www.udacity.com/cs101x/walking.html', #>>> 'http://www.udacity.com/cs101x/crawling.html'] #print crawl_web("http://www.udacity.com/cs101x/index.html",50) #>>> ['http://www.udacity.com/cs101x/index.html', #>>> 'http://www.udacity.com/cs101x/flying.html', #>>> 'http://www.udacity.com/cs101x/walking.html', #>>> 'http://www.udacity.com/cs101x/crawling.html', #>>> 'http://www.udacity.com/cs101x/kicking.html'] #rint crawl_web("http://top.contributors/forbiddenvoid.html",2) #>>> ['http://top.contributors/forbiddenvoid.html', #>>> 'http://top.contributors/graemeblake.html', #>>> 'http://top.contributors/angel.html', #>>> 'http://top.contributors/dreyescat.html', #>>> 'http://top.contributors/johang.html', #>>> 'http://top.contributors/charlzz.html'] #print crawl_web("A1",3) #>>> ['A1', 'C1', 'B1', 'E1', 'D1', 'F1'] # (May be in any order)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question