# Udacity CS101 web-crawler exercise.
#
# The original index stored the same url in a keyword's url list once per
# occurrence of the keyword on the page.  add_to_index below stores each
# url at most once per keyword, no matter how often the keyword appears.


def get_page(url):
    """Return canned HTML for the four known test urls, or "" otherwise.

    Stands in for a real HTTP fetch so the crawler can run offline.
    """
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return '''<html> <body> This is a test page for learning to crawl! <p> It is a good idea to <a href="http://www.udacity.com/cs101x/crawling.html"> learn to crawl</a> before you try to <a href="http://www.udacity.com/cs101x/walking.html">walk</a> or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body> </html>'''
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return '''<html> <body> I have not learned to crawl yet, but I am quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>. 
</body> </html>'''
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return '''<html> <body> I cant get enough <a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>'''
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return '''<html> <body>The magic words are Squeamish Ossifrage!</body></html>'''
    except Exception:  # narrowed from a bare except: (never swallow KeyboardInterrupt)
        return ""
    return ""


def union(a, b):
    """Append to list *a* every element of *b* not already in *a* (in place)."""
    for e in b:
        if e not in a:
            a.append(e)


def get_next_target(page):
    """Return (url, end_position) of the first <a href="..."> in *page*.

    Returns (None, 0) when the page contains no further links.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def get_all_links(page):
    """Return a list of every href target found in *page*, in order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]  # continue scanning after the link just found
        else:
            break
    return links


def crawl_web(seed):
    """Crawl outward from *seed*, returning an index of [keyword, [urls]] pairs.

    Each reachable page is fetched once; its words are added to the index
    and its outgoing links are queued for crawling.
    """
    tocrawl = [seed]
    crawled = []  # urls already indexed, so each page is processed only once
    index = []
    while tocrawl:
        last_tocrawl_url = tocrawl.pop()
        if last_tocrawl_url not in crawled:
            content = get_page(last_tocrawl_url)
            add_page_to_index(index, last_tocrawl_url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(last_tocrawl_url)
    return index


def add_to_index(index, keyword, last_tocrawl_url):
    """Record that *keyword* appears at *last_tocrawl_url*.

    A url is added to a keyword's url list at most once, even if the
    keyword occurs on that page many times.
    """
    for entry in index:
        if entry[0] == keyword:
            # BUG FIX: the original tested the undefined name `url` here,
            # raising NameError the first time a keyword repeated; the
            # parameter is named last_tocrawl_url.
            if last_tocrawl_url not in entry[1]:
                entry[1].append(last_tocrawl_url)
            return
    # Keyword not seen before: start a new entry.
    index.append([keyword, [last_tocrawl_url]])


def add_page_to_index(index, last_tocrawl_url, content):
    """Split *content* into whitespace-delimited words and index each one."""
    words = content.split()
    for word in words:
        add_to_index(index, word, last_tocrawl_url)


def lookup(index, keyword):
    """Return the url list for *keyword*, or None if it was never indexed."""
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return None


# print(...) instead of the Python-2-only `print x` statement; this form is
# valid under both Python 2 and Python 3.
print(crawl_web("http://www.udacity.com/cs101x/index.html"))
# Expected, e.g.: lookup(index, "is") == ['http://www.udacity.com/cs101x/index.html']
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question