# Udacity CS101 web-crawler exercise.
#
# The original index stored the same url in a keyword's url list once per
# occurrence of the keyword on the page.  add_to_index below stores each
# url at most once per keyword, no matter how often the keyword appears.


def get_page(url):
    """Return canned HTML for the four known test urls, or "" otherwise.

    Stands in for a real HTTP fetch so the crawler can run offline.
    """
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return '''<html> <body> This is a test page for learning to crawl! <p> It is a good idea to <a href="http://www.udacity.com/cs101x/crawling.html"> learn to crawl</a> before you try to <a href="http://www.udacity.com/cs101x/walking.html">walk</a> or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body> </html>'''
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return '''<html> <body> I have not learned to crawl yet, but I am quite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>. 
</body> </html>'''
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return '''<html> <body> I cant get enough <a href="http://www.udacity.com/cs101x/index.html">crawling</a></body></html>'''
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return '''<html> <body>The magic words are Squeamish Ossifrage!</body></html>'''
    except Exception:  # narrowed from a bare except: (never swallow KeyboardInterrupt)
        return ""
    return ""


def union(a, b):
    """Append to list *a* every element of *b* not already in *a* (in place)."""
    for e in b:
        if e not in a:
            a.append(e)


def get_next_target(page):
    """Return (url, end_position) of the first <a href="..."> in *page*.

    Returns (None, 0) when the page contains no further links.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def get_all_links(page):
    """Return a list of every href target found in *page*, in order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]  # continue scanning after the link just found
        else:
            break
    return links


def crawl_web(seed):
    """Crawl outward from *seed*, returning an index of [keyword, [urls]] pairs.

    Each reachable page is fetched once; its words are added to the index
    and its outgoing links are queued for crawling.
    """
    tocrawl = [seed]
    crawled = []  # urls already indexed, so each page is processed only once
    index = []
    while tocrawl:
        last_tocrawl_url = tocrawl.pop()
        if last_tocrawl_url not in crawled:
            content = get_page(last_tocrawl_url)
            add_page_to_index(index, last_tocrawl_url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(last_tocrawl_url)
    return index


def add_to_index(index, keyword, last_tocrawl_url):
    """Record that *keyword* appears at *last_tocrawl_url*.

    A url is added to a keyword's url list at most once, even if the
    keyword occurs on that page many times.
    """
    for entry in index:
        if entry[0] == keyword:
            # BUG FIX: the original tested the undefined name `url` here,
            # raising NameError the first time a keyword repeated; the
            # parameter is named last_tocrawl_url.
            if last_tocrawl_url not in entry[1]:
                entry[1].append(last_tocrawl_url)
            return
    # Keyword not seen before: start a new entry.
    index.append([keyword, [last_tocrawl_url]])


def add_page_to_index(index, last_tocrawl_url, content):
    """Split *content* into whitespace-delimited words and index each one."""
    words = content.split()
    for word in words:
        add_to_index(index, word, last_tocrawl_url)


def lookup(index, keyword):
    """Return the url list for *keyword*, or None if it was never indexed."""
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return None


# print(...) instead of the Python-2-only `print x` statement; this form is
# valid under both Python 2 and Python 3.
print(crawl_web("http://www.udacity.com/cs101x/index.html"))
# Expected, e.g.: lookup(index, "is") == ['http://www.udacity.com/cs101x/index.html']
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question