## returns the next url on 'page' and the end position of that url,
## so that the search for the next url can start from there
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

## compiles a list of all links on 'page'
def get_all_links(page):
    links = []
    while page:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

## removes every occurrence of element x from list p and returns the new list
def remove_element(x, p):
    if x not in p:
        return p
    tempList = []
    for i in p:
        if i != x:
            tempList.append(i)
    return tempList

## starts from the seed page and compiles 2 lists:
#   toCrawl = links left to crawl
#   crawled = list of crawled links
def crawl_web(seed):
    toCrawl = [seed]
    crawled = []
    while toCrawl:
        x = toCrawl[0]
        toCrawl = remove_element(x, toCrawl)
        if x not in crawled:
            # links found on x still need to be crawled; x itself is done
            toCrawl = toCrawl + get_all_links(x)
            crawled.append(x)
    return crawled

# should return a list of all links in the input string:
# print(get_all_links('Here is a <a href="http://udacity.com">link'))
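
## a minimal usage sketch, not part of the original exercise: the test_page
## string below is an assumed example. Because this snippet has no
## page-fetching helper, crawl_web works directly on page text, and the
## extracted urls contain no further '<a href=' tags, so the crawl ends there.
test_page = ('Two links: <a href="http://a.example.com">A</a> '
             'and <a href="http://b.example.com">B</a>')
print(get_all_links(test_page))
# expected: ['http://a.example.com', 'http://b.example.com']
print(crawl_web(test_page))
# expected: the seed text followed by the two extracted urls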