import re
import urllib.request

# Matches the opening of a <form ... action=> or <a ... href=> tag.
rx = re.compile(r'(<(form|a)\s[^>]*(action|href)=)', re.MULTILINE)


def get_page(url):
    """Fetch *url* and return its body as text; return "" on any fetch error.

    Bug fix: the original called f.read() but never returned the result
    (and then unconditionally returned ""), so every page came back empty
    and the crawler could never discover links.
    """
    try:
        with urllib.request.urlopen(url) as f:
            return f.read().decode("utf-8", errors="replace")
    except Exception as e:  # best-effort crawl: report and keep going
        print("ERROR fetching %s: %s" % (url, e))
        return ""


def get_next_target(page):
    """Return (url, end_position) of the first link target in *page*.

    Returns (None, 0) when no link/form tag is found.

    Bug fix: the original passed the compiled regex object to str.find(),
    which raises TypeError; rx.search() is the correct way to locate it.
    """
    m = rx.search(page)
    if m is None:
        return None, 0
    start_quote = page.find('"', m.start())
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def union(p, q):
    """Append to list *p* (in place) every element of *q* not already in it."""
    for e in q:
        if e not in p:
            p.append(e)


def get_all_links(page):
    """Return every link target found in *page*, in document order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:  # None (no match) or an empty href ends the scan
            break
        links.append(url)
        page = page[endpos:]
    return links


def crawl_web(seed, max_pages):
    """Crawl outward from *seed* and return up to *max_pages* crawled URLs.

    Visits each URL at most once; the full reachable set is crawled and the
    result is truncated to *max_pages*, matching the original behavior.
    """
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        last_url = tocrawl.pop()
        if last_url not in crawled:
            union(tocrawl, get_all_links(get_page(last_url)))
            crawled.append(last_url)
    return crawled[0:max_pages]


if __name__ == "__main__":  # guard: don't start a network crawl on import
    print("Running...")
    print(crawl_web("http://sympol.net", 1))
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question