"""Scan the front pages of news sites listed in ``sources.txt`` for links
whose URLs contain election/news keywords, recording new matches in
``oldmatches.txt`` and printing a summary after every pass.

Ported from Python 2 (urllib2 / print statements) to Python 3.
"""

import re
import time
import urllib.request

# Compiled once at import time: captures the text between href=" and the
# closing quote.
_HREF_RE = re.compile(r'(?<=href=").*?(?=")')

# A (lower-cased) link URL counts as a match if it contains any of these.
KEYWORDS = ("romney", "obama", "convention", "hurricane")

# File that accumulates every match ever seen, one URL per line.
_HISTORY_FILE = "oldmatches.txt"


def extract_urls_from_html(html):
    """Yield every ``href="..."`` target found in *html* (a text string)."""
    for match in _HREF_RE.finditer(html):
        yield match.group(0)


def extract_urls(page):
    """Fetch *page* and yield every URL referenced by an href attribute.

    The response body is decoded with ``errors="replace"`` so a page with a
    stray non-UTF-8 byte does not abort the whole scan.
    """
    with urllib.request.urlopen(page) as response:
        html = response.read().decode("utf-8", errors="replace")
    yield from extract_urls_from_html(html)


def crawl_web(seed):
    """Crawl outward from *seed*, returning the list of pages visited.

    NOTE(review): retained for parity with the original, which never calls
    it; it is unbounded (no domain restriction) and will not terminate on
    the real web.
    """
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            tocrawl.extend(extract_urls(page))
            crawled.append(page)
    return crawled


def load_sources(path="sources.txt"):
    """Return the source-site URLs from *path*, one per line.

    Lines are stripped (the original passed URLs with trailing newlines
    straight to urlopen) and blank lines are skipped.
    """
    with open(path) as sources_text:
        return [line.strip() for line in sources_text if line.strip()]


def find_matches(stories, keywords=KEYWORDS):
    """Return the subset of *stories* whose URL contains any keyword.

    More sophisticated matching analysis will go here.
    """
    return [story for story in stories if any(k in story for k in keywords)]


def main():
    """Run scan passes forever, reporting counts and timing for each pass."""
    source_urls = load_sources()

    while True:
        t0 = time.time()  # start the clock

        # For each media source, gather all current front-page story links.
        all_stories = []
        for source in source_urls:
            all_stories.extend(uri.lower() for uri in extract_urls(source))

        # Keep only the links that mention a keyword.
        matching_stories = find_matches(all_stories)

        # Filter out stories already recorded.  The substring test against
        # the raw history text mirrors the original behaviour (it can
        # false-positive when one URL is a prefix of another).
        with open(_HISTORY_FILE) as f:
            old_matches = f.read()
        new_matching_stories = [s for s in matching_stories
                                if s not in old_matches]

        # Append the genuinely new matches to the history file.
        with open(_HISTORY_FILE, "a") as mod_matches:
            for match in new_matching_stories:
                mod_matches.write(match + "\n")
                print(match)

        # Output: report results for this pass.
        elapsed = time.time() - t0
        print(f"{len(all_stories)} links scanned in {elapsed} seconds")
        print(f"{len(matching_stories)} matches found")
        print(f"{len(new_matching_stories)} new matches found\n\n")


if __name__ == "__main__":
    main()
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question