"""Build a word-frequency "tag cloud" CSV from a two-column (url, text) CSV.

Reads CSV_PATH, counts the words found in the second comma-separated field of
every line (ignoring the words listed in STOPWORDS_PATH), and writes
OUTPUT_PATH with one ``word,frequency,urls`` row per word, most frequent
first.  ``urls`` is a ``|``-joined list of the first fields of the lines whose
text contains the word as a space-separated token.
"""
from string import *
import re

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # same class under its Python 3 name

CSV_PATH = '/Users/tomsmith/Downloads/pdfextractor_1.csv'
STOPWORDS_PATH = 'stopwords-en.txt'
OUTPUT_PATH = "tagcloud.csv"

# Leading run of ASCII letters; always matches (possibly the empty string).
_LEADING_LETTERS = re.compile("[a-z]*", re.IGNORECASE | re.DOTALL)


def split_row(line):
    """Return ``(url, text)`` for one CSV line, or None if it has no comma.

    NOTE: this is a plain ``split(",")``; only the second field is used, so
    text that itself contains commas is truncated at the first one.
    """
    fields = line.split(",")
    if len(fields) < 2:
        # Blank or malformed line: nothing to index or count.
        return None
    return fields[0].strip(), fields[1].strip()


def index_urls(lines):
    """Map each lower-cased, space-separated token to the URLs whose text has it.

    A URL is appended once per line containing the token, in line order, which
    is exactly what calling get_urls() for that token returns.
    """
    index = {}
    for line in lines:
        row = split_row(line)
        if row is None:
            continue
        url, text = row
        # set(): one entry per line even when the token repeats in that line.
        for token in set(text.lower().split(" ")):
            index.setdefault(token, []).append(url)
    return index


def get_urls(word, path=CSV_PATH):
    """Return the URLs (first field) of every line in *path* whose text
    (second field, lower-cased, split on single spaces) contains *word*."""
    urls = []
    with open(path) as handle:
        for line in handle:
            row = split_row(line)
            if row is not None and word in row[1].lower().split(" "):
                urls.append(row[0])
    return urls


class MLStripper(HTMLParser.HTMLParser):
    """HTML parser that collects every text node it is fed."""

    def __init__(self):
        # The base initialiser calls reset(); on Python 3 it also sets up
        # parser state that feed() needs.
        HTMLParser.HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_fed_data(self):
        """Return all collected text joined into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return *html* with its markup removed.

    Warning: all text data is kept, including the contents of script
    (JavaScript) elements.
    """
    x = MLStripper()
    x.feed(html)
    x.close()  # flush any text the parser is still buffering
    return x.get_fed_data()


def match(s, reg):
    """Return all matches of *reg* in *s* (case-insensitive, dot matches newline)."""
    p = re.compile(reg, re.IGNORECASE | re.DOTALL)
    return p.findall(s)


def normalize_word(token):
    """Lower-case *token* and keep only its leading run of letters a-z."""
    return _LEADING_LETTERS.match(token.lower()).group(0)


def tally_words(lines, stopwords):
    """Count normalized words of at least two letters that are not stopwords.

    Every normalized token is printed, and printed again when it is counted,
    matching the script's existing progress output.
    """
    counts = {}
    for line in lines:
        row = split_row(line)
        if row is None:
            continue
        for token in row[1].split(" "):
            word = normalize_word(token)
            print(word)
            # normalize_word() yields letters only, so no numeric filtering is
            # needed; a float() probe here would only discard the real words
            # "nan", "inf" and "infinity".
            if len(word) > 1 and word not in stopwords:
                print(word)
                counts[word] = counts.get(word, 0) + 1
    return counts


def main():
    """Read the input files, count words and write the tag-cloud CSV."""
    with open(CSV_PATH) as handle:
        lines = handle.readlines()
    with open(STOPWORDS_PATH) as handle:
        stopwords = set(handle.read().split())  # set: O(1) membership tests

    counts = tally_words(lines, stopwords)
    # Index once instead of re-reading the CSV for every distinct word.
    url_index = index_urls(lines)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    with open(OUTPUT_PATH, 'w') as out:
        # Rows are terminated with a bare carriage return, as before.
        out.write("word,frequency,urls\r")
        for word, frequency in ranked:
            urls_str = "|".join(url_index.get(word, []))
            out.write(word + "," + str(frequency) + "," + urls_str + "\r")
            print("%s %s %s" % (word, frequency, urls_str))


if __name__ == "__main__":
    main()
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question