# Word Count for a Tag Cloud

from string import *
import  re, HTMLParser

def get_urls(word, path='/Users/tomsmith/Downloads/pdfextractor_1.csv'):
    """Return the URLs of every CSV row whose text column contains *word*.

    Each row of the file at *path* is split on commas: field 0 is taken as
    the URL and field 1 as the text. The text is lower-cased and split on
    single spaces. A row matches when *word* equals one of those tokens, or
    equals the leading run of letters of a token, so that "cloud" is also
    found in a row containing "cloud." or "cloud-based".

    Args:
        word: Lower-case word to look for.
        path: CSV file to scan. Defaults to the extractor output that this
            function originally had hard-coded.

    Returns:
        A list of URLs in file order, one entry per matching row.
    """
    leading_letters = re.compile(r"[a-z]*")
    urls = []
    # The context manager closes the file even if a row fails to parse.
    with open(path) as csv_file:
        for line in csv_file:
            fields = line.split(",")
            if len(fields) < 2:
                # Blank or malformed row: there is no text column to search.
                continue
            tokens = fields[1].strip().lower().split(" ")
            # Reduce each token to its leading letters so punctuation stuck
            # to a word does not hide it; the empty string is dropped so it
            # cannot create matches the raw-token test would not have made.
            normalized = set(leading_letters.match(t).group(0) for t in tokens)
            normalized.discard('')
            if word in tokens or word in normalized:
                urls.append(fields[0].strip())
    return urls

class MLStripper(HTMLParser.HTMLParser):
    """HTML parser that throws away markup and keeps only the text data.

    Feed it HTML with feed(); every chunk passed to handle_data() is
    collected in self.fed and can be read back with get_fed_data().
    """

    def __init__(self):
        # Run the base initialiser (which itself calls reset()) so all parser
        # state is set up. It is called explicitly rather than via super()
        # because HTMLParser is an old-style class on Python 2.
        HTMLParser.HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text reported by the parser."""
        self.fed.append(d)

    def get_fed_data(self):
        """Return all text collected so far, concatenated in feed order."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text that MLStripper collects when fed *html*."""
    # Warning (from the original author): this processes everything in the
    # input, including script and javascript sections.
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_fed_data()
    return text

def match(s, reg):
    """Return every non-overlapping match of pattern *reg* in string *s*.

    The pattern is applied case-insensitively and with DOTALL, so "." also
    matches newlines. The result is whatever re.findall returns.
    """
    flags = re.IGNORECASE | re.DOTALL
    return re.findall(reg, s, flags)
    
# Build the tag-cloud table: count every non-stopword found in the second
# comma-separated column of the CSV, then write "word,frequency,urls" rows to
# tagcloud.csv, most frequent word first.
f = '/Users/tomsmith/Downloads/pdfextractor_1.csv'
with open(f) as csv_file:
    lines = csv_file.readlines()

with open('stopwords-en.txt') as stop_file:
    # A set makes the per-word membership test below a constant-time lookup.
    stopwords = set(stop_file.read().split())
d = {}
words = []

for line in lines:
    fields = line.split(",")
    if len(fields) < 2:
        # Blank or malformed row: there is no text column to count.
        continue
    text = fields[1].strip()
    words = text.split(" ")
    for word in words:
        # Keep only the leading run of letters; a token that starts with a
        # digit, quote or other punctuation becomes '' and is skipped below.
        word = match(word.lower(), "[a-z]*")[0]
        print(word)
        # Discard tokens that parse as numbers.
        # NOTE(review): after the letters-only regex above, float() can only
        # succeed for words such as 'inf' or 'nan'; the check is kept so the
        # output stays the same as before.
        try:
            float(word)
        except ValueError:
            pass
        else:
            word = ''

        if len(word) > 1 and word not in stopwords:
            print(word)
            d[word] = d.get(word, 0) + 1

finalFreq = sorted(d.items(), key=lambda t: t[1], reverse=True)

with open("tagcloud.csv", 'w') as out:
    # Rows are terminated with a bare "\r", exactly as before.
    out.write("word,frequency,urls\r")
    for item in finalFreq:
        urls = get_urls(item[0])
        urls_str = "|".join(urls)

        out.write(item[0] + "," + str(item[1]) + "," + urls_str + "\r")
        # Single formatted string so the line prints identically on
        # Python 2 and Python 3.
        print("%s %s %s" % (item[0], item[1], urls_str))

# Python Fiddle

# Python Cloud IDE