import re,math from collections import Counter def cos_sim(sent1, sent2): WORD = re.compile(r'\w+') def get_cosine(vec1, vec2): intersection = set(vec1.keys()) & set(vec2.keys()) numerator = sum([vec1[x] * vec2[x] for x in intersection]) sum1 = sum([vec1[x]**2 for x in vec1.keys()]) sum2 = sum([vec2[x]**2 for x in vec2.keys()]) denominator = math.sqrt(sum1) * math.sqrt(sum2) if not denominator: return 0.0 else: return float(numerator) / denominator def text_to_vector(text): words = WORD.findall(text) return Counter(words) vector1 = text_to_vector(sent1) vector2 = text_to_vector(sent2) cosine = get_cosine(vector1, vector2) return cosine s1 = "This is a foo bar sentence" s2 = "This is a bar bar black sheep sentence" s3 = "that is just a bar" print cos_sim(s1, s2) print cos_sim(s2, s3) print cos_sim(s1, s3)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question