"""Build a concordance (alphabetical word list with frequencies) of a text.

The bonus variant additionally labels every word with the 1-based numbers of
the sentences in which each occurrence appeared.

The code deliberately sticks to constructs that behave the same on Python 2
and Python 3 (single-argument ``print(...)`` calls, ``in`` instead of
``has_key``, ``enumerate`` instead of ``xrange``).
"""
import re

paragraph = (
    "Given an arbitrary text document written in English, write a program "
    "that will generate a concordance, i.e. an alphabetical list of all word "
    "occurrences, labeled with word frequencies. Bonus: label each word with "
    "the sentence numbers in which each occurrence appeared."
)


def splitParagraph(paragraph, debug=False):
    """Split *paragraph* into sentences and return them as a list of strings.

    A boundary is a run of whitespace that follows a lowercase letter or digit
    plus one of ``.?!`` (optionally followed by a closing double quote) and
    precedes an (optionally quoted) uppercase letter.  Trailing ``.!?;`` is
    removed from every sentence; fragments that end up empty are dropped.

    When *debug* is true, each sentence is printed with its index and length.
    """
    # regex found from http://en.wikipedia.org/wiki/Sentence_boundary_disambiguation
    # Non-capturing groups keep re.split() from injecting the matched groups
    # into its result; ``\s+`` also covers "\r\n" and double spaces.
    regex = r'(?:(?<=[a-z0-9][.?!])|(?<=[a-z0-9][.?!]"))\s+(?="?[A-Z])'
    sentences = []
    for piece in re.split(regex, paragraph):
        # remove punctuation from end of strings
        trimmed = piece.rstrip(".!?;")
        # remove empty strings
        if trimmed.strip():
            sentences.append(trimmed)
    if debug:
        for i, sentence in enumerate(sentences):
            print("%d %d %s" % (i, len(sentence), sentence))
    return sentences


def splitSentence(sentence, debug=False):
    """Return the lowercase words of *sentence* with edge ``,:;`` removed.

    Tokens are separated by any run of whitespace.  Tokens that consist only
    of the stripped punctuation are discarded so they are never counted as
    (empty) words.

    When *debug* is true, the resulting word list is printed.
    """
    words = []
    for token in sentence.split():
        word = token.strip(",:;").lower()
        if word:
            words.append(word)
    if debug:
        print(words)
    return words


def makeConcordance(words, debug=False):
    """Return a dict mapping every word in *words* to its occurrence count.

    When *debug* is true, the concordance is printed via printConcordance().
    """
    concordance = {}
    for w in words:
        concordance[w] = concordance.get(w, 0) + 1
    if debug:
        printConcordance(concordance)
    return concordance


def joinConcordances(concordances, debug=False):
    """Merge per-sentence concordances into one word -> sentence-number map.

    *concordances* is a sequence of dicts as returned by makeConcordance(),
    one per sentence, in sentence order.  The result maps each word to a list
    holding the 1-based sentence number once per occurrence, so the length of
    the list is the word's total frequency.

    When *debug* is true, the result is printed via printJoinedConcordance().
    """
    concordance = {}
    for number, counts in enumerate(concordances, 1):
        for word, value in counts.items():
            concordance.setdefault(word, []).extend([number] * value)
    if debug:
        printJoinedConcordance(concordance)
    return concordance


def printConcordance(concordance):
    """Print ``word<TAB>count`` lines in alphabetical order of the words."""
    for key, value in sorted(concordance.items()):
        print(key + "\t" + str(value))


def printJoinedConcordance(concordance):
    """Print ``word<TAB>{count:n1,n2,...}`` lines in alphabetical order."""
    for key, value in sorted(concordance.items()):
        print(key + "\t{" + str(len(value)) + ":" + ",".join([str(v) for v in value]) + "}")


sentences = splitParagraph(paragraph)

# regular version
words = []
for s in sentences:
    words += splitSentence(s, debug=False)
regularConcordance = makeConcordance(words, debug=False)
printConcordance(regularConcordance)

# bonus version
concordances = [makeConcordance(splitSentence(s)) for s in sentences]
bonusConcordance = joinConcordances(concordances, debug=False)
printJoinedConcordance(bonusConcordance)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question