import itertools

import nltk

# from collections import Counter

"""
ASSIGNMENT I
NAME: Tian Wang
"""

# Penn Treebank tag sets used for the corpus statistics.  Set membership is
# O(1) and easier to audit/extend than the original chained `or` comparisons.
# BUG FIX: 'VBP' (verb, non-3rd-person singular present) was missing from the
# original verb check, undercounting verbs.
NOUN_TAGS = {'NN', 'NNP', 'NNPS', 'NNS'}
VERB_TAGS = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
ADJ_TAGS = {'JJ', 'JJR', 'JJS'}


def sentence_segmentation(raw_text):
    """Split *raw_text* into sentences, print them, and return the list."""
    segment = nltk.sent_tokenize(raw_text)
    print(segment)
    return segment


def word_tokenization(raw_text):
    """Tokenize *raw_text* into words, print them, and return the list."""
    tokens = nltk.word_tokenize(raw_text)
    print(tokens)
    return tokens


def compute_tags_stats(tagged_words):
    """Count nouns, verbs and adjectives among ``(word, tag)`` pairs.

    Prints the counts via :func:`print_tags_stats` and returns the tuple
    ``(nr_nouns, nr_verbs, nr_adjs)``.  A single pass is used so that
    ``tagged_words`` may be any iterable, including a generator.
    """
    nr_nouns = nr_verbs = nr_adjs = 0
    for _word, tag in tagged_words:
        if tag in NOUN_TAGS:
            nr_nouns += 1
        elif tag in VERB_TAGS:
            nr_verbs += 1
        elif tag in ADJ_TAGS:
            nr_adjs += 1
    print_tags_stats(nr_nouns, nr_verbs, nr_adjs)
    return nr_nouns, nr_verbs, nr_adjs


def print_corpus_stats(sentences, words):
    """Print a banner with the number of sentences and words in the corpus."""
    sent = len(sentences)
    word = len(words)
    print("=================================================================")
    print("                          Corpus Stats                           ")
    print('{:<50}'.format("Number of Sentences: "), sent)
    print('{:<50}'.format("Number of Words: "), word)
    print("=================================================================")


def print_tags_stats(nr_nouns, nr_verbs, nr_adjs):
    """Print a banner with the noun/verb/adjective counts."""
    print("-----------------------------------------------------------------")
    print("                           Tags Stats                            ")
    print('{:<50}'.format("Number of Nouns: "), nr_nouns)
    print('{:<50}'.format("Number of Verbs: "), nr_verbs)
    print('{:<50}'.format("Number of Adjectives: "), nr_adjs)
    print("-----------------------------------------------------------------")


def tag_and_print_text(taggers, words):
    """Tag *words* with the best available tagger and print the tag stats.

    BUG FIX: the original body called ``train_nltk_taggers(words)`` —
    that function takes no arguments (so this raised ``TypeError``) and the
    ``taggers`` parameter was ignored.  We now use the last tagger in the
    list (the trigram tagger), which backs off through the whole chain.
    """
    tagged_words = taggers[-1].tag(words)
    compute_tags_stats(tagged_words)


# def compare_taggers(taggers, words):
#     ...  # not implemented; kept as a placeholder from the original source


def train_nltk_taggers():
    """Train and return five taggers chained via backoff.

    The taggers are trained on the Brown corpus 'news' category and returned
    as ``[default, affix, unigram, bigram, trigram]``; each tagger backs off
    to the previous one, with the corpus-wide most common tag as the final
    fallback.
    """
    train_sents = nltk.corpus.brown.tagged_sents(categories='news')
    default_tag = most_common_tag(train_sents)
    default_tagger = nltk.DefaultTagger(default_tag)
    affix_tagger = nltk.AffixTagger(train_sents, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(train_sents, backoff=affix_tagger)
    bigram_tagger = nltk.BigramTagger(train_sents, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train_sents, backoff=bigram_tagger)
    return [default_tagger, affix_tagger, unigram_tagger,
            bigram_tagger, trigram_tagger]


def most_common_tag(train_sents):
    """Return the most frequent tag in *train_sents* (used as default tag)."""
    tags = [tag for _token, tag in itertools.chain.from_iterable(train_sents)]
    fdist = nltk.FreqDist(tags)
    tag, _count = fdist.most_common(1)[0]
    return tag


if __name__ == "__main__":
    # Use only the first 500 characters of the text to keep the demo fast.
    raw_text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')[:500]
    sentences = sentence_segmentation(raw_text)
    words = word_tokenization(raw_text)
    print_corpus_stats(sentences, words)
    taggers = train_nltk_taggers()
    tag_and_print_text(taggers, words)
    # compare_taggers(taggers, words)
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question