Deepti Chopra (India)
Translated by Wang Wei
Creating a gold-standard annotated corpus is a major undertaking, and it is also quite expensive. It is produced by manually annotating the given test data. The tags obtained in this way are treated as the standard (reference) tags and can be used to represent a wide range of information.
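To make the idea concrete, here is a small sketch of our own (not from the book): evaluation against a gold standard simply compares each tag predicted by a tagger with the manually assigned reference tag and reports the proportion of matches, which is what evaluate() computes in the examples below. The variable names here are illustrative.

import nltk
from nltk.corpus import brown

gold_sents = brown.tagged_sents(categories='news')      # manually annotated (gold-standard) tags
tagger = nltk.UnigramTagger(gold_sents)                  # toy tagger, trained on the same data

gold = [tag for sent in gold_sents[:100] for (word, tag) in sent]
predicted = [tag for sent in tagger.tag_sents(brown.sents(categories='news')[:100])
             for (word, tag) in sent]
accuracy = sum(g == p for g, p in zip(gold, predicted)) / len(gold)
print(accuracy)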
import nltk
from nltk.corpus import brown
sentences = brown.tagged_sents(categories='news')
sent = brown.sents(categories='news')
unigram_sent = nltk.UnigramTagger(sentences)
print(unigram_sent.tag(sent[2008]))
print(unigram_sent.evaluate(sentences))
import nltk
from nltk.corpus import brown
sentences = brown.tagged_sents(categories='news')
sz = int(len(sentences) * 0.8)
print(sz)
training_sents = sentences[:sz]
testing_sents = sentences[sz:]
unigram_tagger = nltk.UnigramTagger(training_sents)
print(unigram_tagger.evaluate(testing_sents))
import nltk
from nltk.corpus import brown
sentences = brown.tagged_sents(categories='news')
sent = brown.sents(categories='news')
sz = int(len(sentences) * 0.8)
training_sents = sentences[:sz]
testing_sents = sentences[sz:]
bigram_tagger = nltk.BigramTagger(training_sents)
print(bigram_tagger.tag(sent[2008]))
un_sent = sent[4203]
print(bigram_tagger.tag(un_sent))
print(bigram_tagger.evaluate(testing_sents))
import nltk
from nltk.corpus import brown
sentences = brown.tagged_sents(categories='news')
sz = int(len(sentences) * 0.8)
training_sents = sentences[:sz]
testing_sents = sentences[sz:]
s0 = nltk.DefaultTagger('NNP')
s1 = nltk.UnigramTagger(training_sents, backoff=s0)
s2 = nltk.BigramTagger(training_sents, backoff=s1)
print(s2.evaluate(testing_sents))
import nltk
chunkparser = nltk.RegexpParser("")
print(nltk.chunk.accuracy(chunkparser, nltk.corpus.conll2000.chunked_sents(
    'train.txt', chunk_types=('NP',))))
import nltk
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(nltk.chunk.accuracy(cp, nltk.corpus.conll2000.chunked_sents(
    'train.txt', chunk_types=('NP',))))
def chunk_tags(train):
    """Generate a list of the tags that appear inside chunks."""
    cfreqdist = nltk.ConditionalFreqDist()
    for t in train:
        for word, tag, chunktag in nltk.chunk.tree2conlltags(t):
            if chunktag == "O":
                cfreqdist[tag][False] += 1
            else:
                cfreqdist[tag][True] += 1
    return [tag for tag in cfreqdist.conditions() if cfreqdist[tag].max() == True]

training_sents = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',))
print(chunk_tags(training_sents))
import nltk
correct = nltk.chunk.tagstr2tree(
    "[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
print(correct.flatten())
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
grammar = r"NP: {<PRP|DT|POS|JJ|CD|N.*>+}"
chunk_parser = nltk.RegexpParser(grammar)
tagged_tok = [("the", "DT"), ("little", "JJ"), ("cat", "NN"), ("sat", "VBD"),
              ("on", "IN"), ("the", "DT"), ("mat", "NN")]
chunkscore = nltk.chunk.ChunkScore()
guessed = cp.parse(correct.flatten())
chunkscore.score(correct, guessed)
print(chunkscore)
chunker_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(chtree)]
                for chtree in nltk.corpus.conll2000.chunked_sents('train.txt')]
unigram_chunker = nltk.UnigramTagger(chunker_data)
print(nltk.tag.accuracy(unigram_chunker, chunker_data))
bigram_chunker = nltk.BigramTagger(chunker_data, backoff=unigram_chunker)
print(nltk.tag.accuracy(bigram_chunker, chunker_data))
import nltk
from nltk.corpus import brown
suffix_freqdist = nltk.FreqDist()
for wrd in brown.words():
    wrd = wrd.lower()
    suffix_freqdist[wrd[-1:]] += 1
    suffix_freqdist[wrd[-2:]] += 1
    suffix_freqdist[wrd[-3:]] += 1
common_suffixes = [suffix for (suffix, count) in suffix_freqdist.most_common(100)]
print(common_suffixes)

def pos_feature(wrd):
    feature = {}
    for suffix in common_suffixes:
        feature['endswith({})'.format(suffix)] = wrd.lower().endswith(suffix)
    return feature

tagged_wrds = brown.tagged_words(categories='news')
featureset = [(pos_feature(n), g) for (n, g) in tagged_wrds]
size = int(len(featureset) * 0.1)
train_set, test_set = featureset[size:], featureset[:size]
classifier = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
print(classifier.classify(pos_feature('cats')))    # 'NNS'
print(classifier.pseudocode(depth=4))
if endswith(,) == True: return ','
if endswith(,) == False:
  if endswith(the) == True: return 'AT'
  if endswith(the) == False:
    if endswith(s) == True:
      if endswith(is) == True: return 'BEZ'
      if endswith(is) == False: return 'VBZ'
    if endswith(s) == False:
      if endswith(.) == True: return '.'
      if endswith(.) == False: return 'NN'
import nltk
from nltk.corpus import brown
sentences = brown.tagged_sents(categories='news')
sent = brown.sents(categories='news')
pattern = [
    (r'.*ing$', 'VBG'),                 # for gerunds
    (r'.*ed$', 'VBD'),                  # for simple past
    (r'.*es$', 'VBZ'),                  # for 3rd singular present
    (r'.*ould$', 'MD'),                 # for modals
    (r'.*\'s$', 'NN$'),                 # for possessive nouns
    (r'.*s$', 'NNS'),                   # for plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),    # for cardinal numbers
    (r'.*', 'NN')                       # for nouns (default)
]
regexp_tagger = nltk.RegexpTagger(pattern)
print(regexp_tagger.tag(sent[3]))
print(regexp_tagger.evaluate(sentences))
import nltk
from nltk.corpus import brown
freqd = nltk.FreqDist(brown.words(categories='news'))
cfreqd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
mostfreq_words = freqd.most_common(100)
likely_tags = dict((word, cfreqd[word].max()) for (word, _) in mostfreq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
print(baseline_tagger.evaluate(brown.tagged_sents(categories='news')))
sent = brown.sents(categories='news')[3]
print(baseline_tagger.tag(sent))
baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                     backoff=nltk.DefaultTagger('NN'))

def performance(cfreqd, wordlist):
    lt = dict((word, cfreqd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [w for (w, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()
import nltk
from nltk.stem.lancaster import LancasterStemmer
stri = LancasterStemmer()
print(stri.stem('achievement'))
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagsent in train_sents:
            untagsent = nltk.tag.untag(tagsent)
            history = []
            for i, (word, tag) in enumerate(tagsent):
                featureset = npchunk_features(untagsent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagsent = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                   for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagsent)

    def parse(self, sentence):
        tagsent = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagsent]
        return nltk.chunk.conlltags2tree(conlltags)
# conll2000 training and testing data for the chunker (as in the NLTK book)
train_sents = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=('NP',))
test_sents = nltk.corpus.conll2000.chunked_sents('test.txt', chunk_types=('NP',))

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        previword, previpos = "<START>", "<START>"
    else:
        previword, previpos = sentence[i - 1]
    return {"pos": pos, "previpos": previpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        previword, previpos = "<START>", "<START>"
    else:
        previword, previpos = sentence[i - 1]
    return {"pos": pos, "word": word, "previpos": previpos}

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        previword, previpos = "<START>", "<START>"
    else:
        previword, previpos = sentence[i - 1]
    if i == len(sentence) - 1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i + 1]
    return {"pos": pos,
            "word": word,
            "previpos": previpos,
            "nextpos": nextpos,
            "previpos+pos": "%s+%s" % (previpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}

def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
The following two approaches can be used to evaluate parser performance:
Evaluation of an IR system needs to consider the following aspects:
(IR systems are usually evaluated using precision, recall, and the F-measure.)
Error identification is a very important aspect that affects the performance of an NLP system; it involves the following terms:
Term | Description |
---|---|
True Positive (TP) | Relevant documents that are correctly identified as relevant |
True Negative (TN) | Irrelevant documents that are correctly identified as irrelevant |
False Positive (FP) | Irrelevant documents that are incorrectly identified as relevant |
False Negative (FN) | Relevant documents that are incorrectly identified as irrelevant |
(Precision, recall, and the F-measure are usually used to evaluate these aspects.)
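As a quick illustration (our own sketch, not the book's code), precision, recall, and the F-measure can be computed directly from TP/FP/FN counts, or with NLTK's set-based helpers in nltk.metrics; all counts and document IDs below are made up for the example.

from nltk.metrics import precision, recall, f_measure

TP, FP, FN = 45, 5, 15                      # hypothetical counts, for illustration only
p = TP / (TP + FP)                          # precision = TP / (TP + FP)
r = TP / (TP + FN)                          # recall    = TP / (TP + FN)
f = 2 * p * r / (p + r)                     # F-measure = harmonic mean of precision and recall
print(p, r, f)

reference = set(['d1', 'd2', 'd3', 'd4'])   # relevant documents (gold standard)
retrieved = set(['d1', 'd2', 'd5'])         # documents returned by the system
print(precision(reference, retrieved), recall(reference, retrieved),
      f_measure(reference, retrieved))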
import nltk
import random
from nltk.corpus import movie_reviews

docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]
random.shuffle(docs)
all_wrds = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_wrds)[:2000]

def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in doc_words)
    return features

print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))
# Build the feature sets and train a Naive Bayes classifier (as in the NLTK book)
featuresets = [(doc_features(d), c) for (d, c) in docs]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
Most Informative Features
    contains(outstanding) = True    pos : neg = 11.1 : 1.0
    contains(seagal) = True         neg : pos =  7.7 : 1.0
    contains(wonderfully) = True    pos : neg =  6.8 : 1.0
    contains(damon) = True          pos : neg =  5.9 : 1.0
    contains(wasted) = True         neg : pos =  5.8 : 1.0
from __future__ import print_function
from __future__ import division

def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)    # initialization of 2D array to zero
    for i in range(len1):
        lev[i][0] = i             # column 0: 0, 1, 2, 3 ...
    for j in range(len2):
        lev[0][j] = j             # row 0: 0, 1, 2, 3, 4 ...
    return lev

def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
    c1 = s1[i - 1]
    c2 = s2[j - 1]
    # skipping a character in s1
    a = lev[i - 1][j] + 1
    # skipping a character in s2
    b = lev[i][j - 1] + 1
    # substitution
    c = lev[i - 1][j - 1] + (c1 != c2)
    # transposition
    d = c + 1    # never picked by default
    if transpositions and i > 1 and j > 1:
        if s1[i - 2] == c2 and s2[j - 2] == c1:
            d = lev[i - 2][j - 2] + 1
    # pick the cheapest
    lev[i][j] = min(a, b, c, d)

def edit_distance(s1, s2, transpositions=False):
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)
    # iterate over the array
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(lev, i + 1, j + 1, s1, s2, transpositions=transpositions)
    return lev[len1][len2]

def binary_distance(label1, label2):
    """Simple equality test.

    0.0 if the labels are identical, 1.0 if they are different.

    >>> from nltk.metrics import binary_distance
    >>> print(binary_distance(1, 2))
    1.0
    >>> print(binary_distance(1, 3))
    1.0
    """
    return 0.0 if label1 == label2 else 1.0

def jaccard_distance(label1, label2):
    """Distance metric comparing set-similarity."""
    return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(label1.union(label2))

def masi_distance(label1, label2):
    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    len_label1 = len(label1)
    len_label2 = len(label2)
    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67
    elif len_intersection > 0:
        m = 0.33
    else:
        m = 0
    return 1 - (len_intersection / len_union) * m

def interval_distance(label1, label2):
    try:
        return pow(label1 - label2, 2)
        # return pow(list(label1)[0] - list(label2)[0], 2)
    except:
        print("non-numeric labels not supported with interval distance")

def presence(label):
    return lambda x, y: 1.0 * ((label in x) == (label in y))

def fractional_presence(label):
    return lambda x, y: \
        abs(((1.0 / len(x)) - (1.0 / len(y)))) * (label in x and label in y) \
        or 0.0 * (label not in x and label not in y) \
        or abs((1.0 / len(x))) * (label in x and label not in y) \
        or ((1.0 / len(y))) * (label not in x and label in y)

def custom_distance(file):
    data = {}
    with open(file, 'r') as infile:
        for l in infile:
            labelA, labelB, dist = l.strip().split("\t")
            labelA = frozenset([labelA])
            labelB = frozenset([labelB])
            data[frozenset([labelA, labelB])] = float(dist)
    return lambda x, y: data[frozenset([x, y])]

def demo():
    edit_distance_examples = [
        ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"),
        ("language", "lnaugage"), ("language", "lngauage")]
    for s1, s2 in edit_distance_examples:
        print("Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
    for s1, s2 in edit_distance_examples:
        print("Edit distance with transpositions between '%s' and '%s':" % (s1, s2),
              edit_distance(s1, s2, transpositions=True))
    s1 = set([1, 2, 3, 4])
    s2 = set([3, 4, 5])
    print("s1:", s1)
    print("s2:", s2)
    print("Binary distance:", binary_distance(s1, s2))
    print("Jaccard distance:", jaccard_distance(s1, s2))
    print("MASI distance:", masi_distance(s1, s2))

if __name__ == '__main__':
    demo()
Syntactic matching can be performed by carrying out a chunking task. NLTK provides a module called nltk.chunk.api that helps identify chunks and returns a parse tree for a given chunk sequence.
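As a minimal sketch of our own (not code from the book): nltk.RegexpParser implements the ChunkParserI interface, and calling parse() on a POS-tagged sentence returns an nltk.Tree in which the matched chunks appear as subtrees. The grammar and sentence below are made up purely for illustration.

import nltk

grammar = r"NP: {<DT>?<JJ>*<NN>}"           # toy NP chunk grammar
cp = nltk.RegexpParser(grammar)             # a ChunkParserI implementation
tagged = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),
          ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
tree = cp.parse(tagged)                     # returns an nltk.Tree with NP subtrees
print(tree)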
import nltk
from nltk.tree import Tree
print(Tree(1, [2, Tree(3, [4]), 5]))
ct = Tree('VP', [Tree('V', ['gave']), Tree('NP', ['her'])])
sent = Tree('S', [Tree('NP', ['I']), ct])
print(sent)
print(sent[1])
print(sent[1, 1])
t1 = Tree.fromstring("(S (NP I) (VP (V gave) (NP her)))")
print(sent == t1)
t1[1][1].set_label('X')
t1[1][1].label()
print(t1)
t1[0], t1[1, 1] = t1[1, 1], t1[0]
print(t1)
len(t1)
print(wordnet.N['dog'][0].path_similarity(wordnet.N['cat'][0]))
print(wordnet.V['run'][0].path_similarity(wordnet.N['walk'][0]))
(Links to the previous chapters are at the end of this post, if needed.)
“”"***笔者的话:整理了《精通Python自然语言处理》的第十章内容:NLP系统评估。本书的最后一张内容。介绍了有关NLP系统评估的内容,主要还是计算系统的准确度,本博客记录了书中的每段代码。希望对阅读这本书的人有所帮助。FIGHTING...(热烈欢迎大家批评指正,互相讨论)
("Catch the moments of your life. Catch them while you are young and quick." --《我们这一天》
) ***"""
(Chapter 9): Discourse Analysis (https://blog.csdn.net/cjx14060307101/article/details/88623202)
(Chapter 8): Information Retrieval (https://blog.csdn.net/cjx14060307101/article/details/88595396)
(Chapter 7): Sentiment Analysis (https://blog.csdn.net/cjx14060307101/article/details/88580981)
(Chapter 6): Semantic Analysis (https://blog.csdn.net/cjx14060307101/article/details/88541214)
(Chapter 5): Parsing (https://blog.csdn.net/cjx14060307101/article/details/88378177)
(Chapter 4): Parts-of-Speech Tagging (https://blog.csdn.net/cjx14060307101/article/details/88357016)
(Chapter 3): Morphology (https://blog.csdn.net/cjx14060307101/article/details/88316108)
(Chapter 2): Statistical Language Modeling (https://blog.csdn.net/cjx14060307101/article/details/88087305)
(Chapter 1): String Operations (https://blog.csdn.net/cjx14060307101/article/details/87980631)