当前位置:   article > 正文

《Python自然语言处理》第五章练习题答案_python中文自然语言处理基础与实战课后练习

python中文自然语言处理基础与实战课后练习

这章主要内容涉及分词、词性标注和标注器训练、字典使用。
因为中英文差别,所以在后面练习里尝试用中文数据来训练ngram标注器。

首先导包

import nltk
from nltk.corpus import brown
from nltk.book import *
import jieba
import matplotlib.pyplot as plt
  • 1
  • 2
  • 3
  • 4
  • 5

1

# Exercise 1: tag an ambiguous headline. NLTK's POS tagger returns a single
# reading per token and cannot resolve the ambiguity.
headline = 'British Left Waffles on Falkland Islands'
text = nltk.word_tokenize(headline)
nltk.pos_tag(text)
  • 1
  • 2
  • 3

2

# Exercise 2: print the tag of the first occurrence of 'contest' in Brown.
tag_words = brown.tagged_words()
for word, tag in tag_words:
    if word != 'contest':
        continue
    print(tag)
    break
  • 1
  • 2
  • 3
  • 4
  • 5

3

# Exercise 3: tag a sentence in which "wind" occurs twice.
sentence = 'They wind back the clock,while we chase after the wind.'
nltk.pos_tag(nltk.word_tokenize(sentence))
  • 1

7

# Exercise 7: dict.update() adds every key/value pair of d2 to d1 in place;
# d2 itself is left unchanged.
d1 = dict(a=1, b=2, c=3)
d2 = dict(d=4, f=5, g=6)
d1.update(d2)
print(d1, d2)
  • 1
  • 2
  • 3
  • 4
  • 5

9

# Exercise 9: compare the contexts of 'go' and 'went' in text1.
for verb_form in ('go', 'went'):
    text1.concordance(verb_form)
  • 1
  • 2

10

import re

# Exercise 10: train a unigram tagger on the whole Brown corpus, run it on
# unseen text and collect the words it could not tag (tag is None).
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
raw_text = "What needs to be clarified is that the fundamental purpose of China's development is to ensure that the Chinese people can live a better life and to benefit all humankind. Win-win cooperation is an important principle of China's development and a golden rule in China's external relations. China has no intention to interfere in the political system of the United States, nor challenge or replace its status and influence.In the past few years, due to Washington's irrational suppression of China's legitimate rights and interests, China-US relations have encountered unprecedented difficulties. This situation should not continue any longer. The only right way is to follow the principles of non-conflict, non-confrontation, mutual respect and win-win cooperation."
# Replace commas and full stops with spaces so that punctuation glued to a
# word is not mistaken for part of the token. A raw string without the
# needless backslashes avoids the invalid-escape warning of '[\,\.]'.
new_text = re.sub(r'[,.]', ' ', raw_text)
word_tags = unigram_tagger.tag(new_text.split())
# Words the tagger never saw during training come back with tag None.
none_tag = [word for (word, tag) in word_tags if tag is None]
none_tag
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11

没有被标注(标记为None)的词主要有三类:拼写不规范的词、带连字符的词,以及训练语料中没有出现过的新词。

11

# Exercise 11: print the built-in documentation of nltk.AffixTagger.
help(nltk.AffixTagger)
  • 1

用法AffixTagger(train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False)

# Train an affix tagger on the Brown 'news' category and tag one of its
# sentences.
news_only = {'categories': 'news'}
brown_sents = brown.sents(**news_only)
brown_tagged_sents = brown.tagged_sents(**news_only)
affixtagger = nltk.AffixTagger(
    train=brown_tagged_sents,
    affix_length=-3,
    min_stem_length=2,
)
affixtagger.tag(brown_sents[2007])
  • 1
  • 2
  • 3
  • 4

12


# Exercise 12: train a bigram tagger on the whole Brown corpus and score it
# on the very same sentences it was trained on.
sents = brown.sents()
tag_sents = brown.tagged_sents()
baseline_tagger = nltk.BigramTagger(train=tag_sents)
baseline_tagger.evaluate(tag_sents)
  • 1
  • 2
  • 3
  • 4
  • 5
# Tag a whitespace-tokenized passage with the bigram tagger.
passage = "They expressed their willingness to enhance cooperation or coordination in some specific areas. For instance, the two sides are committed to strengthening dialogue and cooperation in the field of climate change and will establish a joint working group on that subject. In the spirit of reciprocity and mutual benefit, the two sides will hold talks on facilitating activities of each other's diplomatic and consular missions and personnel, as well as on issues related to media reporters."
sent = passage.split()
baseline_tagger.tag(sent)
  • 1
  • 2
# Score the bigram tagger on brown_tagged_sents, which was last bound to the
# Brown 'news' category sentences.
# NOTE(review): presumably these sentences are also part of the full Brown
# corpus the tagger was trained on, so this is not a held-out score - confirm.
baseline_tagger.evaluate(brown_tagged_sents)
  • 1

换用news类别的句子评估后,bigram标注器的得分有所提高;但这些句子同样包含在训练数据(整个Brown语料)中,并不是真正未见过的新数据。

13

# Exercise 13: %-style formatting with integer (%i) and string (%s) fields.
ymd = (2021, 3, 21)
print("date:%i-%i-%i" % ymd)
print("date:%s/%s/%s" % tuple(str(part) for part in ymd))
  • 1
  • 2

14

# Exercise 14: build a case-insensitive word-frequency dictionary by hand.
words = brown.words()
fd_dic = {}
for token in words:
    key = token.lower()
    # dict.get supplies 0 the first time a word is seen.
    fd_dic[key] = fd_dic.get(key, 0) + 1
fd_dic
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
# Most frequent words first; ties keep their insertion order.
sorted(fd_dic.items(), key=lambda pair: -pair[1])
  • 1

15

# Exercise 15: list the distinct tags of the universal tagset in Brown.
words = brown.tagged_words(tagset='universal')
{tag for (_, tag) in words}
  • 1
  • 2
# Collect every word form tagged NOUN, then count the tokens that become one
# of those nouns once their last character is dropped.
# NOTE(review): the last character is dropped without checking that it is an
# 's', so this is only a rough proxy for "plural forms" - confirm the intent.
n_words = {word for (word, tag) in words if tag == 'NOUN'}
n_dic = {}
for token in brown.words():
    if len(token) <= 2:
        continue
    stem = token[:-1]
    if stem in n_words:
        n_dic[stem] = n_dic.get(stem, 0) + 1
sorted(n_dic.items(), key=lambda pair: -pair[1])

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
# Count how many distinct tags each (lower-cased) word receives and list the
# words with the most tags first.
lowered_pairs = ((w.lower(), tag) for (w, tag) in words)
cfd = nltk.ConditionalFreqDist(lowered_pairs)
count_dic = {word: len(cfd[word]) for word in cfd.conditions()}
sorted(count_dic.items(), key=lambda pair: -pair[1])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
# Count how often each tag occurs in Brown (default tagset), most frequent
# first.
count_tag = {}
words = brown.tagged_words()
for (_, label) in words:
    count_tag[label] = count_tag.get(label, 0) + 1
sorted(count_tag.items(), key=lambda pair: -pair[1])
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
# Count which universal tags directly follow a NOUN, most frequent first.
words = brown.tagged_words(tagset='universal')
count_tags = {}
# Walk the corpus once while remembering the previous tag. Indexing
# words[i + 1] would raise IndexError if the final token were a NOUN, and
# random access on the corpus view is much slower than plain iteration.
previous_tag = None
for (_, tag) in words:
    if previous_tag == 'NOUN':
        count_tags[tag] = count_tags.get(tag, 0) + 1
    previous_tag = tag
sorted(count_tags.items(), key=lambda item: item[1], reverse=True)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10

16

# Exercise 16: lookup tagger that maps every word to its most frequent tag,
# with no backoff.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) instead of every token:
# the resulting dict is the same, but max() runs once per word type rather
# than once per corpus token.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)
  • 1
  • 2
  • 3
  • 4
  • 5
# Same lookup tagger as before, now backing off to a default 'NN' tag for
# words the model does not cover.
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) instead of every token:
# the resulting dict is the same, but max() runs once per word type rather
# than once per corpus token.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)
  • 1
  • 2
  • 3
  • 4
  • 5

18

# Exercise 18: print each tag's share (in percent) of all counted tags.
# The total is loop-invariant, so compute it once instead of per iteration.
total_count = sum(count_tags.values())
for (k, v) in count_tags.items():
    print(k, ":", (v / total_count) * 100)
  • 1
  • 2
# Find the ambiguous words: (lower-cased) words that occur with more than
# one tag.
cfd = nltk.ConditionalFreqDist(
    (w.lower(), tag) for (w, tag) in words)
count_dic = {}
for word in cfd.conditions():
    if len(cfd[word]) > 1:
        count_dic[word] = len(cfd[word])
# The original divided the number of ambiguous word *types* by the number of
# *tokens*, which mixes two different units. Report each ratio against its
# own denominator instead.
type_total = len(cfd.conditions())
token_total = len(words)
ambiguous_tokens = sum(cfd[word].N() for word in count_dic)
print((len(count_dic) / type_total) * 100, "%", "of word types are ambiguous")
print((ambiguous_tokens / token_total) * 100, "%", "of word tokens are ambiguous words")
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7

20

# Exercise 20: alphabetically sorted list of the distinct words tagged MD.
words = brown.tagged_words()
# De-duplicate first and sort afterwards: wrapping an already sorted list in
# set() (as the original did) discards the ordering again.
w_li = sorted({w.lower() for (w, t) in words if t == 'MD'})
print(w_li)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
# Print three-word phrases of the form preposition + determiner + noun
# (e.g. "in the lab").
# `words` uses the Brown tagset here, where prepositions are 'IN' and
# articles/determiners are 'AT'/'DT'; the tags 'P' and 'DET' tested by the
# original do not occur in that tagset, so nothing ever matched.
# A sliding window over a single pass also avoids reading past the end of
# the corpus with words[i + 2].
before_last = last = None
for current in words:
    if (before_last is not None
            and before_last[1] == 'IN'
            and last[1] in ('AT', 'DT')
            and current[1] == 'NN'):
        # Print all three words, not just the first (word, tag) tuple.
        print(before_last[0], last[0], current[0])
    before_last, last = last, current
  • 1
  • 2
  • 3

21

# Exercise 21: collect the distinct words that directly precede
# adore/love/like/prefer.
ws = []
# Remember the previous word while iterating. words[i - 1] with i == 0 would
# silently wrap around to the last token of the corpus.
previous_word = None
for (w, _) in words:
    lowered = w.lower()
    if previous_word is not None and lowered in ('adore', 'love', 'like', 'prefer'):
        ws.append(previous_word)
    previous_word = lowered
set(ws)
  • 1
  • 2
  • 3
  • 4
  • 5

24

# Exercise 24: split the tagged Brown sentences 70% / 30% into training and
# test data.
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()

train_full_size = int(len(brown_tagged_sents) * 0.7)
train_sents, test_sents = (
    brown_tagged_sents[:train_full_size],
    brown_tagged_sents[train_full_size:],
)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
# Unigram tagger without backoff, scored on the held-out sentences.
tagger = nltk.UnigramTagger(train=train_sents)
tagger.evaluate(test_sents)
  • 1
  • 2
# Bigram tagger without backoff, scored on the held-out sentences.
tagger = nltk.BigramTagger(train=train_sents)
tagger.evaluate(test_sents)
  • 1
  • 2
# Trigram tagger without backoff, scored on the held-out sentences.
tagger = nltk.TrigramTagger(train=train_sents)
tagger.evaluate(test_sents)
  • 1
  • 2

在没有回退标注器的情况下,随着n增大(unigram→bigram→trigram),标注器在测试集上的准确率逐渐下降:上下文越长,测试集中出现训练时未见过的上下文的概率越大(数据稀疏问题)。

25
加载人民日报2014语料

# Read the corpus file into a list of lines (one entry per line, newline
# characters kept).
with open(r'E:\laptop\研一\2014_corpus.txt', encoding='utf8') as corpus_file:
    corpus = list(corpus_file)
  • 1
  • 2
# Split the corpus lines into tagged sentences: each line becomes a list of
# (word, tag) tuples parsed from space-separated "word/tag" tokens.
# Only the first MAX_SENTS lines are used, the same number of lines the
# original counter loop processed.
MAX_SENTS = 19999
tagged_sents = []
for sent in corpus[:MAX_SENTS]:
    tagged_sent = []
    # Strip the line terminator before splitting so it can never end up
    # glued to the tag of the last token on the line.
    for w in sent.rstrip('\r\n').split(' '):
        parts = w.split('/')
        # Keep only well-formed "word/tag" tokens; empty strings and tokens
        # with zero or several slashes are skipped.
        if len(parts) == 2:
            tagged_sent.append(tuple(parts))
    tagged_sents.append(tagged_sent)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
# Train a backoff chain on 70% of the sentences (bigram -> unigram ->
# default tag 'n') and score it on the remaining 30%.
size = int(len(tagged_sents) * 0.7)
train, test = tagged_sents[:size], tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train=train, backoff=t0)
t2 = nltk.BigramTagger(train=train, backoff=t1)
t2.evaluate(test)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
# Segment a Chinese passage with jieba and tag the resulting words with t2.
sample_text = 'PFR语料库是对人民日报1998年上半年的纯文本语料进行了词语切分和词性标注制作而成的,严格按照人民日报的日期、版序、文章顺序编排的。文章中的每个词语都带有词性标记。'
t2.tag(jieba.lcut(sample_text))
  • 1
# Rebind t1 to a unigram tagger trained on the same data but without any
# backoff tagger.
t1 = nltk.UnigramTagger(train)
  • 1

26

%matplotlib inline
def perform(data, test):
    """Train a unigram tagger on *data* and return its score on *test*.

    Words not seen in *data* fall back to the default tag 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    tagger = nltk.UnigramTagger(train=data, backoff=fallback)
    score = tagger.evaluate(test)
    return score
def display():
    """Plot unigram-tagger accuracy against training-set size.

    The last 5000 sentences of ``tagged_sents`` are held out as test data.
    The tagger is trained on the first 1000, 2000, ... 15000 sentences of
    the remaining data and its score is plotted for every size.
    """
    held_out = 5000
    sizes = range(1, 16)
    test = tagged_sents[-held_out:]
    # Draw training data only from the sentences in front of the test slice,
    # so the largest training set can never overlap the held-out sentences.
    train_pool = tagged_sents[:-held_out]
    perfs = [perform(train_pool[:size * 1000], test) for size in sizes]
    plt.plot(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

27

# Strip the tags from the test sentences, keeping only the words, so they
# can be re-tagged by the trained tagger.
orl_sent = []
for sent in test:
    plain_words = [word for (word, tag) in sent if word is not None and tag is not None]
    orl_sent.append(plain_words)
  • 1
  • 2
# Predicted tags: re-tag the untagged test sentences with t2, flattened into
# one list. No filtering here, so the list stays aligned token by token with
# the gold tags (ConfusionMatrix needs both lists to have the same length).
test_tags = [tag for sent in orl_sent for (word, tag) in t2.tag(sent)]
# Gold tags: the outer loop over sentences must come first. With the clauses
# reversed, the comprehension iterated over a stale global `sent` instead of
# the sentences in `test`. The filter mirrors the one used to build orl_sent.
gold_tags = [tag for sent in test for (word, tag) in sent
             if (word is not None and tag is not None)]
nltk.ConfusionMatrix(gold_tags, test_tags)
  • 1
  • 2
  • 3

31

%matplotlib inline
def perform(data, test):
    """Train a unigram tagger on *data* and return its score on *test*.

    Words not seen in *data* fall back to the default tag 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    tagger = nltk.UnigramTagger(train=data, backoff=fallback)
    score = tagger.evaluate(test)
    return score
def display():
    """Plot unigram-tagger accuracy against training-set size (log x-axis).

    The last 5000 sentences of ``tagged_sents`` are held out as test data.
    The tagger is trained on the first 1000, 2000, ... 15000 sentences of
    the remaining data and its score is plotted for every size.
    """
    held_out = 5000
    sizes = range(1, 16)
    test = tagged_sents[-held_out:]
    # Draw training data only from the sentences in front of the test slice,
    # so the largest training set can never overlap the held-out sentences.
    train_pool = tagged_sents[:-held_out]
    perfs = [perform(train_pool[:size * 1000], test) for size in sizes]
    plt.semilogx(sizes, perfs, '-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14

32

# Exercise 32: rebuild the bigram -> unigram -> default('n') backoff chain on
# a 70% / 30% split and score it on the test part.
size = int(len(tagged_sents) * 0.7)
train, test = tagged_sents[:size], tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train=train, backoff=t0)
t2 = nltk.BigramTagger(train=train, backoff=t1)
t2.evaluate(test)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
# Train a Brill (transformation-based) tagger on top of the n-gram chain.
# BrillTaggerTrainer has to be instantiated with an initial tagger and a
# list of rule templates; calling train() directly on the class, as the
# original did, fails because no trainer instance is supplied.
brill_trainer = nltk.BrillTaggerTrainer(
    initial_tagger=t2,
    templates=nltk.tag.brill.brill24(),
)
t3 = brill_trainer.train(train_sents=train, max_rules=200, min_score=2, min_acc=None)
t3.evaluate(test)
  • 1
  • 2
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/凡人多烦事01/article/detail/459637
推荐阅读
相关标签
  

闽ICP备14008679号