一 信息提取







  1. import nltk, re, pprint
  def ie_preprocess(document):
      """Segment, tokenize and POS-tag a raw text document.

      Args:
          document: Raw text handed to ``nltk.sent_tokenize``.

      Returns:
          A list with one entry per sentence; each entry is the result of
          ``nltk.pos_tag`` applied to that sentence's tokens.
      """
      sentences = nltk.sent_tokenize(document)  # sentence segmentation
      tokenized = [nltk.word_tokenize(sent) for sent in sentences]  # word tokenization
      return [nltk.pos_tag(tokens) for tokens in tokenized]  # part-of-speech tagging

二 分块









  # Chunk a hand-tagged sentence with a single NP rule.
  sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
              ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
  # NP = optional determiner, any number of adjectives, then one noun.
  grammar = "NP: {<DT>?<JJ>*<NN>}"
  cp = nltk.RegexpParser(grammar)
  result = cp.parse(sentence)
  print(result)
  # Output:
  # (S
  #   (NP the/DT little/JJ yellow/JJ dog/NN)
  #   barked/VBD
  #   at/IN
  #   (NP the/DT cat/NN))




  # NP grammar with two alternative chunk rules; the trailing "#..." notes
  # live inside the grammar string itself.
  grammar = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>} #匹配一个可选的限定词或所有格代名词
        {<NNP>+} #匹配一个或多个专有名词
  """
  cp = nltk.RegexpParser(grammar)
  sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"), ("her", "PP$"),
              ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
  print(cp.parse(sentence))
  # Output:
  # (S
  #   (NP Rapunzel/NNP)
  #   let/VBD
  #   down/RP
  #   (NP her/PP$ long/JJ golden/JJ hair/NN))
  nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
  # A rule matching two consecutive nouns, applied to three consecutive
  # nouns, chunks only the first two.
  grammar = "NP: {<NN><NN>}"
  cp = nltk.RegexpParser(grammar)
  print(cp.parse(nouns))
  # Output:
  # (S (NP money/NN market/NN) fund/NN)



  # Search the tagged Brown corpus for "verb to verb" sequences.
  cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
  brown = nltk.corpus.brown
  for sent in brown.tagged_sents():
      tree = cp.parse(sent)
      for subtree in tree.subtrees():
          # subtrees() also yields the root; keep only the CHUNK nodes.
          if subtree.label() == 'CHUNK':
              print(subtree)
  # Output:
  # (CHUNK combined/VBN to/TO achieve/VB)
  # (CHUNK continue/VB to/TO place/VB)
  # (CHUNK serve/VB to/TO protect/VB)
  # (CHUNK wanted/VBD to/TO wait/VB)
  # (CHUNK allowed/VBN to/TO place/VB)
  # ......




  # Chinking: first chunk every token into one NP ({...}), then cut the
  # VBD/IN runs back out of it (}...{).
  grammar = r"""
    NP:
      {<.*>+}
      }<VBD|IN>+{"""
  sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
              ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
  cp = nltk.RegexpParser(grammar)
  print(cp.parse(sentence))
  # Output:
  # (S
  #   (NP the/DT little/JJ yellow/JJ dog/NN)
  #   barked/VBD
  #   at/IN
  #   (NP the/DT cat/NN))





B和I标记后面带有块类型作为后缀,如B-NP、I-NP。


三 开发和评估分块器




  from nltk.corpus import conll2000

  # Show the chunk tree of training sentence 99.
  print(conll2000.chunked_sents('train.txt')[99])
  # Output:
  # (S
  #   (PP Over/IN)
  #   (NP a/DT cup/NN)
  #   (PP of/IN)
  #   (NP coffee/NN)
  #   ,/,
  #   (NP Mr./NNP Stone/NNP)
  #   (VP told/VBD)
  #   (NP his/PRP$ story/NN)
  #   ./.)
包含3种分块类型:NP分块、VP分块、PP分块
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])  # keep only the NP chunks


  # Baseline: an empty grammar, i.e. a parser that creates no chunks.
  cp = nltk.RegexpParser("")
  test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
  print(cp.evaluate(test_sents))  # evaluation scores
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 43.4%%
  #   Precision: 0.0%%
  #   Recall: 0.0%%
  #   F-Measure: 0.0%%
  # Naive regex chunker: any run of tags starting with C, D, J, N or P is an NP.
  grammar = r"NP: {<[CDJNP].*>+}"
  cp = nltk.RegexpParser(grammar)
  # NOTE(review): no chunk_types filter here, so the gold trees keep every
  # chunk type; the scores below were produced with this exact call.
  test_sents = conll2000.chunked_sents('test.txt')
  print(cp.evaluate(test_sents))  # evaluation scores
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 62.5%%
  #   Precision: 70.6%%
  #   Recall: 38.5%%
  #   F-Measure: 49.8%%

  # Use the training corpus to find the most likely chunk tag (I, O or B)
  # for each part-of-speech tag. A unigram tagger is reused as a chunker:
  # instead of predicting a POS tag for each word, it predicts a chunk tag
  # for each POS tag.
  class UnigramChunker(nltk.ChunkParserI):
      """Chunker that maps each POS tag to a chunk tag with a UnigramTagger."""

      def __init__(self, train_sents):
          """Train the tagger on (pos, chunktag) pairs from chunk trees.

          Args:
              train_sents: Iterable of chunk trees accepted by
                  ``nltk.chunk.tree2conlltags``.
          """
          train_data = [
              [(pos, chunktag)
               for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
              for sent in train_sents
          ]
          self.tagger = nltk.UnigramTagger(train_data)

      def parse(self, sentence):
          """Chunk a POS-tagged sentence.

          Args:
              sentence: Sequence of (word, pos) pairs.

          Returns:
              The tree built by ``nltk.chunk.conlltags2tree``.
          """
          pos_tags = [pos for (_word, pos) in sentence]
          tagged_pos_tags = self.tagger.tag(pos_tags)
          chunktags = [chunktag for (_pos, chunktag) in tagged_pos_tags]
          # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
          conlltags = [(word, pos, chunktag)
                       for ((word, pos), chunktag) in zip(sentence, chunktags)]
          return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree
  # Train and evaluate on the CoNLL-2000 chunking corpus (NP chunks only).
  test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
  train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
  unigram_chunker = UnigramChunker(train_sents)
  print(unigram_chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 92.9%%
  #   Precision: 79.9%%
  #   Recall: 86.8%%
  #   F-Measure: 83.2%%

  # Show which chunk tag the unigram chunker assigns to every POS tag that
  # occurs in the training data.
  postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves()))
  print(unigram_chunker.tagger.tag(postags))
  # Output:
  # [(u'#', u'B-NP'), (u'$', u'B-NP'), (u"''", u'O'), (u'(', u'O'),
  #  (u')', u'O'), (u',', u'O'), (u'.', u'O'), (u':', u'O'), (u'CC', u'O'),
  #  (u'CD', u'I-NP'), (u'DT', u'B-NP'), (u'EX', u'B-NP'), (u'FW', u'I-NP'),
  #  (u'IN', u'O'), (u'JJ', u'I-NP'), (u'JJR', u'B-NP'), (u'JJS', u'I-NP'),
  #  (u'MD', u'O'), (u'NN', u'I-NP'), (u'NNP', u'I-NP'), (u'NNPS', u'I-NP'),
  #  (u'NNS', u'I-NP'), (u'PDT', u'B-NP'), (u'POS', u'B-NP'), (u'PRP', u'B-NP'),
  #  (u'PRP$', u'B-NP'), (u'RB', u'O'), (u'RBR', u'O'), (u'RBS', u'B-NP'),
  #  (u'RP', u'O'), (u'SYM', u'O'), (u'TO', u'O'), (u'UH', u'O'), (u'VB', u'O'),
  #  (u'VBD', u'O'), (u'VBG', u'O'), (u'VBN', u'O'), (u'VBP', u'O'),
  #  (u'VBZ', u'O'), (u'WDT', u'B-NP'), (u'WP', u'B-NP'), (u'WP$', u'B-NP'),
  #  (u'WRB', u'O'), (u'``', u'O')]
  # Use the training corpus to find the most likely chunk tag (I, O or B)
  # for each part-of-speech tag. A bigram tagger is reused as a chunker:
  # instead of predicting a POS tag for each word, it predicts a chunk tag
  # for each POS tag.
  class BigramChunker(nltk.ChunkParserI):
      """Chunker that maps each POS tag to a chunk tag with a BigramTagger."""

      def __init__(self, train_sents):
          """Train the tagger on (pos, chunktag) pairs from chunk trees.

          Args:
              train_sents: Iterable of chunk trees accepted by
                  ``nltk.chunk.tree2conlltags``.
          """
          train_data = [
              [(pos, chunktag)
               for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
              for sent in train_sents
          ]
          self.tagger = nltk.BigramTagger(train_data)

      def parse(self, sentence):
          """Chunk a POS-tagged sentence.

          Args:
              sentence: Sequence of (word, pos) pairs.

          Returns:
              The tree built by ``nltk.chunk.conlltags2tree``.
          """
          pos_tags = [pos for (_word, pos) in sentence]
          tagged_pos_tags = self.tagger.tag(pos_tags)
          chunktags = [chunktag for (_pos, chunktag) in tagged_pos_tags]
          # Re-attach the predicted IOB chunk tag to every (word, pos) pair.
          conlltags = [(word, pos, chunktag)
                       for ((word, pos), chunktag) in zip(sentence, chunktags)]
          return nltk.chunk.conlltags2tree(conlltags)  # convert to a chunk tree
  bigram_chunker = BigramChunker(train_sents)
  print(bigram_chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 93.3%%
  #   Precision: 82.3%%
  #   Recall: 86.8%%
  #   F-Measure: 84.5%%




安装 megam,用于训练 maxent(最大熵)分类器

  class ConsecutiveNPChunkTagger(nltk.TaggerI):
      """Classifier-based tagger that assigns a chunk tag to each token in turn."""

      def __init__(self, train_sents):
          """Build feature sets from the training data and train the classifier.

          Args:
              train_sents: Iterable of sentences, each a sequence of
                  ((word, pos), chunktag) pairs.
          """
          train_set = []
          for tagged_sent in train_sents:
              untagged_sent = nltk.tag.untag(tagged_sent)
              history = []  # chunk tags already seen in this sentence
              for i, (_word, tag) in enumerate(tagged_sent):
                  featureset = npchunk_features(untagged_sent, i, history)
                  train_set.append((featureset, tag))
                  history.append(tag)
          # Maximum-entropy classifier trained with the megam backend.
          self.classifier = nltk.MaxentClassifier.train(
              train_set, algorithm='megam', trace=0)

      def tag(self, sentence):
          """Assign a chunk tag to every token of ``sentence``.

          Args:
              sentence: Sequence of (word, pos) pairs.

          Returns:
              A list of ((word, pos), chunktag) pairs.
          """
          history = []
          for i, _token in enumerate(sentence):
              featureset = npchunk_features(sentence, i, history)
              history.append(self.classifier.classify(featureset))
          # Materialize the pairs so the result is a list on Python 3 as well.
          return list(zip(sentence, history))
  class ConsecutiveNPChunker(nltk.ChunkParserI):
      """Chunk parser wrapper around ConsecutiveNPChunkTagger."""

      def __init__(self, train_sents):
          """Convert chunk trees to tagger training data and train the tagger.

          Args:
              train_sents: Iterable of chunk trees accepted by
                  ``nltk.chunk.tree2conlltags``.
          """
          # Reshape (word, pos, chunktag) triples into ((word, pos), chunktag).
          tagged_sents = [
              [((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
              for sent in train_sents
          ]
          self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

      def parse(self, sentence):
          """Chunk a POS-tagged sentence.

          Args:
              sentence: Sequence of (word, pos) pairs.

          Returns:
              The tree built by ``nltk.chunk.conlltags2tree``.
          """
          tagged_sent = self.tagger.tag(sentence)
          conlltags = [(w, t, c) for ((w, t), c) in tagged_sent]
          return nltk.chunk.conlltags2tree(conlltags)
  def npchunk_features(sentence, i, history):
      """Return the features for token ``i``: only its own POS tag.

      Args:
          sentence: Sequence of (word, pos) pairs.
          i: Index of the token to describe.
          history: Chunk tags assigned so far (unused by this feature set).

      Returns:
          A dict with the single key ``"pos"``.
      """
      _word, pos = sentence[i]
      return {"pos": pos}

  chunker = ConsecutiveNPChunker(train_sents)
  print(chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 92.9%%
  #   Precision: 79.9%%
  #   Recall: 86.7%%
  #   F-Measure: 83.2%%

  def npchunk_features(sentence, i, history):
      """Return the features for token ``i``: its POS tag and the previous one.

      Adding the previous POS tag lets the classifier model the interaction
      between adjacent tags.

      Args:
          sentence: Sequence of (word, pos) pairs.
          i: Index of the token to describe.
          history: Chunk tags assigned so far (unused by this feature set).

      Returns:
          A dict with keys ``"pos"`` and ``"prevpos"``; ``"prevpos"`` is
          ``"<START>"`` for the first token.
      """
      _word, pos = sentence[i]
      if i == 0:
          prevpos = "<START>"
      else:
          _prevword, prevpos = sentence[i - 1]
      return {"pos": pos, "prevpos": prevpos}

  chunker = ConsecutiveNPChunker(train_sents)
  print(chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 93.7%%
  #   Precision: 82.1%%
  #   Recall: 87.2%%
  #   F-Measure: 84.5%%

  def npchunk_features(sentence, i, history):
      """Return the features for token ``i``: POS tag, word and previous POS tag.

      Args:
          sentence: Sequence of (word, pos) pairs.
          i: Index of the token to describe.
          history: Chunk tags assigned so far (unused by this feature set).

      Returns:
          A dict with keys ``"pos"``, ``"word"`` and ``"prevpos"``;
          ``"prevpos"`` is ``"<START>"`` for the first token.
      """
      word, pos = sentence[i]
      if i == 0:
          prevpos = "<START>"
      else:
          _prevword, prevpos = sentence[i - 1]
      return {"pos": pos, "word": word, "prevpos": prevpos}
  chunker = ConsecutiveNPChunker(train_sents)
  print(chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 94.2%%
  #   Precision: 83.2%%
  #   Recall: 88.3%%
  #   F-Measure: 85.7%%

  def npchunk_features(sentence, i, history):
      """Return look-ahead, paired and contextual features for token ``i``.

      Args:
          sentence: Sequence of (word, pos) pairs.
          i: Index of the token to describe.
          history: Chunk tags assigned so far (unused by this feature set).

      Returns:
          A dict with the token's POS tag and word, the neighbouring POS tags
          (``"<START>"`` / ``"<END>"`` at the sentence edges), the two
          adjacent-tag pairs, and the ``tags_since_dt`` summary.
      """
      word, pos = sentence[i]
      if i == 0:
          prevpos = "<START>"
      else:
          _prevword, prevpos = sentence[i - 1]
      if i == len(sentence) - 1:
          nextpos = "<END>"
      else:
          _nextword, nextpos = sentence[i + 1]
      return {
          "pos": pos,
          "word": word,
          "prevpos": prevpos,
          "nextpos": nextpos,
          "prevpos+pos": "%s+%s" % (prevpos, pos),
          "pos+nextpos": "%s+%s" % (pos, nextpos),
          "tags-since-dt": tags_since_dt(sentence, i),
      }
  def tags_since_dt(sentence, i):
      """Summarize the POS tags seen since the most recent determiner before ``i``.

      Args:
          sentence: Sequence of (word, pos) pairs.
          i: Index of the current token; only tokens before it are inspected.

      Returns:
          The distinct POS tags following the last ``"DT"`` in
          ``sentence[:i]`` (or all of them if there is no ``"DT"``), sorted
          and joined with ``"+"``; an empty string if there are none.
      """
      tags = set()
      for _word, pos in sentence[:i]:
          if pos == "DT":
              tags = set()  # a determiner restarts the window
          else:
              tags.add(pos)
      return '+'.join(sorted(tags))
  chunker = ConsecutiveNPChunker(train_sents)
  print(chunker.evaluate(test_sents))
  # Output:
  # ChunkParse score:
  #   IOB Accuracy: 96.0%%
  #   Precision: 88.8%%
  #   Recall: 91.1%%
  #   F-Measure: 89.9%%

四 语言结构中的递归




  # Cascaded grammar: later rules refer to chunks built by earlier ones.
  grammar = r"""
    NP: {<DT|JJ|NN.*>+}
    PP: {<IN><NP>}
    VP: {<VB.*><NP|PP|CLAUSE>+$}
    CLAUSE: {<NP><VP>}
    """
  cp = nltk.RegexpParser(grammar)
  sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
              ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
  print(cp.parse(sentence))
  # Output ("saw" is not recognized as the head of a VP):
  # (S
  #   (NP Mary/NN)
  #   saw/VBD
  #   (CLAUSE
  #     (NP the/DT cat/NN)
  #     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
  # Run the rule cascade twice so the outer VP/CLAUSE can be built.
  cp = nltk.RegexpParser(grammar, loop=2)
  print(cp.parse(sentence))
  # Output:
  # (S
  #   (CLAUSE
  #     (NP Mary/NN)
  #     (VP
  #       saw/VBD
  #       (CLAUSE
  #         (NP the/DT cat/NN)
  #         (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))





五 命名实体识别

命名实体识别(NER)系统的目标是识别所有文字提及的命名实体。这可以分解成两个子任务:确定NE的边界和确定其类型。命名实体识别经常是信息提取中关系识别的前奏,也有助于其他任务。例如:在问答系统(QA)中,我们试图提高信息检索的精确度,不返回整个页面,而只返回包含用户问题答案的那部分。大多数QA系统利用标准信息检索返回的文件,然后尝试分离文档中包含答案的最小文本分段(参见原书 P303)。例如问题:Who was the first President of the US? 被检索的文档中包含答案,但我们想得到的答案应该是 X was the first President of the US 的形式,其中X不仅是一个名词短语,也是一个PER类型的命名实体。




  sent = nltk.corpus.treebank.tagged_sents()[22]
  # With binary=True, named entities are only labelled NE.
  print(nltk.ne_chunk(sent, binary=True))
  # Output (truncated):
  # (S
  #   The/DT
  #   (NE U.S./NNP)
  #   is/VBZ
  #   one/CD
  #   of/IN
  # Without binary=True the entities get category labels such as
  # PERSON, ORGANIZATION and GPE.
  print(nltk.ne_chunk(sent))
  # Output (excerpt):
  # (S
  #   The/DT
  #   (GPE U.S./NNP)
  #   is/VBZ
  #   ......
  #   (PERSON Brooke/NNP T./NNP Mossman/NNP)
  #   ,/,
  #   a/DT
  #   professor/NN
  #   of/IN
  #   pathlogy/NN
  #   at/IN
  #   the/DT
  #   (ORGANIZATION University/NNP)
  #   of/IN
  #   (PERSON Vermont/NNP College/NNP)
  #   of/IN
  #   (GPE Medicine/NNP)

六 关系抽取


方法之一是首先寻找所有(X, a, Y)形式的三元组,其中X和Y是指定类型的命名实体,a表示X和Y之间关系的字符串

  # Match the word "in" as the relation filler; the negative lookahead
  # rejects cases where a word ending in "ing" follows.
  IN = re.compile(r'.*\bin\b(?!\b.+ing)')
  for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
      for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
          print(nltk.sem.relextract.rtuple(rel))
  # Output:
  # [ORG: u'WHYY'] u'in' [LOC: u'Philadelphia']
  # [ORG: u'McGlashan & Sarrail'] u'firm in' [LOC: u'San Mateo']
  # [ORG: u'Freedom Forum'] u'in' [LOC: u'Arlington']
  # [ORG: u'Brookings Institution'] u', the research group in' [LOC: u'Washington']
  # [ORG: u'Idealab'] u', a self-described business incubator based in' [LOC: u'Los Angeles']
  # [ORG: u'Open Text'] u', based in' [LOC: u'Waterloo']

