赞
踩
此模型是基于CRF的命名实体识别模型,输入一段新闻数据,然后预测输出这段文字中按BIOE规则标注的时间、地点、人物、机构名称等实体。
N-gram简介
它的第一个特点是某个词的出现依赖于其他若干个词,第二个特点是我们获得的信息越多,预测越准确。我想说,我们每个人的大脑中都有一个N-gram模型,而且是在不断完善和训练的。我们的见识与经历,都在丰富着我们的阅历,增强着我们的联想能力。
N-gram模型是一种语言模型(Language Model,LM),语言模型是一个基于概率的判别模型,它的输入是一句话(单词的顺序序列),输出是这句话的概率,即这些单词的联合概率(joint probability)。
1、建立文件夹 model 和 training,model 用于存放模型、training 用于存放训练集
将rmrb199801.pkl放入training中,并建立两个py文件,train和predict
2、编辑train.py
(1)、库的导入
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import joblib
from nltk import trigrams
import pickle
(2)、读取训练数据
# Reading of the training data
def load_pickle(path):
    """Load and return a pickled object from *path*.

    path: filesystem path of the pickle file (here the annotated corpus).
    Returns: the deserialised object stored in the file.
    """
    # NOTE(review): pickle.load must only be used on trusted files —
    # unpickling untrusted data can execute arbitrary code.
    with open(path, 'rb') as f:
        data = pickle.load(f)
    # Fixed the garbled progress message (was: f'\rloaded{path} down!').
    print(f'\rloaded {path} done!')
    return data
(3)、将词性标注转成对应的实体标注
将词性标注转成对应的实体标注
# POS tag -> entity category used by the corpus:
# t = time, nr = person, ns = location, nt = organization.
_maps = {
    u't': u't',
    u'nr': u'PER',
    u'ns': u'LOC',
    u'nt': u'ORG',
}


def pos_to_tag(p):
    """Translate a POS tag into its entity tag; unknown tags map to u'O'."""
    return _maps.get(p) or u'O'
(4)、为实体按照BIOE模式加上标签
# Attach the positional prefix ('B', 'I' or 'E') to an entity tag,
# following the BIOE scheme; the non-entity tag 'O' is returned as-is.
def tag_perform(tag, index):
    """Return the BIOE-decorated tag for a character position.

    index == 0 marks the first character of a word (B_ prefix),
    index == -1 the last character (E_ prefix); any other position
    becomes an inside character (I_ prefix).
    """
    if tag == u'O':
        return tag
    if index == 0:
        return u'B_{}'.format(tag)
    if index == -1:
        return u'E_{}'.format(tag)
    return u'I_{}'.format(tag)
(5)、 初始化字序列、实体标签序列
## Initialise the character sequence and the entity-tag sequence
def init_sequence(training_data):
    """Build flat, parallel character / BIOE-tag sequences from the corpus.

    training_data: path of the pickled corpus; each line holds
    space-separated 'word/pos' tokens (e.g. '迈向/v 充满/v ...').
    Returns (one_list_word, one_list_tag): the character sequence wrapped
    in '<BOS>'/'<EOS>' sentinels, and the per-character tag sequence
    (without sentinels — the trigram windowing later realigns the two).
    """
    text = load_pickle(training_data)
    # Tokenise every non-blank line into its 'word/pos' items.
    token_lines = [line.split() for line in text if line.strip()]
    # Per line: surface words and their entity tags (POS mapped via pos_to_tag).
    word_lines = [[tok.split(u'/')[0] for tok in toks] for toks in token_lines]
    tag_lines = [[pos_to_tag(tok.split(u'/')[1]) for tok in toks]
                 for toks in token_lines]

    flat_chars = []
    flat_tags = []
    for words, tags in zip(word_lines, tag_lines):
        for word, tag in zip(words, tags):
            last = len(word) - 1
            for pos, ch in enumerate(word):
                flat_chars.append(ch)
                # The last character of a word is passed as index -1 so
                # tag_perform emits the E_ variant.  A one-character word
                # therefore gets E_ only (no B_) — this reproduces the
                # original comprehension's behaviour exactly.
                flat_tags.append(tag_perform(tag, -1 if pos == last else pos))

    # Sentinels are added to the character sequence only.
    one_list_word = ['<BOS>'] + flat_chars + ['<EOS>']
    return one_list_word, flat_tags
(6)、对每个单字抽取其此模板定义的特征
# Extract the template-defined features for every single character
def extract_feature(word_grams):
    """Convert trigram windows into CRF feature dictionaries.

    word_grams: list of "sentences", each a list of (w-1, w, w+1) windows.
    Returns a parallel list of lists of feature dicts following the
    template: the three unigrams, the two adjacent bigrams, and a bias.
    """
    features = []
    for sentence in word_grams:
        sentence_features = []
        for gram in sentence:
            left, mid, right = gram[0], gram[1], gram[2]
            sentence_features.append({
                u'w-1': left,
                u'w': mid,
                u'w+1': right,
                u'w-1:w': left + mid,
                u'w:w+1': mid + right,
                u'bias': 1.0,
            })
        features.append(sentence_features)
    return features
(7)、训练并生成模型,保存
def train(one_list_word, one_list_tag, model_path):
    """Train the CRF NER model, save it, and report F1 on held-out data.

    one_list_word: flat character sequence (with <BOS>/<EOS> sentinels).
    one_list_tag:  parallel per-character BIOE tag sequence.
    model_path:    destination file for the fitted model.
    """
    # Slide a width-3 window over the characters (the n-gram step, n = 3).
    windows = list(trigrams(one_list_word))
    # First 90% of the sequence is the training split, the rest is test.
    split_at = int(len(windows) * 0.9)
    x_train, x_test = windows[:split_at], windows[split_at:]
    y_train, y_test = one_list_tag[:split_at], one_list_tag[split_at:]
    # Wrap each split as a single "sentence" of CRF feature dicts.
    x_train_fitted = extract_feature([x_train])
    x_test_fitted = extract_feature([x_test])
    y_train_fitted = [y_train]
    y_test_fitted = [y_test]
    print(f'\rStart training...')
    # L-BFGS-trained linear-chain CRF with L1/L2 regularisation.
    model = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )
    model.fit(x_train_fitted, y_train_fitted)
    # Persist the fitted model.
    joblib.dump(filename=model_path, value=model)
    print(f'\rCompleted training and saved {model_path}.')
    # Score the held-out slice, excluding the non-entity 'O' label.
    labels = list(model.classes_)
    labels.remove('O')
    y_predict = model.predict(x_test_fitted)
    f1_score = metrics.flat_f1_score(
        y_test_fitted, y_predict, average='weighted', labels=labels)
    print(f'\rF1 score in testing data:', f1_score)
    return
(8)、主函数
if __name__ == '__main__':
    # Build the flat character/tag sequences from the pickled corpus,
    # e.g. characters like ['<BOS>', '迈', '向', ...] with parallel
    # BIOE tags like ['O', 'O', ...].
    training_data = 'training/rmrb199801.pkl'
    one_list_word, one_list_tag = init_sequence(training_data)
    # Train the CRF and store the model under model/.
    model_path = 'model/CRFner.model'
    train(one_list_word, one_list_tag, model_path)
3、运行train.py
4、predict.py
# -*- coding: utf-8 -*-
'''
CRF-based named-entity recognition: model prediction script.
'''
from itertools import chain

import joblib
from nltk import trigrams

from train import extract_feature


## Prediction function
def predict(sent, model_path):
    """Predict entity labels for a piece of text.

    sent: the raw input string to analyse.
    model_path: path of the trained CRF model file.
    Returns (entity_name, entity_tags): merged entity strings and their
    comma-joined per-character tags; characters labelled 'O' are dropped.
    """
    # Preprocessing: flatten the input into single characters and wrap
    # it with the same sentinels used at training time.
    s = list(chain(*sent))
    word_lists = [u'<BOS>'] + s + [u'<EOS>']
    word_grams = [list(trigrams(word_lists))]
    X = extract_feature(word_grams)
    model = joblib.load(model_path)
    # predict_list holds one predicted tag per input character
    # (punctuation included).
    predict_list = model.predict(X)[0]
    # Keep only the characters whose predicted tag is not 'O'.
    entity_word_list = []
    entity_tag_list = []
    for i in range(len(predict_list)):
        if predict_list[i] != 'O':
            entity_word_list.append(s[i])
            entity_tag_list.append(predict_list[i])
    # Terminators so the merge loop below can scan without running off
    # the end of either list.
    entity_word_list.append("END")
    entity_tag_list.append("END")
    # Merge consecutive characters into whole entities: an E_ tag closes
    # the entity that began right after the previous entity ended.
    entity_name = []
    entity_tags = []
    st = 0
    for j in range(len(entity_tag_list) - 1):
        if r'E_' in entity_tag_list[j]:
            entity_name.append("".join(entity_word_list[st:j + 1]))
            entity_tags.append(",".join(entity_tag_list[st:j + 1]))
            st = j + 1
    return entity_name, entity_tags


## Print the results
def print_result(entity_name, entity_tags):
    """Print each entity and its tags in an aligned fixed-width column."""
    for i in range(len(entity_name)):
        # Pad by the GBK byte width so CJK text lines up in the console.
        # NOTE(review): .encode('gbk') raises for characters outside GBK
        # — presumably the corpus is GBK-safe; confirm for general input.
        length = 10 - len(entity_name[i].encode('gbk')) + len(entity_name[i])
        s = "Content: %-" + str(length) + "s\t,Tag: %s"
        print(s % (entity_name[i], entity_tags[i]))


if __name__ == '__main__':
    sent = input('Please input your message: ')
    model_path = 'model/CRFner.model'
    entity_name, entity_tags = predict(sent, model_path)
    print_result(entity_name, entity_tags)
运行后有一个输入框,输入一句简短的话,如:4月16日,游人在河北省南宫市南湖樱花园内拍照游玩。近日,河北省南宫市南湖樱花园内樱花绽放,吸引游客前来赏花踏青。
然后,命名实体识别就完成了
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。