赞
踩
找了大半天,在 GIT、CSDN 上都没找到一篇完整介绍用 spaCy 做 NER 的项目。这么棒的工业级框架,竟然没有详细的介绍。经本人用医疗数据初步测试,标注 1000 条数据,测试集 F1 值竟然可以达到 90%。附官网链接:https://spacy.io/
1、spacy版本号2.3.2
2、训练数据格式
# Template for the training data: a list of (text, annotations) pairs.
# START_index / END_index are placeholders for the start and end offsets of
# each entity span inside TEXT, and 'LABEL' stands for the entity type name.
TRAIN_DATA = [
    ("TEXT", {'entities': [(START_index, END_index, 'LABEL'), (START_index, END_index, 'LABEL')]}),
]
3、训练模块
import random

import spacy

# Start from a blank Chinese pipeline (use 'en' for English).
nlp = spacy.blank('zh')

# Make sure the pipeline has an NER component and that `ner` is always bound:
# create and append it when missing, otherwise reuse the existing one.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

# Register every entity label that appears in the training data.
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

# Disable every other pipe during training so that only NER is updated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(10):
        print("Statring iteration " + str(itn))
        # Reshuffle each epoch so the examples are seen in a different order.
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses,
            )
        # Loss accumulated over this iteration.
        print(losses)

# Save the trained model.
nlp.to_disk('./test_model')
4、测试模块
import spacy
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from spacy.gold import GoldParse

# Evaluation data. NOTE: this reuses TRAIN_DATA, so the scores below measure
# fit on the training examples; point `examples` at held-out data to get a
# real test score.
examples = TRAIN_DATA

# label -> [precision_sum, recall_sum, fscore_sum, accuracy_sum, doc_count].
# Kept outside the document loop so scores accumulate over every document
# (resetting it per document would report the last document only) and so the
# report below is simply empty when `examples` is empty.
totals = {}

for c, (text, annot) in enumerate(examples):
    doc_to_test = nlp(text)

    # Group the predicted entity texts by label; the inner dict drops
    # duplicates while keeping first-seen order.
    found = {}
    for ent in doc_to_test.ents:
        found.setdefault(ent.label_, {})[ent.text] = None

    # Dump this document's predictions to resume<c>.txt. The `with` block
    # closes the file, and UTF-8 is set explicitly for non-ASCII text.
    with open("resume" + str(c) + ".txt", "w", encoding="utf-8") as out:
        for label, texts in found.items():
            out.write("\n\n")
            out.write(label + ":" + "\n")
            for entity_text in texts:
                out.write(entity_text.replace('\n', '') + "\n")

    # The gold annotation depends only on the document, so build it once.
    gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))

    # Score each predicted label once per document as a token-level binary
    # task: "<label>" versus "Not <label>". Labels the model did not predict
    # in this document are not scored.
    for label in found:
        negative = 'Not ' + label
        y_true = [label if label in tag else negative for tag in gold.ner]
        y_pred = [label if token.ent_type_ == label else negative for token in doc_to_test]
        p, r, fscore, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        a = accuracy_score(y_true, y_pred)

        sums = totals.setdefault(label, [0.0, 0.0, 0.0, 0.0, 0])
        sums[0] += p
        sums[1] += r
        sums[2] += fscore
        sums[3] += a
        sums[4] += 1

# Report per-label scores averaged over the documents in which the label was
# predicted.
for label, (p_sum, r_sum, f_sum, a_sum, n_docs) in totals.items():
    print("\n For Entity " + label + "\n")
    print("Accuracy : " + str((a_sum / n_docs) * 100) + "%")
    print("Precision : " + str(p_sum / n_docs))
    print("Recall : " + str(r_sum / n_docs))
    print("F-score : " + str(f_sum / n_docs))

# 5. Model loading and inference (section heading "5、模型加载调用" in the original post)
text = "测试句子"
nlp1 = spacy.load("./test_model")
doc = nlp1(text)
for ent in doc.ents:
    print(ent.text, ent.label_)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。