当前位置:   article > 正文

基于spacy的中文命名实体识别_spacy中文实体识别

spacy中文实体识别

找了大半天,在GIT、CSDN上都没找到一篇完整介绍用spacy做NER的项目。这么棒的工业级框架,竟然没有详细的介绍。经本人用医疗数据初步测试:标注1000条数据,测试集F1值竟然可以达到90%。附官网链接 https://spacy.io/

1、spacy版本号2.3.2

2、训练数据格式

TRAIN_DATA = [ ("TEXT", {'entities': [(START_index, END_index, 'LABEL'), (START_index, END_index, 'LABEL')]})]

3、训练模块

  # Train a blank Chinese spaCy (v2.x API) pipeline containing only an NER
  # component on TRAIN_DATA, then save the trained pipeline to ./test_model.
  # TRAIN_DATA is expected to be a list of (text, {'entities': [...]}) pairs
  # defined before this snippet runs.
  import random

  import spacy

  nlp = spacy.blank('zh')  # use 'en' for English
  if 'ner' not in nlp.pipe_names:
      ner = nlp.create_pipe('ner')
      nlp.add_pipe(ner, last=True)
  else:
      # Reuse the existing component so `ner` is always bound below.
      ner = nlp.get_pipe('ner')

  # Register every entity label that occurs in the training data.
  for _, annotations in TRAIN_DATA:
      for ent in annotations.get('entities'):
          ner.add_label(ent[2])  # ent is (start, end, label)

  # Disable every other pipe so that only NER is updated during training.
  other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
  with nlp.disable_pipes(*other_pipes):
      optimizer = nlp.begin_training()
      for itn in range(10):
          print("Statring iteration " + str(itn))
          random.shuffle(TRAIN_DATA)  # shuffles TRAIN_DATA in place
          losses = {}
          for text, annotations in TRAIN_DATA:
              nlp.update(
                  [text],  # batch of texts
                  [annotations],  # batch of annotations
                  drop=0.2,  # dropout - make it harder to memorise data
                  sgd=optimizer,  # callable to update weights
                  losses=losses,
              )
          print(losses)  # losses collected over this iteration

  nlp.to_disk('./test_model')  # save the trained pipeline

4、测试模块

  # Evaluate the trained pipeline `nlp` on `examples`:
  #   * for every document, write the predicted entities grouped by label to
  #     resume<index>.txt;
  #   * for every predicted label, compute token-level precision / recall /
  #     F-score / accuracy ("label" vs "Not label") once per document and
  #     average those scores over all documents in which the label was
  #     predicted.
  from sklearn.metrics import accuracy_score, precision_recall_fscore_support
  from spacy.gold import GoldParse

  examples = TRAIN_DATA  # evaluation data (here: the training data itself)

  # label -> [precision sum, recall sum, F-score sum, accuracy sum, doc count].
  # Kept outside the loop so scores accumulate across documents instead of
  # being reset for each one.
  metrics = {}

  for index, (text, annot) in enumerate(examples):
      doc_to_test = nlp(text)

      # Group the predicted entity texts by label.
      predicted = {}
      for ent in doc_to_test.ents:
          predicted.setdefault(ent.label_, []).append(ent.text)

      # The context manager closes the file; the explicit encoding keeps
      # non-ASCII entity text writable regardless of the platform default.
      with open("resume" + str(index) + ".txt", "w", encoding="utf-8") as out:
          for label in set(predicted):
              out.write("\n\n")
              out.write(label + ":" + "\n")
              for entity_text in set(predicted[label]):
                  out.write(entity_text.replace('\n', '') + "\n")

      # Gold per-token tags built from the annotated character offsets.
      gold = GoldParse(nlp.make_doc(text), entities=annot.get("entities"))

      # Score each predicted label once per document.
      for label in predicted:
          negative = 'Not ' + label
          # NOTE(review): this is a substring test against the gold tag, so a
          # label that is a substring of another label would be conflated with
          # it - confirm against the label set in use.
          y_true = [label if label in tag else negative for tag in gold.ner]
          y_pred = [
              token.ent_type_ if token.ent_type_ == label else negative
              for token in doc_to_test
          ]
          precision, recall, fscore, _ = precision_recall_fscore_support(
              y_true, y_pred, average='weighted')
          accuracy = accuracy_score(y_true, y_pred)

          totals = metrics.setdefault(label, [0, 0, 0, 0, 0])
          totals[0] += precision
          totals[1] += recall
          totals[2] += fscore
          totals[3] += accuracy
          totals[4] += 1

  for label, (p_sum, r_sum, f_sum, a_sum, count) in metrics.items():
      print("\n For Entity " + label + "\n")
      print("Accuracy : " + str((a_sum / count) * 100) + "%")
      print("Precision : " + str(p_sum / count))
      print("Recall : " + str(r_sum / count))
      print("F-score : " + str(f_sum / count))
5、模型加载调用
  # Load the pipeline saved under ./test_model, run it on a sample sentence,
  # and print each recognised entity's text together with its label.
  text = "测试句子"
  nlp1 = spacy.load("./test_model")
  doc = nlp1(text)
  for entity in doc.ents:
      print(entity.text, entity.label_)
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Gausst松鼠会/article/detail/367930
推荐阅读
相关标签
  

闽ICP备14008679号