当前位置:   article > 正文

NER 常见问题(BIO BIOES BMES)标注之间的转换_ner任务中bio文件数据如何使用

ner任务中bio文件数据如何使用

实习收到的第一个任务

人民日报数据集的训练集用的就是 BIO 格式:每行是“字 标签”(以空格分隔),句子之间用空行隔开。

然后我们把它转换为 BIOES 和 BMES 两种格式。

首先是BIO转BMES

  # BIO -> BMES conversion script: reads `path`, writes `res_path`.

path = r'./input/data_train.txt'
res_path = r'./output/BMES.txt'


def read_bio_sentences(lines):
    """Group "<char> <label>" lines into sentences.

    Args:
        lines: iterable of text lines (for example an open text file). Each
            non-blank line holds space-separated fields; the first field is
            the character and the last field is its label. Blank or
            whitespace-only lines separate sentences.

    Returns:
        A list of sentences, each a list of ``(char, label)`` tuples.

    Raises:
        ValueError: if a non-blank line has no space-separated label.
    """
    sentences = []
    current = []
    for line_no, raw in enumerate(lines, start=1):
        # Strip only the right side: this removes "\n" / "\r\n" when present
        # (the last line may have none) without disturbing the first field.
        text = raw.rstrip()
        if not text:
            if current:
                sentences.append(current)
                current = []
            continue
        fields = text.split(' ')
        if len(fields) < 2:
            raise ValueError(
                f"line {line_no}: expected '<char> <label>', got {raw!r}"
            )
        current.append((fields[0], fields[-1]))
    if current:
        # The file may end without a trailing blank line.
        sentences.append(current)
    return sentences


def bio_to_bmes(labels):
    """Convert the BIO labels of one sentence to BMES.

    An entity continues only while the *next* label starts with ``I``; any
    other successor (``O``, a new ``B``, or the end of the sentence) closes
    it. Hence:

    * ``B`` followed by ``I``      -> ``B`` (unchanged)
    * ``B`` followed by anything else -> ``S`` (single-character entity)
    * ``I`` followed by ``I``      -> ``M``
    * ``I`` followed by anything else -> ``E``
    * every other label (``O``)   -> unchanged

    Only the first character of each label is inspected; the remainder
    (e.g. ``-PER``) is carried over as is.

    Args:
        labels: sequence of BIO label strings for one sentence.

    Returns:
        A new list with the corresponding BMES labels.
    """
    converted = []
    last = len(labels) - 1
    for index, label in enumerate(labels):
        prefix, rest = label[:1], label[1:]
        entity_continues = index < last and labels[index + 1][:1] == 'I'
        if prefix == 'B':
            if not entity_continues:
                label = 'S' + rest
        elif prefix == 'I':
            label = ('M' if entity_continues else 'E') + rest
        converted.append(label)
    return converted


def convert_bio_file_to_bmes(src_path, dst_path):
    """Read a BIO-tagged file and write its BMES equivalent.

    The input is read completely before the output is opened, so a missing
    input file does not leave behind a truncated output file. Each sentence
    is written as "<char> <label>" lines followed by one blank line.

    Args:
        src_path: path of the UTF-8 BIO file to read.
        dst_path: path of the UTF-8 BMES file to (over)write.
    """
    with open(src_path, encoding='utf-8') as src:
        sentences = read_bio_sentences(src)

    with open(dst_path, 'w', encoding='utf-8') as dst:
        for sentence in sentences:
            new_labels = bio_to_bmes([label for _, label in sentence])
            for (char, _), label in zip(sentence, new_labels):
                dst.write(f'{char} {label}\n')
            dst.write('\n')


if __name__ == '__main__':
    convert_bio_file_to_bmes(path, res_path)

然后是 BMES 转 BIOES(两者只有实体中间的标签不同:把 M 换成 I 即可,B、E、S、O 保持不变)

  # BMES -> BIOES conversion script: reads ./output/BMES.txt, writes ./output/BIOES.txt.


def bmes_to_bioes_line(line):
    """Convert one "<char> <label> ..." line from BMES to BIOES.

    The two schemes differ only in the entity-interior tag, so every label
    that is ``M`` or starts with ``M-`` gets ``I`` as its first character;
    all other labels are kept. The first whitespace-separated token is the
    character itself and is never rewritten, even if it is the letter "M".
    Fields are re-joined with single spaces.

    Args:
        line: one line of the BMES file, with or without its newline.

    Returns:
        The converted line without a trailing newline; ``''`` for a blank or
        whitespace-only line (a sentence separator).
    """
    tokens = line.split()
    if not tokens:
        return ''
    char, *tags = tokens
    converted = [
        'I' + tag[1:] if tag == 'M' or tag.startswith('M-') else tag
        for tag in tags
    ]
    return ' '.join([char, *converted])


def convert_bmes_file_to_bioes(src_path, dst_path):
    """Stream a BMES file into a BIOES file, line by line.

    Every input line produces exactly one output line, so the blank lines
    separating sentences are preserved one-for-one.

    Args:
        src_path: path of the UTF-8 BMES file to read.
        dst_path: path of the UTF-8 BIOES file to (over)write.
    """
    # The source is opened first: if it is missing, the destination is
    # never created or truncated.
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(bmes_to_bioes_line(line) + '\n')


if __name__ == '__main__':
    convert_bmes_file_to_bioes(r'./output/BMES.txt', r'./output/BIOES.txt')

不同的标注格式训练出来的模型,召回率是不一样的,所以这类标注格式之间的转换以后会经常用到。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/一键难忘520/article/detail/788831
推荐阅读
相关标签
  

闽ICP备14008679号