当前位置:   article > 正文

处理ontonotes数据集_ontonotes数据集处理

ontonotes数据集处理

1.所需配置

Linux服务器一台(我使用的是Ubuntu系统),pycharm,python2,python3。

2.下载过程:

首先下载ontonotes数据集(具体可参考链接文章:OntoNote5数据集下载及处理过程(完整版)_ontonotes-CSDN博客),并且下载以下6个文件(链接:Index of /conll/2012/download)

加上之前下载的OntoNotes数据集,一共7个文件。将它们解压(以上链接下载的6个文件解压后位于同一个目录下),得到如下目录:

其中conll-2012前缀开头的几个压缩包解压出来的文件中相同的会合并,不用管。

conll-2012内部结构

这6个文件解压后在conll-2012文件夹下。
然后将这两个文件夹(conll-2012和ontonotes-release-5.0)上传到Linux服务器上。注意:v3下的scripts需要在python2的环境下运行。

3.代码运行进行处理

首先确保python解释器是python2,然后执行命令(这里需要保证conll-2012和ontonotes-release-5.0文件夹在同一级目录)

bash ./conll-2012/v3/scripts/skeleton2conll.sh -D ./ontonotes-release-5.0/data/files/data/ ./conll-2012/

以上过程可能比较久,耐心等待。
执行完毕后,切回python3环境:

之后将以下代码保存到与conll-2012和ontonotes-release-5.0文件夹同一级的目录下,命名为covert_into_bmes_format.py:

# coding=UTF-8
import glob
import io
import itertools
import os
  4. def generate_collection(data_tag, dir_name, lang):
  5. folder = './conll-2012/v4/data/'+ data_tag + '/data/'+ lang
  6. results = itertools.chain.from_iterable(glob.iglob(os.path.join(root, '*.v4_gold_conll'))
  7. for root, dirs, files in os.walk(folder))
  8. text, word_count, sent_count = "", 0, 0
  9. for cur_file in results:
  10. with io.open(cur_file, 'r', encoding='utf-8') as f:
  11. flag = None
  12. for line in f.readlines():
  13. l = ' '.join(line.strip().split())
  14. ls = l.split(" ")
  15. if len(ls) >= 11:
  16. word = ls[3]
  17. pos = ls[4]
  18. cons = ls[5]
  19. ori_ner = ls[10]
  20. ner = ori_ner
  21. # print(word, pos, cons, ner)
  22. if ori_ner == "*":
  23. if flag==None:
  24. ner = "O"
  25. else:
  26. ner = "I-" + flag
  27. elif ori_ner == "*)":
  28. ner = "I-" + flag
  29. flag = None
  30. elif ori_ner.startswith("(") and ori_ner.endswith("*") and len(ori_ner)>2:
  31. flag = ori_ner[1:-1]
  32. ner = "B-" + flag
  33. elif ori_ner.startswith("(") and ori_ner.endswith(")") and len(ori_ner)>2 and flag == None:
  34. ner = "B-" + ori_ner[1:-1]
  35. text += "\t".join([word, pos, cons, ner]) + '\n'
  36. word_count += 1
  37. else:
  38. text += '\n'
  39. if not line.startswith('#'):
  40. sent_count += 1
  41. text += '\n'
  42. if data_tag == 'development':
  43. data_tag = 'dev'
  44. filepath = os.path.join(dir_name, data_tag + '.bio')
  45. with io.open(filepath, 'w', encoding='utf-8') as f:
  46. f.write(text)
  47. filepath = os.path.join(dir_name, data_tag+'.info.txt')
  48. with io.open(filepath, 'w', encoding='utf-8') as f:
  49. f.write("For file:{}, there are {} sentences, {} tokens.".format(filepath, sent_count, word_count))
  50. def nertag_bio2bioes(dir_name):
  51. for bio_file in glob.glob(dir_name + '/*.bio'):
  52. with io.open(bio_file.rsplit('/', 1)[0]+'/ontonotes5.'+bio_file.rsplit('/',1)[1].rstrip('bio')+'bmes', 'w', encoding='utf-8') as fout, open(bio_file, 'r', encoding='utf-8') as fin:
  53. lines = fin.readlines()
  54. for idx in range(len(lines)):
  55. if len(lines[idx])<3:
  56. fout.write('\n')
  57. continue
  58. word, pos, label = lines[idx].split()[0], lines[idx].split()[1], lines[idx].split()[-1]
  59. if "-" not in label: # O
  60. for idx in range(len(word)):
  61. fout.write(word[idx]+' O\n')
  62. else:
  63. label_type=label.split('-')[-1]
  64. if 'B-' in label: # B
  65. if (idx<len(lines)-1 and len(lines[idx+1])<3) or \
  66. idx==len(lines)-1 or \
  67. (idx<len(lines)-1 and 'I' not in lines[idx+1].split()[-1]):
  68. if len(word)==1: # S
  69. fout.write(word+' S-'+label_type+'\n')
  70. else: # 对于BIE在同一个word
  71. fout.write(word[0]+' B-'+label_type+'\n')
  72. for char_idx in range(1, len(word)-1):
  73. fout.write(word[char_idx]+' M-'+label_type+'\n')
  74. fout.write(word[-1]+' E-'+label_type+'\n')
  75. else:
  76. fout.write(word[0]+' B-'+label_type+'\n')
  77. for char_idx in range(1, len(word)):
  78. fout.write(word[char_idx]+' M-'+label_type+'\n')
  79. elif 'I-' in label: # I
  80. if (idx<len(lines)-1 and len(lines[idx+1])<3) or \
  81. idx==len(lines)-1 or \
  82. (idx<len(lines)-1 and 'I' not in lines[idx+1].split()[-1]):
  83. for char_idx in range(0, len(word)-1):
  84. fout.write(word[char_idx]+' M-'+label_type+'\n')
  85. fout.write(word[-1]+' E-'+label_type+'\n')
  86. else:
  87. for idx in range(len(word)):
  88. fout.write(word[idx]+' M-'+label_type+'\n')
  89. def main():
  90. for language in ('english', 'chinese', 'arabic'):
  91. dir_name = os.path.join('./result/', language)
  92. if not os.path.exists(dir_name):
  93. os.makedirs(dir_name)
  94. for split in ['train', 'development', 'test']:
  95. generate_collection(data_tag=split, dir_name=dir_name, lang=language)
  96. if language=='chinese':
  97. nertag_bio2bioes(dir_name)
  98. if __name__ == "__main__":
  99. main()
'
运行

 执行代码:python covert_into_bmes_format.py
然后在当前目录下会出现一个result文件夹,里面就是三种语言的处理结果:

打开chinese文件夹,可以看到我们所需要的ontonotes5.train.bmes,ontonotes5.dev.bmes,ontonotes5.test.bmes 已经得到。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Li_阴宅/article/detail/1012159
推荐阅读
相关标签
  

闽ICP备14008679号