To run the NER model I needed four benchmark datasets. Three of them, Weibo, Resume, and MSRA, are public (they can be obtained through the link above).
Only the OntoNotes dataset requires a license: you have to register and apply for it on the LDC website.
Just search for OntoNotes in the search box at the top right.
When registering, note that an edu email address makes approval much more likely. For Organization, select your university; if your school is not in the list, register it as a new organization yourself. If you are in a hurry, you can email ldc@ldc.upenn.edu; the administrators are friendly and reply quickly (again, use your edu address).
Once you have downloaded the ontonotes-release-5.0 package from there, the data processing begins.
(I wasted a lot of time here reading unhelpful blog posts, so let me describe the simplest method I found.)
Next, download the six files shown in the figure above; together with the OntoNotes package you already have, that makes seven files in total. Extract them all (the six files from that link extract into a single directory), which gives the following layout:
(Figure: internal structure of the conll-2012 directory)
After extraction, those six files all live under the conll-2012 folder.
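Before going further, you can sanity-check the layout. Below is a minimal sketch of my own (the paths are inferred from the commands used later in this post, not guaranteed against every extraction):

# Quick layout check (illustrative sketch; paths inferred from the commands below).
# Works under both Python 2 and Python 3.
import os

expected = [
    './conll-2012/v3/scripts',                  # the Python 2 conversion scripts
    './conll-2012/v4/data',                     # train/development/test skeletons
    './ontonotes-release-5.0/data/files/data',  # the raw LDC release
]
for path in expected:
    print('%s -> %s' % (path, 'OK' if os.path.isdir(path) else 'MISSING'))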
Then upload these two folders to a Linux server. Note that the scripts under v3 only run in a Python 2 environment (don't worry, a Python 2 environment is easy to set up).
First, create a Python 2 environment with conda:
conda create -n py27 python=2.7
Then activate the environment:
source activate py27
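Since the post stresses that the v3 scripts require Python 2, it is worth confirming the active interpreter before proceeding (a trivial check of my own):

# Confirm the active interpreter is 2.7.x before running the v3 scripts.
import sys
print(sys.version)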
Next, run the following command (make sure the conll-2012 and ontonotes-release-5.0 folders sit in the same parent directory):
bash ./conll-2012/v3/scripts/skeleton2conll.sh -D ./ontonotes-release-5.0/data/files/data/ ./conll-2012/
This step can take quite a while, so be patient.
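When the command returns, it is worth checking that the merge actually produced the annotated files. A minimal sketch of my own (the exact count varies by language and release, so I only check that it is non-zero):

# Count the .v4_gold_conll files produced by skeleton2conll.sh
# (works under both Python 2 and Python 3).
import os

count = 0
for root, dirs, files in os.walk('./conll-2012/v4/data'):
    count += sum(1 for name in files if name.endswith('.v4_gold_conll'))
print('%d .v4_gold_conll files generated' % count)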
After that, switch back to your Python 3 environment:
conda deactivate
Then place the following script in the directory that contains both conll-2012 and ontonotes-release-5.0, and name it covert_into_bmes_format.py:
import os, glob, itertools


def generate_collection(data_tag, dir_name, lang):
    # Walk the *.v4_gold_conll files for one split/language and convert the
    # bracketed NER column into word-level BIO tags.
    folder = './conll-2012/v4/data/' + data_tag + '/data/' + lang
    results = itertools.chain.from_iterable(
        glob.iglob(os.path.join(root, '*.v4_gold_conll'))
        for root, dirs, files in os.walk(folder))
    text, word_count, sent_count = "", 0, 0
    for cur_file in results:
        with open(cur_file, 'r', encoding='utf-8') as f:
            flag = None  # entity type of the currently open span, if any
            for line in f.readlines():
                l = ' '.join(line.strip().split())
                ls = l.split(" ")
                if len(ls) >= 11:
                    word, pos, cons, ori_ner = ls[3], ls[4], ls[5], ls[10]
                    ner = ori_ner
                    if ori_ner == "*":      # inside or outside a span
                        ner = "O" if flag is None else "I-" + flag
                    elif ori_ner == "*)":   # span closes on this token
                        ner = "I-" + flag
                        flag = None
                    elif ori_ner.startswith("(") and ori_ner.endswith("*") and len(ori_ner) > 2:
                        flag = ori_ner[1:-1]        # multi-word span opens here
                        ner = "B-" + flag
                    elif ori_ner.startswith("(") and ori_ner.endswith(")") and len(ori_ner) > 2 and flag is None:
                        ner = "B-" + ori_ner[1:-1]  # single-word span
                    text += "\t".join([word, pos, cons, ner]) + '\n'
                    word_count += 1
                else:
                    text += '\n'
                    if not line.startswith('#'):
                        sent_count += 1
        text += '\n'
    if data_tag == 'development':
        data_tag = 'dev'
    filepath = os.path.join(dir_name, data_tag + '.bio')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)
    filepath = os.path.join(dir_name, data_tag + '.info.txt')
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write("For file:{}, there are {} sentences, {} tokens.".format(filepath, sent_count, word_count))


def nertag_bio2bioes(dir_name):
    # Convert the word-level BIO files into character-level BMES files
    # (used for the Chinese data only).
    for bio_file in glob.glob(dir_name + '/*.bio'):
        out_name = (bio_file.rsplit('/', 1)[0] + '/ontonotes5.' +
                    bio_file.rsplit('/', 1)[1].rstrip('bio') + 'bmes')
        with open(out_name, 'w', encoding='utf-8') as fout, open(bio_file, 'r', encoding='utf-8') as fin:
            lines = fin.readlines()
            for idx in range(len(lines)):
                if len(lines[idx]) < 3:  # blank line = sentence boundary
                    fout.write('\n')
                    continue
                word, pos, label = lines[idx].split()[0], lines[idx].split()[1], lines[idx].split()[-1]
                if "-" not in label:  # O: every character is O
                    for char in word:
                        fout.write(char + ' O\n')
                else:
                    label_type = label.split('-')[-1]
                    # is this word the last token of its entity?
                    last_of_entity = ((idx < len(lines) - 1 and len(lines[idx + 1]) < 3) or
                                      idx == len(lines) - 1 or
                                      (idx < len(lines) - 1 and 'I' not in lines[idx + 1].split()[-1]))
                    if 'B-' in label:
                        if last_of_entity:
                            if len(word) == 1:  # single character: S
                                fout.write(word + ' S-' + label_type + '\n')
                            else:  # B, M(s), E all fall within this one word
                                fout.write(word[0] + ' B-' + label_type + '\n')
                                for char_idx in range(1, len(word) - 1):
                                    fout.write(word[char_idx] + ' M-' + label_type + '\n')
                                fout.write(word[-1] + ' E-' + label_type + '\n')
                        else:
                            fout.write(word[0] + ' B-' + label_type + '\n')
                            for char_idx in range(1, len(word)):
                                fout.write(word[char_idx] + ' M-' + label_type + '\n')
                    elif 'I-' in label:
                        if last_of_entity:
                            for char_idx in range(0, len(word) - 1):
                                fout.write(word[char_idx] + ' M-' + label_type + '\n')
                            fout.write(word[-1] + ' E-' + label_type + '\n')
                        else:
                            for char in word:
                                fout.write(char + ' M-' + label_type + '\n')


def main():
    for language in ('english', 'chinese', 'arabic'):
        dir_name = os.path.join('./result/', language)
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        for split in ['train', 'development', 'test']:
            generate_collection(data_tag=split, dir_name=dir_name, lang=language)
        if language == 'chinese':
            nertag_bio2bioes(dir_name)


if __name__ == "__main__":
    main()
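To make the BMES scheme concrete: the script first produces word-level BIO tags, then nertag_bio2bioes explodes each Chinese word into characters. A hypothetical token of my own (not taken from the dataset) shows the expansion:

# Illustration only: how one word-level BIO tag becomes character-level BMES.
# '北京市' is a hypothetical 3-character word that both opens and closes a GPE span.
word, label_type = u'北京市', 'GPE'
rows = ([word[0] + ' B-' + label_type] +
        [c + ' M-' + label_type for c in word[1:-1]] +
        [word[-1] + ' E-' + label_type])
print('\n'.join(rows))
# Expected output:
# 北 B-GPE
# 京 M-GPE
# 市 E-GPE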
Run it: python covert_into_bmes_format.py
A result folder will then appear in the current directory, containing the processed output for all three languages:
Open the chinese folder and you will see that the files we need, ontonotes5.train.bmes, ontonotes5.dev.bmes, and ontonotes5.test.bmes, have all been produced.
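If you want a quick sanity check of the output (my own sketch, not part of the original workflow), count the sentences and list the label inventory of one split:

# Sanity check: sentence count and label inventory of the generated BMES file.
from collections import Counter

labels, sentences = Counter(), 0
with open('./result/chinese/ontonotes5.train.bmes', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            sentences += 1  # blank line = sentence boundary
        else:
            labels[line.split()[-1]] += 1
print(sentences, 'sentences')
print(labels.most_common(10))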