赞
踩
例:
{"text": "《步步惊心》改编自著名作家桐华的同名清穿小说《甄嬛传》改编自流潋紫所著的同名小说电视剧《何以笙箫默》改编自顾漫同名小说《花千骨》改编自fresh果果同名小说《裸婚时代》是月影兰析创作的一部情感小说《琅琊榜》是根据海宴同名网络小说改编电视剧《宫锁心玉》,又名《宫》《雪豹》,该剧改编自网络小说《特战先驱》《我是特种兵》由红遍网络的小说《最后一颗子弹留给我》改编电视剧《来不及说我爱你》改编自匪我思存同名小说《来不及说我爱你》", "spo_list": [{"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "顾漫"}, "subject": "何以笙箫默"}, {"predicate": "改编自", "object_type": {"@value": "作品"}, "subject_type": "影视作品", "object": {"@value": "最后一颗子弹留给我"}, "subject": "我是特种兵"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "桐华"}, "subject": "步步惊心"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "流潋紫"}, "subject": "甄嬛传"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "fresh果果"}, "subject": "花千骨"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "月影兰析"}, "subject": "裸婚时代"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "海宴"}, "subject": "琅琊榜"}, {"predicate": "改编自", "object_type": {"@value": "作品"}, "subject_type": "影视作品", "object": {"@value": "特战先驱"}, "subject": "雪豹"}, {"predicate": "改编自", "object_type": {"@value": "作品"}, "subject_type": "影视作品", "object": {"@value": "来不及说我爱你"}, "subject": "来不及说我爱你"}, {"predicate": "作者", "object_type": {"@value": "人物"}, "subject_type": "图书作品", "object": {"@value": "匪我思存"}, "subject": "来不及说我爱你"}]}
每个样本由text和spo_list(predicate,object_type,subject_type,object,subject)两部分组成,每行一个样本。
处理后的NYT:
原文:
{"sentText": "The only greater tragedy than the death of Ousmane Zongo would be to convict an innocent man , who at the time was acting reasonably pursuant to his training as a New York City police officer , '' Mr. London said . ''", "relationMentions": [{"em1Text": "Zongo", "em2Text": "City", "label": "/people/person/place_lived"}]}
triple:
{
"text": "The only greater tragedy than the death of Ousmane Zongo would be to convict an innocent man , who at the time was acting reasonably pursuant to his training as a New York City police officer , '' Mr. London said . ''",
"triple_list": [
[
"Zongo",
"/people/person/place_lived",
"City"
]
]
}
包含两部分:text和triple_list(subject,relation,object)
(1)将数据集提取成triple的形式
import codecs
import json

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is not installed: iterate without a progress bar."""
        return iterable

# Converted samples per split; filled in when the file is run as a script.
train_data = []
dev_data = []
test_data = []


def load_triple_samples(path):
    """Read a JSON-lines file and convert each record to the triple format.

    Every non-blank line of *path* is parsed as one JSON object that must
    provide ``text`` and ``spo_list``.  Each kept record becomes
    ``{'text': ..., 'triple_list': [(subject, predicate, object_value), ...]}``.

    Records are dropped when ``spo_list`` is empty, or when no triple is left
    after removing the entries whose predicate is the string ``'None'``.

    Args:
        path: Path of the JSON-lines input file, read as UTF-8.

    Returns:
        A list of converted sample dicts, in file order.
    """
    samples = []
    # Explicit UTF-8: the default encoding of open() depends on the locale,
    # which garbles or rejects the Chinese text on non-UTF-8 platforms.
    with open(path, encoding='utf-8') as f:
        for raw_line in tqdm(f):
            if not raw_line.strip():
                # A blank (e.g. trailing) line is not a record.
                continue
            record = json.loads(raw_line)
            if not record['spo_list']:
                continue
            sample = {
                'text': record['text'].lstrip('\"').strip('\r\n').rstrip('\"'),
                'triple_list': [
                    (spo['subject'], spo['predicate'], spo['object']['@value'])
                    for spo in record['spo_list']
                    if spo['predicate'] != 'None'
                ],
            }
            if not sample['triple_list']:
                continue
            samples.append(sample)
    return samples


def dump_samples(samples, path):
    """Write *samples* to *path* as indented UTF-8 JSON without ASCII escaping."""
    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(samples, f, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    train_data.extend(load_triple_samples('train_dev.json'))
    dev_data.extend(load_triple_samples('dev_dev.json'))
    test_data.extend(load_triple_samples('test_dev.json'))

    dump_samples(train_data, 'train_triples.json')
    dump_samples(dev_data, 'dev_triples.json')
    dump_samples(test_data, 'test_triples.json')
(2)将triple划分为SEO,EPO,Normal三种类型
SEO为一个实体与多个实体之间都存在关系的情况
例如:
{ "text": "《步步惊心》改编自著名作家桐华的同名清穿小说《甄嬛传》改编自流潋紫所著的同名小说电视剧《何以笙箫默》改编自顾漫同名小说《花千骨》改编自fresh果果同名小说《裸婚时代》是月影兰析创作的一部情感小说《琅琊榜》是根据海宴同名网络小说改编电视剧《宫锁心玉》,又名《宫》《雪豹》,该剧改编自网络小说《特战先驱》《我是特种兵》由红遍网络的小说《最后一颗子弹留给我》改编电视剧《来不及说我爱你》改编自匪我思存同名小说《来不及说我爱你》", "triple_list": [ [ "何以笙箫默", "作者", "顾漫" ], [ "我是特种兵", "改编自", "最后一颗子弹留给我" ], [ "步步惊心", "作者", "桐华" ], [ "甄嬛传", "作者", "流潋紫" ], [ "花千骨", "作者", "fresh果果" ], [ "裸婚时代", "作者", "月影兰析" ], [ "琅琊榜", "作者", "海宴" ], [ "雪豹", "改编自", "特战先驱" ], [ "来不及说我爱你", "改编自", "来不及说我爱你" ], [ "来不及说我爱你", "作者", "匪我思存" ] ] }
EPO为两个实体之间存在有不同的关系的情况
例如:
{ "text": "2019年2月25日和26日,温氏股份实控人之一、前任董事长温鹏程之妻伍翠珍分别减持公司股票608万股和256万股,成交均价分别为30.78元/股和30.02元/股,共计套现约2.64亿元", "triple_list": [ [ "温氏股份", "董事长", "温鹏程" ], [ "伍翠珍", "丈夫", "温鹏程" ], [ "温鹏程", "妻子", "伍翠珍" ] ] }
代码如下:
import codecs
import json

import numpy as np

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is not installed: iterate without a progress bar."""
        return iterable

# Test samples grouped by overlap type; filled in when the file is run as a script.
test_normal = []
test_seo = []
test_epo = []


def is_seo_triple(triples):
    """Return 1 if some entity is related to more than one distinct entity, else 0.

    Only positions 0 (subject) and 2 (object) of each triple are used; the
    relation and the direction of the triple are ignored.  An entity paired
    with itself counts as a partner like any other.

    Args:
        triples: Iterable of indexable triples ``(subject, relation, object)``.

    Returns:
        1 when a single-entity overlap is found, 0 otherwise.
    """
    # Each entity maps to the one partner seen so far.  Entities are stored
    # whole: wrapping the stored name in set() would split it into characters.
    partner = {}
    for triple in triples:
        subj, obj = triple[0], triple[2]
        for entity, other in ((subj, obj), (obj, subj)):
            # setdefault records the first partner and returns the stored one,
            # so any later, different partner is detected here.
            if partner.setdefault(entity, other) != other:
                return 1
    return 0


def is_epo_triple(triples):
    """Return 1 if the same pair of entities occurs in more than one triple, else 0.

    Only positions 0 (subject) and 2 (object) of each triple are used.  The
    pair is unordered, so ``(a, r1, b)`` together with ``(b, r2, a)`` counts.

    Args:
        triples: Iterable of indexable triples ``(subject, relation, object)``.

    Returns:
        1 when an entity-pair overlap is found, 0 otherwise.
    """
    seen_pairs = set()
    for triple in triples:
        pair = frozenset((triple[0], triple[2]))
        if pair in seen_pairs:
            return 1
        seen_pairs.add(pair)
    return 0


def is_normal_triple(tripels):
    """Return 1 if *tripels* shows neither EPO nor SEO overlap, else 0."""
    if is_epo_triple(tripels) or is_seo_triple(tripels):
        return 0
    return 1


def load_triple_samples(path):
    """Read a JSON-lines file and convert each record to the triple format.

    Each kept record becomes
    ``{'text': ..., 'triple_list': [(subject, predicate, object_value), ...]}``.
    Records with an empty ``spo_list``, or with no triple left after removing
    entries whose predicate is the string ``'None'``, are dropped.

    Args:
        path: Path of the JSON-lines input file, read as UTF-8.

    Returns:
        A list of converted sample dicts, in file order.
    """
    samples = []
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(path, encoding='utf-8') as f:
        for raw_line in tqdm(f):
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            if not record['spo_list']:
                continue
            sample = {
                'text': record['text'].lstrip('\"').strip('\r\n').rstrip('\"'),
                'triple_list': [
                    (spo['subject'], spo['predicate'], spo['object']['@value'])
                    for spo in record['spo_list']
                    if spo['predicate'] != 'None'
                ],
            }
            if not sample['triple_list']:
                # Without this, a sample with no usable triple would be
                # written to the "normal" file.
                continue
            samples.append(sample)
    return samples


def dump_samples(samples, path):
    """Write *samples* to *path* as indented UTF-8 JSON without ASCII escaping."""
    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(samples, f, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    for line in load_triple_samples('duie_test.json'):
        # EPO is checked first, so a sample showing both kinds of overlap
        # is filed under EPO.
        if is_epo_triple(line['triple_list']):
            test_epo.append(line)
        elif is_seo_triple(line['triple_list']):
            test_seo.append(line)
        else:
            test_normal.append(line)

    # Write the test set split by overlap type.
    dump_samples(test_normal, 'test_normal_triples.json')
    dump_samples(test_seo, 'test_seo_triples.json')
    dump_samples(test_epo, 'test_epo_triples.json')
这样数据集就处理好啦
(ps. 附上split_by_num:)
# -*- coding: utf-8 -*-
import codecs
import json

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, *args, **kwargs):
        """Stand-in used when tqdm is not installed: iterate without a progress bar."""
        return iterable

# Test samples grouped by number of triples; filled in when run as a script.
test_1 = []
test_2 = []
test_3 = []
test_4 = []
test_other = []


def load_triple_samples(path):
    """Read a JSON-lines file and convert each record to the triple format.

    Each kept record becomes
    ``{'text': ..., 'triple_list': [(subject, predicate, object_value), ...]}``.
    Records with an empty ``spo_list``, or with no triple left after removing
    entries whose predicate is the string ``'None'``, are dropped.

    Args:
        path: Path of the JSON-lines input file, read as UTF-8.

    Returns:
        A list of converted sample dicts, in file order.
    """
    samples = []
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(path, encoding='utf-8') as f:
        for raw_line in tqdm(f):
            if not raw_line.strip():
                # A blank (e.g. trailing) line is not a record.
                continue
            record = json.loads(raw_line)
            if not record['spo_list']:
                continue
            sample = {
                'text': record['text'].lstrip('\"').strip('\r\n').rstrip('\"'),
                'triple_list': [
                    (spo['subject'], spo['predicate'], spo['object']['@value'])
                    for spo in record['spo_list']
                    if spo['predicate'] != 'None'
                ],
            }
            if not sample['triple_list']:
                continue
            samples.append(sample)
    return samples


def split_by_triple_count(samples):
    """Group samples by how many triples they contain.

    Args:
        samples: Iterable of dicts that each have a ``triple_list`` entry.

    Returns:
        A list of five lists: samples with exactly 1, 2, 3 and 4 triples,
        followed by all remaining samples (any other count).
    """
    buckets = [[], [], [], [], []]
    for sample in samples:
        spo_num = len(sample['triple_list'])
        # Counts 1-4 get their own bucket; everything else shares the last one.
        index = spo_num - 1 if 1 <= spo_num <= 4 else 4
        buckets[index].append(sample)
    return buckets


def dump_samples(samples, path):
    """Write *samples* to *path* as indented UTF-8 JSON without ASCII escaping."""
    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(samples, f, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    groups = split_by_triple_count(load_triple_samples('duie_test.json'))
    targets = (test_1, test_2, test_3, test_4, test_other)
    for target, group in zip(targets, groups):
        target.extend(group)

    # The catch-all group is written to the file numbered 5.
    for number, target in enumerate(targets, start=1):
        dump_samples(target, 'test_triples_%d.json' % number)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。