当前位置:   article > 正文

基于知识图谱的智能问答系统项目——导入数据_基于知识图谱的问答系统导入表格

基于知识图谱的问答系统导入表格
  1. import os
  2. import re
  3. import json
  4. import codecs
  5. import threading
  6. from py2neo import Graph
  7. import pandas as pd
  8. import numpy as np
  9. from tqdm import tqdm
  def print_data_info(data_path, limit=5):
      """Pretty-print the first ``limit`` records of a JSON-lines file.

      Each non-blank line of the file is parsed as one JSON document and
      printed with sorted keys and 4-space indentation.

      Args:
          data_path: Path of the UTF-8 encoded file, one JSON document per line.
          limit: Maximum number of records to print (defaults to 5).

      Returns:
          An empty list. Nothing is ever added to it; the return value is kept
          only so existing callers that use it keep working.
      """
      triples = []
      shown = 0
      with open(data_path, 'r', encoding='utf8') as f:
          # Iterate lazily: only the first few lines are needed, so the whole
          # file must not be loaded into memory.
          for line in f:
              if shown >= limit:
                  break
              if not line.strip():
                  # A blank line is not a JSON document; skip it instead of
                  # letting json.loads raise.
                  continue
              data = json.loads(line)
              print(json.dumps(data, sort_keys=True, indent=4, separators=(', ', ': '), ensure_ascii=False))
              shown += 1
      return triples
  class MedicalExtractor(object):
      """Collect medical entities and relations from a JSON-lines dump.

      Every line of the input file describes one disease. ``extract_triples``
      fills the entity lists (``drugs``, ``foods``, ...) and relation lists
      (``rels_*``, each item being ``[head, relation, tail]``). The collected
      data can then be exported to JSON files or written to Neo4j through
      ``self.graph``.
      """

      # Attributes every disease dict carries, defaulting to ''.
      _DEFAULT_ATTRIBUTES = (
          'desc', 'prevent', 'cause', 'easy_get', 'cure_department',
          'cure_way', 'cure_lasttime', 'symptom', 'cured_prob',
      )

      # Record keys copied unchanged onto the disease dict when present.
      _COPIED_ATTRIBUTES = (
          'desc', 'prevent', 'cause', 'get_prob', 'easy_get',
          'cure_way', 'cure_lasttime', 'cured_prob',
      )

      # (record key, relation name, relation list attribute, entity list
      # attribute or None when the values are not collected as entities).
      _LIST_FIELDS = (
          ('symptom', 'has_symptom', 'rels_symptom', 'symptoms'),
          ('acompany', 'acompany_with', 'rels_acompany', None),
          ('common_drug', 'has_common_drug', 'rels_commonddrug', 'drugs'),
          ('recommand_drug', 'recommand_drug', 'rels_recommanddrug', 'drugs'),
          ('not_eat', 'not_eat', 'rels_noteat', 'foods'),
          ('do_eat', 'do_eat', 'rels_doeat', 'foods'),
          ('recommand_eat', 'recommand_recipes', 'rels_recommandeat', 'recipes'),
          ('check', 'need_check', 'rels_check', 'checks'),
      )

      # Attribute names exported by export_entitys_relations, in export order;
      # each one is written to "<name>.json".
      _EXPORT_NAMES = (
          'drugs', 'recipes', 'foods', 'checks', 'departments', 'producers',
          'diseases', 'symptoms', 'rels_department', 'rels_noteat',
          'rels_doeat', 'rels_recommandeat', 'rels_commonddrug',
          'rels_recommanddrug', 'rels_check', 'rels_drug_producer',
          'rels_symptom', 'rels_acompany', 'rels_category',
      )

      def __init__(self, host="127.0.0.1", http_port=7474, user="neo4j", password="123456"):
          """Create an empty extractor.

          Args:
              host, http_port, user, password: Keyword arguments forwarded to
                  ``Graph`` when ``self.graph`` is first used. The defaults are
                  the values that used to be hard-coded.
          """
          super(MedicalExtractor, self).__init__()
          self._graph_config = {
              'host': host,
              'http_port': http_port,
              'user': user,
              'password': password,
          }
          self._graph = None

          # Entity collections (may contain duplicates until written/exported).
          self.drugs = []
          self.recipes = []
          self.foods = []
          self.checks = []
          self.departments = []
          self.producers = []
          self.diseases = []
          self.symptoms = []

          # One attribute dict per disease, in input order.
          self.disease_infos = []

          # Relation collections; every item is [head, relation, tail].
          self.rels_department = []
          self.rels_noteat = []
          self.rels_doeat = []
          self.rels_recommandeat = []
          self.rels_commonddrug = []
          self.rels_recommanddrug = []
          self.rels_check = []
          self.rels_drug_producer = []
          self.rels_symptom = []
          self.rels_acompany = []
          self.rels_category = []

      @property
      def graph(self):
          """The ``Graph`` object, created on first access.

          Creating it lazily lets extraction and JSON export run without a
          ``Graph`` ever being constructed.
          """
          if self._graph is None:
              self._graph = Graph(**self._graph_config)
          return self._graph

      @graph.setter
      def graph(self, value):
          self._graph = value

      @staticmethod
      def _quote_identifier(identifier):
          """Return ``identifier`` backtick-quoted for use as a Cypher label,
          relationship type or property key (these cannot be parameters)."""
          return "`{0}`".format(str(identifier).replace("`", "``"))

      @staticmethod
      def _clean_name(name):
          """Drop single quotes from an entity name.

          Nodes are stored under this normalised name, so every lookup by name
          has to apply the same normalisation.
          """
          return name.replace("'", "")

      def extract_triples(self, data_path):
          """Read ``data_path`` (one JSON document per line) and collect all
          entities, relations and disease attributes from it."""
          print("从json文件中转换抽取三元组")
          with open(data_path, 'r', encoding='utf8') as f:
              # readlines() gives tqdm a length, so the progress bar has a total.
              for line in tqdm(f.readlines(), ncols=80):
                  if not line.strip():
                      continue
                  self.extract_record(json.loads(line))

      def extract_record(self, data_json):
          """Collect entities, relations and attributes of one disease record.

          Args:
              data_json: Parsed record. ``name`` is required; all other keys
                  handled here are optional.
          """
          disease = data_json['name']
          self.diseases.append(disease)

          disease_dict = {'name': disease}
          for key in self._DEFAULT_ATTRIBUTES:
              disease_dict[key] = ''
          for key in self._COPIED_ATTRIBUTES:
              if key in data_json:
                  disease_dict[key] = data_json[key]

          # Each list-valued field is handled independently of the others, so
          # a record may carry 'do_eat' without 'not_eat' and vice versa.
          for key, relation, rel_attr, entity_attr in self._LIST_FIELDS:
              if key not in data_json:
                  continue
              values = data_json[key]
              getattr(self, rel_attr).extend([disease, relation, value] for value in values)
              if entity_attr is not None:
                  getattr(self, entity_attr).extend(values)

          if 'cure_department' in data_json:
              cure_department = data_json['cure_department']
              if len(cure_department) == 1:
                  self.rels_category.append([disease, 'cure_department', cure_department[0]])
              elif len(cure_department) == 2:
                  # Two entries are treated as [parent department, sub department].
                  big, small = cure_department
                  self.rels_department.append([small, 'belongs_to', big])
                  self.rels_category.append([disease, 'cure_department', small])
              disease_dict['cure_department'] = cure_department
              self.departments += cure_department

          if 'drug_detail' in data_json:
              for det in data_json['drug_detail']:
                  self._extract_drug_detail(det)

          self.disease_infos.append(disease_dict)

      def _extract_drug_detail(self, det):
          """Split one ``drug_detail`` entry of the form ``head(drug)``.

          With exactly one '(' the head is recorded as producer of ``drug``;
          if the head ends with the drug name, that suffix is removed first.
          Any other shape only records the text before the first '(' as a drug.
          """
          parts = det.split('(')
          if len(parts) != 2:
              self.drugs.append(parts[0])
              return
          producer, drug = parts
          drug = drug.rstrip(')')
          # Remove the drug name only as a real suffix. str.rstrip(drug) would
          # treat it as a character set and could eat into the producer name.
          if drug and len(producer) > len(drug) and producer.endswith(drug):
              producer = producer[:-len(drug)]
          self.producers.append(producer)
          self.drugs.append(drug)
          self.rels_drug_producer.append([producer, 'production', drug])

      def write_nodes(self, entitys, entity_type):
          """MERGE one node labelled ``entity_type`` per distinct entity name.

          Failures are printed and the loop continues with the next entity.
          """
          print("写入{0}实体".format(entity_type))
          cql = "MERGE (n:{label} {{name: $name}})".format(
              label=self._quote_identifier(entity_type))
          for node in tqdm(set(entitys), ncols=80):
              params = {'name': self._clean_name(node)}
              try:
                  self.graph.run(cql, params)
              except Exception as e:
                  print(e)
                  print(cql, params)

      def write_edges(self, triples, head_type, tail_type):
          """MERGE a relationship for every ``[head, relation, tail]`` triple.

          Args:
              triples: Relation triples; an empty list is a no-op.
              head_type: Label of the head nodes.
              tail_type: Label of the tail nodes.

          Failures are printed and the loop continues with the next triple.
          """
          if not triples:
              return
          print("写入{0}关系".format(triples[0][1]))
          for head, relation, tail in tqdm(triples, ncols=80):
              # MERGE rather than CREATE: an existing relationship is left alone.
              cql = (
                  "MATCH (p:{head_type}), (q:{tail_type}) "
                  "WHERE p.name = $head AND q.name = $tail "
                  "MERGE (p)-[r:{relation}]->(q)"
              ).format(
                  head_type=self._quote_identifier(head_type),
                  tail_type=self._quote_identifier(tail_type),
                  relation=self._quote_identifier(relation),
              )
              params = {'head': self._clean_name(head), 'tail': self._clean_name(tail)}
              try:
                  self.graph.run(cql, params)
              except Exception as e:
                  print(e)
                  print(cql, params)

      def set_attributes(self, entity_infos, etype, start=0):
          """Set every attribute of each entity dict on its matching node.

          Args:
              entity_infos: Dicts with a ``name`` key plus attribute keys. The
                  dicts are not modified.
              etype: Label of the nodes to update.
              start: Index of the first entry to process; use it to resume an
                  interrupted run. Defaults to 0 (process everything).

          Failures are printed and the loop continues with the next attribute.
          """
          print("写入{0}实体的属性".format(etype))
          label = self._quote_identifier(etype)
          for e_dict in tqdm(entity_infos[start:], ncols=80):
              name = self._clean_name(e_dict['name'])
              for k, v in e_dict.items():
                  if k == 'name':
                      continue
                  if isinstance(v, str):
                      # String values are stored without single quotes and
                      # newlines; list values are passed through unchanged.
                      v = v.replace("'", "").replace("\n", "")
                  cql = "MATCH (n:{label}) WHERE n.name = $name SET n.{key} = $value".format(
                      label=label, key=self._quote_identifier(k))
                  params = {'name': name, 'value': v}
                  try:
                      self.graph.run(cql, params)
                  except Exception as e:
                      print(e)
                      print(cql, params)

      def create_entitys(self):
          """Write all collected entity lists as nodes."""
          self.write_nodes(self.drugs, '药品')
          self.write_nodes(self.recipes, '菜谱')
          self.write_nodes(self.foods, '食物')
          self.write_nodes(self.checks, '检查')
          self.write_nodes(self.departments, '科室')
          self.write_nodes(self.producers, '药企')
          self.write_nodes(self.diseases, '疾病')
          self.write_nodes(self.symptoms, '症状')

      def create_relations(self):
          """Write all collected relation lists as relationships."""
          self.write_edges(self.rels_department, '科室', '科室')
          self.write_edges(self.rels_noteat, '疾病', '食物')
          self.write_edges(self.rels_doeat, '疾病', '食物')
          self.write_edges(self.rels_recommandeat, '疾病', '菜谱')
          self.write_edges(self.rels_commonddrug, '疾病', '药品')
          self.write_edges(self.rels_recommanddrug, '疾病', '药品')
          self.write_edges(self.rels_check, '疾病', '检查')
          self.write_edges(self.rels_drug_producer, '药企', '药品')
          self.write_edges(self.rels_symptom, '疾病', '症状')
          self.write_edges(self.rels_acompany, '疾病', '疾病')
          self.write_edges(self.rels_category, '疾病', '科室')

      def set_diseases_attributes(self):
          """Write the disease attributes in a non-daemon background thread.

          Returns:
              The started ``threading.Thread``, so callers may ``join()`` it.
          """
          t = threading.Thread(
              target=self.set_attributes,
              args=(self.disease_infos, "疾病"),
              daemon=False,
          )
          t.start()
          return t

      def export_data(self, data, path):
          """Dump ``data`` to ``path`` as indented UTF-8 JSON.

          A list of strings is de-duplicated, stripped of leading/trailing
          '.' characters and sorted first; any other data is written as is.
          The target directory is created when missing.
          """
          if data and isinstance(data[0], str):
              # Strip before de-duplicating so "x" and "x..." collapse to one entry.
              data = sorted({d.strip(".") for d in data})
          directory = os.path.dirname(path)
          if directory:
              os.makedirs(directory, exist_ok=True)
          with codecs.open(path, 'w', encoding='utf-8') as f:
              json.dump(data, f, indent=4, ensure_ascii=False)

      def export_entitys_relations(self, out_dir='./graph_data'):
          """Export every entity and relation list to ``<out_dir>/<name>.json``."""
          for name in self._EXPORT_NAMES:
              self.export_data(getattr(self, name), '{0}/{1}.json'.format(out_dir, name))
  if __name__ == '__main__':
      # Script entry point: read the medical dump, collect entities and
      # relations from it, then export them as JSON files.
      # (print_data_info(medical_json) can be called here to preview records.)
      medical_json = "./graph_data/medical.json"
      kg_extractor = MedicalExtractor()
      kg_extractor.extract_triples(medical_json)
      kg_extractor.export_entitys_relations()

代码来源:@每天都要机器学习

2-neo4j数据库安装以及导入数据_哔哩哔哩_bilibili

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/761078
推荐阅读
相关标签
  

闽ICP备14008679号