当前位置:   article > 正文

刘焕勇QABasedOnMedicaKnowledgeGraph项目全过程

刘焕勇

原项目地址:https://github.com/liuhuanyong/QASystemOnMedicalKG

实体规模4.4万,实体关系规模30万。

一、首先安装

MongoDB-Windows-x86_64

mongodb-compass-1.36.1-win32-x64(安装的时候注意安装路径最好不要出现中文,不然会很麻烦)

neo4j-community-4.2.4-windows(这里我是去找了4开头的历史版本)

jdk-11.0.18_windows-x64_bin

运行项目的整个过程中,需要保持MongoDB和neo4j服务一直处于开启状态。

  1. 连接MongoDB

(1)打开cmd,进入E:\mongodb\bin目录下,输入命令“mongod --dbpath E:\mongodb\data”即可开启MongoDB服务。

(2)浏览器进入http://127.0.0.1:27017,显示“It looks like you are trying to access MongoDB over HTTP on the native driver port.”则表示连接成功。

  2. 连接neo4j

(1)进入E:\知识图谱\neo4j\neo4j-community-4.2.4\bin,输入neo4j.bat console。

(2)浏览器进入http://localhost:7474/browser/

二、数据处理部分

1.data_spider.py 爬取数据(数据的原网站是寻医问药网的疾病百科)

  1. import urllib.request
  2. import urllib.parse
  3. from lxml import etree
  4. import pymongo
  5. import re
  6. class CrimeSpider:
  7. def __init__(self):
  8. self.conn = pymongo.MongoClient()#创建数据库连接
  9. self.db = self.conn['medical']#获取数据库‘medical'
  10. self.col = self.db['data']#获取文件'data'
  11. #根据url,请求html
  12. def get_html(self, url):
  13. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
  14. 'Chrome/51.0.2704.63 Safari/537.36'}#UA使得服务器能够识别客户使用的操作系统及版本、CPU类型、浏览器及版本等,这里是要程序伪装成浏览器进行访问
  15. req = urllib.request.Request(url=url, headers=headers)#发起带header头的请求
  16. res = urllib.request.urlopen(req)#返回一个object对象
  17. html = res.read().decode('gbk')#想要获取内容还需要进一步的解析
  18. return html
  19. def url_parser(self, content):
  20. selector = etree.HTML(content)#将content转化为能被xpath匹配的格式
  21. urls = ['http://jib.xywy.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]#抓取所有病例的页面
  22. return urls
  23. #将内容写入数据库
  24. def spider_main(self):
  25. for page in range(1, 11000):
  26. try:
  27. basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm'%page#疾病概述页面
  28. cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm'%page#病因页面
  29. prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm'%page#预防页面
  30. symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm'%page#症状页面
  31. inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm'%page#检查方法页面
  32. treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm'%page#治疗页面
  33. food_url = 'http://jib.xywy.com/il_sii/food/%s.htm'%page#食物页面
  34. drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm'%page#药品页面
  35. data = {}#将以下数据封装成字典
  36. data['url'] = basic_url#基本网址
  37. data['basic_info'] = self.basicinfo_spider(basic_url)#疾病基本信息。字典。'basic_info':{'category':疾病分类,'name':疾病名称,'desc':疾病简介,'attributes':[基本知识,治疗常识,温馨提示]}
  38. data['cause_info'] = self.common_spider(cause_url)#病因。list。'cause_info':[第一段文字,第二段文字,...]
  39. data['prevent_info'] = self.common_spider(prevent_url)#预防。list。'prevent_info':[第一段文字,第二段文字,...]
  40. data['symptom_info'] = self.symptom_spider(symptom_url)#症状(应该是并发症)。元组。'symptom_info':(症状信息,[第一段症状,第二段症状,...])
  41. data['inspect_info'] = self.inspect_spider(inspect_url)#检查方法的网址。list。'inspect_info':[检查方法网址,.....]
  42. data['treat_info'] = self.treat_spider(treat_url)#治疗概述。list。 'treat_info':[就诊科室,治疗方式,治疗周期,治愈率,常用药品]
  43. data['food_info'] = self.food_spider(food_url)#食物,字典。'food_info':{'good':宜吃食物, 'bad':忌食物, 'recommand':宜食物推荐食物}
  44. data['drug_info'] = self.drug_spider(drug_url)#药品名称。str。'drug_info':药品名称
  45. print(page, basic_url)#打印出
  46. self.col.insert(data)#将上述数据封装成的字典,写入数据库
  47. except Exception as e:
  48. print(e, page)
  49. return
  50. #基本信息解析
  51. def basicinfo_spider(self, url):
  52. html = self.get_html(url)
  53. selector = etree.HTML(html)
  54. title = selector.xpath('//title/text()')[0]#网页的标题,使用正则表达式提取内容中的某部分
  55. category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')#分类
  56. desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')#简介
  57. ps = selector.xpath('//div[@class="mt20 articl-know"]/p')#[基本知识,治疗常识,温馨提示]
  58. infobox = []
  59. for p in ps:
  60. info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')#清理换行符等内容
  61. infobox.append(info)
  62. basic_data = {}
  63. basic_data['category'] = category
  64. basic_data['name'] = title.split('的简介')[0]
  65. basic_data['desc'] = desc
  66. basic_data['attributes'] = infobox
  67. return basic_data
  68. #treat_infobox治疗解析
  69. def treat_spider(self, url):
  70. html = self.get_html(url)
  71. selector = etree.HTML(html)
  72. ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
  73. infobox = []
  74. for p in ps:
  75. info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
  76. infobox.append(info)
  77. return infobox
  78. #treat_infobox治疗解析
  79. def drug_spider(self, url):
  80. html = self.get_html(url)
  81. selector = etree.HTML(html)
  82. drugs = [i.replace('\n','').replace('\t', '').replace(' ','') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
  83. return drugs
  84. #food治疗解析
  85. def food_spider(self, url):
  86. html = self.get_html(url)
  87. selector = etree.HTML(html)
  88. divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
  89. try:#异常处理
  90. food_data = {}
  91. food_data['good'] = divs[0].xpath('./div/p/text()')
  92. food_data['bad'] = divs[1].xpath('./div/p/text()')
  93. food_data['recommand'] = divs[2].xpath('./div/p/text()')
  94. except:
  95. return {}
  96. return food_data
  97. #症状信息解析
  98. def symptom_spider(self, url):
  99. html = self.get_html(url)
  100. selector = etree.HTML(html)
  101. symptoms = selector.xpath('//a[@class="gre" ]/text()')
  102. ps = selector.xpath('//p')
  103. detail = []
  104. for p in ps:
  105. info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
  106. detail.append(info)
  107. symptoms_data = {}
  108. symptoms_data['symptoms'] = symptoms
  109. symptoms_data['symptoms_detail'] = detail
  110. return symptoms, detail
  111. #检查信息解析
  112. def inspect_spider(self, url):
  113. html = self.get_html(url)
  114. selector = etree.HTML(html)
  115. inspects = selector.xpath('//li[@class="check-item"]/a/@href')
  116. return inspects
  117. #通用解析模块
  118. def common_spider(self, url):
  119. html = self.get_html(url)
  120. selector = etree.HTML(html)
  121. ps = selector.xpath('//p')
  122. infobox = []
  123. for p in ps:
  124. info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ','').replace('\t', '')
  125. if info:
  126. infobox.append(info)
  127. return '\n'.join(infobox)
  128. #检查项抓取模块
  129. def inspect_crawl(self):
  130. for page in range(1, 3685):
  131. try:
  132. url = 'http://jck.xywy.com/jc_%s.html'%page
  133. html = self.get_html(url)
  134. data = {}#检查的字典数据
  135. data['url']= url#检查项目网页的网址
  136. data['html'] = html#检查项目网页的源码
  137. self.db['jc'].insert(data)#放到jc这个数据库里
  138. print(url)
  139. except Exception as e:
  140. print(e)
  141. handler = CrimeSpider()
  142. handler.inspect_crawl()
  143. handler.spider_main()

2.max_cut.py 基于词典的最大前向/后向匹配。

  1. class CutWords:
  2. def __init__(self):
  3. dict_path = './disease.txt'#加载保存疾病名称的文档
  4. self.word_dict, self.max_wordlen = self.load_words(dict_path)
  5. #加载词典
  6. def load_words(self, dict_path):
  7. words = list()#list()方法用于将元组或字符串转换为列表,所以这里是想要构建一个空列表。然后定义最大字长变量,在字典遍历的过程中,不断将疾病名称保存到words中,同时更新最大字长。
  8. max_len = 0
  9. for line in open(dict_path):
  10. wd = line.strip()
  11. if not wd:
  12. continue
  13. if len(wd) > max_len:
  14. max_len = len(wd)
  15. words.append(wd)
  16. return words, max_len
  17. #最大向前匹配
  18. def max_forward_cut(self, sent):
  19. # 1.从左向右取待切分汉语句的m个字符作为匹配字段,m为大机器词典中最长词条个数。
  20. # 2.查找大机器词典并进行匹配。若匹配成功,则将这个匹配字段作为一个词切分出来。如果没有则字长减一继续匹配,直到匹配到为止,如果到最后也没有匹配成功的话,就返回该字段第一个值。
  21. cutlist = []
  22. index = 0
  23. while index < len(sent):
  24. matched = False
  25. for i in range(self.max_wordlen, 0, -1):
  26. cand_word = sent[index: index + i]
  27. if cand_word in self.word_dict:
  28. cutlist.append(cand_word)
  29. matched = True
  30. break
  31. # 如果没有匹配上,则按字符切分
  32. if not matched:
  33. i = 1
  34. cutlist.append(sent[index])
  35. index += i
  36. return cutlist
  37. # 最大向后匹配
  38. def max_backward_cut(self, sent):
  39. # 1.从右向左取待切分汉语句的m个字符作为匹配字段,m为大机器词典中最长词条个数。
  40. # 2.查找大机器词典并进行匹配。若匹配成功,则将这个匹配字段作为一个词切分出来。和前面的最大向前匹配类似,只不过在该函数中如果最后匹配失败则返回字段最后一个值。
  41. cutlist = []
  42. index = len(sent)
  43. max_wordlen = 5
  44. while index > 0:
  45. matched = False
  46. for i in range(self.max_wordlen, 0, -1):
  47. tmp = (i + 1)
  48. cand_word = sent[index - tmp: index]
  49. # 如果匹配上,则将字典中的字符加入到切分字符中
  50. if cand_word in self.word_dict:
  51. cutlist.append(cand_word)
  52. matched = True
  53. break
  54. # 如果没有匹配上,则按字符切分
  55. if not matched:
  56. tmp = 1
  57. cutlist.append(sent[index - 1])
  58. index -= tmp
  59. return cutlist[::-1]
  60. # 双向最大向前匹配
  61. def max_biward_cut(self, sent):
  62. # 双向最大匹配法是将正向最大匹配法得到的分词结果和逆向最大匹配法的到的结果进行比较,从而决定正确的分词方法。
  63. # 启发式规则:
  64. # 1.如果正反向分词结果词数不同,则取分词数量较少的那个。
  65. # 2.如果分词结果词数相同 a.分词结果相同,就说明没有歧义,可返回任意一个。 b.分词结果不同,返回其中单字较少的那个。
  66. forward_cutlist = self.max_forward_cut(sent)
  67. backward_cutlist = self.max_backward_cut(sent)
  68. count_forward = len(forward_cutlist)
  69. count_backward = len(backward_cutlist)
  70. def compute_single(word_list):
  71. num = 0
  72. for word in word_list:
  73. if len(word) == 1:
  74. num += 1
  75. return num
  76. if count_forward == count_backward:
  77. if compute_single(forward_cutlist) > compute_single(backward_cutlist):
  78. return backward_cutlist
  79. else:
  80. return forward_cutlist
  81. elif count_backward > count_forward:
  82. return forward_cutlist
  83. else:
  84. return backward_cutlist

3.build_data.py 将爬虫爬取的数据进行规整

  1. import pymongo
  2. from lxml import etree
  3. import os
  4. from max_cut import *
  5. class MedicalGraph:
  6. def __init__(self):
  7. self.conn = pymongo.MongoClient()#建立无用户密码连接
  8. cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])#当前文件夹地址
  9. self.db = self.conn['medical']
  10. self.col = self.db['data']
  11. first_words = [i.strip() for i in open(os.path.join(cur_dir, 'first_name.txt'))]
  12. alphabets = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y', 'z']
  13. nums = ['1','2','3','4','5','6','7','8','9','0']
  14. self.stop_words = first_words + alphabets + nums
  15. self.key_dict = {
  16. '医保疾病' : 'yibao_status',
  17. "患病比例" : "get_prob",
  18. "易感人群" : "easy_get",
  19. "传染方式" : "get_way",
  20. "就诊科室" : "cure_department",
  21. "治疗方式" : "cure_way",
  22. "治疗周期" : "cure_lasttime",
  23. "治愈率" : "cured_prob",
  24. '药品明细': 'drug_detail',
  25. '药品推荐': 'recommand_drug',
  26. '推荐': 'recommand_eat',
  27. '忌食': 'not_eat',
  28. '宜食': 'do_eat',
  29. '症状': 'symptom',
  30. '检查': 'check',
  31. '成因': 'cause',
  32. '预防措施': 'prevent',
  33. '所属类别': 'category',
  34. '简介': 'desc',
  35. '名称': 'name',
  36. '常用药品' : 'common_drug',
  37. '治疗费用': 'cost_money',
  38. '并发症': 'acompany'
  39. }
  40. self.cuter = CutWords()#创建最大前向/后向匹配类的实例
  41. def collect_medical(self):
  42. cates = []
  43. inspects = []
  44. count = 0
  45. for item in self.col.find():
  46. data = {}
  47. basic_info = item['basic_info']#遍历
  48. name = basic_info['name']
  49. if not name:
  50. continue
  51. # 基本信息
  52. data['名称'] = name
  53. data['简介'] = '\n'.join(basic_info['desc']).replace('\r\n\t', '').replace('\r\n\n\n','').replace(' ','').replace('\r\n','\n')
  54. category = basic_info['category']
  55. data['所属类别'] = category
  56. cates += category
  57. inspect = item['inspect_info']
  58. inspects += inspect
  59. attributes = basic_info['attributes']
  60. # 成因及预防
  61. data['预防措施'] = item['prevent_info']
  62. data['成因'] = item['cause_info']
  63. # 并发症
  64. data['症状'] = list(set([i for i in item["symptom_info"][0] if i[0] not in self.stop_words]))
  65. for attr in attributes:
  66. attr_pair = attr.split(':')
  67. if len(attr_pair) == 2:
  68. key = attr_pair[0]
  69. value = attr_pair[1]
  70. data[key] = value
  71. # 检查
  72. inspects = item['inspect_info']
  73. jcs = []
  74. for inspect in inspects:
  75. jc_name = self.get_inspect(inspect)
  76. if jc_name:
  77. jcs.append(jc_name)
  78. data['检查'] = jcs
  79. # 食物
  80. food_info = item['food_info']
  81. if food_info:
  82. data['宜食'] = food_info['good']
  83. data['忌食'] = food_info['bad']
  84. data['推荐'] = food_info['recommand']
  85. # 药品
  86. drug_info = item['drug_info']
  87. data['药品推荐'] = list(set([i.split('(')[-1].replace(')','') for i in drug_info]))
  88. data['药品明细'] = drug_info
  89. data_modify = {}
  90. for attr, value in data.items():
  91. attr_en = self.key_dict.get(attr)#'name'
  92. if attr_en:
  93. data_modify[attr_en] = value#把内容放到data里面,如 'name':白百咳
  94. if attr_en in ['yibao_status', 'get_prob', 'easy_get', 'get_way', "cure_lasttime", "cured_prob"]:
  95. data_modify[attr_en] = value.replace(' ','').replace('\t','')
  96. elif attr_en in ['cure_department', 'cure_way', 'common_drug']:
  97. data_modify[attr_en] = [i for i in value.split(' ') if i]
  98. elif attr_en in ['acompany']:
  99. acompany = [i for i in self.cuter.max_biward_cut(data_modify[attr_en]) if len(i) > 1]
  100. data_modify[attr_en] = acompany
  101. try:
  102. self.db['medical'].insert(data_modify)#插入字典
  103. count += 1
  104. print(count)
  105. except Exception as e:
  106. print(e)
  107. return
  108. def get_inspect(self, url):
  109. res = self.db['jc'].find_one({'url':url})
  110. if not res:
  111. return ''
  112. else:
  113. return res['name']
  114. def modify_jc(self):
  115. for item in self.db['jc'].find():
  116. url = item['url']
  117. content = item['html']
  118. selector = etree.HTML(content)
  119. name = selector.xpath('//title/text()')[0].split('结果分析')[0]
  120. desc = selector.xpath('//meta[@name="description"]/@content')[0].replace('\r\n\t','')
  121. self.db['jc'].update({'url':url}, {'$set':{'name':name, 'desc':desc}})
  122. if __name__ == '__main__':
  123. handler = MedicalGraph()
  124. handler.modify_jc()
  125. handler.collect_medical()
三、构建知识图谱

  1. 运行build_medicalgraph.py 将结构化JSON数据导入neo4j

在PyCharm终端安装py2neo:pip install py2neo==4.3.0 -i https://pypi.douban.com/simple

(1)知识图谱实体类型

(2)知识图谱实体关系类型

(3)知识图谱属性类型

(4)支持问答的类型

  1. import os
  2. import json
  3. from py2neo import Graph,Node
  4. class MedicalGraph:
  5. def __init__(self):
  6. cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])
  7. self.data_path = os.path.join(cur_dir, 'medical.json')#json地址
  8. self.g = Graph('http://localhost:7474', auth = ('neo4j', '072766'))#连接数据库
  9. #读取文件
  10. def read_nodes(self):
  11. # 共7类节点
  12. drugs = [] # 药品
  13. foods = [] # 食物
  14. checks = [] # 检查
  15. departments = [] # 科室
  16. producers = [] # 药品大类
  17. diseases = [] # 疾病
  18. symptoms = [] # 症状
  19. disease_infos = []##疾病信息,里面是字典格式[{'name':...,'desc':...,...,...},{}],里面是所有的实体节点
  20. # 构建节点实体关系
  21. rels_department = [] # 科室-科室关系
  22. rels_noteat = [] # 疾病-忌吃食物关系
  23. rels_doeat = [] # 疾病-宜吃食物关系
  24. rels_recommandeat = [] # 疾病-推荐吃食物关系
  25. rels_commonddrug = [] # 疾病-通用药品关系
  26. rels_recommanddrug = [] # 疾病-热门药品关系
  27. rels_check = [] # 疾病-检查关系
  28. rels_drug_producer = [] # 厂商-药物关系
  29. rels_symptom = [] # 疾病症状关系
  30. rels_acompany = [] # 疾病并发关系
  31. rels_category = [] # 疾病与科室之间的关系
  32. count = 0
  33. for data in open(self.data_path,encoding='utf-8'):
  34. disease_dict = {}
  35. count += 1
  36. print(count)
  37. data_json = json.loads(data)
  38. disease = data_json['name']
  39. disease_dict['name'] = disease
  40. diseases.append(disease)
  41. disease_dict['desc'] = ''
  42. disease_dict['prevent'] = ''
  43. disease_dict['cause'] = ''
  44. disease_dict['easy_get'] = ''
  45. disease_dict['cure_department'] = ''
  46. disease_dict['cure_way'] = ''
  47. disease_dict['cure_lasttime'] = ''
  48. disease_dict['symptom'] = ''
  49. disease_dict['cured_prob'] = ''
  50. if 'symptom' in data_json:
  51. symptoms += data_json['symptom']
  52. for symptom in data_json['symptom']:
  53. rels_symptom.append([disease, symptom])
  54. if 'acompany' in data_json:
  55. for acompany in data_json['acompany']:
  56. rels_acompany.append([disease, acompany])
  57. if 'desc' in data_json:
  58. disease_dict['desc'] = data_json['desc']
  59. if 'prevent' in data_json:
  60. disease_dict['prevent'] = data_json['prevent']
  61. if 'cause' in data_json:
  62. disease_dict['cause'] = data_json['cause']
  63. if 'get_prob' in data_json:
  64. disease_dict['get_prob'] = data_json['get_prob']
  65. if 'easy_get' in data_json:
  66. disease_dict['easy_get'] = data_json['easy_get']
  67. if 'cure_department' in data_json:
  68. cure_department = data_json['cure_department']
  69. if len(cure_department) == 1:
  70. rels_category.append([disease, cure_department[0]])
  71. if len(cure_department) == 2:#如果有两个科室,则会有一个大科室下面包含一个小科室
  72. big = cure_department[0]
  73. small = cure_department[1]
  74. rels_department.append([small, big])#将大科室与小科室关联
  75. rels_category.append([disease, small])#再将该疾病与小科室关联
  76. disease_dict['cure_department'] = cure_department#然后将其保存到disease_dict字典和departments列表中
  77. departments += cure_department
  78. if 'cure_way' in data_json:
  79. disease_dict['cure_way'] = data_json['cure_way']
  80. if 'cure_lasttime' in data_json:
  81. disease_dict['cure_lasttime'] = data_json['cure_lasttime']
  82. if 'cured_prob' in data_json:
  83. disease_dict['cured_prob'] = data_json['cured_prob']
  84. if 'common_drug' in data_json:
  85. common_drug = data_json['common_drug']
  86. for drug in common_drug:
  87. rels_commonddrug.append([disease, drug])
  88. drugs += common_drug
  89. if 'recommand_drug' in data_json:
  90. recommand_drug = data_json['recommand_drug']
  91. drugs += recommand_drug
  92. for drug in recommand_drug:
  93. rels_recommanddrug.append([disease, drug])
  94. if 'not_eat' in data_json:
  95. not_eat = data_json['not_eat']
  96. for _not in not_eat:
  97. rels_noteat.append([disease, _not])
  98. foods += not_eat
  99. do_eat = data_json['do_eat']
  100. for _do in do_eat:
  101. rels_doeat.append([disease, _do])
  102. foods += do_eat
  103. recommand_eat = data_json['recommand_eat']
  104. for _recommand in recommand_eat:
  105. rels_recommandeat.append([disease, _recommand])
  106. foods += recommand_eat
  107. if 'check' in data_json:
  108. check = data_json['check']
  109. for _check in check:
  110. rels_check.append([disease, _check])
  111. checks += check
  112. if 'drug_detail' in data_json:
  113. drug_detail = data_json['drug_detail']
  114. producer = [i.split('(')[0] for i in drug_detail]
  115. rels_drug_producer += [[i.split('(')[0], i.split('(')[-1].replace(')', '')] for i in drug_detail]
  116. producers += producer
  117. disease_infos.append(disease_dict)#最后将建立好的该疾病字典保存到disease_infos列表当中
  118. return set(drugs), set(foods), set(checks), set(departments), set(producers), set(symptoms), set(diseases), disease_infos,\
  119. rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug,\
  120. rels_symptom, rels_acompany, rels_category#将该函数中保存好的各项列表值作为返回值返回。
  121. #建立节点
  122. def create_node(self, label, nodes):
  123. count = 0
  124. for node_name in nodes:#新建节点,每个节点有一个name属性
  125. node = Node(label, name=node_name)
  126. self.g.create(node)#新建数据库节点
  127. count += 1
  128. print(count, len(nodes))
  129. return
  130. #创建知识图谱中心疾病的节点
  131. def create_diseases_nodes(self, disease_infos):
  132. count = 0
  133. for disease_dict in disease_infos:
  134. node = Node("Disease", name=disease_dict['name'], desc=disease_dict['desc'],
  135. prevent=disease_dict['prevent'] ,cause=disease_dict['cause'],
  136. easy_get=disease_dict['easy_get'],cure_lasttime=disease_dict['cure_lasttime'],
  137. cure_department=disease_dict['cure_department']
  138. ,cure_way=disease_dict['cure_way'] , cured_prob=disease_dict['cured_prob'])
  139. self.g.create(node)
  140. count += 1
  141. print(count)
  142. return
  143. #创建知识图谱实体节点类型schema
  144. def create_graphnodes(self):
  145. Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos,rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug,rels_symptom, rels_acompany, rels_category = self.read_nodes()
  146. self.create_diseases_nodes(disease_infos)#为每一个疾病都创建了一个节点,每个节点有9个属性
  147. self.create_node('Drug', Drugs)#为每个药品创建一个节点,有一个属性name = 药品名称
  148. print(len(Drugs))
  149. self.create_node('Food', Foods)
  150. print(len(Foods))
  151. self.create_node('Check', Checks)
  152. print(len(Checks))
  153. self.create_node('Department', Departments)
  154. print(len(Departments))
  155. self.create_node('Producer', Producers)
  156. print(len(Producers))
  157. self.create_node('Symptom', Symptoms)
  158. return
  159. #创建实体关系边
  160. def create_graphrels(self):
  161. Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug,rels_symptom, rels_acompany, rels_category = self.read_nodes()
  162. self.create_relationship('Disease', 'Food', rels_recommandeat, 'recommand_eat', '推荐食谱')
  163. self.create_relationship('Disease', 'Food', rels_noteat, 'no_eat', '忌吃')
  164. self.create_relationship('Disease', 'Food', rels_doeat, 'do_eat', '宜吃')
  165. self.create_relationship('Department', 'Department', rels_department, 'belongs_to', '属于')
  166. self.create_relationship('Disease', 'Drug', rels_commonddrug, 'common_drug', '常用药品')
  167. self.create_relationship('Producer', 'Drug', rels_drug_producer, 'drugs_of', '生产药品')
  168. self.create_relationship('Disease', 'Drug', rels_recommanddrug, 'recommand_drug', '好评药品')
  169. self.create_relationship('Disease', 'Check', rels_check, 'need_check', '诊断检查')
  170. self.create_relationship('Disease', 'Symptom', rels_symptom, 'has_symptom', '症状')
  171. self.create_relationship('Disease', 'Disease', rels_acompany, 'acompany_with', '并发症')
  172. self.create_relationship('Disease', 'Department', rels_category, 'belongs_to', '所属科室')
  173. #创建实体关联边
  174. def create_relationship(self, start_node, end_node, edges, rel_type, rel_name):
  175. # start_node是边的起点名称, end_node是边的终点如'Food', edges是边的关系(实际关联项) ,p是起始节点的名称,q是到达节点的名称,rel_type是关系的类型,具体字段如‘do_eat’;rel_name是关系的名称,具体字段如‘宜吃’
  176. count = 0
  177. # 去重处理,实体关系为形如[[“a”,“b”],[“c”,“d”]]的嵌套list,无法直接用set去重,所以先将嵌套内层的list转为字符串,再用set
  178. set_edges = []
  179. for edge in edges:
  180. set_edges.append('###'.join(edge))#这就转化为字符串了
  181. all = len(set(set_edges))
  182. for edge in set(set_edges):#去重
  183. edge = edge.split('###')
  184. p = edge[0]
  185. q = edge[1]
  186. query = "match(p:%s),(q:%s) where p.name='%s'and q.name='%s' create (p)-[rel:%s{name:'%s'}]->(q)" % (
  187. start_node, end_node, p, q, rel_type, rel_name)
  188. try:
  189. self.g.run(query)#run()的用法大概是这样的,比如start_node, end_node这两种节点已经提前存进去了,比如label分别是Disease和Food,现在需要在他们间建立关系,括号里面是neo4j的查询语句cql,语法结构类似sql语句。关系就是rel_typ,还带了一个属性{name:'rel_name'}
  190. count += 1
  191. print(rel_type, count, all)
  192. except Exception as e:
  193. print(e)
  194. return
  195. #导出数据
  196. def export_data(self):
  197. Drugs, Foods, Checks, Departments, Producers, Symptoms, Diseases, disease_infos, rels_check, rels_recommandeat, rels_noteat, rels_doeat, rels_department, rels_commonddrug, rels_drug_producer, rels_recommanddrug, rels_symptom, rels_acompany, rels_category = self.read_nodes()
  198. f_drug = open('drug.txt', 'w+')
  199. f_food = open('food.txt', 'w+')
  200. f_check = open('check.txt', 'w+')
  201. f_department = open('department.txt', 'w+')
  202. f_producer = open('producer.txt', 'w+')
  203. f_symptom = open('symptoms.txt', 'w+')
  204. f_disease = open('disease.txt', 'w+')
  205. f_drug.write('\n'.join(list(Drugs)))
  206. f_food.write('\n'.join(list(Foods)))
  207. f_check.write('\n'.join(list(Checks)))
  208. f_department.write('\n'.join(list(Departments)))
  209. f_producer.write('\n'.join(list(Producers)))
  210. f_symptom.write('\n'.join(list(Symptoms)))
  211. f_disease.write('\n'.join(list(Diseases)))
  212. f_drug.close()
  213. f_food.close()
  214. f_check.close()
  215. f_department.close()
  216. f_producer.close()
  217. f_symptom.close()
  218. f_disease.close()
  219. return
  220. if __name__ == '__main__':
  221. handler = MedicalGraph()
  222. print("step1:导入图谱节点中")
  223. handler.create_graphnodes()
  224. print("step2:导入图谱边中")
  225. handler.create_graphrels()
四、问答部分
  1. question_classifier.py

  1. import os
  2. import ahocorasick#ahocorasick实现快速的关键字匹配
  3. class QuestionClassifier:
  4. def __init__(self):
  5. cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])
  6. # 特征词路径
  7. self.disease_path = os.path.join(cur_dir, 'disease.txt')
  8. self.department_path = os.path.join(cur_dir, 'department.txt')
  9. self.check_path = os.path.join(cur_dir, 'check.txt')
  10. self.drug_path = os.path.join(cur_dir, 'drug.txt')
  11. self.food_path = os.path.join(cur_dir, 'food.txt')
  12. self.producer_path = os.path.join(cur_dir, 'producer.txt')
  13. self.symptom_path = os.path.join(cur_dir, 'symptom.txt')
  14. self.deny_path = os.path.join(cur_dir, 'deny.txt')
  15. # 加载特征词
  16. self.disease_wds= [i.strip() for i in open(self.disease_path,encoding='utf-8') if i.strip()]
  17. self.department_wds= [i.strip() for i in open(self.department_path,encoding='utf-8') if i.strip()]
  18. self.check_wds= [i.strip() for i in open(self.check_path,encoding='utf-8') if i.strip()]
  19. self.drug_wds= [i.strip() for i in open(self.drug_path,encoding='utf-8') if i.strip()]
  20. self.food_wds= [i.strip() for i in open(self.food_path,encoding='utf-8') if i.strip()]
  21. self.producer_wds= [i.strip() for i in open(self.producer_path,encoding='utf-8') if i.strip()]
  22. self.symptom_wds= [i.strip() for i in open(self.symptom_path,encoding='utf-8') if i.strip()]
  23. self.region_words = set(self.department_wds + self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.producer_wds + self.symptom_wds)
  24. self.deny_words = [i.strip() for i in open(self.deny_path,encoding='utf-8') if i.strip()]
  25. # 构造领域actree
  26. self.region_tree = self.build_actree(list(self.region_words))
  27. # 构建词典
  28. self.wdtype_dict = self.build_wdtype_dict()#词的类型的字典,{'百日咳':'disease',....}
  29. # 问句疑问词
  30. self.symptom_qwds = ['症状', '表征', '现象', '症候', '表现']
  31. self.cause_qwds = ['原因','成因', '为什么', '怎么会', '怎样才', '咋样才', '怎样会', '如何会', '为啥', '为何', '如何才会', '怎么才会', '会导致', '会造成']
  32. self.acompany_qwds = ['并发症', '并发', '一起发生', '一并发生', '一起出现', '一并出现', '一同发生', '一同出现', '伴随发生', '伴随', '共现']
  33. self.food_qwds = ['饮食', '饮用', '吃', '食', '伙食', '膳食', '喝', '菜' ,'忌口', '补品', '保健品', '食谱', '菜谱', '食用', '食物','补品']
  34. self.drug_qwds = ['药', '药品', '用药', '胶囊', '口服液', '炎片']
  35. self.prevent_qwds = ['预防', '防范', '抵制', '抵御', '防止','躲避','逃避','避开','免得','逃开','避开','避掉','躲开','躲掉','绕开',
  36. '怎样才能不', '怎么才能不', '咋样才能不','咋才能不', '如何才能不',
  37. '怎样才不', '怎么才不', '咋样才不','咋才不', '如何才不',
  38. '怎样才可以不', '怎么才可以不', '咋样才可以不', '咋才可以不', '如何可以不',
  39. '怎样才可不', '怎么才可不', '咋样才可不', '咋才可不', '如何可不']
  40. self.lasttime_qwds = ['周期', '多久', '多长时间', '多少时间', '几天', '几年', '多少天', '多少小时', '几个小时', '多少年']
  41. self.cureway_qwds = ['怎么治疗', '如何医治', '怎么医治', '怎么治', '怎么医', '如何治', '医治方式', '疗法', '咋治', '怎么办', '咋办', '咋治']
  42. self.cureprob_qwds = ['多大概率能治好', '多大几率能治好', '治好希望大么', '几率', '几成', '比例', '可能性', '能治', '可治', '可以治', '可以医']
  43. self.easyget_qwds = ['易感人群', '容易感染', '易发人群', '什么人', '哪些人', '感染', '染上', '得上']
  44. self.check_qwds = ['检查', '检查项目', '查出', '检查', '测出', '试出']
  45. self.belong_qwds = ['属于什么科', '属于', '什么科', '科室']
  46. self.cure_qwds = ['治疗什么', '治啥', '治疗啥', '医治啥', '治愈啥', '主治啥', '主治什么', '有什么用', '有何用', '用处', '用途',
  47. '有什么好处', '有什么益处', '有何益处', '用来', '用来做啥', '用来作甚', '需要', '要']
  48. print('model init finished ......')
  49. return
  50. #分类主函数
  51. def classify(self, question):
  52. data = {}
  53. medical_dict = self.check_medical(question)#从问句中匹配出的,{'百日咳':'disease',....}
  54. if not medical_dict:
  55. return {}
  56. data['args'] = medical_dict
  57. #收集问句当中所涉及到的实体类型
  58. types = []
  59. for type_ in medical_dict.values():
  60. types += type_
  61. question_type = 'others'
  62. question_types = []
  63. # 症状
  64. if self.check_words(self.symptom_qwds, question) and ('disease' in types):
  65. question_type = 'disease_symptom'#问句类型为:已知疾病询问症状
  66. question_types.append(question_type)
  67. if self.check_words(self.symptom_qwds, question) and ('symptom' in types):
  68. question_type = 'symptom_disease'#已知症状问疾病
  69. question_types.append(question_type)
  70. # 原因
  71. if self.check_words(self.cause_qwds, question) and ('disease' in types):
  72. question_type = 'disease_cause'
  73. question_types.append(question_type)
  74. # 并发症
  75. if self.check_words(self.acompany_qwds, question) and ('disease' in types):
  76. question_type = 'disease_acompany'
  77. question_types.append(question_type)
  78. # 推荐食品
  79. if self.check_words(self.food_qwds, question) and 'disease' in types:
  80. deny_status = self.check_words(self.deny_words, question)
  81. if deny_status:
  82. question_type = 'disease_not_food'
  83. else:
  84. question_type = 'disease_do_food'
  85. question_types.append(question_type)
  86. #已知食物找疾病
  87. if self.check_words(self.food_qwds+self.cure_qwds, question) and 'food' in types:
  88. deny_status = self.check_words(self.deny_words, question)
  89. if deny_status:
  90. question_type = 'food_not_disease'
  91. else:
  92. question_type = 'food_do_disease'
  93. question_types.append(question_type)
  94. # 推荐药品
  95. if self.check_words(self.drug_qwds, question) and 'disease' in types:
  96. question_type = 'disease_drug'
  97. question_types.append(question_type)
  98. # 药品治啥病
  99. if self.check_words(self.cure_qwds, question) and 'drug' in types:
  100. question_type = 'drug_disease'
  101. question_types.append(question_type)
  102. # 疾病接受检查项目
  103. if self.check_words(self.check_qwds, question) and 'disease' in types:
  104. question_type = 'disease_check'
  105. question_types.append(question_type)
  106. # 已知检查项目查相应疾病
  107. if self.check_words(self.check_qwds+self.cure_qwds, question) and 'check' in types:
  108. question_type = 'check_disease'
  109. question_types.append(question_type)
  110. # 症状防御
  111. if self.check_words(self.prevent_qwds, question) and 'disease' in types:
  112. question_type = 'disease_prevent'
  113. question_types.append(question_type)
  114. # 疾病医疗周期
  115. if self.check_words(self.lasttime_qwds, question) and 'disease' in types:
  116. question_type = 'disease_lasttime'
  117. question_types.append(question_type)
  118. # 疾病治疗方式
  119. if self.check_words(self.cureway_qwds, question) and 'disease' in types:
  120. question_type = 'disease_cureway'
  121. question_types.append(question_type)
  122. # 疾病治愈可能性
  123. if self.check_words(self.cureprob_qwds, question) and 'disease' in types:
  124. question_type = 'disease_cureprob'
  125. question_types.append(question_type)
  126. # 疾病易感染人群
  127. if self.check_words(self.easyget_qwds, question) and 'disease' in types :
  128. question_type = 'disease_easyget'
  129. question_types.append(question_type)
  130. # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回
  131. if question_types == [] and 'disease' in types:
  132. question_types = ['disease_desc']
  133. # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回
  134. if question_types == [] and 'symptom' in types:
  135. question_types = ['symptom_disease']
  136. # 将多个分类结果进行合并处理,组装成一个字典
  137. data['question_types'] = question_types
  138. return data
  139. #构造词对应的类型
  def build_wdtype_dict(self):
      """Map every known keyword to the entity types it belongs to.

      Returns:
          dict: ``{word: [type, ...]}`` with one entry per word in
          ``self.region_words``. A word may carry several types (or none);
          types are listed in the fixed order disease, department, check,
          drug, food, symptom, producer, e.g. ``{'百日咳': ['disease']}``.
      """
      # Snapshot each vocabulary as a set once, so the seven membership tests
      # per keyword are constant-time instead of scanning a whole sequence.
      typed_vocab = (
          ('disease', frozenset(self.disease_wds)),
          ('department', frozenset(self.department_wds)),
          ('check', frozenset(self.check_wds)),
          ('drug', frozenset(self.drug_wds)),
          ('food', frozenset(self.food_wds)),
          ('symptom', frozenset(self.symptom_wds)),
          ('producer', frozenset(self.producer_wds)),
      )
      return {
          wd: [label for label, vocab in typed_vocab if wd in vocab]
          for wd in self.region_words
      }
  159. #构造actree,加速过滤,输入list,输出一个AC树
  def build_actree(self, wordlist):
      """Compile ``wordlist`` into an Aho-Corasick automaton for keyword search.

      Every word is registered with the payload ``(position, word)``, where
      ``position`` is the word's index in ``wordlist``.

      Returns:
          The finalised ``ahocorasick.Automaton``.
      """
      automaton = ahocorasick.Automaton()
      for position, keyword in enumerate(wordlist):
          payload = (position, keyword)
          automaton.add_word(keyword, payload)
      # The automaton must be finalised before it can be used for searching.
      automaton.make_automaton()
      return automaton
  166. #问句过滤
  def check_medical(self, question):
      """Extract the medical keywords mentioned in ``question``.

      When one matched keyword is contained in another matched keyword, only
      the longer one is kept.

      Returns:
          dict: ``{keyword: types}`` where ``types`` is whatever
          ``self.wdtype_dict`` holds for that keyword (``None`` if absent).
      """
      # Each hit is a pair whose second element is the (index, word) payload
      # stored by build_actree; only the word itself is needed here.
      hits = [match[1][1] for match in self.region_tree.iter(question)]
      # Words that are a proper substring of some other hit are redundant.
      shadowed = {
          shorter
          for shorter in hits
          for longer in hits
          if shorter != longer and shorter in longer
      }
      kept = [wd for wd in hits if wd not in shadowed]
      return {wd: self.wdtype_dict.get(wd) for wd in kept}
  181. #基于特征词进行分类
  def check_words(self, wds, sent):
      """Return True when at least one feature word in ``wds`` occurs in ``sent``."""
      return any(feature in sent for feature in wds)
  if __name__ == '__main__':
      # Interactive loop: classify every question typed on stdin and print
      # the resulting classification dict.
      classifier = QuestionClassifier()
      while True:
          question = input('input an question:')
          print(classifier.classify(question))

2.question_parser.py

  class QuestionPaser:
      """Translate a classified question into Cypher queries for the medical graph."""

      # Question type -> entity type whose recognised names parameterise the query.
      _ANCHOR_ENTITY = {
          'disease_symptom': 'disease',
          'symptom_disease': 'symptom',
          'disease_cause': 'disease',
          'disease_acompany': 'disease',
          'disease_not_food': 'disease',
          'disease_do_food': 'disease',
          'food_not_disease': 'food',
          'food_do_disease': 'food',
          'disease_drug': 'disease',
          'drug_disease': 'drug',
          'disease_check': 'disease',
          'check_disease': 'check',
          'disease_prevent': 'disease',
          'disease_lasttime': 'disease',
          'disease_cureway': 'disease',
          'disease_cureprob': 'disease',
          'disease_easyget': 'disease',
          'disease_desc': 'disease',
      }

      # Question type -> property of the Disease node that answers it.
      _PROPERTY_QUERIES = {
          'disease_cause': 'cause',
          'disease_prevent': 'prevent',
          'disease_lasttime': 'cure_lasttime',
          'disease_cureprob': 'cured_prob',
          'disease_cureway': 'cure_way',
          'disease_easyget': 'easy_get',
          'disease_desc': 'desc',
      }

      # Question type -> one (relationship, target label, alias filtered by name)
      # triple per query to generate. Alias 'm' is the Disease side of the
      # relationship, alias 'n' is the target side.
      _RELATION_QUERIES = {
          'disease_symptom': (('has_symptom', 'Symptom', 'm'),),
          'symptom_disease': (('has_symptom', 'Symptom', 'n'),),
          'disease_acompany': (('acompany_with', 'Disease', 'm'),
                               ('acompany_with', 'Disease', 'n')),
          'disease_not_food': (('no_eat', 'Food', 'm'),),
          'disease_do_food': (('do_eat', 'Food', 'm'),
                              ('recommand_eat', 'Food', 'm')),
          'food_not_disease': (('no_eat', 'Food', 'n'),),
          'food_do_disease': (('do_eat', 'Food', 'n'),
                              ('recommand_eat', 'Food', 'n')),
          'disease_drug': (('common_drug', 'Drug', 'm'),
                           ('recommand_drug', 'Drug', 'm')),
          'drug_disease': (('common_drug', 'Drug', 'n'),
                           ('recommand_drug', 'Drug', 'n')),
          'disease_check': (('need_check', 'Check', 'm'),),
          'check_disease': (('need_check', 'Check', 'n'),),
      }

      _PROPERTY_TEMPLATE = (
          "MATCH (m:Disease) where m.name = '{name}' return m.name, m.{prop}"
      )
      _RELATION_TEMPLATE = (
          "MATCH (m:Disease)-[r:{rel}]->(n:{label}) "
          "where {alias}.name = '{name}' return m.name, r.name, n.name"
      )

      @staticmethod
      def _quote_safe(name):
          """Escape ``name`` so it can sit inside a single-quoted Cypher literal."""
          # Backslashes first, otherwise the escapes added for quotes get doubled.
          return str(name).replace('\\', '\\\\').replace("'", "\\'")

      def build_entitydict(self, args):
          """Group recognised entity names by entity type.

          Args:
              args: ``{name: [type, ...]}``, e.g. ``{'百日咳': ['disease']}``.
                  A name whose type list is ``None`` or empty is ignored.

          Returns:
              dict: ``{type: [name, ...]}``, e.g. ``{'disease': ['百日咳']}``.
          """
          entity_dict = {}
          for name, entity_types in args.items():
              for entity_type in entity_types or ():
                  entity_dict.setdefault(entity_type, []).append(name)
          return entity_dict

      def parser_main(self, res_classify):
          """Build the queries for every question type of a classified question.

          Args:
              res_classify: dict with ``'args'`` (``{name: [type, ...]}``) and
                  ``'question_types'`` (list of question-type strings).

          Returns:
              list: one ``{'question_type': ..., 'sql': [query, ...]}`` dict per
              question type that produced at least one query. Unknown question
              types, and types whose entity is missing, are skipped.
          """
          entity_dict = self.build_entitydict(res_classify['args'])
          sqls = []
          for question_type in res_classify['question_types']:
              anchor = self._ANCHOR_ENTITY.get(question_type)
              if anchor is None:
                  continue
              sql = self.sql_transfer(question_type, entity_dict.get(anchor))
              if sql:
                  sqls.append({'question_type': question_type, 'sql': sql})
          return sqls

      def sql_transfer(self, question_type, entities):
          """Render the Cypher queries for one question type.

          Args:
              question_type: question-type string such as ``'disease_do_food'``.
              entities: entity names to query for, e.g. ``['百日咳']``; may be
                  ``None`` or empty.

          Returns:
              list: query strings. For question types backed by several
              relationships, all queries of the first relationship come first,
              then all queries of the next one. Empty when there are no
              entities or the question type is unknown.
          """
          if not entities:
              return []
          names = [self._quote_safe(entity) for entity in entities]

          prop = self._PROPERTY_QUERIES.get(question_type)
          if prop is not None:
              return [
                  self._PROPERTY_TEMPLATE.format(name=name, prop=prop)
                  for name in names
              ]

          return [
              self._RELATION_TEMPLATE.format(
                  rel=rel, label=label, alias=alias, name=name)
              for rel, label, alias in self._RELATION_QUERIES.get(question_type, ())
              for name in names
          ]
  if __name__ == '__main__':
      # Running the module directly only checks that the parser can be built.
      parser = QuestionPaser()

3.answer_search.py

  1. from py2neo import Graph#python操作图数据库
  class AnswerSearcher:
      """Run generated Cypher queries against Neo4j and phrase the results."""

      # Question type -> (row key of the subject, row key of the listed values,
      # answer template). Types needing extra processing (disease_cureway,
      # disease_acompany, disease_do_food) are handled in answer_prettify.
      _TEMPLATES = {
          'disease_symptom': ('m.name', 'n.name', '{subject}的症状包括:{items}'),
          'symptom_disease': ('n.name', 'm.name', '症状{subject}可能染上的疾病有:{items}'),
          'disease_cause': ('m.name', 'm.cause', '{subject}可能的成因有:{items}'),
          'disease_prevent': ('m.name', 'm.prevent', '{subject}的预防措施包括:{items}'),
          'disease_lasttime': ('m.name', 'm.cure_lasttime', '{subject}治疗可能持续的周期为:{items}'),
          'disease_cureprob': ('m.name', 'm.cured_prob', '{subject}治愈的概率为(仅供参考):{items}'),
          'disease_easyget': ('m.name', 'm.easy_get', '{subject}的易感人群包括:{items}'),
          'disease_desc': ('m.name', 'm.desc', '{subject},熟悉一下:{items}'),
          'disease_not_food': ('m.name', 'n.name', '{subject}忌食的食物包括有:{items}'),
          'food_not_disease': ('n.name', 'm.name', '患有{items}的人最好不要吃{subject}'),
          'food_do_disease': ('n.name', 'm.name', '患有{items}的人建议多试试{subject}'),
          'disease_drug': ('m.name', 'n.name', '{subject}通常的使用的药品包括:{items}'),
          'drug_disease': ('n.name', 'm.name', '{subject}主治的疾病有{items},可以试试'),
          'disease_check': ('m.name', 'n.name', '{subject}通常可以通过以下方式检查出来:{items}'),
          'check_disease': ('n.name', 'm.name', '通常可以通过{subject}检查出来的疾病有{items}'),
      }

      def __init__(self, uri='http://localhost:7474', user='neo4j',
                   password='072766', num_limit=20):
          """Connect to Neo4j.

          Args:
              uri: address of the Neo4j server.
              user: Neo4j user name.
              password: Neo4j password. The default is the value the original
                  script hard-coded; pass the real one for other deployments.
              num_limit: maximum number of distinct values listed in one answer.
          """
          self.g = Graph(uri, auth=(user, password))
          self.num_limit = num_limit

      def search_main(self, sqls):
          """Execute the queries and return one answer sentence per question type.

          Args:
              sqls: list of ``{'question_type': str, 'sql': [query, ...]}`` dicts.

          Returns:
              list: non-empty answer strings, in the order of ``sqls``.
          """
          final_answers = []
          for sql_ in sqls:
              answers = []
              for query in sql_['sql']:
                  answers.extend(self.g.run(query).data())
              final_answer = self.answer_prettify(sql_['question_type'], answers)
              if final_answer:
                  final_answers.append(final_answer)
          return final_answers

      def _join_unique(self, values):
          """Join the distinct non-None ``values``, capped at ``self.num_limit``.

          First-seen order is kept so that the same query results always give
          the same answer text.
          """
          unique = list(dict.fromkeys(str(v) for v in values if v is not None))
          return ';'.join(unique[:self.num_limit])

      def answer_prettify(self, question_type, answers):
          """Turn query result rows into an answer sentence.

          Args:
              question_type: question-type string such as ``'disease_symptom'``.
              answers: list of result-row dicts keyed by the returned column
                  names (``'m.name'``, ``'n.name'``, ``'r.name'``, ``'m.cause'`` ...).

          Returns:
              str: the answer, or ``''`` when there are no rows, nothing to
              list, or the question type is unknown.
          """
          if not answers:
              return ''

          if question_type == 'disease_do_food':
              subject = answers[0]['m.name']
              do_items = self._join_unique(
                  row['n.name'] for row in answers if row['r.name'] == '宜吃')
              recommand_items = self._join_unique(
                  row['n.name'] for row in answers if row['r.name'] == '推荐食谱')
              if not do_items and not recommand_items:
                  return ''
              return '{0}宜食的食物包括有:{1}\n推荐食谱包括有:{2}'.format(
                  subject, do_items, recommand_items)

          if question_type == 'disease_acompany':
              subject = answers[0]['m.name']
              # The relation is queried in both directions, so the other
              # disease may be on either side of a row.
              related = [row['n.name'] for row in answers]
              related += [row['m.name'] for row in answers]
              values = [name for name in related if name != subject]
              template = '{subject}的并发症包括:{items}'
          elif question_type == 'disease_cureway':
              subject = answers[0]['m.name']
              # cure_way holds several entries per disease; flatten each row's
              # entries into one string first.
              values = [';'.join(row['m.cure_way']) for row in answers
                        if row['m.cure_way'] is not None]
              template = '{subject}可以尝试如下治疗:{items}'
          elif question_type in self._TEMPLATES:
              subject_key, value_key, template = self._TEMPLATES[question_type]
              subject = answers[0][subject_key]
              values = [row[value_key] for row in answers]
          else:
              return ''

          items = self._join_unique(values)
          if not items:
              return ''
          return template.format(subject=subject, items=items)
  101. if __name__ == '__main__':
  102. searcher = AnswerSearcher()

4.chatbot_graph.py

  1. from question_classifier import *
  2. from question_parser import *
  3. from answer_search import *
  4. #问答类
  class ChatBotGraph:
      """Question-answering pipeline: classify, build queries, search, reply."""

      def __init__(self):
          """Create the classifier, query parser and answer searcher."""
          self.classifier = QuestionClassifier()
          self.parser = QuestionPaser()
          self.searcher = AnswerSearcher()

      def chat_main(self, sent):
          """Answer the user sentence ``sent``.

          Returns:
              str: the answers joined by newlines, or a fixed greeting when the
              sentence cannot be classified or the search yields nothing.
          """
          fallback = '您好,我是小鱼医药智能助理,希望可以帮到您。祝您身体棒棒!'
          classified = self.classifier.classify(sent)
          if not classified:
              # No usable classification result for this sentence.
              return fallback
          queries = self.parser.parser_main(classified)
          replies = self.searcher.search_main(queries)
          return '\n'.join(replies) if replies else fallback
  if __name__ == '__main__':
      # Console chat loop: read a question, print the bot's reply, repeat.
      bot = ChatBotGraph()
      while True:
          print('小鱼:', bot.chat_main(input('用户:')))

实现结果:

五、参考博客

知识图谱的节点和关系实现(python)_python 知识图谱_chen_nnn的博客-CSDN博客

菜哥学知识图谱(通过“基于医疗知识图谱的问答系统”)(四)(代码分析2)_weixin_40539807的博客-CSDN博客

基于医疗知识图谱的问答系统源码详解_vivian_ll的博客-CSDN博客

本项目还有不足:关于疾病的起因、预防等,实际返回的是一大段文字,这里其实可以引入事件抽取的概念,进一步将原因结构化表示出来。这个可以后面进行尝试。

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/IT小白/article/detail/526803
推荐阅读
相关标签
  

闽ICP备14008679号