当前位置:   article > 正文

知识图谱实战

知识图谱实战

一、知识图谱简单介绍

 

 

 

 

 

二、知识图谱的构建

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 三、知识图谱问答方案

NL2SQL:自然语言转为SQL语句

 

 

build_graph.py

  1. """知识图谱"""
  2. #三元组:实体-关系-实体 实体-属性-属性值
  3. import re,json
  4. from py2neo import Graph
  5. from collections import defaultdict
  # Read the triples and write the data into Neo4j.

  # Handle to the graph database.
  graph = Graph("http://localhost:7474", auth=("neo4j", "Zmj123456!"))

  # entity -> {attribute: value} and head entity -> {relation: tail entity}.
  attribute_data, relation_data = defaultdict(dict), defaultdict(dict)
  # cleaned entity name -> label; filled in by get_label_then_clean.
  label_data = dict()
  # Some entity names carry a bracketed suffix whose content can be used as a
  # label. Once the label has been extracted, the bracketed part is removed.
  # The brackets are matched literally (ASCII or full-width); an unescaped
  # "(.+)" would be a regex group that matches the whole string.
  _BRACKET_PATTERN = re.compile(r"[((].+[))]")
  _KNOWN_LABELS = ("歌曲", "专辑", "电影", "电视剧")


  def get_label_then_clean(x, label_data):
      """Strip a bracketed suffix from an entity name and record its label.

      Args:
          x: Raw entity name, possibly containing a bracketed part such as
              "name(label text)" with ASCII or full-width brackets.
          label_data: Dict updated in place; maps the cleaned entity name to
              the known label found inside the brackets.

      Returns:
          The entity name with the bracketed part removed. Names without
          brackets are returned unchanged.
      """
      match = _BRACKET_PATTERN.search(x)
      if match is None:
          return x
      bracket_text = match.group()
      # The bracketed part is always dropped: brackets are special characters
      # that would break the generated Cypher statements.
      x = _BRACKET_PATTERN.sub("", x)
      for label in _KNOWN_LABELS:
          if label in bracket_text:
              # When several labels occur, the last one in _KNOWN_LABELS wins.
              label_data[x] = label
      return x
  # Read the entity-relation-entity triple file.
  with open("data/01test.doc", encoding="utf8") as f:
      for line in f:
          line = line.strip()
          if not line:
              continue  # a blank line would otherwise crash the unpacking below
          head, relation, tail = line.split("\t")
          head = get_label_then_clean(head, label_data)
          relation_data[head][relation] = tail

  # Read the entity-attribute-value triple file.
  with open("data/01triplets_enti_attr_value.doc", encoding="utf8") as f:
      for line in f:
          line = line.strip()
          if not line:
              continue
          entity, attribute, value = line.split("\t")
          entity = get_label_then_clean(entity, label_data)
          attribute_data[entity][attribute] = value

  # Build the Cypher script, one CREATE statement per line.
  # NOTE(review): names and values are interpolated verbatim, so quotes or
  # spaces inside them will produce invalid Cypher.
  statements = []
  in_graph_entity = set()
  for entity in attribute_data:
      # Every entity gets a NAME attribute.
      attribute_data[entity]["NAME"] = entity
      # Render all attributes of the entity as a Cypher map literal.
      text = "{" + ",".join(
          "%s:'%s'" % (attribute, value)
          for attribute, value in attribute_data[entity].items()
      ) + "}"
      if entity in label_data:
          # Entity with a label.
          statements.append("CREATE (%s:%s %s)" % (entity, label_data[entity], text))
      else:
          # Entity without a label.
          statements.append("CREATE (%s %s)" % (entity, text))
      in_graph_entity.add(entity)

  # Relation statements. `head` is bound by this loop; it must not be the
  # leftover value from the file-reading loop above.
  for head in relation_data:
      # An entity may only take part in relations and have no attributes;
      # give it a NAME so it can be recognised in the graph.
      if head not in in_graph_entity:
          statements.append("CREATE (%s {NAME:'%s'})" % (head, head))
          in_graph_entity.add(head)
      for relation, tail in relation_data[head].items():
          if tail not in in_graph_entity:
              statements.append("CREATE (%s {NAME:'%s'})" % (tail, tail))
              in_graph_entity.add(tail)
          statements.append("CREATE (%s)-[:%s]->(%s)" % (head, relation, tail))

  cypher = "".join(statement + "\n" for statement in statements)
  print(cypher)

  # Execute the graph-building script.
  graph.run(cypher)

  # Record which entities, attributes, relations and labels the graph holds.
  # All four keys are always present, even when a category is empty.
  data = {"entitys": set(), "relations": set(), "labels": set(), "attributes": set()}
  for head, relations in relation_data.items():
      data["entitys"].add(head)
      for relation, tail in relations.items():
          data["relations"].add(relation)
          data["entitys"].add(tail)
  for enti, label in label_data.items():
      data["entitys"].add(enti)
      data["labels"].add(label)
  for enti, attributes in attribute_data.items():
      data["entitys"].add(enti)
      data["attributes"].update(attributes)

  # Sorted lists keep the schema file stable between runs.
  data = {key: sorted(values) for key, values in data.items()}
  with open("data/kg_schema.json", "w", encoding="utf8") as f:
      json.dump(data, f, ensure_ascii=False, indent=2)
graph_qa_base_on_sentence_match.py
  1. """使用文本匹配方式进行知识图谱的应用"""
  2. import itertools,json
  3. import re
  4. import pandas
  5. from py2neo import Graph
  6. from collections import defaultdict
  class GraphQA:
      """Question answering over a Neo4j graph by matching question templates.

      A question is scanned for known entities, relations, labels and
      attributes. Every template whose slots can be filled is expanded into
      concrete (question, cypher, answer) triples, the expansions are ranked
      by similarity to the user's question, and the first Cypher query that
      returns data produces the answer.
      """

      def __init__(self):
          """Connect to the graph database and load the schema and templates."""
          self.graph = Graph("http://localhost:7474", auth=("neo4j", "Zmj123456!"))
          schema_path = "kg_schema.json"
          templet_path = "question_templet.xlsx"
          self.load(schema_path, templet_path)
          print('知识图谱问答系统加载完毕!\n=================')

      def load(self, schema_path, templet_path):
          """Load the graph schema and the question templates."""
          self.load_kg_schema(schema_path)
          self.load_question_templet(templet_path)

      def load_kg_schema(self, path):
          """Load the relation, entity, label and attribute vocabularies from JSON."""
          with open(path, encoding='utf8') as f:
              schema = json.load(f)
          self.relation_set = set(schema['relations'])
          self.entity_set = set(schema['entitys'])
          self.label_set = set(schema['labels'])
          self.attribute_set = set(schema['attributes'])

      def load_question_templet(self, templet_path):
          """Load the templates from an Excel sheet.

          Reads the columns "question", "cypher", "check" and "answer"; the
          "check" cell is parsed as JSON and later used as a mapping from slot
          placeholder to required count.
          """
          dataframe = pandas.read_excel(templet_path)
          self.question_templet = [
              [question, cypher, json.loads(cypher_check), answer]
              for question, cypher, cypher_check, answer in zip(
                  dataframe["question"],
                  dataframe["cypher"],
                  dataframe["check"],
                  dataframe["answer"],
              )
          ]

      @staticmethod
      def _find_mentions(vocabulary, sentence):
          """Return every occurrence of a vocabulary word in ``sentence``.

          Words are regex-escaped so special characters match literally, and
          longer words are tried first so a word is not shadowed by one of its
          own prefixes. An empty vocabulary yields no mentions (an empty
          pattern would otherwise match at every position).
          """
          words = sorted((word for word in vocabulary if word),
                         key=lambda word: (-len(word), word))
          if not words:
              return []
          pattern = "|".join(re.escape(word) for word in words)
          return re.findall(pattern, sentence)

      # Entities are found by vocabulary lookup here; an NER model could be
      # used instead.
      def get_mention_entitys(self, sentence):
          """Return the known entities mentioned in the sentence."""
          return self._find_mentions(self.entity_set, sentence)

      # Relations could also be detected with a text classification model.
      def get_mention_relations(self, sentence):
          """Return the known relations mentioned in the sentence."""
          return self._find_mentions(self.relation_set, sentence)

      def get_mention_attributes(self, sentence):
          """Return the known attributes mentioned in the sentence."""
          return self._find_mentions(self.attribute_set, sentence)

      def get_mention_labels(self, sentence):
          """Return the known labels mentioned in the sentence."""
          return self._find_mentions(self.label_set, sentence)

      def parse_sentence(self, sentence):
          """Extract slot candidates from the question.

          Returns:
              Dict mapping each slot placeholder ("%ENT%", "%REL%", "%LAB%",
              "%ATT%") to the list of mentions found for it.
          """
          return {
              "%ENT%": self.get_mention_entitys(sentence),
              "%REL%": self.get_mention_relations(sentence),
              "%LAB%": self.get_mention_labels(sentence),
              "%ATT%": self.get_mention_attributes(sentence),
          }

      def decode_value_combination(self, value_combination, cypher_check):
          """Assign the chosen values to placeholder keys.

          A slot needed once keeps its key (e.g. "%ENT%"); a slot needed several
          times is numbered from zero ("%ENT0%", "%ENT1%", ...).
          ``value_combination`` holds one tuple per slot, in ``cypher_check``
          order.
          """
          res = {}
          for index, (key, required_count) in enumerate(cypher_check.items()):
              if required_count == 1:
                  res[key] = value_combination[index][0]
              else:
                  for i in range(required_count):
                      key_num = key[:-1] + str(i) + "%"
                      res[key_num] = value_combination[index][i]
          return res

      # When more values were found than a template needs, every combination
      # is tried.
      # info: {"%ENT%": ["周杰伦", "方文山"], "%REL%": ["作曲"]}
      def get_combinations(self, cypher_check, info):
          """Return every placeholder -> value assignment that fills the template."""
          slot_values = [
              itertools.combinations(info[key], required_count)
              for key, required_count in cypher_check.items()
          ]
          return [
              self.decode_value_combination(value_combination, cypher_check)
              for value_combination in itertools.product(*slot_values)
          ]

      # string: "%ENT0%和%ENT1%是%REL%关系吗"
      # combination: {"%ENT0%": "word1", "%ENT1%": "word2"}
      def replace_token_in_string(self, string, combination):
          """Replace every placeholder key in ``string`` with its value.

          Values are converted with ``str`` because query results are not
          guaranteed to be strings.
          """
          for key, value in combination.items():
              string = string.replace(key, str(value))
          return string

      # info: {"%ENT%": ["周杰伦", "方文山"], "%REL%": ["作曲"]}
      def expand_templet(self, templet, cypher, cypher_check, info, answer):
          """Expand one template into [question, cypher, answer] triples.

          One triple is produced per value combination returned by
          ``get_combinations``.
          """
          templet_cypher_pair = []
          for combination in self.get_combinations(cypher_check, info):
              templet_cypher_pair.append([
                  self.replace_token_in_string(templet, combination),
                  self.replace_token_in_string(cypher, combination),
                  self.replace_token_in_string(answer, combination),
              ])
          return templet_cypher_pair

      # Backward-compatible alias for the original, misspelled method name.
      expend_templet = expand_templet

      # Example: "%ENT0%和%ENT1%是什么关系" needs two entities; a question
      # mentioning only one cannot match it, so the template is skipped early.
      def check_cypher_info_valid(self, info, cypher_check):
          """Return True when ``info`` has enough values for every slot."""
          return all(
              len(info.get(key, [])) >= required_count
              for key, required_count in cypher_check.items()
          )

      def expand_question_and_cypher(self, info):
          """Expand every fillable template into concrete candidate triples."""
          templet_cypher_pair = []
          for templet, cypher, cypher_check, answer in self.question_templet:
              if self.check_cypher_info_valid(info, cypher_check):
                  templet_cypher_pair += self.expand_templet(
                      templet, cypher, cypher_check, info, answer)
          return templet_cypher_pair

      # Any text-matching method could be plugged in here.
      def sentence_similarity_function(self, string1, string2):
          """Return the character-level Jaccard similarity of two strings.

          Two empty strings score 0.0 instead of dividing by zero.
          """
          print("计算 %s %s" % (string1, string2))
          chars1, chars2 = set(string1), set(string2)
          union = chars1 | chars2
          if not union:
              return 0.0
          return len(chars1 & chars2) / len(union)

      def cypher_match(self, sentence, info):
          """Rank the expanded templates by similarity to the question.

          Returns:
              List of [question, cypher, score, answer], best score first.
          """
          result = []
          for templet, cypher, answer in self.expand_question_and_cypher(info):
              score = self.sentence_similarity_function(sentence, templet)
              result.append([templet, cypher, score, answer])
          return sorted(result, reverse=True, key=lambda x: x[2])

      def parse_result(self, graph_search_result, answer, info):
          """Fill the answer template from the first row of the query result."""
          row = dict(graph_search_result[0])
          # A relation lookup returns an object rather than a plain value, so
          # it is reduced to the name of its first relationship type.
          if "REL" in row:
              row["REL"] = list(row["REL"].types())[0]
          return self.replace_token_in_string(answer, row)

      def query(self, sentence):  # sentence: "谁导演的不能说的秘密"
          """Answer a natural-language question.

          Returns:
              The filled-in answer string, or None when no candidate query
              returns any data.
          """
          info = self.parse_sentence(sentence)  # information extraction
          for templet, cypher, score, answer in self.cypher_match(sentence, info):
              graph_search_result = self.graph.run(cypher).data()
              # The best-scoring template may have no answer in the graph; in
              # that case the next one is tried, stopping at the first hit.
              if graph_search_result:
                  return self.parse_result(graph_search_result, answer, info)
          return None
  # Demo entry point: build the QA system and answer one sample question.
  if __name__ == "__main__":
      qa_system = GraphQA()
      print(qa_system.query("谁导演的不能说的秘密"))

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/350511
推荐阅读
相关标签
  

闽ICP备14008679号