赞
踩
NL2SQL:自然语言转为SQL语句
bulid_graph.py
- """知识图谱"""
- #三元组:实体-关系-实体 实体-属性-属性值
-
- import re,json
- from py2neo import Graph
- from collections import defaultdict
-
- """读取三元组,并将数据写入neo4j"""
-
# Connect to the Neo4j graph database.
graph = Graph(
    "http://localhost:7474",
    auth=("neo4j", "Zmj123456!"),
)

# entity -> {attribute: value}, filled from the entity-attribute-value file.
attribute_data = defaultdict(dict)
# head entity -> {relation: tail entity}, filled from the entity-relation-entity file.
relation_data = defaultdict(dict)
# cleaned entity name -> label extracted from the bracketed suffix.
label_data = {}
-
# Some entities carry a bracketed suffix, e.g. "晴天(歌曲)"; the bracket content
# may name a label.  Both ASCII and full-width brackets are accepted.  The
# brackets must be matched literally: an unescaped "(.+)" is a capture group
# that matches the whole string and would erase the entity name.
_BRACKET_PATTERN = re.compile(r"[((].+[))]")
_KNOWN_LABELS = ("歌曲", "专辑", "电影", "电视剧")


def get_label_then_clean(x, label_data):
    """Strip the bracketed suffix from an entity name and record its label.

    Args:
        x: Raw entity name, optionally containing a bracketed part.
        label_data: Dict mapping cleaned entity name -> label; updated in
            place when the bracketed part contains one of the known labels.

    Returns:
        The entity name with the bracketed part removed.  Names without
        brackets are returned unchanged and ``label_data`` is not touched.
    """
    match = _BRACKET_PATTERN.search(x)
    if match is None:
        return x
    label_string = match.group()
    # Brackets are removed whether or not a label is recognised, because they
    # are special characters that would break the generated Cypher.
    x = _BRACKET_PATTERN.sub("", x)
    for label in _KNOWN_LABELS:
        if label in label_string:
            # If several labels occur, the last one in _KNOWN_LABELS wins.
            label_data[x] = label
    return x
-
-
# Read the entity-relation-entity triples (tab-separated, one per line).
with open("data/01test.doc", encoding="utf8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) would
            # otherwise fail the three-way unpack below.
            continue
        head, relation, tail = line.split('\t')
        head = get_label_then_clean(head, label_data)
        relation_data[head][relation] = tail

# Read the entity-attribute-value triples (tab-separated, one per line).
with open("data/01triplets_enti_attr_value.doc", encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        entity, attribute, value = line.split('\t')
        entity = get_label_then_clean(entity, label_data)
        attribute_data[entity][attribute] = value
-
# Build the Cypher script: one CREATE per entity node, then one CREATE per
# relation edge.  Statements are collected in a list and joined once.
# NOTE(review): names and values are interpolated verbatim, so a quote or other
# special character in the data will break the generated Cypher.
cypher_statements = []
in_graph_entity = set()
for entity in attribute_data:
    # Give every entity a NAME property.
    attribute_data[entity]["NAME"] = entity
    # Render all properties of the entity as a map literal: {k:'v',k2:'v2'}.
    properties = ",".join(
        "%s:'%s'" % (attribute, value)
        for attribute, value in attribute_data[entity].items()
    )
    text = "{" + properties + "}"
    if entity in label_data:
        # Entity with a label.
        label = label_data[entity]
        cypher_statements.append("CREATE (%s:%s %s)" % (entity, label, text))
    else:
        # Entity without a label.
        cypher_statements.append("CREATE (%s %s)" % (entity, text))
    in_graph_entity.add(entity)

# Relation statements.  Iterate over every head entity; the loop variable must
# be bound here, otherwise the stale `head` from file loading is reused.
for head in relation_data:
    # An entity may appear only in relations and have no attributes; create a
    # node with just a NAME so it can be recognised in the graph.
    if head not in in_graph_entity:
        cypher_statements.append("CREATE (%s {NAME:'%s'})" % (head, head))
        in_graph_entity.add(head)
    for relation, tail in relation_data[head].items():
        # Same for tail entities that have not been created yet.
        if tail not in in_graph_entity:
            cypher_statements.append("CREATE (%s {NAME:'%s'})" % (tail, tail))
            in_graph_entity.add(tail)

        # The relation edge itself.
        cypher_statements.append("CREATE (%s)-[:%s]->(%s)" % (head, relation, tail))

# Each statement is terminated by a newline, as before.
cypher = "".join(statement + "\n" for statement in cypher_statements)

print(cypher)

# Run the graph-building script.
graph.run(cypher)
-
# Record which entities, attributes, relations and labels exist in the graph,
# then dump them as the schema file.
data = defaultdict(set)

for head, outgoing in relation_data.items():
    data["entitys"].add(head)
    for relation, tail in outgoing.items():
        data["relations"].add(relation)
        data["entitys"].add(tail)

for enti, label in label_data.items():
    data["entitys"].add(enti)
    data['labels'].add(label)

for enti, attrs in attribute_data.items():
    # An entity is only recorded here when it has at least one attribute.
    if attrs:
        data['entitys'].add(enti)
        data['attributes'].update(attrs)

# Sets are not JSON-serialisable; convert each one to a list.
data = {key: list(values) for key, values in data.items()}
with open('data/kg_schema.json', 'w', encoding='utf8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
-
graph_qa_base_on_sentence_match.py
- """使用文本匹配方式进行知识图谱的应用"""
-
- import itertools,json
- import re
-
- import pandas
- from py2neo import Graph
- from collections import defaultdict
-
class GraphQA:
    """Template-matching question answering over a Neo4j knowledge graph.

    A question is scanned for known entities, relations, labels and
    attributes.  Every question template whose slot requirements can be
    satisfied is filled in, the filled templates are ranked by similarity to
    the question, and the associated Cypher queries are run in rank order
    until one returns a result.
    """

    def __init__(self):
        self.graph = Graph("http://localhost:7474", auth=("neo4j", "Zmj123456!"))
        schema_path = "kg_schema.json"
        templet_path = "question_templet.xlsx"
        self.load(schema_path, templet_path)
        print('知识图谱问答系统加载完毕!\n=================')

    def load(self, schema_path, templet_path):
        """Load the graph schema and the question templates."""
        self.load_kg_schema(schema_path)
        self.load_question_templet(templet_path)
        return

    def load_kg_schema(self, path):
        """Load the relation/entity/label/attribute vocabularies from JSON."""
        with open(path, encoding='utf8') as f:
            schema = json.load(f)
        self.relation_set = set(schema['relations'])
        self.entity_set = set(schema['entitys'])
        self.label_set = set(schema['labels'])
        self.attribute_set = set(schema['attributes'])
        return

    def load_question_templet(self, templet_path):
        """Load question templates from an Excel sheet.

        Reads the columns ``question``, ``cypher``, ``check`` (a JSON object
        mapping slot token -> required count) and ``answer``.
        """
        dataframe = pandas.read_excel(templet_path)
        self.question_templet = []
        rows = zip(
            dataframe["question"],
            dataframe["cypher"],
            dataframe["check"],
            dataframe["answer"],
        )
        for question, cypher, cypher_check, answer in rows:
            self.question_templet.append(
                [question, cypher, json.loads(cypher_check), answer]
            )
        return

    @staticmethod
    def _find_mentions(vocabulary, sentence):
        """Return every occurrence of a vocabulary word in ``sentence``.

        Words are regex-escaped so metacharacters in names are matched
        literally, and tried longest-first so a short word cannot shadow a
        longer one that contains it.  An empty vocabulary yields ``[]``
        instead of the empty-string matches an empty pattern would produce.
        """
        words = sorted((w for w in vocabulary if w), key=lambda w: (-len(w), w))
        if not words:
            return []
        pattern = "|".join(re.escape(w) for w in words)
        return re.findall(pattern, sentence)

    def get_mention_entitys(self, sentence):
        """Entities mentioned in the question (vocabulary lookup)."""
        return self._find_mentions(self.entity_set, sentence)

    def get_mention_relations(self, sentence):
        """Relations mentioned in the question (vocabulary lookup)."""
        return self._find_mentions(self.relation_set, sentence)

    def get_mention_attributes(self, sentence):
        """Attributes mentioned in the question (vocabulary lookup)."""
        return self._find_mentions(self.attribute_set, sentence)

    def get_mention_labels(self, sentence):
        """Labels mentioned in the question (vocabulary lookup)."""
        return self._find_mentions(self.label_set, sentence)

    def parse_sentence(self, sentence):
        """Extract slot candidates from the question.

        Returns:
            Dict mapping each slot token (``%ENT%``, ``%REL%``, ``%LAB%``,
            ``%ATT%``) to the list of matching words found in ``sentence``.
        """
        entitys = self.get_mention_entitys(sentence)
        relations = self.get_mention_relations(sentence)
        labels = self.get_mention_labels(sentence)
        attributes = self.get_mention_attributes(sentence)
        return {
            "%ENT%": entitys,
            "%REL%": relations,
            "%LAB%": labels,
            "%ATT%": attributes,
        }

    def decode_value_combination(self, value_combination, cypher_check):
        """Assign one chosen combination of values to concrete slot tokens.

        A slot required once keeps its token (``%ENT%``); a slot required
        ``k`` times is expanded to numbered tokens ``%ENT0%`` .. ``%ENT{k-1}%``.
        """
        res = {}
        for index, (key, required_count) in enumerate(cypher_check.items()):
            if required_count == 1:
                res[key] = value_combination[index][0]
            else:
                for i in range(required_count):
                    key_num = key[:-1] + str(i) + "%"
                    res[key_num] = value_combination[index][i]
        return res

    def get_combinations(self, cypher_check, info):
        """Enumerate every way of filling the template's slots from ``info``.

        Needed when more values were found than the template requires.
        Example ``info``: {"%ENT%": ["周杰伦", "方文山"], "%REL%": ["作曲"]}
        """
        slot_values = [
            itertools.combinations(info[key], required_count)
            for key, required_count in cypher_check.items()
        ]
        return [
            self.decode_value_combination(value_combination, cypher_check)
            for value_combination in itertools.product(*slot_values)
        ]

    def replace_token_in_string(self, string, combination):
        """Replace every key of ``combination`` in ``string`` with its value.

        Example: string "%ENT0%和%ENT1%是%REL%关系吗" with combination
        {"%ENT0%": "word1", "%ENT1%": "word2"}.  Values are converted with
        ``str`` so non-string graph results (numbers etc.) do not raise.
        """
        for key, value in combination.items():
            string = string.replace(key, str(value))
        return string

    def expand_templet(self, templet, cypher, cypher_check, info, answer):
        """Expand one template into ``[question, cypher, answer]`` triples,
        one per slot-value combination available in ``info``."""
        templet_cypher_pair = []
        for combination in self.get_combinations(cypher_check, info):
            replaced_templet = self.replace_token_in_string(templet, combination)
            replaced_cypher = self.replace_token_in_string(cypher, combination)
            replaced_answer = self.replace_token_in_string(answer, combination)
            templet_cypher_pair.append(
                [replaced_templet, replaced_cypher, replaced_answer]
            )
        return templet_cypher_pair

    # Backward-compatible alias for the original (misspelled) method name.
    expend_templet = expand_templet

    def check_cypher_info_valid(self, info, cypher_check):
        """Return True when ``info`` holds enough values to fill the template.

        E.g. a template needing two entities cannot match a question that
        mentions only one; skipping it early saves work.
        """
        for key, required_count in cypher_check.items():
            if len(info.get(key, [])) < required_count:
                return False
        return True

    def expand_question_and_cypher(self, info):
        """Expand every applicable template into candidate question texts."""
        templet_cypher_pair = []
        for templet, cypher, cypher_check, answer in self.question_templet:
            if self.check_cypher_info_valid(info, cypher_check):
                templet_cypher_pair += self.expand_templet(
                    templet, cypher, cypher_check, info, answer
                )
        return templet_cypher_pair

    def sentence_similarity_function(self, string1, string2):
        """Character-level Jaccard similarity of two strings (0.0 .. 1.0).

        Any text-matching method could be substituted here.  Two empty
        strings score 0.0 instead of dividing by zero.
        """
        print("计算 %s %s" % (string1, string2))
        chars1, chars2 = set(string1), set(string2)
        union = chars1 | chars2
        if not union:
            return 0.0
        return len(chars1 & chars2) / len(union)

    def cypher_match(self, sentence, info):
        """Rank the expanded templates by similarity to the question.

        Returns:
            List of ``[templet, cypher, score, answer]``, best score first.
        """
        result = []
        for templet, cypher, answer in self.expand_question_and_cypher(info):
            score = self.sentence_similarity_function(sentence, templet)
            result.append([templet, cypher, score, answer])
        result = sorted(result, reverse=True, key=lambda x: x[2])
        return result

    def parse_result(self, graph_search_result, answer, info):
        """Fill the answer template from the first row of the graph result."""
        graph_search_result = graph_search_result[0]
        # Relation lookups return an object rather than a plain value; reduce
        # it to the first relation type name (the row is updated in place).
        if "REL" in graph_search_result:
            graph_search_result['REL'] = list(graph_search_result["REL"].types())[0]
        answer = self.replace_token_in_string(answer, graph_search_result)
        return answer

    def query(self, sentence):
        """Answer a natural-language question, e.g. "谁导演的不能说的秘密".

        Returns:
            The filled-in answer string, or ``None`` when no template applies
            or no candidate query returns a result from the graph.
        """
        info = self.parse_sentence(sentence)  # information extraction
        templet_cypher_score = self.cypher_match(sentence, info)
        for templet, cypher, score, answer in templet_cypher_score:
            graph_search_result = self.graph.run(cypher).data()
            # The best-scoring template may have no answer in the graph; in
            # that case try the next one, and stop at the first hit.
            if graph_search_result:
                return self.parse_result(graph_search_result, answer, info)
        return None
-
-
if __name__ == "__main__":
    # Demo: build the QA system and ask one sample question.
    qa_system = GraphQA()
    print(qa_system.query("谁导演的不能说的秘密"))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。