赞
踩
LTP提供了一系列中文自然语言处理工具,用户可以使用这些工具对于中文文本进行分词、词性标注、句法分析等等工作。
ltp的官方文档里演示了分词,句法分析,语义依存关系提取等简单demo。本文在此基础上,将提取出的语义依存关系构建出知识图谱,使用的是neo4j平台。同时本文也会演示怎么使用python在neo4j上创建图谱。neo4j的安装比较简单,请自行查阅。
用ltp创建知识图谱至少需要3个信息:分词结果(seg)、词性标注结果(pos)和语义依存关系(sdp)。
本文只是简单演示,分析的句子是:他叫汤姆去拿外衣。
你也可以随意替换它。
from ltp import LTP


def ltp_data(sentence: str = "他叫汤姆去拿外衣。"):
    """Analyse one sentence with LTP and return its semantic dependency graph.

    Args:
        sentence: The Chinese sentence to analyse. Defaults to the example
            sentence used throughout this tutorial.

    Returns:
        A ``(sdp, pos, seg)`` tuple: the semantic dependency graph, the
        part-of-speech tags and the segmented words, each as returned by LTP
        for a one-sentence batch.
    """
    ltp = LTP()
    # Word segmentation; ``hidden`` is passed to every later analysis step.
    seg, hidden = ltp.seg([sentence])
    # Part-of-speech tagging.
    pos = ltp.pos(hidden)
    # Named-entity recognition. Iterate over all entities so a sentence
    # without any entity no longer raises IndexError.
    ner = ltp.ner(hidden)
    for tag, start, end in ner[0]:
        print(tag, ":", "".join(seg[0][start:end + 1]))
    # Semantic dependency parsing (graph mode).
    # NOTE: the unused semantic-role-labelling and syntactic-dependency calls
    # were dropped; their results were never read.
    sdp = ltp.sdp(hidden, mode='graph')
    return sdp, pos, seg
这里我们看一下返回的结果:
if __name__ == '__main__':
    # Run the pipeline on the example sentence and show each returned value.
    sdp, pos, seg = ltp_data()
    print(sdp)
    print(pos)
    print(seg)
out:
[[(1, 2, 'AGT'), (2, 0, 'Root'), (3, 2, 'DATV'), (3, 4, 'AGT'), (3, 5, 'AGT'), (4, 2, 'eSUCC'), (5, 2, 'eSUCC'), (5, 4, 'eSUCC'), (6, 5, 'PAT'), (7, 2, 'mPUNC')]]
[['r', 'v', 'nh', 'v', 'v', 'n', 'wp']]
[['他', '叫', '汤姆', '去', '拿', '外衣', '。']]
标注和关系的具体含义参考ltp附录。
整理上一步返回的结果,从里面提取出节点和关系。
提取节点:
def node_extraction(seg, pos):
    """Return the node names and node types of the first sentence.

    Args:
        seg: Segmentation result; ``seg[0]`` holds the words of the first
            sentence.
        pos: Part-of-speech result; ``pos[0]`` holds the tags of the first
            sentence.

    Returns:
        A ``(names, types)`` pair of newly built lists of ``str``. The
        caller's ``seg`` and ``pos`` are left untouched.
    """
    names = [str(word) for word in seg[0]]
    types = [str(tag) for tag in pos[0]]
    return names, types
提取关系时需要用到创建的节点,因此用到了nodes这个参数,它是在后面创建节点函数那里生成的。
提取关系
def relation_extraction(sdp, nodes):
    """Convert the dependency edges of the first sentence into triples.

    Args:
        sdp: Semantic dependency result; ``sdp[0]`` is a sequence of
            ``(index1, index2, relation)`` edges with 1-based word indices.
        nodes: Node objects, ordered like the words of the sentence.

    Returns:
        A list of ``[node1, node2, relation]`` triples, one per edge whose
        endpoints both refer to a real word.
    """
    rel = []
    for edge in sdp[0]:
        first = int(edge[0])
        second = int(edge[1])
        # Index 0 denotes the virtual Root, which has no node of its own.
        # Without this guard ``nodes[0 - 1]`` silently picks the LAST node and
        # creates a bogus relation to the final token.
        if first < 1 or second < 1:
            continue
        rel.append([nodes[first - 1], nodes[second - 1], str(edge[2])])
    return rel
这一步是创建知识图谱,需要先连接上neo4j。建立连接时,第一个参数是用cmd启动neo4j时生成的网址(http://localhost:7474),第二个参数是用户名,第三个参数是密码。
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data


# py2neo documentation: https://py2neo.org/v4/index.htm
class DataToNeo4j(object):
    """Store nodes and relationships in a Neo4j graph."""

    def __init__(self):
        """Connect to Neo4j and clear the graph.

        The address, user name and password below are placeholders and must
        be replaced with real values before running.
        """
        # NOTE(review): the ``username=`` keyword depends on the installed
        # py2neo version - confirm against its Graph documentation.
        self.graph = Graph("your localhost", username="your username", password="your password")
        # WARNING: ``delete_all`` is called on every construction.
        self.graph.delete_all()

    def create_node(self, name_node, type_node):
        """Create one node per name and return the created nodes.

        Args:
            name_node: Node names; each is stored as the ``name`` property.
            type_node: Node labels, parallel to ``name_node``.

        Returns:
            The created ``Node`` objects, in input order.
        """
        nodes = []
        for name, label in zip(name_node, type_node):
            node = Node(label, name=name)
            self.graph.create(node)
            nodes.append(node)
        print('节点创建成功')
        return nodes

    def create_relation(self, rel):
        """Create one relationship per ``(node1, node2, relation)`` triple.

        Args:
            rel: Iterable of triples; ``node1`` is the start node, ``node2``
                the end node and ``relation`` the relationship type.
        """
        for start, end, relation in rel:
            try:
                # The relationship type must be a string.
                r = Relationship(start, str(relation), end)
                self.graph.create(r)
            except AttributeError as e:
                # Best effort: report the failing triple and keep going.
                print(e)
        print('关系创建成功')
测试运行
if __name__ == '__main__':
    sdp, pos, seg = ltp_data()
    create_data = DataToNeo4j()
    # Create the nodes.
    node_name, node_type = node_extraction(seg, pos)
    nodes = create_data.create_node(node_name, node_type)
    # Create the relationships.
    rel = relation_extraction(sdp, nodes)
    create_data.create_relation(rel)
效果
ltp_data.py
from ltp import LTP


def ltp_data(sentence: str = "他叫汤姆去拿外衣。"):
    """Analyse one sentence with LTP and return its semantic dependency graph.

    Args:
        sentence: The Chinese sentence to analyse. Defaults to the example
            sentence used throughout this tutorial.

    Returns:
        A ``(sdp, pos, seg)`` tuple: the semantic dependency graph, the
        part-of-speech tags and the segmented words, each as returned by LTP
        for a one-sentence batch.
    """
    ltp = LTP()
    # Word segmentation; ``hidden`` is passed to every later analysis step.
    seg, hidden = ltp.seg([sentence])
    # Part-of-speech tagging.
    pos = ltp.pos(hidden)
    # Semantic dependency parsing (graph mode).
    # NOTE: the named-entity, semantic-role and syntactic-dependency calls
    # were dropped; their results were never read.
    sdp = ltp.sdp(hidden, mode='graph')
    return sdp, pos, seg


if __name__ == '__main__':
    sdp, pos, seg = ltp_data()
    print(sdp)
    print(pos)
    print(seg)
neo4j.py
# -*- coding: utf-8 -*-
from py2neo import Node, Graph, Relationship
from ltp_data import ltp_data


# py2neo documentation: https://py2neo.org/v4/index.htm
class DataToNeo4j(object):
    """Store nodes and relationships in a Neo4j graph."""

    def __init__(self):
        """Connect to Neo4j and clear the graph.

        The address, user name and password below are placeholders and must
        be replaced with real values before running.
        """
        # NOTE(review): the ``username=`` keyword depends on the installed
        # py2neo version - confirm against its Graph documentation.
        self.graph = Graph("your localhost", username="your username", password="your password")
        # WARNING: ``delete_all`` is called on every construction.
        self.graph.delete_all()

    def create_node(self, name_node, type_node):
        """Create one node per name and return the created nodes.

        Args:
            name_node: Node names; each is stored as the ``name`` property.
            type_node: Node labels, parallel to ``name_node``.

        Returns:
            The created ``Node`` objects, in input order.
        """
        nodes = []
        for name, label in zip(name_node, type_node):
            node = Node(label, name=name)
            self.graph.create(node)
            nodes.append(node)
        print('节点创建成功')
        return nodes

    def create_relation(self, rel):
        """Create one relationship per ``(node1, node2, relation)`` triple.

        Args:
            rel: Iterable of triples; ``node1`` is the start node, ``node2``
                the end node and ``relation`` the relationship type.
        """
        for start, end, relation in rel:
            try:
                # The relationship type must be a string.
                r = Relationship(start, str(relation), end)
                self.graph.create(r)
            except AttributeError as e:
                # Best effort: report the failing triple and keep going.
                print(e)
        print('关系创建成功')


def node_extraction(seg, pos):
    """Return the node names and node types of the first sentence.

    Args:
        seg: Segmentation result; ``seg[0]`` holds the words of the first
            sentence.
        pos: Part-of-speech result; ``pos[0]`` holds the tags of the first
            sentence.

    Returns:
        A ``(names, types)`` pair of newly built lists of ``str``. The
        caller's ``seg`` and ``pos`` are left untouched.
    """
    names = [str(word) for word in seg[0]]
    types = [str(tag) for tag in pos[0]]
    return names, types


def relation_extraction(sdp, nodes):
    """Convert the dependency edges of the first sentence into triples.

    Args:
        sdp: Semantic dependency result; ``sdp[0]`` is a sequence of
            ``(index1, index2, relation)`` edges with 1-based word indices.
        nodes: Node objects, ordered like the words of the sentence.

    Returns:
        A list of ``[node1, node2, relation]`` triples, one per edge whose
        endpoints both refer to a real word.
    """
    rel = []
    for edge in sdp[0]:
        first = int(edge[0])
        second = int(edge[1])
        # Index 0 denotes the virtual Root, which has no node of its own.
        # Without this guard ``nodes[0 - 1]`` silently picks the LAST node and
        # creates a bogus relation to the final token.
        if first < 1 or second < 1:
            continue
        rel.append([nodes[first - 1], nodes[second - 1], str(edge[2])])
    return rel


if __name__ == '__main__':
    sdp, pos, seg = ltp_data()
    create_data = DataToNeo4j()
    # Create the nodes.
    node_name, node_type = node_extraction(seg, pos)
    nodes = create_data.create_node(node_name, node_type)
    # Create the relationships.
    rel = relation_extraction(sdp, nodes)
    create_data.create_relation(rel)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。