赞
踩
目录
使用GCN构建网络的时候,会传入last_hidden_state 和adjacency_matrix。 所谓adjacency_matrix一般是指使用工具包,获取依存关系解析结果。依据解析结果构建0-1矩阵,若两者之间存在依存关系,则对应元素为1, 不存在依存关系则为0 ,特别的,主对角线全部为1,且是无向图。具体代码实现,参考另外一篇博客:Click Here
现在,想要构建全新的依存关系矩阵,与前面不同的是,不再构建单一的0-1矩阵,而是依据不同的实际关系类型,赋予(w_i,w_j)相对应位置不同的数字。
补充说明:
- import numpy as np
- import spacy
- import pickle
- from transformers import BertTokenizerFast, BertTokenizer, AutoTokenizer
- from spacy.tokens import Doc
-
- from pdb import set_trace as stop
-
# Function: build adjacency matrices for an English dataset. Comparative
# sentences are graphed from dependency relations; non-comparative sentences
# get a plain identity matrix of tokenized length.

# Map every dependency label produced by spaCy's parser to a distinct integer.
# Special ids: 0 = no relation ('empty'), 1 = self-loop ('self', the main
# diagonal). Real dependency labels are numbered from 2 upward.
parser_dict = {'empty': 0, 'self': 1}
nlp = spacy.load('en_core_web_sm')
parser_tuple = nlp.get_pipe("parser").labels
# enumerate(start=2) replaces the manual `count` counter; the dead bare
# `parser_dict` expression (a notebook artifact) is dropped.
for count, parser in enumerate(parser_tuple, start=2):
    parser_dict[parser] = count
-
-
class WhitespaceTokenizer(object):
    """Whitespace-only tokenizer for spaCy.

    Splitting purely on whitespace keeps spaCy tokens aligned 1:1 with
    ``text.split()``, which the matrix-building code relies on.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split()
        # Every token is treated as owning one trailing space character.
        return Doc(self.vocab, words=words, spaces=[True] * len(words))
-
# Rebind `nlp` with a whitespace tokenizer so spaCy tokens align with
# text.split(); use zh_core_web_sm instead for Chinese.
nlp = spacy.load('en_core_web_sm') # zh_core_web_sm or en_core_web_sm
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
# NOTE(review): hard-coded local model path — swap for bert-base-chinese when
# processing Chinese data (see original inline note).
tokenizer = BertTokenizerFast.from_pretrained("/home/qtxu/PLM/bert-base-uncased") # /home/qtxu/PLM/bert-base-chinese or bert-base-uncased
-
-
- # 构建矩阵
def dependency_adj_matrix(text):
    """Build a labelled dependency-adjacency matrix over BERT sub-tokens.

    Each cell (i, j) holds the ``parser_dict`` id of the dependency relation
    linking the words that sub-tokens i and j belong to (0 if unrelated);
    the main diagonal is 1 (self). The relation id is looked up symmetrically
    in both directions, so the matrix describes an undirected labelled graph.

    Args:
        text: whitespace-pre-tokenized sentence (words separated by spaces).

    Returns:
        float32 numpy array of shape (num_subtokens, num_subtokens).

    Raises:
        AssertionError: if spaCy and BERT tokenizations fall out of alignment.
        KeyError: if a dependency label is missing from ``parser_dict``.
    """
    tokens = nlp(text)
    tokenized = tokenizer(text.split(" "), is_split_into_words=True, add_special_tokens=False)
    word_ids = tokenized.word_ids()  # sub-token index -> word index
    words = text.split()
    # dtype= avoids the extra allocation of zeros(...).astype('float32').
    matrix1 = np.zeros((len(word_ids), len(word_ids)), dtype='float32')
    # Whitespace tokenization must keep spaCy words and BERT words aligned.
    assert len(words) == len(list(tokens))
    assert (len(tokens) - 1) == max(word_ids)

    for i, wi in enumerate(word_ids):  # wi: word index of sub-token i (was `idx`)
        matrix1[i][i] = 1  # self-loop on the diagonal
        for j, wj in enumerate(word_ids):  # wj: word index of sub-token j (was builtin-shadowing `id`)
            if i == j:
                continue
            # is_ancestor guarantees the descendant lies in the ancestor's
            # subtree, so the original subtree scan is redundant: the relation
            # is simply the descendant token's own .dep_ label.
            if tokens[wj].is_ancestor(tokens[wi]):
                relation_type = tokens[wi].dep_
                matrix1[i][j] = parser_dict[relation_type]
                print(f" **{words[i]} ({tokens[wi]}) --{relation_type}--> {words[j]} ({tokens[wj]})")
            elif tokens[wi].is_ancestor(tokens[wj]):
                relation_type = tokens[wj].dep_
                matrix1[i][j] = parser_dict[relation_type]
                print(f" ##{words[j]} ({tokens[wj]}) --{relation_type}--> {words[i]} ({tokens[wi]})")
            # else: cell stays 0 — matrix1 is already zero-initialized.

    return matrix1
-
# Sanity-check example: print the relations and build the matrix for a
# short whitespace-pre-tokenized sentence.
text = "this is a text sentence ."
dependency_adj_matrix(text)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。