- text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
- marked_text = "[CLS] " + text + " [SEP]"
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- tokenized_text = tokenizer.tokenize(marked_text)
- indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) #得到每个词在词表中的索引
- segments_ids = [1] * len(tokenized_text)
- tokens_tensor = torch.tensor([indexed_tokens]) .to(DEVICE)
- segments_tensors = torch.tensor([segments_ids]).to(DEVICE)
- model = BertModel.from_pretrained('bert-base-uncased',
- output_hidden_states = True)
- model.to(DEVICE)
- model.eval()
- with torch.no_grad():
- outputs = model(tokens_tensor, segments_tensors)
- hidden_states = outputs[2]
-
- token_embeddings = torch.stack(hidden_states, dim=0)
- token_embeddings.size()
- token_embeddings = torch.squeeze(token_embeddings, dim=1)
- token_embeddings.size()
- token_embeddings = token_embeddings.permute(1,0,2)#调换顺序
- token_embeddings.size()
-
- #词向量表示
- token_vecs_cat = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] #连接最后四层 [number_of_tokens, 3072]
- token_vecs_sum = [torch.sum(layer[-4:], 0) for layer in token_embeddings] #对最后四层求和 [number_of_tokens, 768]
-
- #句子向量表示
- token_vecs = hidden_states[-2][0]
- sentence_embedding = torch.mean(token_vecs, dim=0)#一个句子就是768维度
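As a quick sanity check (following the McCormick tutorial referenced at the end of this post), the contextual vectors of the three occurrences of "bank" can be compared: the two financial "bank"s should be closer to each other than to the "river bank" one. A minimal sketch over the `token_vecs_sum` list built above, assuming "bank" stays a single WordPiece (it is in the bert-base-uncased vocabulary); the variable names are only illustrative:

```python
# Positions of the three "bank" tokens in the tokenized sentence
bank_idxs = [i for i, tok in enumerate(tokenized_text) if tok == 'bank']
vault_bank, robber_bank, river_bank = (token_vecs_sum[i] for i in bank_idxs)

# Same surface form, different contexts -> different vectors
print(torch.cosine_similarity(vault_bank, robber_bank, dim=0).item())  # expected: higher
print(torch.cosine_similarity(vault_bank, river_bank, dim=0).item())   # expected: lower
```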
Improvement: handle words that the tokenizer splits apart. For a word broken into several WordPiece sub-tokens, build an approximate vector for the original word by averaging the embeddings of its sub-tokens:
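The idea itself fits in a few lines. Here is a minimal sketch with a made-up helper (`merge_subword_vectors`) and a hypothetical token list; the full pipeline below arrives at the same result with a reverse scan over the tokens:

```python
import numpy as np

def merge_subword_vectors(tokens, vectors):
    """Average the vectors of '##' continuation tokens into the preceding word's vector."""
    words, groups = [], []
    for tok, vec in zip(tokens, vectors):
        if tok.startswith('##') and groups:
            words[-1] += tok[2:]      # extend the current word's surface form
            groups[-1].append(vec)    # collect the sub-token vector
        else:
            words.append(tok)
            groups.append([vec])
    return words, [np.mean(g, axis=0) for g in groups]

# Hypothetical token list: 'mississippi' split into three WordPieces
toks = ['the', 'mis', '##siss', '##ippi', 'river']
vecs = [np.random.rand(768) for _ in toks]
words, word_vecs = merge_subword_vectors(toks, vecs)
print(words)               # ['the', 'mississippi', 'river']
print(word_vecs[1].shape)  # (768,)
```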
- text = "After stealing money from the bank vault the bank robber was seen fishing on the Mississippi river bank"
- #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- tokenized_text = tokenizer.tokenize(text)
- len(tokenized_text)
- #把分开的词进行合并
- text_words = text.split(' ')#list,放每一个words
- len(text_words)
-
- ##如果是以##开头的就将词进行合并,数一下有多少个词,对应将embedding对应进行求和平均?
- #后续再尝试进行处理,就是可以对应获得每个词的向量表示,而不是token,根据
- num_emb = []
- m = 1
- for i in range(len(tokenized_text)):
- if '##' in tokenized_text[i]:#合并前一个词和当前词
- m = m+1
- num_emb.append(m)
- else:
- m = 1
- num_emb.append(m) #仅需要当前一个emb
- #所以再在emb合并,即对由123,12这种的合成相应的embedding,倒序合成?获得emb,倒着排列,是1即取,是3取三个,就可以得到每个词的向量表示
-
-
-
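A small trace of the counting loop on the same hypothetical token list as above (not actual tokenizer output):

```python
toks = ['the', 'mis', '##siss', '##ippi', 'river']  # hypothetical WordPiece output
counts, m = [], 1
for t in toks:
    m = m + 1 if t.startswith('##') else 1
    counts.append(m)
print(counts)  # [1, 1, 2, 3, 1] -- the 3 marks the last sub-token of 'mississippi'
```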
```python
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)  # vocabulary index of every token
segments_ids = [1] * len(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens]).to(DEVICE)
segments_tensors = torch.tensor([segments_ids]).to(DEVICE)
# model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
# model.to(DEVICE)
# model.eval()
with torch.no_grad():  # several sentences could also be batched here
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    hidden_states = outputs[2]
    # hidden_states is a tuple of 13 layers: the embedding layer plus the 12 encoder layers

token_embeddings = torch.stack(hidden_states, dim=0)        # stack the per-layer outputs
token_embeddings.size()
token_embeddings = torch.squeeze(token_embeddings, dim=1)   # drop the batch dimension
token_embeddings.size()
token_embeddings = token_embeddings.permute(1, 0, 2)        # reorder to [num_tokens, layers, hidden]
token_embeddings.size()
```
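For reference, the shapes at each step for a single sentence (bert-base-uncased: hidden size 768, 12 encoder layers plus the embedding layer):

```python
# hidden_states            : tuple of 13 tensors, each of shape [1, num_tokens, 768]
# torch.stack(..., dim=0)  : [13, 1, num_tokens, 768]
# torch.squeeze(..., dim=1): [13, num_tokens, 768]
# permute(1, 0, 2)         : [num_tokens, 13, 768]
```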
```python
# Token-level representations
# token_vecs_cat = [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings]  # concatenate the last four layers -> [num_tokens, 3072]
token_vecs_sum = [torch.sum(layer[-4:], 0).cpu().detach().numpy() for layer in token_embeddings]  # sum the last four layers -> [num_tokens, 768]

# Reverse everything so that the first time a word is encountered,
# its num_emb value equals the total number of its sub-tokens.
token_vecs_sum = token_vecs_sum[::-1]
num_embr = num_emb[::-1]
text_words = text_words[::-1]

token_vecs_sum_words = []

m = 0
for i in num_embr:
    if i == 1:
        token_vecs_sum_words.append(token_vecs_sum[m])  # a whole word: take its embedding directly
        m = m + 1
    elif i == 0:
        m = m + 1  # sub-token already consumed by an earlier merge
    else:
        # merge i sub-token embeddings into one word embedding by averaging,
        # and mark the remaining i-1 positions as consumed
        token_vecs_sum_words.append(np.asarray(token_vecs_sum[m:m + i]).mean(axis=0))
        for j in range(i - 1):
            num_embr[m + 1 + j] = 0
        m = m + 1
```
```python
# Vector for each word
emb_dic = {}
for w, emb in zip(text_words, token_vecs_sum_words):
    emb_dic[w] = emb

# Sentence-level representation
token_vecs = hidden_states[-2][0]  # all token vectors from the second-to-last hidden layer
sentence_embedding = torch.mean(token_vecs, dim=0)  # averaged into a single 768-dimensional sentence vector
```
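A small usage sketch of the results:

```python
print(emb_dic['robber'].shape)   # (768,) -- contextual vector for the word "robber"
print(sentence_embedding.shape)  # torch.Size([768])

# emb_dic is keyed by surface form, so the three occurrences of "bank" collapse
# into a single entry; use token_vecs_sum_words (aligned with the reversed
# text_words list) when a separate vector per occurrence is needed.
```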
Reference: 1. BERT Word Embeddings Tutorial · Chris McCormick