Word2vec: a foundational work in NLP
Paper link: https://arxiv.org/pdf/1309.4168v1.pdf
Code:
```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Toy corpus
sentences = ["i like dog", "i like cat", "i like animal",
             "dog cat animal", "apple cat dog like", "cat like fish",
             "dog like meat", "i like apple", "i hate apple",
             "i like movie book music apple", "dog like bark", "dog friend cat"]

word_sequence = ' '.join(sentences).split()  # join all sentences with spaces, then split back into words
word_list = list(set(word_sequence))         # deduplicate with a set
word_dict = {w: i for i, w in enumerate(word_list)}  # word -> id

skip_grams = []  # training pairs: [center-word id, context-word id]
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]  # id of the current (center) word
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]  # ids of the two neighboring words

    for w in context:
        skip_grams.append([target, w])

embedding_size = 2
voc_size = len(word_list)
batch_size = 2


class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        # input->hidden and hidden->output weight matrices
        self.W = nn.Parameter(torch.rand(voc_size, embedding_size))
        self.WT = nn.Parameter(torch.rand(embedding_size, voc_size))

    def forward(self, x):
        hidden_layer = torch.matmul(x, self.W)              # (batch, embedding_size)
        output_layer = torch.matmul(hidden_layer, self.WT)  # (batch, voc_size) logits
        return output_layer


model = Word2Vec()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)


def random_batch(data, size):
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size, replace=False)

    for i in random_index:
        random_inputs.append(np.eye(voc_size)[data[i][0]])  # one-hot vector for the center word
        random_labels.append(data[i][1])                    # id of the context word

    return random_inputs, random_labels


# Training loop
for epoch in range(10000000):
    input_batch, target_batch = random_batch(skip_grams, batch_size)

    input_batch = torch.Tensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    optimizer.zero_grad()

    output = model(input_batch)

    loss = criterion(output, target_batch)

    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

    loss.backward()
    optimizer.step()

# Plot the learned 2-D word vectors
for i, label in enumerate(word_list):
    W, WT = model.parameters()
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
```

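As a point of comparison, here is a minimal sketch (not from the original post) of the same skip-gram softmax model written with `nn.Embedding`, so integer word ids can be looked up directly instead of building one-hot vectors. It assumes `voc_size`, `embedding_size`, `batch_size`, and `skip_grams` as defined in the listing above.

```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class SkipGramEmbedding(nn.Module):
    def __init__(self, voc_size, embedding_size):
        super().__init__()
        # center-word vectors (plays the role of W above)
        self.in_embed = nn.Embedding(voc_size, embedding_size)
        # scores over context words (plays the role of WT above)
        self.out_linear = nn.Linear(embedding_size, voc_size, bias=False)

    def forward(self, center_ids):
        hidden = self.in_embed(center_ids)   # (batch, embedding_size)
        return self.out_linear(hidden)       # (batch, voc_size) logits

model2 = SkipGramEmbedding(voc_size, embedding_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model2.parameters(), lr=1e-3)

for step in range(5000):
    idx = np.random.choice(len(skip_grams), batch_size, replace=False)
    centers = torch.LongTensor([skip_grams[i][0] for i in idx])
    contexts = torch.LongTensor([skip_grams[i][1] for i in idx])

    optimizer.zero_grad()
    loss = criterion(model2(centers), contexts)
    loss.backward()
    optimizer.step()
```

The lookup `self.in_embed(center_ids)` is mathematically the same as multiplying a one-hot vector by `W`, just without materializing the one-hot matrix.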
Code:
```python
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn

sentence = 'i like eating apples very much'

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedder = BertModel.from_pretrained('bert-base-cased', output_hidden_states=True)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    def forward(self, inputs):
        tokens = self.tokenizer.tokenize(inputs)                   # split the sentence into WordPiece tokens
        print(tokens)
        tokens_id = self.tokenizer.convert_tokens_to_ids(tokens)   # map tokens to vocabulary ids
        print(tokens_id)
        tokens_id_tensor = torch.tensor(tokens_id).unsqueeze(0)    # add a batch dimension: (1, seq_len)
        outputs = self.embedder(tokens_id_tensor)
        print(outputs[0])                                          # last hidden state: (1, seq_len, 768)
        return outputs[0]

model = Model()
results = model(sentence)
```

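The listing above prints one contextual vector per token. If a single fixed-size sentence vector is needed, one common option is to mean-pool the token vectors; the sketch below is an assumption on my part, not part of the original post, and uses the tokenizer's standard call so that `[CLS]`/`[SEP]` are added automatically.

```python
from transformers import BertModel, BertTokenizer
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
embedder = BertModel.from_pretrained('bert-base-cased')

sentence = 'i like eating apples very much'
inputs = tokenizer(sentence, return_tensors='pt')  # adds [CLS]/[SEP] and an attention mask
with torch.no_grad():
    outputs = embedder(**inputs)

last_hidden = outputs[0]                   # (1, seq_len, 768) per-token vectors
sentence_vector = last_hidden.mean(dim=1)  # (1, 768) mean-pooled sentence embedding
print(sentence_vector.shape)
```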