Learned from: 蓝思诺特
https://www.bilibili.com/video/BV1Ky4y1g7Nk?p=6
Drawbacks of one-hot encoding
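As a minimal illustration (my own example, not part of the original tutorial): a one-hot vector is as long as the vocabulary, almost entirely zero, and any two distinct words are orthogonal, so the encoding carries no information about word similarity.

import torch
import torch.nn.functional as F

# Toy vocabulary of 3 words; each row of the identity matrix is one word's one-hot vector.
vocab = ['cat', 'dog', 'car']
one_hot = torch.eye(len(vocab))

# Any two distinct one-hot vectors are orthogonal: similarity is always 0,
# so "cat" is no closer to "dog" than it is to "car".
print(F.cosine_similarity(one_hot[0], one_hot[1], dim=0))  # tensor(0.)
print(F.cosine_similarity(one_hot[0], one_hot[2], dim=0))  # tensor(0.)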
Word vectorization: project every word into an n-dimensional space.
Two mainstream, simple ways to train word vectors: CBOW and Skip-gram.
CBOW: use the two words before and the two words after a position to predict the word in the middle.

# Each row can be thought of as one document
docs = [
    # digits
    ['5', '2', '4', '8', '6', '2', '3', '6', '4'],
    ['4', '8', '5', '6', '9', '5', '5', '6'],
    ['1', '1', '5', '2', '3', '3', '8'],
    ['3', '6', '9', '6', '8', '7', '4', '6', '3'],
    ['8', '9', '9', '6', '1', '4', '3', '4'],
    ['1', '0', '2', '0', '2', '1', '3', '3', '3', '3', '3'],
    ['9', '3', '3', '0', '1', '4', '7', '8'],
    ['9', '9', '8', '5', '6', '7', '1', '2', '3', '0', '1', '0'],
    # letters, with a few digits mixed in
    ['a', 't', 'g', 'q', 'e', 'h', '9', 'u', 'f'],
    ['e', 'q', 'y', 'u', 'o', 'i', 'p', 's'],
    ['q', 'o', '9', 'p', 'l', 'k', 'j', 'o', 'k', 'k', 'o', 'p'],
    ['h', 'g', 'y', 'i', 'u', 't', 't', 'a', 'e', 'q'],
    ['i', 'k', 'd', 'q', 'r', 'e', '9', 'e', 'a', 'd'],
    ['o', 'p', 'd', 'g', '9', 's', 'a', 'f', 'g', 'a'],
    ['i', 'u', 'y', 'g', 'h', 'k', 'l', 'a', 's', 'w'],
    ['o', 'l', 'u', 'y', 'a', 'o', 'g', 'f', 's'],
    ['o', 'p', 'i', 'u', 'y', 'g', 'd', 'a', 's', 'j', 'd', 'l'],
    ['u', 'k', 'i', 'l', 'o', '9', 'l', 'j', 's'],
    ['y', 'g', 'i', 's', 'h', 'k', 'j', 'l', 'f', 'r', 'f'],
    ['i', 'o', 'h', '9', 'n', '9', 'd', '9', 'f', 'a', '9'],
]

# %%
# build the vocabulary: word -> index
zidian = {}
for doc in docs:
    for word in doc:
        if word not in zidian:
            zidian[word] = len(zidian)

zidian['0'], len(zidian)  # (9, 30)

# %%
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# define the dataset
class DocDataset(Dataset):
    def __init__(self):
        xs = []
        ys = []
        for doc in docs:
            # encode the sentence into indices using the vocabulary
            doc_encode = [zidian[word] for word in doc]
            # slide a 5-word window over the sentence
            for i in range(0, len(doc) - 4):
                # x: the two words before and the two words after the centre of the window
                xs.append([
                    doc_encode[i + 0], doc_encode[i + 1],
                    doc_encode[i + 3], doc_encode[i + 4]
                ])
                # y: the middle word of the window
                ys.append(doc_encode[i + 2])

        self.xs = torch.LongTensor(xs)
        self.ys = torch.LongTensor(ys)

    def __getitem__(self, i):
        return self.xs[i], self.ys[i]

    def __len__(self):
        return len(self.xs)

len(DocDataset())  # 113

# %%
# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=DocDataset(),
                            batch_size=4,
                            shuffle=True,
                            drop_last=True)
    return dataloader

for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[0], sample[0].shape, sample[1], sample[1].shape

# %%
class CBOW(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(30, 2)  # 30 words, each mapped to a 2-D vector
        self.embed.weight.data.normal_(0, 0.1)  # initialise weights from a normal distribution, mean 0, std 0.1
        self.fc = torch.nn.Linear(2, 30)        # linear output layer so the loss can be computed

    def forward(self, x):
        # [b,4] -> [b,4,2]  b sentences, 4 context words, each a 2-D vector
        x = self.embed(x)
        # [b,4,2] -> [b,2]  average the four context words into one vector
        x = torch.mean(x, dim=1)
        # [b,2] -> [b,30]
        x = self.fc(x)
        return x

model = CBOW()
out = model(sample[0])
out, out.shape

criteon = torch.nn.CrossEntropyLoss()
optim = torch.optim.SGD(model.parameters(), lr=1e-2)
model.train()

for epoch in range(1000):
    for i, data in enumerate(get_dataloader()):
        x, y = data
        optim.zero_grad()
        # [b,4] -> [b,30]
        out = model(x)
        loss = criteon(out, y)
        loss.backward()
        optim.step()
    if epoch % 200 == 0:
        print(epoch, loss)

# colour the words: digits in red, letters in blue
colors = []
idxs = []
for word, idx in zidian.items():
    idxs.append(idx)
    if word in '1234567890':
        colors.append('red')
        continue
    colors.append('blue')

# [30] -> [30,2]  project all 30 words into 2-D
embed = model.embed(torch.LongTensor(idxs)).detach().numpy()

from matplotlib import pyplot as plt
plt.scatter(embed[:, 0], embed[:, 1], c=colors)
plt.show()
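A quick sanity check that could be added here (my own sketch, not from the tutorial): look up each word's nearest neighbour in the learned 2-D embedding space. After training, digits should mostly have digit neighbours and letters letter neighbours, mirroring the red/blue clusters in the scatter plot.

import numpy as np

words = list(zidian.keys())
vecs = model.embed(torch.LongTensor([zidian[w] for w in words])).detach().numpy()

def nearest(word):
    # Euclidean nearest neighbour in the 2-D embedding space, excluding the word itself.
    dist = np.linalg.norm(vecs - vecs[words.index(word)], axis=1)
    dist[words.index(word)] = np.inf
    return words[int(dist.argmin())]

print(nearest('5'), nearest('a'))  # expected: a digit and a letter, respectively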
Skip-gram: use the current word to predict the previous word, the next word, and the words two positions before and after it.
# %%
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# define the dataset
class DocDataset(Dataset):
    def __init__(self):
        xs = []
        ys = []
        for doc in docs:
            # every word of every sentence becomes an x
            for i in range(len(doc)):
                # each neighbour at offsets -2, -1, +1, +2 becomes a y
                for j in [-2, -1, 1, 2]:
                    # skip offsets that fall outside the sentence
                    if i + j < 0 or i + j >= len(doc):
                        continue
                    xs.append(zidian[doc[i]])
                    ys.append(zidian[doc[i + j]])

        self.xs = torch.LongTensor(xs)
        self.ys = torch.LongTensor(ys)

    def __getitem__(self, i):
        return self.xs[i], self.ys[i]

    def __len__(self):
        return len(self.xs)

len(DocDataset())

# %%
# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=DocDataset(),
                            batch_size=4,
                            shuffle=True,
                            drop_last=True)
    return dataloader

for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[0], sample[0].shape, sample[1], sample[1].shape

class SkipGram(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(30, 2)
        self.embed.weight.data.normal_(0, 0.1)
        self.fc = torch.nn.Linear(2, 30)

    def forward(self, x):
        # [b] -> [b,2]
        x = self.embed(x)
        # [b,2] -> [b,30]
        x = self.fc(x)
        return x

model = SkipGram()
out = model(sample[0])
out[:2], out.shape

criteon = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
model.train()

for epoch in range(200):
    for i, data in enumerate(get_dataloader()):
        x, y = data
        optim.zero_grad()
        # [b] -> [b,30]
        out = model(x)
        loss = criteon(out, y)
        loss.backward()
        optim.step()
    if epoch % 20 == 0:
        print(epoch, loss)

# %%
# colour the words: digits in red, letters in blue
colors = []
idxs = []
for word, idx in zidian.items():
    idxs.append(idx)
    if word in '1234567890':
        colors.append('red')
        continue
    colors.append('blue')

embed = model.embed(torch.LongTensor(idxs))
embed = embed.detach().numpy()

from matplotlib import pyplot as plt
plt.scatter(embed[:, 0], embed[:, 1], c=colors)
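As with CBOW, a small sanity check one might run here (my own addition, not from the tutorial): compare cosine similarities between learned vectors. If Skip-gram has separated the two "topics", a digit-digit pair should usually score higher than a digit-letter pair.

import torch.nn.functional as F

def vec(word):
    # look up the trained 2-D vector for a single word
    return model.embed(torch.LongTensor([zidian[word]]))

print(F.cosine_similarity(vec('5'), vec('8')).item())  # typically close to 1
print(F.cosine_similarity(vec('5'), vec('a')).item())  # typically much lower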
ELMo: train a two-layer bidirectional LSTM language model on the MSR paraphrase corpus, then use its hidden states as contextual word vectors.

# %%
# load the vocabulary: word -> index
zidian = {}
with open('./data/msr_paraphrase/zidian.txt') as fr:
    for line in fr.readlines():
        k, v = line.split(' ')
        zidian[k] = int(v)

zidian['<PAD>'], len(zidian)

# %%
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# define the dataset
class MsrDataset(Dataset):
    def __init__(self):
        self.data = pd.read_csv('./data/msr_paraphrase/数字化数据.txt', nrows=2000)

    def __getitem__(self, i):
        return self.data.iloc[i]

    def __len__(self):
        return len(self.data)

len(MsrDataset())

# %%
def to_tensor(data):
    b = len(data)
    # 2*b sentences (each paraphrase pair contributes two), 30 words each
    xs = np.zeros((b * 2, 30))
    for i in range(b):
        same, s1, s2 = data[i]
        # add <SOS>/<EOS> markers and pad with <PAD> to a fixed length of 30
        s1 = [zidian['<SOS>']] + s1.split(',')[:28] + [zidian['<EOS>']] + [zidian['<PAD>']] * 28
        xs[i] = s1[:30]
        s2 = [zidian['<SOS>']] + s2.split(',')[:28] + [zidian['<EOS>']] + [zidian['<PAD>']] * 28
        xs[b + i] = s2[:30]
    return torch.LongTensor(xs)

# data loader
def get_dataloader():
    dataloader = DataLoader(dataset=MsrDataset(),
                            batch_size=8,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=to_tensor)
    return dataloader

for i, data in enumerate(get_dataloader()):
    sample = data
    break

sample[:5], sample.shape

# %%
class ForwardBackward(nn.Module):
    def __init__(self, flip):
        super().__init__()
        self.rnn1 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)
        self.rnn2 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True)
        self.fc = nn.Linear(in_features=256, out_features=4300)
        self.flip = flip

    def forward(self, x):
        b = x.shape[0]
        # initialise the LSTM memory
        h = torch.zeros(1, b, 256)
        c = torch.zeros(1, b, 256)

        # for the backward direction, reverse x along the time axis, e.g.
        # [[1,2,3],      [[3,2,1],
        #  [4,5,6]]  ->   [6,5,4]]
        if self.flip:
            x = torch.flip(x, dims=(1, ))

        # run the two LSTM layers; the dimensions stay the same
        # [16,29,256] -> [16,29,256]
        out1, (h, c) = self.rnn1(x, (h, c))
        out2, (h, c) = self.rnn2(out1, (h, c))

        # outputs computed from the reversed x are also reversed; flip them back
        if self.flip:
            x = torch.flip(x, dims=(1, ))
            out1 = torch.flip(out1, dims=(1, ))
            out2 = torch.flip(out2, dims=(1, ))

        # linear output layer
        # [16,29,256] -> [16,29,4300]
        out3 = self.fc(out2)
        return x, out1, out2, out3

x = torch.FloatTensor(16, 29, 256)
out = ForwardBackward(flip=True)(x)
len(out), out[-1].shape

# %%
class ELMo(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=4300, embedding_dim=256, padding_idx=0)
        self.fw = ForwardBackward(flip=False)
        self.bw = ForwardBackward(flip=True)

    def forward(self, x):
        # embed the words
        # [16,30] -> [16,30,256]
        x = self.embed(x)
        # forward prediction: each word predicts the next one, so the last word is not needed
        outs_f = self.fw(x[:, :-1, :])
        # backward prediction: each word predicts the previous one, so the first word is not needed
        outs_b = self.bw(x[:, 1:, :])
        return outs_f, outs_b

out = ELMo()(sample)
len(out), len(out[0]), out[0][-1].shape

# %%
model = ELMo()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.CrossEntropyLoss()

for epoch in range(1):
    for i, x in enumerate(get_dataloader()):
        # x = [b,30]
        opt.zero_grad()
        outs_f, outs_b = model(x)

        # only the linear outputs are needed for the loss
        # [b,29,4300]
        outs_f = outs_f[-1]
        outs_b = outs_b[-1]

        # the forward model predicts the next word, so its targets drop the first word
        # [b,30] -> [b,29]
        x_f = x[:, 1:]
        # the backward model predicts the previous word, so its targets drop the last word
        # [b,30] -> [b,29]
        x_b = x[:, :-1]

        # flatten so the cross-entropy loss can be computed
        # [b,29,4300] -> [b*29,4300]
        outs_f = outs_f.reshape(-1, 4300)
        outs_b = outs_b.reshape(-1, 4300)
        # [b,29] -> [b*29]
        x_f = x_f.reshape(-1)
        x_b = x_b.reshape(-1)

        # compute the forward and backward losses separately and average them
        loss_f = loss_func(outs_f, x_f)
        loss_b = loss_func(outs_b, x_b)
        loss = (loss_f + loss_b) / 2
        loss.backward()
        opt.step()

        if i % 20 == 0:
            # track prediction accuracy in both directions
            correct_f = (x_f == outs_f.argmax(axis=1)).sum().item()
            correct_b = (x_b == outs_b.argmax(axis=1)).sum().item()
            total = x.shape[0] * 29
            print(epoch, i, loss.item(), correct_f / total, correct_b / total)

# %%
def get_emb(x):
    # run the model
    outs_f, outs_b = model(x)

    # for word vectors, any layer's output can be used; here the first LSTM layer
    # [16,29,256]
    outs_f = outs_f[1]
    outs_b = outs_b[1]

    # the forward and backward outputs are not aligned; keep only the overlapping positions
    # [16,28,256]
    outs_f = outs_f[:, 1:]
    outs_b = outs_b[:, :-1]

    # concatenate the two directions to get the final encoding
    # [16,28,256 + 256]
    embed = torch.cat((outs_f, outs_b), dim=2)

    # [16,28,512]
    return embed

get_emb(sample).shape
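get_emb picks a single layer, but the ELMo paper combines all layers with learned scalar weights. Below is a minimal sketch of that idea (my own addition, not part of the tutorial); ScalarMix and get_emb_mixed are hypothetical names, and the mixing weights shown here are untrained.

class ScalarMix(nn.Module):
    # softmax-weighted sum of the three 256-dim layers (embedding, LSTM-1, LSTM-2),
    # scaled by a learnable global factor
    def __init__(self, num_layers=3):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_layers))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, layers):
        w = torch.softmax(self.weights, dim=0)
        mixed = sum(wi * layer for wi, layer in zip(w, layers))
        return self.gamma * mixed

mix_f, mix_b = ScalarMix(), ScalarMix()

def get_emb_mixed(x):
    outs_f, outs_b = model(x)
    # align forward and backward outputs exactly as in get_emb above, for all three layers
    f = [o[:, 1:] for o in outs_f[:3]]
    b = [o[:, :-1] for o in outs_b[:3]]
    # [16,28,256 + 256]
    return torch.cat((mix_f(f), mix_b(b)), dim=2)

get_emb_mixed(sample).shape  # [16, 28, 512]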