赞
踩
视频:https://www.bilibili.com/video/BV1Ky4y1g7Nk?p=3
源码:https://github.com/lansinuote/NLP-Toturials
对于名字,以字母来处理会比较好。
因为名字不像句子那样可以分词,只能按字符(字母)切分。
这里的字典:字符和数字的对应表,共29个字符;不认识的字符用0表示。包含 - 和 1。
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# 1. Dataset definition
class SurnameDataset(Dataset):
    """One split ('train' / 'val' / 'test') of the digitized surname CSV.

    Each item is the pair (column 0, column 1) of a row. The collate function
    below treats column 0 as a comma-separated string of character ids and
    column 1 as the integer class label.
    """

    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        # Keep only the rows whose ``part`` column matches the requested split.
        self.data = data[data.part == part]

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))  # 7680 in the original run
print(len(val_dataset))  # 1640 in the original run
print(len(test_dataset))  # 1660 in the original run


# 2. Convert x to a one-hot encoding
def one_hot(data):
    """Collate a list of (ids_string, label) pairs into a one-hot batch.

    Args:
        data: list of (x, y) items as returned by ``SurnameDataset``.

    Returns:
        A ``(xs, ys)`` tuple: ``xs`` is a float tensor of shape [N, 15, 29]
        (N names, at most 15 characters each, one 29-dim one-hot vector per
        character; shorter names leave all-zero rows at the end, longer names
        are truncated) and ``ys`` is a long tensor of shape [N].
    """
    n = len(data)
    xs = np.zeros((n, 15, 29))
    ys = np.empty(n)
    for i, (x, y) in enumerate(data):
        ys[i] = y
        for j, char_id in enumerate(x.split(',')[:15]):
            # Character id k is stored at one-hot index k - 1.
            # NOTE(review): id 0 therefore maps to index -1, i.e. the last
            # slot -- confirm that id 0 never occurs or that this is intended.
            xs[i, j, int(char_id) - 1] = 1
    return torch.FloatTensor(xs), torch.LongTensor(ys)


# 3. Data loaders
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,  # every training batch has exactly 100 items
                              collate_fn=one_hot)

# Evaluation loaders keep every sample (no drop_last) and do not need
# shuffling, so the reported accuracy covers the whole split.
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=one_hot)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=one_hot)

# 4. Peek at one batch
x, y = next(iter(train_dataloader))
print(x[:2, :2], x.shape)
print(y[:5], y.shape)


# 5. Network definition
class SurnameClassifier(nn.Module):
    """Three stacked 1-D convolutions followed by a linear layer with 18 outputs.

    The input is [b, 15, 29]: the 15 character positions are used as the
    Conv1d channels and the 29-dim one-hot axis as the sequence length.
    """

    def __init__(self):
        super().__init__()

        h = 50

        # [b,15,29] -> [b,h,13]
        self.conv1 = nn.Conv1d(in_channels=15,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)

        # [b,h,13] -> [b,h,5]
        self.conv2 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=2)

        # [b,h,5] -> [b,h,1]
        self.conv3 = nn.Conv1d(in_channels=h,
                               out_channels=h,
                               kernel_size=5,
                               stride=1)

        # Activation function (stateless, so one instance is shared).
        self.elu = nn.ELU()

        self.convnet = nn.Sequential(self.conv1, self.elu,
                                     self.conv2, self.elu,
                                     self.conv3, self.elu)

        self.fc = nn.Linear(h, 18)

    def forward(self, x):
        """Map a [b, 15, 29] one-hot batch to [b, 18] class logits."""
        # [b,15,29] -> [b,h,1] -> [b,h]; squeeze drops the length-1 axis.
        out = self.convnet(x).squeeze(dim=2)

        # [b,h] -> [b,18]
        out = self.fc(out)

        return out


model = SurnameClassifier()

# Smoke test with a random batch of two names.
model(torch.randn(2, 15, 29))


def test(dataloader):
    """Return the classification accuracy of the global ``model`` on ``dataloader``.

    Switches the model to eval mode; callers that continue training must
    call ``model.train()`` again.
    """
    model.eval()

    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x).argmax(dim=1)

            correct += (y_pred == y).sum().item()
            total += len(y)

    return correct / total


test(val_dataloader)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    # test() below puts the model in eval mode, so re-enable train mode
    # at the start of every epoch.
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()

        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    # Loss of the last training batch plus validation accuracy.
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)

# The original post reports 0.659375 here (measured with drop_last=True).
test(test_dataloader)
RNN 计算过程
拿本次的词和上一次的记忆作为输入。
字典和数据都和前面使用 CNN 对姓名分类一样;
这里不将名字变为 One-hot,而是切割为字;后面补 0。
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# 1. Dataset definition
class SurnameDataset(Dataset):
    """One split ('train' / 'val' / 'test') of the digitized surname CSV.

    Each item is the pair (column 0, column 1) of a row. The collate function
    below treats column 0 as a comma-separated string of character ids and
    column 1 as the integer class label.
    """

    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        # Keep only the rows whose ``part`` column matches the requested split.
        self.data = data[data.part == part]

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))


# 2. Convert a batch to tensors
def to_tensor(data):
    """Collate a list of (ids_string, label) pairs into index tensors.

    Args:
        data: list of (x, y) items as returned by ``SurnameDataset``.

    Returns:
        A ``(xs, ys)`` tuple of long tensors: ``xs`` has shape [N, 15]
        (character ids, right-padded with 0 and truncated to 15) and ``ys``
        has shape [N].
    """
    n = len(data)
    xs = np.zeros((n, 15))
    ys = np.empty(n)
    for i, (x, y) in enumerate(data):
        ys[i] = y
        # Convert explicitly to int instead of relying on numpy casting a
        # mixed list of strings and ints.
        ids = [int(char_id) for char_id in x.split(',')[:15]]
        # Pad with 0 on the right up to 15 positions.
        xs[i] = ids + [0] * (15 - len(ids))
    return torch.LongTensor(xs), torch.LongTensor(ys)


# 3. Data loaders
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=to_tensor)

# Evaluation loaders keep every sample (no drop_last) and do not need
# shuffling, so the reported accuracy covers the whole split.
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=to_tensor)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=to_tensor)

# 4. Peek at one batch
x, y = next(iter(train_dataloader))
print(x[:5], x.shape)
print(y[:5], y.shape)


# 5. Network definition
class SurnameClassifier(nn.Module):
    """Embedding -> RNNCell unrolled over the characters -> two linear layers."""

    def __init__(self):
        super().__init__()

        # Ids 0..29; id 0 is the padding index.
        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)

        # 50-dim input, 100-dim hidden state. A single RNN cell is used here
        # (stepped manually in forward) rather than a full RNN layer, which
        # could process the whole sequence in one call.
        self.rnn_cell = nn.RNNCell(50, 100)

        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=18)

    def forward(self, x):
        """Map a [b, 15] batch of character ids to [b, 18] class logits."""
        b = x.shape[0]

        # [b,15] -> [b,15,50]
        embed = self.embedding(x)

        # Initial hidden state [b,100], allocated with the same device and
        # dtype as the embeddings.
        out = embed.new_zeros((b, 100))
        # Feed one character per step; ``out`` carries the memory forward.
        for i in range(embed.shape[1]):
            out = self.rnn_cell(embed[:, i, :], out)

        # [b,100] -> [b,18]
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly or dropout would stay active in eval mode.
        out = F.relu(self.fc1(F.dropout(out, 0.5, training=self.training)))
        out = self.fc2(F.dropout(out, 0.5, training=self.training))

        return out


model = SurnameClassifier()

# Smoke test with a dummy batch of two names.
model(torch.ones(2, 15).long())


# Evaluation function
def test(dataloader):
    """Return the classification accuracy of the global ``model`` on ``dataloader``.

    Switches the model to eval mode; callers that continue training must
    call ``model.train()`` again.
    """
    model.eval()

    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x).argmax(dim=1)

            correct += (y_pred == y).sum().item()
            total += len(y)

    return correct / total


test(val_dataloader)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(20):
    # test() below puts the model in eval mode, so re-enable train mode
    # (and therefore dropout) at the start of every epoch.
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()

        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    # Loss of the last training batch plus validation accuracy.
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)

test(test_dataloader)
三大循环神经网络
这里的字符预测不需要人工标注:标签(名字的最后一个字母)直接取自数据本身,因此可以看作无监督学习(更准确地说是自监督学习)。
# Dataset definition
class SurnameDataset(Dataset):
    """One split ('train' / 'val' / 'test') of the digitized surname CSV.

    Names with fewer than 3 characters are removed. Each item is the pair
    (column 0, column 1) of a row; the collate function below splits
    column 0 on commas into character ids.
    """

    def __init__(self, part):
        data = pd.read_csv('./data/surnames/数字化数据.csv')
        data = data[data.part == part]

        # Drop names with fewer than 3 characters; the ``x`` column holds
        # the comma-separated character ids.
        data = data[data.x.str.split(',').str.len() >= 3]

        self.data = data

    def __getitem__(self, i):
        return self.data.iloc[i, 0], self.data.iloc[i, 1]

    def __len__(self):
        return len(self.data)


train_dataset = SurnameDataset(part='train')
val_dataset = SurnameDataset(part='val')
test_dataset = SurnameDataset(part='test')

print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))


def to_tensor(data):
    """Collate names into (prefix ids, last character id) tensors.

    Args:
        data: list of (x, y) items as returned by ``SurnameDataset``; the
            class label ``y`` is ignored here.

    Returns:
        A ``(xs, ys)`` tuple of long tensors: ``xs`` has shape [N, 14] and
        holds every character except the last one, left-padded with 0 (only
        the final 14 characters are kept when the prefix is longer); ``ys``
        has shape [N] and holds the id of the last character.
    """
    n = len(data)
    xs = np.zeros((n, 14))
    # Last characters, used as prediction targets.
    ys = np.empty(n)
    for i, (x, _) in enumerate(data):
        ids = [int(char_id) for char_id in x.split(',')]

        # The last character is the target.
        ys[i] = ids[-1]

        # Everything before it is the input, cut to its final 14 ids.
        prefix = ids[:-1][-14:]

        # Pad with 0 on the left so the real characters sit at the end.
        xs[i] = [0] * (14 - len(prefix)) + prefix

    return torch.LongTensor(xs), torch.LongTensor(ys)


# Data loaders
train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=100,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=to_tensor)

# Evaluation loaders keep every sample (no drop_last) and do not need
# shuffling, so the reported accuracy covers the whole split.
val_dataloader = DataLoader(dataset=val_dataset,
                            batch_size=100,
                            shuffle=False,
                            drop_last=False,
                            collate_fn=to_tensor)

test_dataloader = DataLoader(dataset=test_dataset,
                             batch_size=100,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=to_tensor)

# Peek at one batch and keep it as a sample input for the smoke test below.
sample = next(iter(train_dataloader))
x, y = sample
print(x[:3], x.shape)
print(y[:3], y.shape)


# Network definition
class SurnameClassifier(nn.Module):
    """Embedding -> GRU -> two linear layers producing 30 logits per name."""

    def __init__(self):
        super().__init__()

        # Ids 0..29; id 0 is the padding index.
        self.embedding = nn.Embedding(num_embeddings=30,
                                      embedding_dim=50,
                                      padding_idx=0)

        # 50-dim input vectors, 100-dim hidden state (the "memory").
        self.rnn = nn.GRU(input_size=50, hidden_size=100, batch_first=True)

        self.fc1 = nn.Linear(in_features=100, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=30)

    def forward(self, x):
        """Map a [b, 14] batch of character ids to [b, 30] logits."""
        # [b,14] -> [b,14,50]
        embed = self.embedding(x)

        # [b,14,50] -> outputs [b,14,100] (unused), final hidden state [1,b,100]
        _, h = self.rnn(embed)

        # [1,b,100] -> [b,100]; squeeze only the leading axis so that a
        # batch of size 1 keeps its batch dimension.
        h = h.squeeze(0)

        # [b,100] -> [b,30]
        # F.dropout defaults to training=True, so the module's mode must be
        # passed explicitly or dropout would stay active in eval mode.
        out = F.relu(self.fc1(F.dropout(h, 0.2, training=self.training)))
        out = self.fc2(F.dropout(out, 0.2, training=self.training))

        return out


model = SurnameClassifier()

# Smoke test on the sample batch.
model(sample[0])


def test(dataloader):
    """Return the prediction accuracy of the global ``model`` on ``dataloader``.

    Switches the model to eval mode; callers that continue training must
    call ``model.train()`` again.
    """
    model.eval()

    correct = 0
    total = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x).argmax(dim=1)

            correct += (y_pred == y).sum().item()
            total += len(y)

    return correct / total


test(val_dataloader)

# -----------

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(20):
    # test() below puts the model in eval mode, so re-enable train mode
    # (and therefore dropout) at the start of every epoch.
    model.train()
    for x, y in train_dataloader:
        optimizer.zero_grad()

        y_pred = model(x)

        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()

    # Loss of the last training batch plus validation accuracy.
    accuracy = test(val_dataloader)
    print(epoch, loss.item(), accuracy)

test(test_dataloader)
2022-02-19(六) 下雨
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。