import math
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import matplotlib.pyplot as plt
import liliPytorch as lp

# Load H. G. Wells's "The Time Machine" dataset
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# Inspect the dataset
# for X, Y in train_iter:
#     print('X:', X.shape)
#     print('Y:', Y.shape)
# print(vocab.token_freqs)
# print(vocab.idx_to_token)
# print(vocab.token_to_idx)

# One-hot encoding
# Map each index to a distinct unit vector: suppose the vocabulary contains N different tokens
# (i.e. len(vocab)), so token indices range from 0 to N-1. If a token's index is the integer i,
# we create a length-N vector of zeros and set element i to 1.
# This vector is the one-hot vector of the original token.
# print(F.one_hot(torch.tensor([0, 3, 6]), len(vocab)))
"""
tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
"""

# Each sampled minibatch is a 2D tensor of shape (batch size, number of time steps).
# The one_hot function turns such a minibatch into a 3D tensor whose last dimension equals the
# vocabulary size (len(vocab)).
# We often transpose the input so that the output has shape
# (number of time steps, batch size, vocabulary size).
# This lets us conveniently loop over the outermost dimension and update the hidden state of the
# minibatch one time step at a time.
# X = torch.arange(10).reshape((2, 5))
# print(X)
# tensor([[0, 1, 2, 3, 4],
#         [5, 6, 7, 8, 9]])
# print(X.T)
# tensor([[0, 5],
#         [1, 6],
#         [2, 7],
#         [3, 8],
#         [4, 9]])
# print(F.one_hot(X.T, 28).shape)  # torch.Size([5, 2, 28])
# print(F.one_hot(X.T, 28))
"""
tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])
"""

# Initialize model parameters
def get_params(vocab_size, num_hiddens, device):
    # The numbers of inputs and outputs both equal the vocabulary size
    num_inputs = num_outputs = vocab_size

    # Helper that initializes weights from a normal distribution
    def normal(shape):
        return torch.randn(size=shape, device=device) * 0.01

    # Hidden-layer parameters
    W_xh = normal((num_inputs, num_hiddens))   # input-to-hidden weights
    W_hh = normal((num_hiddens, num_hiddens))  # hidden-to-hidden (recurrent) weights
    b_h = torch.zeros(num_hiddens, device=device)  # hidden-layer bias
    # Output-layer parameters
    W_hq = normal((num_hiddens, num_outputs))  # hidden-to-output weights
    b_q = torch.zeros(num_outputs, device=device)  # output-layer bias
    # Collect all parameters in a list
    params = [W_xh, W_hh, b_h, W_hq, b_q]
    # Enable gradient tracking so gradients are computed during backpropagation
    for param in params:
        param.requires_grad_(True)
    return params  # return the parameter list

# Recurrent neural network model
# Return the hidden state at initialization
def init_rnn_state(batch_size, num_hiddens, device):
    # batch_size: number of sequences fed into the RNN at once.
    # num_hiddens: number of hidden units, i.e. the dimensionality of the hidden state.
    return (torch.zeros((batch_size, num_hiddens), device=device), )  # a tuple holding one tensor

def rnn(inputs, state, params):
    # inputs has shape (number of time steps, batch size, vocabulary size)
    # state: the initial hidden state, a tuple holding the hidden-layer state.
    # params: the RNN parameters (weights and biases).
    W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state  # the current hidden state
    outputs = []
    # X has shape (batch size, vocabulary size)
    for X in inputs:
        H = torch.tanh(torch.mm(X, W_xh) + torch.mm(H, W_hh) + b_h)
        Y = torch.mm(H, W_hq) + b_q
        outputs.append(Y)
    return torch.cat(outputs, dim=0), (H,)

# Class that stores the parameters of the from-scratch RNN model
class RNNModelScratch:  #@save
    """RNN model implemented from scratch"""
    def __init__(self, vocab_size, num_hiddens, device,
                 get_params, init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):  # forward pass
        X = F.one_hot(X.T, self.vocab_size).type(torch.float32)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, device):  # initialize the hidden state
        return self.init_state(batch_size, self.num_hiddens, device)

# X = torch.arange(10).reshape((2, 5))
num_hiddens = 512
# net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
#                       init_rnn_state, rnn)
# state = net.begin_state(X.shape[0], d2l.try_gpu())  # initialize the hidden state
# Calling the model instance invokes __call__, which runs the forward pass.
# Y, new_state = net(X.to(d2l.try_gpu()), state)
# Y: model output. new_state: updated hidden state.
# print(Y.shape, len(new_state), new_state[0].shape)
# torch.Size([10, 28]) 1 torch.Size([2, 512])
# The output shape is (number of time steps * batch size, vocabulary size),
# while the hidden state keeps its shape, i.e. (batch size, number of hidden units).

def predict_ch8(prefix, num_preds, net, vocab, device):  #@save
    """Generate new characters following the prefix.

    prefix: the prefix, i.e. the initial input character sequence.
    num_preds: number of characters to predict.
    net: the trained recurrent neural network model.
    vocab: the vocabulary, mapping characters to indices and indices to characters.
    """
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]  # outputs: list of indices of generated characters
    get_input = lambda: torch.tensor([outputs[-1]], device=device).reshape((1, 1))
    for y in prefix[1:]:  # warm-up period: iterate over the remaining prefix characters
        _, state = net(get_input(), state)  # forward pass through net to update the hidden state
        outputs.append(vocab[y])  # append the index of the current character to outputs
    for _ in range(num_preds):  # predict num_preds steps
        # Forward pass through net to get the prediction y and the updated hidden state.
        y, state = net(get_input(), state)
        # Take the predicted character index with y.argmax(dim=1) and append it to outputs.
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])

# print(predict_ch8('time traveller ', 10, net, vocab, d2l.try_gpu()))
# time traveller cfjwsthaqc

# Gradient clipping
"""
When training deep networks (recurrent networks in particular), exploding gradients can make
gradient values extremely large, destabilizing the model or making training fail altogether.
To prevent this, the gradients are clipped so that their norm never exceeds a preset threshold.
"""
def grad_clipping(net, theta):  #@save
    """Clip the gradients.

    net: the neural network model.
    theta: the gradient-clipping threshold.
    """
    if isinstance(net, nn.Module):
        params = [p for p in net.parameters() if p.requires_grad]
    else:
        params = net.params
    # Compute the gradient norm (L2 norm over all parameters)
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            # Rescale every gradient proportionally so the new gradient norm equals theta.
            param.grad[:] *= theta / norm

# Training
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """Train the network for one epoch (defined in Chapter 8)."""
    state, timer = None, d2l.Timer()
    metric = lp.Accumulator(2)  # sum of training loss, number of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize state on the first iteration or when using random sampling
            state = net.begin_state(batch_size=X.shape[0], device=device)
        else:
            if isinstance(net, nn.Module) and not isinstance(state, tuple):
                # state is a tensor for nn.GRU
                state.detach_()
            else:
                # state is a tuple of tensors for nn.LSTM and for our from-scratch model
                for s in state:
                    s.detach_()
        y = Y.T.reshape(-1)
        X, y = X.to(device), y.to(device)
        y_hat, state = net(X, state)
        l = loss(y_hat, y.long()).mean()
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            l.backward()
            grad_clipping(net, 1)
            updater.step()
        else:
            l.backward()
            grad_clipping(net, 1)
            # batch_size=1 because mean() has already been applied to the loss
            updater(batch_size=1)
        metric.add(l * y.numel(), y.numel())
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()

#@save
def train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False):
    """Train the model (defined in Chapter 8)."""
    loss = nn.CrossEntropyLoss()
    animator = lp.Animator(xlabel='epoch', ylabel='perplexity',
                           legend=['train'], xlim=[10, num_epochs])
    # Initialization
    if isinstance(net, nn.Module):
        updater = torch.optim.SGD(net.parameters(), lr)
    else:
        updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device)
    # Training and prediction
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(
            net, train_iter, loss, updater, device, use_random_iter)
        if (epoch + 1) % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller '))
    print(predict('traveller '))

# Sequential sampling
num_epochs, lr = 500, 1
# train_ch8(net, train_iter, vocab, lr, num_epochs, d2l.try_gpu())
# plt.show()
"""
perplexity 1.0, 95138.3 tokens/sec on cuda:0
time traveller you can show black is white by argument said filby
traveller you can show black is white by argument said filby
"""

# Random sampling
net = RNNModelScratch(len(vocab), num_hiddens, d2l.try_gpu(), get_params,
                      init_rnn_state, rnn)
train_ch8(net, train_iter, vocab, lr, num_epochs, d2l.try_gpu(),
          use_random_iter=True)
plt.show()
"""
perplexity 1.3, 109268.9 tokens/sec on cuda:0
time traveller held in his hand was a glitteringmetallic framewor
traveller held in his hand was a glitteringmetallic framewor
"""
Sequential sampling:
Random sampling:
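Since grad_clipping rescales all gradients in place whenever their global L2 norm exceeds theta, a quick way to convince yourself it behaves as intended is to build a dummy object exposing a params list, force large gradients, and inspect the norm after clipping. The snippet below is a minimal sketch of such a check and is not part of the original code: TinyNet and the scaling factor 100 are made-up illustrations, and it assumes torch and the grad_clipping function defined above are in scope.

import torch

class TinyNet:
    """Hypothetical stand-in that mimics the from-scratch model's interface (a .params list)."""
    def __init__(self):
        self.params = [torch.randn(4, 4, requires_grad=True),
                       torch.randn(4, requires_grad=True)]

toy = TinyNet()
# Deliberately produce large gradients (each gradient entry becomes 100).
loss = sum((p * 100).sum() for p in toy.params)
loss.backward()
grad_clipping(toy, 1)  # clip with threshold theta = 1
norm = torch.sqrt(sum(torch.sum(p.grad ** 2) for p in toy.params))
print(norm)  # expected: approximately 1.0, i.e. the threshold theta

Before clipping the global norm is about 100 * sqrt(20) ≈ 447, so the rescaling branch is taken and the printed norm should come out at (roughly) the threshold.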
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import matplotlib.pyplot as plt

# Load the time machine dataset and set the batch size and sequence length
batch_size, num_steps = 32, 35
train_iter, vocab = d2l.load_data_time_machine(batch_size, num_steps)

# Define the RNN layer
num_hiddens = 256
rnn_layer = nn.RNN(len(vocab), num_hiddens)

# Initialize the hidden state with a tensor of zeros
state = torch.zeros((1, batch_size, num_hiddens))
# print(state.shape)  # torch.Size([1, 32, 256])

# X = torch.rand(size=(num_steps, batch_size, len(vocab)))
# Y, state_new = rnn_layer(X, state)
# print(Y.shape, state_new.shape, X.shape)
# torch.Size([35, 32, 256]) torch.Size([1, 32, 256]) torch.Size([35, 32, 28])

# The complete recurrent neural network model is defined by the RNNModel class
#@save
class RNNModel(nn.Module):
    """The recurrent neural network model."""
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        # If the RNN is bidirectional, num_directions should be 2; otherwise it should be 1
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens * 2, self.vocab_size)

    def forward(self, inputs, state):
        X = F.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        # The fully connected layer first reshapes Y to
        # (number of time steps * batch size, number of hidden units).
        # Its output has shape (number of time steps * batch size, vocabulary size).
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        if not isinstance(self.rnn, nn.LSTM):
            # nn.GRU uses a tensor as the hidden state
            return torch.zeros((self.num_directions * self.rnn.num_layers,
                                batch_size, self.num_hiddens), device=device)
        else:
            # nn.LSTM uses a tuple as the hidden state
            return (torch.zeros((
                self.num_directions * self.rnn.num_layers,
                batch_size, self.num_hiddens), device=device),
                    torch.zeros((
                        self.num_directions * self.rnn.num_layers,
                        batch_size, self.num_hiddens), device=device))

# Training and prediction
device = d2l.try_gpu()
net = RNNModel(rnn_layer, vocab_size=len(vocab))
net = net.to(device)
num_epochs, lr = 500, 1
d2l.train_ch8(net, train_iter, vocab, lr, num_epochs, device)
"""
perplexity 1.3, 236379.1 tokens/sec on cuda:0
time traveller held in his hand was a glitteringmetallic framewo
traveller fith a slan but move anotle bothe thon st stagee
"""
plt.show()
print(d2l.predict_ch8('time traveller', 10, net, vocab, device))
# time traveller held in h
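Because begin_state already distinguishes the tensor state of nn.RNN/nn.GRU from the (H, C) tuple state of nn.LSTM, and accounts for num_layers and bidirectional, the same RNNModel wrapper should also accept other built-in recurrent layers. The sketch below is a hypothetical shape check rather than part of the original post: gru_layer, lstm_layer, and the random index batch X are illustrative names and values, and it reuses batch_size, num_steps, num_hiddens, vocab, and device defined above.

gru_layer = nn.GRU(len(vocab), num_hiddens)    # hidden state is a single tensor
lstm_layer = nn.LSTM(len(vocab), num_hiddens)  # hidden state is an (H, C) tuple

gru_net = RNNModel(gru_layer, vocab_size=len(vocab)).to(device)
lstm_net = RNNModel(lstm_layer, vocab_size=len(vocab)).to(device)

# Sanity-check the forward-pass shapes without training, using a random batch of token indices.
X = torch.randint(0, len(vocab), (batch_size, num_steps), device=device)
for model in (gru_net, lstm_net):
    state = model.begin_state(device=device, batch_size=batch_size)
    Y, new_state = model(X, state)
    print(Y.shape)  # expected: torch.Size([num_steps * batch_size, len(vocab)])

Either model could then presumably be trained the same way as the plain RNN above, e.g. d2l.train_ch8(gru_net, train_iter, vocab, lr, num_epochs, device).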