PyTorch official documentation: xavier_uniform
xavier_uniform exists to keep the variance stable through the layers during training; a proper initialization helps keep training stable.
Xavier initialization ensures that, for every layer, the variance of the outputs is not affected by the number of inputs, and the variance of the gradients is not affected by the number of outputs.
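As a quick illustration (a minimal sketch using a hypothetical nn.Linear layer, only to show the call), nn.init.xavier_uniform_ fills a weight tensor with values drawn from U(−a, a), where a = sqrt(6 / (fan_in + fan_out)):

import torch
from torch import nn

# a hypothetical linear layer, used only to demonstrate the initializer
linear = nn.Linear(256, 128)
nn.init.xavier_uniform_(linear.weight)

# the weights are drawn from U(-a, a) with a = sqrt(6 / (fan_in + fan_out))
fan_in, fan_out = 256, 128
bound = (6.0 / (fan_in + fan_out)) ** 0.5
print(linear.weight.abs().max() <= bound)  # expected: tensor(True)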
1-D convolution. To make it easier to understand, we implement a custom 1-D convolution function and check that its results match PyTorch's official implementation.
import torch
from torch import nn

# 1-D convolution using the official PyTorch API: nn.Conv1d
# define the input
input = torch.randn(32, 25, 34)
# define the network
net = nn.Conv1d(in_channels=25, out_channels=5, kernel_size=2)
# run the forward pass
output = net(input)

# inspect the weights and bias of the network
for m, k in net.named_parameters():
    print(m, k.shape)

# shape of the output
print(f"output.shape={output.shape}")
# output.shape=torch.Size([32, 5, 33])


# custom 1-D convolution
def conv1d_cus(input, weight, bias):
    """
    :param input: input tensor
    :param weight: conv1d.weight, the network weights
    :param bias: conv1d.bias, the network bias
    :return: output tensor
    """
    # input.shape  = torch.Size([bs, in_channel, T])
    # weight.shape = torch.Size([out_channel, in_channel, kernel_size])
    bs, in_channel, T = input.shape
    out_channel, _, kernel_size = weight.shape
    # output.shape = torch.Size([bs, out_channel, out_h])
    out_h = T - kernel_size + 1
    output = torch.zeros((bs, out_channel, out_h))
    for i in range(bs):                # loop over the batch
        for j in range(out_channel):   # loop over the output channels
            for m in range(out_h):     # loop over the output length
                # x.shape = torch.Size([in_channel, kernel_size])
                # slice of the input covered by the kernel
                x = input[i, :, m:m + kernel_size]
                # kernel weights of this output channel
                # k.shape = torch.Size([in_channel, kernel_size])
                k = weight[j, :, :]
                # w = x * k + bias
                output[i, j, m] = torch.sum(x * k) + bias[j]
    return output


# reuse the same input
cu_input = input
# reuse the weights of the official conv1d layer
cu_weight = net.weight
# reuse the bias of the official conv1d layer
cu_bias = net.bias
# feed the same parameters into the custom function
cu_output = conv1d_cus(cu_input, cu_weight, cu_bias)
# compare the official output with the custom output
# if every entry of flags_cu is True, the custom function is correct
flags_cu = torch.isclose(cu_output, output)
# print flags_cu
print(f"flags_cu={flags_cu}")
weight torch.Size([5, 25, 2])
bias torch.Size([5])
output.shape=torch.Size([32, 5, 33])
flags_cu=tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],
        ...
        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])
(every entry of flags_cu is True, so the custom conv1d matches the official implementation)
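Instead of inspecting the element-wise flags, the comparison can also be summarized with torch.allclose (a small convenience check, assuming the cu_output and output tensors from the code above):

# True if every element of cu_output is close to the corresponding element of output
print(torch.allclose(cu_output, output))  # expected: True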
The torch.sigmoid function is an activation function that maps every element of a tensor into the range (0, 1); the formula is as follows:
Sigmoid(x) = σ(x) = 1 / (1 + exp(−x))
CLASS torch.nn.Sigmoid
# x = torch.arange(12, dtype=torch.float32)
x = torch.randn((3, 4))
y = torch.sigmoid(x)

# custom sigmoid function
def sigmoid_cus(x):
    y = 1.0 / (1 + torch.exp(-x))
    return y

z = sigmoid_cus(x)
# check whether the custom sigmoid matches the official API
flags_sigmoid = torch.isclose(y, z)
print(f"x={x}")
# x=tensor([[ 1.1050, -1.2536,  0.2727, -0.9987],
#           [ 1.5892,  0.5052,  2.2567,  1.3520],
#           [ 1.7559,  0.4546, -0.7967,  0.0197]])
print(f"y={y}")
# y=tensor([[0.7512, 0.2221, 0.5678, 0.2692],
#           [0.8305, 0.6237, 0.9052, 0.7945],
#           [0.8527, 0.6117, 0.3107, 0.5049]])
print(f"flags_sigmoid={flags_sigmoid}")
# flags_sigmoid=tensor([[True, True, True, True],
#                       [True, True, True, True],
#                       [True, True, True, True]])
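The nn.Sigmoid class listed above is simply the module form of the same function, convenient when building networks with nn.Sequential; a minimal sketch:

import torch
from torch import nn

sigmoid_layer = nn.Sigmoid()  # module form of the sigmoid activation
x = torch.randn(3, 4)
# the module and the functional torch.sigmoid give identical results
print(torch.allclose(sigmoid_layer(x), torch.sigmoid(x)))  # expected: True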
Computing the mean of a tensor
# x: a tensor with 3 rows and 4 columns
x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
# official PyTorch API: torch.mean over dim=1
y = torch.mean(x, dim=1)

# custom mean function
def mean_cus(input, dim):
    if isinstance(dim, int):
        assert 0 <= dim < input.dim()
    y = torch.sum(input, dim=dim) / input.shape[dim]
    return y

z = mean_cus(x, dim=1)
print(f"x={x}")
# x=tensor([[ 0.,  1.,  2.,  3.],
#           [ 4.,  5.,  6.,  7.],
#           [ 8.,  9., 10., 11.]])
print(f"y={y}")
# y=tensor([1.5000, 5.5000, 9.5000])
print(f"z={z}")
# z=tensor([1.5000, 5.5000, 9.5000])
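torch.mean also accepts keepdim=True, which keeps the reduced dimension with size 1; this is handy for broadcasting. A small sketch:

import torch

x = torch.arange(12, dtype=torch.float32).reshape(3, 4)
row_mean = torch.mean(x, dim=1, keepdim=True)  # shape [3, 1] instead of [3]
print(row_mean.shape)  # torch.Size([3, 1])
# broadcasting: subtract each row's mean from that row
print(x - row_mean)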
The IMDB dataset
train: 25,000 samples; test: 25,000 samples
Tokenizer: its main job is to split text into tokens.
torchtext.data.utils.get_tokenizer(tokenizer, language='en')
from torchtext.data import get_tokenizer

# instantiate a tokenizer
tokenizer = get_tokenizer("basic_english")
# define an input sentence
input = "I like to use pytorchtext as my tools in the future"
# tokenize the input sentence
token = tokenizer(input)
# print the result
print(f"token={token}")
# token=['i', 'like', 'to', 'use', 'pytorchtext', 'as', 'my', 'tools', 'in', 'the', 'future']
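Note that the "basic_english" tokenizer also lowercases the text and splits punctuation into separate tokens, so a sentence with punctuation would come out roughly like this (a quick sketch):

print(tokenizer("Hello, World!"))
# roughly: ['hello', ',', 'world', '!']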
The approach is as follows:
train_data_iter: create the dataset object
yield_tokens: tokenize the x part (the comment sentences) of each sample in the dataset
build_vocab_from_iterator: filter out the low-frequency words and build the vocabulary from the high-frequency words
set_default_index: map low-frequency words that are not in the vocabulary to index 0

# tokenize the samples in the dataset
def yield_tokens(train_data_iter, tokenizer):
    for i, sample in enumerate(train_data_iter):
        label, comment = sample
        # tokenize comment, i.e. the x part of the sample
        yield tokenizer(comment)

# get the dataset object
train_data_iter = IMDB(root='.data', split='train')  # a Dataset-style object
# instantiate a tokenizer
tokenizer = get_tokenizer("basic_english")
# words with frequency below 20 are replaced by "<unk>"; the rest go into the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(train_data_iter, tokenizer), min_freq=20, specials=["<unk>"])
# tokens that are not in the vocabulary are mapped to index 0
vocab.set_default_index(0)
print(f"vocabulary size: {len(vocab)}")
torch.index_select(input, dim, index, *, out=None) → Tensor
input: the input tensor to be indexed
dim: the dimension along which to index (by row or by column)
index: a 1-D tensor containing the indices to select

import torch
from torch import nn

# create a tensor with 4 rows and 4 columns
input_3 = torch.randn(4, 4)
# input_3=tensor([[-1.5693, -0.6550,  0.6508, -0.8672],   row 0
#                 [ 0.2457,  0.0737,  1.6346, -0.4966],   row 1
#                 [-1.4351,  0.6115,  1.5060,  0.2504],   row 2
#                 [-0.1475, -2.5242,  1.1654, -1.9561]])  row 3
# create an index tensor; with dim=0 these are row indices
index = torch.randint(0, 4, (5,))
# index=tensor([3, 0, 2, 0, 1])
# picks row 3: [-0.1475, -2.5242,  1.1654, -1.9561]
# picks row 0: [-1.5693, -0.6550,  0.6508, -0.8672]
# picks row 2: [-1.4351,  0.6115,  1.5060,  0.2504]
# picks row 0: [-1.5693, -0.6550,  0.6508, -0.8672]
# picks row 1: [ 0.2457,  0.0737,  1.6346, -0.4966]
# select rows from input_3 according to the indices
output_3 = torch.index_select(input_3, 0, index)
# output_3=tensor([[-0.1475, -2.5242,  1.1654, -1.9561],
#                  [-1.5693, -0.6550,  0.6508, -0.8672],
#                  [-1.4351,  0.6115,  1.5060,  0.2504],
#                  [-1.5693, -0.6550,  0.6508, -0.8672],
#                  [ 0.2457,  0.0737,  1.6346, -0.4966]])
print(f"input_3={input_3}")
print(f"input_3.shape={input_3.shape}")
# input_3.shape=torch.Size([4, 4])
print(f"index={index}")
# index=tensor([3, 0, 2, 0, 1])
print(f"index.shape={index.shape}")
# index.shape=torch.Size([5])
print(f"output_3={output_3}")
print(f"output_3.shape={output_3.shape}")
# output_3.shape=torch.Size([5, 4])
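The same function selects columns when dim=1; a short sketch:

import torch

x = torch.arange(12).reshape(3, 4)
col_index = torch.tensor([0, 2])
# dim=1 selects along the columns, so the result keeps all 3 rows but only columns 0 and 2
cols = torch.index_select(x, 1, col_index)
print(cols)
# tensor([[ 0,  2],
#         [ 4,  6],
#         [ 8, 10]])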
vocab = build_vocab_from_iterator(yield_tokens(train_data_iter, tokenizer), min_freq=20, specials=["<unk>"])
vocab.set_default_index(0)
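With the default index set to 0, any token that is not in the vocabulary is mapped to the index of "<unk>"; a minimal sketch assuming the vocab built above:

# known tokens map to their own indices, unknown tokens map to index 0 ("<unk>")
print(vocab(["the", "movie"]))   # indices of the two tokens (actual values depend on the corpus)
print(vocab(["qwertyasdf"]))     # [0], because the token is not in the vocabulary
print(vocab.lookup_token(0))     # '<unk>'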
torch.Tensor.masked_fill_: fills the positions of the tensor where mask is True with value;
import torch

# randomly create a 4x4 boolean mask
masked = torch.randint(0, 2, (4, 4)).to(torch.bool)
# masked=tensor([[False, False,  True, False],
#                [ True, False, False, False],
#                [ True,  True, False, False],
#                [False,  True,  True,  True]])
print(f"masked={masked}")
# create an all-ones matrix
input = torch.ones(4, 4)
print(f"input={input}")
# input=tensor([[1., 1., 1., 1.],
#               [1., 1., 1., 1.],
#               [1., 1., 1., 1.],
#               [1., 1., 1., 1.]])
# replace the positions where masked is True with 888
# e.g. masked[0][2] is True, so input[0][2] is replaced by 888
input.masked_fill_(masked, value=888)
print(f"input={input}")
# input=tensor([[  1.,   1., 888.,   1.],
#               [888.,   1.,   1.,   1.],
#               [888., 888.,   1.,   1.],
#               [  1., 888., 888., 888.]])
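masked_fill_ (with the trailing underscore) modifies the tensor in place; the out-of-place variant masked_fill returns a new tensor and leaves the input unchanged. A minimal sketch:

import torch

mask = torch.tensor([[True, False], [False, True]])
x = torch.zeros(2, 2)
y = x.masked_fill(mask, -1.0)  # out-of-place: x stays all zeros
print(x)
# tensor([[0., 0.],
#         [0., 0.]])
print(y)
# tensor([[-1.,  0.],
#         [ 0., -1.]])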
torch.clamp clips a tensor: values greater than max are set to max, and values smaller than min are set to min;
import torch
# randomly generate a tensor x (3 rows, 4 columns)
x = torch.randn(3,4)
print(f"x={x}")
# x=tensor([[-0.5497, -0.1973, -1.3577, -0.0390],
# [-1.9676, 0.1965, -0.2627, 0.3018],
# [-0.3583, -1.1668, -0.1516, -0.5768]])
# clamp the values of x to the range [-0.5, 0.5]
y = torch.clamp(x,min=-0.5,max=0.5)
# y=tensor([[-0.5000, -0.1973, -0.5000, -0.0390],
# [-0.5000, 0.1965, -0.2627, 0.3018],
# [-0.3583, -0.5000, -0.1516, -0.5000]])
print(f"y={y}")
Clip the gradients of the model parameters so that their total norm does not exceed a given threshold (here 0.1);
nn.utils.clip_grad_norm_(model.parameters(), 0.1)
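In a training loop the call goes between loss.backward() and optimizer.step(), so the parameters are updated with the clipped gradients; a minimal sketch with a hypothetical linear model:

import torch
from torch import nn
import torch.nn.functional as F

model = nn.Linear(10, 2)  # hypothetical model, only for illustration
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

x, target = torch.randn(4, 10), torch.randint(0, 2, (4,))
loss = F.cross_entropy(model(x), target)

optimizer.zero_grad()
loss.backward()                                     # compute gradients
nn.utils.clip_grad_norm_(model.parameters(), 0.1)   # rescale so the total gradient norm is at most 0.1
optimizer.step()                                    # update parameters with the clipped gradients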
torch.masked_select(input, mask, *, out=None) → Tensor
The elements of input at the positions where mask is True are picked out and returned as a 1-D tensor.
import torch

# create a tensor with 3 rows and 4 columns
x_2 = torch.randn(3, 4)
# x_2=tensor([[ 0.9040,  0.4787, -0.7427,  1.0943],
#             [ 1.1150, -1.4897, -1.0072, -1.0045],
#             [-0.2445,  1.7155, -0.7584, -0.2749]])
# compare the values of x_2 with 0.5: True if greater than or equal, otherwise False
mask_2 = x_2.ge(0.5)
# mask_2=tensor([[ True, False, False,  True],
#                [ True, False, False, False],
#                [False,  True, False, False]])
# pick the values of x_2 where mask_2 is True and return them as a 1-D tensor
y_2 = torch.masked_select(x_2, mask_2)
# y_2=tensor([0.9040, 1.0943, 1.1150, 1.7155])
print(f"x_2={x_2}")
print(f"mask_2={mask_2}")
print(f"y_2={y_2}")
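Boolean indexing gives the same result as torch.masked_select; both return a flattened 1-D tensor (using x_2, mask_2 and y_2 from the code above):

# equivalent to torch.masked_select(x_2, mask_2)
y_2_alt = x_2[mask_2]
print(torch.equal(y_2, y_2_alt))  # expected: True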
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File              : gcn_imdb_sentence_classification.py
# Author            : admin <admin>
# Date              : 31.12.2021
# Last Modified Date: 06.01.2022
# Last Modified By  : admin <admin>
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.datasets import IMDB  # install with: pip install torchtext
from torchtext.datasets.imdb import NUM_LINES
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset

import sys
import os
import logging

# configure the logging output
logging.basicConfig(
    # messages at level logging.WARN and above are printed to the console
    level=logging.WARN,
    stream=sys.stdout,
    format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)

VOCAB_SIZE = 15000


# part 1: the GCNN model (gated convolutional network)
class GCNN(nn.Module):
    def __init__(self, vocab_size=VOCAB_SIZE, embedding_dim=64, num_class=2):
        """
        :param vocab_size: size of the vocabulary, computed from the dataset
        :param embedding_dim: length of the vector that represents each token
        :param num_class: number of classes
        """
        # initialize the parent class so the functions in nn.Module can be used
        super(GCNN, self).__init__()

        # create an nn.Embedding table:
        # rows = vocabulary size, columns = length of each word vector
        self.embedding_table = nn.Embedding(vocab_size, embedding_dim)
        # initialize the weights of the embedding table with xavier_uniform_
        nn.init.xavier_uniform_(self.embedding_table.weight)

        # 1-D convolutions
        self.conv_A_1 = nn.Conv1d(embedding_dim, 64, 15, stride=7)
        self.conv_B_1 = nn.Conv1d(embedding_dim, 64, 15, stride=7)

        self.conv_A_2 = nn.Conv1d(64, 64, 15, stride=7)
        self.conv_B_2 = nn.Conv1d(64, 64, 15, stride=7)

        # fully connected layer
        self.output_linear1 = nn.Linear(64, 128)
        # classification layer, num_class is the number of classes
        self.output_linear2 = nn.Linear(128, num_class)

    def forward(self, word_index):
        # defines the computation of the GCNN: from word-ID input to classification logits
        # 1. look up the word embeddings from word_index
        # word_index shape: [bs, max_seq_len]
        # nn.Embedding(vocab_size, embedding_dim)
        word_embedding = self.embedding_table(word_index)  # [bs, max_seq_len, embedding_dim]

        # 2. first gated 1-D convolution block
        # [bs, max_seq_len, embedding_dim] -> [bs, embedding_dim, max_seq_len]
        word_embedding = word_embedding.transpose(1, 2)  # [bs, embedding_dim, max_seq_len]
        A = self.conv_A_1(word_embedding)
        B = self.conv_B_1(word_embedding)
        H = A * torch.sigmoid(B)  # [bs, 64, max_seq_len]

        A = self.conv_A_2(H)
        B = self.conv_B_2(H)
        H = A * torch.sigmoid(B)  # [bs, 64, max_seq_len]

        # 3. pooling followed by the fully connected layers
        pool_output = torch.mean(H, dim=-1)  # average pooling, shape [bs, 64]
        linear1_output = self.output_linear1(pool_output)
        logits = self.output_linear2(linear1_output)  # [bs, 2]

        return logits


class TextClassificationModel(nn.Module):
    """ simple EmbeddingBag + DNN model """

    def __init__(self, vocab_size=VOCAB_SIZE, embed_dim=64, num_class=2):
        super(TextClassificationModel, self).__init__()
        # self.embedding output shape: [bs, embedding_dim]
        # nn.EmbeddingBag averages the embeddings of the tokens in each sample
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        # map the averaged embedding to the classes with a fully connected layer
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, token_index):
        embedded = self.embedding(token_index)  # shape: [bs, embedding_dim]
        return self.fc(embedded)


# step 2: build the IMDB DataLoader
BATCH_SIZE = 64


# tokenize the samples in the dataset
def yield_tokens(train_data_iter, tokenizer):
    for i, sample in enumerate(train_data_iter):
        label, comment = sample
        # tokenize comment, i.e. the x part of the sample
        yield tokenizer(comment)


# get the dataset object
train_data_iter = IMDB(root='.data', split='train')  # a Dataset-style object
# instantiate a tokenizer
tokenizer = get_tokenizer("basic_english")
# words with frequency below 20 are replaced by "<unk>"; the rest go into the vocabulary
vocab = build_vocab_from_iterator(yield_tokens(train_data_iter, tokenizer), min_freq=20, specials=["<unk>"])
# tokens that are not in the vocabulary are mapped to index 0
vocab.set_default_index(0)
print(f"vocabulary size: {len(vocab)}")


def collate_fn(batch):
    """ post-process the mini-batch produced by the DataLoader """
    target = []
    token_index = []
    max_length = 0
    # batch is a list of (label, comment) tuples
    for i, (label, comment) in enumerate(batch):
        tokens = tokenizer(comment)

        token_index.append(vocab(tokens))
        if len(tokens) > max_length:
            max_length = len(tokens)

        if label == "pos":
            target.append(0)
        else:
            target.append(1)

    # pad every sample with index 0 up to the longest sample in the batch
    token_index = [index + [0]*(max_length-len(index)) for index in token_index]
    return (torch.tensor(target).to(torch.int64), torch.tensor(token_index).to(torch.int32))


# step 3: training code
def train(train_data_loader, eval_data_loader, model, optimizer, num_epoch, log_step_interval, save_step_interval, eval_step_interval, save_path, resume=""):
    """ here the data_loader wraps a map-style dataset """
    start_epoch = 0
    start_step = 0
    if resume != "":
        # load the parameters of a previously trained model
        logging.warning(f"loading from {resume}")
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch']
        start_step = checkpoint['step']

    for epoch_index in range(start_epoch, num_epoch):
        ema_loss = 0.
        num_batches = len(train_data_loader)

        for batch_index, (target, token_index) in enumerate(train_data_loader):
            optimizer.zero_grad()
            step = num_batches*(epoch_index) + batch_index + 1

            logits = model(token_index)
            bce_loss = F.binary_cross_entropy(torch.sigmoid(logits), F.one_hot(target, num_classes=2).to(torch.float32))
            ema_loss = 0.9*ema_loss + 0.1*bce_loss
            bce_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()

            if step % log_step_interval == 0:
                logging.warning(f"epoch_index: {epoch_index}, batch_index: {batch_index}, ema_loss: {ema_loss.item()}")

            if step % save_step_interval == 0:
                os.makedirs(save_path, exist_ok=True)
                save_file = os.path.join(save_path, f"step_{step}.pt")
                torch.save({
                    'epoch': epoch_index,
                    'step': step,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': bce_loss,
                }, save_file)
                logging.warning(f"checkpoint has been saved in {save_file}")

            if step % eval_step_interval == 0:
                logging.warning("start to do evaluation...")
                model.eval()
                ema_eval_loss = 0
                total_acc_account = 0
                total_account = 0
                for eval_batch_index, (eval_target, eval_token_index) in enumerate(eval_data_loader):
                    total_account += eval_target.shape[0]
                    eval_logits = model(eval_token_index)
                    total_acc_account += (torch.argmax(eval_logits, dim=-1) == eval_target).sum().item()
                    eval_bce_loss = F.binary_cross_entropy(torch.sigmoid(eval_logits), F.one_hot(eval_target, num_classes=2).to(torch.float32))
                    ema_eval_loss = 0.9*ema_eval_loss + 0.1*eval_bce_loss
                acc = total_acc_account/total_account
                logging.warning(f"eval_ema_loss: {ema_eval_loss.item()}, eval_acc: {acc}")
                model.train()


# step 4: run the training
if __name__ == "__main__":
    model = GCNN()
    # model = TextClassificationModel()
    print("total number of parameters:", sum(p.numel() for p in model.parameters()))

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    train_data_iter = IMDB(root='.data', split='train')  # a Dataset-style object
    train_data_loader = torch.utils.data.DataLoader(to_map_style_dataset(train_data_iter), batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)

    eval_data_iter = IMDB(root='.data', split='test')  # a Dataset-style object
    eval_data_loader = torch.utils.data.DataLoader(to_map_style_dataset(eval_data_iter), batch_size=8, collate_fn=collate_fn)
    resume = ""

    train(train_data_loader, eval_data_loader, model, optimizer, num_epoch=10, log_step_interval=20, save_step_interval=500, eval_step_interval=300, save_path="./logs_imdb_text_classification", resume=resume)
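A small side note on the loss used above: F.binary_cross_entropy(torch.sigmoid(logits), ...) can equivalently be written with the fused and numerically more stable F.binary_cross_entropy_with_logits, which applies the sigmoid internally (a sketch of the equivalent call, not a change to the script above):

# equivalent to F.binary_cross_entropy(torch.sigmoid(logits), F.one_hot(target, num_classes=2).to(torch.float32))
bce_loss = F.binary_cross_entropy_with_logits(logits, F.one_hot(target, num_classes=2).to(torch.float32))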