赞
踩
/data/raw_data/服饰_50k.json(共50000条数据)
{ "1": { "title": "巴拉巴 拉 旗下 梦 多多 童装 男童 毛衫 冬季 中大童 毛衫 黑色", "kb": { "适用季节": "冬季", "厚度": "适中", "领型": "高领", "适用年龄": "9-12岁", "材质成分": "锦纶", "图案": "其它", "上市时间": "2018冬季", "面料": "其它", "风格": "休闲风", "衣门襟": "套头", "适用性别": "男", "安全等级": "B类", "毛线粗细": "普通毛线" }, "ocr": "中国蓝,深土黄,健康安全,A门襟,黑色,衣袖,面料展,产品信息,领口,可水洗,细节展示,不宜暴晒,不可漂白,短拉链设计,简洁实用,吊牌价:239.00,适合季节:秋冬季,半开领设计,舒适亲肤,]面料构成,田属性说明,不可源自,合格证,不可干流", "reference": "三合一混纺纱线制成,柔软亲肤,贴身穿也没有扎感。半开领的立领设计,在较凉的天气,保护脖颈,穿脱也更为方便。侧袖的拼接撞色设计,凸现个性,宝宝穿上更帅气。" }, "2": { "title": "土拨鼠 男款 户外运动 休闲 抓绒 开衫 黑色", "kb": { "适用人群": "男士", "功能": "超轻", "尺码": "XS", "分类": "抓绒衣", "适用场景": "徒步", "品牌": "土拨鼠(Marmot)" }, "ocr": "立领设计,拉链口袋,细节展示,产品展示,土拨鼠吊,前胸口袋,洗涤方式,能迅速吸收运动过程产生的汗水,快速干燥,舒适保暖,衣身两侧拉链口袋,立领设计商务休闲,舒适保暖,拉链采用YKK拉链制作,土拨鼠洗唛标展示,美观大,保护收纳物品,产品特,不易丢失,(以吊牌为准),徒步、登山、旅行、露营、跑步、日常穿着、骑行,基本信息,收纳物品,前胸拉链口袋方便,大方,产地:越南,4759青鬼蓝,颜色:,展示,2975深海军蓝,深海军蓝,面料:聚苯,黑色", "reference": "时尚的小高领设计,可以有效锁住勃颈处的温度,让寒风也无法侵袭。前胸处的口袋搭配上时尚的品牌logo,美观还实用,让你的穿搭造型更显品味和档次。采用100wt抓绒,穿起来更暖和。" } }
将文件“服饰_50k.json”中的数据保存成以下格式数据:
短外套 女 春夏 新款 女装 复古 百搭 牛仔 外套 女 宽松 韩版 连帽 长袖 上衣 女 图色 图案 纯色 袖型 常规袖 风格 休闲风 衣门襟 拉链 适用年龄 25-29周岁 衣领材质 其它 类型 牛仔外套 流行元素 带帽 品牌 xzoo 材质 聚酯纤维 厚度 常规 版型 宽松型 衣长 常规款 袖长 长袖 组合形式 单件 领型 连帽 上市时间 2019年夏季 潮流 时尚 , 3D 立体 裁剪 , 肌理 舒适 面料 , 无 牛仔 不 时尚 , 打造 修身 S 曲线 , 修身 显瘦 版型 , 舒适 挺括 , 精美 口袋 设计 , L 精美 口袋 设计 , L 精致 袖口 设计 , 细节 展示 , 时尚 翻领 设计 , 细节 解析 , 吸湿 透气 , 分割线 裁剪 , 时尚 大方 , 肌理 时尚 , 不 起球 , 舒适 面料 , 轻微 弹力 , 修身 版型 设计 , 舒适 自 在 , 多条 , 精细 剪裁 , 抗皱 免烫 , 直筒 样式 长袖 袖口 , 时尚 大气 , , 时尚 百搭 , 优雅 显瘦 , 时尚 翻领 设计 , 恰到好处 , 时 , 提升 了 外套 的 设计 感 和 立体感 的 亮点 , 整体 的 曲线 , 勾勒 纤细 线条 , 显瘦 大方 , 两侧 口袋 的 设计 , 方便使用 的 同 , 两侧 微 开叉 弧形 下摆 设计 , 修饰<sep>气质 的 牛仔 外套 , 修身 的 版型 设计 , 勾勒 出 少女 娇美 曼妙 的 身姿 , 经典 的 连帽 设计 , 修饰 脸型 , 清纯 的 蓝色 , 显白 衬肤 , 打造 知性 优雅 的 淑女风格 。
# -*- coding: utf-8 -*-
"""Turn the raw product JSON into `source<sep>reference` text samples.

Each sample is written to ./processed_data/samples.txt and the sample list is
then split into test / dev / train files.
"""
import sys
import os
import pathlib
import json
import jieba

abs_path = pathlib.Path(__file__).parent.absolute()
# sys.path entries must be strings; list.append() returns None, so the result
# of one append must never be fed into another.
sys.path.append(str(abs_path))

# Sizes of the held-out splits; everything that remains becomes training data.
TEST_SIZE = 1000
DEV_SIZE = 5000


def write_samples(lines, file_path, opt='w'):
    """Write every item of `lines` to `file_path`, one item per line.

    Args:
        lines (Iterable[str]): The text lines to write.
        file_path (str): Destination file path.
        opt (str, optional): Mode passed to open(). Defaults to 'w'.
    """
    with open(file_path, opt, encoding='utf8') as file:
        for line in lines:
            file.write(line)
            file.write('\n')


def _build_sample(json_obj):
    """Build one `source<sep>reference` line from a single raw JSON record."""
    # Source side (x): title, then all "key value" attribute pairs, then the
    # jieba-segmented OCR text. Every piece is separated by single spaces.
    title = json_obj['title'] + ' '
    kb_merged = ''.join(
        key + ' ' + val + ' ' for key, val in dict(json_obj['kb']).items())
    ocr = ' '.join(jieba.cut(json_obj['ocr']))
    source_text = title + kb_merged + ocr

    # Target side (y): the jieba-segmented reference text.
    reference = ' '.join(jieba.cut(json_obj['reference']))
    return source_text + '<sep>' + reference


def main():
    """Read the raw JSON, build de-duplicated samples and write all splits."""
    json_path = os.path.join(abs_path, './raw_data/服饰_50k.json')
    with open(json_path, 'r', encoding='utf8') as file:
        json_objs = json.load(file)

    # dict.fromkeys() removes duplicates while keeping the input order, so the
    # split below is identical on every run (iterating a set of str is not).
    samples = list(dict.fromkeys(
        _build_sample(json_obj) for json_obj in json_objs.values()))
    print('len(samples) = ', len(samples))

    os.makedirs(os.path.join(abs_path, './processed_data'), exist_ok=True)
    write_samples(samples, os.path.join(abs_path, './processed_data/samples.txt'))

    # First TEST_SIZE samples -> test, next DEV_SIZE -> dev, remainder -> train.
    test = samples[:TEST_SIZE]
    dev = samples[TEST_SIZE:TEST_SIZE + DEV_SIZE]
    train = samples[TEST_SIZE + DEV_SIZE:]
    write_samples(train, os.path.join(abs_path, './processed_data/train.txt'))
    write_samples(dev, os.path.join(abs_path, './processed_data/dev.txt'))
    write_samples(test, os.path.join(abs_path, './processed_data/test.txt'))


if __name__ == '__main__':
    main()
从 train.txt 数据集(每行数据的格式为 src + <sep> + tgt)中提取训练数据集的 (src, tgt) 文本数据对
# -*- coding: utf-8 -*-
'''
@Description: Extract (source, target) token-list pairs from a text file.
'''
import sys
import pathlib
from typing import Callable, Optional
from utils import simple_tokenizer

abs_path = pathlib.Path(__file__).parent.absolute()
# sys.path entries must be strings; list.append() returns None, so the result
# of one append must never be fed into another.
sys.path.append(str(abs_path))


class PairsOfSrcTgt(object):
    """Source-reference pairs read from a `<sep>`-delimited text file.

    After construction, `self.pairs` holds one `(src_tokens, tgt_tokens)`
    tuple for every line that was kept.
    """

    def __init__(self,
                 filename,
                 tokenize: Callable = simple_tokenizer,
                 max_src_len: Optional[int] = None,
                 max_tgt_len: Optional[int] = None,
                 truncate_src: bool = False,
                 truncate_tgt: bool = False):
        """Read `filename` and collect the tokenized pairs.

        Args:
            filename: Path of a UTF-8 text file with one `src<sep>tgt` per line.
            tokenize: Callable applied to each side of a line.
            max_src_len: Maximum number of source tokens; a falsy value
                disables the limit.
            max_tgt_len: Maximum number of target tokens; a falsy value
                disables the limit.
            truncate_src: If True, over-long sources are cut to `max_src_len`;
                otherwise the whole line is dropped.
            truncate_tgt: If True, over-long targets are cut to `max_tgt_len`;
                otherwise the whole line is dropped.
        """
        print("Reading dataset %s..." % filename, end=' ', flush=True)
        self.filename = filename
        self.pairs = []
        with open(filename, 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                # Split the source and reference by the <sep> tag.
                pair = line.strip().split('<sep>')
                if len(pair) != 2:
                    # The reported index is 0-based.
                    print("Line %d of %s is malformed." % (i, filename))
                    print(line)
                    continue

                src = tokenize(pair[0])
                if max_src_len and len(src) > max_src_len:
                    if not truncate_src:
                        continue
                    src = src[:max_src_len]

                tgt = tokenize(pair[1])
                if max_tgt_len and len(tgt) > max_tgt_len:
                    if not truncate_tgt:
                        continue
                    tgt = tgt[:max_tgt_len]

                self.pairs.append((src, tgt))
        print("%d pairs." % len(self.pairs))
# -*- coding: utf-8 -*-
'''
@Description: Build the vocabulary (vocab).
'''
from collections import Counter
from utils import count_words
import config
import numpy as np


class BuildVocab(object):
    def __init__(self, pairs: list = None, embed_file: str = None):
        """Build the vocabulary for the data set.

        The resulting `Vocab` object is stored in `self.vocab`.

        Args:
            pairs: [(src01, tgt01), (src02, tgt02), ...] token-list pairs.
                Defaults to no pairs.
            embed_file (str, optional): The file path of the pre-trained
                embedding word vectors. Defaults to None.
        """
        # A fresh list per call; a mutable default would be shared by calls.
        pairs = [] if pairs is None else pairs
        self.vocab = Vocab()
        word_counts = Counter()  # word frequency
        count_words(word_counts, [src + tgt for src, tgt in pairs])
        # Keep only the top-k tokens by frequency, where k is the maximum
        # vocab size set in "config.py".
        for word, _ in word_counts.most_common(config.max_vocab_size):
            self.vocab.add_words([word])
        if embed_file is not None:
            num_loaded = self.vocab.load_embeddings(embed_file)
            print("%d pre-trained embeddings loaded." % num_loaded)


class Vocab(object):
    # Ids reserved for the special tokens.
    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

    def __init__(self):
        '''
        @Description: Define the vocabulary object.
        '''
        self.word2index = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.index2word = {val: key for key, val in self.word2index.items()}
        self.word_count = Counter()
        self.embeddings = None

    def add_words(self, words):
        """Add new tokens to the vocab and map them between word and index.

        Args:
            words (list): The list of tokens to be added.
        """
        for word in words:
            if word not in self.word2index:
                new_index = len(self.word2index)
                self.word2index[word] = new_index
                self.index2word[new_index] = word
        self.word_count.update(words)  # count every token that was passed in

    def __getitem__(self, item):
        """Map an int id to its token, or a token to its id.

        An unknown id yields None; an unknown token yields `self.UNK`.
        """
        if isinstance(item, int):
            return self.index2word.get(item)
        return self.word2index.get(item, self.UNK)

    def __len__(self):
        return len(self.index2word)

    def size(self):
        """Returns the total size of the vocabulary"""
        return len(self.index2word)

    def load_embeddings(self, embed_file_path: str, dtype=np.float32) -> int:
        """Load pre-trained word vectors for the tokens in this vocab.

        Each line of the file is split on whitespace: the first field is the
        word and the remaining fields are the vector components.

        Args:
            embed_file_path (str): The file path of word vectors to load.
            dtype (numpy dtype, optional): Defaults to np.float32.

        Returns:
            int: Number of vocab tokens for which a vector was loaded.
        """
        num_embeddings = 0
        vocab_size = len(self)
        with open(embed_file_path, 'rb') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # A blank line has no word field; skip it.
                    continue
                word = fields[0].decode('utf-8')
                idx = self.word2index.get(word)
                if idx is None:
                    continue
                vec = np.array(fields[1:], dtype=dtype)
                if self.embeddings is None:
                    # The first matching vector fixes the dimensionality.
                    # Rows start out random; the PAD row is set to zeros.
                    n_dims = len(vec)
                    self.embeddings = np.random.normal(
                        np.zeros((vocab_size, n_dims))).astype(dtype)
                    self.embeddings[self.PAD] = np.zeros(n_dims)
                self.embeddings[idx] = vec
                num_embeddings += 1
        return num_embeddings
# -*- coding: utf-8 -*-
'''
@Description: Define the format of data used in the model.
'''
import sys
import pathlib
import torch
from torch.utils.data import Dataset
from utils import sort_batch_by_len, source2ids

abs_path = pathlib.Path(__file__).parent.absolute()
# sys.path entries must be strings; list.append() returns None, so the result
# of one append must never be fed into another.
sys.path.append(str(abs_path))


class SampleDataset(Dataset):
    """A sample set for training, built from (src_tokens, tgt_tokens) pairs."""

    def __init__(self, data_pairs, vocab):
        """
        Args:
            data_pairs: Sequence of (src_tokens, tgt_tokens) pairs.
            vocab: Vocabulary used to map tokens to ids; must provide
                `SOS`, `EOS` and token lookup via `vocab[token]`.
        """
        self.src_texts = [data_pair[0] for data_pair in data_pairs]
        self.tgt_texts = [data_pair[1] for data_pair in data_pairs]
        self.vocab = vocab
        self._len = len(data_pairs)  # Keep track of how many data points.

    def __getitem__(self, index):
        """Return the id-encoded sample at `index` as a dict.

        Keys: 'x', 'y' (id lists wrapped in SOS/EOS), 'x_len', 'y_len',
        'oovs' and 'len_oovs'.
        """
        # source2ids returns the source ids plus the tokens that fall outside
        # the vocabulary.
        src_ids, oovs = source2ids(self.src_texts[index], self.vocab)
        tgt_ids = [self.vocab[token] for token in self.tgt_texts[index]]
        return {
            'x': [self.vocab.SOS] + src_ids + [self.vocab.EOS],
            'y': [self.vocab.SOS] + tgt_ids + [self.vocab.EOS],
            # NOTE(review): these lengths count the raw tokens only, i.e. they
            # do not include the SOS/EOS added to 'x' and 'y' above - confirm
            # that is what the consumers of x_len / y_len expect.
            'x_len': len(self.src_texts[index]),
            'y_len': len(self.tgt_texts[index]),
            'oovs': oovs,
            'len_oovs': len(oovs)
        }

    def __len__(self):
        return self._len


def collate_fn(batch):
    """Turn a list of samples into padded tensors for one batch.

    Args:
        batch (list): Samples as returned by `SampleDataset.__getitem__`.

    Returns:
        x_padded (Tensor): Padded source sequences.
        y_padded (Tensor): Padded reference sequences.
        x_len (Tensor): Sequence lengths of the sources.
        y_len (Tensor): Sequence lengths of the references.
        oovs: Out-of-vocabulary tokens of each sample.
        len_oovs (Tensor): Number of OOV tokens of each sample.
    """
    def padding(indice, max_length, pad_idx=0):
        """Right-pad every id list in `indice` with `pad_idx` to `max_length`."""
        pad_indice = [item + [pad_idx] * max(0, max_length - len(item))
                      for item in indice]
        return torch.tensor(pad_indice)

    data_batch = sort_batch_by_len(batch)

    x = data_batch["x"]
    y = data_batch["y"]
    x_max_length = max(len(t) for t in x)
    y_max_length = max(len(t) for t in y)

    oovs = data_batch["oovs"]
    len_oovs = torch.tensor(data_batch["len_oovs"])

    x_padded = padding(x, x_max_length)
    y_padded = padding(y, y_max_length)

    x_len = torch.tensor(data_batch["x_len"])
    y_len = torch.tensor(data_batch["y_len"])
    return x_padded, y_padded, x_len, y_len, oovs, len_oovs
# -*- coding: utf-8 -*-
'''
@Description: Define the model.
'''
import os
import sys
import pathlib
import torch
import torch.nn as nn
import torch.nn.functional as F
import config

abs_path = pathlib.Path(__file__).parent.absolute()
# sys.path entries must be strings; list.append() returns None, so the result
# of one append must never be fed into another.
sys.path.append(str(abs_path))


class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, rnn_drop: float = 0):
        super(Encoder, self).__init__()
        # Embedding layer: vocab_size ids -> embed_size vectors.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.hidden_size = hidden_size
        # Single-layer bidirectional LSTM.
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True,
                            dropout=rnn_drop, batch_first=True)

    def forward(self, x):
        """Define forward propagation for the encoder.

        Args:
            x (Tensor): The input samples as shape (batch_size, seq_len).

        Returns:
            output (Tensor): The output of lstm with shape
                (batch_size, seq_len, 2 * hidden_size).
            hidden (tuple): The hidden states of lstm (h_n, c_n).
                Each with shape (2, batch_size, hidden_size)
        """
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded)
        return output, hidden


class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        # Define feed-forward layers.
        self.Wh_Linear = nn.Linear(2 * hidden_size, 2 * hidden_size, bias=False)
        self.Ws_Linear = nn.Linear(2 * hidden_size, 2 * hidden_size)
        self.v_Linear = nn.Linear(2 * hidden_size, 1, bias=False)

    def forward(self, decoder_hidden_states, encoder_output, x_padding_masks):
        """Define forward propagation for the attention network.

        Follows "Get To The Point: Summarization with Pointer-Generator
        Networks".

        Args:
            decoder_hidden_states (tuple): The hidden states from lstm
                (h_n, c_n) in the decoder, each with shape
                (1, batch_size, hidden_size)
            encoder_output (Tensor): The output from the lstm in the encoder
                with shape (batch_size, seq_len, 2*hidden_size).
            x_padding_masks (Tensor): The padding masks for the input
                sequences with shape (batch_size, seq_len).

        Returns:
            context_vector (Tensor): Attention-weighted sum of the encoder
                outputs. The shape is (batch_size, 2*hidden_size).
            attention_weights (Tensor): The shape is (batch_size, seq_length).
        """
        # Concatenate h and c of the unidirectional decoder to get s_t, so its
        # width (2*hidden_size) matches the bidirectional encoder output.
        h_dec, c_dec = decoder_hidden_states
        s_t = torch.cat([h_dec, c_dec], dim=2)  # (1, batch_size, 2*hidden_size)
        s_t = s_t.transpose(0, 1)  # (batch_size, 1, 2*hidden_size)
        s_t = s_t.expand_as(encoder_output).contiguous()  # (batch_size, seq_length, 2*hidden_size)

        # Paper eq. (1): score = v * tanh(W_h * h_i + W_s * s_t).
        encoder_features = self.Wh_Linear(encoder_output.contiguous())  # (batch_size, seq_length, 2*hidden_size)
        decoder_features = self.Ws_Linear(s_t)  # (batch_size, seq_length, 2*hidden_size)
        att_inputs = encoder_features + decoder_features
        score = self.v_Linear(torch.tanh(att_inputs))  # (batch_size, seq_length, 1)

        # Paper eq. (2): softmax over the sequence dimension.
        attention_weights = F.softmax(score, dim=1).squeeze(2)  # (batch_size, seq_length)
        # Zero the weights on padded positions, then renormalize the rest so
        # that each row sums to one again.
        attention_weights = attention_weights * x_padding_masks
        normalization_factor = attention_weights.sum(1, keepdim=True)
        attention_weights = attention_weights / normalization_factor

        # Paper eq. (3): context vector.
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_output)  # (batch_size, 1, 2*hidden_size)
        context_vector = context_vector.squeeze(1)  # (batch_size, 2*hidden_size)
        return context_vector, attention_weights


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, enc_hidden_size=None, is_cuda=False):
        super(Decoder, self).__init__()
        self.DEVICE = torch.device('cuda') if is_cuda else torch.device('cpu')
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        # Input is [decoder state (hidden_size); context (2*hidden_size)].
        self.W1_Linear = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.W2_Linear = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, decoder_input, decoder_hidden_states, encoder_output, context_vector):
        """Define forward propagation for the decoder.

        Args:
            decoder_input (Tensor): The input of the decoder x_t of shape
                (batch_size, 1).
            decoder_hidden_states (tuple): The hidden states (h_n, c_n) of the
                decoder from last time step. The shapes are
                (1, batch_size, hidden_size) for each.
            encoder_output (Tensor): The output from the encoder of shape
                (batch_size, seq_length, 2*hidden_size). Not used here.
            context_vector (Tensor): The context vector from the attention
                network of shape (batch_size, 2*hidden_size).

        Returns:
            p_vocab (Tensor): The vocabulary distribution of shape
                (batch_size, vocab_size).
            decoder_states (tuple): The lstm states in the decoder. The shapes
                are (1, batch_size, hidden_size) for each.
        """
        decoder_emb = self.embedding(decoder_input)
        decoder_output, decoder_hidden_states = self.lstm(decoder_emb, decoder_hidden_states)
        # Flatten to (batch_size, hidden_size) using this module's own width
        # rather than the global config value.
        decoder_output = decoder_output.view(-1, self.hidden_size)

        # Paper eq. (4): P_vocab = softmax(V' * (V * [s_t, h*] + b) + b').
        concat_vector = torch.cat([decoder_output, context_vector], dim=-1)  # (batch_size, 3*hidden_size)
        FF1_out = self.W1_Linear(concat_vector)  # (batch_size, hidden_size)
        FF2_out = self.W2_Linear(FF1_out)  # (batch_size, vocab_size)
        P_vocab = F.softmax(FF2_out, dim=1)  # (batch_size, vocab_size)
        return P_vocab, decoder_hidden_states


class ReduceState(nn.Module):
    """Merge the two directions of the encoder's final states.

    Since the encoder has a bidirectional LSTM layer while the decoder has a
    unidirectional LSTM layer, the encoder states are reduced (summed over
    the two directions) before being used as the decoder's initial states.
    """

    def __init__(self):
        super(ReduceState, self).__init__()

    def forward(self, hidden):
        """The forward propagation of reduce state module.

        Args:
            hidden (tuple): Hidden states of encoder, each with shape
                (2, batch_size, hidden_size).

        Returns:
            tuple: Reduced hidden states, each with shape
                (1, batch_size, hidden_size).
        """
        h, c = hidden
        h_reduced = torch.sum(h, dim=0, keepdim=True)
        c_reduced = torch.sum(c, dim=0, keepdim=True)
        return (h_reduced, c_reduced)


class Seq2seq(nn.Module):
    def __init__(self, vocab):
        super(Seq2seq, self).__init__()
        self.DEVICE = torch.device("cuda" if config.is_cuda else "cpu")
        self.vocab = vocab
        self.attention = Attention(config.hidden_size)
        self.encoder = Encoder(len(vocab), config.embed_size, config.hidden_size)
        self.decoder = Decoder(len(vocab), config.embed_size, config.hidden_size)
        self.reduce_state = ReduceState()

    def load_model(self):
        """Replace the sub-modules with previously saved ones, if present.

        Nothing is loaded unless the saved encoder file exists.
        """
        if os.path.exists(config.encoder_saved_name):
            # 'gpu' is not a device string torch understands; map onto the
            # device selected in __init__ ('cuda' or 'cpu') instead.
            self.encoder = torch.load(config.encoder_saved_name, map_location=self.DEVICE)
            self.decoder = torch.load(config.decoder_saved_name, map_location=self.DEVICE)
            self.attention = torch.load(config.attention_saved_name, map_location=self.DEVICE)
            self.reduce_state = torch.load(config.reduce_state_saved_name, map_location=self.DEVICE)

    def forward(self, x, x_len, y, y_len, len_oovs, epoch_idx, batch_idx, is_train):
        """Define the forward propagation for the seq2seq model.

        Runs the encoder once, then decodes with teacher forcing over every
        time step of `y`, and returns the padding-masked negative
        log-likelihood averaged per sample and then over the batch. Verbose
        debugging output is printed along the way.

        Args:
            x (Tensor): Input sequences as source with shape
                (batch_size, seq_len)
            x_len (Tensor): Sequence lengths of the sources; only printed.
            y (Tensor): Input sequences as reference with shape
                (batch_size, y_len)
            y_len (Tensor): Sequence lengths of the references; only printed.
            len_oovs (Tensor): The number of out-of-vocabulary words per
                sample; not used here.
            epoch_idx (int): Index of the current epoch; only printed.
            batch_idx (int): Index of the current batch; only printed.
            is_train (bool): Whether this is a training pass; only printed.

        Returns:
            batch_loss (Tensor): The average loss of the current batch.
        """
        print('\n************************************* epoch_idx = {0};batch_idx = {1}: 向model喂入的数据 *************************************\n'.format(epoch_idx, batch_idx))
        print('batch_idx = ', batch_idx)
        print('x_len = ', x_len)
        print('x = ', x)
        print('y_len = ', y_len)
        # All rows of y share the same width: the longest reference in the batch.
        print('y.shape = ', y.shape, '; y.shape[1] = ', y.shape[1])
        print('y = ', y)

        # ---- Prepare the encoder input and its padding mask ----
        print('\n************************************* 对输入的序列化样本进行处理,并生成本条样本的 padding_mask *************************************\n')
        # Any id beyond the vocabulary is replaced with UNK.
        oov_token = torch.full(x.shape, self.vocab.UNK).long().to(self.DEVICE)
        x_copy = torch.where(x > len(self.vocab) - 1, oov_token, x)
        # 1.0 where the id is not PAD (0), 0.0 on padding.
        x_padding_masks = torch.ne(x_copy, 0).byte().float()
        print('\nx_padding_masks.shape = ', x_padding_masks.shape)
        print('x_padding_masks = \n', x_padding_masks)

        # ---- Encode ----
        encoder_output, encoder_hidden_states = self.encoder(x_copy)
        print('encoder_output.shape = ', encoder_output.shape)

        # The BiLSTM encoder states are reduced to initialise the UniLSTM decoder.
        decoder_hidden_states = self.reduce_state(encoder_hidden_states)

        # ---- Decode with teacher forcing, one loss vector per time step ----
        step_losses = []
        for i in range(y.shape[1] - 1):
            print('\n--------------------------------- is_train = {0}; epoch_idx = {1}; batch_idx = {2}, 当前预测时间步 i = {3} ---------------------------------\n'.format(is_train, epoch_idx, batch_idx, i))
            decoder_input_i = y[:, i]  # id fed to the decoder at this step
            decoder_target_i = y[:, i + 1]  # id the decoder should predict
            print('decoder_input_i.shape = ', decoder_input_i.shape, '; decoder_input_i = ', decoder_input_i, '; decoder_input_i.unsqueeze(1).shape = ', decoder_input_i.unsqueeze(1).shape, '\ndecoder_input_i.unsqueeze(1) = \n', decoder_input_i.unsqueeze(1))
            print('decoder_target_i.shape = ', decoder_target_i.shape, '; decoder_target_i = ', decoder_target_i, '; decoder_target_i.unsqueeze(1).shape = ', decoder_target_i.unsqueeze(1).shape, '\ndecoder_target_i.unsqueeze(1) = \n', decoder_target_i.unsqueeze(1))

            # Context vector and attention weights for this step.
            context_vector, attention_weights = self.attention(decoder_hidden_states, encoder_output, x_padding_masks)

            # Paper eq. (4): vocabulary distribution for the next token.
            p_vocab, decoder_hidden_states = self.decoder(decoder_input_i.unsqueeze(1), decoder_hidden_states, encoder_output, context_vector)
            print('\np_vocab.shape = ', p_vocab.shape, '----decoder_hidden_states[0].shape = ', decoder_hidden_states[0].shape, '----decoder_hidden_states[1].shape = ', decoder_hidden_states[1].shape)

            # Paper eq. (5): probability assigned to the true target id.
            target_probs = torch.gather(p_vocab, 1, decoder_target_i.unsqueeze(1))  # (batch_size, 1)
            print('target_probs.shape = ', target_probs.shape, '; \ntarget_probs = ', target_probs)
            target_probs = target_probs.squeeze(1)  # (batch_size,)
            print('squeeze之后:target_probs.shape = ', target_probs.shape, '; target_probs = ', target_probs)

            # Paper eq. (6): loss_t = -log P(w*_t); config.eps guards log(0).
            loss = -torch.log(target_probs + config.eps)
            print('\nepoch({0})-batch_idx({1}):预测y序列文本中第({2})个时间步的token的损失值: loss.shape = {3}; loss = {4} '.format(epoch_idx, batch_idx, i, loss.shape, loss))

            # Steps whose target is PAD contribute no loss.
            mask = torch.ne(decoder_target_i, 0).byte().float()
            print('\nmask.shape = ', mask.shape, '; mask = ', mask)
            loss = loss * mask
            print('经过mask后:loss.shape = ', loss.shape, '; loss = ', loss)

            step_losses.append(loss)

        print('\nlen(step_losses) =', len(step_losses), '; step_losses[0].shape = ', step_losses[0].shape)
        stack_losses = torch.stack(step_losses, 1)  # (batch_size, num_steps)
        print('stack_losses.shape = ', stack_losses.shape, '; \nstack_losses = \n', stack_losses)
        sum_losses = torch.sum(stack_losses, 1)  # total loss of each sample
        print('sum_losses.shape = ', sum_losses.shape, '; sum_losses = ', sum_losses)

        # Number of non-PAD ids in each row of y.
        seq_len_mask = torch.ne(y, 0).byte().float()
        print('seq_len_mask.shape = ', seq_len_mask.shape)
        print('seq_len_mask = \n', seq_len_mask)
        batch_seq_len = torch.sum(seq_len_mask, dim=1)
        print('batch_seq_len.shape = ', batch_seq_len.shape, '; batch_seq_len = ', batch_seq_len)

        # Per-sample mean loss, then the mean over the batch.
        batch_losses = sum_losses / batch_seq_len
        batch_loss = torch.mean(batch_losses)
        print('batch_loss = ', batch_loss)
        return batch_loss
# -*- coding: utf-8 -*-
'''
@Description: Train the model.
'''
import os
import sys
import pathlib
import numpy as np
import pickle
from torch.utils.data import DataLoader
from torch import optim
from torch.nn.utils import clip_grad_norm_
from dataset import PairsOfSrcTgt, BuildVocab, SampleDataset, collate_fn
import torch
import config
from tqdm import tqdm, trange
from model import Seq2seq
from tensorboardX import SummaryWriter

abs_path = pathlib.Path(__file__).parent.absolute()
# sys.path entries must be strings; list.append() returns None, so the result
# of one append must never be fed into another.
sys.path.append(str(abs_path))


def train(pairs_train, pairs_val, vocab, start_epoch=0):
    """Train the model, evaluate it after every epoch and store it.

    The sub-modules are saved whenever the validation loss improves on the
    best value seen so far; that best value is persisted with pickle so that
    a later run can resume from it.

    Args:
        pairs_train: The (src, tgt) pairs of the training set.
        pairs_val: The (src, tgt) pairs of the validation set.
        vocab: The vocabulary built from the training dataset.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """
    print('\n\n\n ****************************** loading model ******************************\n')
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    # ---- Wrap the pairs into datasets and batch them ----
    dataset_train = SampleDataset(pairs_train, vocab)
    dataset_valid = SampleDataset(pairs_val, vocab)
    print('len(dataset_train) = {0}----dataset_train = {1}'.format(len(dataset_train), dataset_train))
    print('len(dataset_valid) = {0}----dataset_valid = {1}'.format(len(dataset_valid), dataset_valid))
    dataloader_train = DataLoader(dataset=dataset_train, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dataloader_valid = DataLoader(dataset=dataset_valid, batch_size=config.batch_size, shuffle=True, pin_memory=True, drop_last=True, collate_fn=collate_fn)
    print('\nlen(dataloader_train) = {0}----dataloader_train = {1}'.format(len(dataloader_train), dataloader_train))
    print('len(dataloader_valid) = {0}----dataloader_valid = {1}'.format(len(dataloader_valid), dataloader_valid))

    # ---- Initialise model, optimizer and the best validation loss ----
    model = Seq2seq(vocab)
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    prev_val_loss = np.inf

    # ---- Resume from a previous run if saved state exists ----
    model.load_model()
    if os.path.exists(config.prev_val_loss_path):
        # NOTE: pickle must only be used on files this script wrote itself.
        with open(config.prev_val_loss_path, 'rb') as f:
            prev_val_loss = pickle.load(f)
    model.to(DEVICE)

    # Log writers for TensorboardX visualisation
    # (view with: tensorboard --logdir ./runs/baseline).
    writer_batch = SummaryWriter(config.log_path_batch_train)  # running mean loss inside an epoch
    writer_epoch_train = SummaryWriter(config.log_path_epoch_train)  # mean training loss per epoch
    writer_epoch_valid = SummaryWriter(config.log_path_epoch_valid)  # mean validation loss per epoch

    # ---- Training ----
    print('\n ***************************************** 开始训练 *****************************************\n')
    epoch_progress_bar = tqdm(range(start_epoch, config.epochs))
    epoch_progress_bar.set_description(f'Epoch Loss: ')
    for epoch_idx in epoch_progress_bar:
        print('\n ***************************************** epoch_idx = {0} *****************************************\n'.format(epoch_idx))
        batch_losses = []  # loss of every batch in this epoch
        batch_progress_bar = tqdm(dataloader_train)
        batch_progress_bar.set_description(f'Epoch_idx = { epoch_idx},')
        for batch_idx, data in enumerate(batch_progress_bar):
            x, y, x_len, y_len, oovs, len_oovs = data
            assert not np.any(np.isnan(x.numpy()))
            if config.is_cuda:  # Training with GPUs.
                x = x.to(DEVICE)
                y = y.to(DEVICE)
                x_len = x_len.to(DEVICE)
                len_oovs = len_oovs.to(DEVICE)

            # Validation switches the model to eval mode, so training mode is
            # re-enabled for every batch.
            model.train()
            optimizer.zero_grad()  # gradients are reset for every batch
            loss = model(x=x, x_len=x_len, y=y, y_len=y_len, len_oovs=len_oovs, epoch_idx=epoch_idx, batch_idx=batch_idx, is_train=True)
            batch_losses.append(loss.item())
            loss.backward()

            # Clip gradients per sub-module to prevent gradient explosion.
            clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)

            optimizer.step()

            batch_progress_bar.set_postfix(Batch_idx=batch_idx, Loss_Of_This_Batch=loss.item())
            if batch_idx % 5 == 0:
                # Every 5 batches, log the running mean loss of this epoch
                # (tag, y value, x value).
                writer_batch.add_scalar(f'Average loss for epoch { epoch_idx}', np.mean(batch_losses), global_step=batch_idx)

        # Mean training loss over all batches of this epoch.
        curr_epoch_train_loss = np.mean(batch_losses)
        epoch_progress_bar.set_postfix(Loss_Of_This_Epoch=curr_epoch_train_loss)

        # ---- Validation after this epoch ----
        val_loss = []
        # Evaluate in eval mode so train-only layers cannot affect the loss.
        model.eval()
        with torch.no_grad():
            print('\n************************************************************ 开始进入验证 ************************************************************')
            for batch_idx, data in enumerate(tqdm(dataloader_valid)):
                x, y, x_len, y_len, oovs, len_oovs = data
                if config.is_cuda:
                    x = x.to(DEVICE)
                    y = y.to(DEVICE)
                    x_len = x_len.to(DEVICE)
                    len_oovs = len_oovs.to(DEVICE)
                loss = model(x=x, x_len=x_len, y=y, y_len=y_len, len_oovs=len_oovs, epoch_idx=epoch_idx, batch_idx=batch_idx, is_train=False)
                val_loss.append(loss.item())
        curr_epoch_valid_loss = np.mean(val_loss)

        writer_epoch_train.add_scalar(f'Loss Of Train for All epoch', curr_epoch_train_loss, global_step=epoch_idx)
        writer_epoch_valid.add_scalar(f'Loss Of Val for All epoch', curr_epoch_valid_loss, global_step=epoch_idx)
        print('\n\n☆☆☆☆☆☆☆☆ 第{0}个epoch: curr_epoch_train_loss = {1}; curr_epoch_valid_loss = {2}'.format(epoch_idx, curr_epoch_train_loss, curr_epoch_valid_loss))

        # Save the sub-modules only when the validation loss has improved.
        if curr_epoch_valid_loss < prev_val_loss:
            torch.save(model.encoder, config.encoder_saved_name)
            torch.save(model.decoder, config.decoder_saved_name)
            torch.save(model.attention, config.attention_saved_name)
            torch.save(model.reduce_state, config.reduce_state_saved_name)
            prev_val_loss = curr_epoch_valid_loss
        with open(config.prev_val_loss_path, 'wb') as f:
            pickle.dump(prev_val_loss, f)

    writer_batch.close()
    writer_epoch_train.close()
    writer_epoch_valid.close()


if __name__ == "__main__":
    # Device used for training.
    DEVICE = torch.device('cuda') if config.is_cuda else torch.device('cpu')

    # Extract the (src, tgt) pairs of the training set
    # (each line has the form src + <sep> + tgt).
    pairs_train = PairsOfSrcTgt(config.data_train_path,
                                max_src_len=config.max_src_len,
                                max_tgt_len=config.max_tgt_len,
                                truncate_src=config.truncate_src,
                                truncate_tgt=config.truncate_tgt).pairs
    print('len(pairs_train) = ', len(pairs_train))

    # Build the vocabulary from the training pairs.
    vocab = BuildVocab(pairs_train, embed_file=config.embed_file).vocab
    print('len(vocab) = ', len(vocab))
    print('vocab.vocab.__getitem__(3) = ', vocab.__getitem__(3))

    # Extract the (src, tgt) pairs of the validation set.
    pairs_val = PairsOfSrcTgt(config.data_val_path,
                              max_src_len=config.max_src_len,
                              max_tgt_len=config.max_tgt_len,
                              truncate_src=config.truncate_src,
                              truncate_tgt=config.truncate_tgt).pairs
    print('len(pairs_val) = ', len(pairs_val))

    train(pairs_train, pairs_val, vocab, start_epoch=0)
打印结果:
Reading dataset data/processed_data/train.txt... 43996 pairs. len(pairs_train) = 43996 len(vocab) = 20004 vocab.vocab.__getitem__(3) = <UNK> Reading dataset ./data/processed_data/dev.txt... 5000 pairs. len(pairs_val) = 5000 ****************************** loading model ****************************** len(dataset_train) = 43996----dataset_train = <sample_dataset.SampleDataset object at 0x00000218344F8438> len(dataset_valid) = 5000----dataset_valid = <sample_dataset.SampleDataset object at 0x00000218382238D0> len(dataloader_train) = 5500----dataloader_train = <torch.utils.data.dataloader.DataLoader object at 0x000002186F2FF208> len(dataloader_valid) = 625----dataloader_valid = <torch.utils.data.dataloader.DataLoader object at 0x000002187B137978> ***************************************** 开始训练 ***************************************** ***************************************** epoch_idx = 0 ***************************************** ************************************* epoch_idx = 0;batch_idx = 0: 向model喂入的数据 ************************************* batch_idx = 0 x_len = tensor([197, 121, 85]) x = tensor([[ 1, 403, 404, 77, 174, 56, 405, 406, 81, 407, 9, 408, 174, 79, 175, 409, 25, 45, 152, 25, 410, 176, 35, 411, 72, 412, 413, 414, 162, 415, 6, 177, 30, 82, 416, 117, 163, 417, 20, 29, 26, 418, 81, 4, 419, 178, 4, 9, 4, 73, 6, 4, 179, 420, 41, 4, 14, 421, 4, 14, 153, 4, 17, 4, 40, 8, 4, 45, 4, 422, 5, 180, 4, 44, 423, 424, 4, 425, 426, 427, 179, 4, 181, 4, 70, 27, 28, 4, 80, 6, 4, 83, 428, 4, 67, 9, 42, 82, 182, 4, 14, 35, 28, 181, 10, 45, 10, 429, 10, 430, 10, 431, 4, 54, 27, 28, 4, 432, 433, 183, 4, 434, 435, 436, 4, 31, 27, 4, 437, 27, 28, 4, 168, 178, 61, 4, 438, 42, 184, 5, 439, 4, 142, 440, 89, 4, 441, 442, 4, 443, 183, 5, 444, 4, 83, 73, 4, 45, 4, 185, 59, 4, 185, 22, 4, 445, 4, 446, 447, 5, 82, 4, 448, 4, 101, 4, 449, 4, 65, 4, 450, 4, 166, 4, 451, 4, 452, 4, 176, 4, 453, 4, 454, 180, 4, 455, 2], [ 1, 89, 193, 90, 91, 48, 92, 194, 195, 93, 94, 95, 49, 50, 16, 29, 196, 
17, 197, 198, 51, 52, 6, 18, 30, 199, 19, 18, 96, 200, 97, 201, 53, 95, 20, 29, 54, 98, 31, 202, 99, 94, 21, 93, 203, 48, 4, 204, 205, 9, 6, 4, 55, 206, 4, 207, 208, 209, 4, 90, 4, 100, 4, 100, 4, 56, 4, 210, 4, 101, 4, 102, 6, 103, 57, 211, 104, 212, 4, 58, 213, 21, 214, 5, 6, 4, 32, 105, 10, 106, 215, 10, 216, 10, 217, 10, 218, 107, 219, 4, 220, 221, 13, 5, 4, 222, 223, 224, 5, 108, 4, 225, 4, 226, 4, 53, 4, 22, 4, 59, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 1, 323, 114, 144, 145, 146, 25, 147, 324, 148, 49, 149, 145, 150, 151, 54, 98, 31, 325, 53, 144, 51, 326, 96, 327, 152, 25, 20, 29, 99, 328, 329, 18, 97, 330, 6, 18, 72, 331, 19, 102, 30, 332, 24, 8, 4, 40, 8, 4, 6, 153, 4, 333, 334, 4, 71, 335, 4, 336, 36, 4, 337, 73, 4, 131, 338, 4, 51, 339, 4, 340, 154, 4, 341, 342, 4, 74, 343, 4, 74, 344, 4, 74, 345, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) y_len = tensor([38, 48, 54]) y.shape = torch.Size([3, 54]) ; y.shape[1] = 54 y = tensor([[ 1, 186, 13, 456, 457, 79, 175, 11, 4, 458, 61, 4, 459, 460, 461, 462, 7, 68, 177, 6, 4, 172, 463, 4, 84, 464, 4, 465, 466, 467, 83, 4, 173, 9, 41, 109, 7, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 1, 227, 228, 109, 5, 229, 48, 230, 231, 4, 232, 233, 234, 5, 108, 4, 60, 23, 110, 235, 236, 237, 238, 4, 239, 240, 5, 92, 241, 242, 7, 13, 243, 5, 52, 11, 4, 9, 61, 111, 4, 244, 33, 9, 5, 112, 7, 2, 0, 0, 0, 0, 0, 0], [ 1, 346, 15, 41, 112, 5, 146, 148, 4, 149, 150, 5, 31, 11, 4, 155, 347, 4, 348, 349, 49, 350, 5, 351, 352, 353, 7, 354, 
126, 5, 355, 4, 356, 357, 4, 67, 156, 358, 359, 157, 158, 360, 7, 361, 151, 362, 363, 147, 21, 4, 159, 16, 7, 2]]) ************************************* 对输入的序列化样本进行处理,并生成本条样本的 padding_mask ************************************* x_padding_masks.shape = torch.Size([3, 197]) x_padding_masks = tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 
1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]) encoder_output.shape = torch.Size([3, 197, 1024]) --------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 0 --------------------------------- decoder_input_i.shape = torch.Size([3]) ; decoder_input_i = tensor([1, 1, 1]) ; decoder_input_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_input_i.unsqueeze(1) = tensor([[1], [1], [1]]) decoder_target_i.shape = torch.Size([3]) ; decoder_target_i = tensor([1, 1, 1]) ; decoder_target_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_target_i.unsqueeze(1) = tensor([[186], [227], [346]]) p_vocab.shape = torch.Size([3, 1003]) ----decoder_states[0].shape = torch.Size([1, 3, 512]) ----decoder_states[1].shape = torch.Size([1, 3, 512]) target_probs.shape = torch.Size([3, 1]) ; target_probs = tensor([[5.2464e-05], [5.7170e-05], [2.6298e-05]], grad_fn=<GatherBackward>) squeeze之后:target_probs.shape = torch.Size([3]) ; target_probs = tensor([5.2464e-05, 5.7170e-05, 2.6298e-05], grad_fn=<SqueezeBackward1>) epoch(0)-batch_idx(0):预测y序列文本中第(0)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([ 9.8554, 9.7695, 10.5460], grad_fn=<NegBackward>) mask.shape = torch.Size([3]) ; mask = tensor([1., 1., 1.]) 经过mask后:loss.shape = torch.Size([3]) ; loss = tensor([ 9.8554, 9.7695, 10.5460], grad_fn=<MulBackward0>) 
--------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 1 --------------------------------- decoder_input_i.shape = torch.Size([3]) ; decoder_input_i = tensor([186, 227, 346]) ; decoder_input_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_input_i.unsqueeze(1) = tensor([[186], [227], [346]]) decoder_target_i.shape = torch.Size([3]) ; decoder_target_i = tensor([186, 227, 346]) ; decoder_target_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_target_i.unsqueeze(1) = tensor([[ 13], [228], [ 15]]) p_vocab.shape = torch.Size([3, 1003]) ----decoder_states[0].shape = torch.Size([1, 3, 512]) ----decoder_states[1].shape = torch.Size([1, 3, 512]) target_probs.shape = torch.Size([3, 1]) ; target_probs = tensor([[2.9507e-05], [2.9505e-05], [1.0280e-03]], grad_fn=<GatherBackward>) squeeze之后:target_probs.shape = torch.Size([3]) ; target_probs = tensor([2.9507e-05, 2.9505e-05, 1.0280e-03], grad_fn=<SqueezeBackward1>) epoch(0)-batch_idx(0):预测y序列文本中第(1)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([10.4309, 10.4309, 6.8801], grad_fn=<NegBackward>) mask.shape = torch.Size([3]) ; mask = tensor([1., 1., 1.]) 经过mask后:loss.shape = torch.Size([3]) ; loss = tensor([10.4309, 10.4309, 6.8801], grad_fn=<MulBackward0>) ...... 
--------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 36 --------------------------------- decoder_input_i.shape = torch.Size([3]) ; decoder_input_i = tensor([ 7, 4, 156]) ; decoder_input_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_input_i.unsqueeze(1) = tensor([[ 7], [ 4], [156]]) decoder_target_i.shape = torch.Size([3]) ; decoder_target_i = tensor([ 7, 4, 156]) ; decoder_target_i.unsqueeze(1).shape = torch.Size([3, 1]) decoder_target_i.unsqueeze(1) = tensor([[ 2], [ 9], [358]]) p_vocab.shape = torch.Size([3, 1003]) ----decoder_states[0].shape = torch.Size([1, 3, 512]) ----decoder_states[1].shape = torch.Size([1, 3, 512]) target_probs.shape = torch.Size([3, 1]) ; target_probs = tensor([[3.0631e-03], [1.1222e-03], [1.7396e-05]], grad_fn=<GatherBackward>) squeeze之后:target_probs.shape = torch.Size([3]) ; target_probs = tensor([3.0631e-03, 1.1222e-03, 1.7396e-05], grad_fn=<SqueezeBackward1>) epoch(0)-batch_idx(0):预测y序列文本中第(36)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([ 5.7883, 6.7925, 10.9593], grad_fn=<NegBackward>) mask.shape = torch.Size([3
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。