# Read one review file, strip newlines, lowercase it, and label it (pos=1, neg=0)
with open(os.path.join(folder_name, file), 'rb') as f:
    review = f.read().decode('utf-8').replace('\n', '').lower()
    data.append([review, 1 if label == 'pos' else 0])
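This fragment evidently sits inside a loop over the pos and neg subfolders of the IMDb review data. A minimal sketch of the surrounding function (the name read_imdb and the aclImdb-style directory layout data_root/folder/{pos,neg} are assumptions, not from the original) could look like:

import os

def read_imdb(folder, data_root):
    # Hypothetical wrapper around the snippet above; assumes the standard
    # aclImdb layout: data_root/folder/pos/*.txt and data_root/folder/neg/*.txt
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    return data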
The vocabulary is then created with torchtext.vocab.Vocab.
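The excerpt only names the Vocab API; a minimal sketch of building such a vocabulary from tokenized reviews (the helper name get_vocab and the min_freq threshold are assumptions) might be:

import collections
import torchtext.vocab as Vocab

def get_vocab(tokenized_data):
    # tokenized_data: list of token lists, one per review (sketch)
    counter = collections.Counter([tk for review in tokenized_data for tk in review])
    return Vocab.Vocab(counter, min_freq=5)  # keep tokens seen at least 5 times (threshold assumed)

This uses the legacy torchtext Vocab API that the rest of the section (stoi/itos, Vocab.GloVe) also relies on.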
The first model, BiRNN, classifies sentiment with a bidirectional LSTM:

class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        '''
        @params:
            vocab: vocabulary built on the dataset, used to obtain the vocabulary size
            embed_size: embedding dimension
            num_hiddens: hidden state dimension
            num_layers: number of hidden layers
        '''
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # encoder-decoder framework
        # setting bidirectional=True gives a bidirectional recurrent network
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               bidirectional=True)
        # the hidden states of the initial and final time steps are the input to the fully connected layer
        self.decoder = nn.Linear(4*num_hiddens, 2)

    def forward(self, inputs):
        '''
        @params:
            inputs: integer tensor of word indices, shape (batch_size, seq_len)
        @return:
            outs: sentiment prediction, a tensor of shape (batch_size, 2)
        '''
        # the LSTM expects seq_len as the first dimension, so transpose the input
        embeddings = self.embedding(inputs.permute(1, 0))  # (seq_len, batch_size, d)
        # nn.LSTM returns the outputs plus the hidden and cell states, as outputs, (h, c)
        outputs, _ = self.encoder(embeddings)  # (seq_len, batch_size, 2*h)
        encoding = torch.cat((outputs[0], outputs[-1]), -1)  # (batch_size, 4*h)
        outs = self.decoder(encoding)  # (batch_size, 2)
        return outs

embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
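As a quick sanity check (a hedged sketch; the batch size and sequence length are arbitrary), a dummy batch of word indices should yield one logit pair per example:

import torch

X = torch.randint(0, len(vocab), (4, 500))  # hypothetical batch: (batch_size=4, seq_len=500)
print(net(X).shape)  # expected: torch.Size([4, 2])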
An alternative model, TextCNN, uses one-dimensional convolutions with max-over-time pooling:

class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        '''
        @params:
            vocab: vocabulary built on the dataset, used to obtain the vocabulary size
            embed_size: embedding dimension
            kernel_sizes: list of convolution kernel sizes
            num_channels: list of convolution channel counts
        '''
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer that is trained
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer that is not trained
        self.pool = GlobalMaxPool1d()  # max-over-time pooling has no weights, so one instance can be shared
        self.convs = nn.ModuleList()  # create several one-dimensional convolutional layers
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2*embed_size,
                                        out_channels=c,
                                        kernel_size=k))
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.dropout = nn.Dropout(0.5)  # dropout layer to mitigate overfitting

    def forward(self, inputs):
        '''
        @params:
            inputs: integer tensor of word indices, shape (batch_size, seq_len)
        @return:
            outputs: sentiment prediction, a tensor of shape (batch_size, 2)
        '''
        embeddings = torch.cat((
            self.embedding(inputs),
            self.constant_embedding(inputs)), dim=2)  # (batch_size, seq_len, 2*embed_size)
        # transpose to the channels-first layout required by the one-dimensional convolutional layers
        embeddings = embeddings.permute(0, 2, 1)  # (batch_size, 2*embed_size, seq_len)
        encoding = torch.cat([
            self.pool(F.relu(conv(embeddings))).squeeze(-1)
            for conv in self.convs], dim=1)
        # Equivalent loop form:
        # encoding = []
        # for conv in self.convs:
        #     out = conv(embeddings)  # (batch_size, out_channels, seq_len-kernel_size+1)
        #     out = self.pool(F.relu(out))  # (batch_size, out_channels, 1)
        #     encoding.append(out.squeeze(-1))  # (batch_size, out_channels)
        # encoding = torch.cat(encoding, dim=1)  # (batch_size, out_channels_sum)
        # apply dropout, then the fully connected layer, to get the output
        outputs = self.decoder(self.dropout(encoding))
        return outputs

embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
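TextCNN calls a GlobalMaxPool1d helper that this excerpt never defines. A minimal sketch consistent with how it is used here (max pooling over the entire temporal dimension, yielding a (batch_size, channels, 1) tensor) would be:

import torch.nn as nn
import torch.nn.functional as F

class GlobalMaxPool1d(nn.Module):
    # Max-over-time pooling: collapse the temporal dimension to length 1 (sketch).
    def forward(self, x):
        # x: (batch_size, channels, seq_len) -> (batch_size, channels, 1)
        return F.max_pool1d(x, kernel_size=x.shape[2])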
Finally, pretrained GloVe vectors are loaded to initialize the embedding layer:

cache_dir = "/home/kesci/input/GloVe6B5429"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

def load_pretrained_embedding(words, pretrained_vocab):
    '''
    @params:
        words: list of words whose vectors should be loaded, given as an itos (index-to-string) list
        pretrained_vocab: pretrained word vectors
    @return:
        embed: the loaded word vectors
    '''
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # initialize to zeros
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False  # loaded directly from pretrained vectors, so no updates are needed
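The loader above initializes only net.embedding, matching the BiRNN setup. For the TextCNN model, which has two embedding tables, the same function would presumably initialize both, freezing only the constant one; a sketch under that assumption:

# Assumed initialization for the TextCNN model defined earlier:
# both tables start from GloVe; only constant_embedding stays frozen.
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

Once a parameter is frozen, the optimizer is typically built only over the trainable parameters, e.g. torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=...).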