The AclImdb v1 dataset is a large movie-review dataset for binary sentiment classification. It contains substantially more data than earlier benchmark datasets: 25,000 reviews for training, 25,000 for testing, plus additional unlabeled data.
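For reference, the extracted archive follows the standard layout below; the loader code that follows assumes this structure:

```
aclImdb/
├── train/
│   ├── pos/    # 12,500 positive reviews (*.txt)
│   ├── neg/    # 12,500 negative reviews
│   └── unsup/  # unlabeled reviews
└── test/
    ├── pos/    # 12,500 positive reviews
    └── neg/    # 12,500 negative reviews
```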
```python
import os
import re

from torch.utils.data import DataLoader, Dataset


def tokenization(content):
    # Strip HTML tags, remove punctuation/control characters, lowercase and split
    content = re.sub("<.*?>", " ", content)
    filters = ['\t', '\n', '\x97', '\x96', '#', '%', r'\$', '&', r'\.', r'\?', '!', r'\,']
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens


def collate_fn(batch):
    """
    :param batch: ([tokens, label], [tokens, label], ...)
    :return: tuple of token lists, tuple of labels
    """
    content, label = list(zip(*batch))
    return content, label


class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.train_data_path = '..\\aclImdb\\train\\'
        self.test_data_path = '..\\aclImdb\\test\\'
        data_path = self.train_data_path if train else self.test_data_path

        # Collect the paths of all review files under pos/ and neg/
        temp_data_path = [os.path.join(data_path, "pos"), os.path.join(data_path, "neg")]
        print(temp_data_path)
        self.total_file_path = []  # paths of all review files
        for path in temp_data_path:
            file_name_list = os.listdir(path)
            file_path_list = [os.path.join(path, i) for i in file_name_list if i.endswith(".txt")]
            self.total_file_path.extend(file_path_list)

    def __len__(self):
        return len(self.total_file_path)

    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        # Derive the label from the parent directory name (pos/neg)
        label_str = file_path.split("\\")[-2]
        label = 0 if label_str == "neg" else 1
        # Read and tokenize the review text
        content = open(file_path, encoding="utf-8").read()
        tokens = tokenization(content)
        return tokens, label


def get_data(train=True):
    imdb_dataset = ImdbDataset(train)
    data_loader = DataLoader(imdb_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    return data_loader
```
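A quick sanity check of the loader (a minimal sketch; it assumes the aclImdb folder sits one level above the working directory, matching the paths above):

```python
if __name__ == '__main__':
    loader = get_data(train=True)
    for idx, (content, label) in enumerate(loader):
        print(content)  # tuple of token lists, one list per review
        print(label)    # tuple of 0/1 labels
        break           # inspect only the first batch
```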
Map each word in the text to a number and store the mapping in a dictionary, i.e., sentence → list of indices.

Approach:

When a word appears that is not in the vocabulary, substitute a special UNK token.

To keep sequence lengths uniform within each batch, pad shorter sequences (and truncate longer ones).
""" 构建词典 把句子转换成序列 再把序列转成句子 """ class Word2Sequence: UNK_TAG = "UNK" PAD_TAG = "PAD" UNK =0 PAD =1 def __init__(self): self.dict = { self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD } self.count = {} def fit(self, sentence): # 把单个句子保存到dict for word in sentence: self.count[word] = self.count.get(word, 0)+1 def build_vocab(self, min=5, max=None, max_features=None): """ :param min: :param max: :param max_features: 一共保留多少个词语 :return: """ # 删除count中词频小于min的词语 self.count = {word:value for word, value in self.count.items() if value>min} # 删除count中词频大于max的词语 if max is not None: self.count = {word: value for word, value in self.count.items() if value < max} # 限制保留的词语数 if max_features is not None: temp = sorted(self.cout.items(), key=lambda x:x[-1], reverse=True)[:max_features] self.count = dict(temp) # 把 词语 ——>数字 for word in self.count: self.dict[word] = len(self.dict) # 得到一个反转的dict字典 self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys())) def transform(self, sentence, max_len=None): """ 把句子 转成 序列 :param sentence: [word1, word2, ..] :param max_len: 对句子进行填充或者裁剪 :return: """ if max_len is not None: if max_len > len(sentence): sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence)) # 填充 if max_len < len(sentence): sentence = sentence[:max_len] # 裁剪 return [self.dict.get(word, self.UNK) for word in sentence] def inverse_transform(self, indices): # 把 序列 ——>句子 return [self.inverse_dict.get(idx) for idx in indices] if __name__ == '__main__': ws = Word2Sequence() ws.fit(["我","是","你","的","爸爸"]) ws.fit(["我","是","我","的","人"]) ws.build_vocab(min=0) print(ws.dict) re = ws.transform(["我","爱","人"],max_len=10) print(re) ret = ws.inverse_transform(re) print(ret)
Note the use of word embeddings (nn.Embedding)!
""" 定义模型 """ import torch import torch.nn as nn import torch.nn.functional as F from lib import ws,max_len from dataset import get_data class MyModel(nn.Module): def __init__(self): super(MyModel, self).__init__() self.embedding = nn.Embedding(len(ws), 100) self.fc = nn.Linear(100*max_len, 2) def forward(self, input): """ :param input: [batch_size, max_len] :return: """ x = self.embedding(input) # [batch_size, max_len, 100] x = x.view([-1, 100*max_len]) output = self.fc(x) return F.log_softmax(output,dim=-1) model = MyModel() optimizer = torch.optim.Adam(model.parameters(),lr=0.001) def train(epoch): for idx,(input,target) in enumerate(get_data(train=True)): # 梯度清零 optimizer.zero_grad() output= model(input) loss = F.nll_loss(output,target) loss.backward() optimizer.step() print(loss.item()) if __name__ == '__main__': for i in range(1): train(epoch=i)