Without further ado, here is the source code.
# Chinese review sentiment analysis with an LSTM model
import jieba
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch import optim
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

# Vocabulary class that maps natural-language tokens to integer ids
class MyVocab:
    def __init__(self, tokens=None):  # fixed: the original had the typo __int__
        # list holding every token; a token's position in the list is its id
        self.idx_to_token = list()
        # dict mapping each token back to its index
        self.token_to_idx = dict()
        # add each token to the list and record its index in the dict
        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]  # append as a list, not string concatenation
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx["<unk>"]

    @classmethod
    def build(cls, data, min_freq=1, reserved_tokens=None, stop_words='hit_stopwords.txt'):
        # build the vocabulary from the corpus
        token_freqs = defaultdict(int)
        stop_words = open(stop_words, encoding='utf-8').read().split('\n')
        for i in tqdm(range(data.shape[0]), desc="Building MyVocab"):
            for token in jieba.lcut(data.iloc[i]["review"]):
                if token in stop_words:
                    continue
                token_freqs[token] += 1  # count token frequencies
        # put <unk> and any reserved tokens in front
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        # size of the vocabulary
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # look up a token's id; unknown tokens fall back to <unk>
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        # map a list of tokens to their ids
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, ids):
        # map a list of ids back to their tokens
        return [self.idx_to_token[index] for index in ids]

def build_data(data_path):
    # build the datasets: takes the path of the local dataset (a .txt/.csv file
    # read into a DataFrame) and returns the training set, test set, and vocabulary
    data = pd.read_csv(data_path)
    myvocab = MyVocab.build(data)
    # segment each review with jieba before converting to ids; the original passed
    # the raw string, which would have been indexed character by character
    train_data = [(myvocab.convert_tokens_to_ids(jieba.lcut(sentence)), 1)
                  for sentence in data[data["label"] == 1][:50000]["review"]] \
               + [(myvocab.convert_tokens_to_ids(jieba.lcut(sentence)), 0)
                  for sentence in data[data["label"] == 0][:50000]["review"]]
    test_data = [(myvocab.convert_tokens_to_ids(jieba.lcut(sentence)), 1)
                 for sentence in data[data["label"] == 1][50000:]["review"]] \
              + [(myvocab.convert_tokens_to_ids(jieba.lcut(sentence)), 0)
                 for sentence in data[data["label"] == 0][50000:]["review"]]
    return train_data, test_data, myvocab

class Mydataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

def collect_fn(examples):
    # collate a batch: record the true lengths, then zero-pad to the longest sequence
    lengths = torch.tensor([len(example[0]) for example in examples])
    inputs = [torch.tensor(example[0]) for example in examples]
    targets = torch.tensor([example[1] for example in examples], dtype=torch.long)
    inputs = pad_sequence(inputs, batch_first=True)
    return inputs, lengths, targets
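To see what collect_fn hands to the model, here is a small sketch with made-up id sequences (the toy batch is not from the original post):

# Sketch with made-up ids: what collect_fn returns for a toy batch
batch = [([5, 2, 9], 1), ([7, 4], 0)]
inputs, lengths, targets = collect_fn(batch)
print(inputs)   # tensor([[5, 2, 9], [7, 4, 0]])  -- shorter sequence zero-padded
print(lengths)  # tensor([3, 2])
print(targets)  # tensor([1, 0])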
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_class):  # fixed: __int__ typo
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # the original constructed nn.RNN here, but the title and the
        # `hidden, (hn, cn)` unpacking below both call for nn.LSTM
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_class)

    def forward(self, inputs, lengths):
        embeds = self.embedding(inputs)
        # pack the padded batch so the LSTM skips the padding positions
        x_pack = pack_padded_sequence(embeds, lengths.cpu(), batch_first=True,
                                      enforce_sorted=False)
        hidden, (hn, cn) = self.rnn(x_pack)
        outputs = self.output(hn[-1])
        # log_softmax produces the log-probabilities that NLLLoss expects
        log_probs = F.log_softmax(outputs, dim=-1)
        return log_probs

# read the local dataset and build the loaders; the original read the file into
# an unused whole_data variable and never constructed the loaders that the
# training loop iterates over (batch_size here is an assumed value)
train_data, test_data, myvocab = build_data(r"E:\share_file_linux\深度学习\中文分词\datanew")
train_data_loader = DataLoader(Mydataset(train_data), batch_size=32,
                               collate_fn=collect_fn, shuffle=True)
test_data_loader = DataLoader(Mydataset(test_data), batch_size=32,
                              collate_fn=collect_fn, shuffle=False)

num_epoch = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# the original called RNN() with no arguments; the dimensions below are
# reasonable assumed defaults
model = RNN(len(myvocab), embedding_dim=128, hidden_dim=256, num_class=2)
model.to(device)
# NLLLoss pairs with the log_softmax output of the model; renamed from `loss`,
# which the training loop below would otherwise have shadowed
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)
model.train()
l_loss = []
l_acc = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(train_data_loader, desc=f"Training Epoch {epoch}"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        log_probs = model(inputs, lengths)
        loss = criterion(log_probs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    l_loss.append(total_loss)

    acc = 0
    for batch in tqdm(test_data_loader, desc="Testing"):
        inputs, lengths, targets = [x.to(device) for x in batch]
        with torch.no_grad():
            # fixed: the original called model(inputs) without the lengths argument
            output = model(inputs, lengths)
            acc += (output.argmax(dim=1) == targets).sum().item()
    l_acc.append(acc / len(test_data))  # test accuracy after this epoch
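To actually use the trained model, here is a minimal sketch of single-review inference. predict_sentiment is a hypothetical helper, not part of the original post; it assumes the model, myvocab, and device produced by the script above:

# Hypothetical helper: classify a single raw review with the trained model
def predict_sentiment(text, model, myvocab, device):
    model.eval()
    tokens = jieba.lcut(text)                    # segment with jieba, as in training
    ids = myvocab.convert_tokens_to_ids(tokens)  # map tokens to vocabulary ids
    inputs = torch.tensor([ids]).to(device)      # batch of size 1
    lengths = torch.tensor([len(ids)])           # true (unpadded) length
    with torch.no_grad():
        log_probs = model(inputs, lengths)
    return log_probs.argmax(dim=1).item()        # 1 = positive, 0 = negative

print(predict_sentiment("这家店的菜很好吃,下次还来!", model, myvocab, device))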
If this helped, please like, favorite, and share!