赞
踩
# Preprocessing for the JD review dataset.
# `data` is expected to be a DataFrame with a 'text' column (raw review) and a
# 'target' column (a 1-5 rating).
# NOTE(review): the code that loads `data` (e.g. pd.read_csv) is not visible in
# this chunk — confirm it runs before this section.
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import requests
import jieba
from tqdm import tqdm


def get_label(label):
    """Shift a 1-5 rating down by one into the 0-4 class range the model uses."""
    return label - 1


data["target"] = data['target'].apply(get_label)
# Drop duplicate reviews, keeping the first occurrence of each text.
data = data.drop_duplicates('text', keep='first')
# Rebuild a contiguous 0-based index after rows were removed.
data = data.reset_index(drop=True)


def clear_character(sentence):
    """Strip letters/digits, emoji/symbols and punctuation, keeping CJK text."""
    pattern1 = '[a-zA-Z0-9]'
    pattern2 = re.compile(u'[^\s1234567890::' + '\u4e00-\u9fa5]+')
    pattern3 = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    line1 = re.sub(pattern1, '', sentence)  # remove ASCII letters and digits
    line2 = re.sub(pattern2, '', line1)     # remove emoji and other symbols
    line3 = re.sub(pattern3, '', line2)     # remove leftover punctuation
    return ''.join(line3.split())           # drop all remaining whitespace


data["text"] = data['text'].apply(clear_character)
data.head()


def comment_cut(content):
    """Tokenize one review with jieba; returns a list of tokens."""
    return list(jieba.cut(content.strip()))


# .progress_apply() behaves like .apply() but lets tqdm render a progress bar.
tqdm.pandas(desc='apply')
data['text'] = data['text'].progress_apply(comment_cut)
data.head()

# Load the stop-word list (one word per line) into a Python list.
with open("D:\\shujuji\\1\\stopwords.json", "r", encoding='utf-8') as f:
    stopWords = f.read().split("\n")


def rm_stop_word(wordList):
    """Remove stop words from a token list."""
    return [word for word in wordList if word not in stopWords]


data['text'] = data['text'].progress_apply(rm_stop_word)
data.head()

# Count word frequencies over the whole corpus so rare words can be dropped.
# BUG FIX: the original did `for j in row: list_set.extend(j)`, which iterates
# each token string and therefore counted single *characters*, not words —
# making the low-frequency filter below a near no-op. Extend with the token
# list itself so the Counter holds word frequencies.
list_set = []
for i in range(len(data)):
    list_set.extend(data.iloc[i]['text'])
words_count = Counter(list_set)

min_threshold = 10
# filteredA holds the words occurring fewer than `min_threshold` times.
filteredA = Counter({k: v for k, v in words_count.items() if v < min_threshold})


def rm_low_frequence_word(wordList):
    """Drop low-frequency words and tabs; join the rest with single spaces."""
    kept = [word for word in wordList if word not in filteredA and word != '\t']
    # Matches the original output format: every kept word followed by a space.
    return "".join(word + " " for word in kept)


data['text'] = data['text'].progress_apply(rm_low_frequence_word)
data.head()
# --- Model data preparation: word2vec embeddings, splits, torchtext fields ---
import collections
import os
import random
import time
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns

# Run on GPU #6 when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

# Each processed review is a space-separated string; split back into tokens.
word_list = [str(s).split() for s in data["text"]]
print(word_list)

# Train a word2vec model: window 3, 5 passes, 256-dim vectors, min count 1.
from gensim.models.word2vec import Word2Vec
import time

start = time.time()
model_w2v = Word2Vec(word_list, window=3, iter=5, size=256, min_count=1)
print('完成')
end = time.time()
print('花费时间:', end - start)
print(model_w2v)

# Split: 20% held out for validation, then 20% of the remainder for test
# (i.e. roughly 64% train / 16% test / 20% validation).
from sklearn.model_selection import train_test_split

temp_train, valid_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, test_data = train_test_split(temp_train, test_size=0.2, random_state=42)
train_data.to_csv("D:/shujuji/1/2/train_data.csv", index=False, header=True, encoding="utf-8")
valid_data.to_csv("D:/shujuji/1/2/valid_data.csv", index=False, header=True, encoding="utf-8")
test_data.to_csv("D:/shujuji/1/2/test_data.csv", index=False, header=True, encoding="utf-8")

# --- torchtext (legacy) pipeline ---
import torch
import torchtext
from torchtext.legacy import data
from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset

torch.backends.cudnn.deterministic = True

tokenize = lambda x: x.split()
TEXT = data.Field(sequential=True, tokenize=tokenize)
LABEL = data.Field(sequential=False, dtype=torch.long, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]


class DataFrameDataset(data.Dataset):
    """Adapt a pandas DataFrame to a torchtext (legacy) Dataset."""

    def __init__(self, df, fields, is_test=False, **kwargs):
        # Test rows carry no label; training/validation rows use `target`.
        examples = [
            data.Example.fromlist(
                [row.text, None if is_test else row.target], fields)
            for _, row in df.iterrows()
        ]
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        # Bucket examples by token count so batches have similar lengths.
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build up to three datasets from the given DataFrames."""
        built = []
        for frame, as_test in ((train_df, False), (val_df, False), (test_df, True)):
            if frame is None:
                built.append(None)
            elif as_test:
                built.append(cls(frame.copy(), fields, True, **kwargs))
            else:
                built.append(cls(frame.copy(), fields, **kwargs))
        return tuple(d for d in built if d is not None)


print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

train_df, val_df, test_df = DataFrameDataset.splits(
    fields, train_df=train_data, val_df=valid_data, test_df=test_data)

# Build the vocabulary from the training split only.
TEXT.build_vocab(train_df)
print(vars(train_df.examples[0]))
print(vars(test_df.examples[0]))
# Higher-frequency words get lower indices; index 0/1 default to unk/pad.
print(TEXT.vocab.stoi)
print(TEXT.vocab.freqs.most_common(40))
print(TEXT.vocab.itos[:10])

# Build the embedding matrix: row i is the word2vec vector of vocab word i,
# or zeros when the word has no pretrained vector.
import numpy as np

embedding_dic = dict(zip(model_w2v.wv.index2word, model_w2v.wv.syn0))
embedding_matrix = np.zeros((len(TEXT.vocab), 256))
for w, i in TEXT.vocab.stoi.items():
    vec = embedding_dic.get(w)
    if vec is not None:
        embedding_matrix[i] = vec
print(embedding_matrix.shape)

# --- Batch iterators ---
from torchtext.legacy.data import Iterator, BucketIterator

train_batch_size = 64
val_batch_size = 64
test_batch_size = 64

# BucketIterator groups similar-length examples per batch to minimise padding.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_df, val_df),
    batch_sizes=(train_batch_size, val_batch_size),
    device=device,
    sort_key=lambda x: len(x.text),
    sort_within_batch=False,
    repeat=False,
)

# Plain Iterator for the test split (no sorting needed).
test_iterator = Iterator(
    test_df,
    batch_size=test_batch_size,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
)
# --- BiLSTM + attention classifier, training / evaluation loops ---
# FIX: `math` is used by attention_net but the original only imported it at
# the very end of the script; import it up front so the model is self-contained.
import math
import time

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class LSTMmodel(nn.Module):
    """Two-layer bidirectional LSTM with scaled dot-product attention pooling."""

    def __init__(self, embedding_size, hidden_size, output_size):
        super(LSTMmodel, self).__init__()
        # Vocabulary size comes from the torchtext TEXT field built earlier.
        # FIX: use `embedding_size` instead of the hard-coded 256 (the caller
        # passes 256, so behavior is unchanged but the parameter now works).
        self.embedding = nn.Embedding(len(TEXT.vocab), embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers=2,
                            bidirectional=True, dropout=0.5)
        # *2 because the LSTM is bidirectional.
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(0.5)

    def attention_net(self, x, query, mask=None):
        """Attention pooling over the sequence.

        x, query: [batch, seq_len, hidden_dim*2]
        Returns (context [batch, hidden_dim*2], weights [batch, seq_len, seq_len]).
        """
        d_k = query.size(-1)
        # scores: [batch, seq_len, seq_len], scaled by sqrt(d_k).
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)
        # Normalise attention weights over the last dimension.
        alpha_n = F.softmax(scores, dim=-1)
        # Weighted sum, then pool over the sequence -> one vector per sample.
        context = torch.matmul(alpha_n, x).sum(1)
        return context, alpha_n

    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, c) = self.lstm(embedded)
        # [seq_len, batch, hidden*2] -> [batch, seq_len, hidden*2]
        output = output.permute(1, 0, 2)
        query = self.dropout(output)
        # (The original also built an unused `h` from the final hidden states;
        # that dead code is removed — classification uses the attention output.)
        attn_output, alpha_n = self.attention_net(output, query)
        return self.fc(attn_output)


def train(model, iterator, optimizer, criterion):
    """Run one training epoch; returns (loss, accuracy) normalised by samples.

    NOTE: epoch_loss accumulates per-batch *mean* losses but is divided by the
    sample count, so the reported loss is scaled by ~1/batch_size. Kept as-is
    to preserve the original metric's scale.
    """
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    count = 0
    model.train()  # training mode: dropout active
    for batch in iterator:
        optimizer.zero_grad()  # clear gradients accumulated by the last step
        predictions = model(batch.text)
        loss = criterion(predictions, batch.label)
        epoch_loss += loss.item()
        loss.backward()   # backpropagate
        optimizer.step()  # parameter update
        # Count correct predictions in this batch.
        epoch_acc += ((predictions.argmax(axis=1)) == batch.label).sum().item()
        total_len += len(batch.label)
        count += 1
    print(f'训练了{count}个batch')
    return epoch_loss / total_len, epoch_acc / total_len


def evaluate(model, iterator, criterion):
    """Evaluate without gradient updates; returns (loss, accuracy) as in train()."""
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    count = 0
    model.eval()  # eval mode: dropout frozen
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text)
            loss = criterion(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += ((predictions.argmax(axis=1)) == batch.label).sum().item()
            total_len += len(batch.label)
            count += 1
    model.train()  # restore training mode for the caller
    print(f'验证了{count}个batch')
    return epoch_loss / total_len, epoch_acc / total_len


# Hyperparameters: 256-dim embeddings, 128 hidden units, 5 rating classes.
EMBEDDING_SIZE = 256
HIDDEN_SIZE = 128
OUTPUT_SIZE = 5

model = LSTMmodel(embedding_size=EMBEDDING_SIZE,
                  hidden_size=HIDDEN_SIZE,
                  output_size=OUTPUT_SIZE).to(device)

# Initialise the embedding layer with the pretrained word2vec matrix.
# FIX: the original chained a meaningless `[2:10]` slice onto copy_(); removed.
model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))


def count_parameters(model):
    """Count trainable parameters of the model."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())   # optimizer
criterion = nn.CrossEntropyLoss()            # loss: cross entropy over 5 classes
model = model.to(device)
criterion = criterion.to(device)


def epoch_time(start_time, end_time):
    """Split an elapsed wall-clock interval into whole (minutes, seconds)."""
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


# --- Training loop: keep the checkpoint with the best validation loss ---
N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in tqdm(range(N_EPOCHS), desc='Processing'):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    if valid_loss < best_valid_loss:  # save whenever validation improves
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'Best-Checkpoint.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。