# Reads text data from a CSV with pandas, converts the DataFrame into torchtext
# iterators, and feeds them to model training and prediction.
import pandas as pd
import numpy as np
import re
import torch.utils.data as data_utils
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets,vocab
from tqdm import tqdm
from torch import nn
# The CSV is comma-separated with two columns: complain_content (tokenised
# Chinese text) and label.
# Load the cleaned complaint data; complain_content is a stringified token
# list like "['a', 'b']" that must be turned back into a space-joined string.
df = pd.read_csv("/home/jovyan/work/NLP/kesu_cate/data/kesu_clean_data.csv", encoding='utf-8')

# Strip the brackets, quotes and spaces of the serialised list, then rejoin
# the comma-separated tokens with single spaces.
_strip_table = str.maketrans('', '', "[]' ")
df["complain_content"] = df["complain_content"].apply(
    lambda text: ' '.join(text.translate(_strip_table).split(','))
)
df["label"] = df["label"].astype(float)

# Split into train / validation / test sets (64% / 16% / 20%).
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2, random_state=1024)
train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=1024)
tokenize=lambda x:x.split(' ') TEXT=data.Field(tokenize=tokenize,sequential=True) LABEL=data.Field(sequential=False, use_vocab=False) #数据处理类 class DatasetProcess(data.Dataset): def __init__(self,df,text_tield,label_field): fields=[('complain_content',text_tield),("label",label_field)] examples=[] for text , label in tqdm(zip(df['complain_content'], df['label'])): examples.append(data.Example.fromlist([text, label], fields)) super(DatasetProcess,self).__init__(examples,fields) train_data = DatasetProcess(train_data,TEXT,LABEL) valid_data = DatasetProcess(valid_data,TEXT,LABEL) test_data = DatasetProcess(test_data,TEXT,LABEL) #加载腾讯NLP预训练结果 并建立词典 vectors = vocab.Vectors(name='/home/jovyan/work/NLP/pytoch/.vector_cache/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt') TEXT.build_vocab(train_data,max_size=25000,vectors=vectors,unk_init=torch.Tensor.normal_) LABEL.build_vocab(train_data) train_iter,valid_iter,test_iter=data.BucketIterator.splits( (train_data,valid_data,test_data), batch_size=256, device='cpu', sort_key=lambda x: len(x.complain_content), sort_within_batch=False, repeat=False )
# ----- hyper-parameters -----
vocab_size = len(TEXT.vocab)
embedding_size = 100          # must match the pretrained vector dimensionality (d100)
output_size = 1               # single logit for binary classification
num_layers = 2
dropout = 0.3
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
hidden_size = 100
device = 'cpu'


class RNNModel(nn.Module):
    """Bidirectional LSTM binary classifier over token-index sequences.

    Input:  LongTensor of shape (seq_len, batch) of vocabulary indices.
    Output: logits of shape (batch, output_size); apply sigmoid for probabilities.
    """

    def __init__(self, vocab_size, embedding_size, output_size, hidden_size,
                 num_layers, dropout, pad_idx):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            bidirectional=True, num_layers=num_layers)
        # FIX: the classifier consumes the concatenated forward+backward final
        # hidden states, i.e. 2 * hidden_size features.  The original used
        # hidden_size * num_layers, which is only correct while num_layers == 2.
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embeded = self.dropout(self.embed(text))
        output, (hidden, cell) = self.lstm(embeded)
        # hidden: (num_layers * 2, batch, hidden_size); the last two slices are
        # the top layer's two directions.  FIX: the original called
        # hidden.squeeze() first, which breaks indexing when batch size is 1.
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        hidden = self.dropout(hidden)
        return self.linear(hidden)


def binary_accuracy(preds, y):
    """Return the fraction of examples where round(sigmoid(pred)) equals the 0/1 label."""
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def train(model, iterator, optimizer, crit):
    """Run one training epoch; return (mean loss, mean accuracy) weighted by batch size."""
    epoch_loss, epoch_acc = 0., 0.
    model.train()
    total_len = 0.
    for batch in iterator:
        preds = model(batch.complain_content).squeeze()
        loss = crit(preds, batch.label.float())
        acc = binary_accuracy(preds, batch.label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate sums weighted by batch size so partial final batches
        # don't skew the epoch averages.
        epoch_loss += loss.item() * len(batch.label)
        epoch_acc += acc.item() * len(batch.label)
        total_len += len(batch.label)
        print(epoch_loss, epoch_acc, len(batch.label))  # per-batch progress (debug)
    return epoch_loss / total_len, epoch_acc / total_len


def evaluate(model, iterator, crit):
    """Evaluate on an iterator; return (mean loss, mean accuracy) weighted by batch size."""
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    # FIX: disable gradient tracking during evaluation (saves memory; results
    # are unchanged).
    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.complain_content).squeeze()
            loss = crit(preds, batch.label.float())
            acc = binary_accuracy(preds, batch.label)
            epoch_loss += loss.item() * len(batch.label)
            epoch_acc += acc.item() * len(batch.label)
            total_len += len(batch.label)
    model.train()  # restore training mode for the caller
    return epoch_loss / total_len, epoch_acc / total_len


model = RNNModel(vocab_size, embedding_size, output_size, hidden_size,
                 num_layers, dropout, pad_idx)

# Initialise the embedding table from the pretrained vectors; zero the
# <pad>/<unk> rows so they start out neutral.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[pad_idx] = torch.zeros(embedding_size)
model.embed.weight.data[unk_idx] = torch.zeros(embedding_size)

# FIX: the original script used `optimizer` and `crit` in the training loop
# without ever defining them (NameError).  BCEWithLogitsLoss matches the raw
# logits the model returns plus the sigmoid used in binary_accuracy/predict.
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()

# ----- training loop: keep the checkpoint with the best validation accuracy -----
n_epoch = 10
best_valid_acc = 0.
for epoch in range(n_epoch):
    train_loss, train_acc = train(model, train_iter, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, valid_iter, crit)
    if valid_acc > best_valid_acc:
        best_valid_acc = valid_acc
        torch.save(model.state_dict(), 'lstm_model_cn.pth')
    print("Epoch", epoch, "Train loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "valid_loss", valid_loss, "valid Acc", valid_acc)


def removepad(arr):
    """Drop '<pad>' tokens from a token list."""
    res = []
    for x in arr:
        if x != '<pad>':
            res.append(x)
    return res


def predict(test_iter):
    """Print the sigmoid score and the de-padded token sequence for each test example."""
    with torch.no_grad():
        for batch in test_iter:
            pred = torch.sigmoid(model(batch.complain_content))
            for i in range(len(batch.label)):
                # Columns of the (seq_len, batch) tensor are individual examples.
                index = batch.complain_content.t()[i]
                tmp = [TEXT.vocab.itos[x] for x in index]
                seq = ' '.join(removepad(tmp))
                print(pred[i].item(), seq)
# References:
# 1. torchtext data preprocessing: https://www.jianshu.com/p/8e891bdca78a
# 2. Tencent pretrained word vectors: https://ai.tencent.com/ailab/nlp/zh/download.html
# 3. Loading tutorial: https://blog.csdn.net/nlpuser/article/details/83627709