I've recently been helping my boss with some work, learning as I go, and I'm not sure everything here is right, so I'm writing it down for the record.
The task itself is simple: classification given a set of features.
My approach: first pick out the features that seem likely to influence the classification result and concatenate them into one string per sample (a sketch of this step follows below).
Then a RoBERTa model is used to embed each of these feature strings.
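The post does not show the concatenation step itself, so here is a minimal sketch of what it could look like; the DataFrame train_df, the file name, and the column names are all assumptions, not from the original code:

import pandas as pd

# Hypothetical sketch: join each row's selected feature columns into one
# text string for RoBERTa to embed. The file and column names are made up.
train_df = pd.read_csv("train.csv")
feature_cols = ["age", "plan", "monthly_usage"]
train_data = [" ".join(str(row[col]) for col in feature_cols)
              for _, row in train_df.iterrows()]
train_label = train_df["label"].tolist()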
# First, define an embedding function that encodes a feature string into a
# 1024-dimensional vector (roberta-large outputs a fixed 1024 dimensions).
import torch
from transformers import RobertaTokenizer, RobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
model = RobertaModel.from_pretrained('roberta-large')  # use roberta-large
model.eval()  # inference only, so switch off dropout

def embedding(text):
    encoded_input = tokenizer(text, padding="max_length", truncation=True, return_tensors='pt')
    with torch.no_grad():  # no gradients needed here; saves a lot of memory
        output = model(**encoded_input)
    return output.pooler_output.numpy().tolist()[0]  # return a plain list

# Then encode the texts one by one.
train_features = []
for i in range(len(train_data)):
    train_features.append(embedding(train_data[i]))

# The features and labels are now Python lists; to feed them through a
# DataLoader during training, they have to be converted to tensors.
data_ts_x = torch.tensor(train_features)
data_ts_y = torch.tensor(train_label)
# This is multi-class classification and I want to use the cross-entropy
# loss, so the label y has to be a one-dimensional tensor.
data_ts_y = data_ts_y.reshape(data_ts_y.shape[0],)

# Hyper-parameters.
input_dim = len(train_features[0])  # 1024 for roberta-large
hidden_dim1 = 512
hidden_dim2 = 128
output_dim = 4

# Next, define the model.
import torch.nn as nn

def init_params(module_lst):
    # Initialize the model parameters with the Xavier method.
    for module in module_lst:
        for param in module.parameters():
            if param.dim() > 1:
                torch.nn.init.xavier_uniform_(param)
    return

class classification(nn.Module):
    def __init__(self):
        super(classification, self).__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_dim, hidden_dim1), nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2), nn.ReLU(),
            nn.Linear(hidden_dim2, output_dim)
        )
        init_params([self.linear])

    def forward(self, X):
        linear_output = self.linear(X)
        return linear_output

# Define the dataset.
import torch.utils.data as tud

class ZhongYidongDataset(tud.Dataset):
    def __init__(self, data, label):
        super(ZhongYidongDataset, self).__init__()
        self.data = data
        self.label = label

    def __len__(self):
        '''Return the size of the whole dataset.'''
        return self.data.shape[0]

    def __getitem__(self, idx):
        assert idx < self.data.shape[0]
        return self.data[idx], self.label[idx]

# Define the dataloader.
train_data_set = ZhongYidongDataset(data_ts_x, data_ts_y)
dataloader = tud.DataLoader(train_data_set, batch_size=32, shuffle=True)

# Now the model can be instantiated and trained.
classification_model = classification()
loss = nn.CrossEntropyLoss()  # already averages over the batch
optimizer = torch.optim.Adam(classification_model.parameters(), lr=0.001)
for e in range(50):
    for i, (data, label) in enumerate(dataloader):
        pre_y = classification_model(data)
        optimizer.zero_grad()
        l = loss(pre_y, label)
        l.backward()
        optimizer.step()
        if i % 10 == 0:
            print("epoch: {}, iter: {}, loss: {}".format(e, i, l.item()))

# Once trained, test the model. test_data_ts_x and test_data_ts_y are the
# test-set tensors, prepared the same way as the training tensors above.
pred_y = []
def evaluate(model, test_data_x, test_data_y):
    model.eval()
    num_correct = 0.
    num_all = 0.
    with torch.no_grad():
        for i in range(test_data_x.shape[0]):
            y_pre = model(test_data_x[i])
            y = torch.argmax(y_pre)
            num_all += 1
            pred_y.append(y.item())
            if y.item() == test_data_y[i].item():
                num_correct += 1
    accuracy = num_correct / num_all  # fraction of correct predictions
    print("accuracy: {}".format(accuracy))
    return accuracy

evaluate(classification_model, test_data_ts_x, test_data_ts_y)
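A note on this design: encoding the texts one at a time on the CPU gets slow once the training set grows. Below is a sketch of a batched variant, assuming a GPU is available (it falls back to CPU otherwise); it reuses the tokenizer and model defined above:

# Sketch only: push the texts through the tokenizer and model in batches
# instead of one by one, moving everything to the GPU when there is one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def embedding_batched(texts, batch_size=32):
    vectors = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            enc = tokenizer(batch, padding=True, truncation=True,
                            return_tensors='pt').to(device)
            out = model(**enc)
            vectors.extend(out.pooler_output.cpu().numpy().tolist())
    return vectors

# train_features = embedding_batched(train_data)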