This article covers entry-level BERT usage, including both English and Chinese text classification.
This part uses BERT for sentiment analysis on Chinese text, again with BertForSequenceClassification.
The dataset contains three sentiment classes: [-1, 0, 1].
The workflow is the same as in the first part. The main changes are setting the number of classes to 3 in BERT's config file and remapping the dataset labels from [-1, 0, 1] to [0, 1, 2]. The pretrained model used is bert-base-uncased-cn (the Chinese pretrained BERT).
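If you would rather not edit the config.json file by hand, the same three-class setup can be passed programmatically. The sketch below is only an illustration: it assumes the Hugging Face transformers API, with the hub checkpoint bert-base-chinese standing in for the locally stored Chinese model.
from transformers import BertConfig, BertForSequenceClassification

# Assumption: "bert-base-chinese" stands in for the locally downloaded Chinese
# checkpoint referred to above; num_labels=3 matches the three sentiment classes.
config = BertConfig.from_pretrained("bert-base-chinese", num_labels=3)
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", config=config)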
The dataset has two columns: ['label', 'txt'].
First, read in the data:
import os
import pandas as pd

# data_path points at the directory containing train.tsv and dev.tsv
df = pd.read_csv(os.path.join(data_path, "train.tsv"), delimiter='\t')
df_dev = pd.read_csv(os.path.join(data_path, "dev.tsv"), delimiter='\t')
print("train:",df.head())
print("dev:",df_dev.head())
Extract the sentences and preprocess them:
# extract the sentences and preprocess them
sentencses = ['[CLS] ' + sent + ' [SEP]' for sent in df.txt.values]
labels = df.label.values
# label remapping: 0 = neutral, 1 = positive, 2 = negative (originally -1)
labels = list(map(lambda x: 0 if x == 0 else 1 if x == 1 else 2, [x for x in labels]))
print("train label:", labels[100:110])
print("first sentence:", sentencses[0])

# bert_pre_tokenizer points at the pretrained Chinese BERT vocab/tokenizer files
tokenizer = BertTokenizer.from_pretrained(bert_pre_tokenizer, do_lower_case=True)
tokenized_sents = [tokenizer.tokenize(sent) for sent in sentencses]
print("first tokenized sentence:", tokenized_sents[0])
Define BERT's input format:
from keras.preprocessing.sequence import pad_sequences  # pad_sequences comes from Keras

MAX_LEN = 80

# training set: convert the tokenized sentences to ids (word --> idx)
input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_sents]
print("first sentence as ids:", input_ids[0])

# sequences longer than MAX_LEN are truncated, shorter ones are padded
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print("first sentence after padding:", input_ids[0])

# build the attention masks (1.0 for real tokens, 0.0 for padding)
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
print("first attention mask:", attention_masks[0])

# validation set: same processing
dev_sentencses = ['[CLS] ' + sent + ' [SEP]' for sent in df_dev.txt.values]
dev_labels = df_dev.label.values
print("dev_label:", dev_labels[100:110])
dev_labels = list(map(lambda x: 0 if x == 0 else 1 if x == 1 else 2, [x for x in dev_labels]))
# dev_labels=[to_categorical(i, num_classes=3) for i in dev_labels]
dev_tokenized_sents = [tokenizer.tokenize(sent) for sent in dev_sentencses]
dev_input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in dev_tokenized_sents]
dev_input_ids = pad_sequences(dev_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
dev_attention_masks = []
for seq in dev_input_ids:
    dev_seq_mask = [float(i > 0) for i in seq]
    dev_attention_masks.append(dev_seq_mask)
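As a side note, recent releases of the transformers tokenizer can add the special tokens, truncate, pad, and build the attention mask in a single call. This is only a sketch under that assumption, not the code path used in this post:
# Sketch assuming a recent `transformers` BertTokenizer (not necessarily the
# package version used here): one call adds [CLS]/[SEP], truncates or pads to
# MAX_LEN, and returns the attention mask.
enc = tokenizer(list(df.txt.values), max_length=MAX_LEN,
                padding="max_length", truncation=True)
input_ids, attention_masks = enc["input_ids"], enc["attention_mask"]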
Build the DataLoaders for the training and validation sets:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

train_inputs = torch.tensor(input_ids)
validation_inputs = torch.tensor(dev_input_ids)
train_labels = torch.tensor(labels)
validation_labels = torch.tensor(dev_labels)
train_masks = torch.tensor(attention_masks)
validation_masks = torch.tensor(dev_attention_masks)
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
Load the pretrained model; here it is the Chinese BERT model:
# load the pretrained BERT model (bert_config and bert_pre_model point at the Chinese BERT config file and weights)
modelConfig = BertConfig.from_pretrained(bert_config)
model = BertForSequenceClassification.from_pretrained(bert_pre_model, config=modelConfig)
print(model.cuda())
Define the optimizer:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)
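BertAdam ships with the older pytorch-pretrained-bert package; if you are on the current transformers package instead, the closest equivalent (an alternative sketch, not what this post actually runs) is AdamW plus a linear warmup schedule:
from transformers import AdamW, get_linear_schedule_with_warmup

# Alternative sketch for the current `transformers` API. Note that AdamW reads
# the per-group key 'weight_decay', not 'weight_decay_rate'.
grouped_params = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
total_steps = len(train_dataloader) * 4  # 4 epochs, matching the training loop below
optimizer = AdamW(grouped_params, lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1 * total_steps),
                                            num_training_steps=total_steps)
# with this setup, scheduler.step() is called right after optimizer.step()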
The training loop:
import numpy as np
from tqdm import trange

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # run on GPU when available

train_loss_set = []
epochs = 4
for _ in trange(epochs, desc="Epoch"):
    # training
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)  # move the batch onto the GPU
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)[0]
        # print("loss:", loss)
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

    # validation
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
Let's test the model:
# build a single test example
st = "真的好吗?"
text = '[CLS] ' + st + ' [SEP]'
str_tokenized_sents = tokenizer.tokenize(text)
print(str_tokenized_sents)

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
str_input_ids = [tokenizer.convert_tokens_to_ids(str_tokenized_sents)]
str_input_ids = pad_sequences(str_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
print(str_input_ids)
str_mask = [[float(i > 0) for i in str_input_ids[0]]]
str_label = [0]

str_input_ids = torch.tensor(str_input_ids).cuda()
str_mask = torch.tensor(str_mask).cuda()
str_label = torch.tensor(str_label).cuda()
print("size:", str_input_ids.size(), str_mask.size(), str_label.size())

logits_str = model(str_input_ids, token_type_ids=None, attention_mask=str_mask)[0]
print(np.argmax(logits_str.detach().cpu().numpy(), axis=1))
The prediction comes out as neutral. Trying a more strongly worded sentence, the model classifies it as negative.
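To avoid repeating these steps for every test sentence, a small hypothetical helper (predict_sentiment, not part of the original code) can wrap the tokenize/pad/predict sequence:
# Hypothetical convenience wrapper around the test code above; the returned
# index follows the label remapping used earlier (0 = neutral, 1 = positive,
# 2 = negative).
def predict_sentiment(text, model, tokenizer, max_len=MAX_LEN):
    tokens = tokenizer.tokenize('[CLS] ' + text + ' [SEP]')
    ids = [tokenizer.convert_tokens_to_ids(tokens)]
    ids = pad_sequences(ids, maxlen=max_len, dtype="long", truncating="post", padding="post")
    mask = [[float(i > 0) for i in ids[0]]]
    ids = torch.tensor(ids).cuda()
    mask = torch.tensor(mask).cuda()
    with torch.no_grad():
        logits = model(ids, token_type_ids=None, attention_mask=mask)[0]
    return int(np.argmax(logits.detach().cpu().numpy(), axis=1)[0])

print(predict_sentiment("真的好吗?", model, tokenizer))  # prints 0 (neutral), matching the result above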