赞
踩
本文主要介绍一个框架nlp-basictasks
nlp-basictasks是利用PyTorch深度学习框架构建的一个简单的库,旨在快速搭建模型,完成一些基础的NLP任务,如分类、匹配、序列标注、语义相似度计算等。
下面利用该框架实现BERT+CRF模型做NER任务
数据集来源
数据来源于CLUE实体识别数据集(CLUENER)
import torch,json
import sys,os
import pandas as pd
import random
import numpy as np
from nlp_basictasks.tasks import Ner
from nlp_basictasks.evaluation import nerEvaluator
from nlp_basictasks.readers.ner import InputExample
def _create_examples(input_path, mode):
    """Read a JSON-lines NER file and convert it to character-level examples.

    Every non-blank line must be a JSON object with a ``text`` string and an
    optional ``label`` mapping shaped like
    ``{entity_type: {surface_string: [[start, end], ...]}}``, where ``end``
    is inclusive. Characters are tagged ``S-<type>`` for single-character
    entities, ``B-<type>`` / ``I-<type>`` for longer ones, and ``O``
    everywhere else.

    Args:
        input_path: Path of the JSON-lines file to read.
        mode: Split name (e.g. ``'train'``) used as the prefix of example ids.

    Returns:
        A list of dicts with the keys ``id``, ``context`` (characters joined
        by single spaces), ``tag`` (labels joined by single spaces) and
        ``raw_context`` (the original text).

    Raises:
        ValueError: If an annotated span does not spell out its surface
            string, or if a line is not valid JSON.
    """
    examples = []
    # Explicit UTF-8: the default encoding of open() is locale-dependent,
    # which breaks reading Chinese text on non-UTF-8 platforms.
    with open(input_path, 'r', encoding='utf-8') as f:
        idx = 0
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines such as a trailing newline at EOF.
                continue
            record = json.loads(raw_line)
            words = list(record['text'])
            labels = ['O'] * len(words)
            label_entities = record.get('label')
            if label_entities is not None:
                for key, value in label_entities.items():
                    for sub_name, sub_index in value.items():
                        for start_index, end_index in sub_index:
                            span = ''.join(words[start_index:end_index + 1])
                            # Raise instead of assert so the check survives
                            # running Python with -O.
                            if span != sub_name:
                                raise ValueError(
                                    f"span [{start_index}, {end_index}] of example "
                                    f"{mode}_{idx} is {span!r}, expected {sub_name!r}"
                                )
                            if start_index == end_index:
                                labels[start_index] = 'S-' + key
                            else:
                                labels[start_index] = 'B-' + key
                                labels[start_index + 1:end_index + 1] = (
                                    ['I-' + key] * (len(sub_name) - 1)
                                )
            examples.append({
                'id': f"{mode}_{idx}",
                'context': " ".join(words),
                'tag': " ".join(labels),
                'raw_context': "".join(words),
            })
            idx += 1
    return examples


data = _create_examples('/data/nfs14/nfs/aisearch/asr/xhsun/datasets/cluener/train.json', mode='train')
# Wrap every record as an InputExample holding parallel character / tag lists.
train_examples = []
for example in data:
    seq_in = example['context'].strip().split(' ')
    seq_out = example['tag'].strip().split(' ')
    assert len(seq_in) == len(seq_out)
    train_examples.append(InputExample(seq_in=seq_in, seq_out=seq_out))

# Hold out the last 2000 examples as the dev set; the rest is used for training.
dev_examples = train_examples[-2000:]
dev_seq_in = [dev_example.seq_in for dev_example in dev_examples]
dev_seq_out = [dev_example.seq_out for dev_example in dev_examples]
train_examples = train_examples[:-2000]

# Collect every tag that occurs anywhere in the data.
label_set = set()
for examples in data:
    label_set.update(examples['tag'].strip().split(' '))

# Assign ids in sorted order: iterating a set of strings directly yields an
# order that changes between interpreter runs (hash randomisation), which
# would make the tag -> id mapping irreproducible.
label2id = {'[PAD]': 0}
for label in sorted(label_set):
    label2id[label] = len(label2id)
print(label2id)
# Location of your downloaded BERT model, e.g. 'chinese-roberta-wwm/'.
model_path = ''
# Use the GPU when one is available, otherwise fall back to CPU so the
# script still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# You can specify whether to use the CRF and/or the BiLSTM layer.
ner_model = Ner(
    model_path,
    label2id=label2id,
    use_crf=True,
    use_bilstm=True,
    device=device,
    batch_first=True,
)
from torch.utils.data import DataLoader

# output_path is the path where the model is saved after training.
output_path = ""
batch_size = 32

# Shuffled mini-batches over the training examples.
train_dataloader = DataLoader(
    train_examples,
    batch_size=batch_size,
    shuffle=True,
)

# Evaluator built from the held-out dev sequences.
evaluator = nerEvaluator(
    seq_in=dev_seq_in,
    seq_out=dev_seq_out,
    label2id=label2id,
)

ner_model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=5,
    output_path=output_path,
)
上图就是训练过程中各个实体的precision、recall以及f1得分
不用100行代码即可完成NER任务,相关教程见nlp-basictasks框架做NER任务,觉得好用的话还请点个star,谢谢
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。