
Medical Text Classification

The video version is available on the FlyAI website: www.flyai.com

• Preprocessing the dataset

```python
def pred_process(title, text, tokenizer, pad_size):
    content = title + text
    content = data_clean(content)
    tokens = tokenizer.tokenize(content)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # build input_id, seg_id, att_mask
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    types = [0] * len(input_id)
    masks = [1] * len(input_id)
    # pad short sequences, truncate long ones
    if len(input_id) < pad_size:
        types = types + [1] * (pad_size - len(input_id))    # segment ids of the padded part are set to 1
        masks = masks + [0] * (pad_size - len(input_id))    # padded positions are masked out
        input_id = input_id + [0] * (pad_size - len(input_id))
    else:
        types = types[:pad_size]
        masks = masks[:pad_size]
        input_id = input_id[:pad_size]
    return input_id, types, masks
```
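The `data_clean` helper called above is not shown in the post. A minimal sketch of what such a cleaning step might do, assuming it only strips whitespace and control characters (the real implementation may differ):

```python
import re

def data_clean(content):
    # assumption: strip whitespace and control characters; the original
    # post does not show what data_clean actually does
    content = re.sub(r'\s+', '', content)
    content = re.sub(r'[\x00-\x1f\x7f]', '', content)
    return content
```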
• Splitting the training and dev sets

```python
def split_train_dev_data(self):
    # shuffle the indices with a fixed seed so the split is reproducible
    random_order = list(range(len(self.input_ids)))
    np.random.seed(2020)
    np.random.shuffle(random_order)
    print(random_order[:10])
    # 4:1 split between training and dev sets
    split = int(len(self.input_ids) * 0.8)
    train_idx, dev_idx = random_order[:split], random_order[split:]
    self.input_ids_train = np.array([self.input_ids[i] for i in train_idx])
    self.input_types_train = np.array([self.input_types[i] for i in train_idx])
    self.input_masks_train = np.array([self.input_masks[i] for i in train_idx])
    self.y_train = np.array([self.labels[i] for i in train_idx])
    print(self.input_ids_train.shape, self.input_types_train.shape,
          self.input_masks_train.shape, self.y_train.shape)
    self.input_ids_dev = np.array([self.input_ids[i] for i in dev_idx])
    self.input_types_dev = np.array([self.input_types[i] for i in dev_idx])
    self.input_masks_dev = np.array([self.input_masks[i] for i in dev_idx])
    self.y_dev = np.array([self.labels[i] for i in dev_idx])
    print(self.input_ids_dev.shape, self.input_types_dev.shape,
          self.input_masks_dev.shape, self.y_dev.shape)
```
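For reference, a sketch of how the split arrays could be wrapped into the `(x1, x2, x3, y)` batches consumed by the test function further below; the `build_loaders` name and tensor dtypes are assumptions, not code from the post:

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

def build_loaders(self, batch_size=32):
    # hypothetical helper (not in the original post): wrap the split arrays
    # into DataLoaders yielding (input_ids, segment_ids, masks, labels)
    train_set = TensorDataset(torch.LongTensor(self.input_ids_train),
                              torch.LongTensor(self.input_types_train),
                              torch.LongTensor(self.input_masks_train),
                              torch.LongTensor(self.y_train))
    dev_set = TensorDataset(torch.LongTensor(self.input_ids_dev),
                            torch.LongTensor(self.input_types_dev),
                            torch.LongTensor(self.input_masks_dev),
                            torch.LongTensor(self.y_dev))
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=False)
    return train_loader, dev_loader
```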
• Choosing the language model

The pretrained language model is chinese_roberta_wwm_ext_pytorch from the platform's model library.

```python
import os
import sys

from flyai.utils import remote_helper  # FlyAI platform helper for fetching remote files

# path to the training data
DATA_PATH = os.path.join(sys.path[0], 'data', 'input')
# download the pretrained model from the platform's model library
path = remote_helper.get_remote_data(
    'https://www.flyai.com/m/zh_roberta_wwm_ext_pytorch.zip')
# local path of the downloaded model
BERT_PATH = os.path.join(DATA_PATH, 'model', 'chinese_roberta_wwm_ext_pytorch')
```
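The `tokenizer` passed into `pred_process` is presumably loaded from the vocabulary shipped with this checkpoint. A sketch using the old `pytorch_pretrained_bert` API, which matches the `init_bert_weights`/`BertAdam` calls elsewhere in the post (the exact loading code is not shown):

```python
from pytorch_pretrained_bert import BertTokenizer

# assumption: vocab.txt sits inside BERT_PATH, as in the standard release
tokenizer = BertTokenizer.from_pretrained(os.path.join(BERT_PATH, 'vocab.txt'))
```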
• Defining the BERT model

```python
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel

class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = 240  # number of classes in this task
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # pooled_output is the [CLS] representation
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if labels is not None:
            # training mode: return the cross-entropy loss
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            # inference mode: return the raw logits
            return logits
```
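The post does not show how the model is instantiated; with the `pytorch_pretrained_bert` API the usual step is a `from_pretrained` call pointing at the downloaded checkpoint directory, sketched here:

```python
# assumption: load the downloaded weights into the classifier defined above
model = BertForSequenceClassification.from_pretrained(BERT_PATH)
model.to(config.device)
```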
• Defining the optimizer

```python
from pytorch_pretrained_bert.optimization import BertAdam

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=config.learning_rate,
                     warmup=0.05,  # fraction of steps used for linear LR warmup
                     t_total=config.num_train_optimization_steps)
```
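`optimizer_grouped_parameters` is not defined in the snippet. The conventional recipe with this library is to exclude bias and LayerNorm parameters from weight decay; a sketch of that recipe (the 0.01 decay value is an assumption, not taken from the post):

```python
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # weight decay for everything except bias and LayerNorm parameters
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    # no weight decay for bias and LayerNorm parameters
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
```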
• Defining the test function

```python
def test(self, model, test_loader):
    device = config.device
    model.eval()
    acc = 0
    for batch_idx, (x1, x2, x3, y) in enumerate(test_loader):
        x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        with torch.no_grad():
            y_ = model(x1, x2, x3)
            _, pred = torch.max(y_, 1)
            acc += pred.eq(y.view_as(pred)).sum().item()  # remember .item() to get a Python number
    return acc / len(test_loader.dataset)
```
• Training hyperparameters

```python
pad_size = 256        # maximum text length
batch_size = 32
epoch = 6
learning_rate = 3e-5
```
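A minimal sketch of a training loop tying these hyperparameters to the loss-returning `forward` above; the original loop is not shown, so `train_loader` and the overall structure are assumptions:

```python
for e in range(epoch):
    model.train()
    for batch_idx, (x1, x2, x3, y) in enumerate(train_loader):
        x1, x2, x3, y = (x1.to(config.device), x2.to(config.device),
                         x3.to(config.device), y.to(config.device))
        # forward returns the cross-entropy loss when labels are supplied
        loss = model(x1, x2, x3, labels=y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```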
• Competition takeaways

When choosing the pretrained language model, I went by the per-model results on long-text classification in the CLUE benchmark leaderboard (https://github.com/CLUEbenchmark/CLUE). It recommends pad_size=128, batch_size=24, lr=2e-5, but on this training set pad_size=256, batch_size=32, lr=3e-5 gave better results (luck may have been a factor).

On the data-processing side, I have seen others share NLP data-augmentation methods (synonym replacement, random insertion, random swap, random deletion). I did not get around to trying them in this competition, but they are worth a try, as sketched below.
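For reference, a minimal sketch of the random-deletion variant (the other three operations follow the same token-level pattern); this illustrates the general EDA technique and is not code from the competition:

```python
import random

def random_deletion(words, p=0.1):
    # drop each token with probability p, keeping at least one token
    if len(words) <= 1:
        return words
    kept = [w for w in words if random.random() > p]
    return kept if kept else [random.choice(words)]
```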
