赞
踩
在用 transformers 中的 BertForMaskedLM 来预测被 mask 掉的单词时,一定要在句子首尾加上特殊字符 [CLS] 和 [SEP],不然效果会很差!
from transformers import AlbertTokenizer, AlbertForMaskedLM
import torch

# Demo: mask each token of a sentence in turn and print ALBERT's top
# prediction for the masked position. The sentence is wrapped in the
# tokenizer's CLS/SEP special tokens before masking.

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', cache_dir='E:/Projects/albert/')
# NOTE(review): the tokenizer is resolved by hub name (cached under
# E:/Projects/albert/) while the model is loaded from the directory itself;
# confirm that directory really contains the model config and weights.
model = AlbertForMaskedLM.from_pretrained('E:/Projects/albert')

sentence = "It is a very beautiful book."

# Use the tokenizer's own special-token strings instead of hard-coded
# literals so the script stays correct if the vocabulary defines them
# differently.
tokens = [tokenizer.cls_token] + tokenizer.tokenize(sentence) + [tokenizer.sep_token]

# Single-sentence input: every position belongs to segment 0. The masked
# sequence always has the same length as `tokens`, so build this once.
segment_ids = torch.tensor([[0] * len(tokens)])

# i is the index of the token being masked; the CLS (first) and SEP (last)
# positions are skipped.
for i in range(1, len(tokens) - 1):
    tmp = tokens[:i] + [tokenizer.mask_token] + tokens[i + 1:]
    masked_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tmp)])

    # Pure inference: disable autograd so no gradient graph is recorded.
    with torch.no_grad():
        outputs = model(masked_ids, token_type_ids=segment_ids)
    prediction_scores = outputs[0]

    print(tmp)

    # Print the predicted token: the highest-scoring vocabulary entry at
    # the masked position.
    prediction_index = torch.argmax(prediction_scores[0, i]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([prediction_index])[0]
    print(predicted_token)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。