from transformers import AutoTokenizer
sen = "弱小的我也有很大的梦想!"
# Load from HuggingFace: pass the model name to get the corresponding tokenizer
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer
'''
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese',
vocab_size=21128, model_max_length=1000000000000000019884624838656,
is_fast=True, padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
'cls_token': '[CLS]', 'mask_token': '[MASK]'},
clean_up_tokenization_spaces=True)
'''
# Save the tokenizer locally
tokenizer.save_pretrained("./roberta_tokenizer")
'''
('./roberta_tokenizer\\tokenizer_config.json',
'./roberta_tokenizer\\special_tokens_map.json',
'./roberta_tokenizer\\vocab.txt',
'./roberta_tokenizer\\added_tokens.json',
'./roberta_tokenizer\\tokenizer.json')
'''
# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer
'''
BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128,
model_max_length=1000000000000000019884624838656, is_fast=True,
padding_side='right', truncation_side='right',
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
'cls_token': '[CLS]', 'mask_token': '[MASK]'},
clean_up_tokenization_spaces=True)
'''
tokens = tokenizer.tokenize(sen)
tokens
'''
['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
'''
tokenizer.vocab
'''
{'湾': 3968,
'訴': 6260,
'##轶': 19824,
'洞': 3822,
' ̄': 8100,
'##劾': 14288,
'##care': 11014,
'asia': 8339,
'##嗑': 14679,
'##鹘': 20965,
'washington': 12262,
'##匕': 14321,
'##樟': 16619,
'癮': 4628,
'day3': 11649,
'##宵': 15213,
'##弧': 15536,
'##do': 8828,
'詭': 6279,
'3500': 9252,
'124': 9377,
'##価': 13957,
'##玄': 17428,
'##積': 18005,
'##肝': 18555,
...
'##维': 18392,
'與': 5645,
'##mark': 9882,
'偽': 984,
...}
'''
tokenizer.vocab_size
# 21128
# Convert the token sequence to an id sequence
ids = tokenizer.convert_tokens_to_ids(tokens)
ids
'''
[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]
'''
# Convert the id sequence back to a token sequence
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens
'''
['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
'''
# Convert the token sequence to a string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen
'''
'弱 小 的 我 也 有 大 梦 想!'
'''
Encoding a sentence (string)
# Convert the string to an id sequence, also known as encoding
ids = tokenizer.encode(sen, add_special_tokens=True) # add_special_tokens=True is the default
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]
'''
# Convert the id sequence back to a string, also known as decoding
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen
'''
'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'
'''
ids = tokenizer.encode(sen, add_special_tokens=False)
ids
'''
[2483, 2207, 4638, 2769, 738, 3300, 2523, 1920, 4638, 3457, 2682, 8013]
'''
str_sen = tokenizer.decode(ids, skip_special_tokens=True)
str_sen
'''
'弱 小 的 我 也 有 很 大 的 梦 想 !'
'''
# Padding
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
'''
# Truncation
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids
'''
[101, 2483, 2207, 4638, 102]
'''
# Pad again so that ids is the 15-token padded sequence for the manual demo below
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
'''
Below, attention_mask and token_type_ids are built by hand (step 7, the quick-call method, provides them ready-made).
attention_mask = [1 if idx != 0 else 0 for idx in ids]
# Padded positions should be ignored by attention, so they are marked 0
token_type_ids = [0] * len(ids)
# token_type_ids distinguishes multiple sentences; the first (and here only) sentence is marked 0 (see the sentence-pair sketch below)
ids, attention_mask, token_type_ids
'''
([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
'''
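The token_type_ids only become informative when two sentences are encoded as a pair; a minimal sketch using two sentences that appear later in this tutorial (the expected output is inferred from the single-sentence encodings, not an actual run):
# Encode a sentence pair: tokens belonging to the second sentence get token_type_ids = 1
pair = tokenizer("弱小的我也有大梦想", "有梦想谁都了不起")
pair["token_type_ids"]
# expected: eleven 0s ([CLS] + sentence A + [SEP]) followed by nine 1s (sentence B + [SEP])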
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs
'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
}
'''
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs
'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
}
'''
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心,比梦想本身,更可贵"]
res = tokenizer(sens)
res
'''
{
'input_ids': [
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102],
[101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102],
[101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]
],
'token_type_ids': [
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
],
'attention_mask': [
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
}
'''
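When a batch is fed straight into a model, padding, truncation, and tensor conversion are usually requested in the same call; a minimal sketch (assumes PyTorch is installed; the max_length of 32 is arbitrary):
# Pad to the longest sentence in the batch, truncate anything over max_length, return PyTorch tensors
batch = tokenizer(sens, padding=True, truncation=True, max_length=32, return_tensors="pt")
batch["input_ids"].shape
# expected: torch.Size([3, 18]), i.e. 3 sentences padded to the length of the longest (18 tokens, see above)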
%%time
# Process one sentence at a time in a loop
for i in range(1000):
    tokenizer(sen)
'''
CPU times: total: 15.6 ms
Wall time: 32.5 ms
'''
%%time
# Process the data as a single batch
res = tokenizer([sen] * 1000)
'''
CPU times: total: 0 ns
Wall time: 6 ms
'''
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer
'''
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer
'''
BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''
%%time
# Process one sentence at a time in a loop
for i in range(10000):
    fast_tokenizer(sen)
'''
CPU times: total: 78.1 ms
Wall time: 363 ms
'''
%%time
# Process one sentence at a time in a loop
for i in range(10000):
    slow_tokenizer(sen)
'''
CPU times: total: 188 ms
Wall time: 877 ms
'''
%%time
# Process the data as a single batch
res = fast_tokenizer([sen] * 10000)
'''
CPU times: total: 266 ms
Wall time: 80.6 ms
'''
%%time
# Process the data as a single batch
res = slow_tokenizer([sen] * 10000)
'''
CPU times: total: 219 ms
Wall time: 738 ms
'''
return_offsets_mapping: in sequence labeling, information extraction, and similar tasks, the raw labels are aligned to the characters of the original text, but positions shift after tokenization; returning offset_mapping tells us which original characters each processed token corresponds to.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs
'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]
}
'''
inputs.word_ids()
'''
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
'''
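As described above, offset_mapping lets character-level annotations be projected onto tokens; a minimal sketch (the character span [0, 2) below is made up for illustration):
# Hypothetical character-level annotation covering the first two characters of sen
char_start, char_end = 0, 2
token_indices = [
    i for i, (s, e) in enumerate(inputs["offset_mapping"])
    if s < char_end and e > char_start and (s, e) != (0, 0)  # (0, 0) marks special tokens
]
token_indices
# expected: [1, 2], the token positions that cover characters 0 and 1 of the original string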
inputs = slow_tokenizer(sen, return_offsets_mapping=True)
'''Raises an error:
NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674
'''
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer
'''
ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''
tokenizer.save_pretrained("chatglm_tokenizer")
'''
('chatglm_tokenizer\\tokenizer_config.json',
'chatglm_tokenizer\\special_tokens_map.json',
'chatglm_tokenizer\\ice_text.model',
'chatglm_tokenizer\\added_tokens.json'
)
'''
tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen)) # same result as the next line
tokenizer.convert_tokens_to_string(tokenizer.encode(sen))
'''
'弱小的我也有大Dreaming!'
'''