
Transformers from Zero to Mastery Tutorial: Tokenizer

[Figure: tokenizer.json]

from transformers import AutoTokenizer

sen = "弱小的我也有很大的梦想!"

1. Loading and Saving

# Load from the HuggingFace Hub: passing the model name loads the corresponding tokenizer
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

'''
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', 
vocab_size=21128, model_max_length=1000000000000000019884624838656, 
is_fast=True, padding_side='right', truncation_side='right', 
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
'cls_token': '[CLS]', 'mask_token': '[MASK]'},
clean_up_tokenization_spaces=True)
'''

# Save the tokenizer to a local directory
tokenizer.save_pretrained("./roberta_tokenizer")

'''
('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')
'''

# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

'''
BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, 
model_max_length=1000000000000000019884624838656, is_fast=True, 
padding_side='right', truncation_side='right', 
special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
'cls_token': '[CLS]', 'mask_token': '[MASK]'}, 
clean_up_tokenization_spaces=True)
'''

2. Sentence Tokenization

tokens = tokenizer.tokenize(sen)
tokens
'''
['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
'''

3. Inspecting the Vocabulary

tokenizer.vocab

'''
{'湾': 3968,
 '訴': 6260,
 '##轶': 19824,
 '洞': 3822,
 ' ̄': 8100,
 '##劾': 14288,
 '##care': 11014,
 'asia': 8339,
 '##嗑': 14679,
 '##鹘': 20965,
 'washington': 12262,
 '##匕': 14321,
 '##樟': 16619,
 '癮': 4628,
 'day3': 11649,
 '##宵': 15213,
 '##弧': 15536,
 '##do': 8828,
 '詭': 6279,
 '3500': 9252,
 '124': 9377,
 '##価': 13957,
 '##玄': 17428,
 '##積': 18005,
 '##肝': 18555,
...
 '##维': 18392,
 '與': 5645,
 '##mark': 9882,
 '偽': 984,
 ...}
'''

tokenizer.vocab_size
# 21128
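
Entries prefixed with "##" in the vocabulary are WordPiece subword pieces: they continue the preceding token instead of starting a new word. A minimal sketch of how an English word might be split (the exact pieces depend on this model's vocabulary, so the output is only illustrative):

print(tokenizer.tokenize("playing"))
# e.g. ['play', '##ing']; the '##' marks a piece that attaches to the one before it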

4. Index Conversion

4.1 Converting a token sequence to an id sequence

# Convert the token sequence to an id sequence
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

'''
[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]
'''
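
Tokens that are not in the vocabulary map to the unknown-token id. A small sketch; the multi-character string below is assumed to be out of vocabulary and is only illustrative:

print(tokenizer.unk_token, tokenizer.unk_token_id)   # the unknown token and its id
print(tokenizer.convert_tokens_to_ids("梦想家们"))     # returns the same id when the token is unknown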

4.2 Converting an id sequence back to tokens

# Convert the id sequence back to a token sequence
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens
'''
['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']
'''

4.3 Converting tokens to a string

# Convert the token sequence back to a string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'''
'弱 小 的 我 也 有 大 梦 想!'
'''

5. Combining the Above Operations

Converting a sentence (string) into an encoding and decoding it back

# Convert the string to an id sequence, also called encoding
ids = tokenizer.encode(sen, add_special_tokens=True) # add_special_tokens=True is the default
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]
'''

# Convert the id sequence back to a string, also called decoding
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen
'''
'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'
'''
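
encode() is essentially the composition of the previous steps: tokenize, convert_tokens_to_ids, then wrap the result with [CLS] and [SEP]. A small sanity-check sketch reusing the objects defined above:

manual_ids = (
    [tokenizer.cls_token_id]
    + tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sen))
    + [tokenizer.sep_token_id]
)
print(manual_ids == tokenizer.encode(sen, add_special_tokens=True))  # True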

  • Skipping special tokens
ids = tokenizer.encode(sen, add_special_tokens=False)
ids
'''
[2483, 2207, 4638, 2769, 738, 3300, 2523, 1920, 4638, 3457, 2682, 8013]
'''

str_sen = tokenizer.decode(ids, skip_special_tokens=True)
str_sen
'''
'弱 小 的 我 也 有 很 大 的 梦 想 !'
'''

6. Padding and Truncation

6.1 Padding

# Padding
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
'''

6.2 Truncation

# Truncation
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids
'''
[101, 2483, 2207, 4638, 102]
'''
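
Padding and truncation are often combined in one call so that every sequence comes out at exactly the same length; a short sketch:

ids = tokenizer.encode(sen, padding="max_length", max_length=15, truncation=True)
# shorter sequences are padded with 0 ([PAD]); longer ones are cut down to max_length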

7. Other Input Fields

ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids
'''
[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]
'''

Below, attention_mask and token_type_ids are built by hand (Section 8, Quick Call Methods, produces them automatically):

  • attention_mask: padded positions should not be attended to, so they are marked 0
  • token_type_ids: distinguishes multiple sentences; the first sentence is marked 0 (see the sentence-pair sketch after the output below)
attention_mask = [1 if idx != 0 else 0 for idx in ids]
# Padded positions should not be attended to, so they are marked 0

token_type_ids = [0] * len(ids)
# Distinguishes multiple sentences; the first sentence is marked 0

ids, attention_mask, token_type_ids

'''
([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
'''
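
token_type_ids mainly matters for sentence pairs: tokens belonging to the second sentence are marked 1. A sketch with two illustrative sentences (the sentences themselves are just examples):

pair = tokenizer("今天天气很好", "适合出去散步")
print(pair["token_type_ids"])
# 0 for the first sentence including its [CLS] and [SEP], 1 for the second sentence and its trailing [SEP]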

8. Quick Call Methods

inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs
'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
}
'''
  • Shortcut usage: call the tokenizer object directly
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
}
'''

9. Processing Batch Data

  • No special handling is needed: just replace the single sentence with a list of sentences
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心,比梦想本身,更可贵"]
res = tokenizer(sens)
res
'''
{
'input_ids': [
    [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102],
    [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102],
    [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]
],
'token_type_ids': [
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
],
'attention_mask': [
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
]
}
'''
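
In practice a batch is usually padded to its longest sequence and returned as framework tensors in the same call. A sketch, assuming PyTorch is installed:

batch = tokenizer(sens, padding=True, truncation=True, max_length=32, return_tensors="pt")
print(batch["input_ids"].shape)
# e.g. torch.Size([3, 18]): 3 sentences, padded to the length of the longest one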

  • Conclusion: for the same amount of data, batch processing is faster than looping over single items
%%time
# Process one sentence at a time in a loop
for i in range(1000):
    tokenizer(sen)

'''
CPU times: total: 15.6 ms
Wall time: 32.5 ms
'''

%%time
# Process as a single batch
res = tokenizer([sen] * 1000)

'''
CPU times: total: 0 ns
Wall time: 6 ms
'''

10. Fast / Slow Tokenizer

fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

'''
BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''

slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer
'''
BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''
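
The repr above already shows it, but you can also check programmatically which implementation you got:

print(fast_tokenizer.is_fast, slow_tokenizer.is_fast)
# True False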

  • Timing comparison
%%time
# Process one sentence at a time in a loop (fast tokenizer)
for i in range(10000):
    fast_tokenizer(sen)

'''
CPU times: total: 78.1 ms
Wall time: 363 ms
'''

%%time
# Process one sentence at a time in a loop (slow tokenizer)
for i in range(10000):
    slow_tokenizer(sen)
'''
CPU times: total: 188 ms
Wall time: 877 ms
'''


%%time
# Process as a single batch (fast tokenizer)
res = fast_tokenizer([sen] * 10000)
'''
CPU times: total: 266 ms
Wall time: 80.6 ms
'''

%%time
# Process as a single batch (slow tokenizer)
res = slow_tokenizer([sen] * 10000)
'''
CPU times: total: 219 ms
Wall time: 738 ms
'''

  • Difference

return_offsets_mapping: in tasks such as sequence labeling and information extraction, the raw labels are aligned strictly to the characters of the original text, but positions shift after tokenization. Returning offset_mapping tells you which original characters each resulting token corresponds to.

inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs
'''
{
'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]
}
'''

inputs.word_ids()
'''
[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]
'''
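
Each (start, end) pair in offset_mapping indexes directly into the original string, so you can recover the characters behind every token; special tokens map to (0, 0). A sketch reusing the inputs above:

for token, (start, end) in zip(
    fast_tokenizer.convert_ids_to_tokens(inputs["input_ids"]),
    inputs["offset_mapping"],
):
    print(token, repr(sen[start:end]))
# e.g. '[CLS]' '', '弱' '弱', '小' '小', ...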

inputs = slow_tokenizer(sen, return_offsets_mapping=True)
''' Raises an error:
NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674
'''

11. Loading Special Tokenizers

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer
'''
ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
'''

tokenizer.save_pretrained("chatglm_tokenizer")
'''
('chatglm_tokenizer\\tokenizer_config.json',
 'chatglm_tokenizer\\special_tokens_map.json',
 'chatglm_tokenizer\\ice_text.model',
 'chatglm_tokenizer\\added_tokens.json'
)
'''

tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)
tokenizer.decode(tokenizer.encode(sen)) # produces the same result
tokenizer.convert_tokens_to_string(tokenizer.encode(sen))
'''
'弱小的我也有大Dreaming!'
'''
