from transformers import AutoTokenizer, AutoConfig

if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Encode a short sentence with the tokenizer's default settings and show
    # the resulting token ids.
    text = "我爱学习"
    tokens = tokenizer.encode(text)
    print("tokens:", tokens)

    # Printed result:
    # tokens: [5, 76202, 63992, 130001, 130004]
import sentencepiece as spm

# Load the SentencePiece model that backs the ChatGLM-6B tokenizer.
sp = spm.SentencePieceProcessor()
sp.load('THUDM/chatglm-6b/ice_text.model')

# Collect one "<id>\t<piece>" line per vocabulary entry, echoing each piece.
save_vocab = []
for piece_id in range(sp.vocab_size()):
    piece = sp.id_to_piece(piece_id)
    save_vocab.append(f"{piece_id}\t{piece}")
    print(piece)

# Write the whole vocabulary in one call; UTF-8 is required for the CJK pieces.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(save_vocab))
| 特殊字符 | token_id | 说明 |
| --- | --- | --- |
| `<n>` | 4 | 换行符(\n) |
| ▁ | 5 | 连接符,标记了一个词的开头 |
| [gMASK] | 130001 | 生成下文用的mask |
| `<sop>` | 130004 | output的开始 |
| `<eop>` | 130005 | output的结尾 |
| `<\|tab\|>` | 130008 | 制表符 |
| `<\|blank_{length}\|>` | 130009-130087 | 每n个(n≥2)连续的空格会被组成一个特殊字符,从 `<\|blank_2\|>` 开始,上限80,即 `<\|blank_80\|>` |
ChatGLM和LLaMA的分词都用了SentencePiece 库,SentencePiece 库的_EncodeAsPiecesBatch 方法返回的每段(每段是用空格分隔的)数据最前面有一个特殊的下划线 ▁,我们称之为连接符,因为 SentencePiece 使用连接符来表示一个词的开始。值得注意的是,它不是普通的下划线(普通的下划线是这样的:_),而是另一个字符 ▁(U+2581)。连接符标记了一个词的开头,这有助于区分连续的词汇。
a.词边界标记:SentencePiece 处理的文本通常没有明确的空格或者其他明显的词边界标记(尤其是在某些亚洲语言中)。使用连接符作为词的前缀可以帮助模型识别词的边界。
b.可逆性:在 SentencePiece 的编码和解码过程中,连接符的使用保证了操作的可逆性。这意味着你可以从编码的子词序列准确地重建原始文本,包括空格和词边界。
from transformers import AutoTokenizer, AutoConfig

if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # get_vocab() maps piece -> id; invert it so ids can be shown as pieces.
    vocab = tokenizer.get_vocab()
    vocab_exchange = {token_id: piece for piece, token_id in vocab.items()}

    # Same words in a different order: compare how the leading piece is split.
    texts = ("苹果我是昨天买的", "我是昨天买的苹果")
    for index, text in enumerate(texts, start=1):
        tokens = tokenizer.encode(text, add_special_tokens=False)
        print(f"tokens{index}:", tokens)
        participles = [vocab_exchange[token] for token in tokens]
        print(f"participles{index}:", participles)

    # Printed result:
    # tokens1: [5, 65319, 65806, 67363, 68543]
    # participles1: ['▁', '苹果', '我是', '昨天', '买的']
    # tokens2: [71232, 67363, 68543, 65319]
    # participles2: ['▁我是', '昨天', '买的', '苹果']
可以看到,第一个例子符合我们前面说的"每段的开头会自动加一个▁";但是第二个例子的▁被融合到了起始的分词中,这是因为在这段的开头加完▁后,能在词典中找到能匹配的'▁我是',根据匹配是长度优先的原则,肯定是选择组合成一个:'▁我是',而不是分成两个:'▁'和'我是'。
from transformers import AutoTokenizer, AutoConfig

if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # get_vocab() maps piece -> id; invert it so ids can be shown as pieces.
    vocab = tokenizer.get_vocab()
    vocab_exchange = {token_id: piece for piece, token_id in vocab.items()}

    texts = (
        # 1: two English words separated by a single space
        "Hello World",
        # 2: a single space inside Chinese text
        "我是 昨天买的苹果",
        # 3: TWO consecutive spaces, which is what yields <|blank_2|> below
        "我是  昨天买的苹果",
    )
    for index, text in enumerate(texts, start=1):
        tokens = tokenizer.encode(text, add_special_tokens=False)
        print(f"tokens{index}:", tokens)
        participles = [vocab_exchange[token] for token in tokens]
        print(f"participles{index}:", participles)

    # Printed result:
    # tokens1: [14833, 398]
    # participles1: ['▁hello', '▁world']
    # tokens2: [71232, 70831, 68543, 65319]
    # participles2: ['▁我是', '▁昨天', '买的', '苹果']
    # tokens3: [71232, 130009, 67363, 68543, 65319]
    # participles3: ['▁我是', '<|blank_2|>', '昨天', '买的', '苹果']
[gMASK]是生成下文用的mask,表示从这里开始往下生成,在训练的时候会先mask掉[gMASK]后面的内容,然后预测后面的内容。ChatGLM的注意力模式是Prefix decoder,也就是下面的第二种,[gMASK]的功能可以理解为分隔input和output,这个到介绍结构时再说。
(3)<sop> 和 <eop>
ChatGLM中的这两个标记分别被当做<bos>(Beginning Of Sentence)和<eos>(Ending Of Sentence)来使用,会被加在output的头尾。
下面看一个例子,数据是训练集中的一行,因为是训练数据所以是有明确的输出作为Ground Truth,训练之前数据预处理的过程就是这样的:
- from transformers import AutoTokenizer, AutoConfig
def preprocess(tokenizer, config, example, max_seq_length):
    """Turn one prompt/target training example into model input ids.

    Args:
        tokenizer: Object whose ``encode`` accepts ``max_length``,
            ``truncation`` and ``add_special_tokens`` and returns a list of
            token ids.
        config: Object exposing ``eos_token_id``.
        example: Mapping with the keys ``"context"`` (the prompt) and
            ``"target"`` (the expected answer).
        max_seq_length: Truncation limit passed to each ``encode`` call; it is
            applied to the prompt and the target separately, not to their sum.

    Returns:
        A dict with ``"input_ids"`` (prompt ids, then target ids, then the EOS
        id) and ``"seq_len"`` (the number of prompt ids).
    """
    prompt = example["context"]
    target = example["target"]

    # The prompt is encoded with the tokenizer's default special-token
    # handling; the target is encoded without special tokens.
    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)
    target_ids = tokenizer.encode(
        target,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False,
    )

    input_ids = prompt_ids + target_ids + [config.eos_token_id]
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}
if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # NOTE(review): device_map is normally a model-loading option; presumably
    # it has no effect on a config object — confirm before relying on it.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, device_map='auto')
    max_seq_length = 200

    # One training row: the prompt and its ground-truth answer.
    example = {
        "context": "你是谁",
        "target": "人家是城堡中的小公主",
    }
    token = preprocess(tokenizer, config, example, max_seq_length)
    print("token:", token)

    # Printed result:
    # token: {'input_ids': [5, 108293, 130001, 130004, 5, 65870, 63829, 75581, 64102, 103559, 130005], 'seq_len': 4}
- from transformers import AutoTokenizer, AutoConfig
def preprocess(tokenizer, config, example, max_seq_length):
    """Turn one prompt/target training example into model input ids.

    Args:
        tokenizer: Object whose ``encode`` accepts ``max_length``,
            ``truncation`` and ``add_special_tokens`` and returns a list of
            token ids.
        config: Object exposing ``eos_token_id``.
        example: Mapping with the keys ``"context"`` (the prompt) and
            ``"target"`` (the expected answer).
        max_seq_length: Truncation limit passed to each ``encode`` call; it is
            applied to the prompt and the target separately, not to their sum.

    Returns:
        A dict with ``"input_ids"`` (prompt ids, then target ids, then the EOS
        id) and ``"seq_len"`` (the number of prompt ids).
    """
    prompt = example["context"]
    target = example["target"]

    # The prompt is encoded with the tokenizer's default special-token
    # handling; the target is encoded without special tokens.
    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)
    target_ids = tokenizer.encode(
        target,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False,
    )

    input_ids = prompt_ids + target_ids + [config.eos_token_id]
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}
if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # NOTE(review): device_map is normally a model-loading option; presumably
    # it has no effect on a config object — confirm before relying on it.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, device_map='auto')
    max_seq_length = 200

    # The target deliberately mixes THREE consecutive spaces, a newline and a
    # tab so the output shows <|blank_3|> (130010), <n> (4) and <|tab|> (130008).
    example = {
        "context": "你要干什么",
        "target": "小公主   我们来玩吧\nHAHA\tHAHA",
    }
    token = preprocess(tokenizer, config, example, max_seq_length)
    print("token:", token)

    # Printed result:
    # token: {'input_ids': [85117, 72675, 130001, 130004, 5, 103559, 130010, 63869, 111415, 63956, 4, 26650, 130008, 26650, 130005], 'seq_len': 4}
- ...
- "remove_space": false,
- "do_lower_case": true,
- ...
def preprocess_text(self, inputs):
    """Normalize raw text according to the tokenizer's configuration flags.

    Args:
        inputs: The text to normalize.

    Returns:
        The text with leading/trailing whitespace stripped and every internal
        whitespace run collapsed to one space when ``self.remove_space`` is
        truthy, and lower-cased when ``self.do_lower_case`` is truthy.
    """
    if self.remove_space:
        # split() with no argument splits on any whitespace run.
        outputs = " ".join(inputs.strip().split())
    else:
        outputs = inputs

    if self.do_lower_case:
        outputs = outputs.lower()

    return outputs
\n替换成<n>;\t替换成<|tab|>;连续2个及以上的空格被替换成<|blank_{length}|>(单个空格不会被替换),{length}是连续空格的个数,最多到80。值得注意的是,虽然80这个值是一个参数(max_len),但是只能小于等于80,因为词典中没有超过80的token。
@staticmethod
def _encode_whitespaces(text: str, max_len: int = 80):
    """Replace tabs and runs of spaces with their dedicated special tokens.

    Args:
        text: The text to encode.
        max_len: Longest run of spaces that is mapped to a single blank token.

    Returns:
        The text with every tab replaced by ``SPTokenizer.get_tab_token()``
        and every run of 2..``max_len`` spaces replaced by
        ``SPTokenizer.get_blank_token(run_length)``. Single spaces are left
        untouched.
    """
    # Replace tabs.
    text = text.replace("\t", SPTokenizer.get_tab_token())
    # Replace runs of spaces, longest first, so a long run is not consumed
    # piecemeal by the tokens for shorter runs. The loop stops at 2.
    for i in range(max_len, 1, -1):
        text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
    return text
- def _preprocess(self, text: str, linebreak=True, whitespaces=True):
- if linebreak:
- # 替换回车
- text = text.replace("\n", "<n>")
- if whitespaces:
- text = self._encode_whitespaces(text, max_len=self.max_blank_length)
- return text
上面的处理之后,调用sentencepiece的EncodeAsIds()方法生成token,特殊的下划线就是这个时候拼上的。sentencepiece还是值得研究一下的,ice_text.model也是使用它训练的,从词典能看出来,用的是BPE (Byte Pair Encoding)算法。
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequences by appending
    special tokens. The resulting layout is:

    - single sequence: `X <gmask> <bos>`
    - pair of sequences: `A <gmask> <bos> B <eos>`

    where `<gmask>`, `<bos>` and `<eos>` are the ids that `self.sp_tokenizer`
    returns for `self.gmask_token`, `self.bos_token` and `self.eos_token`.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: A new list of input IDs with the special tokens added;
        the argument lists are not modified.
    """
    gmask_id = self.sp_tokenizer[self.gmask_token]
    eos_id = self.sp_tokenizer[self.eos_token]
    # Concatenation (not +=) builds a new list and leaves the caller's intact.
    token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
    if token_ids_1 is not None:
        token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
    return token_ids_0
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequences by
    concatenating them.

    This implementation does not add special tokens and this method should be
    overridden in a subclass.

    Args:
        token_ids_0 (`List[int]`): The first tokenized sequence.
        token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

    Returns:
        `List[int]`: `token_ids_0` itself when no second sequence is given,
        otherwise a new list holding both sequences back to back.
    """
    if token_ids_1 is None:
        return token_ids_0
    return token_ids_0 + token_ids_1
from transformers import AutoTokenizer, AutoConfig

if __name__ == "__main__":
    model_name = "THUDM/chatglm-6b"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # get_vocab() maps piece -> id; invert it so ids can be shown as pieces.
    vocab = tokenizer.get_vocab()
    vocab_exchange = {token_id: piece for piece, token_id in vocab.items()}

    # Show the pieces behind a hand-written id sequence, then decode it back
    # to plain text.
    tokens = [5, 19316, 932]
    participles = [vocab_exchange[token] for token in tokens]
    print("participles:", participles)
    decode_tokens = tokenizer.decode(tokens)
    print("decode_tokens:", decode_tokens)

    # Printed result:
    # participles: ['▁', '▁Hello', '▁World']
    # decode_tokens: Hello World
现在还有一个问题,词典(ice_text.model)是怎么生成的,ChatGLM和LLaMA其实都使用了sentencepiece包中的BPE,sentencepiece实现了BPE (Byte Pair Encoding)、Unigram、Word和Char四种算法,那这四种算法是什么,最终为什么选择BPE,因为篇(lan)幅(de)有(xie)限(le)以后会单独说。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。