The full script: train a SentencePiece BPE tokenizer on a Chinese corpus, wrap it as a transformers LlamaTokenizer, merge it into an existing LLaMA/CodeLlama vocabulary, and test the result.

#coding=utf8

# Train the tokenizer
def train():
    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        input="/home/chatglm6/sentences_vocab/tianlongbabu.txt",
        model_prefix="tokenizer",
        vocab_size=50000,
        user_defined_symbols=['foo', 'bar'],
        character_coverage=1.0,
        model_type="bpe",
    )


# Basic SentencePiece usage
def practice_sentencepiece():
    import sentencepiece as spm
    sp_model = spm.SentencePieceProcessor()
    sp_model.Load("./tokenizer.model")
    # Encode
    print(sp_model.EncodeAsPieces("你好是一个汉语词语"))
    print(sp_model.EncodeAsIds("你好是一个汉语词语"))
    # Decode
    print(sp_model.DecodePieces(['▁', '你好', '是', '一个', '汉', '语', '词', '语']))
    print(sp_model.Decode([46706, 2382, 46699, 21, 47120, 47105, 48432, 47105]))

# practice_sentencepiece()


# Load the SentencePiece model with a transformers tokenizer class
def tf_practice():
    import os
    from transformers import LlamaTokenizer
    from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
    import sentencepiece as spm

    chinese_sp_model_file = "./tokenizer.model"
    # Load the tokenizer model
    chinese_sp_model = spm.SentencePieceProcessor()
    chinese_sp_model.Load(chinese_sp_model_file)
    chinese_spm = sp_pb2_model.ModelProto()
    chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())  # serialize

    ## Save
    output_dir = './transformers_tokenizer/chinese/'
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir + 'chinese.model', 'wb') as f:
        f.write(chinese_spm.SerializeToString())  # save the model

    # Load with transformers and save in the transformers format
    tokenizer = LlamaTokenizer(vocab_file=output_dir + 'chinese.model')
    tokenizer.save_pretrained(output_dir)

    # Test: reload in the transformers format
    chinese_tokenizer = LlamaTokenizer.from_pretrained(output_dir)
    print(tokenizer.all_special_tokens)
    print(tokenizer.all_special_ids)
    print(tokenizer.special_tokens_map)
    text = '''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
    print("Test text:\n", text)
    print(f"Tokenized by Chinese-LLaMA tokenizer:{chinese_tokenizer.tokenize(text)}")

# tf_practice()


# Merge the Chinese tokenizer into an existing LLaMA tokenizer
def combine():
    import os
    from transformers import LlamaTokenizer
    from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
    import sentencepiece as spm

    llama_tokenizer_dir = "/home/llama_/CodeLlama-7b/tokenizer.model"
    chinese_sp_model_file = "./tokenizer.model"

    # load
    llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)  # existing tokenizer
    chinese_sp_model = spm.SentencePieceProcessor()
    chinese_sp_model.Load(chinese_sp_model_file)  # new tokenizer
    llama_spm = sp_pb2_model.ModelProto()
    llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())  # serialize the existing tokenizer
    chinese_spm = sp_pb2_model.ModelProto()
    chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())  # serialize the new tokenizer

    # print number of tokens
    print(len(llama_tokenizer), len(chinese_sp_model))
    print(llama_tokenizer.all_special_tokens)
    print(llama_tokenizer.all_special_ids)
    print(llama_tokenizer.special_tokens_map)

    ## Add Chinese tokens to LLaMA tokenizer
    llama_spm_tokens_set = set(p.piece for p in llama_spm.pieces)  # existing pieces
    print(len(llama_spm_tokens_set))
    print(f"Before:{len(llama_spm_tokens_set)}")
    for p in chinese_spm.pieces:  # add the Chinese pieces to the existing tokenizer
        piece = p.piece
        if piece not in llama_spm_tokens_set:
            new_p = sp_pb2_model.ModelProto().SentencePiece()  # create a new piece
            new_p.piece = piece
            new_p.score = 0
            llama_spm.pieces.append(new_p)  # append it to the piece list
    print(f"New model pieces: {len(llama_spm.pieces)}")

    ## Save
    output_sp_dir = 'transformers_tokenizer/llama_chinese'
    output_hf_dir = 'transformers_tokenizer/llama_chinese'  # the path to save the Chinese-LLaMA tokenizer
    os.makedirs(output_sp_dir, exist_ok=True)
    with open(output_sp_dir + '/chinese_llama.model', 'wb') as f:
        f.write(llama_spm.SerializeToString())  # serialize and save the merged model
    tokenizer = LlamaTokenizer(vocab_file=output_sp_dir + '/chinese_llama.model')  # load the merged model
    tokenizer.save_pretrained(output_hf_dir)  # and save it in the transformers format
    print(f"Chinese-LLaMA tokenizer has been saved to {output_hf_dir}")

    # Test
    llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)
    chinese_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)
    print(tokenizer.all_special_tokens)
    print(tokenizer.all_special_ids)
    print(tokenizer.special_tokens_map)
    text = '''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
    print("Test text:\n", text)
    print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}")
    print(f"Tokenized by Chinese-LLaMA tokenizer:{chinese_llama_tokenizer.tokenize(text)}")

# combine()


# Use the merged tokenizer on its own
def test():
    from transformers import LlamaTokenizer
    output_hf_dir = 'transformers_tokenizer/llama_chinese'
    tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)
    print(tokenizer.all_special_tokens)
    print(tokenizer.all_special_ids)
    print(tokenizer.special_tokens_map)
    text = '''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
    print("Test text:\n", text)
    print(f"Tokenized by Chinese-LLaMA tokenizer:{tokenizer.tokenize(text)}")
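Because combine() only appends pieces that are missing from the LLaMA vocabulary, every original token keeps its id and the merged tokenizer is a strict superset. A minimal sanity-check sketch, reusing the paths from combine() above and assuming combine() has already been run:

from transformers import LlamaTokenizer

# Paths reused from combine() above; adjust to your environment.
llama_tokenizer = LlamaTokenizer.from_pretrained("/home/llama_/CodeLlama-7b/tokenizer.model")
merged_tokenizer = LlamaTokenizer.from_pretrained("transformers_tokenizer/llama_chinese")

# New pieces were appended after the original vocabulary, so every
# original id should still map to the same piece.
for token_id in range(llama_tokenizer.vocab_size):
    assert llama_tokenizer.convert_ids_to_tokens(token_id) == merged_tokenizer.convert_ids_to_tokens(token_id)

print("original ids preserved; merged vocab size:", merged_tokenizer.vocab_size)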
The next script wires the same SentencePiece model into a custom tokenizer class, NewChineseTokenizer (defined in tokenization.py below), and loads it both by importing the class directly and through AutoTokenizer.

# Build your own tokenizer class
def make_self_tokenizer():
    import os
    from sentences_vocab.tokenization import NewChineseTokenizer
    import sentencepiece as spm
    from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

    chinese_sp_model_file = "./tokenizer.model"
    # Load the tokenizer model
    chinese_sp_model = spm.SentencePieceProcessor()
    chinese_sp_model.Load(chinese_sp_model_file)
    chinese_spm = sp_pb2_model.ModelProto()
    chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto())  # serialize

    ## Save
    output_dir = './transformers_tokenizer/news/'
    os.makedirs(output_dir, exist_ok=True)
    with open(output_dir + 'chinese.model', 'wb') as f:
        f.write(chinese_spm.SerializeToString())  # save the model

    # Load with the custom class and save in the transformers format
    tokenizer = NewChineseTokenizer(vocab_file=output_dir + "chinese.model")
    tokenizer.save_pretrained(output_dir)

# make_self_tokenizer()


def test():
    # Import the class manually and load the saved tokenizer
    from sentences_vocab.tokenization import NewChineseTokenizer
    output_dir = './transformers_tokenizer/news/'
    chinese_tokenizer = NewChineseTokenizer.from_pretrained(output_dir)
    text = '''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
    print("Test text:\n", text)
    print(f"Tokenized by Chinese-LLaMA tokenizer:{chinese_tokenizer.tokenize(text)}")

test()  # build and test your own tokenizer
print("======================================")


def test1():
    '''
    Edit tokenizer_config.json so that AutoTokenizer can locate the custom class.
    ChatGLM uses this pattern:
    {
      "name_or_path": "",
      "remove_space": false,
      "do_lower_case": false,
      "tokenizer_class": "ChatGLMTokenizer",
      "auto_map": {
        "AutoTokenizer": [
          "tokenization_chatglm.ChatGLMTokenizer",
          null
        ]
      }
    }

    The config written by save_pretrained() starts out as:
    {
      "add_bos_token": true,
      "add_eos_token": false,
      "added_tokens_decoder": {
        ...
      "tokenizer_class": "NewChineseTokenizer",
      "unk_token": "<unk>"
    }

    After inserting an auto_map entry adapted for our class (tokenization.py must
    sit next to the config in the output directory):
    {
      "add_bos_token": true,
      "add_eos_token": false,
      "added_tokens_decoder": {
        ...
      "tokenizer_class": "NewChineseTokenizer",
      "unk_token": "<unk>",
      "auto_map": {
        "AutoTokenizer": [
          "tokenization.NewChineseTokenizer",
          null
        ]
      }
    }
    '''
    # Automatic loading via AutoTokenizer
    from transformers import AutoTokenizer
    output_dir = './transformers_tokenizer/news/'
    chinese_tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
    text = '''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。
The primary use of LLaMA is research on large language models, including'''
    print("Test text:\n", text)
    print(f"Tokenized by Chinese-LLaMA tokenizer:{chinese_tokenizer.tokenize(text)}")

# test1()
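Hand-editing tokenizer_config.json works, but as a sketch of an alternative (assuming a transformers version whose tokenizer classes support register_for_auto_class), the custom class can register itself so that save_pretrained() writes the auto_map entry and copies tokenization.py into the output directory automatically:

# Sketch: register the custom class for AutoTokenizer instead of editing the config by hand.
from sentences_vocab.tokenization import NewChineseTokenizer

NewChineseTokenizer.register_for_auto_class("AutoTokenizer")

output_dir = './transformers_tokenizer/news/'
tokenizer = NewChineseTokenizer(vocab_file=output_dir + "chinese.model")
tokenizer.save_pretrained(output_dir)  # should now write "auto_map" and copy tokenization.py

# Afterwards the class no longer has to be imported manually:
from transformers import AutoTokenizer
chinese_tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
print(chinese_tokenizer.tokenize("白日依山尽,黄河入海流。"))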
tokenization.py: this is simply the LLaMA tokenizer with the class renamed.
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization classes for LLaMA."""
import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

# PRETRAINED_VOCAB_FILES_MAP = {
#     "vocab_file": {
#         "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
#     },
#     "tokenizer_file": {
#         "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
#     },
# }
# PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
#     "hf-internal-testing/llama-tokenizer": 2048,
# }


class NewChineseTokenizer(PreTrainedTokenizer):
    """
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
'''Reuse an already-trained tokenizer'''
# Only the entries in the config file need adjusting.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("../chatglm3-6b", trust_remote_code=True)
print(tokenizer.encode("白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including"))
print(tokenizer.tokenize("白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including"))

import os
output_dir = './transformers_tokenizer/news1/'
os.makedirs(output_dir, exist_ok=True)
# tokenizer.save_pretrained(output_dir)
# For reasons that are not clear, the special-token entries have to be removed
# from the saved tokenizer_config.json before the copy can be loaded.
tokenizer1 = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=True)
print(tokenizer1.encode("白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including"))
print(tokenizer1.tokenize("白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including"))
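As a quick consistency check (a sketch reusing the tokenizer and tokenizer1 objects created above), the copy saved to ./transformers_tokenizer/news1/ should reproduce the original encodings exactly:

# Sketch: the trimmed copy should behave exactly like the original ChatGLM tokenizer.
text = "白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including"
assert tokenizer.encode(text) == tokenizer1.encode(text)
assert tokenizer.tokenize(text) == tokenizer1.tokenize(text)
print("copied tokenizer matches ../chatglm3-6b")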