赞
踩
git clone --recursive https://github.com/QwenLM/qwen.cpp && cd qwen.cpp
git submodule update --init --recursive
python3 qwen_cpp/convert.py -i /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat -t q4_0 -o qwen7b-ggml.bin
cmake -B build
cmake --build build -j --config Release
./build/bin/main -m ./qwen7b-ggml.bin --tiktoken /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/qwen.tiktoken -p 你好
将CMakeLists.txt的Release改成Debug
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Debug)
endif ()
cmake --build build -j --config Debug
gdb ./build/bin/main
问题:
https://github.com/QwenLM/qwen.cpp/pull/40
139行assert中的!=应改为==,否则在debug模式下无法正常运行
"""
Convert Hugging Face Qwen models to GGML format

The script loads a Qwen checkpoint and its tokenizer through ``transformers``,
writes a small binary header (magic + config integers) and then dumps every
weight listed in ``QwenConverter.dump_model``. 2-D weights are written in the
requested GGML type (optionally quantized); 1-D weights are always written as
float32.
"""
import argparse
import platform
import struct
import sys
from enum import Enum
from pathlib import Path
from typing import BinaryIO

import torch
from tabulate import tabulate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Number of weights per quantization block for each GGML quantized type.
GGML_QK8_0 = 32
GGML_QK4_0 = 32
GGML_QK4_1 = 32
GGML_QK5_0 = 32
GGML_QK5_1 = 32

# Tensor data is written at file offsets that are multiples of this value.
GGML_MEM_ALIGN = 16


if platform.system() == "Darwin":
    # cpm_kernels doesn't support macOS but transformers will check missing packages, so mock it
    sys.modules["cpm_kernels"] = object()


class GGMLType(Enum):
    """Type ids written into the output file for each tensor."""

    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8


def quantize_q8_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor to 8 bits in blocks of GGML_QK8_0 values.

    Each output row is one block: the fp16 scale reinterpreted as two int8
    bytes, followed by the 32 quantized int8 values.
    """
    # equivalent to ggml_quantize_q8_0 in ggml.c
    assert tensor.shape[1] % GGML_QK8_0 == 0
    tensor = tensor.view(-1, GGML_QK8_0)
    scale = tensor.abs().max(dim=-1, keepdim=True).values / ((1 << 7) - 1)
    tensor = (tensor / scale).round().clamp(min=-128, max=127).char()
    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1)
    return tensor


def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor to 4 bits in blocks of GGML_QK4_0 values.

    Each output row is one block: the fp16 scale reinterpreted as two int8
    bytes, followed by 16 bytes that each hold two 4-bit values.
    """
    # equivalent to ggml_quantize_q4_0 in ggml.c (GGML_QK4_0 = 32)
    assert tensor.shape[1] % GGML_QK4_0 == 0
    # reshape into a 2-D tensor with one quantization block per row
    tensor = tensor.view(-1, GGML_QK4_0)
    # index of the element with the largest absolute value in each row
    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices
    # gather the (signed) values at those indices, one per row
    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
    scale = max_values / -8
    tensor = (tensor / scale + 8).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8
    tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1)
    return tensor


def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor to 4 bits with a per-block minimum.

    Each output row is one block: fp16 scale (2 bytes), fp16 minimum (2 bytes),
    then 16 bytes that each hold two 4-bit values.
    """
    # equivalent to ggml_quantize_q4_1 in ggml.c
    assert tensor.shape[1] % GGML_QK4_1 == 0
    tensor = tensor.view(-1, GGML_QK4_1)
    min_vals = tensor.min(dim=-1, keepdim=True).values
    max_vals = tensor.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 4) - 1)
    tensor = ((tensor - min_vals) / scale).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8
    tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
    # add scale & min into each block
    tensor = torch.cat((scale.half().view(torch.int8), min_vals.half().view(torch.int8), tensor), dim=-1)
    return tensor


def quantize_q5_0(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor to 5 bits in blocks of GGML_QK5_0 values.

    Each output row is one block: fp16 scale (2 bytes), a 32-bit mask holding
    the fifth bit of every value (4 bytes), then 16 bytes that each hold the
    low four bits of two values.
    """
    # equivalent to ggml_quantize_q5_0 in ggml.c
    assert tensor.shape[1] % GGML_QK5_0 == 0
    tensor = tensor.view(-1, GGML_QK5_0)
    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices
    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
    scale = max_values / -16
    tensor = (tensor / scale + 16).round().clamp(min=0, max=31).char()
    # Pair column j with column j + 16. The second operand must slice columns
    # (tensor[:, 16:]); slicing rows here yields a shape that cannot be OR-ed
    # with the (N, 16) first operand.
    qs = (tensor[:, :16] & 0x0F) | (tensor[:, 16:] << 4)
    # collect bit 4 of each of the 32 values into one int32 per block
    qh = torch.zeros(tensor.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((tensor[:, i] & 0x10) >> 4).int() << i
    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1)
    return tensor


def quantize_q5_1(tensor: torch.Tensor) -> torch.CharTensor:
    """Quantize a 2-D float tensor to 5 bits with a per-block minimum.

    Each output row is one block: fp16 scale (2 bytes), fp16 minimum (2 bytes),
    a 32-bit mask holding the fifth bit of every value (4 bytes), then 16 bytes
    that each hold the low four bits of two values.
    """
    # equivalent to ggml_quantize_q5_1 in ggml.c
    assert tensor.shape[1] % GGML_QK5_1 == 0
    tensor = tensor.view(-1, GGML_QK5_1)
    min_vals = tensor.min(dim=-1, keepdim=True).values
    max_vals = tensor.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 5) - 1)
    tensor = ((tensor - min_vals) / scale).round().clamp(min=0, max=31).char()
    qs = (tensor[:, :16] & 0x0F) | (tensor[:, 16:] << 4)
    # collect bit 4 of each of the 32 values into one int32 per block
    qh = torch.zeros(tensor.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((tensor[:, i] & 0x10) >> 4).int() << i
    # add scale & min into each block
    tensor = torch.cat(
        (scale.half().view(torch.int8), min_vals.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1
    )
    return tensor


def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
    """Write one named tensor to ``f`` in the requested GGML type.

    Layout written: name length, name bytes, ndim, each dimension, the GGML
    type id (all integers packed with struct format "i"), padding up to the
    next GGML_MEM_ALIGN boundary, then the raw tensor data.

    Args:
        f: seekable binary file object opened for writing.
        name: tensor name stored in the file.
        tensor: float32 tensor to dump (asserted).
        ggml_type: target storage type.

    Raises:
        NotImplementedError: if ``ggml_type`` is not one of the handled types.
    """
    assert tensor.dtype == torch.float32

    # tensor name
    f.write(struct.pack("i", len(name.encode())))
    f.write(name.encode())

    # tensor shape & dtype (the shape recorded is the un-quantized shape)
    f.write(struct.pack("i" * (2 + tensor.ndim), tensor.ndim, *tensor.shape, ggml_type.value))

    # tensor data
    if ggml_type == GGMLType.F32:
        tensor = tensor.float()
    elif ggml_type == GGMLType.F16:
        tensor = tensor.half()
    elif ggml_type == GGMLType.Q8_0:
        tensor = quantize_q8_0(tensor)
    elif ggml_type == GGMLType.Q4_0:
        tensor = quantize_q4_0(tensor)
    elif ggml_type == GGMLType.Q4_1:
        tensor = quantize_q4_1(tensor)
    elif ggml_type == GGMLType.Q5_0:
        tensor = quantize_q5_0(tensor)
    elif ggml_type == GGMLType.Q5_1:
        tensor = quantize_q5_1(tensor)
    else:
        raise NotImplementedError(f"Cannot dump tensor of dtype {tensor.dtype}")

    # align address
    aligned_pos = (f.tell() + (GGML_MEM_ALIGN - 1)) // GGML_MEM_ALIGN * GGML_MEM_ALIGN
    f.seek(aligned_pos)
    tensor.numpy().tofile(f)


# Debugging note: an observed model.state_dict() for Qwen-7B-Chat is an
# OrderedDict of float16 tensors keyed by parameter name (values abridged):
#   transformer.wte.weight
#   transformer.h.0.ln_1.weight
#   transformer.h.0.attn.c_attn.weight
#   transformer.h.0.attn.c_attn.bias
#   transformer.h.0.attn.c_proj.weight
#   transformer.h.0.ln_2.weight
#   transformer.h.0.mlp.w1.weight
#   transformer.h.0.mlp.w2.weight
#   transformer.h.0.mlp.c_proj.weight
#   transformer.h.1.ln_1.weight
#   transformer.h.1.attn.c_attn.weight
#   transformer.h.1.attn.c_attn.bias
#   transformer.h.1.attn.c_proj.weight
#   transformer.h.1.ln_2.weight
#   ...


def dump_state_dict(f, weight_names, state_dict, ggml_type):
    """Dump the selected weights to ``f`` and print a summary table.

    Args:
        f: seekable binary file object opened for writing.
        weight_names: names of the tensors to dump, in output order.
        state_dict: mapping from tensor name to tensor.
        ggml_type: GGML type used for 2-D weights; 1-D weights are always F32.
    """
    tensor_info = []
    for name in tqdm(weight_names, desc="Processing model states"):
        tensor = state_dict[name]
        if tensor.ndim == 2:
            # 2d weight: should quantize it if needed

            # step 1: de-quantize it back to float32
            tensor = tensor.float()

            # step 2: quantize it into ggml format
            tensor_ggml_type = ggml_type
        else:
            # 1d weight: convert it to float32
            assert tensor.ndim == 1
            tensor = tensor.float()
            tensor_ggml_type = GGMLType.F32

        dump_tensor(f, name, tensor, tensor_ggml_type)
        # e.g. ('transformer.wte.weight', torch.Size([151936, 4096]), 'Q4_0')
        tensor_info.append((name, tensor.shape, tensor_ggml_type.name))

    print(tabulate(tensor_info, headers=["name", "shape", "dtype"], tablefmt="psql"))


# Debugging notes from loading Qwen-7B-Chat with trust_remote_code=True:
#
# Remote code is cached under
#   /root/.cache/huggingface/modules/transformers_modules/Qwen-7B-Chat/
# configuration_qwen.py leads into modeling_qwen.py, ending at
# QWenLMHeadModel; the config comes from
#   /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/config.json
#
# The two important mapping statements in QWenLMHeadModel:
#   self.transformer = QWenModel(config)
#   self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
#
# model:
#   QWenLMHeadModel(
#     (transformer): QWenModel(
#       (wte): Embedding(151936, 4096)
#       (drop): Dropout(p=0.0, inplace=False)
#       (rotary_emb): RotaryEmbedding()
#       (h): ModuleList(
#         (0-31): 32 x QWenBlock(
#           (ln_1): RMSNorm()
#           (attn): QWenAttention(
#             (c_attn): Linear(in_features=4096, out_features=12288, bias=True)
#             (c_proj): Linear(in_features=4096, out_features=4096, bias=False)
#             (attn_dropout): Dropout(p=0.0, inplace=False)
#           )
#           (ln_2): RMSNorm()
#           (mlp): QWenMLP(
#             (w1): Linear(in_features=4096, out_features=11008, bias=False)
#             (w2): Linear(in_features=4096, out_features=11008, bias=False)
#             (c_proj): Linear(in_features=11008, out_features=4096, bias=False)
#           )
#         )
#       )
#       (ln_f): RMSNorm()
#     )
#     (lm_head): Linear(in_features=4096, out_features=151936, bias=False)
#   )
#
# tokenizer (configuration of the tokenizer), implemented in
#   /root/.cache/huggingface/modules/transformers_modules/Qwen-7B-Chat/tokenization_qwen.py
#   vocab_file: '/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/qwen.tiktoken'
#   QWenTokenizer(name_or_path='/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat',
#                 vocab_size=151851, model_max_length=8192, is_fast=False,
#                 padding_side='right', truncation_side='right',
#                 special_tokens={}, clean_up_tokenization_spaces=True),
#                 added_tokens_decoder={ }


class QwenConverter:
    """Serializes a loaded Qwen model (header, config, weights) to a file."""

    @classmethod
    def convert(cls, f, model, tokenizer, ggml_type):
        """Write the magic bytes, the config block and all model weights to ``f``."""
        f.write(b"ggml")  # magic
        cls.dump_config(f, model.config, model.generation_config, tokenizer, ggml_type)
        cls.dump_model(f, model, ggml_type)

    @staticmethod
    def dump_config(f, config, generation_config, tokenizer, ggml_type):
        """Write the model configuration to ``f`` as packed binary integers."""
        config_values = [
            ggml_type.value,
            config.vocab_size,
            config.hidden_size,
            config.num_attention_heads,
            # NOTE(review): num_attention_heads is written twice; presumably the
            # reader expects a separate KV-head count in this slot - confirm.
            config.num_attention_heads,
            config.num_hidden_layers,
            config.intermediate_size,
            config.seq_length,
            generation_config.eos_token_id,
            generation_config.pad_token_id,
            tokenizer.im_start_id,
            tokenizer.im_end_id,
        ]
        f.write(struct.pack("i" * len(config_values), *config_values))

    @staticmethod
    def dump_model(f, model, ggml_type):
        """Write the model weights to ``f`` in a fixed order.

        The order is: token embedding, then for every layer i in
        range(num_hidden_layers) the eight ``transformer.h.{i}.*`` tensors
        listed below, then the final norm and the LM head.
        """
        weight_names = ["transformer.wte.weight"]
        for i in range(model.config.num_hidden_layers):
            weight_names += [
                f"transformer.h.{i}.ln_1.weight",
                f"transformer.h.{i}.attn.c_attn.weight",
                f"transformer.h.{i}.attn.c_attn.bias",
                f"transformer.h.{i}.attn.c_proj.weight",
                f"transformer.h.{i}.ln_2.weight",
                f"transformer.h.{i}.mlp.w1.weight",
                f"transformer.h.{i}.mlp.w2.weight",
                f"transformer.h.{i}.mlp.c_proj.weight",
            ]
        weight_names += [
            "transformer.ln_f.weight",
            "lm_head.weight",
        ]
        dump_state_dict(f, weight_names, model.state_dict(), ggml_type)


def convert(f: BinaryIO, model_name_or_path: str, dtype: str = "q4_0"):
    """Load a Qwen model and tokenizer and write them to ``f`` in GGML format.

    Args:
        f: binary file object opened for writing.
        model_name_or_path: model name or local path passed to ``from_pretrained``.
        dtype: name of a ``GGMLType`` member, case-insensitive (e.g. "q4_0").

    Raises:
        KeyError: if ``dtype`` does not name a ``GGMLType`` member.
    """
    # Upper-case the name to look up the enum member, e.g. "q4_0" -> <GGMLType.Q4_0: 2>
    ggml_type = GGMLType[dtype.upper()]

    # AutoTokenizer is the generic tokenizer class. A tokenizer splits a
    # continuous character stream into words, sub-words or other basic units so
    # that later processing can work on meaningful units instead of raw
    # characters. Both the pretrained tokenizer (AutoTokenizer) and the language
    # model (AutoModelForCausalLM) are loaded from the given path or name.
    # Building the model and its config this way is less simple than in llama2.c.
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)

    QwenConverter.convert(f, model, tokenizer, ggml_type)


def main():
    """Parse command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser("qwen-convert")
    # The default below is a local checkout; an earlier version of this script
    # used the hub name "Qwen/Qwen-7B-Chat" as the default instead.
    parser.add_argument(
        "-i",
        "--model_name_or_path",
        default="/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat",
        type=str,
        help="Model name or path used in AutoModel.from_pretrained",
    )
    parser.add_argument(
        "-o", "--save_path", default="qwen7b-ggml.bin", type=Path, help="Path to save the generated GGML model"
    )
    parser.add_argument(
        "-t",
        "--type",
        default="q4_0",
        type=str,
        choices=["f32", "f16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"],
        help="GGML model quantization type",
    )
    args = parser.parse_args()

    # f is the output file, e.g. qwen7b-ggml.bin
    with open(args.save_path, "wb") as f:
        convert(f, args.model_name_or_path, dtype=args.type)

    print(f"GGML model saved to {args.save_path}")


if __name__ == "__main__":
    main()

# Summary: read the model and its config and quantize on the fly. With the
# default q4_0 type, 2-D weights are quantized to int4 and packed in pairs into
# int8; 1-D weights are written directly as fp32.
export1.py
https://github.com/karpathy/llama2.c/blob/de005474d37d0cde1356739b8c79ebe7b42b5973/export_meta_llama_bin.py
# Download the model: fetch the 'shakechen/Llama-2-7b' snapshot via ModelScope,
# using the given directory as the cache location. The call's return value is
# kept in model_dir (presumably the local path of the snapshot - confirm
# against the ModelScope documentation).
from modelscope import snapshot_download
model_dir = snapshot_download('shakechen/Llama-2-7b',cache_dir='/mnt/workspace/llama2.c/llama2pth')
def concat_weights(models):
    """Merge a list of checkpoint shards into a single state dict.

    For every tensor name in the first shard: if there is only one shard or the
    tensor is 1-D, the first shard's tensor is used unchanged. Otherwise the
    shards' tensors are concatenated along dim 1 for names starting with
    'tok_embeddings.' or ending with '.attention.wo.weight' /
    '.feed_forward.w2.weight', and along dim 0 for everything else; the
    concatenated entries are then deleted from every shard dict.

    Args:
        models: non-empty list of dicts mapping tensor name to tensor.

    Returns:
        dict mapping each tensor name to the merged tensor.
    """
    dim1_suffixes = ('.attention.wo.weight', '.feed_forward.w2.weight')
    merged = {}
    for name in list(models[0]):
        shards = [shard[name] for shard in models]
        if len(shards) == 1 or len(shards[0].shape) == 1:
            # nothing to concatenate: keep the first shard's tensor as-is
            merged[name] = shards[0]
        else:
            if name.startswith('tok_embeddings.') or name.endswith(dim1_suffixes):
                cat_dim = 1
            else:
                cat_dim = 0
            merged[name] = torch.cat(shards, dim=cat_dim)
            # drop the per-shard copies once they have been merged
            for shard in models:
                del shard[name]
    return merged


def load_and_export(model_path, output_path):
    """Load params.json and all 'consolidated.*.pth' shards from model_path.

    Args:
        model_path: directory containing params.json and the .pth shards.
        output_path: not used by the code shown here.
            NOTE(review): presumably the full script goes on to write the
            merged weights to this path - confirm against the complete file.
    """
    # Read the configuration file: json.load parses it into `params`.
    with open(os.path.join(model_path, 'params.json')) as params_file:
        params = json.load(params_file)
        print(params)

    # Load every shard onto the CPU and collect the loaded dicts in a list
    # (in the author's run there was only a single shard to merge).
    shard_paths = sorted(Path(model_path).glob('consolidated.*.pth'))
    models = [torch.load(shard_path, map_location='cpu') for shard_path in shard_paths]
    state_dict = concat_weights(models)
    del models
解释:
models [{'tok_embeddings.weight': tensor([[ 1.2293e-06, -1.8179e-06, -4.3511e-06, ..., 8.7172e-07, -6.5267e-06, 8.9034e-07], [ 1.8616e-03, -3.3722e-03, 3.9864e-04, ..., -8.3008e-03, 2.5787e-03, -3.9368e-03], [ 1.0986e-02, 9.8877e-03, -5.0964e-03, ..., 2.5177e-03, 7.7057e-04, -5.0049e-03], ..., [-1.3977e-02, -2.7313e-03, -1.9897e-02, ..., -1.0437e-02, 9.5825e-03, -1.8005e-03], [-1.0742e-02, 9.3384e-03, 1.2939e-02, ..., -3.3203e-02, -1.6357e-02, 3.3875e-03], [-8.3008e-03, -4.0588e-03, -1.1063e-03, ..., 3.4790e-03, -1.2939e-02, 3.1948e-05]], dtype=torch.bfloat16), 'norm.weight': tensor([1.8672, 1.8672, 1.8047, ..., 1.7188, 1.8281, 1.6016], dtype=torch.bfloat16), 'output.weight': tensor([[-0.0039, 0.0032, -0.0071, ..., 0.0053, -0.0082, 0.0070], [-0.0315, 0.0466, -0.0023, ..., -0.0211, 0.0173, 0.0334], [-0.0125, 0.0036, 0.0195, ..., -0.0271, 0.0143, -0.0082], ..., [-0.0281, -0.0195, -0.0024, ..., 0.0123, -0.0117, -0.0237], [ 0.0229, 0.0255, 0.0315, ..., 0.0067, -0.0092, -0.0058], [ 0.0080, -0.0088, 0.0063, ..., -0.0293, -0.0200, 0.0337]], dtype=torch.bfloat16), 'layers.0.attention.wq.weight': tensor([[-0.0062, -0.0148, -0.0022, ..., 0.0045, 0.0017, -0.0036], [ 0.0089, -0.0136, 0.0003, ..., -0.0100, -0.0198, 0.0078], [ 0.0142, -0.0043, 0.0028, ..., -0.0093, -0.0114, 0.0076], ..., [ 0.0256, 0.0102, 0.0032, ..., -0.0334, -0.0156, -0.0123], [-0.0086, -0.0022, -0.0008, ..., 0.0237, -0.0081, 0.0059], [-0.0134, -0.0066, 0.0018, ..., 0.0181, 0.0166, -0.0082]], dtype=torch.bfloat16), 'layers.0.attention.wk.weight': tensor([[-0.0162, 0.0079, -0.0013, ..., 0.0166, -0.0099, -0.0135], [ 0.0255, 0.0170, 0.0019, ..., -0.0081, 0.0113, 0.0103], [ 0.0192, 0.0015, 0.0036, ..., -0.0211, 0.0152, 0.0234], ..., [-0.0056, 0.0173, -0.0032, ..., -0.0032, 0.0115, -0.0110], [ 0.0178, -0.0038, 0.0003, ..., 0.0053, -0.0109, 0.0104], [ 0.0037, -0.0021, 0.0013, ..., 0.0070, -0.0115, 0.0095]], dtype=torch.bfloat16), 'layers.0.attention.wv.weight': tensor([[ 0.0008, -0.0006, 0.0019, ..., 0.0059, 
-0.0006, 0.0103], [-0.0069, -0.0005, -0.0077, ..., -0.0106, 0.0126, 0.0048], [ 0.0018, 0.0096, 0.0010, ..., 0.0048, -0.0139, -0.0142], ..., [-0.0063, -0.0057, 0.0103, ..., 0.0031, 0.0040, -0.0022], [ 0.0031, 0.0048, -0.0010, ..., 0.0054, 0.0156, 0.0007], [ 0.0001, 0.0025, 0.0056, ..., -0.0007, -0.0007, 0.0015]], dtype=torch.bfloat16), 'layers.0.attention.wo.weight': tensor([[-1.6212e-05, -1.9226e-03, 4.8828e-03, ..., 5.9204e-03, 3.4485e-03, -9.5215e-03], [ 2.7618e-03, 1.8463e-03, -1.2970e-03, ..., -1.0300e-03, 1.8082e-03, 6.2561e-03], [ 2.3346e-03, -2.7275e-04, 9.2697e-04, ..., -1.6556e-03, -5.7373e-03, -6.3705e-04], ..., [ 4.1809e-03, -3.3264e-03, 5.8899e-03, ..., 1.2131e-03, 2.6093e-03, 4.3030e-03], [-3.3569e-03, -2.4872e-03, -2.5787e-03, ..., 6.1951e-03, -3.4790e-03, -5.1117e-04], [ 6.1951e-03, -6.5613e-04, 2.6245e-03, ..., 5.4932e-03, -7.5989e-03, -6.6833e-03]], dtype=torch.bfloat16), 'layers.0.feed_forward.w1.weight': tensor([[ 1.5747e-02, 1.7090e-02, 3.1494e-02, ..., -1.5869e-02, 6.5002e-03, 1.5869e-02], [-2.1667e-03, -6.0120e-03, 5.6458e-03, ..., 1.6113e-02, -8.6670e-03, 9.8877e-03], [ 6.8359e-03, -2.1606e-02, 2.0508e-02, ..., -1.3000e-02, 1.8921e-02, 1.9409e-02], ..., [ 1.4126e-05, -3.2227e-02, 5.7983e-03, ..., -8.9111e-03, -1.3489e-02, 4.0283e-02], [ 2.6611e-02, 2.0142e-02, -1.7090e-02, ..., -3.4332e-03, -6.4087e-03, -1.8921e-02], [-5.9891e-04, -1.1353e-02, -2.3682e-02, ..., 1.1063e-03, 5.9204e-03, -2.4780e-02]], dtype=torch.bfloat16), 'layers.0.feed_forward.w2.weight': tensor([[ 0.0027, -0.0145, 0.0083, ..., -0.0175, -0.0054, 0.0014], [ 0.0046, -0.0042, 0.0090, ..., 0.0160, -0.0138, 0.0334], [ 0.0020, 0.0339, -0.0044, ..., -0.0146, 0.0220, 0.0167], ..., [-0.0089, -0.0114, 0.0052, ..., 0.0231, -0.0135, 0.0295], [-0.0177, 0.0374, 0.0090, ..., -0.0069, -0.0122, -0.0219], [ 0.0120, -0.0013, -0.0079, ..., -0.0003, -0.0030, -0.0302]], dtype=torch.bfloat16), ...}]
def export(p, state_dict, filepath='model.bin'):
    """export the model weights in fp32 into .bin file to be read from C

    The code shown here writes only the header: seven integers packed with
    struct format 'iiiiiii' (dim, hidden_dim, n_layers, n_heads, n_kv_heads,
    negated vocab_size, max_seq_len).

    Args:
        p: parameter dict; must contain 'dim', 'n_layers' and 'n_heads', may
            contain 'n_kv_heads'. It is mutated: 'vocab_size' is set to 32000
            and 'max_seq_len' to 2048.
        state_dict: mapping from weight name to tensor; must contain
            'layers.0.feed_forward.w1.weight'.
        filepath: output file, opened in binary write mode.
    """
    # Use a context manager so the file is flushed and closed even if building
    # the header raises (the handle was previously never closed).
    with open(filepath, 'wb') as f:

        def serialize(key):
            """Write state_dict[key] to f as flat float32 data, then drop it."""
            # NOTE(review): not called in the code shown here; presumably the
            # full script calls it for each weight after the header - confirm.
            print(f"writing {key}...")
            t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
            f.write(memoryview(t))
            del state_dict[key]

        # first write out the header
        # hidden_dim is the first dimension of the first layer's feed-forward
        # w1 weight matrix.
        hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
        p['vocab_size'] = 32000
        p['max_seq_len'] = 2048

        # Use p['n_kv_heads'] when present (and truthy); otherwise fall back to
        # p['n_heads'].
        n_kv_heads = p.get('n_kv_heads') or p['n_heads']

        # Each 'i' in the format packs one signed int, so 'iiiiiii' packs the
        # seven header integers in order.
        header = struct.pack(
            'iiiiiii',
            p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
            n_kv_heads, -p['vocab_size'], p['max_seq_len']
        )
        # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
        # in the checkpoint and should be loaded.
        f.write(header)
解释:该 export 函数的主要目的是将模型权重以浮点32(fp32)格式导出到一个名为 .bin 的二进制文件中,以便C语言程序能够读取这些权重。
state_dict {'tok_embeddings.weight': tensor([[ 1.2293e-06, -1.8179e-06, -4.3511e-06, ..., 8.7172e-07, -6.5267e-06, 8.9034e-07], [ 1.8616e-03, -3.3722e-03, 3.9864e-04, ..., -8.3008e-03, 2.5787e-03, -3.9368e-03], [ 1.0986e-02, 9.8877e-03, -5.0964e-03, ..., 2.5177e-03, 7.7057e-04, -5.0049e-03], ..., [-1.3977e-02, -2.7313e-03, -1.9897e-02, ..., -1.0437e-02, 9.5825e-03, -1.8005e-03], [-1.0742e-02, 9.3384e-03, 1.2939e-02, ..., -3.3203e-02, -1.6357e-02, 3.3875e-03], [-8.3008e-03, -4.0588e-03, -1.1063e-03, ..., 3.4790e-03, -1.2939e-02, 3.1948e-05]], dtype=torch.bfloat16), 'norm.weight': tensor([1.8672, 1.8672, 1.8047, ..., 1.7188, 1.8281, 1.6016], dtype=torch.bfloat16), 'output.weight': tensor([[-0.0039, 0.0032, -0.0071, ..., 0.0053, -0.0082, 0.0070], [-0.0315, 0.0466, -0.0023, ..., -0.0211, 0.0173, 0.0334], [-0.0125, 0.0036, 0.0195, ..., -0.0271, 0.0143, -0.0082], ..., [-0.0281, -0.0195, -0.0024, ..., 0.0123, -0.0117, -0.0237], [ 0.0229, 0.0255, 0.0315, ..., 0.0067, -0.0092, -0.0058], [ 0.0080, -0.0088, 0.0063, ..., -0.0293, -0.0200, 0.0337]], dtype=torch.bfloat16), 'layers.0.attention.wq.weight': tensor([[-0.0062, -0.0148, -0.0022, ..., 0.0045, 0.0017, -0.0036], [ 0.0089, -0.0136, 0.0003, ..., -0.0100, -0.0198, 0.0078], [ 0.0142, -0.0043, 0.0028, ..., -0.0093, -0.0114, 0.0076], ..., [ 0.0256, 0.0102, 0.0032, ..., -0.0334, -0.0156, -0.0123], [-0.0086, -0.0022, -0.0008, ..., 0.0237, -0.0081, 0.0059], [-0.0134, -0.0066, 0.0018, ..., 0.0181, 0.0166, -0.0082]], dtype=torch.bfloat16), 'layers.0.attention.wk.weight': tensor([[-0.0162, 0.0079, -0.0013, ..., 0.0166, -0.0099, -0.0135], [ 0.0255, 0.0170, 0.0019, ..., -0.0081, 0.0113, 0.0103], [ 0.0192, 0.0015, 0.0036, ..., -0.0211, 0.0152, 0.0234], ..., [-0.0056, 0.0173, -0.0032, ..., -0.0032, 0.0115, -0.0110], [ 0.0178, -0.0038, 0.0003, ..., 0.0053, -0.0109, 0.0104], [ 0.0037, -0.0021, 0.0013, ..., 0.0070, -0.0115, 0.0095]], dtype=torch.bfloat16), 'layers.0.attention.wv.weight': tensor([[ 0.0008, -0.0006, 0.0019, ..., 0.0059, 
-0.0006, 0.0103], [-0.0069, -0.0005, -0.0077, ..., -0.0106, 0.0126, 0.0048], [ 0.0018, 0.0096, 0.0010, ..., 0.0048, -0.0139, -0.0142], ..., [-0.0063, -0.0057, 0.0103, ..., 0.0031, 0.0040, -0.0022], [ 0.0031, 0.0048, -0.0010, ..., 0.0054, 0.0156, 0.0007], [ 0.0001, 0.0025, 0.0056, ..., -0.0007, -0.0007, 0.0015]], dtype=torch.bfloat16), 'layers.0.attention.wo.weight': tensor([[-1.6212e-05, -1.9226e-03, 4.8828e-03, ..., 5.9204e-03, 3.4485e-03, -9.5215e-03], [ 2.7618e-03, 1.8463e-03, -1.2970e-03, ..., -1.0300e-03, 1.8082e-03, 6.2561e-03], [ 2.3346e-03, -2.7275e-04, 9.2697e-04, ..., -1.6556e-03, -5.7373e-03, -6.3705e-04], ..., [ 4.1809e-03, -3.3264e-03, 5.8899e-03, ..., 1.2131e-03, 2.6093e-03, 4.3030e-03], [-3.3569e-03, -2.4872e-03, -2.5787e-03, ..., 6.1951e-03, -3.4790e-03, -5.1117e-04], [ 6.1951e-03, -6.5613e-04, 2.6245e-03, ..., 5.4932e-03, -7.5989e-03, -6.6833e-03]], dtype=torch.bfloat16), 'layers.0.feed_forward.w1.weight': tensor([[ 1.5747e-02, 1.7090e-02, 3.1494e-02, ..., -1.5869e-02, 6.5002e-03, 1.5869e-02], [-2.1667e-03, -6.0120e-03, 5.6458e-03, ..., 1.6113e-02, -8.6670e-03, 9.8877e-03], [ 6.8359e-03, -2.1606e-02, 2.0508e-02, ..., -1.3000e-02, 1.8921e-02, 1.9409e-02], ..., [ 1.4126e-05, -3.2227e-02, 5.7983e-03, ..., -8.9111e-03, -1.3489e-02, 4.0283e-02], [ 2.6611e-02, 2.0142e-02, -1.7090e-02, ..., -3.4332e-03, -6.4087e-03, -1.8921e-02], [-5.9891e-04, -1.1353e-02, -2.3682e-02, ..., 1.1063e-03, 5.9204e-03, -2.4780e-02]], dtype=torch.bfloat16), 'layers.0.feed_forward.w2.weight': tensor([[ 0.0027, -0.0145, 0.0083, ..., -0.0175, -0.0054, 0.0014], [ 0.0046, -0.0042, 0.0090, ..., 0.0160, -0.0138, 0.0334], [ 0.0020, 0.0339, -0.0044, ..., -0.0146, 0.0220, 0.0167], ..., [-0.0089, -0.0114, 0.0052, ..., 0.0231, -0.0135, 0.0295], [-0.0177, 0.0374, 0.0090, ..., -0.0069, -0.0122, -0.0219], [ 0.0120, -0.0013, -0.0079, ..., -0.0003, -0.0030, -0.0302]], dtype=torch.bfloat16), 'layers.0.feed_forward.w3.weight': tensor([[ 0.0003, -0.0292, 0.0148, ..., -0.0210, -0.0270, 0.0065], 
[-0.0111, -0.0312, 0.0128, ..., 0.0190, 0.0060, 0.0025], [-0.0059, 0.0149, -0.0084, ..., -0.0227, 0.0075, 0.0017], ..., [-0.0091, -0.0016, -0.0067, ..., 0.0295, -0.0028, 0.0183], [-0.0166, 0.0073, 0.0189, ..., 0.0014, -0.0166, 0.0031], [ 0.0190, 0.0197, -0.0004, ..., 0.0118, -0.0143, -0.0388]], dtype=torch.bfloat16), 'layers.0.attention_norm.weight': tensor([0.0297, 0.0136, 0.0020, ..., 0.0103, 0.0110, 0.0061], dtype=torch.bfloat16), 'layers.0.ffn_norm.weight': tensor([0.0503, 0.0525, 0.0500, ..., 0.0525, 0.0535, 0.0491], dtype=torch.bfloat16), 'layers.1.attention.wq.weight': tensor([[-0.0125, 0.0073, -0.0381, ..., -0.0024, -0.0588, 0.0356], [-0.0195, 0.0410, 0.0544, ..., 0.0214, -0.0308, 0.0315], [-0.0006, -0.0082, 0.0079, ..., -0.0083, -0.0488, 0.0277], ..., [-0.0021, -0.0038, -0.0042, ..., 0.0088, 0.0052, 0.0062], [ 0.0024, -0.0022, 0.0049, ..., 0.0098, 0.0005, 0.0128], [ 0.0003, 0.0048, 0.0067, ..., -0.0079, -0.0005, -0.0111]], dtype=torch.bfloat16), 'layers.1.attention.wk.weight': tensor([[-0.0248, -0.0025, 0.0383, ..., 0.0179, 0.0208, -0.0096], [ 0.0147, 0.0030, -0.0276, ..., -0.0227, -0.0044, -0.0032], [-0.0295, 0.0046, -0.0114, ..., -0.0159, 0.0094, -0.0588], ..., [ 0.0080, -0.0192, 0.0040, ..., -0.0121, -0.0015, -0.0065], [ 0.0090, -0.0239, 0.0014, ..., -0.0122, 0.0027, -0.0074], [-0.0080, 0.0147, 0.0007, ..., 0.0040, -0.0016, 0.0060]], dtype=torch.bfloat16), ...}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。