This article is a manual pip-install implementation of the ai超元域-超哥 video tutorial.
I have already built the corresponding AutoDL image; it is currently under review, so it may not be accessible yet.
For additional reference, see https://blog.csdn.net/fengxiaoyangfeng/article/details/134315290
### Version requirements (CUDA 11.8 and 12.1 both work)
cuda: 12.1
cudnn: cudnn-linux-x86_64-8.8.1.3_cuda12-archive.tar
Note: keep the Python version at 3.10 or lower; otherwise some of the axolotl-related packages later on will be incompatible.
# Create the conda environment
conda create -n chat310 python=3.10
# Install torch
(chat310) E:\share\github\08yue\axolotl> pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
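As a quick sanity check (not part of the original steps), you can confirm that the CUDA build of torch was installed and that it can see the GPU:

import torch
print(torch.__version__)          # should end with +cu121
print(torch.version.cuda)         # 12.1
print(torch.cuda.is_available())  # True if the GPU is visible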
git clone https://github.com/axolotl-ai-cloud/axolotl
cd axolotl
pip3 install packaging ninja
pip3 install -e '.[flash-attn,deepspeed]'
The command above may fail with an error related to the fschat package.
The reason is that pip3 install -e '.[flash-attn,deepspeed]' installs the packages declared in requirements.txt, which pins:
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
The source declared in that line has since been deleted upstream by the FastChat project, so the pin can no longer be resolved.
The workaround is to delete everything from the @ onward in that line of requirements.txt and install fschat directly.
(This, however, later causes an AttributeError: LLAMA3. Did you mean: 'LLAMA2'? error during training; see below.)
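Concretely, the fschat entry in requirements.txt is edited as follows (a before/after of the change just described); afterwards, rerun pip3 install -e '.[flash-attn,deepspeed]':

# before
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
# after
fschat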
rm examples/llama-3/qlora.yml
wget -P examples/llama-3/ https://raw.githubusercontent.com/win4r/mytest/main/qlora.yml
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/llama-3/qlora.yml
accelerate launch -m axolotl.cli.train examples/llama-3/qlora.yml
accelerate launch -m axolotl.cli.inference examples/llama-3/qlora.yml \
--lora_model_dir="./outputs/qlora-out"
# gradio
accelerate launch -m axolotl.cli.inference examples/llama-3/qlora.yml \
--lora_model_dir="./outputs/qlora-out" --gradio
File "/root/axolotl/src/axolotl/prompt_strategies/sharegpt.py", line 50, in register_llama3_template
sep_style=SeparatorStyle.LLAMA3,
File "/root/miniconda3/lib/python3.10/enum.py", line 437, in __getattr__
raise AttributeError(name) from None
AttributeError: LLAMA3. Did you mean: 'LLAMA2'?
then edit the following file:
axolotl/src/axolotl/prompt_strategies/sharegpt.py
def register_llama3_template(system_message=None):
    system_message = system_message or "You are a helpful assistant."
    register_conv_template(
        Conversation(
            name="llama3",
            system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
            system_message=system_message,
            roles=("user", "assistant"),
            # change this line (it was sep_style=SeparatorStyle.LLAMA3)
            sep_style=SeparatorStyle.LLAMA2,
            sep="",
            stop_str="<|eot_id|>",
            stop_token_ids=[128001, 128009],
        )
    )
That is, change sep_style=SeparatorStyle.LLAMA3 to sep_style=SeparatorStyle.LLAMA2.
# Merge the LoRA adapter into the base model
python3 -m axolotl.cli.merge_lora examples/llama-3/qlora.yml --lora_model_dir="./outputs/qlora-out"
# After merging, push the model to Hugging Face
huggingface-cli login
# Then paste an access token with write permission
huggingface-cli upload leo009/merged-llama3.1-8b outputs/qlora-out/merged
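As an optional sanity check (not part of the original video), the merged model can be loaded back from the Hub with transformers; the repo id below is the one uploaded above:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "leo009/merged-llama3.1-8b"  # the repo uploaded above
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)

prompt = "Hello, who are you?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))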
In 超哥's original code, the dataset-generation part under "use bert-base-chinese for text chunking" is written fairly simply; without CUDA or a batch_size it can be quite slow.
I modified it slightly here so that it runs faster:
import torch
from transformers import BertTokenizer, BertModel
import re
import os
from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader, Dataset

# Check whether CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SentenceDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.tokenizer(
            self.sentences[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )


def collate_fn(batch):
    input_ids = torch.cat([item["input_ids"] for item in batch], dim=0)
    attention_mask = torch.cat([item["attention_mask"] for item in batch], dim=0)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


def get_sentence_embeddings(sentences, model, tokenizer, batch_size=8):
    dataset = SentenceDataset(sentences, tokenizer, max_length=512)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)
    all_embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch to the GPU
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings.cpu())  # move back to the CPU
    return torch.cat(all_embeddings).numpy()


def split_text_by_semantic(text, max_length, similarity_threshold=0.5, batch_size=8):
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    model = BertModel.from_pretrained("bert-base-chinese").to(device)
    model.eval()
    # Chinese punctuation marks used as sentence delimiters
    chinese_punctuation = "。!?;“”()……"
    # Split each line on the Chinese punctuation
    sentences = []
    for part in text:
        part = part.strip()
        sub_parts = re.split(f"([{chinese_punctuation}])", part)
        sentences.extend([sub for sub in sub_parts if sub])  # drop empty strings
    # sentences = re.split(r"(。|!|?|;)", text)
    sentences = [s + p for s, p in zip(sentences[::2], sentences[1::2]) if s]
    chunks = []
    current_chunk = sentences[0]
    embeddings = get_sentence_embeddings(sentences, model, tokenizer, batch_size)
    current_embedding = embeddings[0]
    for idx in range(1, len(sentences)):
        sentence_embedding = embeddings[idx]
        similarity = 1 - cosine(current_embedding, sentence_embedding)
        if (
            similarity > similarity_threshold
            and len(tokenizer.tokenize(current_chunk + sentences[idx])) <= max_length
        ):
            current_chunk += sentences[idx]
            current_embedding = (current_embedding + sentence_embedding) / 2
        else:
            chunks.append(current_chunk)
            current_chunk = sentences[idx]
            current_embedding = sentence_embedding
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def read_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()


def save_chunks_to_files(chunks, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for i, chunk in enumerate(chunks):
        chunk_file_path = os.path.join(output_dir, f"chunk_{i + 1}.txt")
        with open(chunk_file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        print(f"Saved chunk {i + 1} to {chunk_file_path}")


# Main program
input_file_path = r"./book/1.txt"
output_dir = "./红楼梦chunk/"
long_text = read_text_file(input_file_path)
max_length = 512
similarity_threshold = 0.7
batch_size = 64
text_chunks = split_text_by_semantic(
    long_text, max_length, similarity_threshold, batch_size
)
save_chunks_to_files(text_chunks, output_dir)