赞
踩
对应的B站搭建流程视频教程:https://www.bilibili.com/video/BV1b34y1g7Ah/?spm_id_from=333.999.0.0&vd_source=2a400e9a86101d2bf4ac9d3ed71e65c9
环境要求:Python 3.8,CUDA 11.7。
下载bert-vits2 git clone https://github.com/fishaudio/Bert-VITS2.git
pytorch linux直接pip安装, windows去官网下载(nvcc -v查看cuda版本)
模型下载 中文模型,日文模型,一般都是缺少大文件,去huggingface下载对应缺少的文件放在项目的bert下。
中:https://huggingface.co/hfl/chinese-roberta-wwm-ext-large
日:https://huggingface.co/cl-tohoku/bert-base-japanese-v3/tree/main
# 下载好放在bert里
测试
python ./text/chinese_bert.py
python ./text/japanese_bert.py
下载数据集 在raw文件夹下创建对应名称的文件夹
https://www.bilibili.com/read/cv26659988/?from=articleDetail
# 放在./raw/{name}下
注:如果想要训练自己的音色,项目代码并没有给出如何得到 .wav/.lab 格式文件的方法。下面是我自己写的自动标注代码:只需把音频放在新建的 data/name 目录下(name 要与下面文件中的说话人名字对应),然后运行该文件即可:
import os
from pathlib import Path

import librosa
import numpy as np
import whisper
from scipy.io import wavfile

# Speaker name; edit this before running. Only Chinese speech is supported
# for now. Put the source audio files under data/<speaker>.
a = "ll"


def split_long_audio(model, filepaths, save_dir="data_dir", out_sr=44100):
    """Split every audio file under *filepaths* into sentence-level segments.

    Whisper supplies the segment timestamps; each segment is trimmed,
    peak-normalised, resampled to *out_sr* and written to *save_dir* as
    ``<speaker>_<index>.wav``.

    Args:
        model: a loaded whisper model used for segmentation.
        filepaths: directory containing the source audio files.
        save_dir: output directory for the segment wav files.
        out_sr: target sample rate of the written segments.
    """
    files = os.listdir(filepaths)
    filepaths = [os.path.join(filepaths, name) for name in files]
    save_path = Path(save_dir)
    save_path.mkdir(exist_ok=True, parents=True)
    # One counter across ALL input files. The original code reused the loop
    # variable of ``enumerate(segments)``, which reset the index for every
    # file, so segments of later files silently overwrote earlier ones.
    seg_idx = 0
    for file_idx, filepath in enumerate(filepaths):
        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True,
                                  task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']
        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None,
                               mono=True)
        wav, _ = librosa.effects.trim(wav, top_db=20)
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        # Normalise to full scale; guard against an all-zero (silent) signal,
        # which would otherwise cause a division by zero.
        denom = max(wav2.max(), -wav2.min())
        if denom > 0:
            wav2 /= denom
        for seg in segments:
            start_time = seg['start']
            end_time = seg['end']
            wav_seg = wav2[int(start_time * out_sr):int(end_time * out_sr)]
            wav_seg_name = f"{a}_{seg_idx}.wav"
            seg_idx += 1
            out_fpath = save_path / wav_seg_name
            wavfile.write(out_fpath, rate=out_sr,
                          data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16))


def transcribe_one(audio_path):
    """Transcribe one (<=30 s) audio file with whisper; return the text.

    Uses the module-level ``model`` loaded in ``__main__``.
    """
    # Load audio and pad/trim it to fit 30 seconds.
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    # Make a log-Mel spectrogram and move it to the model's device.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Detect the spoken language (informational only).
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    # Decode the audio.
    options = whisper.DecodingOptions(beam_size=5)
    result = whisper.decode(model, mel, options)
    print(result.text)
    return result.text


if __name__ == '__main__':
    whisper_size = "medium"
    model = whisper.load_model(whisper_size)
    audio_path = f"./raw/{a}"
    # Remove previously generated segments and labels, if any.
    if os.path.exists(audio_path):
        for filename in os.listdir(audio_path):
            os.remove(os.path.join(audio_path, filename))
    split_long_audio(model, f"data/{a}", f"./raw/{a}")
    files = os.listdir(audio_path)
    # Sort by the numeric segment index embedded in "<speaker>_<idx>.wav".
    file_list_sorted = sorted(
        files, key=lambda x: int(os.path.splitext(x)[0].split('_')[1]))
    filepaths = [os.path.join(audio_path, name) for name in file_list_sorted]
    for file_idx, filepath in enumerate(filepaths):
        # Run whisper over every segment and write the matching .lab file.
        text = transcribe_one(filepath)
        with open(f"./raw/{a}/{a}_{file_idx}.lab", 'w', encoding='utf-8') as f:
            f.write(text)
import os

# Output filelist consumed by preprocess_text.py.
out_file = "filelists/genshin_out.txt"


def process():
    """Build the training filelist from the .lab transcripts under ./raw/<name>.

    Each output line has the pipe-separated form
    ``./dataset/<name>/<clip>.wav|<name>|<language>|<transcript>``.
    """
    with open(out_file, 'w', encoding="utf-8") as wf:
        ch_name = 'nxt'  # speaker folder name under ./raw — edit as needed
        ch_language = 'ZH'
        path = f"./raw/{ch_name}"
        files = os.listdir(path)
        for f in files:
            if f.endswith(".lab"):
                with open(os.path.join(path, f), 'r', encoding="utf-8") as perFile:
                    # One transcript per .lab file; strip a trailing newline
                    # so it cannot corrupt the pipe-separated record.
                    line = perFile.readline().strip("\n")
                # os.path.splitext is safe even if the stem contains dots;
                # the original f.split('.')[0] truncated such names.
                stem = os.path.splitext(f)[0]
                result = f"./dataset/{ch_name}/{stem}.wav|{ch_name}|{ch_language}|{line}"
                wf.write(f"{result}\n")


if __name__ == "__main__":
    process()
修改preprocess_text.py,运行该文件。filelists 文件夹下会生成.clean等文件
default="filelists/genshin_out.txt"
重采样,会在dataset下生成重采样后的音频,如果修改了源音频要进行二次训练,需要将原dataset下的文件删除。
python resample.py
9.运行bert_gen.py,生成pt文件
# 修改num_processes
python bert_gen.py
10.底模下载:放在logs/genshin_mix
https://openi.pcl.ac.cn/Stardust_minus/Bert-VITS2/modelmanage/model_filelist_tmpl?name=Bert-VITS2%E5%BA%95%E6%A8%A1
# D_0 G_0 DUR_0
开始训练
python train_ms.py -m genshin_mix -c configs/config.json
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。