The reference code for TTS with sherpa-onnx is as follows:
import soundfile as sf
import sherpa_onnx

def write(text, output_filename, sid=10, provider='cpu'):
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model='vits-aishell3.onnx',
                lexicon='lexicon.txt',
                tokens='tokens.txt',
            ),
            provider=provider,
        ),
        max_num_sentences=2,
    )
    audio = sherpa_onnx.OfflineTts(tts_config).generate(text, sid=sid)
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
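With vits-aishell3.onnx, lexicon.txt, and tokens.txt in the working directory, usage is a single call; the sentence and output filename below are placeholders:

write('今天天气不错', 'hello.wav')  # speaker id 10 and CPU inference by default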
Example code using edge-tts (Microsoft's service; requires an internet connection) is as follows:
import edge_tts, os, asyncio, sys
from pydub import AudioSegment, playback

async def read(text):
    # Synthesize with a Mandarin voice, slightly faster than default speed.
    tts = edge_tts.Communicate(text=text, voice='zh-CN-YunxiNeural', rate='+5%')
    if text + '.mp3' in os.listdir('.'):
        os.remove(text + '.mp3')   # overwrite any stale file from a previous run
    await tts.save(text + '.mp3')

asyncio.run(read(sys.argv[1]))
playback.play(AudioSegment.from_mp3(sys.argv[1] + '.mp3'))
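To pick a voice other than zh-CN-YunxiNeural, recent versions of the edge_tts package expose a voice listing; a small sketch (the locale filter is just an example):

import asyncio
import edge_tts

async def list_zh_voices():
    # Query Microsoft's voice catalogue and print the Mandarin ones.
    for v in await edge_tts.list_voices():
        if v["Locale"].startswith("zh-"):
            print(v["ShortName"])

asyncio.run(list_zh_voices())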
The Python code below is the complete always-on assistant: it records a short clip from the microphone, transcribes it with whisper, and, once the wake word has been heard, forwards the text to the local large model and speaks the reply through the sherpa-onnx TTS:

import soundfile as sf
import whisper, pyaudio, wave, os, warnings, time, torch, sherpa_onnx
from pydub import AudioSegment, playback
from transformers import AutoModelForCausalLM, AutoTokenizer
from autogen import OpenAIWrapper

# OpenAI-compatible client pointed at a locally served model
client = OpenAIWrapper(api_key="NULL", base_url="http://localhost:2600/v1", api_type="open_ai")
warnings.filterwarnings('ignore')
model = whisper.load_model("medium")

def asr(filename):
    # Alternative ASR path: strip silence, downmix to mono 16 kHz,
    # then transcribe with whisper.cpp.
    os.system('ffmpeg -i %s -af silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-30dB -ac 1 -ar 16000 %s' % (filename, 'trans_' + filename))
    res = "".join(os.popen('whisper.cpp/main -m /Users/czhang39/.cache/huggingface/hub/models--ggerganov--whisper.cpp/snapshots/d15393806e24a74f60827e23e986f0c10750b358/ggml-large-v2.bin -np -nt -l zh --prompt 你好小特,以下是普通话。 -f %s' % ('trans_' + filename)).readlines()).replace('\n', '')
    os.system('rm %s' % filename)
    os.system('rm %s' % ('trans_' + filename))
    return res

def write(text, output_filename, sid=10, provider='cpu'):
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model='vits-aishell3.onnx',
                lexicon='lexicon.txt',
                tokens='tokens.txt',
            ),
            provider=provider,
        ),
        # rule_fsts=args.tts_rule_fsts,
        max_num_sentences=2,
    )
    if not tts_config.validate():
        raise ValueError("Please check your config")
    tts = sherpa_onnx.OfflineTts(tts_config)
    start = time.time()
    audio = tts.generate(text, sid=sid)
    end = time.time()
    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        return
    elapsed_seconds = end - start
    audio_duration = len(audio.samples) / audio.sample_rate
    real_time_factor = elapsed_seconds / audio_duration
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

def wakeup(seconds=2):
    chunk = 1024                     # record in chunks of 1024 samples
    sample_format = pyaudio.paInt16  # 16 bits per sample
    channels = 1
    fs = 44100                       # record at 44100 samples per second
    filename = "output.wav"
    p = pyaudio.PyAudio()            # create an interface to PortAudio
    stream = p.open(format=sample_format, channels=channels, rate=fs,
                    frames_per_buffer=chunk, input=True)
    frames = []
    for i in range(0, int(fs / chunk * seconds)):
        frames.append(stream.read(chunk))
    # Stop and close the stream, then terminate the PortAudio interface
    stream.stop_stream()
    stream.close()
    p.terminate()
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))
    wf.close()
    result = model.transcribe(filename, language='zh', initial_prompt='你好, 以下是普通话')
    # result = asr(filename)  # alternative: whisper.cpp backend
    return result['text']

def read(text):
    write(text, 'reply.wav', 10)
    playback.play(AudioSegment.from_wav('reply.wav'))  # write() emits a PCM_16 wav

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
activated = 0
waiting_time = 5
while 1:
    text = wakeup()
    if activated > 1:
        if 20 > len(text) > 1:
            print(text)
            # reply = llm_model.chat(tokenizer, text)[0]
            response = client.create(messages=[{"role": "user", "content": "<用户>%s<AI>" % text}], model="guff")
            reply = client.extract_text_or_completion_object(response)[0]
            print(reply)
            read(reply)
            activated = 5
        else:
            activated -= 1
            print(activated)
    elif '小特' in text:  # wake word
        print('activated')
        activated = waiting_time
        read('你好啊!我被唤醒了')
    elif activated == 1:
        read('你好啊!再见')
        print('sleep')
        activated = 0
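The activated counter implements a small sleep/wake state machine: hearing the wake word 小特 arms the assistant for waiting_time rounds, every round with no usable transcription decrements the counter, a successful exchange resets it to 5, and at 1 the assistant says goodbye and goes back to sleep. Before starting the loop it is worth checking that the local endpoint actually answers; the smoke test below reuses exactly the client calls from the listing (the URL and the model name "guff" are taken from it):

from autogen import OpenAIWrapper

client = OpenAIWrapper(api_key="NULL", base_url="http://localhost:2600/v1", api_type="open_ai")
response = client.create(messages=[{"role": "user", "content": "<用户>你好<AI>"}], model="guff")
print(client.extract_text_or_completion_object(response)[0])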
To use whisper.cpp's command example instead, the modified parts are as follows:
// In whisper.cpp's command example, set the activation phrase to your own
// wake word ("自定义唤醒词" means "custom wake word"):
std::string k_prompt = "自定义唤醒词";

// After the wake word is detected, acknowledge vocally ("我在" = "I'm here"):
std::system("python read.py 我在");

// Dispatch the recognized command:
if (command == "待机") {              // "standby"
    fprintf(stdout, "好的!");         // "OK!"
    std::system("python read.py 好的");
    ask_prompt = true;
}
else if (command == "退出") {         // "exit"
    fprintf(stdout, "下次再见!");     // "see you next time!"
    std::system("python read.py 下次再见");
    is_running = false;
}
else {
    // Anything else goes to the local large model via chat.py.
    std::string cmd = "python chat.py \"" + command + "\"";
    std::system(cmd.c_str());
}
Finally, the helper scripts invoked from the C++ code above, read.py and chat.py, are the Python glue that calls the TTS and the local large model.
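Their exact contents depend on your layout; what follows is a minimal sketch, assuming the sherpa-onnx write() helper shown earlier is saved in a module named tts_util.py (the module name, and reusing reply.wav, are assumptions rather than the original listings):

# read.py -- speak argv[1] aloud: synthesize with sherpa-onnx, then play.
import sys
from pydub import AudioSegment, playback
from tts_util import write   # the write() helper defined earlier

write(sys.argv[1], 'reply.wav')
playback.play(AudioSegment.from_wav('reply.wav'))

# chat.py -- send argv[1] to the local model, then speak its reply.
import sys
from autogen import OpenAIWrapper
from pydub import AudioSegment, playback
from tts_util import write

client = OpenAIWrapper(api_key="NULL", base_url="http://localhost:2600/v1", api_type="open_ai")
response = client.create(messages=[{"role": "user", "content": "<用户>%s<AI>" % sys.argv[1]}], model="guff")
reply = client.extract_text_or_completion_object(response)[0]
print(reply)
write(reply, 'reply.wav')
playback.play(AudioSegment.from_wav('reply.wav'))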