当前位置:   article > 正文

深度学习系列63:tts和智能语音助手

1. tts

使用sherpa的参考代码如下

import soundfile as sf
import sherpa_onnx
def write(text, output_filename, sid=10, provider='cpu'):
    """Synthesize *text* with a sherpa-onnx VITS model and save it as 16-bit PCM.

    Parameters:
        text: the sentence(s) to synthesize.
        output_filename: path of the wav file to create.
        sid: speaker id of the multi-speaker aishell3 VITS model.
        provider: onnxruntime execution provider, e.g. 'cpu'.
    """
    # Build the configuration bottom-up: VITS model -> model config -> TTS config.
    vits_cfg = sherpa_onnx.OfflineTtsVitsModelConfig(
        model='vits-aishell3.onnx',
        lexicon='lexicon.txt',
        tokens='tokens.txt',
    )
    model_cfg = sherpa_onnx.OfflineTtsModelConfig(vits=vits_cfg, provider=provider)
    cfg = sherpa_onnx.OfflineTtsConfig(model=model_cfg, max_num_sentences=2)

    engine = sherpa_onnx.OfflineTts(cfg)
    result = engine.generate(text, sid=sid)

    # Write the generated samples to disk as 16-bit PCM.
    sf.write(
        output_filename,
        result.samples,
        samplerate=result.sample_rate,
        subtype="PCM_16",
    )

使用edge-tts(微软家的,需要联网)的示例代码如下:

# Play back a previously generated mp3 named "<argv1>.mp3" with pydub/ffmpeg.
# The commented-out code shows how the file was produced with edge-tts
# (Microsoft's online TTS service); it is kept here for reference.
#import edge_tts,os,asyncio,sys
import sys
from pydub import AudioSegment,playback
#async def read(text):
#    tts = edge_tts.Communicate(text=text, voice='zh-CN-YunxiNeural',rate = '+5%')
#    if 'temp.mp3' in os.listdir('.'):
#    	os.system("rm temp.mp3")
#    await tts.save(text+".mp3")

#asyncio.run(read(sys.argv[1]))
# NOTE(review): assumes sys.argv[1] is the base name of an existing mp3 file.
playback.play(AudioSegment.from_mp3(sys.argv[1]+'.mp3'))

2. 使用python制作智能语音助手

# --- Voice-assistant setup: LLM client and local ASR model -------------------
import soundfile as sf
import whisper,pyaudio,wave,os,warnings,time,torch,sherpa_onnx
from pydub import AudioSegment,playback
from transformers import AutoModelForCausalLM, AutoTokenizer
from autogen import OpenAIWrapper
# OpenAI-compatible client pointing at a locally served model endpoint.
client = OpenAIWrapper(api_key="NULL", base_url="http://localhost:2600/v1", api_type="open_ai")
warnings.filterwarnings('ignore') 
# Whisper "medium" model used to transcribe the microphone recordings.
model = whisper.load_model("medium")

import soundfile as sf
import sherpa_onnx
# NOTE(review): this definition of write() is shadowed by the fuller version
# defined later in the file (the one that validates the config); at runtime
# the later definition wins. Kept as in the original article.
def write(text,output_filename,sid=10,provider='cpu'):
    """Synthesize *text* with a sherpa-onnx VITS model and save it to
    *output_filename* as a 16-bit PCM wav file.

    sid selects the speaker of the multi-speaker aishell3 model;
    provider is the onnxruntime execution provider (e.g. 'cpu').
    """
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model='vits-aishell3.onnx',
                lexicon='lexicon.txt',
                tokens='tokens.txt',
            ),
            provider=provider
        ),
        max_num_sentences=2,
    )
    audio = sherpa_onnx.OfflineTts(tts_config).generate(text, sid=sid)
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )
    
def asr(filename):
    """Transcribe the wav file *filename* with the whisper.cpp CLI.

    Pipeline: ffmpeg strips long silences and resamples to 16 kHz mono,
    then whisper.cpp transcribes the trimmed file. Both the input file and
    the intermediate file are deleted afterwards.

    Returns the transcription as a single string with newlines removed.
    """
    import subprocess  # local import: keeps the module's import line untouched
    trimmed = 'trans_' + filename
    # List-form argv avoids shell injection through the filename
    # (the original built shell command strings with %-formatting).
    # Best-effort like the original os.system() calls: failures are not raised.
    subprocess.run(
        ['ffmpeg', '-i', filename,
         '-af', 'silenceremove=stop_periods=-1:stop_duration=1:stop_threshold=-30dB',
         '-ac', '1', '-ar', '16000', trimmed])
    proc = subprocess.run(
        ['whisper.cpp/main',
         '-m', '/Users/czhang39/.cache/huggingface/hub/models--ggerganov--whisper.cpp/snapshots/d15393806e24a74f60827e23e986f0c10750b358/ggml-large-v2.bin',
         '-np', '-nt', '-l', 'zh',
         '--prompt', '你好小特,以下是普通话。',
         '-f', trimmed],
        capture_output=True, text=True)
    res = proc.stdout.replace('\n', '')
    # Clean up temp files without shelling out to `rm`; stay best-effort.
    for path in (filename, trimmed):
        try:
            os.remove(path)
        except OSError:
            pass
    return res
    
def write(text, output_filename, sid=10, provider='cpu'):
    """Synthesize *text* with a sherpa-onnx VITS model and save it to
    *output_filename* as a 16-bit PCM wav file.

    Parameters:
        text: the sentence(s) to synthesize.
        output_filename: path of the wav file to create.
        sid: speaker id of the multi-speaker aishell3 model.
        provider: onnxruntime execution provider (e.g. 'cpu').

    Raises:
        ValueError: if the TTS configuration fails validation
            (e.g. missing model/lexicon/token files).
    """
    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model='vits-aishell3.onnx',
                lexicon='lexicon.txt',
                tokens='tokens.txt',
            ),
            provider=provider
        ),
        #rule_fsts=args.tts_rule_fsts,
        max_num_sentences=2,
    )
    if not tts_config.validate():
        raise ValueError("Please check your config")

    tts = sherpa_onnx.OfflineTts(tts_config)
    audio = tts.generate(text, sid=sid)

    # Generation can fail and yield zero samples; bail out rather than
    # writing an empty wav file.
    if len(audio.samples) == 0:
        print("Error in generating audios. Please read previous error messages.")
        return

    # Note: the original version also timed generation (real-time factor)
    # but never used the result; that dead code has been removed.
    sf.write(
        output_filename,
        audio.samples,
        samplerate=audio.sample_rate,
        subtype="PCM_16",
    )

def wakeup(seconds = 2):
    """Record *seconds* of microphone audio to output.wav and return the
    Whisper transcription of it.

    Uses the module-level Whisper ``model``; recording is 16-bit mono at
    44.1 kHz. Returns the transcribed text (may be empty/noisy for silence).
    """
    chunk = 1024                     # record in chunks of 1024 samples
    sample_format = pyaudio.paInt16  # 16 bits per sample
    channels = 1
    fs = 44100                       # sample rate in Hz
    filename = "output.wav"

    p = pyaudio.PyAudio()  # interface to PortAudio
    try:
        stream = p.open(format=sample_format,
                        channels=channels,
                        rate=fs,
                        frames_per_buffer=chunk,
                        input=True)
        try:
            # try/finally guarantees the stream is closed even if a read fails
            # (the original leaked the stream/PyAudio handle on error).
            frames = [stream.read(chunk) for _ in range(int(fs / chunk * seconds))]
        finally:
            stream.stop_stream()
            stream.close()
        sample_width = p.get_sample_size(sample_format)
    finally:
        p.terminate()

    # wave.Wave_write supports the context-manager protocol, so the file is
    # always closed properly.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))

    result = model.transcribe(filename, language='zh', initial_prompt='你好, 以下是普通话')
    #result=asr(filename)
    return result['text']

def read(text):
    """Synthesize *text* to reply.wav and play it through the speakers."""
    write(text, 'reply.wav', 10)
    # Bug fix: write() produces a PCM wav file, but the original loaded it
    # with AudioSegment.from_mp3, forcing an mp3 decode of wav data.
    # from_file() lets ffmpeg autodetect the actual container/codec.
    playback.play(AudioSegment.from_file('reply.wav'))

os.environ['TOKENIZERS_PARALLELISM']='false'
# Simple wake-word state machine:
#   activated == 0 -> asleep, only listening for the wake word ("小特")
#   activated == 1 -> idle countdown expired, say goodbye and go back to sleep
#   activated  > 1 -> active: short utterances are forwarded to the LLM
activated = 0
waiting_time = 5  # idle listen cycles allowed before going back to sleep
while 1:
    # Record a short clip and transcribe it (blocking).
    text = wakeup()
    if activated>1:
        # Only forward plausible utterances (between 2 and 19 chars) to the LLM.
        if 20>len(text)>1:
            print(text)
            #reply = llm_model.chat(tokenizer,text)[0]
            response = client.create(messages=[{"role": "user", "content": "<用户>%s<AI>"%text}], model="guff")
            reply = client.extract_text_or_completion_object(response)[0]
            print(reply)
            # Speak the model's reply.
            read(reply)
            # Reset the idle countdown after a successful exchange.
            activated = 5
        else:
            # Nothing usable heard: count down toward sleep.
            activated-=1
            print(activated)
    elif '小特' in text:
        # Wake word detected: become active and greet the user.
        print('activated')
        activated = waiting_time
        read('你好啊!我被唤醒了')
    elif activated==1:
        # Countdown reached 1 with no wake word: say goodbye and sleep.
        read('你好啊!再见')
        print('sleep')
        activated = 0

3. 使用C++代码

使用whisper.cpp的command代码,修改部分如下:

  1. 第559行,修改唤醒词:std::string k_prompt = "自定义唤醒词";
  2. 第607行,增加唤醒后的处理代码:std::system("python read.py 我在");
  3. 第664行开始,自定义待机/关机/活跃状态的代码:
                        if (command=="待机"){
                            fprintf(stdout,"好的!");
                            std::system("python read.py 好的");
                            ask_prompt = true;
                            }
                        else if (command=="退出"){
                            fprintf(stdout,"下次再见!");
                            std::system("python read.py 下次再见");
                            is_running = false;
                            }
                        else{
                            char str3[strlen(command.c_str())+30];
                            sprintf(str3, "%s%s%s", "python chat.py \"", command.c_str(),"\"");
                            std::system(str3);
                        }

接下来是调用tts和本地大模型的python代码:

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/Cpp五条/article/detail/337846?site
推荐阅读
相关标签
  

闽ICP备14008679号