This approach passes the PCM data buffer in directly rather than reading from a WAV file, which fits an online STT workflow better. The prerequisite is that libfastasr.a has already been built from the FastASR sources (see the CSDN post on cross-compiling the FastASR speech-to-text library for an ARM target). The example source is as follows:
- #include "speechrecog.h"
- #include <math.h>
- #include <string.h>
- #include <webrtc_vad.h>
-
- #define UNTRIGGERED 0
- #define TRIGGERED 1
-
- #define SPEECH_LEN_5S (16000 * 5)
- #define SPEECH_LEN_10S (16000 * 10)
- #define SPEECH_LEN_20S (16000 * 20)
- #define SPEECH_LEN_30S (16000 * 30)
// Sliding window over the most recent VAD decisions; put() returns the running
// sum, i.e. how many of the last window_size frames were judged as speech.
class AudioWindow {
  private:
    int *window;
    int in_idx;
    int out_idx;
    int sum;
    int window_size = 0;

  public:
    AudioWindow(int window_size) : window_size(window_size)
    {
        window = (int *)calloc(window_size + 1, sizeof(int));
        in_idx = 0;
        out_idx = 1;
        sum = 0;
    };
    ~AudioWindow()
    {
        free(window);
    };
    int put(int val)
    {
        sum = sum + val - window[out_idx];  // drop the oldest decision, add the newest
        window[in_idx] = val;
        in_idx = (in_idx == window_size) ? 0 : (in_idx + 1);
        out_idx = (out_idx == window_size) ? 0 : (out_idx + 1);
        return sum;
    };
};
SpeechRecog::SpeechRecog()
{
    // load the FastASR model (here the k2_rnnt2 model directory); see FastASR's
    // Model.h for the meaning of the second argument
    mm = create_model("/data/FastASR/models/k2_rnnt2_cli", 2);
}

SpeechRecog::~SpeechRecog()
{
    if (mm) {
        mm->reset();
        mm = nullptr;
    }
}

void SpeechRecog::init()
{
    printf("SR init done\n");
}

// recognize(): run WebRTC VAD over the PCM buffer, cut it into speech segments
// and feed each segment to the FastASR model. buffer holds 16 kHz, 16-bit mono
// PCM; framesize is the number of samples. result_str and align_size are class
// members declared in speechrecog.h.
string SpeechRecog::recognize(short* buffer, unsigned long framesize)
{
    VadInst *handle = WebRtcVad_Create();
    WebRtcVad_Init(handle);
    WebRtcVad_set_mode(handle, 2);          // VAD aggressiveness 0..3
    int window_size = 10;                   // sliding window of 10 VAD decisions
    AudioWindow audiowindow(window_size);
    int status = UNTRIGGERED;
    int offset = 0;
    int fs = 16000;
    int step = 480;                         // 30 ms per VAD frame at 16 kHz
    int start = 0;
    // pad the input up to a multiple of align_size so the model always sees whole frames
    int speech_align_len = (int)(ceil((float)framesize / align_size) * align_size);
    float *buf;
    float scale = 32768;
    bool found = false;
    int16_t *speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
    memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);

    buf = (float *)malloc(sizeof(float) * speech_align_len);
    memset(buf, 0, sizeof(float) * speech_align_len);

    // keep an int16 copy for the VAD and a normalized float copy for the model
    for (unsigned int i = 0; i < framesize; i++) {
        buf[i] = (float)buffer[i] / scale;
        speech_buff[i] = buffer[i];
    }

    result_str = "";
    while (offset + step < (long)framesize) {
        // 1 = speech, 0 = silence, -1 = error for this 30 ms frame
        int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
        if ((status == UNTRIGGERED) && (audiowindow.put(n) >= (window_size - 1))) {
            // nearly every frame in the window is speech: a segment starts here
            start = offset - step * (window_size - 1);
            if (start < 0)
                start = 0;
            status = TRIGGERED;
            found = false;
        } else if (status == TRIGGERED) {
            int win_weight = audiowindow.put(n);
            int voice_len = (offset - start);
            int gap = 0;
            // the longer the segment already is, the easier it is to cut it off
            if (voice_len < SPEECH_LEN_5S) {
                offset += step;
                continue;
            } else if (voice_len < SPEECH_LEN_10S) {
                gap = 1;
            } else if (voice_len < SPEECH_LEN_20S) {
                gap = window_size / 5;
            } else {
                gap = window_size / 2;
            }

            if (win_weight < gap) {
                // too little speech left in the window: close the segment and decode it
                status = UNTRIGGERED;
                float num_samples = offset - start;
                float frame_length = 400;
                float frame_shift = 160;
                float num_new_samples =
                    ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;

                offset = start + (int)num_new_samples;
                mm->reset();
                // decode the voiced range; the last argument is the segment flag
                // (S_begin: 0, S_middle: 1, S_end: 2)
                string msg = mm->forward(buf + start, (int)num_new_samples, 2);
                result_str += msg;
                found = true;
            }
        }
        offset += step;
    }

    // tail segment: still triggered at the end of the buffer and never decoded
    if (!found && (status == TRIGGERED)) {
        float num_samples = offset - start;
        float frame_length = 400;
        float frame_shift = 160;
        float num_new_samples =
            ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
        mm->reset();
        string msg = mm->forward(buf + start, (int)num_new_samples, 2);
        result_str += msg;
    }

    // all decoded segments of this input buffer are concatenated into result_str
    WebRtcVad_Free(handle);
    free(buf);
    free(speech_buff);
    return result_str;
}
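As a rough illustration of the caller side (not part of the original post), recognize() expects 16 kHz, 16-bit mono PCM samples. A hypothetical driver that feeds it fixed-size chunks read from a raw PCM file, standing in for an online capture source, might look like this:

// main.cpp -- hypothetical caller, for illustration only
#include "speechrecog.h"
#include <cstdio>
#include <iostream>
#include <vector>

int main(int argc, char *argv[])
{
    if (argc < 2) {
        std::cerr << "usage: " << argv[0] << " <raw 16 kHz mono s16le pcm file>" << std::endl;
        return 1;
    }
    FILE *fp = fopen(argv[1], "rb");
    if (!fp)
        return 1;

    SpeechRecog sr;
    sr.init();

    // In an online setup these samples would come from the capture callback,
    // accumulated into chunks of a few seconds before each recognize() call.
    std::vector<short> pcm(16000 * 10);   // 10 s chunk at 16 kHz
    size_t n;
    while ((n = fread(pcm.data(), sizeof(short), pcm.size(), fp)) > 0) {
        std::string text = sr.recognize(pcm.data(), n);
        if (!text.empty())
            std::cout << text << std::endl;
    }
    fclose(fp);
    return 0;
}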
The CMake configuration is simple: just link the fastasr, fftw3f, openblas and webrtcvad libraries. Note, however, that the webrtc directory and Model.h from the FastASR source package must be copied into the current project directory, and add_subdirectory("./webrtc") added to the CMake file. The header corresponding to the source above is straightforward and is not included in the original post; you can write it yourself.
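For reference, here is a minimal sketch of what such a speechrecog.h might look like. The member names mm, result_str and align_size are inferred from their use in the .cpp above, the Model type is assumed to come from FastASR's Model.h, and the value of align_size is only an assumption, so treat this as a starting point rather than the original header:

// speechrecog.h -- minimal sketch, not the original header from the post
#pragma once

#include <string>
#include "Model.h"   // from the FastASR source package; declares Model and create_model()

using namespace std;

class SpeechRecog {
  public:
    SpeechRecog();
    ~SpeechRecog();
    void init();
    // buffer: 16 kHz, 16-bit mono PCM; framesize: number of samples
    string recognize(short *buffer, unsigned long framesize);

  private:
    Model *mm = nullptr;      // FastASR model handle
    string result_str;        // concatenated result for one input buffer
    int align_size = 160;     // assumed padding alignment (one frame shift); adjust to your build
};

Likewise, a minimal CMakeLists.txt along the lines described above; the project and target names are illustrative, and the exact include and library paths depend on where you placed the copied sources and libfastasr.a:

# CMakeLists.txt -- minimal sketch, names and paths are illustrative
cmake_minimum_required(VERSION 3.10)
project(speechrecog_demo CXX)

# webrtc VAD sources copied from the FastASR source package
add_subdirectory("./webrtc")

add_executable(sr_demo main.cpp speechrecog.cpp)
# adjust the include path so that Model.h and webrtc_vad.h are found
target_include_directories(sr_demo PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
# libfastasr.a plus its dependencies, and the VAD library built above;
# point the linker at libfastasr.a (link_directories or a full path) if it is not on the default search path
target_link_libraries(sr_demo fastasr fftw3f openblas webrtcvad)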