当前位置:   article > 正文

speech to text 库fastASR接口调用示例C++

fastasr

本方案直接传入 PCM 数据 buffer,而不是从 wav 文件读入,更符合在线 STT(语音转文字)的使用方式。输入应为 16 kHz 采样率、16 位(short)的 PCM 采样,与下面代码中的 fs = 16000 及 short* 参数一致。前提是已经从 FastASR 源码编译出了 libfastasr.a 库(参照《speech to text 库FastASR交叉编译arm target的配置》-CSDN博客),源码示例如下:

  1. #include "speechrecog.h"
  2. #include <math.h>
  3. #include <string.h>
  4. #include <webrtc_vad.h>
  /* States of the voice-activity state machine used by SpeechRecog::recognize(). */
  #define UNTRIGGERED 0
  #define TRIGGERED 1

  /* Segment-length thresholds expressed in samples: seconds * 16000 samples per second. */
  #define SPEECH_LEN_5S (5 * 16000)
  #define SPEECH_LEN_10S (10 * 16000)
  #define SPEECH_LEN_20S (20 * 16000)
  #define SPEECH_LEN_30S (30 * 16000)
  /**
   * Sliding window that maintains the running sum of the most recent
   * `window_size` values handed to put().
   *
   * The backing array has window_size + 1 slots: the slot that is read (the
   * value leaving the window) is always one position ahead of the slot that is
   * written, so the two never coincide.
   *
   * The class owns its buffer: it is released in the destructor and copying is
   * disabled so two instances can never free the same array.
   */
  class AudioWindow {
  private:
      int *window;
      int in_idx;
      int out_idx;
      int sum;
      int window_size = 0;

  public:
      /**
       * @param window_size Number of values the running sum covers. Values
       *                    below 1 are treated as 1 so that both indices always
       *                    stay inside the allocation.
       */
      AudioWindow(int window_size)
          : window_size(window_size > 0 ? window_size : 1)
      {
          // Value-initialised (all zeros); throws std::bad_alloc on failure
          // instead of leaving a null pointer for put() to dereference.
          window = new int[static_cast<unsigned long>(this->window_size) + 1]();
          in_idx = 0;
          out_idx = 1;
          sum = 0;
      }

      ~AudioWindow()
      {
          delete[] window;
      }

      AudioWindow(const AudioWindow &) = delete;
      AudioWindow &operator=(const AudioWindow &) = delete;

      /**
       * Pushes `val` into the window and drops the oldest value.
       *
       * @return Sum of the last `window_size` values, including `val`.
       */
      int put(int val)
      {
          // window[out_idx] holds the value written window_size calls ago
          // (or 0 while the window is still filling up).
          sum += val - window[out_idx];
          window[in_idx] = val;
          in_idx = (in_idx == window_size) ? 0 : (in_idx + 1);
          out_idx = (out_idx == window_size) ? 0 : (out_idx + 1);
          return sum;
      }
  };
  39. SpeechRecog::SpeechRecog()
  40. {
  41. mm = create_model("/data/FastASR/models/k2_rnnt2_cli", 2);
  42. }
  43. SpeechRecog::~SpeechRecog()
  44. {
  45. if(mm){
  46. mm->reset();
  47. mm = nullptr;
  48. }
  49. }
  50. void SpeechRecog::init()
  51. {
  52. printf("SR init done\n");
  53. }
  54. string SpeechRecog::recognize(short* buffer, unsigned long framesize)
  55. {
  56. VadInst *handle = WebRtcVad_Create();
  57. WebRtcVad_Init(handle);
  58. WebRtcVad_set_mode(handle, 2);
  59. int window_size = 10;
  60. AudioWindow audiowindow(window_size);
  61. int status = UNTRIGGERED;
  62. int offset = 0;
  63. int fs = 16000;
  64. int step = 480;
  65. int start = 0;
  66. int speech_align_len = (int)(ceil((float)framesize / align_size) * align_size);
  67. float* buf;//[speech_align_len] = {0};
  68. float scale = 32768;
  69. bool found = false;
  70. int16_t *speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
  71. memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
  72. buf = (float *)malloc(sizeof(float) * speech_align_len);
  73. memset(buf, 0, sizeof(float) * speech_align_len);
  74. for(unsigned int i=0; i<framesize; i++){
  75. buf[i] = (float)buffer[i] / scale;
  76. speech_buff[i] = buffer[i];
  77. }
  78. result_str = "";
  79. while (offset < (framesize - step)) {
  80. int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
  81. if ((status == UNTRIGGERED) && (audiowindow.put(n) >= (window_size - 1))) {
  82. start = offset - step * (window_size - 1);
  83. status = TRIGGERED;
  84. found = false;
  85. } else if (status == TRIGGERED) {
  86. int win_weight = audiowindow.put(n);
  87. int voice_len = (offset - start);
  88. int gap = 0;
  89. if (voice_len < SPEECH_LEN_5S) {
  90. offset += step;
  91. continue;
  92. } else if (voice_len < SPEECH_LEN_10S) {
  93. gap = 1;
  94. } else if (voice_len < SPEECH_LEN_20S) {
  95. gap = window_size / 5;
  96. } else {
  97. gap = window_size / 2;
  98. }
  99. if (win_weight < gap) {
  100. status = UNTRIGGERED;
  101. float num_samples = offset - start;
  102. float frame_length = 400;
  103. float frame_shift = 160;
  104. float num_new_samples =
  105. ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length;
  106. offset = start + num_new_samples;
  107. mm->reset();
  108. string msg = mm->forward(buf+start, num_new_samples, 2);//this is voice recoged buffer range
  109. //cout<<"do while, done recog"<<endl;
  110. result_str += msg;
  111. found = true;
  112. //cout<<to_string(start)<<" found: " << msg;
  113. }
  114. }
  115. offset += step;
  116. }
  117. if (!found &&(status == TRIGGERED)){
  118. float num_samples = offset - start;
  119. float frame_length = 400;
  120. float frame_shift = 160;
  121. float num_new_samples =
  122. ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length;
  123. mm->reset();
  124. string msg = mm->forward(buf+start, num_new_samples, 2);//S_begin:0, s_middle:1, s_end:2.
  125. //cout <<"last: "<< msg;
  126. result_str += msg;
  127. }
  128. /*we need to strcat all returned msg and return of one buffer in*/
  129. free(buf);
  130. free(speech_buff);
  131. return result_str;
  132. }

CMake 的配置很简单,链接 fastasr、fftw3f、openblas、webrtcvad 几个库即可。但需要注意:必须将源码包中的 webrtc 目录和 Model.h 拷贝到当前工程目录下,并在 CMakeLists.txt 中添加 add_subdirectory("./webrtc")。由于比较简单,上面源码对应的头文件(speechrecog.h)就不贴了,可以自行书写。

 

 

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/木道寻08/article/detail/806093
推荐阅读
相关标签
  

闽ICP备14008679号