
Integrating Unity with the iFlytek (科大讯飞) Real-Time Speech Transcription WebAPI on Windows (Part 1)

Official iFlytek documentation: Real-Time Speech Transcription API — iFlytek Open Platform Documentation Center (xfyun.cn)

Reference article: "Unity connecting to WebSocket via WebAPI for iFlytek speech recognition and synthesis" (CSDN blog)

To turn speech into text, we first need to capture audio from the microphone, which Unity's built-in Microphone class handles. Next, the audio data has to be streamed to iFlytek; for that I use WebSocketSharp.WebSocket, which I'm already comfortable with. From there it's a matter of following the official documentation step by step and working through the pitfalls.

The full code is below. It covers signing the handshake parameters, streaming audio during the real-time phase, and parsing the recognition results.

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using WebSocketSharp;
using System.Text;
using System.Security.Cryptography;
using LitJson;
using Newtonsoft.Json;

public class SpeechHelper : MonoBehaviour
{
    public event Action<string> 语音识别完成事件; // recognition-complete event (not raised in this listing; results go through resultCallback)
    public AudioClip RecordedClip;
    private string micphoneName = string.Empty;
    WebSocket speechWebSocket;
    private System.Action<string> resultCallback;

    public void InitSpeechHelper(System.Action<string> textCallback)
    {
        resultCallback = textCallback;
    }

    public void StartSpeech()
    {
        if (speechWebSocket != null && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            Debug.LogWarning("Cannot start speech recognition: waiting for the previous connection to close");
            return;
        }
        if (Microphone.devices.Length <= 0)
        {
            Debug.LogWarning("No microphone found");
            return;
        }
        messageQueue.Clear();
        micphoneName = Microphone.devices[0];
        Debug.Log("micphoneName:" + micphoneName);
        try
        {
            // Record up to 60 s of 16 kHz audio, which is what the rtasr service expects.
            RecordedClip = Microphone.Start(micphoneName, false, 60, 16000);
            ConnectSpeechWebSocket();
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    public void StopSpeech()
    {
        Microphone.End(micphoneName);
        Debug.Log("Recognition finished, recording stopped");
    }

    void ConnectSpeechWebSocket()
    {
        try
        {
            speechWebSocket = new WebSocket(GetWebSocketUrl());
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
            return;
        }
        speechWebSocket.OnOpen += (sender, e) =>
        {
            Debug.Log("OnOpen");
            speechWebSocket.OnClose += OnWebSocketClose;
        };
        speechWebSocket.OnMessage += OnInitMessage;
        speechWebSocket.OnError += OnError;
        speechWebSocket.ConnectAsync();
        StartCoroutine(SendVoiceData());
    }

    void OnWebSocketClose(object sender, CloseEventArgs e)
    {
        Debug.Log("OnWebSocketClose");
    }

    // Messages arrive on the WebSocket thread; queue them here and consume them in Update on the main thread.
    private static Queue<string> messageQueue = new Queue<string>();

    void OnInitMessage(object sender, MessageEventArgs e)
    {
        Debug.Log("WebSocket message received: " + e.Data);
        messageQueue.Enqueue(e.Data);
    }

    private void MainThreadOnMessage(string message)
    {
        try
        {
            XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);
            if (0 != response.code)
            {
                return;
            }
            if (response.action.Equals("result"))
            {
                var result = ParseXunfeiRecognitionResult(response.data);
                if (result.IsFinal)
                {
                    Debug.Log("Final text: " + result.Text);
                    resultCallback?.Invoke(result.Text);
                }
                else
                {
                    Debug.Log("Intermediate text: " + result.Text);
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    void OnError(object sender, ErrorEventArgs e)
    {
        Debug.Log("WebSocket error: " + e.Message);
    }

    public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)
    {
        StringBuilder builder = new StringBuilder();
        SpeechRecognitionResult res = new SpeechRecognitionResult();
        try
        {
            // data -> cn -> st; "ed" == "0" marks an intermediate result, anything else is final.
            JsonData data = JsonMapper.ToObject(dataJson);
            JsonData cn = data["cn"];
            JsonData st = cn["st"];
            if (st["ed"].ToString().Equals("0"))
            {
                res.IsFinal = false;
            }
            else
            {
                res.IsFinal = true;
            }
            // Concatenate every word: st -> rt[] -> ws[] -> cw[] -> w
            JsonData rtArry = st["rt"];
            foreach (JsonData rtObject in rtArry)
            {
                JsonData wsArr = rtObject["ws"];
                foreach (JsonData wsObject in wsArr)
                {
                    JsonData cwArr = wsObject["cw"];
                    foreach (JsonData cwObject in cwArr)
                    {
                        builder.Append(cwObject["w"].ToString());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
        res.Text = builder.ToString();
        return res;
    }

    void SendData(byte[] voiceData)
    {
        Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);
        if (speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }
        try
        {
            if (speechWebSocket != null && speechWebSocket.IsAlive)
            {
                speechWebSocket.SendAsync(voiceData, success =>
                {
                    if (success)
                    {
                        Debug.Log("WebSocket: send succeeded, bytes: " + voiceData.Length);
                    }
                    else
                    {
                        Debug.Log("WebSocket: send failed");
                    }
                });
            }
        }
        catch
        {
        }
    }

    void SendEndMsg(System.Action callback)
    {
        // Tell iFlytek the audio stream is over so it can return the final result.
        string endMsg = "{\"end\": true}";
        byte[] data = Encoding.UTF8.GetBytes(endMsg);
        try
        {
            if (speechWebSocket != null && speechWebSocket.IsAlive)
            {
                speechWebSocket.SendAsync(data, success =>
                {
                    if (success)
                    {
                        Debug.Log("WebSocket: END sent, bytes: " + data.Length);
                    }
                    else
                    {
                        Debug.Log("WebSocket: END send failed");
                    }
                    callback?.Invoke();
                });
            }
        }
        catch
        {
        }
    }

    IEnumerator SendVoiceData()
    {
        yield return new WaitUntil(() => speechWebSocket.ReadyState == WebSocketState.Open);
        yield return new WaitWhile(() => Microphone.GetPosition(micphoneName) <= 0);
        float t = 0;
        int position = Microphone.GetPosition(micphoneName);
        const float waitTime = 0.04f; // send audio every 40 ms
        int lastPosition = 0;
        const int Maxlength = 640;    // maximum number of samples per packet
        while (position < RecordedClip.samples && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            t += waitTime;
            yield return new WaitForSecondsRealtime(waitTime);
            if (Microphone.IsRecording(micphoneName)) position = Microphone.GetPosition(micphoneName);
            if (position <= lastPosition)
            {
                Debug.LogWarning("Audio stream exhausted, stopping!");
                break;
            }
            int length = position - lastPosition > Maxlength ? Maxlength : position - lastPosition;
            byte[] data = GetClipData(lastPosition, length, RecordedClip);
            SendData(data);
            lastPosition = lastPosition + length;
        }
        // Wait one more interval before the end flag, otherwise iFlytek may miss it.
        yield return new WaitForSecondsRealtime(waitTime);
        SendEndMsg(null);
        Microphone.End(micphoneName);
    }

    public byte[] GetClipData(int start, int length, AudioClip recordedClip)
    {
        // Convert float samples in [-1, 1] to 16-bit little-endian PCM.
        float[] soundata = new float[length];
        recordedClip.GetData(soundata, start);
        int rescaleFactor = 32767;
        byte[] outData = new byte[soundata.Length * 2];
        for (int i = 0; i < soundata.Length; i++)
        {
            short temshort = (short)(soundata[i] * rescaleFactor);
            byte[] temdata = BitConverter.GetBytes(temshort);
            outData[i * 2] = temdata[0];
            outData[i * 2 + 1] = temdata[1];
        }
        return outData;
    }

    private string GetWebSocketUrl()
    {
        string appid = "appid";   // replace with your appid from the iFlytek console
        string ts = GetCurrentUnixTimestampSeconds().ToString();
        string baseString = appid + ts;
        string md5 = GetMD5Hash(baseString);
        Debug.Log("baseString:" + baseString + ",md5:" + md5);
        string sha1 = CalculateHmacSha1(md5, "appkey"); // replace "appkey" with your secret key
        string signa = sha1;
        string url = string.Format("ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}", appid, ts, signa);
        Debug.Log(url);
        return url;
    }

    // Unix timestamp in seconds: the rtasr handshake expects seconds, not milliseconds.
    private long GetCurrentUnixTimestampSeconds()
    {
        DateTime unixStartTime = new DateTime(1970, 1, 1).ToLocalTime();
        DateTime now = DateTime.Now;
        TimeSpan timeSpan = now - unixStartTime;
        long timestamp = (long)timeSpan.TotalSeconds;
        return timestamp;
    }

    public string GetMD5Hash(string input)
    {
        MD5 md5Hasher = MD5.Create();
        byte[] data = md5Hasher.ComputeHash(Encoding.Default.GetBytes(input));
        StringBuilder sBuilder = new StringBuilder();
        for (int i = 0; i < data.Length; i++)
        {
            sBuilder.Append(data[i].ToString("x2"));
        }
        return sBuilder.ToString();
    }

    public string CalculateHmacSha1(string data, string key)
    {
        HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key));
        byte[] hashBytes = hmac.ComputeHash(Encoding.UTF8.GetBytes(data));
        return Convert.ToBase64String(hashBytes);
    }

    private void Update()
    {
        // Consume queued WebSocket messages on the main thread.
        if (messageQueue.Count > 0)
        {
            MainThreadOnMessage(messageQueue.Dequeue());
        }
    }
}

JSON parsing classes:

[Serializable]
public struct XFResponse
{
    public string action;
    public int code;
    public string data;
    public string desc;
    public string sid;
}

[Serializable]
public struct SpeechRecognitionResult
{
    public string Text;
    public bool IsFinal;
}
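
For completeness, here is a minimal usage sketch. Only SpeechHelper and its InitSpeechHelper/StartSpeech/StopSpeech methods come from the listing above; the driver class, its name, and the button wiring are illustrative assumptions.

using UnityEngine;

// Hypothetical driver script: attach both this and SpeechHelper to a GameObject in the scene.
public class SpeechDemo : MonoBehaviour
{
    public SpeechHelper speechHelper; // assign in the Inspector

    void Start()
    {
        // Receive the final recognized text through the callback.
        speechHelper.InitSpeechHelper(text => Debug.Log("Recognized: " + text));
    }

    // Hook these up to UI buttons, hotkeys, etc.
    public void OnStartRecording() { speechHelper.StartSpeech(); }
    public void OnStopRecording() { speechHelper.StopSpeech(); }
}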

Points worth noting:

1. When using Microphone, passing the default device name works better than passing null.

2. The handshake timestamp is in seconds, not milliseconds (see the signing sketch after this list).

3. When uploading the end flag, also wait 40 ms first; otherwise iFlytek behaves as if it never received it.

4. If Microphone.devices has length 0 even though the PC does have a microphone, the cause may be that the microphone's device name contains Chinese characters.
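
To make point 2 concrete: the handshake signature built in GetWebSocketUrl boils down to signa = Base64(HmacSHA1(MD5(appid + ts), apiKey)), where ts is the Unix timestamp in whole seconds. A condensed sketch of that calculation (it reuses the helper methods of SpeechHelper; the credential strings are placeholders):

// ts must be seconds since the Unix epoch, not milliseconds.
long ts = (long)(DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc)).TotalSeconds;
string md5 = GetMD5Hash("yourAppId" + ts);             // hex MD5 of appid + ts
string signa = CalculateHmacSha1(md5, "yourApiKey");   // Base64(HmacSHA1(md5, apiKey))
string url = "ws://rtasr.xfyun.cn/v1/ws?appid=yourAppId&ts=" + ts + "&signa=" + signa;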

Open issue:

yield return new WaitForSecondsRealtime(0.04f) actually waits about 0.1 s per iteration, so audio packets go out slowly and recognition lags behind.

Update 2024-05-24: Part 2 has been published; it effectively solves the slow-sending / slow-recognition problem.

Update 2024-06-19: data is no longer sent from the coroutine; it is sent directly in Update instead, which fixes the slow sending problem.

private void Update()
{
    if (isRunning)
    {
        // Pull whatever new audio the microphone has produced since last frame and send it immediately.
        byte[] voiceData = GetVoiceData();
        if (voiceData != null)
        {
            SendData(voiceData);
        }
    }
    if (messageQueue.Count > 0)
    {
        MainThreadOnMessage(messageQueue.Dequeue());
    }
}

private int last_length = -1;
private float[] volumeData = new float[9999];
private short[] intData = new short[9999];
bool isRunning;

private byte[] GetVoiceData()
{
    if (RecordedClip == null)
    {
        return null;
    }
    int new_length = Microphone.GetPosition(null);
    if (new_length == last_length)
    {
        // No new samples; if the device disappeared, stop sending.
        if (Microphone.devices.Length == 0)
        {
            isRunning = false;
        }
        return null;
    }
    int length = new_length - last_length;
    int offset = last_length + 1; // first sample not yet sent
    last_length = new_length;
    if (offset < 0)
    {
        return null;
    }
    if (length < 0)
    {
        // Recording position wrapped around the clip buffer: copy the tail of the clip, then the head.
        float[] temp = new float[RecordedClip.samples];
        RecordedClip.GetData(temp, 0);
        int lengthTail = RecordedClip.samples - offset;
        int lengthHead = new_length + 1;
        try
        {
            Array.Copy(temp, offset, volumeData, 0, lengthTail);
            Array.Copy(temp, 0, volumeData, lengthTail, lengthHead); // head goes right after the tail
            length = lengthTail + lengthHead;
        }
        catch (Exception)
        {
            return null;
        }
    }
    else
    {
        if (length > volumeData.Length)
        {
            volumeData = new float[length];
            intData = new short[length];
        }
        RecordedClip.GetData(volumeData, offset);
    }
    // Convert float samples to 16-bit little-endian PCM.
    byte[] bytesData = new byte[length * 2];
    int rescaleFactor = 32767; // to convert float to Int16
    for (int i = 0; i < length; i++)
    {
        intData[i] = (short)(volumeData[i] * rescaleFactor);
        byte[] byteArr = BitConverter.GetBytes(intData[i]);
        byteArr.CopyTo(bytesData, i * 2);
    }
    return bytesData;
}
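
One thing the snippet above leaves out is where isRunning and last_length get set. A sketch of how StartSpeech and StopSpeech might be adjusted for the Update-based approach (this wiring is my assumption, not shown in the original post); note that ConnectSpeechWebSocket no longer needs to start the SendVoiceData coroutine:

// Assumed wiring for the Update-based sending path.
public void StartSpeech()
{
    // ...same connection-state and microphone checks as before...
    // Loop the recording so GetVoiceData's wrap-around branch (length < 0) can handle long sessions.
    RecordedClip = Microphone.Start(micphoneName, true, 60, 16000);
    last_length = -1;   // start reading from the beginning of the clip
    isRunning = true;   // Update() now pulls and sends audio every frame
    ConnectSpeechWebSocket(); // without StartCoroutine(SendVoiceData())
}

public void StopSpeech()
{
    isRunning = false;            // stop sending in Update
    SendEndMsg(null);             // still send {"end": true} so iFlytek finalizes the result
    Microphone.End(micphoneName);
    Debug.Log("Recognition finished, recording stopped");
}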
