赞
踩
科大讯飞官方文档:实时语音转写 API 文档 | 讯飞开放平台文档中心 (xfyun.cn)
参考文章:unity通过WebAPI连接Websocket实现讯飞语音识别与合成。_unity websocket audio-CSDN博客
要实现语音转文字。首先我们需要从麦克风获取到语音数据,这里用到了Microphone类,Unity自带;其次,需要将语音数据发送给讯飞,这里用到的是WebSocketSharp.WebSocket,用习惯了。然后就是按照文档一步步踩坑了。
直接贴代码了。代码主要实现握手阶段参数签名,实时通信阶段的数据传输以及结果解析。
- using System.Collections;
- using System.Collections.Generic;
- using UnityEngine;
- using System;
- using WebSocketSharp;
- using System.Text;
- using System.Security.Cryptography;
- using LitJson;
- using Newtonsoft.Json;
-
/// <summary>
/// Streams microphone audio to the iFlytek real-time speech transcription (RTASR)
/// WebSocket service and reports the recognized text.
/// Typical use: <see cref="InitSpeechHelper"/> once, then <see cref="StartSpeech"/> /
/// <see cref="StopSpeech"/> per utterance.
/// </summary>
public class SpeechHelper : MonoBehaviour
{
    /// <summary>Raised on the main thread with the text of every final recognition result.</summary>
    public event Action<string> 语音识别完成事件; // speech-recognition-completed event

    /// <summary>Clip the microphone records into; audio frames are read back from it.</summary>
    public AudioClip RecordedClip;

    // Credentials of the iFlytek application. The defaults are the placeholders the
    // original code hard-coded; set the real values in the Inspector.
    [SerializeField] private string appId = "appid";
    [SerializeField] private string apiKey = "appkey";

    private const int SampleRate = 16000;        // Hz, passed to Microphone.Start
    private const int MaxRecordSeconds = 60;     // length of the (non-looping) recording clip
    private const float SendInterval = 0.04f;    // one audio frame every 40 ms
    private const int MaxSamplesPerFrame = 640;  // 640 samples * 2 bytes = 1280 bytes per frame

    private string micphoneName = string.Empty;
    private WebSocket speechWebSocket;
    private Coroutine sendCoroutine;
    private System.Action<string> resultCallback;

    // WebSocket events are raised on a worker thread, so raw messages are queued here
    // and consumed in Update(). Every access to the queue goes through queueLock.
    private readonly object queueLock = new object();
    private readonly Queue<string> messageQueue = new Queue<string>();

    /// <summary>Registers the callback that receives every final recognition text.</summary>
    /// <param name="textCallback">Invoked on the main thread; may be null.</param>
    public void InitSpeechHelper(System.Action<string> textCallback)
    {
        resultCallback = textCallback;
    }

    /// <summary>
    /// Starts recording from the first microphone and opens a new recognition session.
    /// Does nothing (with a warning) while a previous session is still connecting/open
    /// or when no microphone is available.
    /// </summary>
    public void StartSpeech()
    {
        if (speechWebSocket != null &&
            (speechWebSocket.ReadyState == WebSocketState.Open ||
             speechWebSocket.ReadyState == WebSocketState.Connecting))
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }
        if (Microphone.devices.Length <= 0)
        {
            Debug.LogWarning("找不到麦克风");
            return;
        }

        lock (queueLock)
        {
            messageQueue.Clear();
        }
        micphoneName = Microphone.devices[0];
        Debug.Log("micphoneName:" + micphoneName);
        try
        {
            RecordedClip = Microphone.Start(micphoneName, false, MaxRecordSeconds, SampleRate);
            if (RecordedClip == null)
            {
                Debug.LogError("麦克风启动失败");
                return;
            }
            if (!ConnectSpeechWebSocket())
            {
                // No session to feed, so do not leave the microphone running.
                Microphone.End(micphoneName);
            }
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
            Microphone.End(micphoneName);
        }
    }

    /// <summary>
    /// Stops recording. The sender coroutine flushes what was already recorded and then
    /// sends the end-of-stream marker.
    /// </summary>
    public void StopSpeech()
    {
        Microphone.End(micphoneName);
        Debug.Log("识别结束,停止录音");
    }

    /// <summary>
    /// Creates the WebSocket, wires all handlers before connecting and starts the sender
    /// coroutine. Returns false when the socket could not be created.
    /// </summary>
    bool ConnectSpeechWebSocket()
    {
        // Release whatever a previous session left behind.
        CloseSpeechWebSocket();
        if (sendCoroutine != null)
        {
            StopCoroutine(sendCoroutine);
            sendCoroutine = null;
        }

        WebSocket socket;
        try
        {
            socket = new WebSocket(GetWebSocketUrl());
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
            return false;
        }

        // Subscribe up front so a failed handshake is reported through OnClose/OnError too.
        socket.OnOpen += OnWebSocketOpen;
        socket.OnClose += OnWebSocketClose;
        socket.OnMessage += OnInitMessage;
        socket.OnError += OnError;
        speechWebSocket = socket;
        socket.ConnectAsync();
        sendCoroutine = StartCoroutine(SendVoiceData());
        return true;
    }

    /// <summary>Detaches the handlers from the current socket and closes it asynchronously.</summary>
    void CloseSpeechWebSocket()
    {
        WebSocket socket = speechWebSocket;
        if (socket == null)
        {
            return;
        }
        speechWebSocket = null;
        socket.OnOpen -= OnWebSocketOpen;
        socket.OnClose -= OnWebSocketClose;
        socket.OnMessage -= OnInitMessage;
        socket.OnError -= OnError;
        if (socket.ReadyState != WebSocketState.Closed)
        {
            socket.CloseAsync();
        }
    }

    void OnWebSocketOpen(object sender, EventArgs e)
    {
        Debug.Log("OnOpen");
    }

    void OnWebSocketClose(object sender, CloseEventArgs e)
    {
        Debug.Log("OnWebSocketClose");
    }

    /// <summary>Runs on the WebSocket thread: only queues the payload for the main thread.</summary>
    void OnInitMessage(object sender, MessageEventArgs e)
    {
        UnityEngine.Debug.Log("qqqqqqqqqqqqqWebSocket数据返回:" + e.Data);
        lock (queueLock)
        {
            messageQueue.Enqueue(e.Data);
        }
    }

    /// <summary>
    /// Handles one server message on the main thread: logs service errors and forwards
    /// final recognition results to the callback and the event.
    /// </summary>
    private void MainThreadOnMessage(string message)
    {
        try
        {
            XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);
            if (response.code != 0)
            {
                // Previously dropped silently, which made authentication errors invisible.
                Debug.LogError("语音识别服务返回错误,code:" + response.code + ",desc:" + response.desc);
                return;
            }
            if (!"result".Equals(response.action))
            {
                return;
            }

            SpeechRecognitionResult result = ParseXunfeiRecognitionResult(response.data);
            if (result.IsFinal)
            {
                Debug.Log("Text最终:" + result.Text);
                resultCallback?.Invoke(result.Text);
                语音识别完成事件?.Invoke(result.Text);
            }
            else
            {
                Debug.Log("Text中间:" + result.Text);
            }
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
        }
    }

    void OnError(object sender, ErrorEventArgs e)
    {
        UnityEngine.Debug.Log("WebSoclet:发生错误:" + e.Message);
    }

    /// <summary>
    /// Extracts the recognized text from the "data" payload of a result message by
    /// concatenating every cn.st.rt[].ws[].cw[].w value.
    /// </summary>
    /// <param name="dataJson">JSON text of the "data" field.</param>
    /// <returns>
    /// The text collected so far and the final/intermediate flag. On a parse error the
    /// error is logged and whatever was collected before the failure is returned.
    /// </returns>
    public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)
    {
        StringBuilder builder = new StringBuilder();
        SpeechRecognitionResult res = new SpeechRecognitionResult();
        try
        {
            JsonData st = JsonMapper.ToObject(dataJson)["cn"]["st"];
            // A sentence end time ("ed") of "0" is treated as an intermediate result.
            res.IsFinal = !st["ed"].ToString().Equals("0");
            foreach (JsonData rtObject in st["rt"])
            {
                foreach (JsonData wsObject in rtObject["ws"])
                {
                    foreach (JsonData cwObject in wsObject["cw"])
                    {
                        builder.Append(cwObject["w"].ToString());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
        }
        res.Text = builder.ToString();
        return res;
    }

    /// <summary>Sends one binary audio frame if the socket is open; otherwise does nothing.</summary>
    void SendData(byte[] voiceData)
    {
        if (voiceData == null || voiceData.Length == 0)
        {
            return;
        }
        Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);

        // ReadyState is a cheap property. WebSocket.IsAlive pings the server and blocks
        // until the pong arrives, so it must not be called on the main thread per frame.
        WebSocket socket = speechWebSocket;
        if (socket == null || socket.ReadyState != WebSocketState.Open)
        {
            return;
        }
        try
        {
            socket.SendAsync(voiceData, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送成功:" + voiceData.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送失败:");
                }
            });
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
        }
    }

    /// <summary>Sends the end-of-stream marker.</summary>
    /// <param name="callback">
    /// Invoked once the attempt has finished, whether or not the marker was sent. When
    /// the marker is sent, this runs from the send-completion handler and may therefore
    /// not be on the main thread.
    /// </param>
    void SendEndMsg(System.Action callback)
    {
        byte[] data = Encoding.UTF8.GetBytes("{\"end\": true}");
        WebSocket socket = speechWebSocket;
        if (socket == null || socket.ReadyState != WebSocketState.Open)
        {
            callback?.Invoke();
            return;
        }
        try
        {
            socket.SendAsync(data, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END成功:" + data.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END失败:");
                }
                callback?.Invoke();
            });
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
            callback?.Invoke();
        }
    }

    /// <summary>
    /// Sender loop: waits for the connection and the first samples, then every 40 ms
    /// sends at most one 1280-byte frame of newly recorded audio, and finally the end marker.
    /// </summary>
    IEnumerator SendVoiceData()
    {
        WebSocket socket = speechWebSocket;
        AudioClip clip = RecordedClip;

        // Wait for the handshake to finish either way; a failed connection ends up
        // Closed and must not leave this coroutine (and the microphone) running forever.
        yield return new WaitUntil(() =>
            socket.ReadyState == WebSocketState.Open || socket.ReadyState == WebSocketState.Closed);
        if (socket.ReadyState != WebSocketState.Open)
        {
            Debug.LogWarning("WebSocket连接失败,停止录音");
            Microphone.End(micphoneName);
            sendCoroutine = null;
            yield break;
        }

        // Wait for the first samples, but stop waiting if recording was ended meanwhile.
        yield return new WaitWhile(() =>
            Microphone.IsRecording(micphoneName) && Microphone.GetPosition(micphoneName) <= 0);

        int position = Microphone.GetPosition(micphoneName);
        int lastPosition = 0;
        while (position < clip.samples && socket.ReadyState == WebSocketState.Open)
        {
            yield return new WaitForSecondsRealtime(SendInterval);
            if (Microphone.IsRecording(micphoneName))
            {
                position = Microphone.GetPosition(micphoneName);
            }
            if (position <= lastPosition)
            {
                // Recording has stopped and everything recorded has been sent.
                Debug.LogWarning("字节流发送完毕!强制结束!");
                break;
            }
            int length = Mathf.Min(position - lastPosition, MaxSamplesPerFrame);
            SendData(GetClipData(lastPosition, length, clip));
            lastPosition += length;
        }

        // The end marker has to keep the same pacing as the audio frames.
        yield return new WaitForSecondsRealtime(SendInterval);
        SendEndMsg(null);
        Microphone.End(micphoneName);
        sendCoroutine = null;
    }

    /// <summary>Reads samples from a clip and converts them to 16-bit little-endian PCM.</summary>
    /// <param name="star">Index of the first sample to read.</param>
    /// <param name="length">Number of samples to read.</param>
    /// <param name="recordedClip">Clip to read from.</param>
    /// <returns>PCM bytes, two per sample.</returns>
    public byte[] GetClipData(int star, int length, AudioClip recordedClip)
    {
        float[] samples = new float[length];
        recordedClip.GetData(samples, star);
        return ConvertToPcm16(samples, length);
    }

    /// <summary>Converts the first <paramref name="count"/> float samples to 16-bit little-endian PCM.</summary>
    private static byte[] ConvertToPcm16(float[] samples, int count)
    {
        const int rescaleFactor = 32767;
        byte[] pcm = new byte[count * 2];
        for (int i = 0; i < count; i++)
        {
            // Clamp first: a sample outside [-1, 1] would overflow the short conversion.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * rescaleFactor);
            // Explicit byte order instead of BitConverter: no per-sample allocation and
            // independent of the host's endianness.
            pcm[i * 2] = (byte)(value & 0xFF);
            pcm[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }
        return pcm;
    }

    /// <summary>
    /// Builds the handshake URL: signa = Base64(HmacSHA1(MD5(appid + ts), apiKey)),
    /// URL-encoded before it is placed in the query string.
    /// </summary>
    private string GetWebSocketUrl()
    {
        string ts = GetCurrentUnixTimestampSeconds().ToString();
        string baseString = appId + ts;
        string md5 = GetMD5Hash(baseString);
        UnityEngine.Debug.Log("baseString:" + baseString + ",md5:" + md5);
        // Base64 can contain '+', '/' and '=', which are not safe inside a query string.
        string signa = Uri.EscapeDataString(CalculateHmacSha1(md5, apiKey));
        string url = string.Format("ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}", appId, ts, signa);
        UnityEngine.Debug.Log(url);
        return url;
    }

    /// <summary>
    /// Current Unix time in seconds (the handshake expects seconds, not milliseconds).
    /// Computed in UTC so time zone and daylight-saving offsets cannot skew it.
    /// </summary>
    private long GetCurrentUnixTimestampSeconds()
    {
        DateTime epoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        return (long)(DateTime.UtcNow - epoch).TotalSeconds;
    }

    /// <summary>Returns the lowercase hexadecimal MD5 digest of the UTF-8 bytes of the input.</summary>
    public string GetMD5Hash(string input)
    {
        using (MD5 md5Hasher = MD5.Create())
        {
            // UTF-8 rather than Encoding.Default, which differs from machine to machine.
            byte[] digest = md5Hasher.ComputeHash(Encoding.UTF8.GetBytes(input));
            StringBuilder hex = new StringBuilder(digest.Length * 2);
            foreach (byte b in digest)
            {
                hex.Append(b.ToString("x2"));
            }
            return hex.ToString();
        }
    }

    /// <summary>Returns the Base64-encoded HMAC-SHA1 of <paramref name="data"/> keyed with <paramref name="key"/>.</summary>
    public string CalculateHmacSha1(string data, string key)
    {
        using (HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key)))
        {
            return Convert.ToBase64String(hmac.ComputeHash(Encoding.UTF8.GetBytes(data)));
        }
    }

    /// <summary>Processes, on the main thread, every message queued by the WebSocket thread.</summary>
    private void Update()
    {
        while (true)
        {
            string message;
            lock (queueLock)
            {
                if (messageQueue.Count == 0)
                {
                    return;
                }
                message = messageQueue.Dequeue();
            }
            // Handled outside the lock so the WebSocket thread is never blocked by callbacks.
            MainThreadOnMessage(message);
        }
    }

    /// <summary>Stops the microphone and closes the connection when the component goes away.</summary>
    private void OnDestroy()
    {
        if (RecordedClip != null && Microphone.IsRecording(micphoneName))
        {
            Microphone.End(micphoneName);
        }
        CloseSpeechWebSocket();
    }
}
Json解析类。
- [Serializable] // Message envelope: MainThreadOnMessage deserializes each incoming WebSocket JSON text into this struct.
- public struct XFResponse
- {
- public string action; // Message kind; only the value "result" is processed by MainThreadOnMessage.
- public int code; // Status code; 0 is treated as success, any other value makes the handler return early.
- public string data; // For "result" messages: a JSON string that is handed to ParseXunfeiRecognitionResult.
- public string desc; // Not read by the code shown here; presumably a status description - confirm against the API docs.
- public string sid; // Not read by the code shown here; presumably a session identifier - confirm against the API docs.
- }
- [Serializable] // Value returned by ParseXunfeiRecognitionResult.
- public struct SpeechRecognitionResult
- {
- public string Text; // Concatenation of every "w" value found under cn.st.rt[].ws[].cw[] in the parsed payload.
- public bool IsFinal; // False when the payload's "ed" value equals "0", true otherwise; stays false if parsing fails before "ed" is read.
- }
值得注意的问题。
1、Microphone使用时传默认设备名比传null好使
2、握手阶段时间戳用的是秒(不是毫秒)
3、上传结束标志时,也要间隔40ms,否则讯飞像是没收到一样
4、如果Microphone.devices的长度为0,电脑确实又有麦克风设备,那么可能是麦克风的名字是中文导致的
遗留问题:
协程中的 yield return new WaitForSecondsRealtime(0.04f) 实际间隔约为 0.1s(可能是因为协程只在帧更新时恢复执行,再加上每次发送本身的耗时),导致音频发送得很慢,语音识别也随之变慢。
2024.5.24更新第二篇,有效解决消息发送慢,识别慢的问题
2024.6.19更新:取消协程中发送数据,直接在Update中发送。解决消息发送很慢问题
// Per-frame pump: while a session is running, forward the audio recorded since the
// previous frame, then hand at most one queued server message to the main-thread handler.
private void Update()
{
    byte[] pendingAudio = isRunning ? GetVoiveData() : null;
    if (pendingAudio != null)
    {
        SendData(pendingAudio);
    }

    if (messageQueue.Count == 0)
    {
        return;
    }
    MainThreadOnMessage(messageQueue.Dequeue());
}
-
// Microphone position (in samples) up to which audio has already been returned.
// A negative value means nothing has been read yet.
private int last_length = -1;
// Scratch buffers reused between frames; grown on demand.
private float[] volumeData = new float[9999];
private short[] intData = new short[9999];
bool isRunning;

/// <summary>
/// Returns the audio recorded since the previous call as 16-bit little-endian PCM,
/// or null when there is nothing new to send.
/// </summary>
/// <remarks>
/// Clears <c>isRunning</c> when the position did not advance and no microphone device
/// is left. NOTE(review): the position is read from <c>micphoneName</c>, the device the
/// recording is started on elsewhere in this class, instead of the default device (null).
/// </remarks>
private byte[] GetVoiveData()
{
    if (RecordedClip == null)
    {
        return null;
    }

    // GetPosition reports the index of the next sample to be written, so the unread
    // range is [readStart, new_length); the sample at new_length does not exist yet.
    int readStart = Math.Max(last_length, 0);
    int new_length = Microphone.GetPosition(micphoneName);
    if (new_length == readStart)
    {
        if (Microphone.devices.Length == 0)
        {
            isRunning = false;
        }
        return null;
    }
    last_length = new_length;

    int length = new_length - readStart;
    if (length < 0)
    {
        // The position moved backwards. While recording, that means a looping clip
        // wrapped around. Otherwise recording has ended and the position was reset, so
        // the rest of the clip must not be sent as if it were audio.
        if (!Microphone.IsRecording(micphoneName))
        {
            return null;
        }
        length += RecordedClip.samples;
    }
    if (length <= 0)
    {
        return null;
    }

    if (length > volumeData.Length)
    {
        volumeData = new float[length];
    }
    if (length > intData.Length)
    {
        intData = new short[length];
    }

    // AudioClip.GetData wraps around to the start of the clip when the read runs past
    // its end, so one call covers the wrapped case as well; only the first `length`
    // samples of the buffer are used.
    RecordedClip.GetData(volumeData, readStart);

    const int rescaleFactor = 32767; // converts a [-1, 1] float sample to Int16
    byte[] bytesData = new byte[length * 2];
    for (int i = 0; i < length; i++)
    {
        float sample = volumeData[i];
        if (sample > 1f)
        {
            sample = 1f;
        }
        else if (sample < -1f)
        {
            sample = -1f;
        }
        short value = (short)(sample * rescaleFactor);
        intData[i] = value;
        // Explicit little-endian bytes: no per-sample array allocation every frame.
        bytesData[i * 2] = (byte)(value & 0xFF);
        bytesData[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
    }
    return bytesData;
}
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。