Unity: iFlytek (讯飞) speech recognition and synthesis over the WebAPI via WebSocket.
Download: https://download.csdn.net/download/qq_39735878/12447473
This Unity implementation of iFlytek speech recognition, synthesis, and evaluation goes through the WebAPI: it connects to the iFlytek WebSocket server directly, with no SDK and no extra plugins. Built with Unity 2019.3.6, it is genuinely cross-platform, so there is no need to re-integrate a separate SDK on every target platform.
A recent project needed speech recognition, synthesis, and evaluation, and iFlytek fit the requirements, so the original plan was to integrate the iFlytek SDK. However, iFlytek only ships per-platform SDKs and has no Unity plugin. Reading the developer documentation showed that the WebAPI can do the job instead and works the same way on every platform. iFlytek speech synthesis and recognition use the WebSocket protocol, so the first step is to build the authentication for the endpoint: during the handshake the client signs the request, and the server checks the signature to validate it. The code is as follows:
string GetUrl(string uriStr)
{
    Uri uri = new Uri(uriStr);
    // RFC1123 date; it must be GMT, so use UtcNow rather than Now.
    string date = DateTime.UtcNow.ToString("r");
    // The plaintext that gets signed: host, date, and the request line.
    string signature_origin = string.Format("host: {0}\ndate: {1}\nGET {2} HTTP/1.1", uri.Host, date, uri.AbsolutePath);
    HMACSHA256 mac = new HMACSHA256(Encoding.UTF8.GetBytes(APISecret));
    string signature = Convert.ToBase64String(mac.ComputeHash(Encoding.UTF8.GetBytes(signature_origin)));
    string authorization_origin = string.Format("api_key=\"{0}\",algorithm=\"hmac-sha256\",headers=\"host date request-line\",signature=\"{1}\"", APIKey, signature);
    string authorization = Convert.ToBase64String(Encoding.UTF8.GetBytes(authorization_origin));
    // Assemble the final handshake URL with the authorization, date, and host query parameters.
    string url = string.Format("{0}?authorization={1}&date={2}&host={3}", uri, authorization, date, uri.Host);
    return url;
}
Once the authenticated URL is built, you can connect to the iFlytek WebSocket server. After connecting, the recording produced by Unity has to be turned from an AudioClip into a byte[], and chunks of it have to be sent at regular intervals until recording ends. Here is one way to slice the audio stream:
public static byte[] 获取音频流片段(int start, int length, AudioClip recordedClip)
{
    // Read 'length' float samples starting at sample offset 'start'.
    float[] soundData = new float[length];
    recordedClip.GetData(soundData, start);
    // Convert the [-1, 1] floats to 16-bit PCM, two bytes per sample.
    int rescaleFactor = 32767;
    byte[] outData = new byte[soundData.Length * 2];
    for (int i = 0; i < soundData.Length; i++)
    {
        short tempShort = (short)(soundData[i] * rescaleFactor);
        byte[] tempData = BitConverter.GetBytes(tempShort);
        outData[i * 2] = tempData[0];
        outData[i * 2 + 1] = tempData[1];
    }
    return outData;
}
The start parameter is the sample offset into the recording, so it depends on the sample rate: at 16000 Hz, start = 16000 marks the position one second into the clip, and so on. Likewise, length is a sample count, so with a 16000 Hz clip, start = 16000·x and length = 16000·y select a slice y seconds long beginning x seconds in.
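To make the indexing concrete, here is a small usage sketch (the variable names and the specific offsets are mine, not part of the original code; the clip is assumed to be a 16 kHz recording from Microphone.Start):

// Sketch: pull half a second of PCM starting one second into a 16 kHz recording.
AudioClip recordedClip = Microphone.Start(null, false, 60, 16000);
// ...wait until at least 1.5 s has been recorded...
int start = 16000 * 1;   // sample offset: 1 second in
int length = 16000 / 2;  // sample count: 0.5 seconds
byte[] pcm = 获取音频流片段(start, length, recordedClip); // 16-bit PCM bytes

With the indexing understood, the rest of the recognition logic can be put together from the docs: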
string APPID = "5c81de59";
string APISecret = "ea4d5e9b06f8cfb0deae4d5360e7f8a7";
string APIKey = "94348d7a6d5f3807176cb1f4923efa5c";
public event Action<string> 语音识别完成事件; // callback raised when recognition finishes
public AudioClip RecordedClip;
ClientWebSocket 语音识别WebSocket;

public void 开始语音识别()
{
    if (语音识别WebSocket != null && 语音识别WebSocket.State == WebSocketState.Open)
    {
        Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
        return;
    }
    连接语音识别WebSocket();
    RecordedClip = Microphone.Start(null, false, 60, 16000); // record up to 60 s of mono audio at 16 kHz
}

public IEnumerator 停止语音识别()
{
    Microphone.End(null);
    yield return new WaitUntil(() => 语音识别WebSocket.State != WebSocketState.Open);
    Debug.Log("识别结束,停止录音");
}
async void 连接语音识别WebSocket()
{
    using (语音识别WebSocket = new ClientWebSocket())
    {
        CancellationToken ct = new CancellationToken();
        Uri url = new Uri(GetUrl("wss://iat-api.xfyun.cn/v2/iat"));
        await 语音识别WebSocket.ConnectAsync(url, ct);
        Debug.Log("连接成功");
        StartCoroutine(发送录音数据流(语音识别WebSocket));
        StringBuilder stringBuilder = new StringBuilder();
        while (语音识别WebSocket.State == WebSocketState.Open)
        {
            var result = new byte[4096];
            await 语音识别WebSocket.ReceiveAsync(new ArraySegment<byte>(result), ct); // receive a frame
            // Trim the trailing zero bytes of the fixed-size buffer.
            List<byte> list = new List<byte>(result);
            while (list[list.Count - 1] == 0x00) list.RemoveAt(list.Count - 1);
            string str = Encoding.UTF8.GetString(list.ToArray());
            Debug.Log("接收消息:" + str);
            stringBuilder.Append(获取识别单词(str));
            JSONNode js = JSONNode.Parse(str);
            JSONNode data = js["data"];
            if (data["status"] == 2) // status == 2 marks the final result frame
            {
                语音识别WebSocket.Abort();
            }
        }
        Debug.LogWarning("断开连接");
        string s = stringBuilder.ToString();
        if (!string.IsNullOrEmpty(s))
        {
            语音识别完成事件?.Invoke(s);
            Debug.LogWarning("识别到声音:" + s);
        }
    }
}
string 获取识别单词(string str)
{
    // Walk data.result.ws[].cw[].w in the result JSON and concatenate the recognized words.
    StringBuilder stringBuilder = new StringBuilder();
    if (!string.IsNullOrEmpty(str))
    {
        JSONNode recivejson = JSONNode.Parse(str);
        JSONNode ws = recivejson["data"]["result"]["ws"];
        foreach (JSONNode item in ws)
        {
            JSONNode cw = item["cw"];
            foreach (JSONNode item1 in cw)
            {
                stringBuilder.Append((string)item1["w"]);
            }
        }
    }
    return stringBuilder.ToString();
}
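As a quick sanity check of the parsing, the helper can be fed a hand-written response in the shape it expects, data.result.ws[].cw[].w (the sample below is mine, not an actual server reply; real responses also carry fields such as code, message, and sid, which this parser simply ignores):

// Hand-written sample in the shape 获取识别单词 reads; not a real server response.
string sample = "{\"data\":{\"status\":1,\"result\":{\"ws\":[{\"cw\":[{\"w\":\"你\"}]},{\"cw\":[{\"w\":\"好\"}]}]}}}";
Debug.Log(获取识别单词(sample)); // expected to print 你好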
void 发送数据(byte[] audio, int status, ClientWebSocket socket)
{
    if (socket.State != WebSocketState.Open)
    {
        return;
    }
    JSONNode jn = new JSONNode
    {
        { "common", new JSONNode { { "app_id", APPID } } },
        {
            "business", new JSONNode
            {
                { "language", "zh_cn" },   // recognition language
                { "domain", "iat" },
                { "accent", "mandarin" },
                { "vad_eos", 2000 }        // end-of-speech silence threshold (ms)
            }
        },
        {
            "data", new JSONNode
            {
                { "status", 0 },
                { "encoding", "raw" },                // raw PCM
                { "format", "audio/L16;rate=16000" }  // 16-bit, 16 kHz
            }
        }
    };
    JSONNode data = jn["data"];
    if (status < 2)
    {
        data["audio"] = Convert.ToBase64String(audio); // audio payload is base64-encoded
    }
    data["status"] = status; // 0 = first frame, 1 = intermediate frame, 2 = last frame
    Debug.Log("发送消息:" + jn);
    socket.SendAsync(new ArraySegment<byte>(Encoding.UTF8.GetBytes(jn)), WebSocketMessageType.Binary, true, new CancellationToken()); // send the frame
}
IEnumerator 发送录音数据流(ClientWebSocket socket)
{
    // Wait until the microphone has actually started producing samples.
    yield return new WaitWhile(() => Microphone.GetPosition(null) <= 0);
    float t = 0;
    int position = Microphone.GetPosition(null);
    const float waitTime = 0.04f; // send audio every 40 ms
    int status = 0;               // 0 for the first frame, then 1 for the following frames
    int lastPosition = 0;
    const int Maxlength = 640;    // maximum samples per frame (640 samples = 40 ms at 16 kHz)
    while (position < RecordedClip.samples && socket.State == WebSocketState.Open)
    {
        t += waitTime;
        yield return new WaitForSecondsRealtime(waitTime);
        if (Microphone.IsRecording(null)) position = Microphone.GetPosition(null);
        Debug.Log("录音时长:" + t + "position=" + position + ",lastPosition=" + lastPosition);
        if (position <= lastPosition)
        {
            Debug.LogWarning("字节流发送完毕!强制结束!");
            break;
        }
        // Send at most Maxlength new samples since the previous frame.
        int length = position - lastPosition > Maxlength ? Maxlength : position - lastPosition;
        byte[] data = 获取音频流片段(lastPosition, length, RecordedClip);
        发送数据(data, status, socket);
        lastPosition = lastPosition + length;
        status = 1;
    }
    发送数据(null, 2, socket); // last frame: tell the server the audio has finished
    //WebSocket.CloseAsync(WebSocketCloseStatus.NormalClosure, "关闭WebSocket连接", new CancellationToken());
    Microphone.End(null);
}
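Before moving on to synthesis, here is a minimal sketch of how the recognition methods above might be driven from another script. The component name XunfeiVoice is an assumption made for illustration; in the original code these members simply live on some MonoBehaviour:

using UnityEngine;

public class VoiceDemo : MonoBehaviour
{
    public XunfeiVoice voice; // hypothetical component holding the members shown above

    void Start()
    {
        voice.语音识别完成事件 += text => Debug.Log("识别结果: " + text);
    }

    void Update()
    {
        if (Input.GetKeyDown(KeyCode.Space)) voice.开始语音识别();               // hold Space to record
        if (Input.GetKeyUp(KeyCode.Space)) StartCoroutine(voice.停止语音识别()); // release to stop and wait for the result
    }
}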
Now for speech synthesis. The main difficulty is playing back the streamed audio: the server does not send the data in one piece, so it has to be received in segments and parsed, and the byte[] the client receives has to be converted into the float[] format an AudioClip can read. The following helpers do that conversion:
public static float[] bytesToFloat(byte[] byteArray) // convert a byte[] into the float[] an AudioClip can read
{
    float[] soundData = new float[byteArray.Length / 2];
    for (int i = 0; i < soundData.Length; i++)
    {
        soundData[i] = bytesToFloat(byteArray[i * 2], byteArray[i * 2 + 1]);
    }
    return soundData;
}

static float bytesToFloat(byte firstByte, byte secondByte)
{
    // Convert two bytes into one short; the byte order depends on the platform's endianness.
    short s;
    if (BitConverter.IsLittleEndian)
        s = (short)((secondByte << 8) | firstByte);
    else
        s = (short)((firstByte << 8) | secondByte);
    // Scale to the range -1 .. just below 1.
    return s / 32768.0F;
}
Each time a segment of the audio stream is received and parsed successfully, the resulting float[] is pushed onto a queue ready for playback. You also need a way to tell when the stream has finished playing so playback can be stopped. The approach here is to keep a counter for the total length of the stream: it grows whenever new audio data arrives or a silent frame is played, and comparing it against the number of samples already played tells you when to stop. The complete synthesis code follows:
public AudioSource 语音合成流播放器;
ClientWebSocket 语音合成WebSocket;
int 语音流总长度 = 0;
Queue<float> 播放队列 = new Queue<float>();

/// <summary>
/// Start speech synthesis
/// </summary>
/// <param name="text">the string to synthesize</param>
/// <param name="voice">voice, defaults to xiaoyan</param>
/// <param name="speed">speaking rate, 0~100, defaults to 50</param>
/// <param name="volume">volume, 0~100, defaults to 50</param>
public IEnumerator 开始语音合成(String text, string voice = "xiaoyan", int speed = 50, int volume = 50)
{
    if (语音合成WebSocket != null)
    {
        语音合成WebSocket.Abort();
    }
    if (语音合成流播放器 == null)
    {
        语音合成流播放器 = gameObject.AddComponent<AudioSource>();
    }
    语音合成流播放器.Stop();
    连接语音合成WebSocket(GetUrl("wss://tts-api.xfyun.cn/v2/tts"), text, voice, speed, volume);
    语音合成流播放器.loop = true;
    // Looping streaming clip: mono, 16 kHz, up to 60 s of playback; use a value larger than 60 if the synthesized speech is longer.
    语音合成流播放器.clip = AudioClip.Create("语音合成流", 16000 * 60, 1, 16000, true, OnAudioRead);
    语音合成流播放器.Play(); // start playing the stream
    while (true)
    {
        yield return null;
        // Stop when playback halts, or when the connection is closed and playback has passed the total stream length.
        if (!语音合成流播放器.isPlaying || 语音合成WebSocket.State == WebSocketState.Aborted && 语音合成流播放器.timeSamples >= 语音流总长度)
        {
            Debug.Log(text + "语音流播放完毕!");
            语音合成流播放器.Stop();
            break;
        }
    }
}

void OnAudioRead(float[] data)
{
    for (int i = 0; i < data.Length; i++)
    {
        if (播放队列.Count > 0)
        {
            data[i] = 播放队列.Dequeue();
        }
        else
        {
            // No data yet: output silence, and count it while the connection is still alive.
            if (语音合成WebSocket == null || 语音合成WebSocket.State != WebSocketState.Aborted) 语音流总长度++;
            data[i] = 0;
        }
    }
}
public async void 连接语音合成WebSocket(string urlStr, String text, string voice, int speed, int volume)
{
    //ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3;
    using (语音合成WebSocket = new ClientWebSocket())
    {
        CancellationToken ct = new CancellationToken();
        Uri url = new Uri(urlStr);
        await 语音合成WebSocket.ConnectAsync(url, ct);
        Debug.Log("连接成功");
        // The text to synthesize is sent base64-encoded.
        text = Convert.ToBase64String(Encoding.UTF8.GetBytes(text));
        JSONNode sendJson = new JSONNode
        {
            { "common", new JSONNode { { "app_id", APPID } } },
            { "business", new JSONNode { { "vcn", voice }, { "aue", "raw" }, { "speed", speed }, { "volume", volume }, { "tte", "UTF8" } } },
            { "data", new JSONNode { { "status", 2 }, { "text", text } } }
        };
        Debug.Log("发送消息:" + sendJson);
        await 语音合成WebSocket.SendAsync(new ArraySegment<byte>(Encoding.UTF8.GetBytes(sendJson)), WebSocketMessageType.Binary, true, ct); // send the request
        StringBuilder sb = new StringBuilder();
        播放队列.Clear();
        while (语音合成WebSocket.State == WebSocketState.Open)
        {
            var result = new byte[4096];
            await 语音合成WebSocket.ReceiveAsync(new ArraySegment<byte>(result), ct); // receive a frame
            // Trim the trailing zero bytes of the fixed-size buffer.
            List<byte> list = new List<byte>(result);
            while (list[list.Count - 1] == 0x00) list.RemoveAt(list.Count - 1);
            var str = Encoding.UTF8.GetString(list.ToArray());
            Debug.Log(str);
            sb.Append(str);
            // A response JSON may arrive split across several frames; "}}" marks the end of a complete message.
            if (str.EndsWith("}}"))
            {
                JSONNode json = JSONNode.Parse(sb.ToString());
                sb.Clear();
                Debug.Log("收到完整json数据:" + json);
                JSONNode data = json["data"];
                int status = data["status"];
                // Decode the base64 audio and queue it for playback.
                float[] fs = bytesToFloat(Convert.FromBase64String(data["audio"]));
                语音流总长度 += fs.Length;
                foreach (float f in fs) 播放队列.Enqueue(f);
                if (status == 2) // status == 2: this was the last audio segment
                {
                    语音合成WebSocket.Abort();
                    break;
                }
            }
        }
    }
}
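Finally, a small usage sketch for the synthesis side, under the same assumption as before that these members live on a component I am calling XunfeiVoice. Since 开始语音合成 is a coroutine that only finishes once the stream has been played back, it can simply be yielded from another coroutine:

using System.Collections;
using UnityEngine;

public class TtsDemo : MonoBehaviour
{
    public XunfeiVoice voice; // hypothetical component exposing 开始语音合成

    IEnumerator Start()
    {
        // 开始语音合成 returns only after playback of the synthesized stream has finished.
        yield return voice.开始语音合成("你好,欢迎使用语音合成", "xiaoyan", 50, 50);
        Debug.Log("合成播放结束");
    }
}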
Original post: https://blog.csdn.net/chunyu90225/article/details/106172895