Online speech recognition combined with semantic analysis is accurate and flexible, but it is slower than offline recognition.
Unlike the offline approach, once the ESP32 is woken up it starts recording; when recording ends, the audio is sent to the cloud for speech recognition, the returned text is put through lexical analysis to extract feature values, and the corresponding command is executed based on those features.
After wake-up the ESP32 records continuously and uses VAD (voice activity detection) to check whether the user is still speaking. Recording stops when speech ends or when the maximum recording length is reached, and the data is sent through an HTTP client to the Baidu Cloud speech recognition API.
The main code is shown below; the recorded audio is accumulated in the recoder buffer:
// Create the VAD (voice activity detection) instance
vad_handle_t vad_inst = vad_create(VAD_MODE_4, VAD_SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
int16_t *vad_buff = (int16_t *)malloc(VAD_BUFFER_LENGTH * sizeof(short));   // buffer fed to the VAD
if (vad_buff == NULL) {
    ESP_LOGE(TAG, "Memory allocation failed!");
}
int index = 0;
int timeout = 0;     // stop recording after a period of silence
int total_rec = 0;   // total amount of recorded audio
while (1) {
    // Read audio from the pipeline cache into buffer
    raw_stream_read(raw_read, (char *)buffer, audio_wn_chunksize * sizeof(short));
    if (enable_wn) {
        // Feed the audio into the wake-word model
        if (wakenet->detect(model_wn_data, (int16_t *)buffer) == 1) {
            ESP_LOGI(TAG, "wake up start listening");   // wake word matched
            LED_ON;
            enable_wn = false;
        }
    } else {
        // After wake-up, raw_stream_read keeps filling buffer
        if (recoder != NULL) {
            // Check whether the maximum length was reached or the user stopped speaking
            if (total_rec < (MAX_RECODER - 960) && timeout < RECODER_TIMEOUT) {
                // Keep recording: copy the audio from buffer into recoder
                memcpy(recoder + (index * audio_wn_chunksize * sizeof(short)), buffer,
                       audio_wn_chunksize * sizeof(short));
                index++;
                // Track the total amount of audio data
                total_rec += audio_wn_chunksize * sizeof(short);   // max = 131072
            } else {
                LED_OFF;
                // Stop recording and prepare to send the audio to the Baidu API
                ESP_LOGI(TAG, "stop listening");
                memset(http_buff, 0, MAX_HTTP_LEN);   // reset the HTTP buffer
                memset(url, 0, 200);
                // Configure the HTTP client
                esp_http_client_config_t config = {
                    .method = HTTP_METHOD_POST,          // POST request
                    .event_handler = http_event_handle,  // register the HTTP event callback
                    .user_data = (void *)http_buff,      // pass the response buffer to the callback
                };
                sprintf(url, BAIDU_ASR_URL, baidu_access_token);   // insert the token into the URL
                config.url = url;
                printf("start connect to url = %s\r\n", config.url);
                // Set up and perform the HTTP request
                esp_http_client_handle_t client = esp_http_client_init(&config);
                esp_http_client_set_header(client, "Content-Type", "audio/pcm;rate=16000");   // set the HTTP header
                esp_http_client_set_post_field(client, (const char *)recoder, total_rec);     // put the recording in the HTTP body
                ESP_LOGI(TAG, "start translate");
                esp_http_client_perform(client);     // perform the request
                esp_http_client_close(client);       // close and clean up; the callback handles the response
                esp_http_client_cleanup(client);
                free(recoder);                       // free the recording memory
                recoder = NULL;
                index = 0;
                total_rec = 0;
                timeout = 0;
                enable_wn = true;                    // go back to sleep and wait for the next wake-up
            }
        } else {
            recoder = malloc(MAX_RECODER);           // allocate memory for the recording
        }
        // Copy the audio from buffer into vad_buff
        memcpy(vad_buff, buffer, VAD_BUFFER_LENGTH * sizeof(short));
        // Feed vad_buff into the voice activity detection model
        vad_state_t vad_state = vad_process(vad_inst, vad_buff);
        // Check whether the user is speaking
        if (vad_state == VAD_SPEECH) {
            timeout = 0;     // still speaking
        } else {
            timeout++;       // silence: increase the timeout counter
        }
    }
}
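The http_event_handle callback registered above is not shown in the original. A minimal sketch, assuming http_buff is a buffer of MAX_HTTP_LEN bytes and the Baidu response fits in it, could look like this:

// Sketch only (assumption: not the original implementation). Appends each chunk of
// the HTTP response body into the buffer passed via user_data (here: http_buff).
#include <string.h>
#include "esp_http_client.h"

esp_err_t http_event_handle(esp_http_client_event_t *evt)
{
    static int received = 0;                 // bytes written so far for the current request
    char *buff = (char *)evt->user_data;

    switch (evt->event_id) {
    case HTTP_EVENT_ON_CONNECTED:
        received = 0;                        // new request: start at the beginning of the buffer
        break;
    case HTTP_EVENT_ON_DATA:
        // Copy the response chunk, leaving room for a terminating '\0'
        if (buff != NULL && received + evt->data_len < MAX_HTTP_LEN) {
            memcpy(buff + received, evt->data, evt->data_len);
            received += evt->data_len;
        }
        break;
    case HTTP_EVENT_ON_FINISH:
        received = 0;                        // response complete; http_buff now holds the JSON text
        break;
    default:
        break;
    }
    return ESP_OK;
}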
After the recognized text comes back, it still has to go through lexical analysis to extract the command it contains. This project uses Baidu's customizable lexical analysis service. The idea is to define vocabulary sets first, e.g. {打开, 开启, 启动} ("open", "turn on", "start") is named OPEN, and {空调, 格力空调} ("air conditioner", "Gree air conditioner") is named AC.
For example, if the user says "打开空调" ("turn on the air conditioner"), lexical analysis returns the following result:
{ "log_id": 4870567568319578302, "items": [ { "loc_details": [ ], "byte_offset": 0, "uri": "", "ne": "OPEN", "basic_words": [ "打开" ], "item": "打开", "pos": "", "byte_length": 6, "formal": "" }, { "loc_details": [ ], "byte_offset": 6, "uri": "", "ne": "AC", "basic_words": [ "空调" ], "item": "空调", "pos": "", "byte_length": 6, "formal": "" } ], "text": "打开空调" }
Reading the value of the "ne" key tells us the user's intent. The following code sends the request for lexical analysis:
/*
 * Run lexical analysis on the speech recognition result.
 * Returns 1 on success, 0 on failure.
 */
int Etymology_Analysis()
{
    // The speech recognition result is stored in http_buff
    cJSON *root = cJSON_Parse(http_buff);
    if (root == NULL) {
        ESP_LOGI(TAG, "cjson parse error");
        return 0;
    }
    cJSON *item = cJSON_GetObjectItem(root, "err_no");
    if (item->valueint != 0) {
        ESP_LOGI(TAG, "translate error,err_no:%d", item->valueint);
        cJSON_Delete(root);
        return 0;
    }
    item = cJSON_GetObjectItem(root, "result");
    item = cJSON_GetArrayItem(item, 0);
    char *result = cJSON_GetStringValue(item);   // recognized text

    // Wrap the recognized text in a JSON string and store it in post_data
    char *post_data = malloc(POST_DATA_LEN);
    snprintf(post_data, POST_DATA_LEN, "{\"text\":\"%s\"}", result);
    ESP_LOGI(TAG, "POST DATA:%s", post_data);

    // Clear the HTTP buffer, ready to receive the response
    memset(http_buff, 0, MAX_HTTP_LEN);
    memset(url, 0, 200);

    // Initialize the HTTP client to call the lexical analysis API
    esp_http_client_config_t config = {
        .method = HTTP_METHOD_POST,          // POST request
        .event_handler = http_event_handle,  // register the event callback
        .user_data = (void *)http_buff,      // pass the response buffer to the callback
    };
    sprintf(url, BAIDU_ETY_URL, baidu_access_token);   // insert the token into the URL
    config.url = url;
    esp_http_client_handle_t client = esp_http_client_init(&config);
    esp_http_client_set_header(client, "Content-Type", "application/json");              // set the HTTP header
    esp_http_client_set_post_field(client, (const char *)post_data, strlen(post_data));  // put the JSON string in the body
    printf("start connect to url = %s\r\n", config.url);
    esp_http_client_perform(client);   // perform the request
    int con_len = esp_http_client_get_content_length(client);
    ESP_LOGI(TAG, "Status = %d, content_length = %d",
             esp_http_client_get_status_code(client), con_len);

    // Shut down the HTTP client
    esp_http_client_close(client);
    esp_http_client_cleanup(client);
    // Delete the cJSON tree and free post_data for the next request
    cJSON_Delete(root);
    free(post_data);
    return 1;
}
Once the JSON above has been received, the next step is to extract the contents of "ne"; the data structures below help with the parsing.
// An element produced by lexical analysis, e.g. for "打开": lexical = Open, text = "OPEN"
typedef struct {
    enum Lexical lexical;   // lexical category / feature
    char text[10];          // text content
} Ety_Element;

static Ety_Element ety_eles[10] = {0};   // a single command usually has fewer than 10 elements

// A command consists of an object, an operation, a quantity and a time (time not implemented yet)
typedef struct {
    int number;
    enum Object object;      // only for "Aircon Bt Weather"
    enum AC_Option option;   // only for "open close up down"
} Audio_Order;
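The enum Lexical, enum Object and enum AC_Option types used above are not listed in the original. A minimal sketch, inferred from the values that appear in the parsing code below, might be:

// Sketch of the enums referenced above (assumption: reconstructed from the values
// used in parse_items() and build_order(); the original definitions are not shown).
enum Lexical {
    Open, Close, Up, Down,                  // operations
    Aircon, Bt, Weather,                    // objects
    Num, Mount,                             // quantities
    Today, Tomorrow, Aftermotorrow, TIME,   // time words
    Nouns, Verbs, Word, Pronouns,           // generic parts of speech
    Other
};

enum Object {
    obj_Ac,
    obj_Bt,
    obj_Weather,
    obj_other
};

enum AC_Option {
    AC_OPTION_OPEN,
    AC_OPTION_CLOSE,
    AC_OPTION_UP,
    AC_OPTION_DOWN,
    AC_OPTION_MAX
};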
The following code parses each word and fills the ety_eles array, one entry per word:
/*
 * Parse the lexical category of each word.
 * Returns the number of words.
 */
int parse_items()
{
    cJSON *root = cJSON_Parse(http_buff);   // parse the lexical analysis JSON
    cJSON *items = cJSON_GetObjectItem(root, "items");
    if (items == NULL) {
        return 0;
    }
    int arry_size = cJSON_GetArraySize(items);
    // Each ety_eles entry holds one word; clear the array before filling it
    memset(ety_eles, 0, 10 * sizeof(Ety_Element));
    cJSON *item, *sub_item;
    char *character, *text;   // lexical category and text content
    for (int i = 0; i < arry_size; i++) {
        item = cJSON_GetArrayItem(items, i);
        // "ne" and "pos" both describe the category, but only one of them is set
        sub_item = cJSON_GetObjectItem(item, "pos");
        character = cJSON_GetStringValue(sub_item);
        // An empty "pos" means "ne" is the valid one
        if (strncmp(character, "", 1) == 0) {
            //ESP_LOGI(TAG, "pos is null");
            sub_item = cJSON_GetObjectItem(item, "ne");   // "ne" needs special handling
            character = cJSON_GetStringValue(sub_item);
        }
        //sub_item = cJSON_GetObjectItem(item, "item");
        printf("char = %s \r\n", character);
        // Map the category string to a lexical value
        if (strncmp(character, "NUM", 3) == 0) {
            /*
            ety_eles[i].lexical = Num;
            sub_item = cJSON_GetObjectItem(item, "item");
            text = cJSON_GetStringValue(sub_item);
            strncpy(ety_eles[i].text, text, strlen(text));   // store the two-digit number
            */
        } else if (strncmp(character, "AC", 2) == 0) {
            ety_eles[i].lexical = Aircon;
        } else if (strncmp(character, "BT", 2) == 0) {
            ety_eles[i].lexical = Bt;
        } else if (strncmp(character, "WEA", 3) == 0) {
            ety_eles[i].lexical = Weather;
        } else if (strncmp(character, "DOWN", 4) == 0) {
            ety_eles[i].lexical = Down;
        } else if (strncmp(character, "UP", 2) == 0) {
            ety_eles[i].lexical = Up;
        } else if (strncmp(character, "CLOSE", 5) == 0) {
            ety_eles[i].lexical = Close;
        } else if (strncmp(character, "OPEN", 4) == 0) {
            ety_eles[i].lexical = Open;
        } else if (strncmp(character, "TOMO", 4) == 0) {
            ety_eles[i].lexical = Tomorrow;
        } else if (strncmp(character, "AFTTO", 5) == 0) {
            ety_eles[i].lexical = Aftermotorrow;
        } else if (strncmp(character, "TODAY", 4) == 0) {
            ety_eles[i].lexical = Today;
        } else if (strncmp(character, "TIME", 4) == 0) {
            // TODO: parsing Chinese time expressions is not implemented yet
            ety_eles[i].lexical = TIME;
        } else if (strncmp(character, "n", 1) == 0) {
            ety_eles[i].lexical = Nouns;
        } else if (strncmp(character, "w", 1) == 0) {
            ety_eles[i].lexical = Word;
        } else if (strncmp(character, "v", 1) == 0) {
            ety_eles[i].lexical = Verbs;
        } else if (strncmp(character, "m", 1) == 0) {
            // e.g. "26度" (26 degrees) or "100块" (100 yuan): take the first entry of basic_words
            sub_item = cJSON_GetObjectItem(item, "basic_words");
            sub_item = cJSON_GetArrayItem(sub_item, 0);
            text = cJSON_GetStringValue(sub_item);   // numeric string
            ety_eles[i].lexical = Mount;
            strncpy(ety_eles[i].text, text, strlen(text));   // store the quantity
        } else if (strncmp(character, "r", 1) == 0) {
            ety_eles[i].lexical = Pronouns;
        } else {
            ety_eles[i].lexical = Other;
        }
        //printf("ele char =%u,text=%s \r\n", ety_eles[i].lexical, ety_eles[i].text);
    }
    cJSON_Delete(root);
    return arry_size;
}
Next, the ety_eles array is assembled into a command of type Audio_Order:
/*
 * Assemble a voice command.
 * i: number of words in the command.
 */
Audio_Order build_order(int i)
{
    // Initialize an empty voice command
    Audio_Order ord = {
        .number = 0,
        .object = obj_other,
        .option = AC_OPTION_MAX
    };
    // Walk through the words and extract the information relevant to the command
    for (int x = 0; x < i; x++) {
        switch (ety_eles[x].lexical) {
        case Aircon:
            ord.object = obj_Ac;
            break;
        case Bt:
            ord.object = obj_Bt;
            break;
        case Weather:
            ord.object = obj_Weather;
            break;
        case Open:
            ord.option = AC_OPTION_OPEN;
            break;
        case Close:
            ord.option = AC_OPTION_CLOSE;
            break;
        case Up:
            ord.option = AC_OPTION_UP;
            break;
        case Down:
            ord.option = AC_OPTION_DOWN;
            break;
        case Num:
            ord.number = atoi(ety_eles[x].text);   // convert the numeric string to an integer
            //printf("num=%d\r\n", ord.number);
            break;
        case Mount:
            ord.number = atoi(ety_eles[x].text);
            //printf("num=%d\r\n", ord.number);
            break;
        case TIME:
            break;
        case Today:
            ord.number = 0;
            break;
        case Tomorrow:
            ord.number = 1;
            break;
        case Aftermotorrow:
            ord.number = 2;
            break;
        // Other lexical categories are ignored
        default:
            break;
        }
    }
    return ord;
}
With an Audio_Order command in hand, we can react according to its contents.
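The dispatch step itself is not shown in the original. A minimal sketch of how the three functions above could be chained and the resulting command acted on, with ac_power_on()/ac_power_off() standing in as hypothetical device-control helpers, might look like this:

// Sketch of the overall flow after the ASR response arrives (assumption: the
// ac_power_on()/ac_power_off() helpers are hypothetical placeholders for the
// device-control code, which is not shown in the original).
void handle_asr_result(void)
{
    if (Etymology_Analysis() == 0) {       // request lexical analysis for the recognized text
        return;                            // recognition or parsing failed
    }
    int words = parse_items();             // fill ety_eles from the lexical analysis JSON
    if (words == 0) {
        return;
    }
    Audio_Order ord = build_order(words);  // assemble the command

    switch (ord.object) {
    case obj_Ac:
        if (ord.option == AC_OPTION_OPEN) {
            ac_power_on();                 // hypothetical helper
        } else if (ord.option == AC_OPTION_CLOSE) {
            ac_power_off();                // hypothetical helper
        }
        break;
    case obj_Weather:
        // e.g. fetch the forecast for day ord.number (0 = today, 1 = tomorrow, 2 = the day after)
        break;
    default:
        break;
    }
}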