赞
踩
质量声明:原创文章,内容质量问题请评论吐槽。如对您产生干扰,可私信删除。
主要参考:阿里云官方接口文档
摘要: Python调用阿里云的智能语音交互接口,依靠对象存储服务(OSS)上传音频,实现录音文件识别,输出为txt文本。支持单轨/双轨的wav、mp3格式,最大支持文件512MB,最大录音时长2个小时。
由于阿里、百度、腾讯、讯飞等语音处理平台都对语音参数有特定要求,所以我们需要预先处理音频。好在各平台的要求基本一致:采样率必须是 16 kHz 或 8 kHz,采样位数为 16 bit,单声道或双声道,格式为 wav 或 mp3。这些转换通过调用 pydub 包即可实现(pydub 依赖 ffmpeg,需先安装并确认 ffmpeg 可用)。
pip3 install pydub
ffmpeg -version
from pydub import AudioSegment


def wavSample(from_path, to_path, frame_rate=16000, channels=1, startMin=0, endMin=None):
    """Trim a WAV recording and re-encode it as 16-bit PCM WAV.

    The input is loaded with ``AudioSegment.from_wav``, so ``from_path`` must
    be a WAV file. The selected range is resampled to ``frame_rate``, remixed
    to ``channels`` channels, forced to 16-bit samples and written to
    ``to_path``.

    Args:
        from_path: Path of the source WAV file.
        to_path: Path of the WAV file to write.
        frame_rate: Target sampling rate in Hz (default 16000).
        channels: Target number of channels (default 1).
        startMin: Start of the range to keep, in minutes from the beginning.
        endMin: End of the range to keep, in minutes. Any falsy value
            (``None`` or ``0``) keeps everything up to the end of the audio.
    """
    audio = AudioSegment.from_wav(from_path)

    # pydub slices are expressed in milliseconds.
    startTime = startMin * 60 * 1000
    # One extra millisecond is added to the end bound, as in the original code.
    endTime = endMin * 60 * 1000 + 1 if endMin else None
    audio = audio[startTime:endTime]

    # Force 16-bit samples explicitly instead of relying on the export codec
    # alone. NOTE(review): whether `codec` is honoured for WAV export appears
    # to depend on the installed pydub version -- confirm.
    converted = (
        audio.set_frame_rate(frame_rate)
        .set_channels(channels)
        .set_sample_width(2)
    )
    converted.export(to_path, format='wav', codec='pcm_s16le')


if __name__ == '__main__':
    wavSample("sample.WAV", "sample_new.WAV")
由于各大智能语音识别服务的平台,都仅支持基于HTTP可访问的URL地址,不支持提交本地文件,所以需要上传至网络。我用的是阿里云的OSS存储,使用方法参考官方文档:阿里云对象存储 OSS 快速入门。主要浏览如何开通OSS,如何上传录音文件并开放读权限即可。上传完成,需要获得访问链接,格式如:https://xxxx.oss-cn-beijing.aliyuncs.com/xxxx.WAV
开通服务: 参考官方文档,浏览如何开通智能语音交互即可。开通后需要获得 AccessKey ID、AccessKey Secret 以及项目的 appKey,供下面的调用代码使用。
调用代码: 改动自官方Demo,新增识别结果解析,组合后存储为txt文件
# -*- coding: utf8 -*-
import json
import time

from aliyunsdkcore.acs_exception.exceptions import ClientException
from aliyunsdkcore.acs_exception.exceptions import ServerException
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest

# Region ID and service constants; fixed values, do not change.
_REGION_ID = "cn-shanghai"
_PRODUCT = "nls-filetrans"
_DOMAIN = "filetrans.cn-shanghai.aliyuncs.com"
_API_VERSION = "2018-08-17"
_POST_REQUEST_ACTION = "SubmitTask"
_GET_REQUEST_ACTION = "GetTaskResult"

# Request parameter keys.
_KEY_APP_KEY = "appkey"
_KEY_FILE_LINK = "file_link"
_KEY_VERSION = "version"
_KEY_ENABLE_WORDS = "enable_words"
# Key that switches on automatic track splitting; not sent by default.
_KEY_AUTO_SPLIT = "auto_split"

# Response parameter keys.
_KEY_TASK = "Task"
_KEY_TASK_ID = "TaskId"
_KEY_STATUS_TEXT = "StatusText"
_KEY_RESULT = "Result"

# Status values.
_STATUS_SUCCESS = "SUCCESS"
_STATUS_RUNNING = "RUNNING"
_STATUS_QUEUEING = "QUEUEING"

# Seconds to wait between two result queries.
_POLL_INTERVAL_SECONDS = 30
# Give up polling after this many consecutive request errors.
_MAX_POLL_ERRORS = 5
# Number of recognized sentences joined into one paragraph of the output.
_SENTENCES_PER_PARAGRAPH = 30


def _submit_task(client, appKey, fileLink):
    """Submit the recognition request; return the task ID, or None on failure."""
    postRequest = CommonRequest()
    postRequest.set_domain(_DOMAIN)
    postRequest.set_version(_API_VERSION)
    postRequest.set_product(_PRODUCT)
    postRequest.set_action_name(_POST_REQUEST_ACTION)
    postRequest.set_method('POST')

    # New integrations should use version 4.0; integrations already on the
    # default 2.0 that want to keep their behavior should drop this parameter.
    # enable_words controls word-level output (default false) and requires
    # version 4.0 when enabled.
    # To enable automatic track splitting, add `_KEY_AUTO_SPLIT: True` here.
    task = {
        _KEY_APP_KEY: appKey,
        _KEY_FILE_LINK: fileLink,
        _KEY_VERSION: "4.0",
        _KEY_ENABLE_WORDS: False,
    }
    postRequest.add_body_params(_KEY_TASK, json.dumps(task))

    try:
        postResponse = json.loads(client.do_action_with_exception(postRequest))
    except (ServerException, ClientException) as e:
        # Without a task ID there is nothing to poll for, so stop here.
        print(e)
        return None

    if postResponse[_KEY_STATUS_TEXT] != _STATUS_SUCCESS:
        print("录音文件识别请求失败!")
        return None

    print("录音文件识别请求成功响应!")
    return postResponse[_KEY_TASK_ID]


def _poll_result(client, taskId):
    """Poll until the task leaves RUNNING/QUEUEING; return the final response.

    Returns None when the query request fails `_MAX_POLL_ERRORS` times in a row.
    """
    getRequest = CommonRequest()
    getRequest.set_domain(_DOMAIN)
    getRequest.set_version(_API_VERSION)
    getRequest.set_product(_PRODUCT)
    getRequest.set_action_name(_GET_REQUEST_ACTION)
    getRequest.set_method('GET')
    getRequest.add_query_param(_KEY_TASK_ID, taskId)

    consecutiveErrors = 0
    while True:
        try:
            getResponse = json.loads(client.do_action_with_exception(getRequest))
        except (ServerException, ClientException) as e:
            print(e)
            consecutiveErrors += 1
            if consecutiveErrors >= _MAX_POLL_ERRORS:
                return None
            # Wait before retrying so a persistent error does not spin.
            time.sleep(_POLL_INTERVAL_SECONDS)
            continue

        consecutiveErrors = 0
        if getResponse[_KEY_STATUS_TEXT] in (_STATUS_RUNNING, _STATUS_QUEUEING):
            time.sleep(_POLL_INTERVAL_SECONDS)
            continue
        return getResponse


def _save_sentences(sentences, outputPath):
    """Append the recognized text to `outputPath`, one paragraph per group."""
    texts = [sentence["Text"] for sentence in sentences]
    with open(outputPath, "a", encoding="utf-8") as f:
        # Stepping over the whole list also writes the final, shorter group.
        for start in range(0, len(texts), _SENTENCES_PER_PARAGRAPH):
            paragraph = "".join(texts[start:start + _SENTENCES_PER_PARAGRAPH])
            f.write(paragraph + "\r\n\r\n")


def fileTrans(akId, akSecret, appKey, fileLink, outputPath="recognition.txt"):
    """Recognize a recording reachable by URL and append its text to a file.

    Submits a recognition task for `fileLink`, queries the result every 30
    seconds while the task is running or queued, and on success appends the
    recognized sentences to `outputPath` in paragraphs of 30 sentences.
    Progress and failures are reported with `print`; nothing is returned.

    Args:
        akId: Access key ID passed to `AcsClient`.
        akSecret: Access key secret passed to `AcsClient`.
        appKey: App key sent with the task.
        fileLink: URL of the recording, sent as the task's `file_link`.
        outputPath: Text file the transcript is appended to, encoded as UTF-8.
    """
    client = AcsClient(akId, akSecret, _REGION_ID)

    taskId = _submit_task(client, appKey, fileLink)
    if taskId is None:
        return

    getResponse = _poll_result(client, taskId)
    if getResponse is None or getResponse[_KEY_STATUS_TEXT] != _STATUS_SUCCESS:
        print("录音文件识别失败!")
        return

    _save_sentences(getResponse[_KEY_RESULT]["Sentences"], outputPath)
    print("录音文件识别成功!\n")


def main():
    """Fill in the credentials and recording URL, then run the recognition."""
    # Alibaba Cloud API credentials.
    accessKeyId = "填入开通服务的accessKey Id"
    accessKeySecret = "填入开通服务的accessKey Secret "
    appKey = "填入开通服务的appKey"
    # URL of the recording.
    fileLink = "填入上传至OSS的录音的url"
    # On success the transcript is written to recognition.txt.
    fileTrans(accessKeyId, accessKeySecret, appKey, fileLink)


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。