赞
踩
语音识别的三个解决方案:
原本的需求:从视频中识别语音,然后把文字内容提取出来。结果看了很多项目,注释是中文的,识别的却是英文,感到授课的门槛有点低;我虽然能看懂别人开源的代码,但距离自己独立开发还是有差距的。后来又探索了很多方案,比如字幕生成:不把字幕生成到视频下方,而是一段一段追加到txt文本里。我认为这是最好的办法,而且能顺便给每个字、每个句子加上时间戳。但后来意识到一天的时间实在很难完成,于是从网上找了最普通、也不难理解的解决方案及相关代码:先把视频转成音频,再把音频转成文字。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022.1.5
# @File : duizhaozu1.py
"""Extract the audio track of a video with ffmpeg and transcribe it with Sphinx."""
import os
import subprocess

import speech_recognition as sr

# Absolute path of the ffmpeg executable used for the conversion.
FFMPEG_EXE = "D:/download/ffmpeg-master-latest-win64-lgpl/bin/ffmpeg.exe"


def file_to_wav(file_path, wav_path, sampling_rate):
    """Convert a media file to a mono WAV file with ffmpeg.

    Args:
        file_path: Path of the input video/audio file.
        wav_path: Path of the WAV file to create; an existing file is
            deleted first so ffmpeg does not stop to ask about overwriting.
        sampling_rate: Output sampling rate in Hz (passed to ffmpeg ``-ar``).

    Raises:
        FileNotFoundError: If the ffmpeg executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if os.path.exists(wav_path):
        os.remove(wav_path)

    # An argument list (instead of one formatted shell string) keeps paths
    # that contain spaces or shell metacharacters intact.
    command = [
        FFMPEG_EXE,
        "-i", file_path,
        "-ac", "1",
        "-ar", str(sampling_rate),
        wav_path,
    ]
    # check=True: fail here rather than later when the WAV file is missing.
    subprocess.run(command, check=True)


if __name__ == '__main__':
    file_path = r'C:\Users\PineappleMan\Desktop\ok\DFS.mp4'
    wav_path = r'C:\Users\PineappleMan\Desktop\ok\DFS.wav'
    sampling_rate = 16000
    file_to_wav(file_path, wav_path, sampling_rate)

    r = sr.Recognizer()
    with sr.AudioFile(wav_path) as source:
        audio = r.record(source)
    # NOTE(review): recognising zh-CN presumably needs the Chinese Sphinx
    # language model installed alongside pocketsphinx - confirm.
    print("文本内容:", r.recognize_sphinx(audio, language="zh-CN"))
其中,输入文件为视频;如果输入本身就是音频(wav文件),直接把格式转换的代码删掉即可。代码借鉴了https://download.csdn.net/download/weixin_38693753/13709062
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple SpeechRecognition
运行的时候遇到的错误是缺少pocketsphinx:https://download.csdn.net/download/yuxuwen1234/12195200,实际上去下载一个whl安装就行了。我的python是3.7,所以下载了链接中的版本,很快就能运行起来。网上有很多解决方案都在处理swig,因为报错信息指向的是swig的问题,但那是“头痛医头,脚痛医脚”,更机智的做法是直接下载whl来解决这个问题。
"""Transcribe stored audio files with the Baidu speech REST API.

Pipeline: convert every pending master recording to 8 kHz mono WAV, cut it
into 30-second sections, send each section to Baidu and write the
recognised text back through the project's ``rtysdb`` helper.
"""
import base64
import json
import os
import subprocess
import time
import uuid
import urllib.request

import requests

from inc import db_config
from inc import rtysdb

# Base path that every audio file name is appended to.
# NOTE(review): the value ends in ".wav" but is used as a directory prefix
# ("<base>/<name>") - confirm that this is the intended location.
SECTION_PATH = "C:/Users/PineappleMan/Desktop/ok/audio1.wav"

# Length of one section produced by ffmpeg_cut, in seconds.
SECTION_SECONDS = 30


class BaiduRest:
    """Small client for the Baidu speech REST API (token + recognition)."""

    def __init__(self, cu_id, api_key, api_secert):
        """Store the endpoints and fetch an access token.

        Args:
            cu_id: Client identifier sent as ``cuid`` with each request.
            api_key: Baidu API key.
            api_secert: Baidu API secret.
        """
        self.token_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s"
        self.getvoice_url = "http://tsn.baidu.com/text2audio?tex=%s&lan=zh&cuid=%s&ctp=1&tok=%s"
        self.upvoice_url = 'http://vop.baidu.com/server_api'
        self.cu_id = cu_id
        self.get_token(api_key, api_secert)

    def get_token(self, api_key, api_secert):
        """Request an access token and keep it in ``self.token_str``.

        Returns:
            True once the token has been stored.
        """
        token_url = self.token_url % (api_key, api_secert)
        # urlopen lives in urllib.request on Python 3 (not urllib.response).
        with urllib.request.urlopen(token_url) as response:
            token_data = json.loads(response.read())
        self.token_str = token_data['access_token']
        return True

    def audio2text(self, filename):
        """Send a WAV file to the recognition endpoint.

        Args:
            filename: Path of the WAV file to upload (declared to the API
                as 8000 Hz, one channel).

        Returns:
            The first recognition result when the response's ``err_msg`` is
            ``'success.'``, otherwise False.
        """
        with open(filename, 'rb') as wav_fp:
            voice_data = wav_fp.read()

        data = {
            'format': 'wav',
            'rate': 8000,
            'channel': 1,
            'cuid': self.cu_id,
            'token': self.token_str,
            'len': len(voice_data),
            # b64encode returns bytes; decode so the payload is JSON-serialisable.
            'speech': base64.b64encode(voice_data).decode('utf-8'),
        }
        result = requests.post(self.upvoice_url, json=data,
                               headers={'Content-Type': 'application/json'})
        data_result = result.json()
        if data_result['err_msg'] == 'success.':
            return data_result['result'][0]
        return False


def test_voice(voice_file):
    """Recognise one audio file and return what ``BaiduRest.audio2text`` returns."""
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from configuration instead.
    api_key = "vossGHIgEETS6IMRxBDeahv8"
    api_secert = "3c1fe6a6312f41fa21fa2c394dad5510"
    bdr = BaiduRest("0-57-7B-9F-1F-A1", api_key, api_secert)
    return bdr.audio2text(voice_file)


def get_master_audio(check_status='cut_status'):
    """Select master recordings that still need work.

    Args:
        check_status: ``'cut_status'`` selects rows with ``status=0``;
            ``'finished_status'`` selects rows with ``finished_status=0``.

    Returns:
        The rows returned by ``rtysdb.select_data``, or False when the
        argument is unknown or nothing was selected.
    """
    if check_status == 'cut_status':
        sql = "SELECT id,url, time_long,sharps FROM ocenter_recognition WHERE status=0"
    elif check_status == 'finished_status':
        sql = "SELECT id,url, time_long,sharps FROM ocenter_recognition WHERE finished_status=0"
    else:
        return False
    data = rtysdb.select_data(sql, 'more')
    return data if data else False


def go_recognize(master_id):
    """Recognise up to ten pending sections of one master recording.

    Each recognised text is stored on the section row and appended to the
    master row's content. After the batch the master row is flagged with
    ``finished_status=1``.

    Args:
        master_id: Integer id of the ``ocenter_recognition`` row.

    Returns:
        False when there is no pending section, otherwise None.
    """
    sql = "SELECT id,rid,url,status FROM ocenter_section WHERE rid=%d AND status=0 order by id asc limit 10" % (
        master_id)
    record = rtysdb.select_data(sql, 'more')
    if not record:
        return False

    for rec in record:
        voice_file = SECTION_PATH + '/' + rec[2]
        if not os.path.exists(voice_file):
            continue
        result = test_voice(voice_file)
        print(result)
        if result:
            # NOTE(review): the recognised text is interpolated straight into
            # SQL; a quote in it breaks the statement. Use parameter binding
            # if rtysdb offers it.
            sql = "update ocenter_section set content='%s', status='%d' where id=%d" % (result, 1, rec[0])
            rtysdb.do_exec_sql(sql)
            parent_content = rtysdb.select_data(
                "SELECT id,content FROM ocenter_recognition WHERE id=%d" % (rec[1]))
            if parent_content:
                new_content = parent_content[1] + result
                update_content_sql = "update ocenter_recognition set content='%s' where id=%d" % (
                    new_content, rec[1])
                rtysdb.do_exec_sql(update_content_sql)
        else:
            # Mark the section as handled even though nothing was recognised.
            # NOTE(review): status value 1 mirrors the success branch - confirm.
            rtysdb.do_exec_sql("update ocenter_section set status='%d' where id=%d" % (1, rec[0]))
        time.sleep(5)

    # The loop never breaks, so this always runs once the batch is done.
    rtysdb.do_exec_sql("UPDATE ocenter_recognition SET finished_status=1 WHERE id=%d" % (master_id))


def ffmpeg_convert():
    """Convert pending master recordings to 8 kHz mono WAV and cut them.

    For every row returned by ``get_master_audio('cut_status')`` the audio
    is converted with ffmpeg; when the converted file exists it is cut into
    sections and the master row is updated with ``status=1`` and the
    converted file name.
    """
    used_audio = get_master_audio('cut_status')
    if not used_audio:
        return

    for audio in used_audio:
        audio_path = SECTION_PATH + '/' + audio[1]
        convert_name = "Uploads/Convert/convert_" + str(uuid.uuid1()) + ".wav"
        convert_path = SECTION_PATH + "/" + convert_name
        # subprocess.run waits for ffmpeg, so the existence check below sees
        # the finished file (os.popen returned before the conversion ended).
        subprocess.run(["ffmpeg", "-i", audio_path, "-ar", "8000", "-ac", "1",
                        "-f", "wav", convert_path])
        if os.path.exists(convert_path):
            ffmpeg_cut(convert_name, audio[3], audio[0])
            sql = "UPDATE ocenter_recognition SET status=1,convert_name='%s' where id=%d" % (
                convert_name, audio[0])
            rtysdb.do_exec_sql(sql)


def ffmpeg_cut(convert_name, sharps, master_id):
    """Cut a converted recording into 30-second sections.

    Args:
        convert_name: File name of the converted WAV, relative to the base path.
        sharps: Number of sections to cut.
        master_id: Id stored as ``rid`` on every inserted section row.
    """
    cut_name = SECTION_PATH + '/' + convert_name
    for i in range(sharps):
        # Offset of this section as HH:MM:SS, computed arithmetically so it
        # does not depend on the machine's time zone.
        hours, remainder = divmod(i * SECTION_SECONDS, 3600)
        minutes, seconds = divmod(remainder, 60)
        start_time = "%02d:%02d:%02d" % (hours, minutes, seconds)

        db_store_name = "Uploads/Section/" + str(uuid.uuid1()) + '-' + str(i + 1) + ".wav"
        section_name = SECTION_PATH + "/" + db_store_name
        subprocess.run(["ffmpeg", "-i", cut_name, "-vn", "-acodec", "copy",
                        "-ss", start_time, "-t", "00:00:30", section_name])

        data = {
            'rid': master_id,
            'url': db_store_name,
            'create_time': int(time.time()),
            'status': 0,
        }
        rtysdb.insert_one('ocenter_section', data)


if __name__ == "__main__":
    ffmpeg_convert()
    audio = get_master_audio('finished_status')
    if audio:
        for ad in audio:
            go_recognize(ad[0])
该项目参考了https://download.csdn.net/download/weixin_38531210/12867107,但是做了很多改动!无论是官网还是这些项目,调用的包都比较陈旧,甚至有些包在python3中已经被别的名称取代了(例如python2的urllib2在python3中变成了urllib.request)。所以改动的工作量是相当大的。
TIMIT是比较经典的英文语音识别数据集,找到相关代码并不难。这里就不多讲了。过两天我把每行代码做好注释再发出来。
其实一天实现一个功能并不容易,尤其是对我这种考研考了很久、忘了很多东西的人来说。但是这种有目的的功能实现,还是很锻炼自己的。再接再厉吧。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。