赞
踩
最近有看国外教学视频的需求,但有些不是很适应;找了找现有的AI字幕工具,效果也不是很好,遂打算基于Whisper和GPT给自己做一个AI字幕工具。
Windows:
进入 https://github.com/BtbN/FFmpeg-Builds/releases ,点击 Windows 版本的 FFmpeg 对应的图标,进入下载界面后点击 download 按钮下载。
解压下载好的zip文件到指定目录(放到你喜欢的位置)
将解压后的文件目录中 bin 目录(包含 ffmpeg.exe )添加进 path 环境变量中
在命令行(cmd)中输入 ffmpeg -version,若输出 FFmpeg 的版本信息,说明安装完成:
运行以下程序,会自动下载 Whisper 的 small 模型,并识别音频 audio.mp3,输出识别到的文本。(如果没有科学上网的手段,请手动下载模型)
import whisper

# Path of the audio file to transcribe.
AUDIO_PATH = "audio.mp3"

# Load the Whisper checkpoint named "small".
model = whisper.load_model("small")

# Run speech recognition on the file; the result is indexed by "text"
# to obtain the recognized transcript, which is then printed.
result = model.transcribe(AUDIO_PATH)
print(result["text"])
运行结果如下
实时录制音频并转录
"""Capture audio from an input device and transcribe it continuously.

The script records fixed-length windows of 16-bit mono PCM, keeps the most
recent few windows in a sliding buffer, and feeds that buffer to a
faster-whisper model after every window. Press Ctrl+C to stop; the audio
captured during the session is then written to ``WAVE_OUTPUT_FILENAME``.
"""
import wave
from collections import deque

import numpy as np
import pyaudio
from pydub import AudioSegment
from faster_whisper import WhisperModel

from audioHandle import addAudio_volume, calculate_volume

model_size = "large-v3"
# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")


def GetIndex():
    """Return the index of the first matching input device, or -1.

    A device matches when its name contains the keyword '立体声混音'
    ("Stereo Mix") and it belongs to host API index 0.

    Returns:
        The PyAudio device index of the first match, or -1 if none is found.
    """
    # Keyword to search for in the device name.
    target = '立体声混音'
    p = pyaudio.PyAudio()
    try:
        for i in range(p.get_device_count()):
            devInfo = p.get_device_info_by_index(i)
            if target in devInfo['name'] and devInfo['hostApi'] == 0:
                print(devInfo)
                print(devInfo['index'])
                return devInfo['index']
        return -1
    finally:
        # Release the temporary PortAudio handle used only for enumeration.
        p.terminate()


def _pcm16_to_float32(pcm_bytes):
    """Convert raw 16-bit PCM bytes to float32 samples scaled by 1/32768."""
    return np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.float32) / 32768.0


def _mean_abs_level(pcm_bytes):
    """Return the mean absolute amplitude of a chunk of 16-bit PCM bytes."""
    # Widen to int32 first: abs(-32768) does not fit in int16 and would wrap.
    samples = np.frombuffer(pcm_bytes, dtype=np.int16).astype(np.int32)
    return np.mean(np.abs(samples))


# Configuration
FORMAT = pyaudio.paInt16  # sample format
CHANNELS = 1  # number of channels
RATE = 16000  # sampling rate (Hz)
CHUNK = 1024  # frames per read
RECORD_SECONDS = 5  # length of one recording window
WAVE_OUTPUT_FILENAME = "output3.wav"  # output file
DEVICE_INDEX = GetIndex()  # device index; depends on the system's audio devices
if DEVICE_INDEX == -1:
    print('请打开立体声混音')

maxcount = 3  # number of most recent windows kept for each transcription pass

audio = pyaudio.PyAudio()
# Read the sample width while the PyAudio handle is still alive.
sample_width = audio.get_sample_size(FORMAT)
stream = None
# Every captured chunk, kept so the session can be saved at the end.
# NOTE: this grows for as long as the script runs (about 32 KB per second).
frames = []

try:
    # Start recording. When no matching device was found, pass None so that
    # PyAudio explicitly uses the default input device.
    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK,
                        input_device_index=DEVICE_INDEX if DEVICE_INDEX >= 0 else None)
    # The first chunk is read and discarded, as in the original script.
    stream.read(CHUNK, exception_on_overflow=False)
    print("recording...")

    # Sliding buffer holding the chunks of the last `maxcount` windows.
    moreDatas = deque(maxlen=maxcount)
    while True:
        # Collect one window of RECORD_SECONDS seconds.
        datas = []
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            # Transcription blocks this loop, so the input buffer may overflow
            # in the meantime; drop the lost samples instead of raising.
            data = stream.read(CHUNK, exception_on_overflow=False)
            datas.append(data)
            frames.append(data)
            # Print the volume level (mean absolute amplitude) of the chunk.
            print("音量级别:", _mean_abs_level(data))

        moreDatas.append(datas)
        pcm = b''.join(chunk for window in moreDatas for chunk in window)

        print('开始识别')
        # Hand the model float32 samples scaled by 1/32768 rather than raw
        # int16 values.
        segments, _ = model.transcribe(_pcm16_to_float32(pcm), language="en")
        pieces = []
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
            pieces.append(segment.text)
        print(''.join(pieces))
except KeyboardInterrupt:
    # Ctrl+C is the intended way to leave the endless capture loop.
    print("finished recording")
finally:
    # Stop recording and release the audio resources.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    audio.terminate()

# Save the recording
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
# addAudio_volume(WAVE_OUTPUT_FILENAME)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。