pyAudioKits is a powerful Python audio-workflow toolkit built on librosa and other libraries.
Install via pip:
pip install pyAudioKits
The project is hosted on GitHub. If it has helped you, please give it a star; thank you for your support! If you run into any problems while using it, leave a comment or open an issue on GitHub, and I will keep maintaining the project.
This section walks through a small project that uses pyAudioKits to read audio and extract features, then applies deep learning to perform speech recognition. We will extract mel spectrograms as features and build a simple convolutional neural network classifier with Keras + TensorFlow to recognize the ten spoken digits 0-9.
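Before diving in, here is a minimal sketch of the core feature-extraction step that the rest of the post builds on. It uses the same pyAudioKits calls demonstrated below; the file path is a placeholder.

import pyAudioKits.audio as ak
import pyAudioKits.analyse as aly

# Placeholder path: any wav file from the dataset works
wf = ak.read_Audio("speech-commands-classification-dataset/zero/example.wav")
wf = wf.padding(22050)                              # pad/trim to a fixed length
features = aly.melSpec(wf, spec_h=128, spec_w=128)  # 128x128 mel spectrogram
print(features.shape)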
The dataset comes from https://www.kaggle.com/datasets/bharatsahu/speech-commands-classification-dataset. Download it first and save it to a directory named "speech-commands-classification-dataset".
The algorithm is adapted from https://www.kaggle.com/code/ritvik1909/speech-classification-spectrogram-cnn
import matplotlib.pyplot as plt
import os
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pyAudioKits.audio as ak
import pyAudioKits.analyse as aly
First, set up the directories that will hold the preprocessed data.
root="speech-commands-classification-dataset"
train_folders="train_set"
val_folders="val_set"
test_folders="test_set"
if not os.path.exists(train_folders):
    os.mkdir(train_folders)
if not os.path.exists(val_folders):
    os.mkdir(val_folders)
if not os.path.exists(test_folders):
    os.mkdir(test_folders)
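Equivalently, the standard library can create each directory idempotently in one call; this is just an optional simplification, not what the code above uses.

# Optional equivalent: os.makedirs creates the directory and skips it if it already exists
for folder in (train_folders, val_folders, test_folders):
    os.makedirs(folder, exist_ok=True)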
Next we extract the mel spectrogram features. First, pick one sample for each of the digits 0-9, extract its features, and visualize them.
input_size = (128, 128, 1)
labels = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
fig, axes = plt.subplots(2, 5, sharex=True, sharey=True)
fig.set_size_inches(12, 6)
# Iterate over the digit folders and take the first sample of each spoken digit
for i, d in enumerate(labels):
    path = root + "/" + d
    for w in os.listdir(path):
        wf = ak.read_Audio(path + "/" + w)  # read the spoken digit
        wf = wf.padding(22050)  # pad the sample to 22050 points, half the sample rate, so the frequency and time axes can use the same resolution
        features = aly.melSpec(wf, spec_h=input_size[0], spec_w=input_size[1])  # extract a mel spectrogram with width and height both 128; the height sets the frequency resolution and the width the time resolution
        axes[i // 5, i % 5].set_title(d)
        axes[i // 5, i % 5].imshow(features)  # plot the mel spectrogram features
        break
plt.tight_layout()
Now extract the features for real: iterate over the dataset, extract the features, and save them as images.
val_list = [str(i)[2:-3] for i in open(root + "/validation_list.txt", "rb").readlines()]
test_list = [str(i)[2:-3] for i in open(root + "/testing_list.txt", "rb").readlines()]
maxcount = [1839, 239, 230]
for d in labels:
    count = [0, 0, 0]
    path = root + "/" + d
    dst = [None, None, None]
    dst[0] = train_folders + "/" + d
    dst[1] = val_folders + "/" + d
    dst[2] = test_folders + "/" + d
    for w in os.listdir(path):
        if d + "/" + w in val_list:  # listed in validation_list.txt: goes to the validation set
            pos = 1
        elif d + "/" + w in test_list:  # listed in testing_list.txt: goes to the test set
            pos = 2
        else:  # otherwise: goes to the training set
            pos = 0
        count[pos] += 1
        if count[pos] > maxcount[pos]:
            continue
        wf = ak.read_Audio(path + "/" + w)
        wf = wf.padding(22050)
        features = aly.melSpec(wf, spec_h=input_size[0], spec_w=input_size[1])
        if not os.path.exists(dst[pos]):
            os.mkdir(dst[pos])
        plt.imsave(dst[pos] + "/" + w[:-4] + ".png", features, cmap='gray')  # save the mel spectrogram features as an image
    print(d)
'''
outputs:
zero
one
two
three
four
five
six
seven
eight
nine
'''
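The str(i)[2:-3] slicing above strips the b' prefix and the trailing \n' from each line's bytes repr. A clearer equivalent, assuming one "digit/file.wav" path per line, would be:

# More readable way to load the split lists (same result on clean newline-terminated files)
with open(root + "/validation_list.txt") as f:
    val_list = [line.strip() for line in f]
with open(root + "/testing_list.txt") as f:
    test_list = [line.strip() for line in f]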
Load the image datasets with the Keras API; this essentially turns the speech-classification problem into an image-classification problem.
training_set = ImageDataGenerator().flow_from_directory(
    train_folders,
    target_size=input_size[:-1],
    batch_size=32,
    class_mode='categorical',
    color_mode='grayscale'
)
validation_set = ImageDataGenerator().flow_from_directory(
    val_folders,
    target_size=input_size[:-1],
    batch_size=32,
    class_mode='categorical',
    color_mode='grayscale'
)
test_set = ImageDataGenerator().flow_from_directory(
    test_folders,
    target_size=input_size[:-1],
    batch_size=32,
    class_mode='categorical',
    color_mode='grayscale'
)
'''
outputs:
Found 18390 images belonging to 10 classes.
Found 2369 images belonging to 10 classes.
Found 2300 images belonging to 10 classes.
'''
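Note that a bare ImageDataGenerator() feeds raw 0-255 pixel values to the network. A common variation, not used in this post, is to rescale inputs to [0, 1]; a sketch for the training generator:

# Optional: normalize pixel values to [0, 1] before they reach the network
datagen = ImageDataGenerator(rescale=1./255)
training_set = datagen.flow_from_directory(
    train_folders,
    target_size=input_size[:-1],
    batch_size=32,
    class_mode='categorical',
    color_mode='grayscale'
)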
Build the convolutional neural network, the optimizer, and the loss function.
# Build the convolutional neural network
model = models.Sequential([
    layers.Conv2D(32, 3, activation='relu', input_shape=(128, 128, 1), padding='same'),
    layers.Conv2D(32, 3, activation='relu', padding='same'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(0.25),
    layers.Conv2D(64, 3, activation='relu', padding='same'),
    layers.Conv2D(64, 3, activation='relu', padding='same'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(0.25),
    layers.GlobalAveragePooling2D(),
    layers.Dense(len(labels), activation='softmax'),
])
model.summary()

# Adam optimizer; categorical cross-entropy loss; accuracy as the reported metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
'''
outputs:
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
conv2d (Conv2D)              (None, 128, 128, 32)      320
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 128, 32)      9248
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 64, 32)        0
_________________________________________________________________
dropout (Dropout)            (None, 64, 64, 32)        0
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 64, 64)        18496
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 64, 64, 64)        36928
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 32, 64)        0
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 64)        0
_________________________________________________________________
global_average_pooling2d (Gl (None, 64)                0
_________________________________________________________________
dense (Dense)                (None, 10)                650
=================================================================
Total params: 65,642
Trainable params: 65,642
Non-trainable params: 0
_________________________________________________________________
'''
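A note on the head of the network: GlobalAveragePooling2D collapses each of the 64 final 32x32 feature maps to a single number, which is why the classifier only needs 64 * 10 + 10 = 650 parameters. Using a Flatten layer in its place would feed 32 * 32 * 64 = 65536 values into the Dense layer and cost 655,370 parameters, roughly ten times the size of the entire model above.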
Start training the model. Early stopping and a learning-rate decay schedule are used during training.
# Early stopping and learning-rate decay
es = callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
rlp = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-10, mode='min', verbose=1)

# Train the model
# Note: batch_size here is effectively redundant; the generators already batch at 32
# (hence the 575 steps per epoch for 18390 training images)
history = model.fit(
    training_set,
    validation_data=validation_set,
    epochs=500,
    callbacks=[es, rlp],
    batch_size=8
)
'''
outputs:
Epoch 1/500
575/575 [==============================] - 921s 2s/step - loss: 2.1807 - accuracy: 0.1993 - val_loss: 1.9604 - val_accuracy: 0.2921
Epoch 2/500
575/575 [==============================] - 820s 1s/step - loss: 1.8535 - accuracy: 0.3332 - val_loss: 1.7405 - val_accuracy: 0.3854
Epoch 3/500
575/575 [==============================] - 832s 1s/step - loss: 1.6132 - accuracy: 0.4388 - val_loss: 1.4009 - val_accuracy: 0.5395
Epoch 4/500
575/575 [==============================] - 824s 1s/step - loss: 1.2987 - accuracy: 0.5673 - val_loss: 1.1822 - val_accuracy: 0.5741
Epoch 5/500
575/575 [==============================] - 824s 1s/step - loss: 1.0400 - accuracy: 0.6605 - val_loss: 0.8329 - val_accuracy: 0.7341
Epoch 6/500
575/575 [==============================] - 841s 1s/step - loss: 0.8287 - accuracy: 0.7354 - val_loss: 0.7591 - val_accuracy: 0.7455
Epoch 7/500
575/575 [==============================] - 836s 1s/step - loss: 0.7186 - accuracy: 0.7748 - val_loss: 0.6612 - val_accuracy: 0.7940
Epoch 8/500
575/575 [==============================] - 829s 1s/step - loss: 0.6044 - accuracy: 0.8136 - val_loss: 0.7732 - val_accuracy: 0.7497
Epoch 9/500
575/575 [==============================] - 829s 1s/step - loss: 0.5407 - accuracy: 0.8302 - val_loss: 0.5135 - val_accuracy: 0.8328
Epoch 10/500
575/575 [==============================] - 845s 1s/step - loss: 0.4887 - accuracy: 0.8482 - val_loss: 0.3927 - val_accuracy: 0.8776
Epoch 11/500
575/575 [==============================] - 932s 2s/step - loss: 0.4534 - accuracy: 0.8581 - val_loss: 0.4472 - val_accuracy: 0.8531
Epoch 12/500
575/575 [==============================] - 831s 1s/step - loss: 0.4230 - accuracy: 0.8694 - val_loss: 0.3817 - val_accuracy: 0.8725
Epoch 13/500
575/575 [==============================] - 831s 1s/step - loss: 0.3928 - accuracy: 0.8812 - val_loss: 0.3996 - val_accuracy: 0.8713
Epoch 14/500
575/575 [==============================] - 839s 1s/step - loss: 0.3644 - accuracy: 0.8908 - val_loss: 0.3484 - val_accuracy: 0.8839
Epoch 15/500
575/575 [==============================] - 858s 1s/step - loss: 0.3474 - accuracy: 0.8936 - val_loss: 0.3596 - val_accuracy: 0.8869
Epoch 16/500
575/575 [==============================] - ETA: 0s - loss: 0.3291 - accuracy: 0.8980
Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
575/575 [==============================] - 845s 1s/step - loss: 0.3291 - accuracy: 0.8980 - val_loss: 0.3541 - val_accuracy: 0.8852
Epoch 17/500
575/575 [==============================] - 846s 1s/step - loss: 0.2301 - accuracy: 0.9314 - val_loss: 0.2641 - val_accuracy: 0.9152
Epoch 18/500
575/575 [==============================] - 844s 1s/step - loss: 0.2195 - accuracy: 0.9332 - val_loss: 0.2644 - val_accuracy: 0.9152
Epoch 19/500
575/575 [==============================] - ETA: 0s - loss: 0.2157 - accuracy: 0.9347
Epoch 00019: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
575/575 [==============================] - 849s 1s/step - loss: 0.2157 - accuracy: 0.9347 - val_loss: 0.2731 - val_accuracy: 0.9181
Epoch 20/500
575/575 [==============================] - 841s 1s/step - loss: 0.2034 - accuracy: 0.9395 - val_loss: 0.2667 - val_accuracy: 0.9147
Epoch 21/500
575/575 [==============================] - 829s 1s/step - loss: 0.1991 - accuracy: 0.9419 - val_loss: 0.2618 - val_accuracy: 0.9156
Epoch 22/500
575/575 [==============================] - 832s 1s/step - loss: 0.2027 - accuracy: 0.9402 - val_loss: 0.2621 - val_accuracy: 0.9177
Epoch 23/500
575/575 [==============================] - 829s 1s/step - loss: 0.1975 - accuracy: 0.9417 - val_loss: 0.2615 - val_accuracy: 0.9168
Epoch 24/500
575/575 [==============================] - 826s 1s/step - loss: 0.1981 - accuracy: 0.9411 - val_loss: 0.2614 - val_accuracy: 0.9168
Epoch 25/500
575/575 [==============================] - 830s 1s/step - loss: 0.1989 - accuracy: 0.9407 - val_loss: 0.2616 - val_accuracy: 0.9181
Epoch 26/500
575/575 [==============================] - 837s 1s/step - loss: 0.1976 - accuracy: 0.9414 - val_loss: 0.2606 - val_accuracy: 0.9185
Epoch 27/500
575/575 [==============================] - 837s 1s/step - loss: 0.2008 - accuracy: 0.9380 - val_loss: 0.2587 - val_accuracy: 0.9177
Epoch 28/500
575/575 [==============================] - 838s 1s/step - loss: 0.1980 - accuracy: 0.9411 - val_loss: 0.2584 - val_accuracy: 0.9211
Epoch 29/500
575/575 [==============================] - 843s 1s/step - loss: 0.1989 - accuracy: 0.9407 - val_loss: 0.2579 - val_accuracy: 0.9190
Epoch 30/500
575/575 [==============================] - 842s 1s/step - loss: 0.2006 - accuracy: 0.9401 - val_loss: 0.2608 - val_accuracy: 0.9177
Epoch 31/500
575/575 [==============================] - ETA: 0s - loss: 0.1966 - accuracy: 0.9398
Epoch 00031: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
575/575 [==============================] - 839s 1s/step - loss: 0.1966 - accuracy: 0.9398 - val_loss: 0.2614 - val_accuracy: 0.9185
Epoch 32/500
575/575 [==============================] - 848s 1s/step - loss: 0.1947 - accuracy: 0.9411 - val_loss: 0.2600 - val_accuracy: 0.9190
Epoch 33/500
575/575 [==============================] - ETA: 0s - loss: 0.1970 - accuracy: 0.9409
Epoch 00033: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
575/575 [==============================] - 853s 1s/step - loss: 0.1970 - accuracy: 0.9409 - val_loss: 0.2599 - val_accuracy: 0.9190
Epoch 34/500
575/575 [==============================] - ETA: 0s - loss: 0.1961 - accuracy: 0.9410
Restoring model weights from the end of the best epoch.
575/575 [==============================] - 849s 1s/step - loss: 0.1961 - accuracy: 0.9410 - val_loss: 0.2598 - val_accuracy: 0.9190
Epoch 00034: early stopping
'''
Training is done; visualize the results by plotting loss and accuracy over time on the training and validation sets.
import pandas as pd

# Plot loss and accuracy over time on the training and validation sets
fig, ax = plt.subplots(2, 1, figsize=(20, 8))
df = pd.DataFrame(history.history)
df[['accuracy', 'val_accuracy']].plot(ax=ax[0])
df[['loss', 'val_loss']].plot(ax=ax[1])
ax[0].set_title('Accuracy', fontsize=15)
ax[1].set_title('Loss', fontsize=15)
ax[0].grid(linestyle="-.")
ax[1].grid(linestyle="-.")
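Since each epoch above takes roughly 14 minutes, it is worth persisting the trained model rather than retraining it; a minimal sketch (the filename is arbitrary):

# Optionally save the trained model so the experiment doesn't have to be rerun
model.save("digit_cnn.h5")  # arbitrary filename
# Later: from tensorflow.keras.models import load_model; model = load_model("digit_cnn.h5")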
Evaluating on the test set gives a test loss of 0.245 and an accuracy of 0.93.
model.evaluate(test_set)
'''
outputs:
72/72 [==============================] - 24s 331ms/step - loss: 0.2450 - accuracy: 0.9304
[0.24495860934257507, 0.9304347634315491]
'''
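To classify a single new recording, apply the same preprocessing and feed the saved spectrogram image back through the network. A minimal sketch, where the file path is a placeholder; note that flow_from_directory assigns class indices alphabetically, so the mapping should come from the generator rather than from the labels list:

import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

# Placeholder path: any spectrogram PNG produced by the extraction step above
img = load_img("test_set/three/example.png", target_size=(128, 128), color_mode="grayscale")
x = img_to_array(img)[np.newaxis, ...]  # shape (1, 128, 128, 1), matching input_size
probs = model.predict(x)

# Recover the class names in the order the generator assigned them
idx_to_label = {v: k for k, v in training_set.class_indices.items()}
print(idx_to_label[int(np.argmax(probs))])  # predicted spoken digit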