赞
踩
"""Grid-search an RBF SVM for speech emotion recognition on the CASIA corpus.

Expected dataset layout: ``<path>/<speaker>/<emotion>/<clip>.wav`` where
``<emotion>`` is one of the keys of ``EMOTION_LABEL``.
"""
import librosa
import os
from random import shuffle
import numpy as np
from sklearn import svm
import joblib
import sklearn
from sklearn.metrics import accuracy_score

# SVC hyper-parameters used below:
#   C: penalty on the error term; the larger C is, the less error is tolerated.
#   gamma: coefficient of the RBF kernel; larger values give fewer support
#          vectors, smaller values give more.
#   kernel: linear, poly, rbf, sigmoid, precomputed
#   decision_function_shape: ovo, ovr (default)

# Root directory of the dataset.
path = './casia'

# Directory the trained classifiers are written to.
MODEL_DIR = 'Models'

# Maps the emotion directory name to its class id (stored as a string and
# converted with int() when the labels are built).
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}


def getFeature(path, mfcc_feature_num=16):
    """Extract a fixed-length feature vector from one audio file.

    The vector is laid out as ``mfcc_feature_num`` MFCC values followed by the
    mean zero-crossing rate and the mean RMS energy (twice).

    Args:
        path: Path of the audio file to load.
        mfcc_feature_num: How many values to keep from the frame-major
            flattened 16-coefficient MFCC matrix.

    Returns:
        1-D numpy array of length ``mfcc_feature_num + 3``.
    """
    y, sr = librosa.load(path)

    # Keyword arguments are required: recent librosa releases made the
    # audio arguments of mfcc() and rms() keyword-only.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y=y)
    rms_feature = librosa.feature.rms(y=y)

    # Frame-major flattening: the 16 coefficients of frame 0, then frame 1, ...
    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    missing = mfcc_feature_num - mfcc_feature.size
    if missing > 0:
        # Very short clips would otherwise yield a shorter vector and make
        # the stacked data set ragged.
        mfcc_feature = np.pad(mfcc_feature, (0, missing), mode='constant')

    zcr_mean = np.mean(zcr_feature)
    rms_mean = np.mean(rms_feature)
    # The RMS mean fills two slots (the former "energy" and "rms" features,
    # which were computed identically) so the vector layout stays unchanged.
    return np.concatenate((mfcc_feature, [zcr_mean, rms_mean, rms_mean]))


def _collect_wav_files(root):
    """Return the paths of all wav files under ``root/<speaker>/<emotion>/``."""
    wav_files = []
    for person in os.listdir(root):
        person_dir = os.path.join(root, person)
        if not os.path.isdir(person_dir):
            continue  # stray files such as *.txt
        for emotion in os.listdir(person_dir):
            emotion_dir = os.path.join(person_dir, emotion)
            if not os.path.isdir(emotion_dir):
                continue  # stray files such as desktop.ini
            for file_name in os.listdir(emotion_dir):
                if file_name.endswith('wav'):
                    wav_files.append(os.path.join(emotion_dir, file_name))
    return wav_files


def getData(mfcc_feature_num=16):
    """Load features and emotion labels for every clip of the dataset.

    The file list is shuffled before feature extraction, so the row order
    differs between calls.

    Args:
        mfcc_feature_num: Forwarded to ``getFeature``.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays; ``features`` has one row
        per clip and ``labels`` holds the integer class ids.

    Raises:
        KeyError: If a clip sits in a directory that is not an emotion name
            listed in ``EMOTION_LABEL``.
    """
    wav_file_path = _collect_wav_files(path)
    shuffle(wav_file_path)

    data_feature = []
    data_labels = []
    for wav_file in wav_file_path:
        data_feature.append(getFeature(wav_file, mfcc_feature_num))
        # The emotion is the name of the parent directory; taking it via
        # os.path works with any path separator, not only '\\'.
        emotion = os.path.basename(os.path.dirname(wav_file))
        data_labels.append(int(EMOTION_LABEL[emotion]))
    return np.array(data_feature), np.array(data_labels)


def train():
    """Grid-search C and the number of MFCC values, saving every model.

    Features are extracted once with the largest MFCC count and sliced for the
    smaller counts, so the audio files are read a single time and every
    parameter combination is scored on the same train/test split.

    Raises:
        ValueError: If the dataset has too few clips to leave a test set.
    """
    best_acc = 0
    best_mfcc_feature_num = 0
    best_C = 0

    c_values = range(13, 20)
    mfcc_nums = range(40, 55)
    split_num = 200  # the first split_num shuffled clips form the training set

    max_mfcc_num = max(mfcc_nums)
    all_features, data_labels = getData(max_mfcc_num)
    if len(data_labels) <= split_num:
        raise ValueError(
            'need more than %d clips under %r, found %d'
            % (split_num, path, len(data_labels)))

    # getFeature(p, i) is the first i MFCC values of getFeature(p, max)
    # followed by the same three trailing statistics.
    stat_features = all_features[:, max_mfcc_num:]
    train_label = data_labels[:split_num]
    test_label = data_labels[split_num:]

    os.makedirs(MODEL_DIR, exist_ok=True)

    for C in c_values:
        for i in mfcc_nums:
            data_feature = np.hstack((all_features[:, :i], stat_features))
            train_data = data_feature[:split_num, :]
            test_data = data_feature[split_num:, :]

            # probability=True so the saved model supports predict_proba().
            clf = svm.SVC(
                decision_function_shape='ovo',
                kernel='rbf',
                C=C,
                gamma=0.0001,
                probability=True)
            print("train start")
            clf.fit(train_data, train_label)
            print("train over")
            print(C, i)

            acc = accuracy_score(test_label, clf.predict(test_data))
            if acc > best_acc:
                best_acc = acc
                best_C = C
                best_mfcc_feature_num = i
            print('best_acc', best_acc)
            print('best_C', best_C)
            print('best_mfcc_feature_num', best_mfcc_feature_num)
            print()

            # Save the model of this parameter combination.
            joblib.dump(
                clf,
                os.path.join(
                    MODEL_DIR, 'C_' + str(C) + '_mfccNum_' + str(i) + '.m'))

    print('best_acc', best_acc)
    print('best_C', best_C)
    print('best_mfcc_feature_num', best_mfcc_feature_num)


# Run the grid search and report the best accuracy with its parameters.
if __name__ == "__main__":
    train()
EMOTION_LABEL字典、get_feature函数和get_data函数的作用与SVM版本基本一致(区别在于标签改为从0开始的整数,特征改为定长的MFCC序列)。build_model函数用于构建LSTM模型,这里只给出了一个简单的框架,可根据需要自行调整。
"""Train an LSTM speech emotion classifier on MFCC sequences (CASIA corpus).

Expected dataset layout: ``./casia/<speaker>/<emotion>/<clip>.wav`` where
``<emotion>`` is one of the keys of ``EMOTION_LABEL``.
"""
import os
import librosa
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical

# Root directory of the dataset.
DATA_DIR = './casia'

# Maps the emotion directory name to its class index.
EMOTION_LABEL = {
    'angry': 0,
    'fear': 1,
    'happy': 2,
    'neutral': 3,
    'sad': 4,
    'surprise': 5
}


def get_feature(path, n_mfcc=13, max_len=80):
    """Extract a fixed-length, normalised MFCC sequence from one audio file.

    Args:
        path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients per frame.
        max_len: Number of frames to keep; shorter clips are zero-padded at
            the end, longer clips are truncated.

    Returns:
        Array of shape ``(max_len, n_mfcc)``: one row per time frame, which is
        the ``(timesteps, features)`` layout the LSTM in ``build_model`` takes.
    """
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # (n_mfcc, frames)

    # Standardise every coefficient over the frames of this clip.
    frames = StandardScaler().fit_transform(mfcc.T)  # (frames, n_mfcc)

    missing = max_len - frames.shape[0]
    if missing > 0:
        frames = np.pad(frames, pad_width=((0, missing), (0, 0)),
                        mode='constant')
    else:
        frames = frames[:max_len, :]
    return frames


def _collect_wav_files(root):
    """Return the paths of all wav files under ``root/<speaker>/<emotion>/``."""
    wav_files = []
    for person in os.listdir(root):
        person_dir = os.path.join(root, person)
        if not os.path.isdir(person_dir):
            continue  # stray files such as *.txt
        for emotion in os.listdir(person_dir):
            emotion_dir = os.path.join(person_dir, emotion)
            if not os.path.isdir(emotion_dir):
                continue  # stray files such as desktop.ini
            for file_name in os.listdir(emotion_dir):
                if file_name.endswith('wav'):
                    wav_files.append(os.path.join(emotion_dir, file_name))
    return wav_files


def get_data(n_mfcc=13, max_len=80):
    """Load MFCC sequences and one-hot emotion labels for the whole dataset.

    Args:
        n_mfcc: Forwarded to ``get_feature``.
        max_len: Forwarded to ``get_feature``.

    Returns:
        Tuple ``(features, labels)``: ``features`` stacks one
        ``(max_len, n_mfcc)`` array per clip and ``labels`` has one one-hot
        row of width ``len(EMOTION_LABEL)`` per clip.

    Raises:
        KeyError: If a clip sits in a directory that is not an emotion name
            listed in ``EMOTION_LABEL``.
    """
    wav_file_path = _collect_wav_files(DATA_DIR)
    # Shuffle the clips in place.
    np.random.shuffle(wav_file_path)

    data_feature = []
    data_labels = []
    for wav_file in wav_file_path:
        data_feature.append(get_feature(wav_file, n_mfcc, max_len))
        # The emotion is the name of the parent directory; taking it via
        # os.path works with any path separator, not only '\\'.
        emotion = os.path.basename(os.path.dirname(wav_file))
        data_labels.append(EMOTION_LABEL[emotion])

    # Fix the width explicitly so it does not depend on which classes happen
    # to be present in the data.
    data_labels = to_categorical(data_labels, num_classes=len(EMOTION_LABEL))
    return np.array(data_feature), np.array(data_labels)


def build_model(n_mfcc=13, max_len=80, n_classes=6):
    """Build and compile a two-layer LSTM classifier.

    Args:
        n_mfcc: Number of features per time step.
        max_len: Number of time steps per sample.
        n_classes: Size of the softmax output layer.

    Returns:
        Compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # Keras recurrent layers take (timesteps, features): the frame axis is
    # the time axis, the MFCC coefficients are the per-step features.
    model.add(LSTM(128, input_shape=(max_len, n_mfcc), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


if __name__ == '__main__':
    # Load the dataset.
    X, y = get_data()

    # Split into training and test sets.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Build the model.
    model = build_model()

    # Train; the held-out split is also used to monitor validation metrics.
    model.fit(X_train, y_train, validation_data=(X_test, y_test),
              batch_size=32, epochs=50)

    # Evaluate on the held-out split.
    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the trained model.
    model.save('emotion_recognition_model.h5')
"""Demo: predict the emotion of a few wav files with a trained SVM model."""
import joblib
import numpy as np
import wave
import librosa

path = './casia'

# Maps the emotion name to the class id (as a string) used during training.
EMOTION_LABEL = {
    'angry': '1',
    'fear': '2',
    'happy': '3',
    'neutral': '4',
    'sad': '5',
    'surprise': '6'
}

# Reverse lookup: integer class id -> emotion name.
LABEL_BY_ID = {int(class_id): name for name, class_id in EMOTION_LABEL.items()}


def getFeature(path, mfcc_feature_num=16):
    """Extract a fixed-length feature vector from one audio file.

    The vector is laid out as ``mfcc_feature_num`` MFCC values followed by the
    mean zero-crossing rate and the mean RMS energy (twice).

    Args:
        path: Path of the audio file to load.
        mfcc_feature_num: How many values to keep from the frame-major
            flattened 16-coefficient MFCC matrix.

    Returns:
        1-D numpy array of length ``mfcc_feature_num + 3``.
    """
    y, sr = librosa.load(path)

    # Keyword arguments are required: recent librosa releases made the
    # audio arguments of mfcc() and rms() keyword-only.
    mfcc_feature = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=16)
    zcr_feature = librosa.feature.zero_crossing_rate(y=y)
    rms_feature = librosa.feature.rms(y=y)

    # Frame-major flattening: the 16 coefficients of frame 0, then frame 1, ...
    mfcc_feature = mfcc_feature.T.flatten()[:mfcc_feature_num]
    missing = mfcc_feature_num - mfcc_feature.size
    if missing > 0:
        # Very short clips would otherwise yield a vector of the wrong length.
        mfcc_feature = np.pad(mfcc_feature, (0, missing), mode='constant')

    zcr_mean = np.mean(zcr_feature)
    rms_mean = np.mean(rms_feature)
    # The RMS mean fills two slots (the former "energy" and "rms" features,
    # which were computed identically) so the vector layout stays unchanged.
    return np.concatenate((mfcc_feature, [zcr_mean, rms_mean, rms_mean]))


def predict_emotions(model, wav_paths, mfcc_feature_num=48):
    """Predict the most likely emotion for each audio file.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``
            whose class ids are the integer values of ``EMOTION_LABEL``.
        wav_paths: Iterable of audio file paths.
        mfcc_feature_num: Number of MFCC values per feature vector; must match
            the value the model was trained with.

    Returns:
        List of ``[emotion_name, probability]`` pairs, one per input file.
    """
    results = []
    for wav_path in wav_paths:
        print(wav_path)
        data_feature = getFeature(wav_path, mfcc_feature_num)
        probability_data = model.predict_proba([data_feature])[0]
        max_probability_index = int(np.argmax(probability_data))
        # The columns of predict_proba follow model.classes_, so translate
        # the column index to the class id before looking up the name.
        class_id = int(model.classes_[max_probability_index])
        # float() keeps the printed result free of numpy scalar wrappers.
        max_probability = float(probability_data[max_probability_index])
        results.append([LABEL_BY_ID[class_id], max_probability])
    return results


wav_paths = ['test1.wav', 'test2.wav', 'test3.wav', 'test4.wav']
# NOTE: joblib.load unpickles the file; only load model files you trust.
model = joblib.load("E:/speech/media/weights/classfier.m")

combined_list = predict_emotions(model, wav_paths)
print(combined_list)
输出:
[['neutral', 0.28888379468448255], ['happy', 0.4550522457604587], ['happy', 0.5324629391829391], ['happy', 0.4118509132866488]]
简单的一个demo。
个人邮箱:k1933211129@163.com
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。