This project is based on Speech-Emotion-Classification-with-PyTorch. Using a Parallel 2D CNN - Transformer Encoder model on the RAVDESS dataset, it achieves 96.78% accuracy.
The RAVDESS dataset contains 1440 audio files (16-bit, 48 kHz, .wav), recorded by 24 North American actors (12 male, 12 female), each speaking two lexically matched statements (Statement 01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
The class distribution is nearly balanced: each emotion has 192 files, except neutral, which has 96 because it was only recorded at normal intensity.
Every file has a unique filename built from 7 numeric identifiers (e.g., 03-01-06-01-02-01-12.wav). Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised). Each emotion comes in two intensities: 01 = normal, 02 = strong (except neutral, which only exists at normal intensity). Actor (01 to 24): odd numbers are male, even numbers are female.
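As a quick illustration, the seven identifiers can be decoded straight from the filename. The field order below follows the RAVDESS documentation (modality, vocal channel, emotion, intensity, statement, repetition, actor); the small helper is only a sketch and is not part of the original notebook:

RAVDESS_FIELDS = ['modality', 'vocal_channel', 'emotion', 'intensity',
                  'statement', 'repetition', 'actor']

def decode_filename(filename):
    """Decode e.g. '03-01-06-01-02-01-12.wav' into its seven numeric identifiers."""
    parts = filename.split('.')[0].split('-')
    return dict(zip(RAVDESS_FIELDS, (int(p) for p in parts)))

print(decode_filename('03-01-06-01-02-01-12.wav'))
# {'modality': 3, 'vocal_channel': 1, 'emotion': 6, 'intensity': 1,
#  'statement': 2, 'repetition': 1, 'actor': 12}
# i.e. audio-only speech, fearful, normal intensity, "Dogs are sitting by the door",
# 1st repetition, actor 12 (even number -> female)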
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

EMOTIONS = {1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 0:'surprise'}  # surprise is remapped from 8 to 0
DATA_PATH = '../archive/audio_speech_actors_01-24/'
SAMPLE_RATE = 48000

data = pd.DataFrame(columns=['Emotion', 'Emotion intensity', 'Gender', 'Path'])
for dirpath, _, filenames in os.walk(DATA_PATH):  # os.walk() yields (dirpath, dirnames, filenames) for every directory under DATA_PATH
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        identifiers = filename.split('.')[0].split('-')  # filename: 03-01-01-01-01-01-01.wav
        # print("identifiers:", identifiers)  # identifiers: ['03', '01', '01', '01', '01', '01', '01']
        emotion = (int(identifiers[2]))
        if emotion == 8:  # remap surprise from 8 to 0
            emotion = 0
        if int(identifiers[3]) == 1:
            emotion_intensity = 'normal'
        else:
            emotion_intensity = 'strong'
        if int(identifiers[6]) % 2 == 0:
            gender = 'female'
        else:
            gender = 'male'

        # note: DataFrame.append was removed in pandas 2.0; on newer versions collect the rows in a list and build the DataFrame once
        data = data.append({"Emotion": emotion,
                            "Emotion intensity": emotion_intensity,
                            "Gender": gender,
                            "Path": file_path
                            },
                           ignore_index=True
                           )
print("number of files is {}".format(len(data)))
data.head()
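A one-line check of the class balance mentioned above (illustrative, not in the original notebook); it should report 192 files per emotion, except 96 for neutral:

# Count files per emotion; neutral has half as many because it only exists at normal intensity
print(data['Emotion'].map(EMOTIONS).value_counts())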
Signals are loaded at a 48 kHz sample rate, skipping the first 0.5 s and keeping at most 3 s of audio. If a signal is shorter than 3 s, it is zero-padded to 3 s.
mel_spectrograms = []
signals = []
for i, file_path in enumerate(data.Path):
    audio, sample_rate = librosa.load(file_path, duration=3, offset=0.5, sr=SAMPLE_RATE)
    signal = np.zeros(int(SAMPLE_RATE*3))
    # signal = audio  # would fail later at np.stack with "ValueError: all input arrays must have the same shape", because the raw signals have different lengths
    # print("len_signal", len(signal))
    signal[:len(audio)] = audio
    signals.append(signal)
    print("\r Processed {}/{} files".format(i, len(data)), end='')
# print("signals_before_stack:", signals)
signals = np.stack(signals, axis=0)
# print("signals_after_stack:", signals)
The dataset is split into train, validation, and test sets with an 80/10/10% ratio, stratified per emotion class.
X = signals
train_ind, test_ind, val_ind = [], [], []
X_train, X_val, X_test = [], [], []
Y_train, Y_val, Y_test = [], [], []
for emotion in range(len(EMOTIONS)):
    emotion_ind = list(data.loc[data.Emotion == emotion, 'Emotion'].index)  # indices of all samples with this emotion
    emotion_ind = np.random.permutation(emotion_ind)  # shuffle the indices before splitting; permutation returns a new shuffled array
    m = len(emotion_ind)
    ind_train = emotion_ind[:int(0.8*m)]
    ind_val = emotion_ind[int(0.8*m):int(0.9*m)]
    ind_test = emotion_ind[int(0.9*m):]
    X_train.append(X[ind_train,:])
    Y_train.append(np.array([emotion]*len(ind_train), dtype=np.int32))
    X_val.append(X[ind_val,:])
    Y_val.append(np.array([emotion]*len(ind_val), dtype=np.int32))
    X_test.append(X[ind_test,:])
    Y_test.append(np.array([emotion]*len(ind_test), dtype=np.int32))
    train_ind.append(ind_train)
    test_ind.append(ind_test)
    val_ind.append(ind_val)
X_train = np.concatenate(X_train, 0)
X_val = np.concatenate(X_val, 0)
X_test = np.concatenate(X_test, 0)
Y_train = np.concatenate(Y_train, 0)
Y_val = np.concatenate(Y_val, 0)
Y_test = np.concatenate(Y_test, 0)
train_ind = np.concatenate(train_ind, 0)
val_ind = np.concatenate(val_ind, 0)
test_ind = np.concatenate(test_ind, 0)
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
print(f'X_val:{X_val.shape}, Y_val:{Y_val.shape}')
print(f'X_test:{X_test.shape}, Y_test:{Y_test.shape}')
# check that every sample is assigned to exactly one split
unique, count = np.unique(np.concatenate([train_ind, test_ind, val_ind], 0), return_counts=True)
print("Number of unique indexes is {}, out of {}".format(sum(count == 1), X.shape[0]))

del X
Data augmentation is performed by adding Additive White Gaussian Noise (AWGN), with an SNR drawn from the range [15, 30] dB, to the original signals. This greatly improved accuracy and reduced overfitting.
def addAWGN(signal, num_bits=16, augmented_num=2, snr_low=15, snr_high=30):
    signal_len = len(signal)
    # Generate white Gaussian noise
    noise = np.random.normal(size=(augmented_num, signal_len))
    # Normalize signal and noise
    norm_constant = 2.0**(num_bits-1)
    signal_norm = signal / norm_constant
    noise_norm = noise / norm_constant
    # Compute signal and noise power
    s_power = np.sum(signal_norm ** 2) / signal_len
    n_power = np.sum(noise_norm ** 2, axis=1) / signal_len
    # Random target SNR in dB, drawn uniformly from [snr_low, snr_high)
    target_snr = np.random.randint(snr_low, snr_high)
    # Compute the noise scaling factor K for each noise realisation
    K = np.sqrt((s_power / n_power) * 10 ** (- target_snr / 10))
    K = np.ones((signal_len, augmented_num)) * K
    # Generate the noisy signals, one per row
    return signal + K.T * noise

aug_signals = []
aug_labels = []
for i in range(X_train.shape[0]):
    signal = X_train[i,:]
    augmented_signals = addAWGN(signal)
    for j in range(augmented_signals.shape[0]):
        aug_labels.append(Y_train[i])  # use Y_train[i] (aligned with X_train); data.loc[i,"Emotion"] would pick the wrong row after the per-class shuffle above
        aug_signals.append(augmented_signals[j,:])
        data = data.append(data.iloc[i], ignore_index=True)  # requires pandas < 2.0 (DataFrame.append was removed in 2.0)
    print("\r Processed {}/{} files".format(i, X_train.shape[0]), end='')
aug_signals = np.stack(aug_signals, axis=0)
X_train = np.concatenate([X_train, aug_signals], axis=0)
aug_labels = np.stack(aug_labels, axis=0)
Y_train = np.concatenate([Y_train, aug_labels])
print('')
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
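As a quick sanity check (not part of the original notebook), the realised SNR of an augmented copy can be measured directly; it should land inside the requested [15, 30] dB range:

clean = signals[0]                       # one zero-padded original signal
noisy = addAWGN(clean)[0]                # first of the two augmented copies
noise_only = noisy - clean
snr_db = 10 * np.log10(np.sum(clean ** 2) / np.sum(noise_only ** 2))
print(f"realised SNR: {snr_db:.1f} dB")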
A mel spectrogram is computed for every signal and used as the model input (for the 1st and 2nd model of the original project, the spectrogram is split into 7 chunks).
Example of a mel spectrogram:
def getMELspectrogram(audio, sample_rate):
    mel_spec = librosa.feature.melspectrogram(y=audio,
                                              sr=sample_rate,
                                              n_fft=1024,
                                              win_length=512,
                                              window='hamming',
                                              hop_length=256,  # hop_length is the number of samples between successive frames (windows)
                                              n_mels=128,
                                              fmax=sample_rate/2
                                              )
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

# test function
audio, sample_rate = librosa.load(data.loc[0,'Path'], duration=3, offset=0.5, sr=SAMPLE_RATE)
signal = np.zeros(int(SAMPLE_RATE*3))
signal[:len(audio)] = audio
mel_spectrogram = getMELspectrogram(signal, SAMPLE_RATE)
librosa.display.specshow(mel_spectrogram, y_axis='mel', x_axis='time')
print('MEL spectrogram shape: ', mel_spectrogram.shape)
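A sketch of how the padded signals can be turned into model inputs, following the step described above (the variable and function names here are illustrative, not taken from the original notebook): every signal becomes a 128 x T mel spectrogram and a channel axis is added so the 2D CNN receives (N, 1, 128, T) tensors.

def build_mel_batch(signal_array):
    # Convert each padded 3 s signal into a mel spectrogram and stack them
    specs = [getMELspectrogram(signal_array[i], SAMPLE_RATE)
             for i in range(signal_array.shape[0])]
    specs = np.stack(specs, axis=0).astype(np.float32)   # (N, 128, T)
    return np.expand_dims(specs, axis=1)                  # (N, 1, 128, T) - channel axis for the CNN

X_train_mel = build_mel_batch(X_train)
X_val_mel = build_mel_batch(X_val)
X_test_mel = build_mel_batch(X_test)
print(X_train_mel.shape, X_val_mel.shape, X_test_mel.shape)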
import torch
import torch.nn as nn

class ParallelModel(nn.Module):
    def __init__(self, num_emotions):
        super().__init__()
        # conv block
        self.conv2Dblock = nn.Sequential(
            # 1. conv block
            nn.Conv2d(in_channels=1,
                      out_channels=16,
                      kernel_size=3,
                      stride=1,
                      padding=1
                      ),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.3),
            # 2. conv block
            nn.Conv2d(in_channels=16,
                      out_channels=32,
                      kernel_size=3,
                      stride=1,
                      padding=1
                      ),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 3. conv block
            nn.Conv2d(in_channels=32,
                      out_channels=64,
                      kernel_size=3,
                      stride=1,
                      padding=1
                      ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 4. conv block
            nn.Conv2d(in_channels=64,
                      out_channels=64,
                      kernel_size=3,
                      stride=1,
                      padding=1
                      ),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3)
        )
        # Transformer block
        self.transf_maxpool = nn.MaxPool2d(kernel_size=[2,4], stride=[2,4])
        transf_layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=512, dropout=0.4, activation='relu')
        self.transf_encoder = nn.TransformerEncoder(transf_layer, num_layers=4)
        # Linear softmax layer
        self.out_linear = nn.Linear(320, num_emotions)
        self.dropout_linear = nn.Dropout(p=0)
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # conv embedding
        conv_embedding = self.conv2Dblock(x)  # (b, channel, freq, time)
        conv_embedding = torch.flatten(conv_embedding, start_dim=1)  # do not flatten batch dimension
        # transformer embedding
        x_reduced = self.transf_maxpool(x)
        x_reduced = torch.squeeze(x_reduced, 1)
        x_reduced = x_reduced.permute(2, 0, 1)  # requires shape = (time, batch, embedding)
        transf_out = self.transf_encoder(x_reduced)
        transf_embedding = torch.mean(transf_out, dim=0)
        # concatenate
        complete_embedding = torch.cat([conv_embedding, transf_embedding], dim=1)
        # final Linear
        output_logits = self.out_linear(complete_embedding)
        output_logits = self.dropout_linear(output_logits)
        output_softmax = self.out_softmax(output_logits)
        return output_logits, output_softmax
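A quick shape check of the model (illustrative; the 563-frame time dimension assumes the 3 s / 48 kHz / hop_length=256 mel spectrograms computed above). It also makes the 320 input features of out_linear explicit: 256 come from the flattened CNN branch (64 channels x 1 x 4 after the four max-pools) and 64 from the mean-pooled transformer branch.

model = ParallelModel(num_emotions=len(EMOTIONS))
dummy = torch.randn(4, 1, 128, 563)   # (batch, channel, n_mels, time) - assumed input size
logits, probs = model(dummy)
print(logits.shape, probs.shape)      # torch.Size([4, 8]) torch.Size([4, 8])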