
Emotion Classification of Audio Data: A PyTorch Implementation on the RAVDESS Dataset


This project is based on Speech-Emotion-Classification-with-PyTorch. Using a Parallel 2D CNN - Transformer Encoder model, it achieves 96.78% accuracy on the RAVDESS dataset.

1. Load the data

The RAVDESS dataset contains 1440 audio files (16-bit, 48 kHz, .wav), recorded by 24 North American actors (12 male, 12 female), each speaking two lexically matched statements (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

The emotion classes are evenly distributed, except for neutral, which has no strong-intensity recordings and therefore fewer files.

Each file has a unique filename composed of 7 numeric identifiers (e.g., 03-01-06-01-02-01-12.wav). The third identifier encodes the emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised). Each emotion has two intensities: 01 = normal, 02 = strong (except neutral, which only occurs at normal intensity). The last identifier is the actor (01 to 24); odd numbers are male, even numbers are female.

import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

EMOTIONS = {1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 0:'surprise'}  # surprise is remapped from 8 to 0
DATA_PATH = '../archive/audio_speech_actors_01-24/'
SAMPLE_RATE = 48000

rows = []
# os.walk() yields (dirpath, dirnames, filenames) for every directory under DATA_PATH
for dirpath, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        identifiers = filename.split('.')[0].split('-')  # filename: 03-01-01-01-01-01-01.wav
        # identifiers: ['03', '01', '01', '01', '01', '01', '01']
        emotion = int(identifiers[2])
        if emotion == 8:  # remap surprise from 8 to 0
            emotion = 0
        if int(identifiers[3]) == 1:
            emotion_intensity = 'normal'
        else:
            emotion_intensity = 'strong'
        if int(identifiers[6]) % 2 == 0:
            gender = 'female'
        else:
            gender = 'male'
        rows.append({"Emotion": emotion,
                     "Emotion intensity": emotion_intensity,
                     "Gender": gender,
                     "Path": file_path})
# DataFrame.append was removed in pandas 2.0, so rows are collected in a list and converted once
data = pd.DataFrame(rows, columns=['Emotion', 'Emotion intensity', 'Gender', 'Path'])
print("number of files is {}".format(len(data)))
data.head()
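As a quick, optional sanity check of the class balance claimed above (not part of the original post), the per-emotion file counts can be printed from the `data` DataFrame just built:

# count files per emotion label (the column holds the remapped integer codes)
counts = data['Emotion'].value_counts().sort_index()
print(counts.rename(index=EMOTIONS))
# expected for RAVDESS speech: 96 neutral files and 192 files for each of the other classes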

 

2. Load the signals

Each file is loaded at a sample rate of 48 kHz, skipping the first 0.5 s and keeping at most 3 s of audio. If the resulting signal is shorter than 3 s, it is zero-padded to that length.

mel_spectrograms = []
signals = []
for i, file_path in enumerate(data.Path):
    audio, sample_rate = librosa.load(file_path, duration=3, offset=0.5, sr=SAMPLE_RATE)
    # zero-pad every signal to exactly 3 seconds so that np.stack receives arrays of equal length
    signal = np.zeros(int(SAMPLE_RATE * 3))
    signal[:len(audio)] = audio
    signals.append(signal)
    print("\r Processed {}/{} files".format(i, len(data)), end='')
signals = np.stack(signals, axis=0)
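Although `IPython.display.Audio` is imported above, it is never used in this excerpt. In a notebook environment it can be used to listen to one of the padded signals, for example:

# play the first loaded signal in a Jupyter notebook (illustrative usage, not part of the original pipeline)
Audio(signals[0], rate=SAMPLE_RATE)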

3. Split the data

The dataset is split into train, validation and test sets with an (80, 10, 10)% ratio, stratified by emotion class.

X = signals
train_ind, test_ind, val_ind = [], [], []
X_train, X_val, X_test = [], [], []
Y_train, Y_val, Y_test = [], [], []
for emotion in range(len(EMOTIONS)):
    # indices of all samples with this emotion label
    emotion_ind = list(data.loc[data.Emotion == emotion, 'Emotion'].index)
    # shuffle the indices before splitting; np.random.permutation returns a new shuffled array
    emotion_ind = np.random.permutation(emotion_ind)
    m = len(emotion_ind)
    ind_train = emotion_ind[:int(0.8*m)]
    ind_val = emotion_ind[int(0.8*m):int(0.9*m)]
    ind_test = emotion_ind[int(0.9*m):]
    X_train.append(X[ind_train,:])
    Y_train.append(np.array([emotion]*len(ind_train), dtype=np.int32))
    X_val.append(X[ind_val,:])
    Y_val.append(np.array([emotion]*len(ind_val), dtype=np.int32))
    X_test.append(X[ind_test,:])
    Y_test.append(np.array([emotion]*len(ind_test), dtype=np.int32))
    train_ind.append(ind_train)
    test_ind.append(ind_test)
    val_ind.append(ind_val)
X_train = np.concatenate(X_train, 0)
X_val = np.concatenate(X_val, 0)
X_test = np.concatenate(X_test, 0)
Y_train = np.concatenate(Y_train, 0)
Y_val = np.concatenate(Y_val, 0)
Y_test = np.concatenate(Y_test, 0)
train_ind = np.concatenate(train_ind, 0)
val_ind = np.concatenate(val_ind, 0)
test_ind = np.concatenate(test_ind, 0)
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
print(f'X_val:{X_val.shape}, Y_val:{Y_val.shape}')
print(f'X_test:{X_test.shape}, Y_test:{Y_test.shape}')
# check that every sample ended up in exactly one split
unique, count = np.unique(np.concatenate([train_ind, test_ind, val_ind], 0), return_counts=True)
print("Number of unique indexes is {}, out of {}".format(sum(count==1), X.shape[0]))
del X
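To confirm that the split is indeed stratified, the per-class counts of the three label arrays can be compared (a small optional check, not in the original code); each class should follow the 80/10/10 ratio:

# per-class sample counts in each split
for name, y in [('train', Y_train), ('val', Y_val), ('test', Y_test)]:
    labels, counts = np.unique(y, return_counts=True)
    print(name, {EMOTIONS[int(l)]: int(c) for l, c in zip(labels, counts)})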

4. Augment signals by adding AWGN

Data augmentation is performed by adding additive white Gaussian noise (AWGN), with the SNR drawn uniformly from [15, 30] dB, to the original training signals. This greatly improved accuracy and reduced overfitting. The code follows; a short derivation of the noise-scaling factor it uses is given after it.

def addAWGN(signal, num_bits=16, augmented_num=2, snr_low=15, snr_high=30):
    signal_len = len(signal)
    # Generate white Gaussian noise
    noise = np.random.normal(size=(augmented_num, signal_len))
    # Normalize signal and noise
    norm_constant = 2.0**(num_bits-1)
    signal_norm = signal / norm_constant
    noise_norm = noise / norm_constant
    # Compute signal and noise power
    s_power = np.sum(signal_norm ** 2) / signal_len
    n_power = np.sum(noise_norm ** 2, axis=1) / signal_len
    # Random SNR: Uniform [15, 30] in dB
    target_snr = np.random.randint(snr_low, snr_high)
    # Compute the scaling factor K for each noise realization
    K = np.sqrt((s_power / n_power) * 10 ** (- target_snr / 10))
    K = np.ones((signal_len, augmented_num)) * K
    # Generate the noisy signals, one per noise realization
    return signal + K.T * noise

aug_signals = []
aug_labels = []
for i in range(X_train.shape[0]):
    signal = X_train[i,:]
    augmented_signals = addAWGN(signal)
    for j in range(augmented_signals.shape[0]):
        # label of the i-th training signal; the original post used data.loc[i, "Emotion"],
        # which does not match the shuffled, per-class ordering of X_train
        aug_labels.append(Y_train[i])
        aug_signals.append(augmented_signals[j,:])
        # keep the metadata table in sync with the augmented sample
        # (train_ind[i] is the data row of X_train[i]; DataFrame.append was removed in pandas 2.0)
        data = pd.concat([data, data.iloc[[train_ind[i]]]], ignore_index=True)
    print("\r Processed {}/{} files".format(i, X_train.shape[0]), end='')
aug_signals = np.stack(aug_signals, axis=0)
X_train = np.concatenate([X_train, aug_signals], axis=0)
aug_labels = np.stack(aug_labels, axis=0)
Y_train = np.concatenate([Y_train, aug_labels])
print('')
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
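For reference, the scaling factor K computed in addAWGN follows directly from the definition of SNR in decibels, with P_s = s_power and P_n = n_power from the code:

\mathrm{SNR_{dB}} = 10\log_{10}\frac{P_s}{K^2 P_n}
\quad\Longrightarrow\quad
K = \sqrt{\frac{P_s}{P_n}\;10^{-\mathrm{SNR_{dB}}/10}}

so scaling the unit-power noise by K yields exactly the sampled target SNR for each noise realization.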

5. Calculate mel spectrograms

The mel spectrogram of each signal is computed and used as the input to the models (for the 1st and 2nd model of the original project, the spectrogram is split into 7 chunks along the time axis; a sketch of that chunking is given after the code below).
Example of the mel spectrogram:

 

def getMELspectrogram(audio, sample_rate):
    mel_spec = librosa.feature.melspectrogram(y=audio,
                                              sr=sample_rate,
                                              n_fft=1024,
                                              win_length=512,
                                              window='hamming',
                                              hop_length=256,  # hop_length: number of samples between successive frames
                                              n_mels=128,
                                              fmax=sample_rate/2)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db

# test the function on the first file
audio, sample_rate = librosa.load(data.loc[0,'Path'], duration=3, offset=0.5, sr=SAMPLE_RATE)
signal = np.zeros(int(SAMPLE_RATE * 3))
signal[:len(audio)] = audio
mel_spectrogram = getMELspectrogram(signal, SAMPLE_RATE)
librosa.display.specshow(mel_spectrogram, y_axis='mel', x_axis='time')
print('MEL spectrogram shape: ', mel_spectrogram.shape)
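The chunking mentioned above is not shown in this excerpt. Below is a minimal sketch of how a (n_mels, T) spectrogram could be cut into fixed-width time chunks; the helper name, window size of 128 frames and stride of 64 frames are illustrative assumptions, not values taken from the original code.

# Hypothetical helper: split a (n_mels, T) spectrogram into fixed-width chunks along the time axis
def split_into_chunks(mel_spec, win_size=128, stride=64):
    t = mel_spec.shape[1]
    num_chunks = (t - win_size) // stride + 1
    chunks = [mel_spec[:, i*stride : i*stride + win_size] for i in range(num_chunks)]
    return np.stack(chunks, axis=0)  # shape: (num_chunks, n_mels, win_size)

# With the 3 s / 48 kHz / hop_length=256 settings above (T = 563 frames),
# win_size=128 and stride=64 yield exactly 7 chunks.
chunks = split_into_chunks(mel_spectrogram)
print(chunks.shape)  # (7, 128, 128)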

6. Create the model

 

import torch
import torch.nn as nn

class ParallelModel(nn.Module):
    def __init__(self, num_emotions):
        super().__init__()
        # conv block
        self.conv2Dblock = nn.Sequential(
            # 1. conv block
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.3),
            # 2. conv block
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 3. conv block
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 4. conv block
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3)
        )
        # Transformer block
        self.transf_maxpool = nn.MaxPool2d(kernel_size=[2,4], stride=[2,4])
        transf_layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=512, dropout=0.4, activation='relu')
        self.transf_encoder = nn.TransformerEncoder(transf_layer, num_layers=4)
        # Linear softmax layer
        self.out_linear = nn.Linear(320, num_emotions)
        self.dropout_linear = nn.Dropout(p=0)
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # conv embedding
        conv_embedding = self.conv2Dblock(x)  # (b, channel, freq, time)
        conv_embedding = torch.flatten(conv_embedding, start_dim=1)  # do not flatten batch dimension
        # transformer embedding
        x_reduced = self.transf_maxpool(x)
        x_reduced = torch.squeeze(x_reduced, 1)
        x_reduced = x_reduced.permute(2, 0, 1)  # requires shape = (time, batch, embedding)
        transf_out = self.transf_encoder(x_reduced)
        transf_embedding = torch.mean(transf_out, dim=0)
        # concatenate
        complete_embedding = torch.cat([conv_embedding, transf_embedding], dim=1)
        # final Linear
        output_logits = self.out_linear(complete_embedding)
        output_logits = self.dropout_linear(output_logits)
        output_softmax = self.out_softmax(output_logits)
        return output_logits, output_softmax
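A quick sanity check of the model, assuming it takes the full (1, 128, 563) mel spectrogram computed above as input (the batch size of 4 is arbitrary):

# instantiate the model and run a dummy batch to verify the output shapes
model = ParallelModel(num_emotions=len(EMOTIONS))
dummy = torch.randn(4, 1, 128, 563)  # (batch, channel, n_mels, time frames)
logits, probs = model(dummy)
print(logits.shape, probs.shape)  # expected: torch.Size([4, 8]) torch.Size([4, 8])

The in_features=320 of the final linear layer is the sum of the flattened conv embedding (64 channels x 1 x 4 after the four max-pool stages) and the 64-dimensional transformer embedding.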

 
