This post mainly walks through the code for classifying sounds with resnetish and vggish. Since the quality of the generated sounds is evaluated through the accuracy of a classification model, it is worth implementing this part properly. The following requirements need to be met.
Let's keep going: this is the last part of the results analysis, and once it is done the reproduction of the whole experiment is complete.
The ml-sound-classifier project is used here (project link).
A few issues need to be considered.
Audio is processed with torchaudio, a library dedicated to audio handling: the load function reads an audio file, and to_mel_spectrogram converts the waveform into a mel spectrogram. The details are as follows:
import numpy as np
import torchaudio
import yaml
from easydict import EasyDict
from pathlib import Path

def wav2mel_save(root_path, audio_dataset, yaml_file, mel_dataset):
    '''
    Convert every audio file under the dataset folders into a mel spectrogram
    and save it to the given path.
    :param root_path: root directory containing the audio dataset
    :param audio_dataset: names of the dataset folders under the root directory
    :param yaml_file: path to the configuration file
    :param mel_dataset: root directory for the saved mel-spectrogram dataset
    :return: clip length (seconds) at the 60th percentile of all files
    '''
    # Load the yaml file to get the configuration
    with open(yaml_file) as conf:
        cfg = EasyDict(yaml.safe_load(conf))
    # Build the wav-to-mel transform
    to_mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
        hop_length=cfg.hop_length, f_min=cfg.f_min, f_max=cfg.f_max)
    # Process the data
    audio_lengths = []
    originRoot = Path(root_path)    # root of the original audio dataset
    targetPath = Path(mel_dataset)  # root of the mel-spectrogram dataset
    for folder in audio_dataset:
        cur_folder = originRoot / folder
        filenames = sorted(cur_folder.glob('*.wav'))
        # Iterate over all wav files
        for filename in filenames:
            # Load the wav file; the sample rate must match the configuration
            waveform, sr = torchaudio.load(filename)
            assert sr == cfg.sample_rate
            # Compute and record the clip length in seconds
            num_samples = waveform.shape[-1]  # shape is (num_channels, num_samples)
            audio_lengths.append(num_samples / sr)
            # Convert the wav to a log-mel spectrogram
            log_mel_spec = to_mel_spectrogram(waveform).log()
            # Write the result to the target folder
            (targetPath / folder).mkdir(parents=True, exist_ok=True)
            np.save(targetPath / folder / filename.name.replace('.wav', '.npy'),
                    log_mel_spec.numpy())
    # Return the clip length at the 60th percentile
    audio_lengths.sort()
    index_60_percent = int(0.6 * len(audio_lengths))
    return audio_lengths[index_60_percent]
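A hedged usage sketch of wav2mel_save (the directory names, class folders, and config path below are placeholders for illustration, not from the original project):

# Hypothetical paths and class folders, for illustration only
length_60th = wav2mel_save(
    root_path='data/audio',
    audio_dataset=['class_a', 'class_b'],
    yaml_file='config.yaml',
    mel_dataset='data/mel')
print(f'60th-percentile clip length: {length_60th:.2f} s')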
Next, build a Dataset class to process the data:
class MelDataset(torch.utils.data.Dataset):
    '''A dataset class needs to know its filenames, labels, and the length of the data.'''
    def __init__(self, filenames, labels, transforms=None):
        assert len(filenames) == len(labels), 'Inconsistent length of filenames and labels.'
        self.filenames = filenames
        self.labels = labels
        self.transforms = transforms
        # Expected number of spectrogram frames per clip
        # (cfg is the EasyDict configuration loaded from the yaml file above)
        self.sample_length = int((cfg.clip_length * cfg.sample_rate + cfg.hop_length - 1) // cfg.hop_length)
        # Sanity-check the first wav file
        assert self[0][0].shape[-1] == self.sample_length, f'Check your files, failed to load {filenames[0]}'
        # Show basic information
        print(f'Dataset will yield log-mel spectrogram {len(self)} data samples'
              f' in shape [1, {cfg.n_mels}, {self[0][0].shape[-1]}]')

    def __len__(self):
        return len(self.filenames)

    def __getitem__(self, index):
        '''
        Return the sample and label at the given index.
        :param index: index of the sample to fetch
        :return: (log-mel spectrogram tensor, label)
        '''
        assert 0 <= index < len(self)
        # Load the data
        log_mel_spec = np.load(self.filenames[index])
        def sample_length(log_mel_spec):
            return log_mel_spec.shape[-1]
        # Pad the data up to the target length
        pad_size = self.sample_length - sample_length(log_mel_spec)
        if pad_size > 0:
            offset = pad_size // 2
            log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, 0), (offset, pad_size - offset)), 'constant')
        # Randomly crop the data down to the target length
        crop_size = sample_length(log_mel_spec) - self.sample_length
        if crop_size > 0:
            start = np.random.randint(0, crop_size)
            log_mel_spec = log_mel_spec[..., start:start + self.sample_length]
        # Apply data augmentation
        if self.transforms is not None:
            log_mel_spec = self.transforms(log_mel_spec)
        # Replace -inf values (the log of zero-energy bins)
        if np.isneginf(log_mel_spec).any():
            log_mel_spec[np.isneginf(log_mel_spec)] = 0  # or any other replacement value
        # The saved arrays already carry a leading channel dimension,
        # so the DataLoader can batch the tensors directly
        return torch.Tensor(log_mel_spec), self.labels[index]
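A minimal sketch of wiring MelDataset into a DataLoader (the .npy file list and labels below are placeholders; cfg must already be loaded as above):

# Hypothetical file list and labels, for illustration only
filenames = sorted(Path('data/mel/class_a').glob('*.npy'))
labels = [0] * len(filenames)
train_set = MelDataset(filenames, labels)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)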
import torch
import torch.nn as nn
import torch.nn.functional as F
# Mostly borrowed from https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(nn.Module):
    def __init__(self, n_classes):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.conv3 = nn.Conv2d(64, 128, 3, 1)
        self.conv4 = nn.Conv2d(128, 256, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.pooling = nn.AdaptiveAvgPool2d((8, 8))  # extended
        self.fc1 = nn.Linear(16384, 128)  # 256 channels * 8 * 8 pooled output
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = self.conv3(x)
        x = F.relu(x)
        x = self.conv4(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = self.pooling(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return x
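A quick shape sanity check for Net; the dummy batch uses the [1, 64, 501] spectrogram size discussed below, and thanks to the adaptive pooling any time dimension works:

net = Net(n_classes=4)
dummy = torch.randn(2, 1, 64, 501)  # (batch, channel, n_mels, frames)
print(net(dummy).shape)  # torch.Size([2, 4])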
import pytorch_lightning as pl
from torch.optim import lr_scheduler
from torchmetrics.functional import accuracy

class MyLearner(pl.LightningModule):
    def __init__(self, model, learning_rate=3e-4, loss=nn.CrossEntropyLoss(),
                 classes=4, train_loader=None, valid_loader=None, test_loader=None):
        super().__init__()
        self.learning_rate = learning_rate
        self.model = model
        self.loss = loss
        self.classes = classes
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.test_loader = test_loader

    def forward(self, x):
        x = self.model(x)
        # x = F.log_softmax(x, dim=1)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        # loss = F.nll_loss(logits, y)
        avg_loss = loss.mean()
        self.log('train_loss', avg_loss)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx, split='val'):
        x, y = batch
        logits = self(x)
        loss = self.loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = accuracy(preds, y, num_classes=self.classes, task="multiclass")
        avg_loss = loss.mean()
        self.log(f'{split}_loss', avg_loss, prog_bar=True)
        self.log(f'{split}_acc', acc, prog_bar=True)
        return {'val_loss': loss, 'val_acc': acc}

    def test_step(self, batch, batch_idx, split='test'):
        x, y = batch
        # Replace any non-finite values before the forward pass
        x = torch.where(torch.isfinite(x), x, torch.zeros_like(x))
        logits = self(x)
        loss = self.loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = accuracy(preds, y, num_classes=self.classes, task="multiclass")
        avg_loss = loss.mean()
        self.log(f'{split}_loss', avg_loss, prog_bar=True)
        self.log(f'{split}_acc', acc, prog_bar=True)
        return {'test_loss': loss, 'test_acc': acc}

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.05)
        # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.05, patience=10, min_lr=0.0002)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.valid_loader

    def test_dataloader(self):
        return self.test_loader
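A hedged sketch of driving the learner with a Lightning Trainer (the loader variables are assumed from the dataset code above, and exact Trainer arguments vary across Lightning versions):

learner = MyLearner(Net(n_classes=4), classes=4,
                    train_loader=train_loader, valid_loader=valid_loader,
                    test_loader=test_loader)
trainer = pl.Trainer(max_epochs=50)
trainer.fit(learner)
trainer.test(learner)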
Summary
A suitable network architecture makes a big difference, mainly in two respects.
The experiments above went somewhat awkwardly. There was an improvement, though only a small one, which suggests that my learning rate was the problem. The learning-rate issue is discussed next.
import torch
from torch.optim import lr_scheduler
from pytorch_lightning import LightningModule

class MyModel(LightningModule):
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}
Summary
A side note
The learning-rate schedule above is not very reasonable. StepLR decays the learning rate at fixed intervals, so if the optimum has not been found while the learning rate is still large, progress becomes even harder once the rate shrinks and optimization slows down. So here the learning rate is adjusted according to the monitored metric instead.
Parameter notes for ReduceLROnPlateau: mode ('min' or 'max') sets whether the monitored metric should decrease or increase; factor is the multiplier applied to the learning rate on each decay; patience is the number of epochs without improvement to wait before decaying; min_lr is the lower bound on the learning rate.
The code is as follows:
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.05)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=10)
    return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'val_loss'}
Personally I suspect the issue was in my training setup: the minimum learning rate was set too low, so the later adjustments were barely visible. It is worth experimenting with further.
After another try, the following settings felt better:
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.05, patience=10, min_lr=0.0002)
No matter how I changed the model, its accuracy never improved significantly and never reached the level of the reference project, so I had to go back to the data processing and compare.
The data I processed has shape [1, 64, 401], while the reference project's data is [1, 64, 501]. The reference spectrograms have more frames, i.e. carry more information than mine, so I consider this an important factor affecting the model.
This calls for a closer look at the sampling configuration file used in the code:
# basic setting parameters
clip_length: 5.0 # [sec]
# preprocessing parameters
sample_rate: 44100
hop_length: 441
n_fft: 1024
n_mels: 64
f_min: 0
f_max: 22050
sample_rate (sampling rate)
torchaudio.load reads an audio file and returns both the waveform and the sample rate. This is the audio file's own native sampling rate, so when loading audio you need to make sure it matches the rate the pipeline expects.
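If a file's native rate differs from the configured one, it can be resampled rather than rejected by the assert; a minimal sketch with torchaudio.transforms.Resample (cfg is the configuration loaded earlier):

waveform, sr = torchaudio.load(filename)
if sr != cfg.sample_rate:
    # Convert to the rate the rest of the pipeline assumes
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=cfg.sample_rate)
    waveform = resampler(waveform)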
clip_length (clip duration)
clip_length determines the target number of spectrogram frames; clips shorter than the target are padded and longer ones are randomly cropped, as in the dataset code:
pad_size = self.sample_length - sample_length(log_mel_spec)
if pad_size > 0:
    offset = pad_size // 2
    log_mel_spec = np.pad(log_mel_spec, ((0, 0), (0, 0), (offset, pad_size - offset)), 'constant')
# Random crop
crop_size = sample_length(log_mel_spec) - self.sample_length
if crop_size > 0:
    start = np.random.randint(0, crop_size)
    log_mel_spec = log_mel_spec[..., start:start + self.sample_length]
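For reference, the spectrogram width can be derived directly from these settings: with torchaudio's default centered STFT the frame count is num_samples // hop_length + 1, which reproduces the [1, 64, 401] versus [1, 64, 501] shapes compared above:

sample_rate, hop_length = 44100, 441
for clip_length in (4.0, 5.0):
    num_samples = int(clip_length * sample_rate)
    n_frames = num_samples // hop_length + 1  # centered STFT
    print(clip_length, n_frames)  # 4.0 -> 401, 5.0 -> 501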
Parameters of the to_mel_spectrogram transform
to_mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=cfg.sample_rate, n_fft=cfg.n_fft, n_mels=cfg.n_mels,
    hop_length=cfg.hop_length, f_min=cfg.f_min, f_max=cfg.f_max)
hop_length: 441    # step between successive STFT frames, in samples
n_fft: 1024        # FFT window size
n_mels: 64         # number of mel filterbank bins
f_min: 0           # lowest frequency of the mel filterbank [Hz]
f_max: 22050       # highest frequency of the mel filterbank [Hz]
Extracting as many features as possible
To extract as many features as possible, two points need to be considered.
Here the clip length was changed from the original 4 seconds to 5 seconds:
# basic setting parameters
clip_length: 5.0 # [sec]
Inspecting the length distribution showed that 80% of the clips are shorter than 8 seconds, while only 50% are shorter than 5 seconds, so the clip length can be increased further.
Setting it to 10 seconds, the experiment is evaluated with the following function:
def eval_acc(model, device, dataloader, debug_name=None):
    model = model.to(device).eval()
    count = correct = 0
    with torch.no_grad():  # no gradients needed for evaluation
        for X, gt in dataloader:
            logits = model(X.to(device))
            preds = torch.argmax(logits, dim=1)
            correct += (preds.cpu() == gt).sum().item()
            count += len(gt)
    acc = correct / count
    if debug_name:
        print(f'{debug_name} acc = {acc:.4f}')
    return acc
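A minimal usage sketch (learner and test_loader are assumed from the training code above):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
eval_acc(learner.model, device, test_loader, debug_name='test')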