The mel-spectrogram is fed into the generator, and the output is the raw audio waveform.
(Architecture diagram omitted.)
Generator summary
To train the model from scratch, run:
python train.py --config config_v1.json
You can also use the pretrained models we provide (link).
Details of the trained models are given in the table below; pick whichever fits your needs. A universal model that includes the discriminator weights is also provided, which can be used as a base for transfer learning. (I suspect this is how the VQ-VAE model did transfer learning on HiFi-GAN.)
(Table of pretrained models omitted.)
For fine-tuning, each mel-spectrogram is stored as a .npy file whose base name matches its audio file, for example:
Audio File : LJ001-0001.wav
Mel-Spectrogram File : LJ001-0001.npy
Then run:
python train.py --fine_tuning True --config config_v1.json
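For reference, here is a minimal sketch of producing such .npy files, assuming the mel_spectrogram helper in the repo's meldataset.py and the signal parameters of config_v1.json; note that for actual fine-tuning the README generates the mels from the TTS model with teacher forcing, so the ground-truth mels below only illustrate the expected file layout (paths are hypothetical):

import os
import numpy as np
import torch
import librosa

from meldataset import mel_spectrogram  # helper from the HiFi-GAN repo

wav_dir, mel_dir = 'LJSpeech-1.1/wavs', 'ft_dataset'  # hypothetical paths
os.makedirs(mel_dir, exist_ok=True)

for name in os.listdir(wav_dir):
    if not name.endswith('.wav'):
        continue
    y, sr = librosa.load(os.path.join(wav_dir, name), sr=22050)
    y = torch.from_numpy(y).unsqueeze(0)  # [1, samples]
    # signal parameters taken from config_v1.json
    mel = mel_spectrogram(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                          hop_size=256, win_size=1024, fmin=0, fmax=8000)
    # LJ001-0001.wav -> LJ001-0001.npy (base names must match)
    np.save(os.path.join(mel_dir, name.replace('.wav', '.npy')),
            mel.squeeze(0).numpy())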
1. Create a directory named test_files and copy the wav files you want to run inference on into it.
2. Run the following command:
python inference.py --checkpoint_file [generator checkpoint file path]
For end-to-end synthesis, where the input is generated mel-spectrogram .npy files (placed in a test_mel_files directory) rather than wav files, run instead:
python inference_e2e.py --checkpoint_file [generator checkpoint file path]
Residual block 1
class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        # h: hyperparameters
        # channels: number of channels (input and output are the same)
        # kernel_size: convolution kernel size
        # dilation: dilation rates for the first convolution of each pair
        super(ResBlock1, self).__init__()
        self.h = h
        # A list of dilated convolutions, each wrapped in weight_norm
        # for weight normalization
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        # Initialize every convolution with init_weights from utils.py
        self.convs1.apply(init_weights)
        # A second list of convolutions with dilation 1, paired with convs1
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        # Iterate over the convolution pairs from the two lists
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x  # residual connection
        return x

    def remove_weight_norm(self):
        # Weight normalization helps training, but it adds overhead at
        # inference time, so it is removed before running inference
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
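A quick sanity check on ResBlock1: because each convolution's padding matches its dilation, the block preserves the tensor shape end to end. A minimal sketch, assuming the class is importable from the repo's models.py (the h argument is only stored, so a placeholder suffices):

import torch
from models import ResBlock1  # assuming the HiFi-GAN repo layout

block = ResBlock1(h=None, channels=128, kernel_size=3, dilation=(1, 3, 5))
x = torch.randn(1, 128, 100)   # [batch, channels, time]
y = block(x)
print(y.shape)                 # torch.Size([1, 128, 100]) -- same shape as the input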
Residual block 2
class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        # A single list of two dilated convolutions, each wrapped in weight_norm
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x  # residual connection
        return x

    def remove_weight_norm(self):
        # Weight normalization helps training, but it adds overhead at
        # inference time, so it is removed before running inference
        for l in self.convs:
            remove_weight_norm(l)
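ResBlock2 is simply a lighter variant: two dilated convolutions instead of ResBlock1's six, which is why the small v3 config selects it. A sketch comparing the two under the same assumptions as above:

import torch
from models import ResBlock1, ResBlock2

def count_params(m):
    return sum(p.numel() for p in m.parameters())

b1 = ResBlock1(h=None, channels=128)   # 3 dilated + 3 plain convolutions
b2 = ResBlock2(h=None, channels=128)   # 2 dilated convolutions
print(count_params(b1), count_params(b2))  # b2 has about one third of b1's parameters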
Full generator code
class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        # num_kernels is the number of residual-block kernel sizes,
        # i.e. how many ResBlocks run in parallel after each upsampling stage
        self.num_kernels = len(h.resblock_kernel_sizes)
        # num_upsamples is the number of upsampling stages
        self.num_upsamples = len(h.upsample_rates)
        # Pre-processing convolution wrapped in weight_norm:
        # 80 input channels (mel bins), h.upsample_initial_channel output channels,
        # kernel size 7, stride 1, padding 3
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        # Select the residual block type
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        '''
        Upsampling layers
        '''
        self.ups = nn.ModuleList()
        # upsample_rates are the upsampling factors,
        # upsample_kernel_sizes are the transposed-convolution kernel sizes
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                # ConvTranspose1d is a 1-D transposed convolution:
                # input channels h.upsample_initial_channel//(2**i),
                # output channels h.upsample_initial_channel//(2**(i+1)),
                # kernel size k, stride u, padding (k-u)//2.
                # Each stage upsamples in time while halving the channel count.
                ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
                                k, u, padding=(k-u)//2)))

        '''
        Residual blocks
        '''
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            # Each upsampling stage is followed by num_kernels parallel residual blocks;
            # their input and output channel counts are both h.upsample_initial_channel//(2**(i+1)),
            # with kernel sizes and dilations taken from h.resblock_kernel_sizes
            # and h.resblock_dilation_sizes
            ch = h.upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        # Post-processing convolution wrapped in weight_norm:
        # ch == h.upsample_initial_channel//(2**len(self.ups)) input channels,
        # 1 output channel (the waveform), kernel size 7, stride 1, padding 3
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        # Forward pass
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Run this stage's num_kernels parallel residual blocks
            # and average their outputs (multi-receptive-field fusion)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)  # squash the waveform into [-1, 1]
        return x

    def remove_weight_norm(self):
        # Weight normalization helps training, but it adds overhead at
        # inference time, so it is removed before running inference
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
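To see the whole generator in action, here is a minimal sketch, assuming the class is importable as models.Generator and using the hyperparameter values from config_v1.json (a SimpleNamespace stands in for the repo's AttrDict config):

import torch
from types import SimpleNamespace
from models import Generator

h = SimpleNamespace(
    resblock='1',
    upsample_rates=[8, 8, 2, 2],            # product = 256 = hop_size
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
g = Generator(h)
mel = torch.randn(1, 80, 100)   # [batch, mel bins, frames]
wav = g(mel)
print(wav.shape)                # torch.Size([1, 1, 25600]) -- 100 frames * 256 samples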
How are the multiple upsampling stages stacked together?
Each ConvTranspose1d stage stretches the time axis by its rate u while halving the channel count, and the stages are applied one after another in forward(), so the factors compose multiplicatively: with config_v1.json, 8 * 8 * 2 * 2 = 256 (see the sketch below).
Why is the mel-spectrogram upsampled several times?
Each mel frame summarizes hop_size = 256 audio samples, so the generator has to stretch the frame axis by a factor of 256 to reach audio resolution; presumably several small stages, each refined by residual blocks, are easier to train than a single 256x transposed convolution.
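A one-liner confirming the arithmetic behind both answers:

import numpy as np

upsample_rates = [8, 8, 2, 2]   # from config_v1.json
print(np.prod(upsample_rates))  # 256, equal to hop_size in config_v1.json:
                                # every mel frame is expanded back into 256 samples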
I originally planned to put all of the code analysis and data in a single post, but it is already over ten thousand characters, so I will stop here for now. The discriminators, the loss functions, and the training code will be covered in follow-up posts.
One gap remains in this generator walkthrough: I have not yet traced the tensor shape at each module's output. That really requires running the code, so once I have read through the whole model I will run it on a server.
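In the meantime, here is a sketch of how those intermediate shapes could be checked quickly, registering forward hooks on each upsampling stage and reusing g and h from the config_v1 sketch above:

import torch

def trace_shapes(g, mel):
    # Print the output shape of every upsampling stage via forward hooks
    hooks = []
    for i, up in enumerate(g.ups):
        hooks.append(up.register_forward_hook(
            lambda module, inputs, out, i=i: print(f'ups[{i}]: {tuple(out.shape)}')))
    with torch.no_grad():
        wav = g(mel)
    for hook in hooks:
        hook.remove()
    return wav

wav = trace_shapes(g, torch.randn(1, 80, 100))
# Expected, derived from the strides and channel halving above:
# ups[0]: (1, 256, 800)
# ups[1]: (1, 128, 6400)
# ups[2]: (1, 64, 12800)
# ups[3]: (1, 32, 25600)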
Feel free to join the group to discuss audio-generation techniques together.