赞
踩
@TOC
用GRU做自编码器来复原语音:先把语音信号切分成向量序列,输入到RNN里进行编码,然后再解码还原成波形。
样本链接,提取码:vive
import keras
import numpy as np
import librosa
import librosa.display
import os
import os.path
import random
import time
import sys
import matplotlib.pyplot as plt
# Font properties shared by every matplotlib figure below
font = dict(family='SimHei', size='15')
# Step 1: apply the font properties
plt.rc('font', **font)
# Step 2: fix how the minus sign of negative axis values is displayed
plt.rcParams['axes.unicode_minus'] = False
Using TensorFlow backend.
# Collect the paths of the audio files
def get_wav_path(filePath):
    """Return the paths of all ``.wav`` files one folder level below ``filePath``.

    Entries of ``filePath`` that are not directories are ignored, and a file
    counts as audio only when its final extension is exactly ``.wav``.

    :param filePath: root folder whose sub-folders hold the audio files
    :return: list of ``filePath/<sub-folder>/<file name>`` strings
    """
    wavPath = []
    for folder in os.listdir(filePath):
        folderPath = filePath + "/" + folder
        # A stray file next to the sub-folders would make os.listdir raise
        if not os.path.isdir(folderPath):
            continue
        for name in os.listdir(folderPath):
            # splitext only looks at the last dot, so names without a dot
            # or with several dots are classified correctly
            if os.path.splitext(name)[1] == ".wav":
                wavPath.append(folderPath + "/" + name)
    return wavPath
# Load a fixed-length (3 seconds by default) sample sequence from each file
def get_data(wav_files, second=3, sampleRate=16000):
    """Load every audio file and keep its first ``second`` seconds.

    Files shorter than ``second`` seconds are skipped. A text progress bar
    with elapsed and estimated remaining time is rewritten in place on
    stdout after every file, including skipped ones.

    :param wav_files: paths of the audio files to load
    :param second: clip length in seconds
    :param sampleRate: sampling rate passed to ``librosa.load`` as ``sr``
    :return: ``np.array`` holding one float64 row of ``second * srate``
        samples per kept file (empty when no file is kept)
    """
    # Collected clips
    x = []
    begin_time = time.time()
    print('开始预处理文件')
    total = len(wav_files)
    for index, wav in enumerate(wav_files):
        # Load the samples at the requested rate; srate is the rate used
        signal, srate = librosa.load(wav, sr=sampleRate)
        if len(signal) >= second * srate:
            # Keep exactly the first `second` seconds. float64 matches what
            # the earlier list round-trip produced.
            clip = signal[0:int(second * srate)]
            x.append(clip.astype(np.float64))
        # Files that are too short are dropped, but still count as processed
        done = index + 1
        gaptime = time.time() - begin_time
        percent = float(done) * 100 / total
        # Remaining time = average time per processed file * files left
        eta_time = gaptime * (total - done) / done
        strprogress = "[" + "=" * int(percent // 2) + ">" + "-" * int(50 - percent // 2) + "]"
        str_log = ("%.2f %% %s %s/%s \t 已用时间:%ds 剩余时间:%d s" % (percent, strprogress, done, total, gaptime, eta_time))
        sys.stdout.write('\r' + str_log)
    x = np.array(x)
    return x
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint,TensorBoard
from keras.models import load_model
# Gather the training file paths and load them into the array x
filePath = "rnn_wavs/train"
wavPath = get_wav_path(filePath)
x=get_data(wavPath)
开始预处理文件
100.00 % [==================================================>] 50/50 已用时间:2s 剩余时间:0 s
# Inspect the loaded training array: its shape, then the first clip
print(x.shape)
print(x[0])
(50, 48000)
[-0.0090332 -0.01293945 -0.01196289 ... -0.03439331 -0.0390625
-0.03811646]
# Display a waveform plot
def show_wave(x, title='waveplot', sr=16000):
    """Draw the waveform of ``x`` on the current matplotlib figure.

    :param x: audio samples to plot
    :param title: title placed above the plot
    :param sr: sampling rate of ``x``; defaults to the 16000 used in this file
    """
    # Use waveplot where it exists so older librosa installs plot as before;
    # releases without it provide waveshow instead.
    draw = getattr(librosa.display, 'waveplot', None) or librosa.display.waveshow
    draw(x, sr=sr)
    plt.title(title)
# Plot the first training clip
show_wave(x[0])
from keras import Model
from keras.layers import Dense,Input,CuDNNGRU
from keras.callbacks import TensorBoard
# Split every audio sequence into N vectors, provisionally 100-dimensional each, which are what
# the RNN receives: each clip can be seen as a sentence made of many words, every word being a 100-dimensional vector
seq_vector=100
x_train=x.reshape(x.shape[0],-1,seq_vector)
print(x_train.shape)
print(x_train[0])
(50, 480, 100)
[[-0.0090332 -0.01293945 -0.01196289 ... -0.01239014 -0.01223755
-0.01193237]
[-0.01205444 -0.01248169 -0.01263428 ... -0.01290894 -0.0128479
-0.01260376]
[-0.01263428 -0.01239014 -0.01248169 ... -0.01202393 -0.01217651
-0.01254272]
...
[-0.0100708 -0.00872803 -0.00650024 ... -0.01254272 -0.01089478
-0.01132202]
[-0.01348877 -0.00967407 -0.00897217 ... -0.01647949 -0.01019287
-0.00961304]
[-0.0055542 -0.00222778 0.00445557 ... -0.03439331 -0.0390625
-0.03811646]]
# The functional API needs an explicit input: sequences of any length
# whose timesteps are seq_vector-dimensional vectors
input_image = Input((None, seq_vector))

# Encoder. CuDNNGRU can use the GPU; a plain GRU works as well
encoder = CuDNNGRU(units=512, return_sequences=True, name="gru1")(input_image)
encoder = CuDNNGRU(units=256, return_sequences=True, name="gru2")(encoder)
encoder = Dense(256, activation='tanh')(encoder)
encoder_out = Dense(32, activation='tanh')(encoder)
# Standalone model that stops at the 32-unit encoder output
encoder_model = Model(inputs=input_image, outputs=encoder_out)

# Decoder: the same layers in reverse order
decoder = Dense(256, activation='tanh')(encoder_out)
decoder = CuDNNGRU(units=256, return_sequences=True, name="de_gru1")(decoder)
decoder = CuDNNGRU(units=512, return_sequences=True, name="de_gru2")(decoder)
decoder_out = Dense(seq_vector, activation='tanh')(decoder)
# Full model: input -> encoder -> decoder -> output of the input's width
autoencoder = Model(input_image, decoder_out)
autoencoder.summary()
_________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_4 (InputLayer) (None, None, 100) 0 _________________________________________________________________ gru1 (CuDNNGRU) (None, None, 512) 943104 _________________________________________________________________ gru2 (CuDNNGRU) (None, None, 256) 591360 _________________________________________________________________ dense_18 (Dense) (None, None, 256) 65792 _________________________________________________________________ dense_19 (Dense) (None, None, 32) 8224 _________________________________________________________________ dense_20 (Dense) (None, None, 256) 8448 _________________________________________________________________ de_gru1 (CuDNNGRU) (None, None, 256) 394752 _________________________________________________________________ de_gru2 (CuDNNGRU) (None, None, 512) 1182720 _________________________________________________________________ dense_21 (Dense) (None, None, 100) 51300 ================================================================= Total params: 3,245,700 Trainable params: 3,245,700 Non-trainable params: 0 _________________________________________________________________
# Train the autoencoder to reproduce its own input (x_train is both input
# and target). TensorBoard logs every run, and the checkpoint keeps only
# the weights with the lowest training loss seen so far.
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.fit(
    x=x_train,
    y=x_train,
    batch_size=2,
    epochs=50,
    verbose=1,
    callbacks=[
        TensorBoard(log_dir='log_RNN声音自编码器'),
        ModelCheckpoint(
            filepath="rnn.h5",
            monitor='loss',
            mode='min',
            save_best_only=True,
            period=1,
            verbose=1,
        ),
    ],
)
Epoch 1/50 50/50 [==============================] - 5s 103ms/step - loss: 0.0060 Epoch 00001: loss improved from inf to 0.00604, saving model to rnn.h5 Epoch 2/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0053 Epoch 00002: loss improved from 0.00604 to 0.00532, saving model to rnn.h5 Epoch 3/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0039 Epoch 00003: loss improved from 0.00532 to 0.00385, saving model to rnn.h5 Epoch 4/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0028 Epoch 00004: loss improved from 0.00385 to 0.00277, saving model to rnn.h5 Epoch 5/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0021 Epoch 00005: loss improved from 0.00277 to 0.00211, saving model to rnn.h5 Epoch 6/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0015 Epoch 00006: loss improved from 0.00211 to 0.00148, saving model to rnn.h5 Epoch 7/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0012 Epoch 00007: loss improved from 0.00148 to 0.00117, saving model to rnn.h5 Epoch 8/50 50/50 [==============================] - 4s 85ms/step - loss: 0.0010 Epoch 00008: loss improved from 0.00117 to 0.00102, saving model to rnn.h5 Epoch 9/50 50/50 [==============================] - 4s 85ms/step - loss: 9.1125e-04 Epoch 00009: loss improved from 0.00102 to 0.00091, saving model to rnn.h5 Epoch 10/50 50/50 [==============================] - 4s 85ms/step - loss: 8.4070e-04 Epoch 00010: loss improved from 0.00091 to 0.00084, saving model to rnn.h5 Epoch 11/50 50/50 [==============================] - 4s 85ms/step - loss: 7.7816e-04 Epoch 00011: loss improved from 0.00084 to 0.00078, saving model to rnn.h5 Epoch 12/50 50/50 [==============================] - 4s 85ms/step - loss: 6.8798e-04 Epoch 00012: loss improved from 0.00078 to 0.00069, saving model to rnn.h5 Epoch 13/50 50/50 [==============================] - 4s 85ms/step - loss: 6.0970e-04 Epoch 00013: loss improved from 
0.00069 to 0.00061, saving model to rnn.h5 Epoch 14/50 50/50 [==============================] - 4s 85ms/step - loss: 5.6702e-04 Epoch 00014: loss improved from 0.00061 to 0.00057, saving model to rnn.h5 Epoch 15/50 50/50 [==============================] - 4s 85ms/step - loss: 5.2083e-04 Epoch 00015: loss improved from 0.00057 to 0.00052, saving model to rnn.h5 Epoch 16/50 50/50 [==============================] - 4s 85ms/step - loss: 4.7999e-04 Epoch 00016: loss improved from 0.00052 to 0.00048, saving model to rnn.h5 Epoch 17/50 50/50 [==============================] - 4s 85ms/step - loss: 4.6281e-04 Epoch 00017: loss improved from 0.00048 to 0.00046, saving model to rnn.h5 Epoch 18/50 50/50 [==============================] - 4s 85ms/step - loss: 4.4957e-04 Epoch 00018: loss improved from 0.00046 to 0.00045, saving model to rnn.h5 Epoch 19/50 50/50 [==============================] - 4s 85ms/step - loss: 4.5229e-04 Epoch 00019: loss did not improve Epoch 20/50 50/50 [==============================] - 4s 85ms/step - loss: 4.4043e-04 Epoch 00020: loss improved from 0.00045 to 0.00044, saving model to rnn.h5 Epoch 21/50 50/50 [==============================] - 4s 85ms/step - loss: 4.2521e-04: 1s - loss: 4.07 Epoch 00021: loss improved from 0.00044 to 0.00043, saving model to rnn.h5 Epoch 22/50 50/50 [==============================] - 4s 85ms/step - loss: 4.1621e-04 Epoch 00022: loss improved from 0.00043 to 0.00042, saving model to rnn.h5 Epoch 23/50 50/50 [==============================] - 4s 85ms/step - loss: 4.0283e-04 Epoch 00023: loss improved from 0.00042 to 0.00040, saving model to rnn.h5 Epoch 24/50 50/50 [==============================] - 4s 85ms/step - loss: 3.9148e-04 Epoch 00024: loss improved from 0.00040 to 0.00039, saving model to rnn.h5 Epoch 25/50 50/50 [==============================] - 4s 85ms/step - loss: 3.9139e-04 Epoch 00025: loss improved from 0.00039 to 0.00039, saving model to rnn.h5 Epoch 26/50 50/50 [==============================] - 4s 
85ms/step - loss: 3.7617e-04 Epoch 00026: loss improved from 0.00039 to 0.00038, saving model to rnn.h5 Epoch 27/50 50/50 [==============================] - 4s 85ms/step - loss: 3.7279e-04 Epoch 00027: loss improved from 0.00038 to 0.00037, saving model to rnn.h5 Epoch 28/50 50/50 [==============================] - 4s 85ms/step - loss: 3.5802e-04 Epoch 00028: loss improved from 0.00037 to 0.00036, saving model to rnn.h5 Epoch 29/50 50/50 [==============================] - 4s 85ms/step - loss: 3.4530e-04 Epoch 00029: loss improved from 0.00036 to 0.00035, saving model to rnn.h5 Epoch 30/50 50/50 [==============================] - 4s 85ms/step - loss: 3.5566e-04 Epoch 00030: loss did not improve Epoch 31/50 50/50 [==============================] - 4s 85ms/step - loss: 3.4619e-04 Epoch 00031: loss did not improve Epoch 32/50 50/50 [==============================] - 4s 85ms/step - loss: 3.3743e-04 Epoch 00032: loss improved from 0.00035 to 0.00034, saving model to rnn.h5 Epoch 33/50 50/50 [==============================] - 4s 85ms/step - loss: 3.2660e-04 Epoch 00033: loss improved from 0.00034 to 0.00033, saving model to rnn.h5 Epoch 34/50 50/50 [==============================] - 4s 85ms/step - loss: 3.1028e-04 Epoch 00034: loss improved from 0.00033 to 0.00031, saving model to rnn.h5 Epoch 35/50 50/50 [==============================] - 4s 85ms/step - loss: 2.9729e-04 Epoch 00035: loss improved from 0.00031 to 0.00030, saving model to rnn.h5 Epoch 36/50 50/50 [==============================] - 4s 85ms/step - loss: 2.9330e-04 Epoch 00036: loss improved from 0.00030 to 0.00029, saving model to rnn.h5 Epoch 37/50 50/50 [==============================] - 4s 85ms/step - loss: 2.9591e-04 Epoch 00037: loss did not improve Epoch 38/50 50/50 [==============================] - 4s 85ms/step - loss: 2.8852e-04 Epoch 00038: loss improved from 0.00029 to 0.00029, saving model to rnn.h5 Epoch 39/50 50/50 [==============================] - 4s 85ms/step - loss: 2.7403e-04 Epoch 00039: 
loss improved from 0.00029 to 0.00027, saving model to rnn.h5 Epoch 40/50 50/50 [==============================] - 4s 85ms/step - loss: 2.7536e-04 Epoch 00040: loss did not improve Epoch 41/50 50/50 [==============================] - 4s 85ms/step - loss: 2.7173e-04 Epoch 00041: loss improved from 0.00027 to 0.00027, saving model to rnn.h5 Epoch 42/50 50/50 [==============================] - 4s 85ms/step - loss: 2.6409e-04 Epoch 00042: loss improved from 0.00027 to 0.00026, saving model to rnn.h5 Epoch 43/50 50/50 [==============================] - 4s 85ms/step - loss: 2.6014e-04 Epoch 00043: loss improved from 0.00026 to 0.00026, saving model to rnn.h5 Epoch 44/50 50/50 [==============================] - 4s 85ms/step - loss: 2.5614e-04 Epoch 00044: loss improved from 0.00026 to 0.00026, saving model to rnn.h5 Epoch 45/50 50/50 [==============================] - 4s 85ms/step - loss: 2.5297e-04 Epoch 00045: loss improved from 0.00026 to 0.00025, saving model to rnn.h5 Epoch 46/50 50/50 [==============================] - 4s 85ms/step - loss: 2.5371e-04 Epoch 00046: loss did not improve Epoch 47/50 50/50 [==============================] - 4s 85ms/step - loss: 2.5559e-04 Epoch 00047: loss did not improve Epoch 48/50 50/50 [==============================] - 4s 85ms/step - loss: 2.6840e-04 Epoch 00048: loss did not improve Epoch 49/50 50/50 [==============================] - 4s 85ms/step - loss: 2.7194e-04 Epoch 00049: loss did not improve Epoch 50/50 50/50 [==============================] - 4s 85ms/step - loss: 2.6571e-04 Epoch 00050: loss did not improve <keras.callbacks.History at 0x7380fc88>
# Load the clips from the test folder; x_test_ori keeps the flat waveforms,
# x_test holds them cut into seq_vector-wide chunks for the model
filePath = "rnn_wavs/test"
testPath = get_wav_path(filePath)
x_test_ori=get_data(testPath)
x_test=x_test_ori.reshape(x_test_ori.shape[0],-1,seq_vector)
开始预处理文件
100.00 % [==================================================>] 4/4 已用时间:0s 剩余时间:0 s
# Run the test clips through the full autoencoder to get their reconstructions
decoded_seq = autoencoder.predict(x_test)
print(decoded_seq.shape)
(4, 480, 100)
# Shape of a single reconstructed clip
d_shape=decoded_seq[0].shape
print(d_shape)
(480, 100)
# Bare expression: in a notebook cell this displays the first reconstructed clip
decoded_seq[0]
array([[-0.01014584, -0.01277211, -0.01327355, ..., -0.01386975,
-0.0145765 , -0.01456202],
[-0.01393538, -0.01367248, -0.01350442, ..., -0.01421544,
-0.01455196, -0.0140023 ],
[-0.01335981, -0.01219707, -0.01206582, ..., -0.0150555 ,
-0.0149645 , -0.01383976],
...,
[-0.09125455, -0.10162869, -0.12252352, ..., -0.01989237,
-0.01427485, 0.01067922],
[-0.00659992, -0.00485723, -0.0141265 , ..., 0.26765165,
0.37300578, 0.42874205],
[ 0.5345399 , 0.59415686, 0.611799 , ..., 0.20478293,
0.2866281 , 0.3082694 ]], dtype=float32)
# Pick one test clip, flatten its reconstructed chunks back into a single
# waveform, and plot the reconstruction and the original clip
index=0
seq=decoded_seq[index].flatten()
show_wave(seq,'rnn 复原')
show_wave(x_test_ori[index],'原始')
# Get the feature vectors from the encoder; they can be turned into feature images
encoded_latent = encoder_model.predict(x_test)
print(encoded_latent.shape)
(4, 480, 32)
# Feature images
def show_latent_images(start=0, end=4):
    """Show the encoder output of clips ``start`` .. ``end - 1`` as images.

    Reads the module-level ``encoded_latent`` array. Each clip's features
    are reshaped to 160 rows, transposed, and drawn with the 'binary'
    colour map in a grid of 2 rows by ``end`` columns.

    :param start: index of the first clip to draw
    :param end: index one past the last clip to draw
    """
    plt.figure(figsize=(20, 10))
    for sample_idx in range(start, end):
        # Subplot positions are 1-based
        plt.subplot(2, end, sample_idx + 1)
        feature_map = encoded_latent[sample_idx].reshape(160, -1)
        plt.imshow(feature_map.T, cmap='binary')
    plt.show()
# Draw the feature images with the default range
show_latent_images()
# Bare expression: in a notebook cell this displays the first test file's path
testPath[0]
'rnn_wavs/test/A2/A2_239.wav'
# Play the audio file rnn_wavs/test/A2/A2_239.wav
import IPython
out_wav=testPath[0]
signal, srate = librosa.load(out_wav, sr=16000)
IPython.display.Audio(out_wav)
# Keep the first 3 seconds
signal=signal[0:3*16000]
# Play the first 3 seconds
# NOTE(review): librosa.output.write_wav appears to have been removed in newer librosa releases — confirm the installed version
out_new='3s.wav'
librosa.output.write_wav(out_new,signal,16000)
IPython.display.Audio(out_new)
# Play the reconstructed 3-second audio
out_new='new_3s.wav'
librosa.output.write_wav(out_new,seq,16000)
IPython.display.Audio(out_new)
好了,今天就到这里了,希望对学习理解有帮助,大神看见勿喷,仅为自己的学习理解,能力有限,请多包涵,侵删。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。