1. Key Concepts
2. Application Examples
Implementation outline:
Machine translation and trigger word detection
Translating human-readable dates into machine-readable dates
Neural machine translation with attention
Trigger word detection
- from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
- from keras.layers import RepeatVector, Dense, Activation, Lambda
- from keras.optimizers import Adam
- from keras.utils import to_categorical
- from keras.models import load_model, Model
- import keras.backend as K
- import numpy as np
-
- from faker import Faker
- import random
- from tqdm import tqdm
- from babel.dates import format_date
- from nmt_utils import *
- import matplotlib.pyplot as plt
-
- %matplotlib inline
Using TensorFlow backend.
- m = 10000
- dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)
100%|████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 14551.92it/s]
dataset[:10]
[('9 may 1998', '1998-05-09'),
('10.09.70', '1970-09-10'),
('4/28/90', '1990-04-28'),
('thursday january 26 1995', '1995-01-26'),
('monday march 7 1983', '1983-03-07'),
('sunday may 22 1988', '1988-05-22'),
('tuesday july 8 2008', '2008-07-08'),
('08 sep 1999', '1999-09-08'),
('1 jan 1981', '1981-01-01'),
('monday may 22 1995', '1995-05-22')]
- Tx = 30
- Ty = 10
- X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)
-
- print("X.shape:", X.shape)
- print("Y.shape:", Y.shape)
- print("Xoh.shape:", Xoh.shape)
- print("Yoh.shape:", Yoh.shape)
X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)
- index = 0
- print("Source date:", dataset[index][0])
- print("Target date:", dataset[index][1])
- print()
- print("Source after preprocessing (indices):", X[index])
- print("Target after preprocessing (indices):", Y[index])
- print()
- print("Source after preprocessing (one-hot):", Xoh[index])
- print("Target after preprocessing (one-hot):", Yoh[index])
Source date: 9 may 1998
Target date: 1998-05-09

Source after preprocessing (indices): [12 0 24 13 34 0 4 12 12 11 36 36 36 36 36 36 36 36 36 36 36 36 36 36
36 36 36 36 36 36]
Target after preprocessing (indices): [ 2 10 10 9 0 1 6 0 1 10]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]
[0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
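The last dimensions of Xoh and Yoh are simply the vocabulary sizes: 37 characters for the human-readable dates (every character that appears in the generated dates, plus padding/unknown tokens) and 11 for the machine-readable dates (the ten digits plus '-'). From the target indices printed above, '-' maps to index 0 and the digits '0'-'9' map to 1-10. A quick check (a sketch, assuming human_vocab and machine_vocab are the plain character-to-index dicts returned by load_dataset):

# Sketch: inspect the two vocabularies.
print(len(human_vocab), len(machine_vocab))   # 37 11 -> last dims of Xoh / Yoh
print(sorted(machine_vocab.keys()))           # ['-', '0', '1', ..., '9']
print(inv_machine_vocab[0])                   # '-'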
- # Define the shared layers as global variables
- repeator = RepeatVector(Tx)
- concatenator = Concatenate(axis=-1)
- densor1 = Dense(10, activation = "tanh")
- densor2 = Dense(1, activation = "relu")
- activator = Activation(softmax, name='attention_weights') # we use a custom softmax(axis = 1) loaded in this notebook
- dotor = Dot(axes = 1)
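The softmax passed to activator is not the stock Keras activation but a helper pulled in by `from nmt_utils import *`, so that normalization runs over the time axis (axis=1) rather than the last axis. That helper is not shown in this post; a minimal sketch of such an axis-aware softmax, assuming a Keras-backend implementation, is:

import keras.backend as K

def softmax(x, axis=1):
    """Softmax over a chosen axis (sketch of the nmt_utils helper)."""
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        # subtract the max for numerical stability, then normalize over `axis`
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError("Cannot apply softmax to a 1D tensor")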
- # GRADED FUNCTION: one_step_attention
-
- def one_step_attention(a, s_prev):
- """
- 执行一步 attention: 输出一个上下文向量,输出作为注意力权重的点积计算的上下文向量
- "alphas" Bi-LSTM的 隐藏状态 "a"
-
- 参数:
- a -- Bi-LSTM的输出隐藏状态 numpy-array 维度 (m, Tx, 2*n_a)
- s_prev -- (post-attention) LSTM的前一个隐藏状态, numpy-array 维度(m, n_s)
-
- 返回:
- context -- 上下文向量, 下一个(post-attetion) LSTM 单元的输入
- """
-
- # 使用 repeator 重复 s_prev 维度 (m, Tx, n_s) 这样你就可以将它与所有隐藏状态"a" 连接起来。 (≈ 1 line)
- s_prev = repeator(s_prev)
- # 使用 concatenator 在最后一个轴上连接 a 和 s_prev (≈ 1 line)
- concat = concatenator([a, s_prev])
- # 使用 densor1 传入参数 concat, 通过一个小的全连接神经网络来计算“中间能量”变量 e。(≈1 lines)
- e = densor1(concat)
- # 使用 densor2 传入参数 e , 通过一个小的全连接神经网络来计算“能量”变量 energies。(≈1 lines)
- energies = densor2(e)
- # 使用 activator 传入参数 "energies" 计算注意力权重 "alphas" (≈ 1 line)
- alphas = activator(energies)
- # 使用 dotor 传入参数 "alphas" 和 "a" 计算下一个((post-attention) LSTM 单元的上下文向量 (≈ 1 line)
- context = dotor([alphas, a])
-
- return context
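In equation form, one attention step computes the following (a sketch using the usual notation for this model, where $a^{\langle t' \rangle}$ is the Bi-LSTM activation at input step $t'$ and $s^{\langle t-1 \rangle}$ is the previous post-attention LSTM state):

$$e^{\langle t,t' \rangle} = \mathrm{densor2}\!\left(\mathrm{densor1}\!\left([a^{\langle t' \rangle}; s^{\langle t-1 \rangle}]\right)\right), \qquad \alpha^{\langle t,t' \rangle} = \frac{\exp\!\left(e^{\langle t,t' \rangle}\right)}{\sum_{t''=1}^{T_x} \exp\!\left(e^{\langle t,t'' \rangle}\right)}, \qquad \mathrm{context}^{\langle t \rangle} = \sum_{t'=1}^{T_x} \alpha^{\langle t,t' \rangle}\, a^{\langle t' \rangle}$$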
- n_a = 32
- n_s = 64
- post_activation_LSTM_cell = LSTM(n_s, return_state = True)
- output_layer = Dense(len(machine_vocab), activation=softmax)
- # GRADED FUNCTION: model
-
- def model(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
- """
- 参数:
- Tx -- 输入序列的长度
- Ty -- 输出序列的长度
- n_a -- Bi-LSTM的隐藏状态大小
- n_s -- post-attention LSTM的隐藏状态大小
- human_vocab_size -- python字典 "human_vocab" 的大小
- machine_vocab_size -- python字典 "machine_vocab" 的大小
- 返回:
- model -- Keras 模型实例
- """
-
- # 定义模型的输入,维度 (Tx,)
- # 定义 s0 和 c0, 初始化解码器 LSTM 的隐藏状态,维度 (n_s,)
- X = Input(shape=(Tx, human_vocab_size))
- s0 = Input(shape=(n_s,), name='s0')
- c0 = Input(shape=(n_s,), name='c0')
- s = s0
- c = c0
-
- # 初始化一个空的输出列表
- outputs = []
-
-
- # 第一步:定义 pre-attention Bi-LSTM。 记得使用 return_sequences=True. (≈ 1 line)
- a = Bidirectional(LSTM(n_a, return_sequences=True), input_shape=(m, Tx, n_a * 2))(X)
-
- # 第二步:迭代 Ty 步
- for t in range(Ty):
-
- # 第二步.A: 执行一步注意机制,得到在 t 步的上下文向量 (≈ 1 line)
- context = one_step_attention(a, s)
-
- # 第二步.B: 使用 post-attention LSTM 单元得到新的 "context"
- # 别忘了使用: initial_state = [hidden state, cell state] (≈ 1 line)
- s, _, c = post_activation_LSTM_cell(context, initial_state=[s, c])
-
- # 第二步.C: 使用全连接层处理post-attention LSTM 的隐藏状态输出 (≈ 1 line)
- out = output_layer(s)
-
- # 第二步.D: 追加 "out" 到 "outputs" 列表 (≈ 1 line)
- outputs.append(out)
-
- # 第三步:创建模型实例,获取三个输入并返回输出列表。 (≈ 1 line)
- model = Model(inputs=[X, s0, c0], outputs=outputs)
-
- return model
-
-
model = model(Tx, Ty, n_a, n_s, len(human_vocab), len(machine_vocab))
model.summary()
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) (None, 30, 37) 0
__________________________________________________________________________________________________
s0 (InputLayer) (None, 64) 0
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 30, 64) 17920 input_1[0][0]
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector) (None, 30, 64) 0 s0[0][0]
lstm_1[0][0]
lstm_1[1][0]
lstm_1[2][0]
lstm_1[3][0]
lstm_1[4][0]
lstm_1[5][0]
lstm_1[6][0]
lstm_1[7][0]
lstm_1[8][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 30, 128) 0 bidirectional_1[0][0]
repeat_vector_1[0][0]
bidirectional_1[0][0]
repeat_vector_1[1][0]
bidirectional_1[0][0]
repeat_vector_1[2][0]
bidirectional_1[0][0]
repeat_vector_1[3][0]
bidirectional_1[0][0]
repeat_vector_1[4][0]
bidirectional_1[0][0]
repeat_vector_1[5][0]
bidirectional_1[0][0]
repeat_vector_1[6][0]
bidirectional_1[0][0]
repeat_vector_1[7][0]
bidirectional_1[0][0]
repeat_vector_1[8][0]
bidirectional_1[0][0]
repeat_vector_1[9][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 30, 10) 1290 concatenate_1[0][0]
concatenate_1[1][0]
concatenate_1[2][0]
concatenate_1[3][0]
concatenate_1[4][0]
concatenate_1[5][0]
concatenate_1[6][0]
concatenate_1[7][0]
concatenate_1[8][0]
concatenate_1[9][0]
__________________________________________________________________________________________________
dense_2 (Dense) (None, 30, 1) 11 dense_1[0][0]
dense_1[1][0]
dense_1[2][0]
dense_1[3][0]
dense_1[4][0]
dense_1[5][0]
dense_1[6][0]
dense_1[7][0]
dense_1[8][0]
dense_1[9][0]
__________________________________________________________________________________________________
attention_weights (Activation) (None, 30, 1) 0 dense_2[0][0]
dense_2[1][0]
dense_2[2][0]
dense_2[3][0]
dense_2[4][0]
dense_2[5][0]
dense_2[6][0]
dense_2[7][0]
dense_2[8][0]
dense_2[9][0]
__________________________________________________________________________________________________
dot_1 (Dot) (None, 1, 64) 0 attention_weights[0][0]
bidirectional_1[0][0]
attention_weights[1][0]
bidirectional_1[0][0]
attention_weights[2][0]
bidirectional_1[0][0]
attention_weights[3][0]
bidirectional_1[0][0]
attention_weights[4][0]
bidirectional_1[0][0]
attention_weights[5][0]
bidirectional_1[0][0]
attention_weights[6][0]
bidirectional_1[0][0]
attention_weights[7][0]
bidirectional_1[0][0]
attention_weights[8][0]
bidirectional_1[0][0]
attention_weights[9][0]
bidirectional_1[0][0]
__________________________________________________________________________________________________
c0 (InputLayer) (None, 64) 0
__________________________________________________________________________________________________
lstm_1 (LSTM) [(None, 64), (None, 33024 dot_1[0][0]
s0[0][0]
c0[0][0]
dot_1[1][0]
lstm_1[0][0]
lstm_1[0][2]
dot_1[2][0]
lstm_1[1][0]
lstm_1[1][2]
dot_1[3][0]
lstm_1[2][0]
lstm_1[2][2]
dot_1[4][0]
lstm_1[3][0]
lstm_1[3][2]
dot_1[5][0]
lstm_1[4][0]
lstm_1[4][2]
dot_1[6][0]
lstm_1[5][0]
lstm_1[5][2]
dot_1[7][0]
lstm_1[6][0]
lstm_1[6][2]
dot_1[8][0]
lstm_1[7][0]
lstm_1[7][2]
dot_1[9][0]
lstm_1[8][0]
lstm_1[8][2]
__________________________________________________________________________________________________
dense_3 (Dense) (None, 11) 715 lstm_1[0][0]
lstm_1[1][0]
lstm_1[2][0]
lstm_1[3][0]
lstm_1[4][0]
lstm_1[5][0]
lstm_1[6][0]
lstm_1[7][0]
lstm_1[8][0]
lstm_1[9][0]
==================================================================================================
Total params: 52,960
Trainable params: 52,960
Non-trainable params: 0
__________________________________________________________________________________________________
- ### START CODE HERE ### (≈2 lines)
- opt = Adam(lr=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
- model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
- ### END CODE HERE ###
- s0 = np.zeros((m, n_s))
- c0 = np.zeros((m, n_s))
- outputs = list(Yoh.swapaxes(0,1))
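Because the model returns a list of Ty separate softmax outputs (one per output character), the fit targets must also be supplied as a list of Ty arrays rather than a single (m, Ty, 11) tensor; that is exactly what the swapaxes/list conversion above produces. A quick shape check (a sketch):

# Sketch: the fit targets are a list of Ty arrays, one per output position.
print(len(outputs))        # 10 (= Ty)
print(outputs[0].shape)    # (10000, 11) (= (m, len(machine_vocab)))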
model.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100)
model.load_weights('models/model.h5')
- EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
- for example in EXAMPLES:
-
-     source = string_to_int(example, Tx, human_vocab)
-     source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))
-     source = np.expand_dims(source, axis=0)
-     prediction = model.predict([source, s0, c0])
-     prediction = np.argmax(prediction, axis = -1)
-     output = [inv_machine_vocab[int(i)] for i in prediction]
-
-     print("source:", example)
-     print("output:", ''.join(output))
source: 3 May 1979
output: 1979-05-03
source: 5 April 09
output: 2009-05-05
source: 21th of August 2016
output: 2016-08-21
source: Tue 10 Jul 2007
output: 2007-07-10
source: Saturday May 9 2018
output: 2018-05-09
source: March 3 2001
output: 2001-03-03
source: March 3rd 2001
output: 2001-03-03
source: 1 March 2001
output: 2001-03-01
model.summary()
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) (None, 30, 37) 0
__________________________________________________________________________________________________
s0 (InputLayer) (None, 64) 0
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 30, 64) 17920 input_1[0][0]
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector) (None, 30, 64) 0 s0[0][0]
lstm_1[0][0]
lstm_1[1][0]
lstm_1[2][0]
lstm_1[3][0]
lstm_1[4][0]
lstm_1[5][0]
lstm_1[6][0]
lstm_1[7][0]
lstm_1[8][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 30, 128) 0 bidirectional_1[0][0]
repeat_vector_1[0][0]
bidirectional_1[0][0]
repeat_vector_1[1][0]
bidirectional_1[0][0]
repeat_vector_1[2][0]
bidirectional_1[0][0]
repeat_vector_1[3][0]
bidirectional_1[0][0]
repeat_vector_1[4][0]
bidirectional_1[0][0]
repeat_vector_1[5][0]
bidirectional_1[0][0]
repeat_vector_1[6][0]
bidirectional_1[0][0]
repeat_vector_1[7][0]
bidirectional_1[0][0]
repeat_vector_1[8][0]
bidirectional_1[0][0]
repeat_vector_1[9][0]
__________________________________________________________________________________________________
dense_1 (Dense) (None, 30, 10) 1290 concatenate_1[0][0]
concatenate_1[1][0]
concatenate_1[2][0]
concatenate_1[3][0]
concatenate_1[4][0]
concatenate_1[5][0]
concatenate_1[6][0]
concatenate_1[7][0]
concatenate_1[8][0]
concatenate_1[9][0]
__________________________________________________________________________________________________
dense_2 (Dense) (None, 30, 1) 11 dense_1[0][0]
dense_1[1][0]
dense_1[2][0]
dense_1[3][0]
dense_1[4][0]
dense_1[5][0]
dense_1[6][0]
dense_1[7][0]
dense_1[8][0]
dense_1[9][0]
__________________________________________________________________________________________________
attention_weights (Activation) (None, 30, 1) 0 dense_2[0][0]
dense_2[1][0]
dense_2[2][0]
dense_2[3][0]
dense_2[4][0]
dense_2[5][0]
dense_2[6][0]
dense_2[7][0]
dense_2[8][0]
dense_2[9][0]
__________________________________________________________________________________________________
dot_1 (Dot) (None, 1, 64) 0 attention_weights[0][0]
bidirectional_1[0][0]
attention_weights[1][0]
bidirectional_1[0][0]
attention_weights[2][0]
bidirectional_1[0][0]
attention_weights[3][0]
bidirectional_1[0][0]
attention_weights[4][0]
bidirectional_1[0][0]
attention_weights[5][0]
bidirectional_1[0][0]
attention_weights[6][0]
bidirectional_1[0][0]
attention_weights[7][0]
bidirectional_1[0][0]
attention_weights[8][0]
bidirectional_1[0][0]
attention_weights[9][0]
bidirectional_1[0][0]
__________________________________________________________________________________________________
c0 (InputLayer) (None, 64) 0
__________________________________________________________________________________________________
lstm_1 (LSTM) [(None, 64), (None, 33024 dot_1[0][0]
s0[0][0]
c0[0][0]
dot_1[1][0]
lstm_1[0][0]
lstm_1[0][2]
dot_1[2][0]
lstm_1[1][0]
lstm_1[1][2]
dot_1[3][0]
lstm_1[2][0]
lstm_1[2][2]
dot_1[4][0]
lstm_1[3][0]
lstm_1[3][2]
dot_1[5][0]
lstm_1[4][0]
lstm_1[4][2]
dot_1[6][0]
lstm_1[5][0]
lstm_1[5][2]
dot_1[7][0]
lstm_1[6][0]
lstm_1[6][2]
dot_1[8][0]
lstm_1[7][0]
lstm_1[7][2]
dot_1[9][0]
lstm_1[8][0]
lstm_1[8][2]
__________________________________________________________________________________________________
dense_3 (Dense) (None, 11) 715 lstm_1[0][0]
lstm_1[1][0]
lstm_1[2][0]
lstm_1[3][0]
lstm_1[4][0]
lstm_1[5][0]
lstm_1[6][0]
lstm_1[7][0]
lstm_1[8][0]
lstm_1[9][0]
==================================================================================================
Total params: 52,960
Trainable params: 52,960
Non-trainable params: 0
__________________________________________________________________________________________________
attention_map = plot_attention_map(model, human_vocab, inv_machine_vocab, "Tuesday 09 Oct 1993", num = 7, n_s = 64)
<Figure size 432x288 with 0 Axes>
- import numpy as np
- from pydub import AudioSegment
- import random
- import sys
- import io
- import os
- import glob
- import IPython
- from td_utils import *
- %matplotlib inline
IPython.display.Audio("./raw_data/activates/1.wav")
CSDN does not support audio playback.
IPython.display.Audio("./raw_data/negatives/4.wav")
CSDN does not support audio playback.
IPython.display.Audio("./raw_data/backgrounds/1.wav")
CSDN does not support audio playback.
IPython.display.Audio("audio_examples/example_train.wav")
CSDN does not support audio playback.
x = graph_spectrogram("audio_examples/example_train.wav")
- _, data = wavfile.read("audio_examples/example_train.wav")
- print("Time steps in audio recording before spectrogram", data[:,0].shape)
- print("Time steps in input after spectrogram", x.shape)
- Time steps in audio recording before spectrogram (441000,)
- Time steps in input after spectrogram (101, 5511)
- Tx = 5511 # The number of time steps input to the model from the spectrogram
- n_freq = 101 # Number of frequencies input to the model at each time step of the spectrogram
- Ty = 1375 # The number of time steps in the output of our model
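These three constants are tied together by the preprocessing and by the first layer of the model defined later: the 10-second, 44.1 kHz recording (441,000 samples, as printed above) becomes a spectrogram with 5,511 time steps and 101 frequency bins, and the Conv1D layer (kernel size 15, stride 4, no padding) shrinks those 5,511 input steps down to 1,375 output steps. A quick check of that last relationship (a sketch):

# Sketch: the Conv1D layer defined later (kernel_size=15, strides=4, no padding)
# maps Tx = 5511 spectrogram steps to Ty = 1375 output steps.
Tx, kernel_size, stride = 5511, 15, 4
print((Tx - kernel_size) // stride + 1)   # 1375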
- # Load audio segments using pydub
- activates, negatives, backgrounds = load_raw_audio()
-
- print("background len: " + str(len(backgrounds[0]))) # Should be 10,000, since it is a 10 sec clip
- print("activate[0] len: " + str(len(activates[0]))) # Maybe around 1000, since an "activate" clip is usually around 1 sec (but varies a lot)
- print("activate[1] len: " + str(len(activates[1]))) # Different "activate" clips can have different lengths
background len: 10000
activate[0] len: 721
activate[1] len: 731
- def get_random_time_segment(segment_ms):
- """
- 获取 10,000 ms音频剪辑中时间长为 segment_ms 的随机时间段。
-
- 参数:
- segment_ms -- 音频片段的持续时间,以毫秒为单位("ms" 代表 "毫秒")
-
- 返回:
- segment_time -- 以ms为单位的元组(segment_start,segment_end)
- """
-
- segment_start = np.random.randint(low=0, high=10000-segment_ms) # 确保段不会超过10秒背景
- segment_end = segment_start + segment_ms - 1
-
- return (segment_start, segment_end)
- # GRADED FUNCTION: is_overlapping
-
- def is_overlapping(segment_time, previous_segments):
- """
- 检查段的时间是否与现有段的时间重叠。
-
- 参数:
- segment_time -- 新段的元组(segment_start,segment_end)
- previous_segments -- 现有段的元组列表(segment_start,segment_end)
-
- 返回:
- 如果时间段与任何现有段重叠,则为True,否则为False
- """
-
- segment_start, segment_end = segment_time
-
- # 第一步:将重叠标识 overlap 初始化为“False”标志 (≈ 1 line)
- overlap = False
-
- # 第二步:循环遍历 previous_segments 的开始和结束时间。
- # 比较开始/结束时间,如果存在重叠,则将标志 overlap 设置为True (≈ 3 lines)
- for previous_start, previous_end in previous_segments:
- if segment_start <= previous_end and segment_end >= previous_start:
- overlap = True
-
- return overlap
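The test cell that produced the two lines below is missing from the post; a minimal check with hypothetical segment times that would yield the same results is:

# Sketch (hypothetical test values): a segment that does not overlap, and one that does.
overlap1 = is_overlapping((950, 1430), [(2000, 2550), (260, 949)])
overlap2 = is_overlapping((2305, 2950), [(824, 1532), (1900, 2305), (3424, 3656)])
print("Overlap 1 = ", overlap1)   # False
print("Overlap 2 = ", overlap2)   # True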
Overlap 1 = False
Overlap 2 = True
- # GRADED FUNCTION: insert_audio_clip
-
- def insert_audio_clip(background, audio_clip, previous_segments):
- """
- 在随机时间步骤中在背景噪声上插入新的音频片段,确保音频片段与现有片段不重叠。
-
- 参数:
- background -- 10秒背景录音。
- audio_clip -- 要插入/叠加的音频剪辑。
- previous_segments -- 已放置的音频片段的时间
-
- 返回:
- new_background -- 更新的背景音频
- """
-
- # 以ms为单位获取音频片段的持续时间
- segment_ms = len(audio_clip)
-
- # 第一步:使用其中一个辅助函数来选择要插入的随机时间段
- # 新的音频剪辑。 (≈ 1 line)
- segment_time = get_random_time_segment(segment_ms)
-
- # 第二步:检查新的segment_time是否与previous_segments之一重叠。
- # 如果重叠如果是这样,请继续随机选择新的 segment_time 直到它不重叠。(≈ 2 lines)
- while is_overlapping(segment_time, previous_segments):
- segment_time = get_random_time_segment(segment_ms)
-
- # 第三步: 将新的 segment_time 添加到 previous_segments 列表中 (≈ 1 line)
- previous_segments.append(segment_time)
-
- # 第四步: 叠加音频片段和背景
- new_background = background.overlay(audio_clip, position = segment_time[0])
-
- return new_background, segment_time
- np.random.seed(5)
- audio_clip, segment_time = insert_audio_clip(backgrounds[0], activates[0], [(3790, 4400)])
- audio_clip.export("insert_test.wav", format="wav")
- print("Segment Time: ", segment_time)
- IPython.display.Audio("insert_test.wav")
Segment Time: (2915, 3635)
CSDN does not support audio playback.
- # Expected audio
- IPython.display.Audio("audio_examples/insert_reference.wav")
CSDN does not support audio playback.
- # GRADED FUNCTION: insert_ones
-
- def insert_ones(y, segment_end_ms):
- """
- 更新标签向量y。段结尾的后面50个输出的标签应设为 1。
- 严格来说,我们的意思是 segment_end_y 的标签应该是 0,而随后的50个标签应该是1。
-
- 参数:
- y -- numpy数组的维度 (1, Ty), 训练样例的标签
- segment_end_ms -- 以ms为单位的段的结束时间
-
- 返回:
- y -- 更新标签
- """
-
- # 背景持续时间(以频谱图时间步长表示)
- segment_end_y = int(segment_end_ms * Ty / 10000.0)
-
- # 将1添加到背景标签(y)中的正确索引
- for i in range(segment_end_y + 1, segment_end_y + 51):
- if i < Ty:
- y[0, i] = 1
-
- return y
- arr1 = insert_ones(np.zeros((1, Ty)), 9700)
- plt.plot(insert_ones(arr1, 4251)[0,:])
- print("sanity checks:", arr1[0][1333], arr1[0][634], arr1[0][635])
sanity checks: 0.0 1.0 0.0
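The sanity-check values follow directly from the ms-to-timestep conversion in insert_ones: an end time of 4251 ms maps to output step int(4251 * 1375 / 10000) = 584, so steps 585-634 are set to 1 (index 634 is 1, index 635 is 0), and 9700 ms maps to step 1333, which itself stays 0. A quick check (a sketch):

# Sketch: reproduce the label indices used in the sanity checks above.
Ty = 1375
print(int(4251 * Ty / 10000.0))   # 584  -> ones at steps 585..634
print(int(9700 * Ty / 10000.0))   # 1333 -> stays 0; ones start at 1334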
- # GRADED FUNCTION: create_training_example
-
- def create_training_example(background, activates, negatives):
- """
- 创建具有给定背景,正例和负例的训练示例。
-
- 参数:
- background -- 10秒背景录音
- activates -- "activate" 一词的音频片段列表
- negatives -- 不是 "activate" 一词的音频片段列表
-
- 返回:
- x -- 训练样例的频谱图
- y -- 频谱图的每个时间步的标签
- """
-
- # 设置随机种子
- np.random.seed(18)
-
- # 让背景更安静
- background = background - 20
-
- # 第一步:初始化 y (标签向量)为0 (≈ 1 line)
- y = np.zeros((1, Ty))
-
- # 第二步:将段时间初始化为空列表 (≈ 1 line)
- previous_segments = []
-
- # 从整个 "activate" 录音列表中选择0-4随机 "activate" 音频片段
- number_of_activates = np.random.randint(0, 5)
- random_indices = np.random.randint(len(activates), size=number_of_activates)
- random_activates = [activates[i] for i in random_indices]
-
- # 第三步: 循环随机选择 "activate" 剪辑插入背景
- for random_activate in random_activates:
- # 插入音频剪辑到背景
- background, segment_time = insert_audio_clip(background, random_activate, previous_segments)
- # 从 segment_time 中取 segment_start 和 segment_end
- segment_start, segment_end = segment_time
- # 在 "y" 中插入标签
- y = insert_ones(y, segment_end_ms=segment_end)
-
- # 从整个负例录音列表中随机选择0-2个负例录音
- number_of_negatives = np.random.randint(0, 3)
- random_indices = np.random.randint(len(negatives), size=number_of_negatives)
- random_negatives = [negatives[i] for i in random_indices]
-
- # 第四步: 循环随机选择负例片段并插入背景中
- for random_negative in random_negatives:
- # 插入音频剪辑到背景
- background, _ = insert_audio_clip(background, random_negative, previous_segments)
-
- # 标准化音频剪辑的音量
- background = match_target_amplitude(background, -20.0)
-
- # 导出新的训练样例
- file_handle = background.export("train" + ".wav", format="wav")
- print("文件 (train.wav) 已保存在您的目录中。")
-
- # 获取并绘制新录音的频谱图(正例和负例叠加的背景)
- x = graph_spectrogram("train.wav")
-
- return x, y
-
x, y = create_training_example(backgrounds[0], activates, negatives)
File (train.wav) was saved in your directory.
IPython.display.Audio("train.wav")
CSDN does not support audio playback.
plt.plot(y[0])
- # Load preprocessed training examples
- X = np.load("./XY_train/X.npy")
- Y = np.load("./XY_train/Y.npy")
- # Load preprocessed dev set examples
- X_dev = np.load("./XY_dev/X_dev.npy")
- Y_dev = np.load("./XY_dev/Y_dev.npy")
- from keras.callbacks import ModelCheckpoint
- from keras.models import Model, load_model, Sequential
- from keras.layers import Dense, Activation, Dropout, Input, Masking, TimeDistributed, LSTM, Conv1D
- from keras.layers import GRU, Bidirectional, BatchNormalization, Reshape
- from keras.optimizers import Adam
- # GRADED FUNCTION: model
-
- def model(input_shape):
- """
- 用 Keras 创建模型的图 Function creating the model's graph in Keras.
-
- 参数:
- input_shape -- 模型输入数据的维度(使用Keras约定)
-
- 返回:
- model -- Keras 模型实例
- """
-
- X_input = Input(shape = input_shape)
-
- # 第一步:卷积层 (≈4 lines)
- X = Conv1D(196, 15, strides=4)(X_input) # CONV1D
- X = BatchNormalization()(X) # Batch normalization 批量标准化
- X = Activation('relu')(X) # ReLu activation ReLu 激活
- X = Dropout(0.8)(X) # dropout (use 0.8)
-
- # 第二步:第一个 GRU 层 (≈4 lines)
- X = GRU(units = 128, return_sequences=True)(X) # GRU (使用128个单元并返回序列)
- X = Dropout(0.8)(X) # dropout (use 0.8)
- X = BatchNormalization()(X) # Batch normalization 批量标准化
-
- # 第三步: 第二个 GRU 层 (≈4 lines)
- X = GRU(units = 128, return_sequences=True)(X) # GRU (使用128个单元并返回序列)
- X = Dropout(0.8)(X) # dropout (use 0.8)
- X = BatchNormalization()(X) # Batch normalization 批量标准化
- X = Dropout(0.8)(X) # dropout (use 0.8)
-
- # 第四步: 时间分布全连接层 (≈1 line)
- X = TimeDistributed(Dense(1, activation = "sigmoid"))(X) # time distributed (sigmoid)
-
- model = Model(inputs = X_input, outputs = X)
-
- return model
model = model(input_shape = (Tx, n_freq))
model.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) (None, 5511, 101) 0
_________________________________________________________________
conv1d_1 (Conv1D) (None, 1375, 196) 297136
_________________________________________________________________
batch_normalization_1 (Batch (None, 1375, 196) 784
_________________________________________________________________
activation_1 (Activation) (None, 1375, 196) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 1375, 196) 0
_________________________________________________________________
gru_1 (GRU) (None, 1375, 128) 124800
_________________________________________________________________
dropout_2 (Dropout) (None, 1375, 128) 0
_________________________________________________________________
batch_normalization_2 (Batch (None, 1375, 128) 512
_________________________________________________________________
gru_2 (GRU) (None, 1375, 128) 98688
_________________________________________________________________
dropout_3 (Dropout) (None, 1375, 128) 0
_________________________________________________________________
batch_normalization_3 (Batch (None, 1375, 128) 512
_________________________________________________________________
dropout_4 (Dropout) (None, 1375, 128) 0
_________________________________________________________________
time_distributed_1 (TimeDist (None, 1375, 1) 129
=================================================================
Total params: 522,561
Trainable params: 521,657
Non-trainable params: 904
_________________________________________________________________
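As a quick sanity check on the summary above, the largest parameter counts can be reproduced by hand: a Conv1D layer has filters × (kernel_size × input_channels + 1) parameters, and a GRU in this Keras version has 3 × (input_dim × units + units² + units). A sketch:

# Sketch: reproduce the parameter counts reported by model.summary().
conv1d_params = 196 * (15 * 101 + 1)                 # 297136
gru1_params   = 3 * (196 * 128 + 128 * 128 + 128)    # 124800
gru2_params   = 3 * (128 * 128 + 128 * 128 + 128)    # 98688
print(conv1d_params, gru1_params, gru2_params)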
model = load_model('./models/tr_model.h5')
- opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, decay=0.01)
- model.compile(loss='binary_crossentropy', optimizer=opt, metrics=["accuracy"])
model.fit(X, Y, batch_size = 5, epochs=1)
Epoch 1/1
26/26 [==============================] - 30s 1s/step - loss: 0.0893 - acc: 0.9717
<keras.callbacks.History at 0x19249d8f358>
- loss, acc = model.evaluate(X_dev, Y_dev)
- print("Dev set accuracy = ", acc)
25/25 [==============================] - 3s 122ms/step
Dev set accuracy = 0.9296872615814209
- def detect_triggerword(filename):
-     plt.subplot(2, 1, 1)
-
-     x = graph_spectrogram(filename)
-     # The spectrogram outputs (freqs, Tx); we want (Tx, freqs) as input to the model
-     x = x.swapaxes(0,1)
-     x = np.expand_dims(x, axis=0)
-     predictions = model.predict(x)
-
-     plt.subplot(2, 1, 2)
-     plt.plot(predictions[0,:,0])
-     plt.ylabel('probability')
-     plt.show()
-     return predictions
- chime_file = "audio_examples/chime.wav"
- def chime_on_activate(filename, predictions, threshold):
-     audio_clip = AudioSegment.from_wav(filename)
-     chime = AudioSegment.from_wav(chime_file)
-     Ty = predictions.shape[1]
-     # Step 1: Initialize the number of consecutive output steps to 0
-     consecutive_timesteps = 0
-     # Step 2: Loop over the output steps in y
-     for i in range(Ty):
-         # Step 3: Increment the consecutive output steps
-         consecutive_timesteps += 1
-         # Step 4: If the prediction is above the threshold and more than 75 consecutive output steps have passed
-         if predictions[0,i,0] > threshold and consecutive_timesteps > 75:
-             # Step 5: Superpose the chime and the audio using pydub
-             audio_clip = audio_clip.overlay(chime, position = ((i / Ty) * audio_clip.duration_seconds)*1000)
-             # Step 6: Reset the consecutive output steps to 0
-             consecutive_timesteps = 0
-
-     audio_clip.export("chime_output.wav", format='wav')
IPython.display.Audio("./raw_data/dev/1.wav")
CSDN does not support audio playback.
IPython.display.Audio("./raw_data/dev/2.wav")
CSDN does not support audio playback.
- filename = "./raw_data/dev/1.wav"
- prediction = detect_triggerword(filename)
- chime_on_activate(filename, prediction, 0.5)
- IPython.display.Audio("./chime_output.wav")
CSDN does not support audio playback.
- filename = "./raw_data/dev/2.wav"
- prediction = detect_triggerword(filename)
- chime_on_activate(filename, prediction, 0.5)
- IPython.display.Audio("./chime_output.wav")
CSDN does not support audio playback.
- # Preprocess the audio into the correct format
- def preprocess_audio(filename):
-     # Trim or pad the audio segment to 10000 ms
-     padding = AudioSegment.silent(duration=10000)
-     segment = AudioSegment.from_wav(filename)[:10000]
-     segment = padding.overlay(segment)
-     # Set the frame rate to 44100
-     segment = segment.set_frame_rate(44100)
-     # Export as wav
-     segment.export(filename, format='wav')
your_filename = "audio_examples/my_audio.wav"
- preprocess_audio(your_filename)
- IPython.display.Audio(your_filename) # listen to the audio you uploaded
CSDN does not support audio playback.
- chime_threshold = 0.5
- prediction = detect_triggerword(your_filename)
- chime_on_activate(your_filename, prediction, chime_threshold)
- IPython.display.Audio("./chime_output.wav")
CSDN does not support audio playback.