
Speech Recognition: A Deep-Learning-Based Chinese Speech Recognition System Framework

A Chinese Speech Recognition System Based on Deep Learning

This speech recognition program is split into an acoustic-model part and a language-model part, implemented with Keras and TensorFlow respectively.

The complete set of open-source corpora used by the program is thchs30, aishell, prime, and stcmd, but the demo here uses only a subset of thchs30.
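Each corpus is indexed by a tab-separated transcript list under data/ (thchs_train.txt, aishell_train.txt, and so on); the loader below splits every line into a wav path, a space-separated pinyin sequence with tone numbers, and the hanzi sentence. A hypothetical example line (the actual wav path depends on how the corpus archive is unpacked; the three fields are tab-separated):

wav/train/A2_0.wav	lv4 shi4 yang2 chun1	绿是阳春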

The following code trains the acoustic model and the language model.

import os
import tensorflow as tf
from utils import get_data, data_hparams
from keras.callbacks import ModelCheckpoint

# 0. Prepare the training data ------------------------------
data_args = data_hparams()
data_args.data_type = 'train'
data_args.data_path = 'data/'
data_args.thchs30 = True
data_args.aishell = True
data_args.prime = True
data_args.stcmd = True
data_args.batch_size = 4
data_args.data_length = 10
# data_args.data_length = None
data_args.shuffle = True
train_data = get_data(data_args)

# 0. Prepare the validation data ------------------------------
data_args = data_hparams()
data_args.data_type = 'dev'
# data_args.data_path = '../dataset/'
data_args.data_path = 'data/'
data_args.thchs30 = True
data_args.aishell = True
data_args.prime = True
data_args.stcmd = True
data_args.batch_size = 4
# data_args.data_length = None
data_args.data_length = 10
data_args.shuffle = True
dev_data = get_data(data_args)

# 1. Acoustic model training -----------------------------------
from model_speech.cnn_ctc import Am, am_hparams

am_args = am_hparams()
am_args.vocab_size = len(train_data.am_vocab)
am_args.gpu_nums = 1
am_args.lr = 0.0008
am_args.is_training = True
am = Am(am_args)

if os.path.exists('logs_am/model.h5'):
    print('load acoustic model...')
    am.ctc_model.load_weights('logs_am/model.h5')

epochs = 10
batch_num = len(train_data.wav_lst) // train_data.batch_size

# Checkpoint. The CTC model is compiled with a loss only, so there is no
# val_acc metric available; monitor val_loss in the filename and callback.
os.makedirs('./checkpoint', exist_ok=True)
ckpt = "model_{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(os.path.join('./checkpoint', ckpt), monitor='val_loss',
                             save_weights_only=False, verbose=1, save_best_only=True)

batch = train_data.get_am_batch()
dev_batch = dev_data.get_am_batch()
am.ctc_model.fit_generator(batch, steps_per_epoch=batch_num, epochs=epochs,
                           callbacks=[checkpoint], workers=1, use_multiprocessing=False,
                           validation_data=dev_batch,
                           validation_steps=len(dev_data.wav_lst) // dev_data.batch_size)
am.ctc_model.save_weights('logs_am/model.h5')

# 2. Language model training -------------------------------------------
from model_language.transformer import Lm, lm_hparams

lm_args = lm_hparams()
lm_args.num_heads = 8
lm_args.num_blocks = 6
lm_args.input_vocab_size = len(train_data.pny_vocab)
lm_args.label_vocab_size = len(train_data.han_vocab)
lm_args.max_length = 100
lm_args.hidden_units = 512
lm_args.dropout_rate = 0.2
lm_args.lr = 0.0003
lm_args.is_training = True
lm = Lm(lm_args)

epochs = 100
with lm.graph.as_default():
    saver = tf.train.Saver()
with tf.Session(graph=lm.graph) as sess:
    # The summaries were merged inside lm.graph when Lm was built.
    merged = lm.merged
    sess.run(tf.global_variables_initializer())
    add_num = 0
    if os.path.exists('logs_lm/checkpoint'):
        print('loading language model...')
        latest = tf.train.latest_checkpoint('logs_lm')
        add_num = int(latest.split('_')[-1])
        saver.restore(sess, latest)
    writer = tf.summary.FileWriter('logs_lm/tensorboard', lm.graph)
    for k in range(epochs):
        total_loss = 0
        batch = train_data.get_lm_batch()
        for i in range(batch_num):
            input_batch, label_batch = next(batch)
            feed = {lm.x: input_batch, lm.y: label_batch}
            cost, _ = sess.run([lm.mean_loss, lm.train_op], feed_dict=feed)
            total_loss += cost
            if (k * batch_num + i) % 10 == 0:
                rs = sess.run(merged, feed_dict=feed)
                writer.add_summary(rs, k * batch_num + i)
        print('epochs', k + 1, ': average loss = ', total_loss / batch_num)
    saver.save(sess, 'logs_lm/model_%d' % (epochs + add_num))
    writer.close()
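Because the language-model loop writes the acc and mean_loss scalars to logs_lm/tensorboard, training can be followed live with the standard TensorBoard command line:

tensorboard --logdir=logs_lm/tensorboard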

Data Processing

import difflib
import numpy as np
import tensorflow as tf
import scipy.io.wavfile as wav
from tqdm import tqdm
from scipy.fftpack import fft
from python_speech_features import mfcc
from random import shuffle
from keras import backend as K


def data_hparams():
    params = tf.contrib.training.HParams(
        # vocab
        data_type='train',
        data_path='data/',
        thchs30=True,
        aishell=True,
        prime=True,
        stcmd=True,
        batch_size=1,
        data_length=10,
        shuffle=True)
    return params


class get_data():
    def __init__(self, args):
        self.data_type = args.data_type
        self.data_path = args.data_path
        self.thchs30 = args.thchs30
        self.aishell = args.aishell
        self.prime = args.prime
        self.stcmd = args.stcmd
        self.data_length = args.data_length
        self.batch_size = args.batch_size
        self.shuffle = args.shuffle
        self.source_init()

    def source_init(self):
        print('get source list...')
        read_files = []
        if self.data_type == 'train':
            if self.thchs30 == True:
                read_files.append('thchs_train.txt')
            if self.aishell == True:
                read_files.append('aishell_train.txt')
            if self.prime == True:
                read_files.append('prime.txt')
            if self.stcmd == True:
                read_files.append('stcmd.txt')
        elif self.data_type == 'dev':
            if self.thchs30 == True:
                read_files.append('thchs_dev.txt')
            if self.aishell == True:
                read_files.append('aishell_dev.txt')
        elif self.data_type == 'test':
            if self.thchs30 == True:
                read_files.append('thchs_test.txt')
            if self.aishell == True:
                read_files.append('aishell_test.txt')
        self.wav_lst = []
        self.pny_lst = []
        self.han_lst = []
        for file in read_files:
            print('load ', file, ' data...')
            sub_file = 'data/' + file
            with open(sub_file, 'r', encoding='utf-8-sig') as f:
                data = f.readlines()
            for line in tqdm(data):
                # each line: wav path <TAB> pinyin sequence <TAB> hanzi sentence
                wav_file, pny, han = line.split('\t')
                self.wav_lst.append(wav_file)
                self.pny_lst.append(pny.split(' '))
                self.han_lst.append(han.strip('\n'))
        if self.data_length:
            self.wav_lst = self.wav_lst[:self.data_length]
            self.pny_lst = self.pny_lst[:self.data_length]
            self.han_lst = self.han_lst[:self.data_length]
        print('make am vocab...')
        self.am_vocab = self.mk_am_vocab(self.pny_lst)
        print('make lm pinyin vocab...')
        self.pny_vocab = self.mk_lm_pny_vocab(self.pny_lst)
        print('make lm hanzi vocab...')
        self.han_vocab = self.mk_lm_han_vocab(self.han_lst)

    def get_am_batch(self):
        shuffle_list = [i for i in range(len(self.wav_lst))]
        while 1:
            if self.shuffle == True:
                shuffle(shuffle_list)
            for i in range(len(self.wav_lst) // self.batch_size):
                wav_data_lst = []
                label_data_lst = []
                begin = i * self.batch_size
                end = begin + self.batch_size
                sub_list = shuffle_list[begin:end]
                for index in sub_list:
                    fbank = compute_fbank(self.data_path + self.wav_lst[index])
                    # pad the time axis to a multiple of 8 (the CNN downsamples by 8)
                    pad_fbank = np.zeros((fbank.shape[0] // 8 * 8 + 8, fbank.shape[1]))
                    pad_fbank[:fbank.shape[0], :] = fbank
                    label = self.pny2id(self.pny_lst[index], self.am_vocab)
                    label_ctc_len = self.ctc_len(label)
                    # keep the sample only if the downsampled input is long enough for CTC
                    if pad_fbank.shape[0] // 8 >= label_ctc_len:
                        wav_data_lst.append(pad_fbank)
                        label_data_lst.append(label)
                pad_wav_data, input_length = self.wav_padding(wav_data_lst)
                pad_label_data, label_length = self.label_padding(label_data_lst)
                inputs = {'the_inputs': pad_wav_data,
                          'the_labels': pad_label_data,
                          'input_length': input_length,
                          'label_length': label_length,
                          }
                outputs = {'ctc': np.zeros(pad_wav_data.shape[0], )}
                yield inputs, outputs

    def get_lm_batch(self):
        batch_num = len(self.pny_lst) // self.batch_size
        for k in range(batch_num):
            begin = k * self.batch_size
            end = begin + self.batch_size
            input_batch = self.pny_lst[begin:end]
            label_batch = self.han_lst[begin:end]
            max_len = max([len(line) for line in input_batch])
            input_batch = np.array(
                [self.pny2id(line, self.pny_vocab) + [0] * (max_len - len(line)) for line in input_batch])
            label_batch = np.array(
                [self.han2id(line, self.han_vocab) + [0] * (max_len - len(line)) for line in label_batch])
            yield input_batch, label_batch

    def pny2id(self, line, vocab):
        return [vocab.index(pny) for pny in line]

    def han2id(self, line, vocab):
        return [vocab.index(han) for han in line]

    def wav_padding(self, wav_data_lst):
        wav_lens = [len(data) for data in wav_data_lst]
        wav_max_len = max(wav_lens)
        wav_lens = np.array([leng // 8 for leng in wav_lens])
        new_wav_data_lst = np.zeros((len(wav_data_lst), wav_max_len, 200, 1))
        for i in range(len(wav_data_lst)):
            new_wav_data_lst[i, :wav_data_lst[i].shape[0], :, 0] = wav_data_lst[i]
        return new_wav_data_lst, wav_lens

    def label_padding(self, label_data_lst):
        label_lens = np.array([len(label) for label in label_data_lst])
        max_label_len = max(label_lens)
        new_label_data_lst = np.zeros((len(label_data_lst), max_label_len))
        for i in range(len(label_data_lst)):
            new_label_data_lst[i][:len(label_data_lst[i])] = label_data_lst[i]
        return new_label_data_lst, label_lens

    def mk_am_vocab(self, data):
        vocab = []
        for line in tqdm(data):
            for pny in line:
                if pny not in vocab:
                    vocab.append(pny)
        vocab.append('_')  # '_' fills the last index, which Keras CTC uses as the blank
        return vocab

    def mk_lm_pny_vocab(self, data):
        vocab = ['<PAD>']
        for line in tqdm(data):
            for pny in line:
                if pny not in vocab:
                    vocab.append(pny)
        return vocab

    def mk_lm_han_vocab(self, data):
        vocab = ['<PAD>']
        for line in tqdm(data):
            line = ''.join(line.split(' '))
            for han in line:
                if han not in vocab:
                    vocab.append(han)
        return vocab

    def ctc_len(self, label):
        # CTC needs one extra frame between every pair of repeated labels
        add_len = 0
        label_len = len(label)
        for i in range(label_len - 1):
            if label[i] == label[i + 1]:
                add_len += 1
        return label_len + add_len


def compute_mfcc(file):
    fs, audio = wav.read(file)
    mfcc_feat = mfcc(audio, samplerate=fs, numcep=26)
    mfcc_feat = mfcc_feat[::3]
    mfcc_feat = np.transpose(mfcc_feat)
    return mfcc_feat


def compute_fbank(file):
    # 400-point Hamming window (25 ms at 16 kHz) with a 160-sample (10 ms) hop
    x = np.linspace(0, 400 - 1, 400, dtype=np.int64)
    w = 0.54 - 0.46 * np.cos(2 * np.pi * x / (400 - 1))
    fs, wavsignal = wav.read(file)
    time_window = 25  # window length in ms
    wav_arr = np.array(wavsignal)
    range0_end = int(len(wavsignal) / fs * 1000 - time_window) // 10 + 1
    data_input = np.zeros((range0_end, 200), dtype=np.float64)
    for i in range(0, range0_end):
        p_start = i * 160
        p_end = p_start + 400
        data_line = wav_arr[p_start:p_end]
        data_line = data_line * w
        data_line = np.abs(fft(data_line))
        data_input[i] = data_line[0:200]  # keep the first 200 frequency bins
    data_input = np.log(data_input + 1)
    return data_input


# edit distance (used for the word error rate) ------------------------------------
def GetEditDistance(str1, str2):
    leven_cost = 0
    s = difflib.SequenceMatcher(None, str1, str2)
    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'replace':
            leven_cost += max(i2 - i1, j2 - j1)
        elif tag == 'insert':
            leven_cost += (j2 - j1)
        elif tag == 'delete':
            leven_cost += (i2 - i1)
    return leven_cost


# CTC decoder ------------------------------------
def decode_ctc(num_result, num2word):
    in_len = np.zeros((1), dtype=np.int32)
    in_len[0] = num_result.shape[1]
    r = K.ctc_decode(num_result, in_len, greedy=True, beam_width=10, top_paths=1)
    r1 = K.get_value(r[0][0])
    r1 = r1[0]
    text = []
    for i in r1:
        text.append(num2word[i])
    return r1, text
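As a quick sanity check of the feature pipeline, the sketch below runs compute_fbank on a single utterance and verifies the shape contract that get_am_batch relies on: one 200-dimensional spectral frame per 160-sample hop (10 ms at 16 kHz). The wav path is hypothetical; substitute any 16 kHz mono file from the corpus:

from utils import compute_fbank

fbank = compute_fbank('data/wav/train/A2_0.wav')  # hypothetical path
print(fbank.shape)                                # (num_frames, 200)
assert fbank.shape[1] == 200                      # 200 frequency bins per frame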

Acoustic Model

from keras.layers import Input, Conv2D, BatchNormalization, MaxPooling2D
from keras.layers import Reshape, Dense, Dropout, Lambda
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from keras.utils import multi_gpu_model
import tensorflow as tf


def am_hparams():
    params = tf.contrib.training.HParams(
        vocab_size=50,
        lr=0.0008,
        gpu_nums=1,
        is_training=True)
    return params


# ============================= Build the model ====================================
class Am():
    """Acoustic model: stacked CNN blocks trained with CTC loss."""
    def __init__(self, args):
        self.vocab_size = args.vocab_size
        self.gpu_nums = args.gpu_nums
        self.lr = args.lr
        self.is_training = args.is_training
        self._model_init()
        if self.is_training:
            self._ctc_init()
            self.opt_init()

    def _model_init(self):
        self.inputs = Input(name='the_inputs', shape=(None, 200, 1))
        self.h1 = cnn_cell(32, self.inputs)
        self.h2 = cnn_cell(64, self.h1)
        self.h3 = cnn_cell(128, self.h2)
        self.h4 = cnn_cell(128, self.h3, pool=False)
        self.h5 = cnn_cell(128, self.h4, pool=False)
        # after three 2x2 pools the 200 frequency bins become 25: 25 * 128 = 3200
        self.h6 = Reshape((-1, 3200))(self.h5)
        self.h6 = Dropout(0.2)(self.h6)
        self.h7 = dense(256)(self.h6)
        self.h7 = Dropout(0.2)(self.h7)
        self.outputs = dense(self.vocab_size, activation='softmax')(self.h7)
        self.model = Model(inputs=self.inputs, outputs=self.outputs)
        self.model.summary()

    def _ctc_init(self):
        self.labels = Input(name='the_labels', shape=[None], dtype='float32')
        self.input_length = Input(name='input_length', shape=[1], dtype='int64')
        self.label_length = Input(name='label_length', shape=[1], dtype='int64')
        self.loss_out = Lambda(ctc_lambda, output_shape=(1,), name='ctc')\
            ([self.labels, self.outputs, self.input_length, self.label_length])
        self.ctc_model = Model(inputs=[self.labels, self.inputs,
                                       self.input_length, self.label_length],
                               outputs=self.loss_out)

    def opt_init(self):
        opt = Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.01, epsilon=10e-8)
        if self.gpu_nums > 1:
            self.ctc_model = multi_gpu_model(self.ctc_model, gpus=self.gpu_nums)
        # the Lambda layer already outputs the CTC loss, so the "loss" just passes it through
        self.ctc_model.compile(loss={'ctc': lambda y_true, output: output}, optimizer=opt)


# ============================ Model components =================================
def conv2d(size):
    return Conv2D(size, (3, 3), use_bias=True, activation='relu',
                  padding='same', kernel_initializer='he_normal')


def norm(x):
    return BatchNormalization(axis=-1)(x)


def maxpool(x):
    return MaxPooling2D(pool_size=(2, 2), strides=None, padding="valid")(x)


def dense(units, activation="relu"):
    return Dense(units, activation=activation, use_bias=True,
                 kernel_initializer='he_normal')


# Two conv+batch-norm layers, optionally followed by a 2x2 max-pool that
# halves both the time and frequency axes.
def cnn_cell(size, x, pool=True):
    x = norm(conv2d(size)(x))
    x = norm(conv2d(size)(x))
    if pool:
        x = maxpool(x)
    return x


def ctc_lambda(args):
    labels, y_pred, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
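The three pooled cnn_cell blocks each halve the time axis, so the CTC loss sees one output frame for every 8 input frames. This is exactly why get_am_batch pads each fbank to a multiple of 8 and keeps a sample only when pad_fbank.shape[0] // 8 >= ctc_len(label). A minimal standalone restatement of the ctc_len rule (the helper below is illustrative, not part of the project):

# CTC needs at least one output frame per label, plus one extra frame between
# each pair of identical adjacent labels, because a blank must separate repeats.
def min_ctc_frames(label):
    return len(label) + sum(1 for i in range(len(label) - 1) if label[i] == label[i + 1])

label = [3, 3, 7, 7, 7, 5]        # toy id sequence with repeats
print(min_ctc_frames(label))      # 9: 6 labels + 3 adjacent repeats
# A 640-frame fbank gives 640 // 8 = 80 CTC frames, comfortably >= 9.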

Language Model

import tensorflow as tf
import numpy as np


def normalize(inputs,
              epsilon=1e-8,
              scope="ln",
              reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** (.5))
        outputs = gamma * normalized + beta
    return outputs


def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              scope="embedding",
              reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            # row 0 is reserved for <PAD> and kept at zero
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)
        if scale:
            outputs = outputs * (num_units ** 0.5)
    return outputs


def multihead_attention(emb,
                        queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fallback option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]
        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # (N, T_k, C)
        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)
        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
        # Key masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(emb, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)
        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
        # Causality = future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)
            paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)
        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)
        # Query masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(emb, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (h*N, T_q, T_k)
        # Dropout
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)
        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)
        # Residual connection
        outputs += queries
        # Normalize
        outputs = normalize(outputs)  # (N, T_q, C)
    return outputs


def feedforward(inputs,
                num_units=[2048, 512],
                scope="feedforward",
                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        # Residual connection
        outputs += inputs
        # Normalize
        outputs = normalize(outputs)
    return outputs


def label_smoothing(inputs, epsilon=0.1):
    K = inputs.get_shape().as_list()[-1]  # number of channels
    return ((1 - epsilon) * inputs) + (epsilon / K)


class Lm():
    '''Transformer-encoder language model: maps a pinyin sequence to a hanzi sequence.'''
    def __init__(self, arg):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.is_training = arg.is_training
            self.hidden_units = arg.hidden_units
            self.input_vocab_size = arg.input_vocab_size
            self.label_vocab_size = arg.label_vocab_size
            self.num_heads = arg.num_heads
            self.num_blocks = arg.num_blocks
            self.max_length = arg.max_length
            self.lr = arg.lr
            self.dropout_rate = arg.dropout_rate
            # input
            self.x = tf.placeholder(tf.int32, shape=(None, None))
            self.y = tf.placeholder(tf.int32, shape=(None, None))
            # embedding
            self.emb = embedding(self.x, vocab_size=self.input_vocab_size,
                                 num_units=self.hidden_units, scale=True, scope="enc_embed")
            # encoder: token embedding plus a learned positional embedding
            self.enc = self.emb + embedding(
                tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]),
                vocab_size=self.max_length, num_units=self.hidden_units,
                zero_pad=False, scale=False, scope="enc_pe")
            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=self.dropout_rate,
                                         training=tf.convert_to_tensor(self.is_training))
            ## Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multi-head attention
                    self.enc = multihead_attention(emb=self.emb,
                                                   queries=self.enc,
                                                   keys=self.enc,
                                                   num_units=self.hidden_units,
                                                   num_heads=self.num_heads,
                                                   dropout_rate=self.dropout_rate,
                                                   is_training=self.is_training,
                                                   causality=False)
                    ### Feed forward (its output feeds the next block)
                    self.enc = feedforward(self.enc, num_units=[4 * self.hidden_units, self.hidden_units])
            self.outputs = self.enc
            # Final linear projection
            self.logits = tf.layers.dense(self.outputs, self.label_vocab_size)
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / (tf.reduce_sum(self.istarget))
            tf.summary.scalar('acc', self.acc)
            if self.is_training:
                # Loss: cross-entropy against label-smoothed one-hot targets
                self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=self.label_vocab_size))
                self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))
                # Training scheme
                self.global_step = tf.Variable(0, name='global_step', trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
                self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)
                # Summary
                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()


def lm_hparams():
    params = tf.contrib.training.HParams(
        num_heads=8,
        num_blocks=6,
        # vocab
        input_vocab_size=50,
        label_vocab_size=50,
        # embedding size
        max_length=100,
        hidden_units=512,
        dropout_rate=0.2,
        lr=0.0003,
        is_training=True)
    return params
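label_smoothing replaces the hard one-hot targets with (1 - epsilon) * onehot + epsilon / K, so the cross-entropy loss never pushes the logits toward infinite confidence. A minimal numeric check with epsilon = 0.1 and K = 4 classes, in plain NumPy mirroring the formula above:

import numpy as np

epsilon = 0.1
onehot = np.array([0., 0., 1., 0.])              # K = 4 classes
smoothed = (1 - epsilon) * onehot + epsilon / 4
print(smoothed)                                   # [0.025 0.025 0.925 0.025]
print(smoothed.sum())                             # 1.0: still a valid distribution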

Model Test Code

# coding=utf-8
import tensorflow as tf
import numpy as np
from utils import decode_ctc, GetEditDistance

# 0. Prepare the vocabularies needed for decoding. The parameters must match
#    the ones used for training; alternatively the vocabularies could be saved
#    to disk and loaded directly.
from utils import get_data, data_hparams
data_args = data_hparams()
train_data = get_data(data_args)

# 1. Acoustic model -----------------------------------
from model_speech.cnn_ctc import Am, am_hparams
am_args = am_hparams()
am_args.vocab_size = len(train_data.am_vocab)
am = Am(am_args)
print('loading acoustic model...')
am.ctc_model.load_weights('logs_am/model.h5')

# 2. Language model -------------------------------------------
from model_language.transformer import Lm, lm_hparams
lm_args = lm_hparams()
lm_args.input_vocab_size = len(train_data.pny_vocab)
lm_args.label_vocab_size = len(train_data.han_vocab)
lm_args.dropout_rate = 0.
print('loading language model...')
lm = Lm(lm_args)
sess = tf.Session(graph=lm.graph)
with lm.graph.as_default():
    saver = tf.train.Saver()
with sess.as_default():
    latest = tf.train.latest_checkpoint('logs_lm')
    saver.restore(sess, latest)

# 3. Prepare the test data. It does not have to match the training data; pick
#    the split via data_args.data_type. This should normally be 'test', but
#    'train' is used here because the demo model is tiny: on 'test' the output
#    would be meaningless and would contain tokens never seen in training.
data_args.data_type = 'train'
data_args.shuffle = False
data_args.batch_size = 1
test_data = get_data(data_args)

# 4. Run the test -------------------------------------------
am_batch = test_data.get_am_batch()
word_num = 0
word_error_num = 0
for i in range(8):
    print('\n the ', i, 'th example.')
    # acoustic model: wav -> pinyin
    inputs, _ = next(am_batch)
    x = inputs['the_inputs']
    y = test_data.pny_lst[i]
    result = am.model.predict(x, steps=1)
    _, text = decode_ctc(result, train_data.am_vocab)
    text = ' '.join(text)
    print('Recognized pinyin:', text)
    print('Ground-truth pinyin:', ' '.join(y))
    # language model: pinyin -> hanzi
    with sess.as_default():
        text = text.strip('\n').split(' ')
        x = np.array([train_data.pny_vocab.index(pny) for pny in text])
        x = x.reshape(1, -1)
        preds = sess.run(lm.preds, {lm.x: x})
        label = test_data.han_lst[i]
        got = ''.join(train_data.han_vocab[idx] for idx in preds[0])
        print('Ground-truth hanzi:', label)
        print('Recognized hanzi:', got)
        word_error_num += min(len(label), GetEditDistance(label, got))
        word_num += len(label)
print('Word error rate:', word_error_num / word_num)
sess.close()
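The final metric divides the summed edit distance by the number of reference hanzi, and since Chinese is compared character by character this is effectively a character error rate. A small sanity check of GetEditDistance on a toy pair (the strings are made up for illustration):

from utils import GetEditDistance

ref = '绿是阳春'
hyp = '绿色阳春'                       # one substitution: 是 -> 色
dist = GetEditDistance(ref, hyp)
print(dist)                            # 1
print('error rate:', dist / len(ref))  # 0.25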

Model Test Results

the 0 th example.
Recognized pinyin: lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de di3 se4 si4 yue4 de lin2 luan2 geng4 shi4 lv4 de2 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2
Ground-truth pinyin: lv4 shi4 yang2 chun1 yan1 jing3 da4 kuai4 wen2 zhang1 de di3 se4 si4 yue4 de lin2 luan2 geng4 shi4 lv4 de2 xian1 huo2 xiu4 mei4 shi1 yi4 ang4 ran2
Ground-truth hanzi: 绿是阳春烟景大块文章的底色四月的林峦更是绿得鲜活秀媚诗意盎然
Recognized hanzi: 绿是阳春烟景大块文章的底色四月的林峦更是绿得鲜活秀媚诗意盎然
the 1 th example.
Recognized pinyin: ta1 jin3 ping2 yao1 bu4 de li4 liang4 zai4 yong3 dao4 shang4 xia4 fan1 teng2 yong3 dong4 she2 xing2 zhuang4 ru2 hai3 tun2 yi4 zhi2 yi3 yi1 tou2 de you1 shi4 ling3 xian1
Ground-truth pinyin: ta1 jin3 ping2 yao1 bu4 de li4 liang4 zai4 yong3 dao4 shang4 xia4 fan1 teng2 yong3 dong4 she2 xing2 zhuang4 ru2 hai3 tun2 yi4 zhi2 yi3 yi1 tou2 de you1 shi4 ling3 xian1
Ground-truth hanzi: 他仅凭腰部的力量在泳道上下翻腾蛹动蛇行状如海豚一直以一头的优势领先
Recognized hanzi: 他仅凭腰部的力量在蛹道上下翻腾蛹动蛇行状如海豚一直以一头的优势领先
the 2 th example.
Recognized pinyin: qi3 ye4 yi1 kao4 ji4 shu4 wa1 qian2 zeng1 xiao4 ta1 fu4 ze2 quan2 chang3 chan3 pin3 zhi4 liang4 yu3 ji4 shu4 pei2 xun4 cheng2 le chang3 li3 de da4 mang2 ren2
Ground-truth pinyin: qi3 ye4 yi1 kao4 ji4 shu4 wa1 qian2 zeng1 xiao4 ta1 fu4 ze2 quan2 chang3 chan3 pin3 zhi4 liang4 yu3 ji4 shu4 pei2 xun4 cheng2 le chang3 li3 de da4 mang2 ren2
Ground-truth hanzi: 企业依靠技术挖潜增效他负责全厂产品质量与技术培训成了厂里的大忙人
Recognized hanzi: 企业依靠技术挖潜增效他负责全厂产品质量与技术培训成了厂里的大忙人
the 3 th example.
Recognized pinyin: cai4 zuo4 hao3 le yi1 wan3 qing1 zheng1 wu3 chang1 yu2 yi1 wan3 fan1 jia1 chao3 ji1 dan4 yi1 wan3 zha4 cai4 gan4 zi chao3 rou4 si1
Ground-truth pinyin: cai4 zuo4 hao3 le yi1 wan3 qing1 zheng1 wu3 chang1 yu2 yi1 wan3 fan1 jia1 chao3 ji1 dan4 yi1 wan3 zha4 cai4 gan4 zi chao3 rou4 si1
Ground-truth hanzi: 菜做好了一碗清蒸武昌鱼一碗蕃茄炒鸡蛋一碗榨菜干子炒肉丝
Recognized hanzi: 菜做好了一碗清蒸武昌鱼一碗蕃茄炒鸡蛋一碗榨菜干子炒肉丝
the 4 th example.
Recognized pinyin: ta1 kan4 kan4 ye4 ji3 hen3 shen1 bai2 tian1 de yan2 re4 yi3 gei3 ye4 liang2 chui1 san4 fen1 fu4 da4 jia1 ge4 zi4 an1 xi1 ming2 tian1 ji4 xu4 wan2 le4
Ground-truth pinyin: ta1 kan4 kan4 ye4 ji3 hen3 shen1 bai2 tian1 de yan2 re4 yi3 gei3 ye4 liang2 chui1 san4 fen1 fu4 da4 jia1 ge4 zi4 an1 xi1 ming2 tian1 ji4 xu4 wan2 le4
Ground-truth hanzi: 她看看夜己很深白天的炎热已给夜凉吹散吩咐大家各自安息明天继续玩乐
Recognized hanzi: 她看看夜己很深白天的炎热已给夜凉吹散吩咐大家各自安息明天继续玩乐

The project files can be downloaded from: DeepSpeechRecognition.rar (CSDN deep-learning resources download).

The data ships as a data folder; unzip it into the root directory of the project, and the program can then be run as-is.

The data can be downloaded from: data.rar (deep-learning speech recognition dataset, CSDN download).
