赞
踩
"""Prepare character-level transcripts and a vocabulary file from speech corpora.

Run as a script, this module:

1. walks ``source_file`` for ``.wav``/``.WAV`` files and reads the first line
   of each ``<wav path>.trn`` label file;
2. writes ``ss.txt`` with one ``<utterance id> <c1> <c2> ...`` line per audio
   file, where the characters are the label line with all whitespace removed;
3. reads ``./aishell_transcript_v0.8.txt`` and writes every distinct
   transcript character (first-seen order), followed by ``_``, to
   ``./sss.txt``, one entry per line.
"""
import os

# Raw string: the value is unchanged, but the backslashes no longer form
# invalid escape sequences (``\P``, ``\A``, ``\d``), which newer interpreters
# warn about.
source_file = r'D:\PythonTest\ASR_work\data_thchs30\data'


def source_get(source_file):
    """Collect audio files below *source_file* and their label-file paths.

    Args:
        source_file: Root directory that is searched recursively.

    Returns:
        A ``(label_lst, wav_lst)`` tuple of parallel lists. ``wav_lst`` holds
        the path of every file whose name ends in ``.wav`` or ``.WAV``;
        ``label_lst`` holds the same paths with ``.trn`` appended. The label
        paths are only derived here, not checked for existence.
    """
    label_lst = []
    wav_lst = []
    for root, _dirs, files in os.walk(source_file):
        for name in files:
            if name.endswith(('.wav', '.WAV')):
                wav_file = os.path.join(root, name)
                wav_lst.append(wav_file)
                label_lst.append(wav_file + '.trn')
    return label_lst, wav_lst


def read_label(label_file):
    """Return the first line of *label_file*, including its trailing newline.

    Args:
        label_file: Path of a UTF-8 encoded label file.

    Returns:
        The first line of the file exactly as read.

    Raises:
        IndexError: If the file is empty. The exception type matches what
            indexing an empty ``readlines()`` result used to raise, but the
            message now names the offending file.
        OSError: If the file cannot be opened.
    """
    with open(label_file, 'r', encoding='utf8') as f:
        # Only the first line is needed, so do not load the whole file.
        first_line = f.readline()
    if not first_line:
        raise IndexError('label file is empty: {}'.format(label_file))
    return first_line


def gen_label_data(label_lst):
    """Read the first line of every label file, stripped of newlines.

    Args:
        label_lst: Iterable of label-file paths.

    Returns:
        A list with one string per label file, in input order.
    """
    return [read_label(label_file).strip('\n') for label_file in label_lst]


def _utterance_id(wav_path):
    """Return the file name of *wav_path* up to its first dot."""
    # os.path.basename honours the platform's separators; splitting on a
    # literal backslash only worked on Windows and left the directory part in
    # the id everywhere else.
    return os.path.basename(wav_path).split('.')[0]


def _spaced_chars(text):
    """Drop all whitespace from *text* and separate its characters by spaces."""
    return ' '.join(''.join(text.split()))


def _write_transcripts(wav_lst, label_data, out_path):
    """Write one ``<utterance id> <spaced characters>`` line per audio file."""
    with open(out_path, 'w', encoding='utf-8') as f:
        for wav_path, label in zip(wav_lst, label_data):
            f.write(_utterance_id(wav_path) + ' ' + _spaced_chars(label) + '\n')


def _build_vocab(lines):
    """Return the distinct transcript characters of *lines* plus a final ``_``.

    Each line is split on whitespace and its first field is skipped (it is
    treated as the utterance id); the remaining fields contribute their
    characters in first-seen order.
    """
    vocab = []
    seen = set()  # O(1) membership test; ``vocab`` keeps the order
    for line in lines:
        # Iterating the joined string directly means blank or id-only lines
        # contribute nothing, instead of an empty-string vocabulary entry.
        for char in ''.join(line.split()[1:]):
            if char not in seen:
                seen.add(char)
                vocab.append(char)
    # '_' is always appended last; presumably a blank/padding symbol for the
    # downstream model -- confirm against the consumer of sss.txt.
    vocab.append('_')
    return vocab


def main(source_dir=source_file, transcript_out='ss.txt',
         vocab_source='./aishell_transcript_v0.8.txt', vocab_out='./sss.txt'):
    """Generate the transcript file and the vocabulary file.

    Args:
        source_dir: Directory searched for audio and label files.
        transcript_out: Output path of the per-utterance transcript file.
        vocab_source: Transcript file the vocabulary is built from.
        vocab_out: Output path of the vocabulary file.
    """
    label_lst, wav_lst = source_get(source_dir)
    label_data = gen_label_data(label_lst)
    print(label_data[0:2])
    print(wav_lst[0:2])

    _write_transcripts(wav_lst, label_data, transcript_out)

    with open(vocab_source, 'r', encoding='utf-8') as f:
        vocab = _build_vocab(f)

    with open(vocab_out, 'w', encoding='utf-8') as fr:
        fr.writelines(token + '\n' for token in vocab)


if __name__ == '__main__':
    main()
赞
踩
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。