赞
踩
采用递归方式读取指定目录下的文件列表
- import os
- def get_filelist(path, list):
- list_dir = os.listdir(path)
- for i in list_dir:
- sub_dir = os.path.join(path, i)
- if os.path.isdir(sub_dir):
- get_filelist(sub_dir, list)
- else:
- list.append(sub_dir)
单通道 (matlab采用audioread实现)
读取音频的方式很多,主要要利用好数据量转换函数np.fromstring或np.frombuffer
- import wave
- import matplotlib.pyplot as plt
- import numpy as np
- import os
-
- filepath = "./data/" #添加路径
- filelist= os.listdir(filepath) #得到文件夹下的所有文件名称
- f = wave.open(filepath+filelist[1],'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes) #读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16) #将字符串转化为int
- waveData = waveData*1.0/(max(abs(waveData))) #wave幅值归一化
- # plot the wave
- time = np.arange(0,nframes)*(1.0 / framerate)
- plt.plot(time,waveData)
- plt.xlabel("Time(s)")
- plt.ylabel("Amplitude")
- plt.title("Single channel wavedata")
- plt.grid('on')#标尺,on:有,off:无。
-
- ##另一种语音读取方式
- f = open(filepath+filelist[1],'rb')
- bufferData = f.read()
- waveData = np.frombuffer(bufferData, dtype=np.int16)
结果图:
多通道
这里通道数为3,主要借助np.reshape一下,其他同单通道处理完全一致
- import wave
- import matplotlib.pyplot as plt
- import numpy as np
- import os
-
- filepath = "./data/" #添加路径
- filelist= os.listdir(filepath) #得到文件夹下的所有文件名称
- f = wave.open(filepath+filelist[0],'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes) #读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16)#将字符串转化为int
- waveData = waveData*1.0/(max(abs(waveData))) #wave幅值归一化
- waveData = np.reshape(waveData,[nframes,nchannels])
- f.close()
- # plot the wave
- time = np.arange(0,nframes)*(1.0 / framerate)
- plt.figure()
- plt.subplot(5,1,1)
- plt.plot(time,waveData[:,0])
- plt.xlabel("Time(s)")
- plt.ylabel("Amplitude")
- plt.title("Ch-1 wavedata")
- plt.grid('on')#标尺,on:有,off:无。
- plt.subplot(5,1,3)
- plt.plot(time,waveData[:,1])
- plt.xlabel("Time(s)")
- plt.ylabel("Amplitude")
- plt.title("Ch-2 wavedata")
- plt.grid('on')#标尺,on:有,off:无。
- plt.subplot(5,1,5)
- plt.plot(time,waveData[:,2])
- plt.xlabel("Time(s)")
- plt.ylabel("Amplitude")
- plt.title("Ch-3 wavedata")
- plt.grid('on')#标尺,on:有,off:无。
- plt.show()
效果图:
单通道为多通道的特例,所以多通道的读取方式对任意通道wav文件都适用。需要注意的是,waveData在reshape之后,与之前的数据结构是不同的。即waveData[0]等价于reshape之前的waveData,但不影响绘图分析,只是在分析频谱时才有必要考虑这一点。
matlab采用audiowrite实现
单通道数据写入:
- import wave
- #import matplotlib.pyplot as plt
- import numpy as np
- import os
- import struct
-
- #wav文件读取
- filepath = "./data/" #添加路径
- filenlist= os.listdir(filepath) #得到文件夹下的所有文件名称
- f = wave.open(filepath+filelist[1],'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes)#读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16)#将字符串转化为int
- waveData = waveData*1.0/(max(abs(waveData)))#wave幅值归一化
- f.close()
- #wav文件写入
- outData = waveData#待写入wav的数据,这里仍然取waveData数据
- outfile = filepath+'out1.wav'
- outwave = wave.open(outfile, 'wb')#定义存储路径以及文件名
- nchannels = 1
- sampwidth = 2
- fs = 8000
- data_size = len(outData)
- framerate = int(fs)
- nframes = data_size
- comptype = "NONE"
- compname = "not compressed"
- outwave.setparams((nchannels, sampwidth, framerate, nframes,
- comptype, compname))
-
- for v in outData:
- outwave.writeframes(struct.pack('h', int(v * 64000 / 2)))#outData:16位,-32767~32767,注意不要溢出
- outwave.close()
多通道数据写入:
多通道的写入与多通道读取类似,多通道读取是将一维数据reshape为二维,多通道的写入是将二维的数据reshape为一维,其实就是一个逆向的过程:
- import wave
- #import matplotlib.pyplot as plt
- import numpy as np
- import os
- import struct
-
- #wav文件读取
- filepath = "./data/" #添加路径
- filelist= os.listdir(filepath) #得到文件夹下的所有文件名称
- f = wave.open(filepath+filelist[0],'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes)#读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16)#将字符串转化为int
- waveData = waveData*1.0/(max(abs(waveData)))#wave幅值归一化
- waveData = np.reshape(waveData,[nframes,nchannels])
- f.close()
- #wav文件写入
- outData = waveData#待写入wav的数据,这里仍然取waveData数据
- outData = np.reshape(outData,[nframes*nchannels,1])
- outfile = filepath+'out2.wav'
- outwave = wave.open(outfile, 'wb')#定义存储路径以及文件名
- nchannels = 3
- sampwidth = 2
- fs = 8000
- data_size = len(outData)
- framerate = int(fs)
- nframes = data_size
- comptype = "NONE"
- compname = "not compressed"
- outwave.setparams((nchannels, sampwidth, framerate, nframes,
- comptype, compname))
-
- for v in outData:
- outwave.writeframes(struct.pack('h', int(v * 64000 / 2)))#outData:16位,-32767~32767,注意不要溢出
- outwave.close()
通常对信号截断、分帧需要加窗,因为截断都有频域能量泄露,而窗函数可以减少截断带来的影响。
窗函数在scipy.signal信号处理工具箱中,如hamming窗:
- import pylab as pl
- import scipy.signal as signal
- pl.figure(figsize=(6,2))
- pl.plot(signal.hanning(512))
信号分帧的理论依据,其中x是语音信号,w是窗函数:
加窗截断类似采样,为了保证相邻帧不至于差别过大,通常帧与帧之间有帧移,其实就是插值平滑的作用。
给出示意图:
这是没有加窗的示例:
- import numpy as np
- import wave
- import os
- #import math
-
- def enframe(signal, nw, inc):
- '''将音频信号转化为帧。
- 参数含义:
- signal:原始音频型号
- nw:每一帧的长度(这里指采样点的长度,即采样频率乘以时间间隔)
- inc:相邻帧的间隔(同上定义)
- '''
- signal_length=len(signal) #信号总长度
- if signal_length<=nw: #若信号长度小于一个帧的长度,则帧数定义为1
- nf=1
- else: #否则,计算帧的总长度
- nf=int(np.ceil((1.0*signal_length-nw+inc)/inc))
- pad_length=int((nf-1)*inc+nw) #所有帧加起来总的铺平后的长度
- zeros=np.zeros((pad_length-signal_length,)) #不够的长度使用0填补,类似于FFT中的扩充数组操作
- pad_signal=np.concatenate((signal,zeros)) #填补后的信号记为pad_signal
- indices=np.tile(np.arange(0,nw),(nf,1))+np.tile(np.arange(0,nf*inc,inc),(nw,1)).T #相当于对所有帧的时间点进行抽取,得到nf*nw长度的矩阵
- indices=np.array(indices,dtype=np.int32) #将indices转化为矩阵
- frames=pad_signal[indices] #得到帧信号
- # win=np.tile(winfunc(nw),(nf,1)) #window窗函数,这里默认取1
- # return frames*win #返回帧信号矩阵
- return frames
- def wavread(filename):
- f = wave.open(filename,'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes)#读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16)#将字符串转化为int
- f.close()
- waveData = waveData*1.0/(max(abs(waveData)))#wave幅值归一化
- waveData = np.reshape(waveData,[nframes,nchannels]).T
- return waveData
-
- filepath = "./data/" #添加路径
- dirname= os.listdir(filepath) #得到文件夹下的所有文件名称
- filename = filepath+dirname[0]
- data = wavread(filename)
- nw = 512
- inc = 128
- Frame = enframe(data[0], nw, inc)
如果需要加窗,只需要将函数修改为:
- def enframe(signal, nw, inc, winfunc):
- '''将音频信号转化为帧。
- 参数含义:
- signal:原始音频型号
- nw:每一帧的长度(这里指采样点的长度,即采样频率乘以时间间隔)
- inc:相邻帧的间隔(同上定义)
- '''
- signal_length=len(signal) #信号总长度
- if signal_length<=nw: #若信号长度小于一个帧的长度,则帧数定义为1
- nf=1
- else: #否则,计算帧的总长度
- nf=int(np.ceil((1.0*signal_length-nw+inc)/inc))
- pad_length=int((nf-1)*inc+nw) #所有帧加起来总的铺平后的长度
- zeros=np.zeros((pad_length-signal_length,)) #不够的长度使用0填补,类似于FFT中的扩充数组操作
- pad_signal=np.concatenate((signal,zeros)) #填补后的信号记为pad_signal
- indices=np.tile(np.arange(0,nw),(nf,1))+np.tile(np.arange(0,nf*inc,inc),(nw,1)).T #相当于对所有帧的时间点进行抽取,得到nf*nw长度的矩阵
- indices=np.array(indices,dtype=np.int32) #将indices转化为矩阵
- frames=pad_signal[indices] #得到帧信号
- win=np.tile(winfunc,(nf,1)) #window窗函数,这里默认取1
- return frames*win #返回帧信号矩阵
其中窗函数,以hamming窗为例:
- winfunc = signal.hamming(nw)
- Frame = enframe(data[0], nw, inc, winfunc)
调用即可。
其实得到了分帧信号,频域变换取幅值,就可以得到语谱图,如果仅仅是观察,matplotlib.pyplot有specgram指令:
- import wave
- import matplotlib.pyplot as plt
- import numpy as np
- import os
-
- filepath = "./data/" #添加路径
- filename= os.listdir(filepath) #得到文件夹下的所有文件名称
- f = wave.open(filepath+filename[0],'rb')
- params = f.getparams()
- nchannels, sampwidth, framerate, nframes = params[:4]
- strData = f.readframes(nframes)#读取音频,字符串格式
- waveData = np.fromstring(strData,dtype=np.int16)#将字符串转化为int
- waveData = waveData*1.0/(max(abs(waveData)))#wave幅值归一化
- waveData = np.reshape(waveData,[nframes,nchannels]).T
- f.close()
- # plot the wave
- plt.specgram(waveData[0],Fs = framerate, scale_by_freq = True, sides = 'default')
- plt.ylabel('Frequency(Hz)')
- plt.xlabel('Time(s)')
- plt.show()
- def stft(signal, window, win_size, hop_size, last_sample=False):
- """Convert time-domain signal to time-frequency domain.
- Args:
- signal : multi-channel time-domain signal
- window : window function, see cola_hamming as an example.
- win_size : window size (number of samples)
- hop_size : hop size (number of samples)
- last_sample : include last sample, by default (due to legacy bug),
- the last sample is not included.
- Returns:
- tf : multi-channel time-frequency domain signal.
- """
- assert signal.ndim == 2
- w = window(win_size, hop_size)
- return np.array([[
- np.fft.fft(c[t:t + win_size] * w)
- for t in range(0,
- len(c) - win_size + (1 if last_sample else 0), hop_size)
- ] for c in signal])
-
-
- def istft(tf, hop_size):
- """Inverse STFT
- Args:
- tf : multi-channel time-frequency domain signal.
- hop_size : hop size (number of samples)
- Returns:
- signal : multi-channel time-domain signal
- """
- tf = np.asarray(tf)
- nch, nframe, nfbin = tf.shape
- signal = np.zeros((nch, (nframe - 1) * hop_size + nfbin))
- for t in range(nframe):
- signal[:, t*hop_size:t*hop_size+nfbin] += \
- np.real(np.fft.ifft(tf[:, t]))
- return signal
- def steering_vector(delay, win_size=0, fbins=None, fs=None):
- """Compute the steering vector.
- One and only one of the conditions are true:
- - win_size != 0
- - fbins is not None
- Args:
- delay : delay of each channel (see compute_delay),
- unit is second if fs is not None, otherwise sample
- win_size : (default 0) window (FFT) size. If zero, use fbins.
- fbins : (default None) center of frequency bins, as discrete value.
- fs : (default None) sample rate
- Returns:
- stv : steering vector, indices (cf)
- """
- assert (win_size != 0) != (fbins is not None)
- delay = np.asarray(delay)
- if fs is not None:
- delay *= fs # to discrete-time value
- if fbins is None:
- fbins = np.fft.fftfreq(win_size)
- return np.exp(-2j * math.pi * np.outer(delay, fbins))
- def compute_delay(m_pos, doa, c=340, fs=None):
- """Compute delay of signal arrival at microphones.
- Args:
- m_pos : microphone positions, (M,3) array,
- M is number of microphones.
- doa : direction of arrival, (3,) array or (N,3) array,
- N is the number of sources.
- c : (default 340) speed of sound (m/s).
- fs : (default None) sample rate.
- Return:
- delay : delay with reference of arrival at first microphone.
- first element is always 0.
- unit is second if fs is None, otherwise sample.
- """
- m_pos = np.asarray(m_pos)
- doa = np.asarray(doa)
-
- # relative position wrt first microphone
- r_pos = m_pos - m_pos[0]
-
- # inner product -> different in time
- if doa.ndim == 1:
- doa /= np.sqrt(np.sum(doa**2.0)) # normalize
- diff = -np.einsum('ij,j->i', r_pos, doa) / c
- else:
- assert doa.ndim == 2
- doa /= np.sqrt(np.sum(doa**2.0, axis=1, keepdims=True)) # normalize
- diff = -np.einsum('ij,kj->ki', r_pos, doa) / c
-
- if fs is not None:
- return diff * fs
- else:
- return diff
- def cov_matrix(tf):
- """Covariance matrix of the multi-channel signal.
- Args:
- tf : multi-channel time-frequency domain signal.
- Returns:
- cov : covariance matrix, indexed by (ccf)
- """
- nch, nframe, nfbin = tf.shape
- return np.einsum('itf,jtf->ijf', tf, tf.conj()) / float(nframe)
11. VAD操作
- def vad_by_threshold(fs, sig, vadrate, threshold_db, neighbor_size=0):
- """Voice Activity Detection by threshold
- Args:
- fs : sample rate.
- signal : multi-channel time-domain signal.
- vadrate : output vad rate
- threshold_db : threshold in decibel
- neighbor_size : half size of (excluding center) neighbor area
- Returns:
- vad : VAD label (0: silence, 1: active)
- """
- nch, nsamples = sig.shape
- nframes = nsamples * vadrate / fs
- fpower = np.zeros((nch, nframes)) # power at frame level
- for i in range(nframes):
- fpower[:, i] = power(sig[:,
- (i * fs / vadrate):((i + 1) * fs / vadrate)])
-
- # average power in neighbor area
- if neighbor_size == 0:
- apower = fpower
- else:
- apower = np.zeros((nch, nframes))
- for i in range(nframes):
- apower[:, i] = np.mean(
- fpower[:,
- max(0, i - neighbor_size):min(nframes, i +
- neighbor_size + 1)],
- axis=1)
- return (apower > 10.0**(threshold_db / 10.0)).astype(int)
参考资料
https://www.cnblogs.com/xingshansi/p/6799994.html
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。