2021-04-27
本文主要是代码整合,主要贡献为参数解释,原理见参考文章。基于 Python 语言,既有调用库函数、约5行即可解决问题的写法,也有一步步手动搭建、约50行的实现。目标是获取39维MFCC特征,即 1维能量 + 12维MFCC + 13维delta + 13维delta-delta。39维MFCC特征用于GMM-HMM语音识别。
代码:获取39维MFCC信号
花了一天时间一步步完整读完MFCC代码后,才发现已有现成的库;只是之前没有理清楚,不知道各个参数的全部含义。
理论及代码参考:
语音识别:https://blog.csdn.net/chinatelecom08
外文:https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
代码:http://fancyerii.github.io/books/mfcc/#deltas%E5%92%8Cdelta-deltas%E7%89%B9%E5%BE%81
原理:https://blog.csdn.net/qq_40168949/article/details/85058227(并不详尽,不懂的可以多看看其它的)
参数意义(自己的理解——既针对自己实现的代码,也适用于库调用):
1、分帧注意:
帧长、帧移、覆盖信息的关系,覆盖信息 = 帧长 - 帧移
2、FFT变换注意事项:
>>mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
#frames为输入,对每一帧进行FFT变换,输出为 NFFT/2 + 1个点(复数)的模值; ——(,NFFT/2 + 1)
#其代表的意义(简述):可表示的最高信号频率 = 采样频率 / 2(奈奎斯特频率);这 NFFT/2 + 1 个点均匀覆盖 0 ~ 采样频率/2 的频率范围,每个点对应一个频率分量的幅值。
3、mel滤波:
>>nfilt = 40
#mel滤波器个数一般取40,输出为 ——(,40)
4、DCT变换
>>num_ceps = 12
#保留的MFCC系数个数;通常取DCT输出的第2~13个系数(即从第2个系数开始取,共12维)
#若取前13个系数,则将第1维替换为能量,其余12维为MFCC系数
5、正弦提升器
>>cep_lifter = 23
#类似于滤波效果,默认常用22或23
6、delta动态特征
>>N = 2
#典型值,默认为2
*注意:在13维MFCC中:第1维能量与其它12维MFCC应采用相同的对数处理 —— y = np.log( x ) 或 y = 20 * np.log10( x )。
调用库:
import scipy.io.wavfile as wav
from python_speech_features import mfcc,delta
import numpy as np
# Path of the WAV file to analyse.
file = "./hmm_gmm/test_data/1_1.wav"
fs, signal = wav.read(file)

# 13 static coefficients per frame. The sample rate is taken from the file
# (fs) instead of being hard-coded, so the 25 ms / 10 ms windows and the mel
# scale stay correct for recordings that are not sampled at 16 kHz.
# NOTE: nfft=512 only covers a full 25 ms window for sample rates up to
# 20480 Hz; raise nfft for files sampled faster than that.
MFCC13 = mfcc(signal, samplerate=fs, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512,
              lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True, winfunc=np.hamming)

# First- and second-order dynamic features, each computed over +/- 2 frames.
delta1 = delta(MFCC13, 2)
delta2 = delta(delta1, 2)

# Stack static, delta and delta-delta features column-wise: 13 + 13 + 13 = 39.
MFCC39 = np.concatenate((MFCC13, delta1, delta2), axis=1)
print(MFCC13.shape)
print(MFCC39.shape)
自己实现:(多方借鉴)
import numpy as np
import scipy.io.wavfile as wav
from scipy.fft import dct
def delta(feat, N=2):
    """Compute delta (regression-slope) features along the frame axis.

    For every frame ``t`` the result is
    ``sum(n * feat[t + n] for n in -N..N) / (2 * sum(n**2 for n in 1..N))``.
    The first and last frames are repeated ``N`` times so that every frame
    has a complete window.

    Args:
        feat: 2-D array-like of shape (num_frames, num_features).
        N: Number of neighbouring frames used on each side; must be >= 1.

    Returns:
        Array with the same shape as ``feat``. Floating-point and complex
        inputs keep their dtype; any other dtype (e.g. integers) produces
        float64 so that fractional deltas are not truncated.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')

    feat = np.asarray(feat)
    # Integer inputs must not dictate the output dtype: the division below
    # yields fractions that an integer array would silently truncate.
    out_dtype = feat.dtype if np.issubdtype(feat.dtype, np.inexact) else np.float64
    delta_feat = np.empty(feat.shape, dtype=out_dtype)

    num_frames = len(feat)
    if num_frames == 0:
        # Edge padding cannot extend an empty axis; there is nothing to do.
        return delta_feat

    denominator = 2 * sum(i ** 2 for i in range(1, N + 1))
    weights = np.arange(-N, N + 1)  # loop-invariant regression weights
    padded = np.pad(feat, ((N, N), (0, 0)), mode='edge')
    for t in range(num_frames):
        # padded[t : t + 2N + 1] is the window centred on original frame t.
        delta_feat[t] = np.dot(weights, padded[t:t + 2 * N + 1]) / denominator
    return delta_feat
def Get_39MFCC(file_path):
    """Extract 39-dimensional MFCC features from a WAV file.

    Pipeline: pre-emphasis -> 25 ms frames with a 10 ms step -> Hamming
    window -> power spectrum -> 40-filter mel filterbank in dB -> per-filter
    mean removal -> DCT-II -> sinusoidal liftering. The first cepstral
    coefficient is then replaced by the frame log-energy, and delta and
    delta-delta features are appended.

    Args:
        file_path: Path of the WAV file to read.

    Returns:
        Array of shape (num_frames, 39): 13 static coefficients followed by
        their 13 delta and 13 delta-delta features.

    Raises:
        ValueError: If the file contains no samples.
    """
    # Read the WAV file.
    sample_rate, signal = wav.read(file_path)
    if signal.ndim > 1:
        # Multi-channel data arrives as (samples, channels); mix it down to
        # mono, otherwise the pre-emphasis step would flatten the channels
        # into one interleaved sequence.
        signal = signal.mean(axis=1)
    if len(signal) == 0:
        raise ValueError('WAV file contains no samples')

    # Pre-emphasis: y[n] = x[n] - 0.97 * x[n - 1], first sample kept as is.
    pre_emphasis = 0.97
    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Framing.
    frame_size = 0.025  # frame length in seconds
    frame_stride = 0.01  # frame step in seconds
    frame_length = int(round(frame_size * sample_rate))  # seconds -> samples
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)
    # Hops needed after the first frame to cover the whole signal. A signal
    # shorter than one frame yields a single zero-padded frame rather than
    # extra frames made only of padding.
    num_hops = int(np.ceil(max(signal_length - frame_length, 0) / frame_step))
    num_frames = num_hops + 1
    # Zero-pad so that the last frame is complete.
    pad_signal_length = num_hops * frame_step + frame_length
    pad_signal = np.append(emphasized_signal, np.zeros(pad_signal_length - signal_length))
    # indices[i, j] is the sample index of element j of frame i.
    frame_offsets = np.arange(num_frames) * frame_step
    indices = frame_offsets[:, None] + np.arange(frame_length)[None, :]
    frames = pad_signal[indices]  # shape (num_frames, frame_length)

    # Hamming window.
    frames *= np.hamming(frame_length)

    # FFT. The transform length must be at least the frame length, because
    # np.fft.rfft crops its input to n samples when n is smaller. Use the
    # next power of two, but never less than 256.
    nfft = max(256, 1 << (frame_length - 1).bit_length())
    mag_frames = np.absolute(np.fft.rfft(frames, nfft))
    pow_frames = (1.0 / nfft) * mag_frames ** 2  # power spectrum
    energy = np.sum(pow_frames, axis=1)  # total energy of each frame
    energy = np.where(energy == 0, np.finfo(float).eps, energy)  # avoid log(0)

    # Mel filterbank: triangular filters equally spaced on the mel scale.
    nfilt = 40
    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # mel -> Hz
    fft_bins = np.floor((nfft + 1) * hz_points / sample_rate)
    fbank = np.zeros((nfilt, nfft // 2 + 1))
    for m in range(1, nfilt + 1):
        left = int(fft_bins[m - 1])
        center = int(fft_bins[m])
        right = int(fft_bins[m + 1])
        for k in range(left, center):  # rising edge
            fbank[m - 1, k] = (k - fft_bins[m - 1]) / (fft_bins[m] - fft_bins[m - 1])
        for k in range(center, right):  # falling edge
            fbank[m - 1, k] = (fft_bins[m + 1] - k) / (fft_bins[m + 1] - fft_bins[m])
    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # avoid log(0)
    filter_banks = 20 * np.log10(filter_banks)  # dB

    # Mean normalisation: subtract each filter's mean over all frames.
    filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)

    # DCT-II; keep the first num_ceps + 1 = 13 coefficients.
    num_ceps = 12
    ceps = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0:(num_ceps + 1)]

    # Sinusoidal liftering.
    ncoeff = ceps.shape[1]
    cep_lifter = 23
    n = np.arange(ncoeff)
    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
    ceps *= lift

    # Replace coefficient 0 with the frame energy, on the same 20*log10
    # scale that was applied to the filterbank outputs.
    ceps[:, 0] = 20 * np.log10(energy)

    # Dynamic features, stacked column-wise: 13 + 13 + 13 = 39.
    delta1 = delta(ceps)
    delta2 = delta(delta1)
    return np.concatenate((ceps, delta1, delta2), axis=1)
# Example run: extract the 39-dimensional features and print their shape.
file = "./hmm_gmm/test_data/1_1.wav"
# The function is defined as Get_39MFCC. The result is bound to a name other
# than `mfcc` so that a library function imported under that name is not
# overwritten.
mfcc39 = Get_39MFCC(file)
print(mfcc39.shape)
更多推荐
所有评论(0)