代码:获取39维MFCC特征

花了一天时间一步步完整读完MFCC的代码后,才发现已经有现成的库可以调用;只是之前没有理清思路,不清楚各个参数的具体含义。

理论及代码参考:

语音识别:https://blog.csdn.net/chinatelecom08

外文:https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html

代码:http://fancyerii.github.io/books/mfcc/#deltas%E5%92%8Cdelta-deltas%E7%89%B9%E5%BE%81

原理:https://blog.csdn.net/qq_40168949/article/details/85058227(并不详尽,不懂的可以多看看其它的)

参数意义(自己的理解——针对自己码也适用于库调用):

1、分帧注意:
帧长、帧移、覆盖信息的关系,覆盖信息 = 帧长 - 帧移

2、FFT变换注意事项:
>>mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
#frames为输入,对每一帧进行FFT变换,输出为 NFFT/2 + 1个点(复数)的模值;  ——(,NFFT/2 + 1)
#其代表的意义(简述):根据奈奎斯特采样定理,能表示的最高信号频率 == 采样频率 / 2;这 NFFT/2 + 1 个点均匀覆盖 0 ~ 采样频率/2 的频率范围,每一个点对应一个频率分量的幅值。

3、mel滤波:
>>nfilt = 40
#mel滤波器个数一般取40,输出为             ——(,40)

4、DCT变换
>>num_ceps = 12
#取多少维度的MFCC信息,取2~13个维度(从第2个维度开始取)
#当取13维度时,第一个维度变为能量,剩下的为12个MFCC维度

5、正弦提升器
>>cep_lifter = 23
#类似于滤波效果,默认常用22或23


6、delta动态特征
>>N = 2
#典型值,默认为2


*注意:在13维MFCC中,第1维(能量)与其余12维MFCC应采用相同的对数处理 ——  y = np.log( x ) 或 y = 20 * np.log10( x )。

调用库:

import scipy.io.wavfile as wav
from python_speech_features import mfcc,delta
import numpy as np

# Extract 39-dimensional features (13 MFCC + 13 delta + 13 delta-delta)
# with python_speech_features.
file = "./hmm_gmm/test_data/1_1.wav"
fs, signal = wav.read(file)  # fs: sample rate in Hz as stored in the WAV header
# Pass the sample rate read from the file instead of a hard-coded 16000, so the
# 25 ms / 10 ms windows and the mel scale match the actual recording.
# NOTE(review): nfft=512 covers the 400-sample frame at 16 kHz; for higher
# sample rates it should be raised to at least winlen * fs -- confirm against
# the python_speech_features documentation.
MFCC13 = mfcc(signal, samplerate=fs, winlen=0.025, winstep=0.01, numcep=13, nfilt=26, nfft=512,
              lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22, appendEnergy=True,
              winfunc=np.hamming)
delta1 = delta(MFCC13, 2)   # first-order dynamic features
delta2 = delta(delta1, 2)   # second-order dynamic features
# Stack static, delta and delta-delta columns side by side.
MFCC39 = np.hstack((MFCC13, delta1, delta2))
print(MFCC13.shape)
print(MFCC39.shape)

自己码:(多方借鉴)

import numpy as np
import scipy.io.wavfile as wav
from scipy.fft import dct

def delta(feat, N=2):
    """Compute delta (time-derivative) features over a window of +/- N frames.

    For every frame t the result is
    ``sum_{n=-N..N} n * feat[t + n] / (2 * sum_{n=1..N} n**2)``,
    where frames before the first / after the last one are replaced by
    copies of the edge frame.

    Args:
        feat: 2-D array-like of shape (num_frames, num_features).
        N: number of frames taken on each side of the current frame; must be >= 1.

    Returns:
        Array with the same shape as ``feat``. Floating-point input keeps its
        dtype; any other dtype (e.g. integers) is promoted to float64 so the
        fractional deltas are not truncated.

    Raises:
        ValueError: if ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError('N must be an integer >= 1')
    feat = np.asarray(feat)
    if not np.issubdtype(feat.dtype, np.inexact):
        # An integer output buffer would silently truncate the deltas.
        feat = feat.astype(np.float64)
    num_frames = len(feat)
    if num_frames == 0:
        # np.pad(mode='edge') cannot extend an empty axis; nothing to compute.
        return np.empty_like(feat)
    denominator = 2 * sum(i ** 2 for i in range(1, N + 1))
    weights = np.arange(-N, N + 1)  # loop-invariant regression weights
    padded = np.pad(feat, ((N, N), (0, 0)), mode='edge')  # repeat edge frames
    delta_feat = np.empty_like(feat)
    for t in range(num_frames):
        # padded[t : t + 2N + 1] is the window centred on original frame t.
        delta_feat[t] = np.dot(weights, padded[t:t + 2 * N + 1]) / denominator
    return delta_feat
def _frame_signal(samples, frame_length, frame_step):
    """Cut a 1-D signal into overlapping frames, zero-padding the last one."""
    signal_length = len(samples)
    # Always one frame; more only for the part extending past the first frame.
    # (Using abs() here would invent all-zero frames for very short signals.)
    remainder = max(0, signal_length - frame_length)
    num_frames = 1 + int(np.ceil(remainder / frame_step))
    padded_length = (num_frames - 1) * frame_step + frame_length
    padded = np.append(samples, np.zeros(padded_length - signal_length))
    frame_starts = np.arange(num_frames) * frame_step
    indices = frame_starts[:, None] + np.arange(frame_length)[None, :]
    return padded[indices]  # shape: (num_frames, frame_length)


def _mel_filterbank(nfilt, nfft, sample_rate):
    """Build nfilt triangular filters equally spaced on the mel scale (0 .. sample_rate / 2)."""
    high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Hz -> mel
    mel_points = np.linspace(0, high_freq_mel, nfilt + 2)         # filter edges and centres
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)             # mel -> Hz
    fft_bins = np.floor((nfft + 1) * hz_points / sample_rate).astype(int)
    fbank = np.zeros((nfilt, nfft // 2 + 1))
    for m in range(1, nfilt + 1):
        left, center, right = fft_bins[m - 1], fft_bins[m], fft_bins[m + 1]
        for k in range(left, center):       # rising slope
            fbank[m - 1, k] = (k - left) / (center - left)
        for k in range(center, right):      # falling slope
            fbank[m - 1, k] = (right - k) / (right - center)
    return fbank


def Get_39MFCC(file_path):
    """Extract 39-dimensional MFCC features from a WAV file.

    Pipeline: pre-emphasis (0.97) -> 25 ms frames with 10 ms step -> Hamming
    window -> power spectrum -> 40 mel filters -> log (20 * log10) -> per-filter
    mean removal -> DCT-II (first 13 coefficients) -> sinusoidal liftering ->
    coefficient 0 replaced by the frame log-energy -> delta and delta-delta.

    Args:
        file_path: path of the WAV file, passed to ``scipy.io.wavfile.read``.

    Returns:
        Array of shape (num_frames, 39): 13 static coefficients, followed by
        their 13 deltas and 13 delta-deltas.
    """
    sample_rate, signal = wav.read(file_path)
    signal = np.asarray(signal, dtype=np.float64)
    if signal.ndim > 1:
        # Multi-channel data is (samples, channels); down-mix to mono instead
        # of letting np.append flatten the channels into one interleaved signal.
        signal = signal.mean(axis=1)

    # Pre-emphasis
    pre_emphasis = 0.97
    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Framing: convert frame length / step from seconds to samples
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    frames = _frame_signal(emphasized_signal, frame_length, frame_step)

    # Hamming window
    frames *= np.hamming(frame_length)

    # FFT and power spectrum. rfft crops its input when n is smaller than the
    # frame, so grow the FFT size (in powers of two, at least 256) until it
    # covers a whole frame (512 for 400-sample frames at 16 kHz).
    NFFT = 256
    while NFFT < frame_length:
        NFFT *= 2
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = (1.0 / NFFT) * mag_frames ** 2
    energy = np.sum(pow_frames, axis=1)  # energy of each frame
    energy = np.where(energy == 0, np.finfo(float).eps, energy)  # avoid log(0)

    # Mel filterbank energies in the log domain
    nfilt = 40
    fbank = _mel_filterbank(nfilt, NFFT, sample_rate)
    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # avoid log(0)
    filter_banks = 20 * np.log10(filter_banks)
    # Subtract the mean of each filter over all frames
    filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)

    # DCT: keep coefficients 0..12 (13 values)
    num_ceps = 12
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0:(num_ceps + 1)]

    # Sinusoidal liftering
    cep_lifter = 23
    n = np.arange(mfcc.shape[1])
    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
    mfcc *= lift
    # Replace coefficient 0 by the frame energy, using the same 20 * log10
    # scaling as the filterbank outputs.
    mfcc[:, 0] = 20 * np.log10(energy)

    # Dynamic features
    delta1 = delta(mfcc)
    delta2 = delta(delta1)
    return np.concatenate((mfcc, delta1, delta2), axis=1)
# Run the hand-written extractor on one recording and print the shape of the
# resulting feature matrix.
file = "./hmm_gmm/test_data/1_1.wav"
mfcc = Get_39MFCC(file)  # the extractor is defined as Get_39MFCC
print(mfcc.shape)

 

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐