MFCC-DTW

Simple MFCC extractor and an speech recognition algorithm (Dynamic Time Warping)

一个MFCC参数提取模板,和语音识别算法(DTW)


main.cpp里有详细的用法,提取原理请参考其他博客。识别算法介绍请参考其他博客。抽象成一个类后,我们不必关心内部的实现方法,直接调用函数提取mfcc参数即可。


Github地址:https://github.com/Linzecong/MFCC-DTW



WaveStructure.h

#ifndef WAVESTRUCTURE_H
#define WAVESTRUCTURE_H

#include<vector>
#include<complex>
#include<cmath>
#include<string>
#include<iostream>
#include<stdlib.h>
#include<malloc.h>
#include<bitset>
using   namespace   std;
#define INF 1000000000000
const float PI=3.1415926536;
const int FS=8;
const int FiltNum=25;

struct RIFF_HEADER{
   char szRiffID[4];  // 'R','I','F','F'
   unsigned long dwRiffSize;
   char szRiffFormat[4]; // 'W','A','V','E'
};

struct FMT_HEADER{
   char  szFmtID[4]; // 'f','m','t',' '
   unsigned long  dwFmtSize; // 18 OR 16
};

struct DATA_BLOCK{
    char szDataID[4]; // 'd','a','t','a'
    unsigned long dwDataSize;
};

struct WAVE_FORMAT{
   unsigned short  wFormatTag;
   unsigned short  wChannels;
   unsigned long dwSamplesPerSec;
   unsigned long dwAvgBytesPerSec;
   unsigned short  wBlockAlign;
   unsigned short  wBitsPerSample;
   unsigned short  wAppendData;
};

struct mTWavHeader{
        int rId;    //标志符(RIFF)
        int rLen;   //数据大小,包括数据头的大小和音频文件的大小
        int wId;    //格式类型("WAVE")
        int fId;    //"fmt"
        int fLen;   //Sizeof(WAVEFORMATEX)
        short wFormatTag;       //编码格式,包括WAVE_FORMAT_PCM,WAVEFORMAT_ADPCM等
        short nChannels;        //声道数,单声道为1,双声道为2
        int nSamplesPerSec;   //采样频率
        int nAvgBytesPerSec;  //每秒的数据量
        short nBlockAlign;      //块对齐
        short wBitsPerSample;   //WAVE文件的采样大小
        int dId;              //"data"
        int wSampleLength;    //音频数据的大小
};

#endif // WAVESTRUCTURE_H


WaveFunction.h


#ifndef WAVEFUNCTION_H
#define WAVEFUNCTION_H
#include"WaveStructure.h"

class WaveFunction{
private:
    int CharCmp(const char *first,const char *second,unsigned short len);
    void InitHamming();

    //设置滤波器参数
    //输入参数:无
    //输出参数:FiltCoe1,三角形滤波器左边的系数,FiltCoe2三角形滤波器右边的系数,Num决定每个点属于哪一个滤波器
    void InitFilt(float *FiltCoe1, float *FiltCoe2, int *Num);

    //加窗,输入为buf,输出为data
    void HammingWindow(short* buf,float* data);

    //计算傅里叶参数
    void ComputeFFT(float *buffer,vector<complex<float> >& vecList);

    //傅里叶变换
    void FFT(const unsigned long & ulN, vector<complex<float> >& vecList);

    /*
    根据滤波器参数计算频带能量
    输入参数:*spdata  ---预处理之后的一帧语音信号
              *FiltCoe1---三角形滤波器左边的系数
              *FiltCoe2---三角形滤波器右边的系数
              *Num     ---决定每个点属于哪一个滤波器
    输出参数:*En      ---输出对数频带能量
    */
    void Filt(float *spdata, float *FiltCoe1, float *FiltCoe2, int *Num, float *En,vector<complex<float> >& vecList);

    /*
    计算MFCC系数
    输入参数:*En ---对数频带能量
    */
    void MFCC(float *En);


    float ComputeDTW(float *cep1, float *cep2, int num1, int num2);
    float Distance(float * ps1,float * ps2,int k1,int k2);

    void AdjustSize();

private:
    vector<float> xishu;
    double *Hamming;
    vector<vector<float> > SourceMFCCs;
    int MFCC_P;
    int MFCC_Pf;
    int FrmLen;
    int FFTLen;

public:
    vector<vector<float> > getMFCCs(string filename);
    vector<vector<float> > addFirstOrderDifference(vector<vector<float> > mfccs);
    vector<vector<float> > addOrderDifference(vector<vector<float> > mfccs);

    //输入为两个mfcc参数,cep1,cep2,返回最短距离
    float ComputeDTW(vector<vector<float> > cep1,vector<vector<float> > cep2);
    WaveFunction(int frm_len, int mfcc_num);
    ~WaveFunction();
};

#endif // WAVEFUNCTION_H


WaveFunction.cpp

#include "WaveFunction.h"

WaveFunction::WaveFunction(int frm_len, int mfcc_num){
    MFCC_P=mfcc_num;
    MFCC_Pf=float(mfcc_num);
    FrmLen=frm_len;
    FFTLen=frm_len;
    Hamming=new double[FrmLen];
}

WaveFunction::~WaveFunction(){
    delete []Hamming;
}

vector<vector<float> > WaveFunction::getMFCCs(string filename){
    xishu.clear();
    SourceMFCCs.clear();
    //mfcc分析
    mTWavHeader waveheader;
    FILE *sourcefile;
    short buffer[FrmLen];
    float data[FrmLen];
    float FiltCoe1[FFTLen/2+1];  //左系数
    float FiltCoe2[FFTLen/2+1];  //右系数
    int Num[FFTLen/2+1];     //决定每个点属于哪一个滤波器
    float En[FiltNum+1];         //频带能量
    vector<complex<float> > vecList;

    sourcefile=fopen(filename.c_str(),"rb");
    fread(&waveheader,sizeof(mTWavHeader),1,sourcefile);
    InitHamming();//初始化汉明窗
    InitFilt(FiltCoe1,FiltCoe2,Num); //初始化MEL滤波系数

    while(fread(buffer,sizeof(short),FrmLen,sourcefile)==FrmLen){
        HammingWindow(buffer,data);
        ComputeFFT(data,vecList);
        Filt(data, FiltCoe1, FiltCoe2, Num, En,vecList);
        MFCC(En);
        vecList.clear();
        fseek(sourcefile, -FrmLen/2, SEEK_CUR);//考虑到帧移,每次移动半帧
    }

    int stdlength=xishu.size();

    for(int i=0;i<stdlength/MFCC_P;i++){
        vector<float> temp;
        for(int j=0;j<MFCC_P;j++)
            temp.push_back(xishu[i*MFCC_P+j]);
        SourceMFCCs.push_back(temp);
    }
    fclose(sourcefile);
    return SourceMFCCs;
}

vector<vector<float> > addFirstOrderDifference(vector<vector<float> > mfccs){
    vector<vector<float> > temp;
    for(int i=0;i<mfccs.size();i++){
        vector<float> line=mfccs[i];
        int size=line.size();
        for(int t=0;t<size;t++){
            if(t<2)
                line.push_back(line[t+1]-line[t]);
            else{
                if(t>size-2||t==size-2)
                    line.push_back(line[t]-line[t-1]);
                else{
                    float fenzi=line[t+1]-line[t-1]+2*(line[t+2]-line[t-2]);
                    float fenmu=sqrtf(10);
                    line.push_back(fenzi/fenmu);

                }
            }
        }
        temp.push_back(line);
    }
    return temp;
}

vector<vector<float> > addOrderDifference(vector<vector<float> > mfccs){
    vector<vector<float> > temp;
    for(int i=0;i<mfccs.size();i++){
        vector<float> line=mfccs[i];
        int size=line.size();
        //一阶差分
        for(int t=0;t<size;t++){
            if(t<2)
                line.push_back(line[t+1]-line[t]);
            else{
                if(t>size-2||t==size-2)
                    line.push_back(line[t]-line[t-1]);
                else{
                    float fenzi=line[t+1]-line[t-1]+2*(line[t+2]-line[t-2]);
                    float fenmu=sqrtf(10);
                    line.push_back(fenzi/fenmu);

                }
            }
        }
        //二阶差分
        for(int t=size;t<size*2;t++){
            if(t<2)
                line.push_back(line[t+1]-line[t]);
            else{
                if(t>size-2||t==size-2)
                    line.push_back(line[t]-line[t-1]);
                else{
                    float fenzi=line[t+1]-line[t-1]+2*(line[t+2]-line[t-2]);
                    float fenmu=sqrtf(10);
                    line.push_back(fenzi/fenmu);

                }
            }
        }
        temp.push_back(line);
    }
    return temp;
}

float WaveFunction::ComputeDTW(vector<vector<float> > cep1, vector<vector<float> > cep2)
{
    vector<float> temp;
    for(int i=0;i<cep1.size();i++)
        for(int j=0;j<cep1[i].size();j++)
            temp.push_back(cep1[i][j]);
    int stdlength=temp.size();
    float * stdmfcc = new float[stdlength];
    std::copy(temp.begin(),temp.end(),stdmfcc);

    vector<float> temp1;
    for(int i=0;i<cep2.size();i++)
        for(int j=0;j<cep2[i].size();j++)
            temp1.push_back(cep2[i][j]);
    int testlen=temp1.size();
    float * testmfcc = new float[testlen];
    std::copy(temp1.begin(),temp1.end(),testmfcc);
    return ComputeDTW(stdmfcc,testmfcc,stdlength/MFCC_P,testlen/MFCC_P);
}

int WaveFunction::CharCmp(const char *first,const char *second,unsigned short len)
{
    int i=0;
    while((first[i]==second[i])&&(i++<len));
    if(i>=len)
        return 0;
    else if(first[i-1]>second[i-1])
        return 1;
    else
        return -1;
}

void WaveFunction::InitHamming(){

    float twopi;
    int i;
    twopi=8.0F*atan(1.0F);
    for(i=0;i<FrmLen;i++)
        Hamming[i]=(float)(0.54-0.46*cos((float)i*twopi/(float)(FrmLen-1)));
}

void WaveFunction::InitFilt(float *FiltCoe1, float *FiltCoe2, int *Num){
    int i,j;
    float Freq;
    int FiltFreq[FiltNum+1] = {0,100,200,300,400,500,600,700,800,900,1000,
                               1149,1320,1516,1741,2000,2297,2639,3031,3482,4000,
                               4595,5278,6063,6964,8001};//滤波器的中心频率
    int BW[FiltNum+1]={100,100,100,100,100,100,100,100,100,100,124,
                       160,184,211,242,278,320,367,422,484,556,
                       639,734,843,969,1112};//滤波器的带宽
    for(i = 0 ; i<= FFTLen/2 ; i++ )
    {
        Num[i]=0;
    }

    for(i = 0 ; i <= FFTLen/2 ; i++)
    {
        Freq = FS * 1000.0F * (float)(i) / (float)(FFTLen);
        for(j = 0 ; j <FiltNum ; j++)
        {
            if(Freq >= (float)FiltFreq[j] && Freq <= (float)FiltFreq[j+1])
            {
                Num[i] = j;
                if(j == 0)
                {
                    FiltCoe1[i] = 0.0F;
                }
                else
                {
                    FiltCoe1[i] = ((float)(FiltFreq[j]+BW[j])-Freq) / (float)(BW[j]);
                }
                FiltCoe2[i] = (Freq-(float)(FiltFreq[j+1]-BW[j+1])) / (float)(BW[j+1]);
                FiltCoe1[i] = FiltCoe1[i] * FiltCoe1[i];
                FiltCoe2[i] = FiltCoe2[i] * FiltCoe2[i];
                break;
            }
        }
    }

}

void WaveFunction::HammingWindow(short *buf, float *data){
    int i;
    for(i=0;i<FrmLen;i++)
        data[i]=buf[i]*Hamming[i];
}

void WaveFunction::ComputeFFT(float *data, vector<complex<float> > &vecList){
    for(int i=0;i<FFTLen;++i)
    {
        if(i<FrmLen)
        {
            complex<float> temp(data[i]);
            vecList.push_back(temp);
        }
        else
        {
            complex<float> temp(0);
            vecList.push_back(temp);
        }
    }
    FFT(FFTLen,vecList);
}

void WaveFunction::FFT(const unsigned long &ulN, vector<complex<float> > &vecList){
    //得到幂数

    unsigned long ulPower = 0; //幂数
    unsigned long ulN1 = ulN - 1;
    while(ulN1 > 0)
    {
        ulPower++;
        ulN1 /= 2;
    }
    //反序

    bitset<sizeof(unsigned long) * 8> bsIndex; //二进制容器
    unsigned long ulIndex; //反转后的序号
    unsigned long ulK;
    for(unsigned long p = 0; p < ulN; p++)
    {
        ulIndex = 0;
        ulK = 1;
        bsIndex = bitset<sizeof(unsigned long) * 8>(p);
        for(unsigned long j = 0; j < ulPower; j++)
        {
            ulIndex += bsIndex.test(ulPower - j - 1) ? ulK : 0;
            ulK *= 2;
        }

        if(ulIndex > p)
        {
            complex<float> c = vecList[p];
            vecList[p] = vecList[ulIndex];
            vecList[ulIndex] = c;
        }
    }

    //计算旋转因子

    vector<complex<float> > vecW;
    for(unsigned long i = 0; i < ulN / 2; i++)
    {
        vecW.push_back(complex<float>(cos(2 * i * PI / ulN) , -1 * sin(2 * i * PI / ulN)));
    }

    /*for(unsigned long m = 0; m < ulN / 2; m++)
    {
        cout<< "\nvW[" << m << "]=" << vecW[m];
    } */

    //计算FFT

    unsigned long ulGroupLength = 1; //段的长度
    unsigned long ulHalfLength = 0; //段长度的一半
    unsigned long ulGroupCount = 0; //段的数量
    complex<float> cw; //WH(x)
    complex<float> c1; //G(x) + WH(x)
    complex<float> c2; //G(x) - WH(x)
    for(unsigned long b = 0; b < ulPower; b++)
    {
        ulHalfLength = ulGroupLength;
        ulGroupLength *= 2;
        for(unsigned long j = 0; j < ulN; j += ulGroupLength)
        {
            for(unsigned long k = 0; k < ulHalfLength; k++)
            {
                cw = vecW[k * ulN / ulGroupLength] * vecList[j + k + ulHalfLength];
                c1 = vecList[j + k] + cw;
                c2 = vecList[j + k] - cw;
                vecList[j + k] = c1;
                vecList[j + k + ulHalfLength] = c2;
            }
        }
    }
}

void WaveFunction::Filt(float *spdata, float *FiltCoe1, float *FiltCoe2, int *Num, float *En, vector<complex<float> > &vecList){
    float temp=0;
    int id, id1, id2;

    for(id = 0 ; id <= FiltNum ; id++)
    {
        En[id]=0.0F;
    }
    for(id = 0 ; id < FFTLen/2 ; id++)
    {
        temp = vecList[id].real()*vecList[id].real()+vecList[id].imag()*vecList[id].imag();
        id1 = Num[id];
        id2 = id1+1;
        En[id1] = En[id1] + FiltCoe1[id] * temp;
        En[id2] = En[id2] + FiltCoe2[id] * temp;
    }
    for(id = 1 ; id <= FiltNum ; id++)
    {
        if(En[id]!=0)
            En[id]=(float)log(En[id]);
    }
}

void WaveFunction::MFCC(float *En)
{
    int idcep, iden;
    float Cep[MFCC_P];

    for(idcep = 0 ; idcep < MFCC_P ; idcep++)
    {
        Cep[idcep] = 0.0;

        for(iden = 1 ; iden <= FiltNum ; iden++)
        {
            Cep[idcep] = Cep[idcep] + En[iden] * (float)cos((idcep+1) * (iden-0.5F) * PI/(FiltNum));
        }
        Cep[idcep] = Cep[idcep] / 10.0F;
        xishu.push_back(Cep[idcep]);
    }
}


float WaveFunction::ComputeDTW(float *cep1, float *cep2, int num1, int num2){
    struct record
    {		int x;
                int y;
    };
    struct point
    {		int x,y;
                float minvalue;
                        int stepnum;
                                bool recheck;               //记录该点是否被记录过
    };
    record * re;
    record * newre;

    newre=new record[num1*num2];    //记录下一层的所有点
    re=new record[num1*num2];       //记录当层的所有点
    int renum;
    int newrenum=0;
    int i,j;
    point * poi;
    poi=new point[num1*num2];

    for(i=0;i<num1*num2;i++)
    {
        poi[i].recheck=0;
        poi[i].minvalue=INF;
        poi[i].stepnum=0;
    }								//设置初始值

    for(i=0;i<5;i++)                //起始点
    {
        if(i==0)  {	re[i].x=1; re[i].y=1; }
        if(i==1)  {	re[i].x=1; re[i].y=2; }
        if(i==2)  {	re[i].x=1; re[i].y=3; }
        if(i==3)  {	re[i].x=2; re[i].y=1; }
        if(i==4)  {	re[i].x=3; re[i].y=1; }
        poi[(re[i].y-1)*num1+re[i].x-1].minvalue=Distance(cep1,cep2,re[i].x,re[i].y);
        poi[(re[i].y-1)*num1+re[i].x-1].stepnum=1;
    }
    renum=5;
    int newx,newy;                   //newvalue;
    for(i=0;i<renum;i++)
    {
        for(j=0;j<3;j++)
        {
            if(j==0){ newx=re[i].x+1; newy=re[i].y+2; }
            if(j==1){ newx=re[i].x+1; newy=re[i].y+1; }
            if(j==2){ newx=re[i].x+2; newy=re[i].y+1; }

            /三种可能路径

            if(newx>=num1||newy>=num2)
                continue;
            if(fabs(newx-newy)<=fabs(num1-num2)+3)
            {
                if(poi[(newy-1)*num1+newx-1].recheck==0)
                {
                    newre[newrenum].x=newx;
                    newre[newrenum].y=newy;
                    newrenum++;
                }
                float tmpdis;
                int addstepnum;
                if(j==0){ tmpdis=Distance(cep1,cep2,newx-1,newy-1)*2+Distance(cep1,cep2,newx,newy); addstepnum=2;}
                if(j==1){ tmpdis=Distance(cep1,cep2,newx,newy)*2; addstepnum=1;}
                if(j==2){ tmpdis=Distance(cep1,cep2,newx-1,newy-1)*2+Distance(cep1,cep2,newx,newy); addstepnum=2;}
                if(poi[(newy-1)*num1+newx-1].minvalue>(poi[(re[i].y-1)*num1+re[i].x-1].minvalue+tmpdis))
                {
                    poi[(newy-1)*num1+newx-1].minvalue=(poi[(re[i].y-1)*num1+re[i].x-1].minvalue+tmpdis);
                    poi[(newy-1)*num1+newx-1].stepnum=poi[(re[i].y-1)*num1+re[i].x-1].stepnum+addstepnum;
                }
                if(poi[(newy-1)*num1+newx-1].recheck==0)
                    poi[(newy-1)*num1+newx-1].recheck=1;
            }
        }
        if(newrenum!=0 && i>=(renum-1))
        {
            renum=newrenum;
            newrenum=0;
            struct	record * tt;
            tt=re;
            re=newre;
            newre=tt;
            i=-1;
        }
    }
    float min=INF;
    for(j=0;j<renum;j++)
    {
        if((poi[(re[j].y-1)*num1+re[j].x-1].minvalue)/poi[(re[j].y-1)*num1+re[j].x-1].stepnum<min)
            min=(poi[(re[j].y-1)*num1+re[j].x-1].minvalue)/poi[(re[j].y-1)*num1+re[j].x-1].stepnum;
    }

    //	min;
    delete []poi;
    delete []newre;
    delete []re;
    delete []cep1;
    delete []cep2;
    return min;
}

float WaveFunction::Distance(float *ps1, float *ps2, int k1, int k2){
    int i=0;
    float sum=0;
    for(i=0;i<MFCC_P;i++)
        sum+=(1+MFCC_Pf/2*(float)sin(PI*i/MFCC_Pf))*(ps1[k1+i]-ps2[k2+i])*(ps1[k1+i]-ps2[k2+i]);

    return sum;
}

main.cpp

#include"wavefunction.h"

int main(int argc, char *argv[])
{
    WaveFunction* a=new WaveFunction(128,13);//每帧多少个采样点,MFCC参数的维数
    
    vector<vector<float> > mfccs1 = a->getMFCCs("D:\\1.wav");//提取mfcc参数
    vector<vector<float> > mfccs2 = a->getMFCCs("D:\\2.wav");

    //mfccs1=a->addFirstOrderDifference(mfccs1);//增加一阶差分系数,此时mfcc参数变为13+13维

    mfccs1=a->addOrderDifference(mfccs1);//增加一阶差分和二阶差分系数,此时mfcc参数变为13*2+13维
    mfccs2=a->addOrderDifference(mfccs2);

    
    cout<<a->ComputeDTW(mfccs1,mfccs2);//利用动态时间规整算法,计算两个语音的相似度,越小相似度越大

    return 0;
}



Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐