在Python中用PyTorch搭建CNN(卷积神经网络)实现数字(0~9)语音识别

1.收集训练数据
speech_commands_v0.01.tar.gz
http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
用迅雷或其他任意下载工具下载均可(推荐迅雷)。

2.准备环境
①PyCharm
②CUDA和cuDNN(我用的CUDA版本是11.3)
③Python(我用的是3.9)
④支持CUDA的PyTorch

环境的准备照旧:CSDN上有大量相关教程,可自行查阅。
建议不要用conda,全部用pip安装,一步到位。

3.直接上代码
用pycharm创建CNN_project
(1)提取数据并保存
①先把所需数据集(里面包括0-9语音集)保存到dataset文件夹
(原文此处有一张图片,未能保留。)

②对数据集提取语谱图(spectrogram),并将数据集和标签集分别保存为data.npy和label.npy(下面的脚本请保存为Process_data.py,后面的训练脚本会从该模块导入get_spectrogram)

import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2


def get_spectrogram(path):
    """Compute the short-time Fourier transform of an audio file.

    The file is read with ``librosa.load`` at its native sampling rate
    (``sr=None``) and mixed down to mono, then transformed with a
    1024-sample FFT and window and a hop of 320 samples.

    Args:
        path: Location of the audio file to read.

    Returns:
        The result of ``librosa.stft`` for the loaded signal. Callers in
        this file apply ``np.abs`` to it before further processing.
    """
    stft_params = {"n_fft": 1024, "hop_length": 320, "win_length": 1024}
    # librosa.load also returns the sampling rate, which is not needed here.
    signal, _sample_rate = librosa.load(path, sr=None, mono=True)
    return librosa.stft(signal, **stft_params)


def extract_features():
    """Build the spectrogram dataset and save it to ``data.npy`` / ``label.npy``.

    For every digit word in ``labels`` the matching sub-directory of
    ``./dataset`` is scanned for ``.wav`` files. Each file is converted to a
    magnitude spectrogram, resized to 28x28 and collected together with the
    index of its word in ``labels`` (so ``'zero'`` -> 0 ... ``'nine'`` -> 9).

    Side effects:
        Prints the label names, every processed file path and the final
        array shapes, and writes ``data.npy`` and ``label.npy`` into the
        current working directory.
    """
    data_path = "./dataset"
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    print("标签名:", labels)

    total_data = []
    total_label = []

    for label_index, label in enumerate(labels):
        # os.path.join picks the right separator for the current OS; the
        # previous hard-coded "\\" only worked on Windows.
        label_path = os.path.join(data_path, label)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # saved sample order (and any later fixed-seed split) reproducible.
        for wav_name in sorted(os.listdir(label_path)):
            if not wav_name.endswith(".wav"):
                continue
            wav_path = os.path.join(label_path, wav_name)
            print(wav_path)
            # Keep only the magnitude of the STFT, then shrink it to the
            # fixed 28x28 size that is stored in data.npy.
            spect = np.abs(get_spectrogram(wav_path))
            spect = cv2.resize(spect, (28, 28))
            total_data.append(spect)
            total_label.append(label_index)

    total_data = np.array(total_data)
    total_label = np.array(total_label)
    print(total_data.shape)
    print(total_label.shape)
    np.save("data.npy", total_data)
    np.save("label.npy", total_label)

# Run the feature extraction only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    extract_features()

(2)创建模型

#Cnn.py
import torch
from torch import nn


class CNN(nn.Module):
    """Small two-stage convolutional classifier for single-channel 28x28 inputs.

    Each stage is ``Conv2d(5x5, stride 1, padding 2) -> ReLU -> MaxPool2d(2)``.
    The padding keeps the spatial size through the convolution and every
    pooling layer halves it, so a 28x28 input becomes 14x14 and then 7x7.
    The final linear layer therefore expects ``32 * 7 * 7`` features.

    Args:
        num_classes: Number of output scores produced per sample. Defaults
            to 10 (the digits 0-9), which matches the previous hard-coded
            behaviour.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,  # single-channel input image
                out_channels=16,  # produce 16 feature maps
                kernel_size=5,  # 5x5 convolution kernel
                stride=1,  # move the kernel one pixel at a time
                padding=2,  # zero-pad the border so the size is preserved
            ),
            nn.ReLU(),  # activation
            nn.MaxPool2d(kernel_size=2),  # halve height and width
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.out = nn.Linear(32 * 7 * 7, num_classes)

    def forward(self, x):
        """Return the raw class scores for a batch of inputs.

        Args:
            x: Tensor of shape ``(batch, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised scores.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        # Collapse everything except the batch axis; unlike .view this also
        # works when the tensor is not contiguous.
        x = torch.flatten(x, start_dim=1)
        return self.out(x)


def save_model(net, path):
    """Serialize the complete ``net`` object to ``path`` via ``torch.save``.

    Args:
        net: Object to store (the whole object is saved, not just its
            parameters).
        path: Destination passed straight to ``torch.save``.
    """
    torch.save(obj=net, f=path)


def load_model(path):
    """Load an object previously stored with ``save_model``.

    ``save_model`` pickles the whole module object. Recent PyTorch releases
    default ``torch.load`` to ``weights_only=True``, which refuses such
    files, so full unpickling is requested explicitly here.

    NOTE: full unpickling can execute arbitrary code contained in the file.
    Only load model files that come from a trusted source.

    Args:
        path: Source passed straight to ``torch.load``.

    Returns:
        The deserialized object.
    """
    try:
        return torch.load(path, weights_only=False)
    except TypeError:
        # Older PyTorch versions do not know the ``weights_only`` keyword;
        # the rejection happens at call time, before anything is read.
        return torch.load(path)

(3)训练模型并保存

import cv2

from Cnn import CNN, save_model, load_model
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import torch.utils.data as Data
import numpy as np

from Process_data import get_spectrogram


def train_model(net, data, label, lr, batch_size, epoch):
    """Train ``net`` on a 70/30 split of ``data`` and report progress.

    The samples are converted to tensors, a channel axis is inserted at
    position 1, and the set is split with a fixed ``random_state`` so that
    the held-out part is the same on every call. Training uses Adam with a
    one-cycle learning-rate schedule whose peak is ``lr``. Every 50 steps
    the current loss and the accuracy on the held-out part are printed.

    Args:
        net: Module to train; it is updated in place.
        data: Array-like of input samples.
        label: Array-like of integer class labels, one per sample.
        lr: Learning rate for Adam and peak rate of the one-cycle schedule.
        batch_size: Number of samples per batch (training and evaluation).
        epoch: Number of passes over the training part.

    Returns:
        The trained ``net`` (the same object that was passed in).
    """
    print(net)
    # Everything runs on the CPU. To train on a GPU, move ``net``, ``data``
    # and ``label`` to the device with ``.cuda()`` before the split.
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()
    # 7:3 train/test split
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)

    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(train_data, train_label),
        batch_size=batch_size,
        shuffle=True,
    )
    # Evaluation does not depend on sample order, so no shuffling is needed.
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=batch_size,
        shuffle=False,
    )
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, lr, epochs=epoch, steps_per_epoch=len(train_loader))

    # A separate loop variable keeps the ``epoch`` argument intact.
    for epoch_index in range(epoch):
        for step, (batch_data, batch_label) in enumerate(train_loader):
            print('Epoch:', epoch_index + 1, '/', epoch, 'Step:', step)
            prediction = net(batch_data)
            loss = F.cross_entropy(prediction, batch_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # OneCycleLR is stepped once per batch, not once per epoch.
            scheduler.step()

            if step % 50 == 0:
                accuracy = _evaluate_accuracy(net, test_loader)
                print('Epoch', epoch_index + 1, '| train loss:%.4f' % loss.item(),
                      '| test accuracy:%.4f' % accuracy)

    return net


def _evaluate_accuracy(net, loader):
    """Return the fraction of samples in ``loader`` that ``net`` classifies correctly."""
    was_training = net.training
    net.eval()
    correct = 0
    total = 0
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for batch_x, batch_y in loader:
            predicted = net(batch_x).argmax(dim=1)
            correct += (predicted == batch_y).sum().item()
            total += len(batch_y)
    # Put the module back into the mode it was in before evaluation.
    net.train(was_training)
    # Counting samples (instead of averaging per-batch accuracies) keeps the
    # result exact when the last batch is smaller than the others.
    return correct / total if total else float("nan")


def test_model(net, data, label):
    """Print accuracy, precision, recall and F1 of ``net`` on the held-out split.

    The data is prepared and split exactly as in ``train_model`` (same
    ``test_size`` and ``random_state``), so the evaluated samples are the
    30% that were not used for training there. Precision, recall and F1 are
    macro-averaged over the classes.

    Args:
        net: Module to evaluate.
        data: Array-like of input samples.
        label: Array-like of integer class labels, one per sample.
    """
    data = torch.Tensor(data).unsqueeze(1)
    label = torch.Tensor(label).long()
    # 7:3 train/test split; only the test part is used here.
    _, test_data, _, test_label = train_test_split(
        data, label, test_size=0.3, random_state=0)

    # The metrics do not depend on sample order, so no shuffling is needed.
    test_loader = Data.DataLoader(
        dataset=Data.TensorDataset(test_data, test_label),
        batch_size=32,
        shuffle=False,
    )

    y_true = []
    y_pred = []
    was_training = net.training
    net.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for test_x, test_y in test_loader:
            pred_y = net(test_x).argmax(dim=1)
            # Hand plain Python ints to scikit-learn rather than 0-d tensors.
            y_true.extend(test_y.tolist())
            y_pred.extend(pred_y.tolist())
    # Put the module back into the mode it was in before evaluation.
    net.train(was_training)

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision_score:", precision_score(y_true, y_pred, average='macro'))
    print("Recall_score:", recall_score(y_true, y_pred, average='macro'))
    print("F1_score", f1_score(y_true, y_pred, average='macro'))


def predict(model, file):
    """Classify a single audio file and print the predicted class index.

    The file goes through the same preprocessing as the training data:
    magnitude spectrogram resized to 28x28, then batch and channel axes are
    added so the input has shape ``(1, 1, 28, 28)``.

    Args:
        model: Trained module that maps the input to per-class scores.
        file: Path of the audio file to classify.

    Returns:
        The predicted class index as an ``int``.
    """
    spect = np.abs(get_spectrogram(file))
    spect = cv2.resize(spect, (28, 28))
    # Add the batch axis and the channel axis expected by the network.
    data = torch.Tensor(spect).unsqueeze(0).unsqueeze(0)

    # Inference only: no gradients are required.
    with torch.no_grad():
        output = model(data)
    pred_y = output.argmax(dim=1)
    # .cpu() keeps the conversion working if the model ran on a GPU.
    print("识别结果为:", pred_y.cpu().numpy())
    return int(pred_y.item())


# Script entry point: train on the saved arrays, store the model, report
# metrics, then reload the stored model and classify one sample file.
if __name__ == '__main__':

    # Arrays written by the feature-extraction script.
    data = np.load("data.npy")
    label = np.load("label.npy")

    cnn = CNN()
    cnn = train_model(cnn, data, label, lr=0.03, batch_size=500, epoch=20)
    save_model(cnn, "cnn.pkl")

    # Print accuracy / precision / recall / F1 for the trained model.
    test_model(cnn, data, label)

    # Reload the model from disk and run it on a single recording.
    file = "./dataset/zero/096456f9_nohash_0.wav"
    cnn = load_model("cnn.pkl")
    predict(cnn, file)
            

4.最终实现效果
识别率大概在0.9左右

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐