Building a CNN in PyTorch (Python) for Spoken Digit (0-9) Recognition
1. Collect the training data
speech_commands_v0.01.tar.gz
http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz
Download it with whatever tool you like (I recommend Thunder/Xunlei).
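If you would rather script the download, a minimal standard-library sketch (my addition; adjust the paths if your layout differs) is shown below. The words are unpacked directly into ./dataset, so ./dataset/zero, ./dataset/one, ... end up where the extraction script later expects them.
import tarfile
import urllib.request

url = "http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz"
archive = "speech_commands_v0.01.tar.gz"

# Download the archive (it is large, so this can take a while) and unpack it into ./dataset
urllib.request.urlretrieve(url, archive)
with tarfile.open(archive, "r:gz") as tar:
    tar.extractall("./dataset")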
2. Prepare the environment
① PyCharm
② CUDA and cuDNN (mine is 11.3)
③ Python (mine is 3.9)
④ A CUDA-enabled build of PyTorch
For setting up the environment, as usual, there are plenty of tutorials on CSDN.
I recommend skipping conda and installing everything with pip; it gets you there in one step.
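Once everything is installed, a quick check (my addition; your versions will differ from mine) confirms that PyTorch was built with CUDA and can see the GPU:
import torch

print(torch.__version__)          # PyTorch version
print(torch.version.cuda)         # CUDA version PyTorch was built against
print(torch.cuda.is_available())  # True if the GPU build is working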
3. The code
Create a project called CNN_project in PyCharm.
(1) Extract the features and save them
① First, put the required recordings (the folders for the digits 0-9) into a dataset folder.
② Extract a spectrogram from every recording, then save the feature array and the label array as data.npy and label.npy.
# Process_data.py
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2


def get_spectrogram(path):
    # Load the wav file at its native sample rate as a mono signal
    data, fs = librosa.load(path, sr=None, mono=True)
    # Short-time Fourier transform -> complex spectrogram
    spect = librosa.stft(data, n_fft=1024, hop_length=320, win_length=1024)
    # print(spect.shape)
    # Plot the spectrogram
    # plt.matshow(spect)
    # plt.ylabel('Frequency')
    # plt.xlabel('Time(s)')
    # plt.title('Spectrogram')
    # plt.show()
    return spect


def extract_features():
    data_path = "./dataset"
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    print("Labels:", labels)
    total_data = []
    total_label = []
    for label in labels:
        label_path = os.path.join(data_path, label)
        wav_names = os.listdir(label_path)
        for wav_name in wav_names:
            if wav_name.endswith(".wav"):
                wav_path = os.path.join(label_path, wav_name)
                print(wav_path)
                spect = get_spectrogram(wav_path)
                # Magnitude spectrogram, resized to 28x28 to match the CNN input
                spect = np.abs(spect)
                spect = cv2.resize(spect, (28, 28))
                total_data.append(spect)
                total_label.append(labels.index(label))
    total_data = np.array(total_data)
    total_label = np.array(total_label)
    print(total_data.shape)
    print(total_label.shape)
    np.save("data.npy", total_data)
    np.save("label.npy", total_label)


if __name__ == '__main__':
    extract_features()
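Before moving on, it is worth confirming that the extraction step produced what the training script expects. A quick sanity check (my addition, assuming data.npy and label.npy sit in the project root):
import numpy as np

data = np.load("data.npy")
label = np.load("label.npy")
# Expect data of shape (N, 28, 28) and label of shape (N,) with values 0..9
print(data.shape, label.shape)
print("label range:", label.min(), "-", label.max())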
(2) Create the model
# Cnn.py
import torch
from torch import nn


class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,    # single-channel input (the spectrogram image)
                out_channels=16,  # 16 output feature maps
                kernel_size=5,    # 5x5 convolution kernel
                stride=1,         # move one pixel at a time
                padding=2,        # zero-pad the border so the spatial size is preserved
            ),
            nn.ReLU(),                    # activation function
            nn.MaxPool2d(kernel_size=2),  # pooling halves the height and width
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.out = nn.Linear(32 * 7 * 7, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output


def save_model(net, path):
    torch.save(net, path)


def load_model(path):
    net = torch.load(path)
    return net
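A quick shape check (my addition, not part of the original scripts) makes it clear why the final linear layer is 32 * 7 * 7: a 28x28 input is halved twice by the two pooling layers, and the network outputs one logit per digit class.
import torch
from Cnn import CNN

net = CNN()
dummy = torch.randn(4, 1, 28, 28)  # a batch of 4 fake spectrograms
print(net(dummy).shape)            # expected: torch.Size([4, 10])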
(3) Train the model and save it
import cv2
from Cnn import CNN, save_model, load_model
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import torch.utils.data as Data
import numpy as np
from Process_data import get_spectrogram


def train_model(net, data, label, lr, batch_size, epoch):
    print(net)
    # Run on the GPU if you have one
    # net = net.cuda()
    data = torch.Tensor(data)
    data = data.unsqueeze(1)
    label = torch.Tensor(label).long()
    # data = data.cuda()
    # label = label.cuda()
    # Split into training and test sets, 7:3
    train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.3, random_state=0)
    # Learning rate
    LR = lr
    # Number of samples fed into the network per step
    BATCH_SIZE = batch_size
    # Number of training epochs
    EPOCH = epoch
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    train_dataset = Data.TensorDataset(train_data, train_label)
    train_loader = Data.DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    test_dataset = Data.TensorDataset(test_data, test_label)
    test_loader = Data.DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, LR, epochs=EPOCH, steps_per_epoch=len(train_loader))
    for epoch in range(EPOCH):
        for step, (batch_data, batch_label) in enumerate(train_loader):
            print('Epoch:', epoch + 1, '/', EPOCH, 'Step:', step)
            prediction = net(batch_data)
            loss = F.cross_entropy(prediction, batch_label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
            if step % 50 == 0:
                accuracy = []
                for stp, (test_x, test_y) in enumerate(test_loader):
                    test_output = net(test_x)
                    _, pred_y = torch.max(test_output, 1)
                    accuracy.append(torch.sum(pred_y == test_y).item() / len(test_y))
                print('Epoch', epoch + 1, '| train loss:%.4f' % loss, '| test accuracy:%.4f' % np.mean(accuracy))
    return net


def test_model(net, data, label):
    data = torch.Tensor(data)
    data = data.unsqueeze(1)
    label = torch.Tensor(label).long()
    # Same 7:3 split with the same random_state, so we evaluate on the held-out 30%
    train_data, test_data, train_label, test_label = train_test_split(data, label, test_size=0.3, random_state=0)
    test_dataset = Data.TensorDataset(test_data, test_label)
    test_loader = Data.DataLoader(
        dataset=test_dataset,
        batch_size=32,
        shuffle=True,
    )
    y_true = []
    y_pred = []
    for stp, (test_x, test_y) in enumerate(test_loader):
        test_output = net(test_x)
        _, pred_y = torch.max(test_output, 1)
        y_true.extend(test_y.numpy())
        y_pred.extend(pred_y.numpy())
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision_score:", precision_score(y_true, y_pred, average='macro'))
    print("Recall_score:", recall_score(y_true, y_pred, average='macro'))
    print("F1_score:", f1_score(y_true, y_pred, average='macro'))


def predict(model, file):
    spect = get_spectrogram(file)
    spect = np.abs(spect)
    spect = cv2.resize(spect, (28, 28))
    data = torch.Tensor(spect)
    data = data.unsqueeze(0)  # add the channel dimension
    data = data.unsqueeze(0)  # add the batch dimension
    output = model(data)
    confidence, pred_y = torch.max(output, 1)
    print("Predicted digit:", pred_y.numpy())


if __name__ == '__main__':
    data = np.load("data.npy")
    label = np.load("label.npy")
    cnn = CNN()
    cnn = train_model(cnn, data, label, lr=0.03, batch_size=500, epoch=20)
    save_model(cnn, "cnn.pkl")
    test_model(cnn, data, label)
    file = "./dataset/zero/096456f9_nohash_0.wav"
    cnn = load_model("cnn.pkl")
    predict(cnn, file)
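Note that the confidence value returned by torch.max in predict() is a raw logit, not a probability. If you want an actual probability, a small variation (my own sketch, not part of the original scripts; predict_with_probability is a hypothetical helper) is to run the logits through softmax first:
import cv2
import numpy as np
import torch
import torch.nn.functional as F
from Cnn import load_model
from Process_data import get_spectrogram


def predict_with_probability(model, file):
    # Same preprocessing as predict(): magnitude spectrogram resized to 28x28
    spect = cv2.resize(np.abs(get_spectrogram(file)), (28, 28))
    x = torch.Tensor(spect).unsqueeze(0).unsqueeze(0)  # shape (1, 1, 28, 28)
    with torch.no_grad():
        probs = F.softmax(model(x), dim=1)             # probabilities over the 10 digits
    confidence, pred_y = torch.max(probs, 1)
    print("Predicted digit:", pred_y.item(), "probability: %.2f" % confidence.item())


predict_with_probability(load_model("cnn.pkl"), "./dataset/zero/096456f9_nohash_0.wav")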
4. Results
The recognition accuracy ends up at roughly 0.9.
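To see which digits account for the remaining errors, one option (my addition, assuming data.npy, label.npy and cnn.pkl already exist) is a confusion matrix over the same 30% held-out split used during training:
import numpy as np
import torch
import torch.utils.data as Data
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from Cnn import load_model

data = torch.Tensor(np.load("data.npy")).unsqueeze(1)
label = torch.Tensor(np.load("label.npy")).long()
# Same split parameters as the training script, so this is the same held-out 30%
_, test_data, _, test_label = train_test_split(data, label, test_size=0.3, random_state=0)

cnn = load_model("cnn.pkl")
loader = Data.DataLoader(Data.TensorDataset(test_data, test_label), batch_size=256)
y_true, y_pred = [], []
with torch.no_grad():
    for x, y in loader:
        y_pred.extend(torch.argmax(cnn(x), dim=1).numpy())
        y_true.extend(y.numpy())
# Rows are the true digits 0-9, columns the predicted digits
print(confusion_matrix(y_true, y_pred))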