
深度学习(一):给你的数据集打标签
深度学习之给自建数据集打标签
·
1、构建数据集
构建一个文件夹data,在data文件夹中构建n个子文件夹,n表示你的类别个数(下面第一种方法的代码假定子文件夹依次命名为0、1、…、n-1)。
2、如何让机器能读懂
在此之前需要先了解什么是深度学习,并且要清楚:每一个类别的图片都需要打上一个对应的标签。
这里介绍三种打标签的方法。
第一种是制造txt文件。
import os
def write_split_lists(data_root='data/', num_classes=4, train_count=299,
                      train_list='train.txt', val_list='val.txt'):
    """Append "<image path> <label>" lines for every class folder to two list files.

    Each class is expected in ``<data_root>/<label>/`` with ``label`` running
    from 0 to ``num_classes - 1``. Inside a class folder the entries are
    taken in sorted name order; the first ``train_count`` of them are written
    to ``train_list`` and all remaining ones to ``val_list`` (with the
    defaults and 420 images per class: 299 for training, 121 for validation).
    Entries whose extension is ``.txt`` are skipped and do not count.

    Both list files are opened in append mode, so running the script twice
    duplicates every line; delete the old list files before regenerating.

    Args:
        data_root: Directory containing the numbered class folders.
        num_classes: Number of class folders to scan.
        train_count: Entries per class that go to the training list.
        train_list: Path of the training list file.
        val_list: Path of the validation list file.

    Raises:
        OSError: If a class folder is missing or a list file cannot be opened.
    """
    # Forward slashes are kept on purpose so the written paths look the same
    # on every platform.
    root = data_root.rstrip('/')
    with open(train_list, 'a') as train, open(val_list, 'a') as val:
        for label in range(num_classes):
            class_dir = '{}/{}/'.format(root, label)
            # splitext isolates the extension; os.path.split would return the
            # whole file name, which never equals '.txt'.
            names = [
                name for name in sorted(os.listdir(class_dir))
                if os.path.splitext(name)[1].lower() != '.txt'
            ]
            for index, name in enumerate(names):
                target = train if index < train_count else val
                target.write('{}{} {}\n'.format(class_dir, name, label))
            print('class {}: {} entries written'.format(label, len(names)))


if __name__ == '__main__':
    write_split_lists()
第二种是创建表格
import numpy as np
import pandas as pd
import cv2
import os
def readfilename(filepath):
    """Return the entry names inside directory *filepath* in sorted order.

    os.listdir gives no ordering guarantee; sorting keeps the row order of
    the generated table reproducible between runs and machines.
    """
    return sorted(os.listdir(filepath))


def build_label_table(class_dirs, read_image=None):
    """Build a table with one row per image: label first, then the flattened pixels.

    Args:
        class_dirs: Sequence of directories, one per class. The position of a
            directory in the sequence is used as its integer label.
        read_image: Callable taking a file path and returning the image as a
            numpy array, or None when the file cannot be decoded. Defaults to
            ``cv2.imread``.

    Returns:
        A pandas DataFrame whose column 0 holds the label and whose remaining
        columns hold the flattened image values. Empty when no image could be
        read.
    """
    if read_image is None:
        read_image = cv2.imread

    frames = []
    for label, folder in enumerate(class_dirs):
        for filename in readfilename(folder):
            # os.path.join also works when the folder has no trailing slash.
            read_path = os.path.join(folder, filename)
            img = read_image(read_path)
            if img is None:
                # Not a decodable image (e.g. a stray text or thumbnail file).
                print('skip unreadable file: ' + read_path)
                continue
            row = np.concatenate(([[label]], img.reshape(1, -1)), axis=1)
            frames.append(pd.DataFrame(row))

    if not frames:
        return pd.DataFrame()
    # A single concat replaces the per-row DataFrame.append calls, which were
    # removed in pandas 2.0 and copied the whole table on every iteration.
    return pd.concat(frames, ignore_index=True)


# Placeholders: replace each with the directory of the corresponding class.
a_path = '位置'
b_path = '位置'
c_path = '位置'
d_path = '位置'

if __name__ == '__main__':
    df = build_label_table([a_path, b_path, c_path, d_path])
    print(df.head())
    print(df.tail())
    df.to_csv('train.csv', index=False)
第三种是利用文件夹的命名规则直接读取标签
import os
import random
import cv2
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
#导入相关库
# Lower-case file extensions (leading dot included) that count as images.
image_types = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")


def list_images(basePath, contains=None):
    """Return an iterator over the paths of image files found under *basePath*.

    Delegates to list_files with the extension filter fixed to image_types;
    *contains* is forwarded unchanged.
    """
    image_paths = list_files(basePath, contains=contains, validExts=image_types)
    return image_paths
def list_files(basePath, validExts=None, contains=None):
    """Walk *basePath* recursively and yield the path of every matching file.

    Args:
        basePath: Root directory to walk.
        validExts: Extension or iterable of extensions (e.g. ".jpg") to keep,
            compared case-insensitively against the text from the file name's
            last dot onwards. None keeps every file.
        contains: If given, only file names containing this substring are
            considered.

    Yields:
        The path of each matching file (directory joined with file name).
        Directories and files are visited in sorted name order.
    """
    if validExts is not None:
        # str.endswith accepts only a str or a tuple, so lists and sets are
        # normalised here; lower-casing lets ".JPG" in validExts match too.
        if isinstance(validExts, str):
            validExts = (validExts,)
        validExts = tuple(ext.lower() for ext in validExts)

    for (rootDir, dirNames, filenames) in os.walk(basePath):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirNames.sort()
        for filename in sorted(filenames):
            # Ignore files whose name lacks the requested substring.
            if contains is not None and contains not in filename:
                continue

            # The extension is everything from the last dot; a name without
            # any dot has no extension at all.
            dot = filename.rfind(".")
            ext = filename[dot:].lower() if dot != -1 else ""

            if validExts is None or ext.endswith(validExts):
                yield os.path.join(rootDir, filename)
# Load the self-built dataset: pixel arrays go into `data`, class names into `labels`.
data = []
labels = []

# Collect the image paths, then shuffle them with a fixed seed so that the
# order is random but identical on every run.
imagePaths = sorted(list_images('./dataset'))
random.seed(42)
random.shuffle(imagePaths)

# Read every image together with its label.
for imagePath in imagePaths:
    # Flag 1 asks OpenCV for a 3-channel colour image.
    image = cv2.imread(imagePath, 1)
    if image is None:
        # cv2.imread signals an unreadable or corrupt file by returning None;
        # skip it instead of crashing inside cv2.resize.
        print('skip unreadable image: ' + imagePath)
        continue
    # cv2.resize takes the target size as (width, height).
    image = cv2.resize(image, (64, 96))
    data.append(image)

    # The second-to-last path component is the folder name, which serves as
    # the class label.
    label = imagePath.split(os.path.sep)[-2]
    labels.append(label)

# Scale pixel values from [0, 255] to [0, 1].
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)

# "test" here is the part held out from the training data, i.e. the
# validation set (30% of the samples).
(x_train, x_test, y_train, y_test) = train_test_split(data, labels, test_size=0.3, random_state=42)

# Encode the string labels as indicator vectors. The encoder is fitted on the
# training labels only and then reused for the validation labels.
# NOTE(review): with exactly two classes LabelBinarizer is documented to
# return a single 0/1 column rather than two columns - confirm this matches
# the output layer of the model.
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
在下一篇博文会介绍如何让机器读取这些数据。
更多推荐
所有评论(0)