MLP多标签分类实现-tensorflow
#!/usr/bin/env python# -*- coding: utf-8 -*-"""@Time: 2022/3/11 14:33@Author:@File: tf_ml_label.py@ desc:"""import platformimport mathimport tensorflow as tfimport timeimport osimport sysimport numpy
·
1、网络结构介绍
(1)3层MLP + sigmoid输出 + focal loss。
(2)数据划分:0.8的训练集,0.1的验证集,0.1的测试集。
(3)特征处理:id型特征先映射为embedding向量(embedding表随机初始化后随网络一起训练);数值型特征有两种处理方式,一种是分桶后转换成id型特征再训练,另一种是通过标准化的方式做归一化。最终所有特征一起做concat,形成一个1*n的向量作为模型的输入。
注:对于id型特征,需要保存真实特征与特征在模型训练中的id的映射表,类似NLP中的vocabulary。
(4)计算指标
a、训练过程,计算训练集和验证集的macro_f1和accuracy
b、测试过程,计算总的macro_f1,precision、recall、accuracy,以及每个类别对应的f1、precision、recall。
2、代码如下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2022/3/11 14:33
@Author :
@File : tf_ml_label.py
@ desc:
"""
import json
import platform
import math
import tensorflow as tf
import time
import os
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.python.ops import array_ops
# Lower-cased platform name (e.g. "windows", "linux"); other parts of this file
# branch on it to pick the local data source and the checkpoint directory.
plat = platform.system().lower()
file_name = os.path.basename(sys.argv[0])
# Both branches currently resolve to the same embedding size; the split is kept
# so that each platform can be configured independently.
if plat == "windows":
    dimension = 32  # add artist
else:
    dimension = 32

# Command-line configuration.
tf.app.flags.DEFINE_string("tables", "", "Tables info")
tf.app.flags.DEFINE_string("outputs", "", "test result output table")
tf.app.flags.DEFINE_string("checkpointDir", '', "Model checkpoint dir")
tf.app.flags.DEFINE_integer("train_epochs", 10, "Number of epochs")
tf.app.flags.DEFINE_integer("batch_size", 128, "Number of batch size")
tf.app.flags.DEFINE_float("lr", 0.0001, "Learning rate")
tf.app.flags.DEFINE_integer("hidden_unit", 128, "Number of hidden neurons per layer")
tf.app.flags.DEFINE_integer("dimension", dimension, "Embedding size")
# display_step is a step count (it is used as `step % display_step`), so it is
# declared as an integer flag rather than a float one.
tf.app.flags.DEFINE_integer("display_step", 100, "Display step")
FLAGS = tf.app.flags.FLAGS

# Echo the main hyper-parameters of this run.
print("train_epochs: ", FLAGS.train_epochs)
print("batch_size: ", FLAGS.batch_size)
print("lr: ", FLAGS.lr)
print("hidden_unit: ", FLAGS.hidden_unit)

# Labels the classifier is trained on; the position in this list is the
# column index of the label in the multi-hot target vector.
save_label = ["label1", "label2", "label3"]
# #####################################
class Tools(object):
    """Small console-logging helpers used throughout the script."""

    @staticmethod
    def currect_time():
        """Return the local wall-clock time formatted as ``HH:MM:SS.mmm``."""
        # Read the clock exactly once so the seconds part and the millisecond
        # suffix describe the same instant; two separate reads can straddle a
        # second boundary and produce a timestamp that appears to go backwards.
        now = time.time()
        return time.strftime("%H:%M:%S", time.localtime(now)) + '.%03d' % (now % 1 * 1000)

    @staticmethod
    def log_print(content):
        """Print ``content`` prefixed with the current time in square brackets.

        Args:
            content: the message text (a ``str``; it is concatenated as-is).
        """
        print("[" + Tools.currect_time() + "] " + content)
######################################
# network
class Network(object):
    """Feed-forward multi-label classifier built on a TF1 graph.

    The input is the concatenation of a dense feature vector
    (``music_embedding``) with three learned id embeddings (artist, language,
    release bucket). It goes through two ReLU fully connected layers and a
    linear output layer that yields one logit per entry of ``save_label``.

    Args:
        art_num: number of rows of the artist embedding table.
        lang_num: number of rows of the language embedding table.
        release_num: number of rows of the release embedding table.
    """

    def __init__(self, art_num, lang_num, release_num):
        # Network sizes. The dense input is one column wider than
        # FLAGS.dimension (train() appends the duration column to it).
        self.num_dimension = FLAGS.dimension + 1
        self.num_labels = len(save_label)
        self.num_hidden = FLAGS.hidden_unit
        self.art_dimension = FLAGS.dimension
        self.art_num = art_num
        self.lang_num = lang_num
        self.release_num = release_num

        # Number of samples in the current batch, fed as a one-element list; it
        # is the leading dimension used when flattening the embedding lookups.
        self.batch_samples = tf.placeholder(tf.int32, shape=[1], name='batch_samples')

        # One embedding table per id feature; all of them share art_dimension.
        self.art_id, self.art_emb_table, self.art_embedding = self._id_embedding(
            'art_id', 'artist_embedding', self.art_num)
        self.lang_id, self.lang_emb_table, self.lang_embedding = self._id_embedding(
            'lang_id', 'lang_embedding', self.lang_num)
        self.release_id, self.release_emb_table, self.release_embedding = self._id_embedding(
            'release_id', 'release_embedding', self.release_num)

        # Dense features followed by the three embeddings form the MLP input.
        self.music_embedding = tf.placeholder("float", [None, self.num_dimension], name="music_embedding")
        self.inputs = tf.concat(
            [self.music_embedding, self.art_embedding, self.lang_embedding, self.release_embedding],
            axis=1)
        input_width = self.num_dimension + self.art_dimension * 3
        hidden = self.fc('fc1', self.inputs, input_width, self.num_hidden, True)
        hidden = self.fc('fc2', hidden, self.num_hidden, self.num_hidden, True)

        # Targets and the final linear layer (no activation: raw logits).
        self.golden = tf.placeholder("float", [None, self.num_labels])
        self.logits = self.fc('fc_out', hidden, self.num_hidden, self.num_labels, False)
        tf.add_to_collection('pred_network', self.logits)

    def _id_embedding(self, id_name, table_name, vocab_size):
        """Create the id placeholder, the embedding table and the flattened lookup.

        Returns:
            ``(ids, table, embedding)`` where ``ids`` is an int32 placeholder of
            shape ``[None, 1]`` and ``embedding`` is the lookup reshaped to
            ``[batch_samples[0], art_dimension]``.
        """
        ids = tf.placeholder(tf.int32, shape=[None, 1], name=id_name)
        table = tf.get_variable(
            name=table_name, shape=[vocab_size, self.art_dimension],
            initializer=tf.contrib.layers.xavier_initializer())
        looked_up = tf.nn.embedding_lookup(table, ids)
        # The lookup has a singleton middle axis ([batch, 1, dim]); drop it.
        flattened = tf.reshape(looked_up, (self.batch_samples[0], looked_up.shape[-1]))
        return ids, table, flattened

    def get_weight_varible(self, name, shape):
        """Return a Xavier-initialized variable ``name`` of the given ``shape``."""
        initializer = tf.contrib.layers.xavier_initializer()
        return tf.get_variable(name, shape=shape, initializer=initializer)

    def get_bias_varible(self, name, shape):
        """Return a Xavier-initialized bias variable ``name`` of the given ``shape``."""
        initializer = tf.contrib.layers.xavier_initializer()
        return tf.get_variable(name, shape=shape, initializer=initializer)

    def fc(self, layer_name, x, inp_dim, out_dim, with_act):
        """Apply a fully connected layer ``x @ w + b`` under scope ``layer_name``.

        Args:
            layer_name: variable scope that holds the layer's ``w`` and ``b``.
            x: input tensor; it is reshaped to ``[-1, inp_dim]`` first.
            inp_dim: input width.
            out_dim: output width.
            with_act: when truthy, ReLU is applied to the result.

        Returns:
            The layer output tensor of shape ``[-1, out_dim]``.
        """
        with tf.variable_scope(layer_name):
            flat = tf.reshape(x, shape=[-1, inp_dim])
            weight = self.get_weight_varible('w', [inp_dim, out_dim])
            bias = self.get_bias_varible('b', [out_dim])
            output = tf.add(tf.matmul(flat, weight), bias)
            if with_act:
                output = tf.nn.relu(output)
            return output
#####################################
# main route
def save_model(saver, sess, model_path):
    """Log the destination, then checkpoint ``sess`` to ``model_path`` via ``saver``."""
    message = 'save model to {0}.'.format(model_path)
    Tools.log_print(message)
    saver.save(sess, model_path)
def load_model(saver, sess, model_path):
    """Restore the checkpoint at ``model_path`` into ``sess``.

    Logs before and after the restore. Errors raised by ``saver.restore``
    propagate to the caller.

    Returns:
        Always ``True`` when the restore call returns.
    """
    attempt_message = 'try to load model from {0}.'.format(model_path)
    Tools.log_print(attempt_message)
    saver.restore(sess, model_path)
    Tools.log_print('load model success')
    return True
def load_from_local():
    """Read the local Excel sample sheet into per-row sample lists.

    The first sheet of ``./data/local_sample.xlsx`` is read; its first row is
    skipped (presumably a header row - confirm against the data file).

    Returns:
        ``(samples, all_records)`` where ``samples`` is a list of
        ``[item_id, feature, tag, duration, artist_id, language, tagcount,
        collectcount, releasedate]`` rows and ``all_records`` maps the first
        column of each row to the row's columns ``1..-2``.
    """
    # NOTE(review): recent xlrd releases reportedly no longer read .xlsx files -
    # confirm the installed xlrd version supports this workbook.
    import xlrd

    workbook = xlrd.open_workbook("./data/local_sample.xlsx")
    sheet = workbook.sheet_by_index(0)

    samples = []
    all_records = {}
    for row_index in range(1, sheet.nrows):
        row = sheet.row_values(row_index)
        item_id = row[0]
        # Keep the raw columns (minus the id and the last column) per item.
        all_records[item_id] = row[1:-1]
        feature = row[2].split(",")
        tag = row[3].split(",")
        tagcount = json.loads(row[10])
        samples.append([item_id, feature, tag, row[6], row[4], row[9], tagcount, row[11], row[12]])
    return samples, all_records
def _release_bucket(releasedate):
    """Map a raw release date to its key in the release-date vocabulary.

    ``"0000"`` stands for a missing, unparseable or out-of-range (> 2022) year,
    ``"1111"`` for any year before 1970, otherwise the key is the first year of
    the enclosing 5-year bucket (e.g. ``"1995"`` for 1997).
    """
    if releasedate is None or releasedate == '\\N':
        return "0000"
    year_text = releasedate.split("-")[0]
    if year_text == "0000":
        return "0000"
    try:
        year = int(year_text)
    except ValueError:
        # Empty or malformed year: treat it like a missing date instead of
        # aborting the whole load.
        return "0000"
    if year > 2022:
        return "0000"
    if year < 1970:
        return "1111"
    return str(year - year % 5)


def load_data():
    """Load the samples, encode the features and split them 80/10/10.

    Id-type features (artist, language, release bucket) are mapped to integer
    ids through vocabularies built while reading the samples; the duration is
    standardized. Each feature row is laid out as
    ``[raw features..., artist id, language id, release id, duration]``.

    Returns:
        A 13-tuple ``(x_train, y_train, x_val, y_val, x_test, y_test,
        art_to_id, lang_to_id, releasedate_to_id, id_test, duration_mean,
        duration_std, all_records)``. The ``x_*``/``y_*`` entries are float32
        arrays, the ``*_to_id`` entries are the vocabularies, ``id_test`` holds
        the item ids of the test rows and ``all_records`` the raw records.

    Raises:
        NotImplementedError: on platforms for which no data source is
            available here.
        ValueError: if no sample carries any label listed in ``save_label``.
    """
    if plat == 'windows':
        samples, all_records = load_from_local()
    else:
        # No reader for other platforms is visible in this file; fail with an
        # explicit message instead of a NameError on the undefined `samples`.
        raise NotImplementedError(
            "load_data: no data source is implemented for platform '{}'".format(plat))

    # Vocabularies; id 0 is reserved for "unk" in each of them.
    data_id, data_x, data_y = [], [], []
    duration_fea = []
    art_to_id = {"unk": 0}
    lang_to_id = {"unk": 0}
    releasedate_to_id = {"unk": 0, "0000": 1, "1111": 2}
    for bucket_start in range(1970, 2025, 5):
        releasedate_to_id[str(bucket_start)] = len(releasedate_to_id)

    for line in samples:
        item_id, feature, tag, duration, artist_id, language, tagcount, collectcount, releasedate = line
        # Keep only the labels we train on and drop samples left without any.
        tag = [x for x in tag if x in save_label]
        if not tag:
            continue

        # Assign the next free id to values seen for the first time.
        if artist_id not in art_to_id:
            art_to_id[artist_id] = len(art_to_id)
        if language not in lang_to_id:
            lang_to_id[language] = len(lang_to_id)
        release_id = _release_bucket(releasedate)

        data_id.append(item_id)
        data_x.append(feature + [str(art_to_id[artist_id]),
                                 str(lang_to_id[language]),
                                 str(releasedate_to_id[release_id])])
        duration_fea.append(float(duration))

        # Multi-hot encoding of the labels, in save_label order.
        target = [0] * len(save_label)
        for item in tag:
            target[save_label.index(item)] = 1
        data_y.append(target)

    if not data_x:
        raise ValueError("load_data: no sample carries any of the labels in save_label")

    # Standardize the duration. A zero standard deviation (all durations equal)
    # would divide by zero, so the centered values are left unscaled then; the
    # returned duration_std is still the measured value.
    duration_fea = np.asarray(duration_fea, dtype=np.float64)
    duration_mean = np.mean(duration_fea)
    duration_std = np.std(duration_fea)
    scale = duration_std if duration_std > 0 else 1.0
    duration_col = ((duration_fea - duration_mean) / scale).reshape(-1, 1)

    # Convert the string features to numbers before stacking, so the matrix is
    # never routed through a string array.
    data_x = np.hstack((np.array(data_x).astype(np.float32), duration_col.astype(np.float32)))
    Tools.log_print('loading dataset data process...')

    # Split: 80% train, then the remaining 20% halved into validation and test.
    data_x = np.array(data_x).astype(np.float32)
    data_y = np.array(data_y).astype(np.float32)
    x_train, x_no_train, y_train, y_no_train, id_train, id_no_train = train_test_split(
        data_x, data_y, data_id, test_size=0.2, random_state=6)
    x_val, x_test, y_val, y_test, id_val, id_test = train_test_split(
        x_no_train, y_no_train, id_no_train, test_size=0.5, random_state=6)
    return x_train, y_train, x_val, y_val, x_test, y_test, art_to_id, lang_to_id, releasedate_to_id, id_test, \
        duration_mean, duration_std, all_records
def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.75, gamma=2):
    r"""Compute the mean sigmoid focal loss for multi-label predictions.

    With ``p = sigmoid(x)`` and ``z`` the target, each entry contributes

        FL = -alpha * (z - p)^gamma * log(p)        where z > 0
        FL = -(1 - alpha) * p^gamma * log(1 - p)    elsewhere

    Both logarithm arguments are clipped to ``[1e-8, 1.0]``.

    Args:
        prediction_tensor: float tensor of logits.
        target_tensor: float tensor of targets with the same shape as
            ``prediction_tensor``; entries greater than zero are positives.
        weights: optional multiplier applied to the per-entry loss (it must
            broadcast against it); ``None`` disables weighting.
        alpha: weight of the positive term; the negative term gets
            ``1 - alpha``. Defaults to 0.75.
        gamma: focusing exponent. Defaults to 2.

    Returns:
        A scalar tensor: the mean of the (optionally weighted) per-entry loss.
    """
    prob = tf.nn.sigmoid(prediction_tensor)
    zero = tf.zeros_like(prob, dtype=prob.dtype)
    is_positive = target_tensor > zero

    # Modulating factors: (z - p) on positive entries, p on the others; the
    # factor of the term that does not apply to an entry is zero.
    pos_factor = tf.where(is_positive, target_tensor - prob, zero)
    neg_factor = tf.where(is_positive, zero, prob)

    log_prob = tf.log(tf.clip_by_value(prob, 1e-8, 1.0))
    log_not_prob = tf.log(tf.clip_by_value(1.0 - prob, 1e-8, 1.0))

    entry_loss = - alpha * (pos_factor ** gamma) * log_prob \
                 - (1 - alpha) * (neg_factor ** gamma) * log_not_prob
    if weights is not None:
        entry_loss = entry_loss * weights
    return tf.reduce_mean(entry_loss)
def macro_f1_func(y, y_hat, thresh=0.5):
    """Compute per-label and macro-averaged precision, recall and F1 for a batch.

    Args:
        y: float tensor of 0/1 labels, one column per label.
        y_hat: float tensor of probabilities with the same shape as ``y``.
        thresh: a probability strictly greater than this counts as a positive
            prediction.

    Returns:
        A 9-tuple ``(tp, fp, fn, pre, rec, f1, mean_cls_pre, mean_cls_rec,
        mean_cls_f1)``: the first six are per-label vectors (counts are cast
        to float32), the last three are scalar means over the labels.
    """
    # Same epsilon in all three ratios: a label with no predicted or no actual
    # positives in the batch scores 0 instead of turning the means into NaN.
    eps = 1e-16
    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (tf.ones_like(y) - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((tf.ones_like(y_pred) - y_pred) * y, axis=0), tf.float32)
    pre = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2 * tp / (2 * tp + fn + fp + eps)
    mean_cls_pre = tf.reduce_mean(pre)
    mean_cls_rec = tf.reduce_mean(rec)
    mean_cls_f1 = tf.reduce_mean(f1)
    return tp, fp, fn, pre, rec, f1, mean_cls_pre, mean_cls_rec, mean_cls_f1
def mapping_func(name, feat_to_id):
    """Flatten a feature vocabulary into ``[name, raw_value, id_as_string]`` rows.

    Args:
        name: feature name placed in the first column of every row.
        feat_to_id: mapping from raw feature value to its numeric id.

    Returns:
        One three-element list per vocabulary entry, in the mapping's
        iteration order; the id is converted with ``str(int(...))``.
    """
    return [[name, raw_value, str(int(feature_id))] for raw_value, feature_id in feat_to_id.items()]
def _iter_batches(x, y, batchsize):
    """Yield consecutive ``(x, y)`` row slices of at most ``batchsize`` rows."""
    for start in range(0, len(x), batchsize):
        end = start + batchsize
        yield x[start:end, :], y[start:end, :]


def _build_feed(network, batch_x, batch_y):
    """Turn one batch of the packed feature matrix into the network's feed dict."""
    # Column layout produced by load_data:
    # [raw features..., artist id, language id, release id, duration].
    # The duration column is moved next to the raw features for the dense input.
    music_x = np.hstack((batch_x[:, :-4], batch_x[:, -1].reshape(-1, 1)))
    art_x = batch_x[:, -4].reshape(-1, 1).astype(int)
    lang_x = batch_x[:, -3].reshape(-1, 1).astype(int)
    release_x = batch_x[:, -2].reshape(-1, 1).astype(int)
    return {network.music_embedding: music_x,
            network.batch_samples: [len(art_x)],
            network.art_id: art_x,
            network.lang_id: lang_x,
            network.release_id: release_x,
            network.golden: batch_y}


def _evaluate(sess, network, loss_op, macro_f1_op, x, y, batchsize):
    """Run the model over ``(x, y)`` without training.

    Returns:
        ``(mean_loss, tp, fp, fn, logits)``: the sample-weighted mean loss, the
        per-label true-positive / false-positive / false-negative counts summed
        over all batches, and the list of per-batch logit arrays.
    """
    total_loss = 0.0
    tp_parts, fp_parts, fn_parts, logit_parts = [], [], [], []
    for batch_x, batch_y in _iter_batches(x, y, batchsize):
        batch_loss, batch_f1, batch_logits = sess.run(
            [loss_op, macro_f1_op, network.logits],
            feed_dict=_build_feed(network, batch_x, batch_y))
        tp, fp, fn = batch_f1[:3]
        # The batch loss is a mean; weight it by the batch size so that a
        # smaller final batch does not skew the overall mean.
        total_loss += batch_loss * len(batch_x)
        tp_parts.append(tp)
        fp_parts.append(fp)
        fn_parts.append(fn)
        logit_parts.append(batch_logits)
    return (total_loss / len(x), np.sum(tp_parts, axis=0), np.sum(fp_parts, axis=0),
            np.sum(fn_parts, axis=0), logit_parts)


def train():
    """Train the network, validate periodically, then evaluate on the test set.

    Steps: load and split the data, build the graph (focal loss + Adam), train
    for ``FLAGS.train_epochs`` epochs while logging train and validation metrics
    every ``FLAGS.display_step`` steps, save a checkpoint, restore it and
    report overall and per-label metrics on the test set. Per-sample
    predictions and the feature vocabularies are collected into records at the
    end.
    """
    if plat == "windows":
        model_dir = os.path.join("model", 'mll')
    else:
        model_dir = os.path.join(FLAGS.checkpointDir, 'mll')

    Tools.log_print('loading dataset...')
    train_x, train_y, val_x, val_y, test_x, test_y, art_to_id, lang_to_id, release_to_id, test_id, \
        duration_mean, duration_std, all_records = load_data()
    # Per-label loss weights: the rarest label gets weight 1 and more frequent
    # labels get proportionally smaller weights.
    label_samples = train_y.sum(axis=0)
    weight = min(label_samples) / label_samples
    print("the number of artist: ", len(art_to_id))

    Tools.log_print('building network...')
    network = Network(art_num=len(art_to_id), lang_num=len(lang_to_id), release_num=len(release_to_id))

    # training parameters
    learning_rate = FLAGS.lr
    train_epochs = FLAGS.train_epochs
    batchsize = FLAGS.batch_size
    display_step = FLAGS.display_step

    # focal loss, optimizer and train operator
    loss_op = focal_loss(prediction_tensor=network.logits, target_tensor=network.golden, weights=weight)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # accuracy: a sample counts as correct only if every label matches
    correct_op = tf.equal(tf.cast(tf.greater_equal(tf.sigmoid(network.logits), 0.5), tf.int32),
                          tf.cast(network.golden, tf.int32))
    accuracy_op = tf.reduce_mean(tf.reduce_min(tf.cast(correct_op, tf.float32), 1))
    macro_f1_op = macro_f1_func(network.golden, tf.sigmoid(network.logits), 0.5)

    # initialize the variables (i.e. assign their default value)
    init_op = tf.global_variables_initializer()
    Tools.log_print('build network success.\n')

    # Records collected for reporting.
    train_log = []
    log_count = 0
    test_indicator = []
    eps = 1e-16  # keeps the metric ratios finite when a label has no hits
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(init_op)

        Tools.log_print('start training...')
        train_steps = math.ceil(len(train_x) / batchsize)
        for epoch in range(1, train_epochs + 1):
            total_loss = 0.0
            total_acc = 0.0
            total_macro_f1 = 0.0
            for step, (batch_x, batch_y) in enumerate(_iter_batches(train_x, train_y, batchsize), start=1):
                _, batch_loss, batch_acc, f1_result = sess.run(
                    [train_op, loss_op, accuracy_op, macro_f1_op],
                    feed_dict=_build_feed(network, batch_x, batch_y))
                total_loss += batch_loss
                total_acc += batch_acc
                total_macro_f1 += f1_result[-1]
                if step % display_step != 0:
                    continue

                # Training metrics averaged over the last display_step steps.
                avg_loss = format(total_loss / display_step, ".4f")
                avg_acc = total_acc / display_step
                avg_f1 = format(total_macro_f1 / display_step, ".4f")
                total_loss = 0.0
                total_acc = 0.0
                total_macro_f1 = 0.0

                # Validation metrics over the whole validation set.
                val_mean_loss, val_tp, val_fp, val_fn, _ = _evaluate(
                    sess, network, loss_op, macro_f1_op, val_x, val_y, batchsize)
                val_loss = format(val_mean_loss, ".4f")
                val_macro_f1 = format(np.mean(2 * val_tp / (2 * val_tp + val_fn + val_fp + eps)), ".4f")
                Tools.log_print("Epoch[%d/%d] Step[%d/%d] Train Minibatch Loss= %s, macro_f1=%s, "
                                "Accuracy= %.4f, val_loss= %s, val_macro_f1= %s" %
                                (epoch, train_epochs, step, train_steps, avg_loss, avg_f1, avg_acc, val_loss,
                                 val_macro_f1))
                train_log.append((
                    log_count, "{}/{}".format(epoch, train_epochs), "{}/{}".format(step, train_steps),
                    avg_loss, avg_f1, val_loss, val_macro_f1))
                log_count += 1
        saver.save(sess, save_path=model_dir)
        Tools.log_print("finished training!")

        # load model and test
        Tools.log_print('start testing...')
        if load_model(saver, sess, model_dir):
            # The mean is taken over the test set itself (it used to be
            # divided by the validation-set size).
            test_loss, test_tp, test_fp, test_fn, logit_parts = _evaluate(
                sess, network, loss_op, macro_f1_op, test_x, test_y, batchsize)
            test_precision = test_tp / (test_tp + test_fp + eps)
            test_recall = test_tp / (test_tp + test_fn + eps)
            test_f1_score = 2 * test_tp / (2 * test_tp + test_fn + test_fp + eps)

            # Probabilities and exact-match accuracy are computed with NumPy so
            # that no new ops are added to the graph after training.
            # sigmoid(x) = exp(-log(1 + exp(-x))), written in a form that does
            # not overflow for large |x|.
            logits = np.concatenate(logit_parts).astype(np.float64)
            prediction_y = np.exp(-np.logaddexp(0.0, -logits))
            correct = (prediction_y >= 0.5).astype(np.int32) == test_y.astype(np.int32)
            test_acc = np.mean(np.all(correct, axis=1))

            mean_f1_score = format(np.mean(test_f1_score), ".4f")
            mean_precision = format(np.mean(test_precision), ".4f")
            mean_recall = format(np.mean(test_recall), ".4f")
            Tools.log_print("test_loss=%.4f, test_accuracy=%.4f, test_macro_f1=%s, test_macro_precision=%s, "
                            "test_macro_recall=%s" % (test_loss, test_acc, mean_f1_score, mean_precision,
                                                      mean_recall))
            test_indicator.append(("total", mean_f1_score, mean_precision, mean_recall, "", "", "", ""))

            # Per-label precision / recall / F1 with the 2x2 confusion matrix.
            for i, label in enumerate(save_label):
                tp_i = int(test_tp[i])
                fp_i = int(test_fp[i])
                fn_i = int(test_fn[i])
                tn_i = len(test_y) - tp_i - fp_i - fn_i
                cf_mx = np.array([[tp_i, fn_i], [fp_i, tn_i]])
                f1 = format(test_f1_score[i], ".4f")
                pre = format(test_precision[i], ".4f")
                rec = format(test_recall[i], ".4f")
                print("{}:macro_f1={}, precision={}, recall={}".format(label, f1, pre, rec))
                print(cf_mx)
                print()
                test_indicator.append((label, f1, pre, rec, tp_i, fn_i, fp_i, tn_i))

            # Per-sample record: true labels, predicted labels, selected raw
            # columns and the label probabilities sorted in descending order.
            write_recs = []
            for index, probs in enumerate(prediction_y.tolist()):
                true_labels = [save_label[pos] for pos, flag in enumerate(test_y[index]) if int(flag) == 1]
                label_prob = dict(zip(save_label, [format(p, ".4f") for p in probs]))
                label_prob = dict(sorted(label_prob.items(), key=lambda kv: kv[1], reverse=True))
                pred_labels = [k for k, v in label_prob.items() if float(v) > 0.5]
                features = all_records[test_id[index]]
                write_recs.append(
                    (test_id[index], ",".join(true_labels), ",".join(pred_labels), features[10], features[9],
                     features[0], features[6], features[7], features[5],
                     json.dumps(label_prob, ensure_ascii=False)))

            # NOTE(review): nothing visible in this function writes train_log,
            # test_indicator, write_recs or mapping_records anywhere; they are
            # kept for a writer that is presumably defined elsewhere.
            output_tables = FLAGS.outputs.split(",")
            if len(output_tables) == 4:
                map_table, log_table, indicator_table, label_table = output_tables
            else:
                # The default (empty) --outputs used to abort here with an
                # unpacking error after the whole run had completed.
                Tools.log_print('expected 4 comma-separated tables in --outputs, got {0}.'.format(
                    len(output_tables)))

            # Vocabulary records for each id feature plus the duration statistics.
            art_mapping = mapping_func("artist", art_to_id)
            lang_mapping = mapping_func("language", lang_to_id)
            release_mapping = mapping_func("release", release_to_id)
            mapping_records = art_mapping + lang_mapping + release_mapping + \
                [["duration", "mean", duration_mean], ["duration", "std", duration_std]]
            Tools.log_print('finished testing...')
# Script entry point: run the whole train / validate / test pipeline.
if __name__ == "__main__":
    train()
更多推荐
所有评论(0)