MLP多标签分类实现-tensorflow

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time: 2022/3/11 14:33
@Author:
@File: tf_ml_label.py
@ desc:
"""
import platform
import math
import tensorflow as tf
import time
import os
import sys
import numpy
_____miss

2088人浏览 · 2022-06-05 17:51:15
_____miss · 2022-06-05 17:51:15 发布
1、网络结构介绍
（1）3层MLP + sigmoid输出 + focal loss。
（2）数据划分：0.8的训练集，0.1的验证集，0.1的测试集。
（3）特征处理：id型特征先映射为整数id，其embedding向量随机初始化后再随网络一起训练；数值型特征有两种处理方式，一种是分桶后转换成id型特征再训练，另一种是通过标准化（减去均值、除以标准差）进行归一化。最终所有的特征一起做concat，形成一个1*n的向量作为模型的输入。
注：对于id型特征，需要保存真实特征与特征在模型训练中的id的映射表，类似NLP中的vocabulary。
（4）计算指标
a、训练过程，计算训练集和验证集的macro_f1和accuracy
b、测试过程，计算总的macro_f1，precision、recall、accuracy，以及每个类别对应的f1、precision、recall。
2、代码如下
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time    : 2022/3/11 14:33
@Author  : 
@File    : tf_ml_label.py
@ desc:  
"""
import json
import platform
import math
import tensorflow as tf
import time
import os
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.python.ops import array_ops


# Lower-cased operating-system name; other parts of this script compare it
# against "windows" to pick the local code path.
plat = platform.system().lower()

# Base name of the script that was launched.
file_name = os.path.basename(sys.argv[0])

# Default embedding size. The same value is used on every platform.
dimension = 32

# configure: command-line flags, declared through the TF1 ``tf.app.flags`` API.
_flags = tf.app.flags

# Input / output locations.
_flags.DEFINE_string("tables", "", "Tables info")
_flags.DEFINE_string("outputs", "", "test result output table")
_flags.DEFINE_string("checkpointDir", '', "Model checkpoint dir")

# Optimisation hyper-parameters.
_flags.DEFINE_integer("train_epochs", 10, "Number of epochs")
_flags.DEFINE_integer("batch_size", 128, "Number of batch size")
_flags.DEFINE_float("lr", 0.0001, "Learning rate")

# Model sizes.
_flags.DEFINE_integer("hidden_unit", 128, "Number of hidden neurons per layer")
_flags.DEFINE_integer("dimension", dimension, "Embedding size")

# NOTE(review): declared as a float flag although it is used as a step
# interval (``step % display_step``); kept as-is to preserve the CLI.
_flags.DEFINE_float("display_step", 100, "Display step")

FLAGS = _flags.FLAGS

# Echo the main model parameters so they show up in the run log.
for _caption, _setting in (
        ("train_epochs: ", FLAGS.train_epochs),
        ("batch_size: ", FLAGS.batch_size),
        ("lr: ", FLAGS.lr),
        ("hidden_unit: ", FLAGS.hidden_unit),
):
    print(_caption, _setting)

# Labels the model predicts; samples carrying none of them are dropped by the
# data loader. (The original comment on this list reads "era / decade".)
save_label = [
    "label1",
    "label2",
    "label3",
]


# #####################################
class Tools(object):
    """Small logging helpers used throughout the script."""

    @staticmethod
    def currect_time():
        """Return the local wall-clock time formatted as ``HH:MM:SS.mmm``.

        The method name keeps its original spelling so existing callers
        continue to work.
        """
        # Read the clock once: formatting the seconds from one reading and the
        # milliseconds from another can produce a timestamp that never existed
        # when the two readings straddle a second boundary.
        now = time.time()
        return time.strftime("%H:%M:%S", time.localtime(now)) + '.%03d' % (now % 1 * 1000)

    @staticmethod
    def log_print(content):
        """Print ``content`` prefixed with the current time in square brackets.

        Args:
            content: the message text; it is concatenated to the prefix, so it
                must be a ``str``.
        """
        print("[" + Tools.currect_time() + "] " + content)


######################################
# network
class Network(object):
    """MLP graph: dense features plus three id embeddings -> two ReLU layers -> logits.

    The constructor builds the whole TF1 graph. Callers feed the placeholders
    ``music_embedding``, ``art_id``, ``lang_id``, ``release_id``,
    ``batch_samples`` and ``golden`` and read the raw scores from ``logits``
    (no sigmoid is applied inside the network).
    """

    def __init__(self, art_num, lang_num, release_num):
        """Build the graph.

        Args:
            art_num: number of rows of the artist embedding table.
            lang_num: number of rows of the language embedding table.
            release_num: number of rows of the release embedding table.
        """
        # Sizes derived from the command-line flags and the label list.
        self.num_dimension = FLAGS.dimension + 1
        self.num_labels = len(save_label)
        self.num_hidden = FLAGS.hidden_unit
        self.art_dimension = FLAGS.dimension
        self.art_num = art_num
        self.lang_num = lang_num
        self.release_num = release_num

        # Row count of the current batch, fed as a one-element vector; it is
        # used to reshape the looked-up embeddings to two dimensions.
        self.batch_samples = tf.placeholder(tf.int32, shape=[1], name='batch_samples')

        # One embedding branch per id feature; all tables share one width.
        self.art_id, self.art_emb_table, self.art_embedding = self._id_embedding(
            'art_id', 'artist_embedding', self.art_num)
        self.lang_id, self.lang_emb_table, self.lang_embedding = self._id_embedding(
            'lang_id', 'lang_embedding', self.lang_num)
        self.release_id, self.release_emb_table, self.release_embedding = self._id_embedding(
            'release_id', 'release_embedding', self.release_num)

        # Dense input concatenated with the three embeddings along the feature axis.
        self.music_embedding = tf.placeholder("float", [None, self.num_dimension], name="music_embedding")
        branches = [self.music_embedding, self.art_embedding, self.lang_embedding, self.release_embedding]
        self.inputs = tf.concat(branches, axis=1)

        # Two hidden layers with ReLU.
        first_in_dim = self.num_dimension + self.art_dimension * 3
        hidden = self.fc('fc1', self.inputs, first_in_dim, self.num_hidden, True)
        hidden = self.fc('fc2', hidden, self.num_hidden, self.num_hidden, True)

        # Target placeholder and the linear output layer (one logit per label).
        self.golden = tf.placeholder("float", [None, self.num_labels])
        self.logits = self.fc('fc_out', hidden, self.num_hidden, self.num_labels, False)

        tf.add_to_collection('pred_network', self.logits)

    def _id_embedding(self, ids_name, table_name, vocab_size):
        """Create the placeholder, table and 2-D lookup result for one id feature.

        Returns:
            ``(ids_placeholder, embedding_table, embedded)`` where ``embedded``
            is reshaped to ``(batch_samples[0], embedding_width)``.
        """
        ids = tf.placeholder(tf.int32, shape=[None, 1], name=ids_name)
        table = tf.get_variable(
            name=table_name, shape=[vocab_size, self.art_dimension],
            initializer=tf.contrib.layers.xavier_initializer())
        looked_up = tf.nn.embedding_lookup(table, ids)
        # The lookup keeps the length-1 id axis; flatten it away.
        embedded = tf.reshape(looked_up, (self.batch_samples[0], looked_up.shape[-1]))
        return ids, table, embedded

    def get_weight_varible(self, name, shape):
        """Return a Xavier-initialised variable called ``name`` with ``shape``."""
        xavier = tf.contrib.layers.xavier_initializer()
        return tf.get_variable(name, shape=shape, initializer=xavier)

    def get_bias_varible(self, name, shape):
        """Return a bias variable; it uses the same Xavier initialiser as the weights."""
        xavier = tf.contrib.layers.xavier_initializer()
        return tf.get_variable(name, shape=shape, initializer=xavier)

    def fc(self, layer_name, x, inp_dim, out_dim, with_act):
        """Fully connected layer ``x @ w + b`` inside variable scope ``layer_name``.

        Args:
            layer_name: variable scope holding the layer's ``w`` and ``b``.
            x: input tensor; it is reshaped to ``[-1, inp_dim]`` first.
            inp_dim: input feature width.
            out_dim: output feature width.
            with_act: apply ReLU to the result when true.
        """
        with tf.variable_scope(layer_name):
            flat_x = tf.reshape(x, shape=[-1, inp_dim])
            weights = self.get_weight_varible('w', [inp_dim, out_dim])
            bias = self.get_bias_varible('b', [out_dim])
            out = tf.add(tf.matmul(flat_x, weights), bias)
            if not with_act:
                return out
            return tf.nn.relu(out)


#####################################
# main route
def save_model(saver, sess, model_path):
    """Log the destination, then write the session's variables with ``saver``.

    Args:
        saver: object exposing ``save(sess, path)``.
        sess: session whose variables are written.
        model_path: checkpoint path handed to ``saver.save``.
    """
    message = 'save model to {0}.'.format(model_path)
    Tools.log_print(message)
    saver.save(sess, model_path)


def load_model(saver, sess, model_path):
    """Restore variables from ``model_path`` into ``sess`` and log the outcome.

    Returns:
        Always ``True``; a failed restore propagates whatever exception
        ``saver.restore`` raises.
    """
    for stage in ('announce', 'restore', 'confirm'):
        if stage == 'announce':
            Tools.log_print('try to load model from {0}.'.format(model_path))
        elif stage == 'restore':
            saver.restore(sess, model_path)
        else:
            Tools.log_print('load model success')
    return True


def load_from_local():
    """Read the local sample workbook and return ``(samples, all_records)``.

    Returns:
        samples: one list per data row, ordered as
            ``[item_id, feature, tag, duration, artist_id, language,
            tagcount, collectcount, releasedate]``; ``feature`` and ``tag``
            are comma-split lists and ``tagcount`` is JSON-decoded.
        all_records: dict mapping the first cell of each row to the row
            without its first and last cells.
    """
    import xlrd

    workbook_path = "./data/local_sample.xlsx"
    sheet = xlrd.open_workbook(workbook_path).sheet_by_index(0)

    samples = []
    all_records = {}
    # Row 0 is skipped (presumably the header row).
    for row_idx in range(1, sheet.nrows):
        row = sheet.row_values(row_idx)
        item_id = row[0]
        all_records[item_id] = row[1:-1]
        samples.append([
            item_id,
            row[2].split(","),     # feature
            row[3].split(","),     # tag
            row[6],                # duration
            row[4],                # artist_id
            row[9],                # language
            json.loads(row[10]),   # tagcount
            row[11],               # collectcount
            row[12],               # releasedate
        ])

    return samples, all_records


def _release_bucket(releasedate):
    """Map a raw release date to its key in the release vocabulary.

    ``"0000"`` is used for missing, unparseable, zero or later-than-2022
    years, ``"1111"`` for years before 1970, and otherwise the year rounded
    down to a multiple of five (as a string).
    """
    if releasedate is None or releasedate == '\\N':
        return "0000"
    releaseyear = releasedate.split("-")[0]
    if releaseyear == "0000":
        return "0000"
    try:
        year = int(releaseyear)
    except ValueError:
        # Empty or non-numeric year: treat like a missing date instead of crashing.
        return "0000"
    if year > 2022:
        return "0000"
    if year < 1970:
        return "1111"
    return str(year - year % 5)


def load_data():
    """Load the samples, encode the features and split them 80/10/10.

    Each kept sample becomes one row made of its raw feature values followed
    by the artist id, language id, release-bucket id and the standardized
    duration (in that column order). Samples without any label from
    ``save_label`` are dropped.

    Returns:
        A 13-tuple ``(x_train, y_train, x_val, y_val, x_test, y_test,
        art_to_id, lang_to_id, releasedate_to_id, id_test, duration_mean,
        duration_std, all_records)``. The ``x_*`` / ``y_*`` entries are
        float32 arrays, the ``*_to_id`` entries are the vocabularies built
        while encoding, ``id_test`` lists the item ids of the test rows and
        ``duration_mean`` / ``duration_std`` are the statistics of the raw
        durations.

    Raises:
        NotImplementedError: on platforms other than Windows, because this
            function only has a local (workbook) data source.
        ValueError: if no sample carries any of the labels in ``save_label``.
    """
    if plat != 'windows':
        # Previously this fell through and failed later with an unbound-variable error.
        raise NotImplementedError(
            "load_data only has a local data source (plat == 'windows'); got plat=%r" % plat)
    samples, all_records = load_from_local()

    # Vocabularies; id 0 is reserved for "unk" in each of them.
    art_to_id = {"unk": 0}
    lang_to_id = {"unk": 0}
    releasedate_to_id = {"unk": 0, "0000": 1, "1111": 2}
    for bucket_id, year in enumerate(range(1970, 2025, 5), start=3):
        releasedate_to_id[str(year)] = bucket_id

    data_id, data_x, data_y, data_labels = [], [], [], []
    duration_fea = []

    for line in samples:
        item_id, feature, tag, duration, artist_id, language, tagcount, collectcount, releasedate = line
        tag = [x for x in tag if x in save_label]
        if not tag:
            continue

        # Assign the next free id on first sight. Ids are handed out
        # sequentially, so the current vocabulary size is the next id.
        art_to_id.setdefault(artist_id, len(art_to_id))
        lang_to_id.setdefault(language, len(lang_to_id))
        release_id = _release_bucket(releasedate)

        data_id.append(item_id)
        # Build a new row instead of appending to ``feature`` so the caller's
        # sample lists are not modified.
        data_x.append(feature + [
            str(art_to_id[artist_id]),
            str(lang_to_id[language]),
            str(releasedate_to_id[release_id]),
        ])
        data_labels.append(tag)
        duration_fea.append(float(duration))

        # Multi-hot encoding of the labels.
        multi_hot = [0] * len(save_label)
        for item in tag:
            multi_hot[save_label.index(item)] = 1
        data_y.append(multi_hot)

    if not data_x:
        raise ValueError("no sample carries any of the labels in save_label")

    # Standardize the duration; a constant column is left centred at zero
    # rather than being divided by a zero standard deviation.
    duration_fea = np.asarray(duration_fea, dtype=np.float64)
    duration_mean = np.mean(duration_fea)
    duration_std = np.std(duration_fea)
    scale = duration_std if duration_std > 0 else 1.0
    duration_fea = ((duration_fea - duration_mean) / scale).reshape(-1, 1)

    Tools.log_print('loading dataset data process...')

    # Convert the string features to numbers before stacking the duration column.
    data_x = np.hstack((np.array(data_x).astype(np.float32), duration_fea)).astype(np.float32)
    data_y = np.array(data_y).astype(np.float32)

    # 80% train, then the remaining 20% is halved into validation and test.
    x_train, x_no_train, y_train, y_no_train, id_train, id_no_train = train_test_split(
        data_x, data_y, data_id, test_size=0.2, random_state=6)
    x_val, x_test, y_val, y_test, id_val, id_test = train_test_split(
        x_no_train, y_no_train, id_no_train, test_size=0.5, random_state=6)

    return x_train, y_train, x_val, y_val, x_test, y_test, art_to_id, lang_to_id, releasedate_to_id, id_test, \
        duration_mean, duration_std, all_records


def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.75, gamma=2):
    r"""Mean sigmoid focal loss for multi-label targets.

    With ``p = sigmoid(prediction_tensor)`` and ``z = target_tensor`` the
    per-entry loss is

        FL = -alpha * (z - p)^gamma * log(p)        where z > 0
        FL = -(1 - alpha) * p^gamma * log(1 - p)    elsewhere

    and both logarithm arguments are clipped to ``[1e-8, 1.0]``.

    Args:
        prediction_tensor: float tensor of logits.
        target_tensor: float tensor of targets with the same shape as
            ``prediction_tensor``; entries greater than zero count as positive.
        weights: optional factor multiplied into the per-entry loss before
            averaging.
        alpha: weight of the positive term (default 0.75); the negative term
            is weighted by ``1 - alpha``.
        gamma: focusing exponent (default 2).

    Returns:
        Scalar tensor: the mean of the (optionally weighted) per-entry loss.
    """
    prob = tf.nn.sigmoid(prediction_tensor)
    zero = array_ops.zeros_like(prob, dtype=prob.dtype)

    # Modulating factor of the positive term: (z - p) where z > 0, else 0.
    pos_factor = array_ops.where(target_tensor > zero, target_tensor - prob, zero)
    # Modulating factor of the negative term: p where z <= 0, else 0.
    neg_factor = array_ops.where(target_tensor > zero, zero, prob)

    log_prob = tf.log(tf.clip_by_value(prob, 1e-8, 1.0))
    log_not_prob = tf.log(tf.clip_by_value(1.0 - prob, 1e-8, 1.0))

    pos_term = - alpha * (pos_factor ** gamma) * log_prob
    neg_term = (1 - alpha) * (neg_factor ** gamma) * log_not_prob
    entry_loss = pos_term - neg_term

    if weights is None:
        return tf.reduce_mean(entry_loss)
    return tf.reduce_mean(entry_loss * weights)


def macro_f1_func(y, y_hat, thresh=0.5):
    """Compute per-label counts and precision / recall / F1 for one batch.

    Args:
        y: float tensor of 0/1 labels, shape (BATCH_SIZE, N_LABELS). It is
            multiplied with a float32 tensor, so it must be float32 as well.
        y_hat: float32 tensor of probabilities, shape (BATCH_SIZE, N_LABELS).
        thresh: a label is predicted positive when its probability is
            strictly greater than this value.

    Returns:
        A 9-tuple ``(tp, fp, fn, pre, rec, f1, mean_cls_pre, mean_cls_rec,
        mean_cls_f1)``. The first six are per-label vectors of length
        N_LABELS; the last three are the means of ``pre``, ``rec`` and ``f1``
        over the labels (the macro averages).
    """
    # Guards every ratio against a zero denominator. Without it precision is
    # NaN for a label that is never predicted in the batch (and recall for a
    # label that never occurs), which also poisons the macro means.
    eps = 1e-16

    y_pred = tf.cast(tf.greater(y_hat, thresh), tf.float32)
    # Counts are taken along the batch axis, giving one value per label.
    tp = tf.cast(tf.math.count_nonzero(y_pred * y, axis=0), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (tf.ones_like(y) - y), axis=0), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((tf.ones_like(y_pred) - y_pred) * y, axis=0), tf.float32)

    pre = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2 * tp / (2 * tp + fn + fp + eps)

    mean_cls_pre = tf.reduce_mean(pre)
    mean_cls_rec = tf.reduce_mean(rec)
    mean_cls_f1 = tf.reduce_mean(f1)
    return tp, fp, fn, pre, rec, f1, mean_cls_pre, mean_cls_rec, mean_cls_f1


def mapping_func(name, feat_to_id):
    """Flatten a feature vocabulary into ``[name, key, id_string]`` rows.

    Args:
        name: feature name repeated as the first element of every row.
        feat_to_id: mapping from raw feature value to its numeric id.

    Returns:
        One three-element list per entry, in the mapping's iteration order;
        the id is converted with ``int`` and then ``str``.
    """
    return [[name, raw_value, str(int(feat_id))] for raw_value, feat_id in feat_to_id.items()]


def _iter_batches(data_x, data_y, batchsize):
    """Yield consecutive ``(x, y)`` row slices holding at most ``batchsize`` rows."""
    for start in range(0, len(data_x), batchsize):
        stop = start + batchsize
        yield data_x[start:stop, :], data_y[start:stop, :]


def _build_feed_dict(network, batch_x, batch_y):
    """Split one feature batch into the network's placeholders.

    The last four columns of ``batch_x`` are read as artist id, language id,
    release id and duration; everything before them is the dense feature
    block, to which the duration column is appended.
    """
    dense_x = np.hstack((batch_x[:, :-4], batch_x[:, -1].reshape(-1, 1)))
    return {
        network.music_embedding: dense_x,
        network.batch_samples: [len(batch_x)],
        network.art_id: batch_x[:, -4].reshape(-1, 1).astype(int),
        network.lang_id: batch_x[:, -3].reshape(-1, 1).astype(int),
        network.release_id: batch_x[:, -2].reshape(-1, 1).astype(int),
        network.golden: batch_y,
    }


def _evaluate(sess, network, loss_op, macro_f1_op, data_x, data_y, batchsize):
    """Run the model over a whole dataset without training.

    Returns:
        ``(mean_loss, tp, fp, fn, logits)``: the sample-weighted mean loss,
        the per-label true-positive / false-positive / false-negative counts
        summed over all batches, and the logits of every row as a list.
    """
    weighted_loss = 0
    tp_parts, fp_parts, fn_parts = [], [], []
    logits = []
    for batch_x, batch_y in _iter_batches(data_x, data_y, batchsize):
        batch_loss, batch_f1, batch_logits = sess.run(
            [loss_op, macro_f1_op, network.logits],
            feed_dict=_build_feed_dict(network, batch_x, batch_y))
        tp, fp, fn = batch_f1[:3]
        # The batch loss is a mean, so weight it by the batch size.
        weighted_loss += batch_loss * len(batch_x)
        tp_parts.append(tp)
        fp_parts.append(fp)
        fn_parts.append(fn)
        logits += batch_logits.tolist()

    # Divide by the size of the dataset that was actually evaluated.
    mean_loss = weighted_loss / len(data_x)
    return mean_loss, np.sum(tp_parts, axis=0), np.sum(fp_parts, axis=0), np.sum(fn_parts, axis=0), logits


def train():
    """Train the network, save it, then evaluate it on the test split.

    Steps: load and split the data, build the graph with a focal loss and an
    Adam optimizer, train for ``FLAGS.train_epochs`` epochs (logging training
    and validation metrics every ``FLAGS.display_step`` steps), save and
    restore the checkpoint, and finally print overall and per-label test
    metrics.

    NOTE: the training log, test indicators, per-sample records and mapping
    records are assembled but nothing in this function writes them out.
    """
    if plat == "windows":
        model_dir = os.path.join("model", 'mll')
    else:
        model_dir = os.path.join(FLAGS.checkpointDir, 'mll')

    Tools.log_print('loading dataset...')
    train_x, train_y, val_x, val_y, test_x, test_y, art_to_id, lang_to_id, release_to_id, test_id, \
        duration_mean, duration_std, all_records = load_data()

    # Per-label loss weights: the rarest label gets 1.0, more frequent labels less.
    label_samples = train_y.sum(axis=0)
    weight = min(label_samples) / label_samples
    print("the number of artist: ", len(art_to_id))

    Tools.log_print('building network...')
    network = Network(art_num=len(art_to_id), lang_num=len(lang_to_id), release_num=len(release_to_id))

    # training parameters
    learning_rate = FLAGS.lr
    train_epochs = FLAGS.train_epochs
    batchsize = FLAGS.batch_size
    display_step = FLAGS.display_step

    # focal loss with the per-label weights
    loss_op = focal_loss(prediction_tensor=network.logits, target_tensor=network.golden, weights=weight)

    # optimizer and train operator
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss_op)

    # accuracy: a sample counts as correct only when every label matches
    correct_op = tf.equal(tf.cast(tf.greater_equal(tf.sigmoid(network.logits), 0.5), tf.int32),
                          tf.cast(network.golden, tf.int32))
    accuracy_op = tf.reduce_mean(tf.reduce_min(tf.cast(correct_op, tf.float32), 1))

    macro_f1_op = macro_f1_func(network.golden, tf.sigmoid(network.logits), 0.5)

    # initialize the variables (i.e. assign their default value)
    init_op = tf.global_variables_initializer()
    Tools.log_print('build network success.\n')

    train_log = []
    test_indicator = []
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # run the initializer
        sess.run(init_op)

        # run epochs
        Tools.log_print('start training...')
        train_steps = math.ceil(len(train_x) / batchsize)
        for epoch in range(1, train_epochs + 1):
            total_loss = 0.0
            total_acc = 0.0
            total_macro_f1 = 0.0

            for step, (batch_x, batch_y) in enumerate(_iter_batches(train_x, train_y, batchsize), 1):
                _, batch_loss, batch_acc, f1_result = sess.run(
                    [train_op, loss_op, accuracy_op, macro_f1_op],
                    feed_dict=_build_feed_dict(network, batch_x, batch_y))

                total_loss += batch_loss
                total_acc += batch_acc
                total_macro_f1 += f1_result[-1]

                if step % display_step != 0:
                    continue

                # Training metrics averaged over the last display window.
                avg_loss = format(total_loss / display_step, ".4f")
                avg_acc = total_acc / display_step
                avg_f1 = format(total_macro_f1 / display_step, ".4f")

                total_loss = 0.0
                total_acc = 0.0
                total_macro_f1 = 0.0

                # Validation metrics over the whole validation split.
                val_mean_loss, val_tp, val_fp, val_fn, _ = _evaluate(
                    sess, network, loss_op, macro_f1_op, val_x, val_y, batchsize)
                val_loss = format(val_mean_loss, ".4f")
                val_macro_f1 = format(np.mean(2 * val_tp / (2 * val_tp + val_fn + val_fp + 1e-16)), ".4f")

                Tools.log_print("Epoch[%d/%d] Step[%d/%d] Train Minibatch Loss= %s, macro_f1=%s, "
                                "Accuracy= %.4f, val_loss= %s, val_macro_f1= %s" %
                                (epoch, train_epochs, step, train_steps, avg_loss, avg_f1, avg_acc, val_loss,
                                 val_macro_f1))

                # The first field is the running index of the log entry.
                train_log.append((len(train_log), "{}/{}".format(epoch, train_epochs),
                                  "{}/{}".format(step, train_steps), avg_loss, avg_f1,
                                  val_loss, val_macro_f1))

        saver.save(sess, save_path=model_dir)
        Tools.log_print("finished training!")

        # load model and test
        Tools.log_print('start testing...')
        if not load_model(saver, sess, model_dir):
            raise RuntimeError('failed to load model from {0}'.format(model_dir))

        # Test loss is averaged over the test split (it used to be divided
        # by the size of the validation split).
        test_loss, test_tp, test_fp, test_fn, prediction_y = _evaluate(
            sess, network, loss_op, macro_f1_op, test_x, test_y, batchsize)

        # Same zero-denominator guard as the F1 score, so a label that is never
        # predicted (or never present) reports 0 instead of NaN.
        test_precision = test_tp / (test_tp + test_fp + 1e-16)
        test_recall = test_tp / (test_tp + test_fn + 1e-16)
        test_f1_score = 2 * test_tp / (2 * test_tp + test_fn + test_fp + 1e-16)

        # Test accuracy: turn the logits into probabilities, then require all labels to match.
        prediction_y = tf.sigmoid(np.array(prediction_y)).eval()
        correct = tf.equal(tf.cast(tf.greater_equal(prediction_y, 0.5), tf.int32),
                           tf.cast(test_y, tf.int32))
        test_acc = tf.reduce_mean(tf.reduce_min(tf.cast(correct, tf.float32), 1)).eval()

        mean_f1_score = format(np.mean(test_f1_score), ".4f")
        mean_precision = format(np.mean(test_precision), ".4f")
        mean_recall = format(np.mean(test_recall), ".4f")

        Tools.log_print("test_loss=%.4f, test_accuracy=%.4f, test_macro_f1=%s, test_macro_precision=%s, "
                        "test_macro_recall=%s" % (test_loss, test_acc, mean_f1_score, mean_precision, mean_recall))
        test_indicator.append(("total", mean_f1_score, mean_precision, mean_recall, "", "", "", ""))

        # Per-label precision / recall / F1 and confusion matrix.
        for i, label in enumerate(save_label):
            tp_i = int(test_tp[i])
            fp_i = int(test_fp[i])
            fn_i = int(test_fn[i])
            tn_i = len(test_y) - tp_i - fp_i - fn_i
            cf_mx = np.array([[tp_i, fn_i], [fp_i, tn_i]])

            f1 = format(test_f1_score[i], ".4f")
            pre = format(test_precision[i], ".4f")
            rec = format(test_recall[i], ".4f")
            print("{}：macro_f1={}, precision={}, recall={}".format(label, f1, pre, rec))
            print(cf_mx)
            print()
            test_indicator.append((label, f1, pre, rec, tp_i, fn_i, fp_i, tn_i))

    # One record per test sample: its true labels, predicted labels and probabilities.
    write_recs = []
    prediction_y = prediction_y.tolist()
    for index, true_row in enumerate(test_y):
        test_tmp = [save_label[_x] for _x, _y in enumerate(true_row) if int(_y) == 1]

        label_prob = dict(zip(save_label, [format(x, ".4f") for x in prediction_y[index]]))
        # Highest probability first; the fixed "0.dddd" format sorts correctly as text.
        label_prob = dict(sorted(label_prob.items(), key=lambda x: x[1], reverse=True))

        pre_tmp = [k for k, v in label_prob.items() if float(v) > 0.5]

        features = all_records[test_id[index]]
        tmp = (test_id[index], ",".join(test_tmp), ",".join(pre_tmp), features[10], features[9],
               features[0], features[6], features[7], features[5], json.dumps(label_prob, ensure_ascii=False))
        write_recs.append(tmp)

    # Output table names are only unpacked when the flag is set: with the
    # default empty string the four-way unpacking raises ValueError after the
    # whole training run has finished.
    if FLAGS.outputs:
        map_table, log_table, indicator_table, label_table = FLAGS.outputs.split(",")

    # Mapping records for every id feature plus the duration statistics.
    art_mapping = mapping_func("artist", art_to_id)
    lang_mapping = mapping_func("language", lang_to_id)
    release_mapping = mapping_func("release", release_to_id)
    mapping_records = art_mapping + lang_mapping + release_mapping + \
        [["duration", "mean", duration_mean], ["duration", "std", duration_std]]

    Tools.log_print('finished testing...')


# Script entry point: run training (followed by the test evaluation inside
# ``train``) only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    train()