7. Ensemble Learning
What is ensemble learning
The two core tasks of machine learning
Boosting and Bagging in ensemble learning
Bagging
Ensemble principle
Implementation workflow
Random forest construction process
Interview question
Out-of-bag estimate (Out-of-Bag Estimate)
Definition
Uses
Random forest API
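A minimal sketch of the scikit-learn random forest constructor as it is used in the case studies below; the parameter values shown here are placeholders, not recommendations:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,     # number of trees in the forest
    criterion="gini",     # split quality measure, "gini" or "entropy"
    max_depth=None,       # maximum depth of each tree
    max_features="sqrt",  # number of features considered at each split
    min_samples_leaf=1,   # minimum number of samples required at a leaf
    bootstrap=True,       # draw bootstrap samples when building each tree
    oob_score=False,      # estimate accuracy on the out-of-bag samples
    n_jobs=-1,            # build the trees in parallel
    random_state=None)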
Advantages of bagging ensembles
Random forest case study (Titanic passenger survival prediction)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Load the data
titan = pd.read_csv("titanic.csv")
# Basic data preprocessing
# Select the feature columns and the target column
x = titan[["pclass","age","sex"]].copy()
y = titan["survived"]
# Handle missing values: fill missing ages with the mean age
x["age"] = x["age"].fillna(titan["age"].mean())
# Split into training and test sets
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=22,test_size=0.2)
# Feature engineering: dictionary feature extraction (one-hot encodes the categorical columns)
x_train = x_train.to_dict(orient="records")
x_test = x_test.to_dict(orient="records")
transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
# Use transform (not fit_transform) on the test set so it shares the fitted vocabulary
x_test = transfer.transform(x_test)
# Machine learning: decision tree (survival prediction is a classification task)
estimator = DecisionTreeClassifier(max_depth=5)
estimator.fit(x_train,y_train)
# Model evaluation
print("Score:\n",estimator.score(x_test,y_test))
rf = RandomForestClassifier()
# Hyperparameter tuning with grid search
param = {"n_estimators":[100,120,300],"max_depth":[3,7,11]}
gc = GridSearchCV(rf,param_grid=param,cv=3)
gc.fit(x_train,y_train)
print("Random forest test accuracy:\n",gc.score(x_test,y_test))
Otto case study (Otto Group Product Classification Challenge)
Dataset description
Evaluation metric
Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
Load the data
data = pd.read_csv("train.csv")
data.head()
Basic data preprocessing
## The class distribution is imbalanced
sns.countplot(data.target)
plt.show()
# Use random under-sampling to obtain a balanced dataset
## Select the feature columns and the target column
y = data["target"]
x = data.drop(["id","target"],axis=1)
x.head(),y.head()
## Under-sample the data
rus = RandomUnderSampler(random_state=0)
X_resampled,Y_resampled = rus.fit_resample(x,y)
sns.countplot(Y_resampled)
plt.show()
# Encode the string labels as integers
le = LabelEncoder()
Y_resampled = le.fit_transform(Y_resampled)
# Split the data
x_train,x_test,y_train,y_test = train_test_split(X_resampled,Y_resampled,test_size=0.2,random_state=22)
x_train.shape,y_train.shape,x_test.shape,y_test.shape
Model training
## Enable out-of-bag (OOB) estimation
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train,y_train)
y_pre = rf.predict(x_test)
score = rf.score(x_test,y_test)
rf.oob_score_ #0.7587845622119815
score #0.7840483731644111
Scoring
# Convert the labels and hard predictions to one-hot format before computing log_loss
one_hot = OneHotEncoder(sparse=False)
y_test1 = one_hot.fit_transform(y_test.reshape(-1,1))
y_pre1 = one_hot.fit_transform(y_pre.reshape(-1,1))
log_loss(y_test1,y_pre1,eps=1e-15,normalize=True) #7.4587049513916055
Change the prediction output to class probabilities (predict_proba) instead of hard 0/1 labels to lower the log loss: with hard one-hot predictions, every misclassified sample contributes about -log(1e-15) ≈ 34.5 to its loss term, whereas probability outputs are penalized far more gently.
y_pre_probae = rf.predict_proba(x_test)
y_pre_probae
log_loss(y_test1,y_pre_probae,eps=1e-15,normalize=True)
Model tuning
Determine the optimal n_estimators
tuned_parameters = range(10,200,10)
# Array to record the OOB accuracy for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Array to record the log loss for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=one_parameter,
        max_depth=10,
        max_features=10,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss on the test set
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)
# Visualize the tuning results
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)
axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, n_estimators=170 performs well.
Determine the optimal max_features
tuned_parameters = range(5,40,5)
# Array to record the OOB accuracy for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Array to record the log loss for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=10,
        max_features=one_parameter,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss on the test set
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)
# Visualize the tuning results
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)
axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_features=15 performs well.
Determine the optimal max_depth
tuned_parameters = range(10,100,10)
# Array to record the OOB accuracy for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Array to record the log loss for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=one_parameter,
        max_features=15,
        min_samples_leaf=10,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss on the test set
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)
# Visualize the tuning results
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)
axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_depth=30 performs well.
Determine the optimal min_samples_leaf
tuned_parameters = range(1,10,2)
# Array to record the OOB accuracy for each candidate value
accuracy_t = np.zeros(len(tuned_parameters))
# Array to record the log loss for each candidate value
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=170,
        max_depth=30,
        max_features=15,
        min_samples_leaf=one_parameter,
        oob_score=True,
        n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record the OOB accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss on the test set
    y_pre_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_pre_proba,eps=1e-15,normalize=True)
# Visualize the tuning results
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)
axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, min_samples_leaf=1 performs well.
Determine the optimal model
rf3 = RandomForestClassifier(
n_estimators=170,
max_depth=30,
max_features=15,
min_samples_leaf=1,
oob_score=True,
random_state=40,
n_jobs=-1)
rf3.fit(x_train,y_train)
rf3.score(x_test,y_test) #0.788367405701123
rf3.oob_score_ #0.7647609447004609
y_pre_probal = rf3.predict_proba(x_test)
log_loss(y_test,y_pre_probal) #0.6964344507957512
Generate the submission data
test_data = pd.read_csv("test.csv")
test_data.head()
test_data_drop_id = test_data.drop(["id"],axis=1)
test_data_drop_id.head()
y_pre_test = rf3.predict_proba(test_data_drop_id)
y_pre_test
result_data = pd.DataFrame(y_pre_test,columns=["Class_"+str(i) for i in range(1,10)])
result_data.head()
result_data.insert(loc=0,column="id",value=test_data.id)
result_data.head()
result_data.to_csv("submission.csv",index=False)
Boosting
Implementation process
Differences between bagging and boosting ensembles
- Difference 1: data
  - Bagging: trains on (bootstrap) samples of the data;
  - Boosting: adjusts the weights of the data according to the previous round's results.
- Difference 2: voting
  - Bagging: all learners vote with equal weight;
  - Boosting: learners vote with different weights.
- Difference 3: learning order
  - Bagging learns in parallel; the learners do not depend on each other;
  - Boosting learns sequentially, with a fixed training order.
- Difference 4: main purpose
  - Bagging is mainly used to improve generalization (it counters overfitting, i.e. reduces variance);
  - Boosting is mainly used to improve training accuracy (it counters underfitting, i.e. reduces bias); see the minimal sketch after this list.
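As a minimal illustration of this contrast (a sketch only, assuming the Otto x_train/y_train/x_test/y_test from above), scikit-learn's BaggingClassifier trains its learners independently on bootstrap samples and lets them vote equally, while AdaBoostClassifier trains weak learners one after another and reweights the samples between rounds:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

# Bagging: independent learners on bootstrap samples, equal-weight voting
bag = BaggingClassifier(n_estimators=50, n_jobs=-1)
bag.fit(x_train, y_train)

# Boosting: learners trained sequentially; each round reweights the samples
# that earlier learners misclassified, and learners get weighted votes
boost = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
boost.fit(x_train, y_train)

print("Bagging test accuracy:", bag.score(x_test, y_test))
print("AdaBoost test accuracy:", boost.score(x_test, y_test))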
AdaBoost (overview)
Construction process
Example
API
GBDT (overview)
Decision Tree: CART regression trees
Gradient Boosting: fitting the negative gradient
Principle
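To make the principle concrete, here is a small from-scratch sketch (not part of the original notes) of gradient boosting for squared-error regression: each round fits a CART regression tree to the negative gradient of the loss, which for squared error is simply the current residual. The helper names gbdt_fit and gbdt_predict are illustrative only.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gbdt_fit(X, y, n_rounds=100, learning_rate=0.1, max_depth=3):
    # Initialize the model with a constant prediction: the mean of y
    f0 = float(np.mean(y))
    pred = np.full(len(y), f0)
    trees = []
    for _ in range(n_rounds):
        # Negative gradient of 1/2 * (y - F(x))^2 is the residual y - F(x)
        residual = y - pred
        # Fit a CART regression tree to the residual
        tree = DecisionTreeRegressor(max_depth=max_depth)
        tree.fit(X, residual)
        # Move the model a small (shrunken) step toward the new tree's prediction
        pred += learning_rate * tree.predict(X)
        trees.append(tree)
    return f0, trees

def gbdt_predict(X, f0, trees, learning_rate=0.1):
    # Sum the constant initial prediction and the shrunken tree predictions
    pred = np.full(X.shape[0], f0)
    for tree in trees:
        pred += learning_rate * tree.predict(X)
    return pred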