作者:章华燕
编辑:田 旭
四
XGBoost 参数详解
在运行XGboost之前,必须设置三种类型成熟:general parameters,booster parameters和task parameters:
01
General Parameters
02
Parameters for Tree Booster
03
Parameter for Linear Booster
04
Task Parameters
五
XGBoost 实战
XGBoost有两大类接口:XGBoost原生接口 和 scikit-learn接口 ,并且XGBoost能够实现 分类 和 回归 两种任务。因此,本章节分四个小块来介绍!
01
基于XGBoost原生接口的分类
from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# read in the iris data
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234565)
params = {
'booster': 'gbtree',
'objective': 'multi:softmax',
'num_class': 3,
'gamma': 0.1,
'max_depth': 6,
'lambda': 2,
'subsample': 0.7,
'colsample_bytree': 0.7,
'min_child_weight': 3,
'silent': 1,
'eta': 0.1,
'seed': 1000,
'nthread': 4,
}
plst = params.items()
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds)
# 对测试集进行预测
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)
# 计算准确率
cnt1 = 0
cnt2 = 0
for i in range(len(y_test)):
if ans[i] == y_test[i]:
cnt1 += 1
else:
cnt2 += 1
print("Accuracy: %.2f %% " % (100 * cnt1 / (cnt1 + cnt2)))
# 显示重要特征
plot_importance(model)
plt.show()
输出预测正确率以及特征重要性:
Accuracy: 96.67 %
02
基于XGBoost原生接口的回归
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# 读取文件原始数据
data = []
labels = []
labels2 = []
with open("lppz5.csv", encoding='UTF-8') as fileObject:
for line in fileObject:
line_split = line.split(',')
data.append(line_split[10:])
labels.append(line_split[8])
X = []
for row in data:
row = [float(x) for x in row]
X.append(row)
y = [float(x) for x in labels]
# XGBoost训练过程
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
params = {
'booster': 'gbtree',
'objective': 'reg:gamma',
'gamma': 0.1,
'max_depth': 5,
'lambda': 3,
'subsample': 0.7,
'colsample_bytree': 0.7,
'min_child_weight': 3,
'silent': 1,
'eta': 0.1,
'seed': 1000,
'nthread': 4,
}
dtrain = xgb.DMatrix(X_train, y_train)
num_rounds = 300
plst = params.items()
model = xgb.train(plst, dtrain, num_rounds)
# 对测试集进行预测
dtest = xgb.DMatrix(X_test)
ans = model.predict(dtest)
# 显示重要特征
plot_importance(model)
plt.show()
重要特征(值越大,说明该特征越重要)显示结果:
03
基于Scikit-learn接口的分类
from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# read in the iris data
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# 训练模型
model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='multi:softmax')
model.fit(X_train, y_train)
# 对测试集进行预测
ans = model.predict(X_test)
# 计算准确率
cnt1 = 0
cnt2 = 0
for i in range(len(y_test)):
if ans[i] == y_test[i]:
cnt1 += 1
else:
cnt2 += 1
print("Accuracy: %.2f %% " % (100 * cnt1 / (cnt1 + cnt2)))
# 显示重要特征
plot_importance(model)
plt.show()
输出预测正确率以及特征重要性:
Accuracy: 100.00 %
04
基于XGBoost原生接口的回归
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
# 读取文件原始数据
data = []
labels = []
labels2 = []
with open("lppz5.csv", encoding='UTF-8') as fileObject:
for line in fileObject:
line_split = line.split(',')
data.append(line_split[10:])
labels.append(line_split[8])
X = []
for row in data:
row = [float(x) for x in row]
X.append(row)
y = [float(x) for x in labels]
# XGBoost训练过程
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='reg:gamma')
model.fit(X_train, y_train)
# 对测试集进行预测
ans = model.predict(X_test)
# 显示重要特征
plot_importance(model)
plt.show()
重要特征(值越大,说明该特征越重要)显示结果: