10-minute read
Direct training API (xgt.train) — example
class TrainTest:
    """Demo of the native XGBoost training API on the iris data set.

    Loads ``./data/iris.csv``, builds stratified 70/30 train/validation
    ``DMatrix`` objects, and exposes two demos:

    * :meth:`train_test` — ``xgt.train`` with multiple eval metrics and
      early stopping.
    * :meth:`cv_test`   — ``xgt.cv`` k-fold cross-validation.

    Relies on module-level names defined elsewhere in this file:
    ``pd`` (pandas), ``xgt`` (xgboost), ``train_test_split`` (sklearn),
    and ``_label_map`` (class-name -> numeric label mapping).
    """

    def __init__(self):
        df = pd.read_csv('./data/iris.csv')
        _feature_names = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
        x = df[_feature_names]
        # NOTE(review): the objective below is binary:logistic, so
        # _label_map presumably yields 0/1 labels — confirm against its
        # definition elsewhere in this file.
        y = df['Class'].map(lambda name: _label_map[name])
        # Stratified 70/30 split with a fixed seed for reproducibility.
        train_X, test_X, train_Y, test_Y = train_test_split(
            x, y, test_size=0.3, stratify=y, shuffle=True, random_state=1)
        self._train_matrix = xgt.DMatrix(
            data=train_X, label=train_Y,
            feature_names=_feature_names,
            feature_types=['float', 'float', 'float', 'float'])
        self._validate_matrix = xgt.DMatrix(
            data=test_X, label=test_Y,
            feature_names=_feature_names,
            feature_types=['float', 'float', 'float', 'float'])

    def train_test(self):
        """Train a booster with early stopping and print its attributes.

        Early stopping watches the LAST metric in ``eval_metric`` (auc)
        on the LAST entry of ``evals`` (valid2), as the captured output
        below shows.
        """
        params = {
            'booster': 'gbtree',
            'eta': 0.01,
            'max_depth': 5,
            'tree_method': 'exact',
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'error', 'auc'],
        }
        eval_rst = {}
        booster = xgt.train(
            params, self._train_matrix, num_boost_round=20,
            evals=[(self._train_matrix, 'valid1'), (self._validate_matrix, 'valid2')],
            early_stopping_rounds=5, evals_result=eval_rst, verbose_eval=True)
        ## Training output:
        # Multiple eval metrics have been passed: 'valid2-auc' will be used for early stopping.
        # Will train until valid2-auc hasn't improved in 5 rounds.
        # [0] valid1-logloss:0.685684 valid1-error:0.042857 valid1-auc:0.980816 valid2-logloss:0.685749 valid2-error:0.066667 valid2-auc:0.933333
        # ...
        # Stopping. Best iteration:
        # [1] valid1-logloss:0.678149 valid1-error:0.042857 valid1-auc:0.99551 valid2-logloss:0.677882 valid2-error:0.066667 valid2-auc:0.966667
        print('booster attributes:', booster.attributes())
        # booster attributes: {'best_iteration': '1', 'best_msg': '[1]\tvalid1-logloss:0.678149\tvalid1-error:0.042857\tvalid1-auc:0.99551\tvalid2-logloss:0.677882\tvalid2-error:0.066667\tvalid2-auc:0.966667', 'best_score': '0.966667'}
        print('fscore:', booster.get_fscore())
        # fscore: {'Petal Length': 8, 'Petal Width': 7}
        print('eval_rst:', eval_rst)
        # eval_rst: {'valid1': {'logloss': [0.685684, 0.678149, 0.671075, 0.663787, 0.656948, 0.649895], 'error': [0.042857, 0.042857, 0.042857, 0.042857, 0.042857, 0.042857], 'auc': [0.980816, 0.99551, 0.99551, 0.99551, 0.99551, 0.99551]}, 'valid2': {'logloss': [0.685749, 0.677882, 0.670747, 0.663147, 0.656263, 0.648916], 'error': [0.066667, 0.066667, 0.066667, 0.066667, 0.066667, 0.066667], 'auc': [0.933333, 0.966667, 0.966667, 0.966667, 0.966667, 0.966667]}}

    def cv_test(self):
        """Run 3-fold stratified CV on the training matrix and print the history."""
        params = {
            'booster': 'gbtree',
            'eta': 0.01,
            'max_depth': 5,
            'tree_method': 'exact',
            'objective': 'binary:logistic',
            'eval_metric': ['logloss', 'error', 'auc'],
        }
        # metrics= overrides params['eval_metric'] for the CV run.
        eval_history = xgt.cv(
            params, self._train_matrix, num_boost_round=20,
            nfold=3, stratified=True, metrics=['error', 'auc'],
            early_stopping_rounds=5, verbose_eval=True, shuffle=True)
        ## Training output:
        # [0] train-auc:0.974306+0.00309697 train-error:0.0428743+0.0177703 test-auc:0.887626+0.0695933 test-error:0.112374+0.0695933
        # ....
        print('eval_history:', eval_history)
        # eval_history: test-auc-mean test-auc-std test-error-mean test-error-std \
        # 0 0.887626 0.069593 0.112374 0.069593
        # 1 0.925821 0.020752 0.112374 0.069593
        # 2 0.925821 0.020752 0.098485 0.050631
        # train-auc-mean train-auc-std train-error-mean train-error-std
        # 0 0.974306 0.003097 0.042874 0.01777
        # 1 0.987893 0.012337 0.042874 0.01777
        # 2 0.986735 0.011871 0.042874 0.01777
Student reviews