TVP

# 交叉验证和超参数调整：如何优化你的机器学习模型

K-fold交叉验证

Python中的K-fold交叉验证

# Compare the cross-validation performance of a set of predetermined models.
def cv_comparison(models, X, y, cv):
    """Cross-validate each model and collect MAE, MSE, R^2 and accuracy.

    Returns a DataFrame of per-model average scores (one column per model)
    plus four lists holding the raw per-fold scores for every model.
    """
    # Averages table and per-fold score collectors.
    cv_accuracies = pd.DataFrame()
    fold_maes = []
    fold_mses = []
    fold_r2s = []
    fold_accs = []

    # Score every model; keep both the per-fold arrays and their averages.
    for model in models:
        # sklearn reports negated errors, so flip the sign back.
        mae = -np.round(cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv), 4)
        fold_maes.append(mae)
        mae_avg = round(mae.mean(), 4)

        mse = -np.round(cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv), 4)
        fold_mses.append(mse)
        mse_avg = round(mse.mean(), 4)

        r2 = np.round(cross_val_score(model, X, y, scoring='r2', cv=cv), 4)
        fold_r2s.append(r2)
        r2_avg = round(r2.mean(), 4)

        # "Accuracy" = 100 minus MAE as a percentage of the mean target
        # (mae * len(X) / sum(y) == mae / mean(y)).
        acc = np.round((100 - (100 * (mae * len(X))) / sum(y)), 4)
        fold_accs.append(acc)
        acc_avg = round(acc.mean(), 4)

        cv_accuracies[str(model)] = [mae_avg, mse_avg, r2_avg, acc_avg]

    cv_accuracies.index = ['Mean Absolute Error', 'Mean Squared Error', 'R^2', 'Accuracy']
    return cv_accuracies, fold_maes, fold_mses, fold_r2s, fold_accs

# Instantiate the candidate models to be compared.
mlr_reg = LinearRegression()
rf_reg = RandomForestRegressor(random_state=42)
# Both aliases of the original are preserved for any later references.
xgb_reg = xgb_regressor = XGBRegressor(random_state=42)

# Collect the models for the cross-validation comparison.
models = [mlr_reg, rf_reg, xgb_reg]

# Run the 4-fold cross-validation comparison on the training split.
comp, maes, mses, r2s, accs = cv_comparison(models, X_train_temp, y_train_temp, 4)

# Tabulate the per-fold R^2 values: one row per model, one column per fold.
fold_labels = ['1st Fold', '2nd Fold', '3rd Fold', '4th Fold']
r2_comp = pd.DataFrame(r2s, index=comp.columns, columns=fold_labels)

# Append the row-wise mean of the four folds as an extra column.
r2_comp['Average'] = np.round(r2_comp.mean(axis=1), 4)

# --- Random Forest search space ---

# Number of trees: 200..1000 in steps of 200, plus two larger options.
rf_n_estimators = [int(n) for n in np.linspace(200, 1000, 5)] + [1500, 2000]

# Maximum tree depth: 5..55 in steps of 5, plus the default (grow fully).
rf_max_depth = [int(d) for d in np.linspace(5, 55, 11)]
rf_max_depth.append(None)

# Number of features considered at each split.
# NOTE(review): 'auto' and the 'mse'/'mae' criteria below are legacy sklearn
# names; newer sklearn versions renamed/removed them — confirm the installed
# version accepts them.
rf_max_features = ['auto', 'sqrt', 'log2']

# Criterion used to measure split quality.
rf_criterion = ['mse', 'mae']

# Minimum number of samples required to split an internal node (2..10).
rf_min_samples_split = [int(s) for s in np.linspace(2, 10, 9)]

# Minimum impurity decrease required for a split to happen.
rf_min_impurity_decrease = [0.0, 0.05, 0.1]

# Whether bootstrap samples are used when building each tree.
rf_bootstrap = [True, False]

# Assemble the parameter grid for the randomized search.
rf_grid = {
    'n_estimators': rf_n_estimators,
    'max_depth': rf_max_depth,
    'max_features': rf_max_features,
    'criterion': rf_criterion,
    'min_samples_split': rf_min_samples_split,
    'min_impurity_decrease': rf_min_impurity_decrease,
    'bootstrap': rf_bootstrap,
}

# Base estimator to be tuned.
rf_base = RandomForestRegressor()

# Randomized search: 200 sampled candidates, 3-fold CV, all CPU cores.
rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_grid,
    n_iter=200,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the search on the training split.
rf_random.fit(X_train_temp, y_train_temp)

# Inspect the best parameter combination found by the search.
rf_random.best_params_

XGBoost 的超参数调整

# --- XGBoost search space ---

# Number of boosting rounds: 200..2000 in steps of 200.
xgb_n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]

# Maximum tree depth: 2..20 in steps of 2.
xgb_max_depth = [int(x) for x in np.linspace(2, 20, 10)]

# Minimum sum of instance weight needed in a child node: 1..10.
xgb_min_child_weight = [int(x) for x in np.linspace(1, 10, 10)]

# Tree construction algorithm used by XGBoost.
xgb_tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']

# Learning rate candidates: 0.1..0.6.
xgb_eta = [x for x in np.linspace(0.1, 0.6, 6)]

# Minimum loss reduction required to make a further partition.
# BUG FIX: the original cast each value to int, which truncated every
# candidate in [0, 0.5] to 0 and collapsed the gamma search space to a
# single value; keep the intended floats 0.0, 0.1, ..., 0.5 instead.
xgb_gamma = [float(x) for x in np.linspace(0, 0.5, 6)]

# Learning objective used.
xgb_objective = ['reg:squarederror', 'reg:squaredlogerror']

# Assemble the parameter grid for the randomized search.
xgb_grid = {'n_estimators': xgb_n_estimators,
            'max_depth': xgb_max_depth,
            'min_child_weight': xgb_min_child_weight,
            'tree_method': xgb_tree_method,
            'eta': xgb_eta,
            'gamma': xgb_gamma,
            'objective': xgb_objective}

# Base estimator to be tuned.
xgb_base = XGBRegressor()

# Randomized search over the XGBoost grid (the original comment mislabelled
# this as a Random Forest search): 200 candidates, 3-fold CV, all CPU cores.
xgb_random = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=xgb_grid,
    n_iter=200,
    cv=3,
    verbose=2,
    random_state=420,
    n_jobs=-1,
)

# Fit the search on the training split.
xgb_random.fit(X_train_temp, y_train_temp)

# Retrieve the optimal parameters found by the search.
xgb_random.best_params_

# Final Multiple Linear Regression (no hyperparameters to tune).
mlr_final = LinearRegression()

# Final Random Forest with the hyperparameters chosen by the search.
rf_final = RandomForestRegressor(n_estimators=200,
                                 min_samples_split=6,
                                 min_impurity_decrease=0.0,
                                 max_features='sqrt',
                                 max_depth=25,
                                 criterion='mae',
                                 bootstrap=True,
                                 random_state=42)

# Final Extreme Gradient Booster with the hyperparameters chosen by the search.
xgb_final = XGBRegressor(tree_method='exact',
                         objective='reg:squarederror',
                         n_estimators=1600,
                         min_child_weight=6,
                         max_depth=8,
                         gamma=0,
                         eta=0.1,
                         random_state=42)

# Train all three final models using 80% of the original data.
for estimator in (mlr_final, rf_final, xgb_final):
    estimator.fit(X_train_temp, y_train_temp)

# Compare all final (fitted) models on held-out data.
def final_comparison(models, test_features, test_labels):
    """Score each fitted model on the test set.

    Returns a DataFrame with one column per model and rows for MAE, MSE,
    R^2 and an accuracy figure defined as 100 - MAPE.
    """
    scores = pd.DataFrame()
    for model in models:
        preds = model.predict(test_features)
        mae = round(mean_absolute_error(test_labels, preds), 4)
        mse = round(mean_squared_error(test_labels, preds), 4)
        r2 = round(r2_score(test_labels, preds), 4)
        # Mean absolute percentage error; "accuracy" is its complement.
        # NOTE(review): this divides by the labels, so it assumes no
        # zero-valued targets — confirm for the dataset in use.
        mape = 100 * np.mean(abs(preds - test_labels) / test_labels)
        accuracy = round(100 - mape, 4)
        scores[str(model)] = [mae, mse, r2, accuracy]
    scores.index = ['Mean Absolute Error', 'Mean Squared Error', 'R^2', 'Accuracy']
    return scores

# Evaluate the three final models on the test set and relabel the columns
# with human-readable model names.
final_scores = final_comparison([mlr_final, rf_final, xgb_final], X_test, y_test)
final_scores.columns = ['Linear Regression', 'Random Forest', 'Extreme Gradient Boosting']

deephub翻译组：钱三一

DeepHub

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20200815A04IAV00?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

10元无门槛代金券