更新《机器学习算法竞赛实战》一书的阅读笔记,更多详细的内容请阅读原书。本文的主要内容包含:
针对具体问题的建模分为3个部分:
影响数据质量的4个原因:
解决数据集过大或者正负样本不均衡的方法:
什么场景下需要处理样本不均衡问题?
# Example: generating 10-fold cross-validation splits with scikit-learn.
from sklearn.model_selection import KFold

NFOLDS = 10  # number of folds
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2023)

for trn_index, val_index in folds.split(X_train, y_train):
    # Pick out each fold's rows by positional index.
    train_df, train_label = X_train.iloc[trn_index], y_train[trn_index]
    valid_df, valid_label = X_train.iloc[val_index], y_train[val_index]
In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold # K折交叉验证
from sklearn.metrics import mean_squared_error #评价指标mse
from sklearn.preprocessing import OneHotEncoder # 独热码
import lightgbm as lgb # lgb模型
import warnings
warnings.filterwarnings("ignore")
In [2]:
# Load the House Prices competition data: the training set carries the
# SalePrice target; the test set does not.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [3]:
train.head()  # preview the first rows of the training data
查看数据的基本信息情况:
In [4]:
train.isnull().sum() # missing-value count per column
Out[4]:
Id 0
MSSubClass 0
MSZoning 0
LotFrontage 259
LotArea 0
...
MoSold 0
YrSold 0
SaleType 0
SaleCondition 0
SalePrice 0
Length: 81, dtype: int64
In [5]:
train.dtypes # dtype of each column
Out[5]:
Id int64
MSSubClass int64
MSZoning object
LotFrontage float64
LotArea int64
...
MoSold int64
YrSold int64
SaleType object
SaleCondition object
SalePrice int64
Length: 81, dtype: object
In [6]:
train.describe() # summary statistics of the numeric columns
In [7]:
# Stack train and test first so that one-hot encoding yields an identical
# column set for both, then dummy-encode every categorical (object) column.
all_data = pd.get_dummies(pd.concat([train, test]))
In [8]:
# Fill the remaining missing values with each column's mean.
# NOTE(review): the mean is computed over train+test combined, so test-set
# statistics leak into the training features, and the test rows' missing
# SalePrice gets mean-filled too — confirm this is intended for the demo.
all_data = all_data.fillna(all_data.mean())
In [9]:
all_data.head()  # sanity-check the encoded, imputed frame
In [10]:
# Split the encoded frame back into train/test feature matrices plus the
# target vector.
# Fix: drop the target column from the features. `all_data` still contains
# SalePrice (NaN for the test rows, later mean-filled), so leaving it in
# X_train/X_test would feed the target itself to the model as a feature —
# a direct target leak.
X_train = all_data[:train.shape[0]].drop(columns="SalePrice")
X_test = all_data[train.shape[0]:].drop(columns="SalePrice")
y = train.SalePrice
In [11]:
# 5-fold cross-validated LightGBM regression on the SalePrice target.
from sklearn.model_selection import KFold

NFOLDS = 5  # number of CV folds
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2023)

# LightGBM hyper-parameters: GBDT regressor evaluated with RMSE.
params = {
    "num_leaves": 63,
    "min_child_samples": 50,
    "objective": "regression",
    "learning_rate": 0.01,
    "boosting_type": "gbdt",
    "metric": "rmse",
}

for trn_index, val_index in folds.split(X_train, y):
    # Slice this fold's rows by positional index.
    train_df, train_label = X_train.iloc[trn_index], y[trn_index]
    valid_df, valid_label = X_train.iloc[val_index], y[val_index]

    dtrn = lgb.Dataset(train_df, label=train_label)
    dval = lgb.Dataset(valid_df, label=valid_label)

    # NOTE(review): early_stopping_rounds/verbose_eval were removed in
    # lightgbm >= 4.0 (replaced by callbacks) — confirm the pinned version.
    dst = lgb.train(
        params,
        dtrn,
        num_boost_round=1000,
        valid_sets=[dtrn, dval],      # evaluated every boosting round
        early_stopping_rounds=100,    # stop once valid RMSE stalls for 100 rounds
        verbose_eval=100,             # log the metrics every 100 rounds
    )