Closing out the last day of 2018 with this post! It covers a basic analysis of the Jinnan Digital Manufacturing Algorithm Challenge (津南数字制造算法挑战赛) and a baseline.
Loading the data
import re
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import BayesianRidge
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
train = pd.read_csv('input/jinnan_round1_train_20181227.csv', encoding = 'gb18030')
test = pd.read_csv('input/jinnan_round1_testA_20181227.csv', encoding = 'gb18030')
Let's take an overall look at the data to get a first impression.
stats = []
for col in train.columns:
    stats.append((col, train[col].nunique(), train[col].isnull().sum() * 100 / train.shape[0],
                  train[col].value_counts(normalize=True, dropna=False).values[0] * 100, train[col].dtype))
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values',
                                        'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)[:10]
stats = []
for col in test.columns:
    stats.append((col, test[col].nunique(), test[col].isnull().sum() * 100 / test.shape[0],
                  test[col].value_counts(normalize=True, dropna=False).values[0] * 100, test[col].dtype))
stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values',
                                        'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)[:20]
Interestingly, A21 and A23, which have fairly high missing rates in the training set, are not missing at all in the test set; conversely, A20, A25 and A27, whose missing rate reaches 67% in the test set, are not missing in the training set. (The 67% is a typo by the author; the actual missing rate is only 0.67%.)
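As a quick cross-check of this asymmetry, we can compare the missing rates column by column (a minimal sketch of my own; the column names are the ones discussed above):
# Missing-value percentages side by side for train and test
miss = pd.DataFrame({
    'train_missing_%': train.isnull().mean() * 100,
    'test_missing_%': test.isnull().mean() * 100,
})
print(miss.loc[['A20', 'A21', 'A23', 'A25', 'A27']])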
We can also see that several features contain only a single category, and that for many features one category accounts for over 90% of the values, so we drop them.
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)
good_cols = list(train.columns)
for col in train.columns:
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9:
        good_cols.remove(col)
# Remove outlier samples
train = train[train['收率'] > 0.80]
train = train[good_cols]
good_cols.remove('收率')
test = test[good_cols]
We can see that the dataset contains time features, time-interval features, and numeric features. The goal of this competition is to predict the product yield (收率), so let's analyze the yield first.
target_col = "收率"
plt.figure(figsize=(8,6))
plt.scatter(range(train.shape[0]), np.sort(train[target_col].values))
plt.xlabel('index', fontsize=12)
plt.ylabel('yield', fontsize=12)
plt.show()
plt.figure(figsize=(12,8))
sns.distplot(train[target_col].values, bins=50, kde=False, color="red")
plt.title("Histogram of yield")
plt.xlabel('yield', fontsize=12)
plt.show()
We can see that the yield is concentrated mainly between 0.85 and 1.00, and that two samples have a yield below 0.8; these can be treated as noise and removed, which is exactly the train['收率'] > 0.80 filter applied above.
Only basic preprocessing is done before training the models:
1. Drop features with more than 90% missing values
2. Drop features with a single unique category
3. Process the time features
4. Apply label encoding
5. Apply one-hot encoding
# Merge the train and test sets
target = train['收率']
del train['收率']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)
def timeTransform(t):
    '''
    Convert an "HH:MM:SS" timestamp into hours as a float.
    '''
    try:
        h, m, s = t.split(":")
    except:
        # Hand-fix a few malformed entries and pass through the -1 fill value
        if t == '1900/1/9 7:00':
            return 7 * 3600 / 3600
        elif t == '1900/1/1 2:30':
            return (2 * 3600 + 30 * 60) / 3600
        elif t == -1:
            return -1
        else:
            return 0
    try:
        tm = (int(h) * 3600 + int(m) * 60 + int(s)) / 3600
    except:
        # Non-numeric fields fall back to half an hour
        return (30 * 60) / 3600
    return tm
for f in ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']:
    try:
        data[f] = data[f].apply(timeTransform)
    except:
        print(f, 'should have been dropped earlier!')
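A few quick sanity checks on timeTransform, using illustrative values of my own rather than rows from the dataset:
print(timeTransform('14:30:00'))       # 14.5 — 14h30m expressed in hours
print(timeTransform('1900/1/9 7:00'))  # 7.0 — one of the hand-fixed malformed entries
print(timeTransform(-1))               # -1 — the fill value for missing timestamps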
def getDuration(se):
    '''
    Parse an operating time window such as "14:00-16:00" and return its duration in hours.
    '''
    try:
        sh, sm, eh, em = re.findall(r"\d+\.?\d*", se)
    except:
        if se == -1:
            return 0
        # Hand-fix a few malformed entries that don't yield four numbers
        if se in ('19:-20:05', '15:00-1600'):
            return 1
        return 0
    try:
        if int(sh) > int(eh):
            # The window crosses midnight, so add 24 hours
            tm = (int(eh) * 3600 + int(em) * 60 - int(sm) * 60 - int(sh) * 3600) / 3600 + 24
        else:
            tm = (int(eh) * 3600 + int(em) * 60 - int(sm) * 60 - int(sh) * 3600) / 3600
    except:
        return 0
    return tm
for f in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:
    data[f] = data[f].apply(getDuration)
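And a couple of sanity checks on getDuration, again with illustrative values of my own:
print(getDuration('14:00-16:00'))  # 2.0 hours
print(getDuration('23:00-1:00'))   # 2.0 hours — a window that crosses midnight
print(getDuration(-1))             # 0 — the fill value for missing windows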
# A strong feature: the numeric part of 样本id is used directly as a numerical feature
data['样本id'] = data['样本id'].apply(lambda x: int(x.split('_')[1]))
categorical_columns = [f for f in data.columns if f not in ['样本id']]
numerical_columns = [f for f in data.columns if f not in categorical_columns]
# Label encoding: map each category of every categorical feature to an integer
for f in categorical_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test = data[train.shape[0]:]
# One-hot encode the categorical columns and stack them onto the numeric ones
X_train = train[numerical_columns].values
X_test = test[numerical_columns].values
enc = OneHotEncoder()
for f in categorical_columns:
    enc.fit(data[f].values.reshape(-1, 1))
    X_train = sparse.hstack((X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack((X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
y_train = target.values
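A quick shape check of my own (not in the original post) to confirm that the sparse matrices line up:
print(X_train.shape, X_test.shape, y_train.shape)  # train/test must share the same column count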
param = {'num_leaves': 30,
         'min_data_in_leaf': 30,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         'min_child_samples': 30,
         'boosting': 'gbdt',
         'feature_fraction': 0.9,
         'bagging_freq': 1,
         'bagging_fraction': 0.9,
         'bagging_seed': 11,
         'metric': 'mse',
         'lambda_l1': 0.1,
         'verbosity': -1}
# Five-fold cross-validation for LightGBM
# (named *_lgb so the stacking step below can reuse them)
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200,
                    early_stopping_rounds=100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)))
# Submission from the LightGBM model alone
sub_df = pd.read_csv('input/jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions_lgb
sub_df.to_csv("sub_jinnan.csv", index=False, header=None)
# XGBoost with ten-fold cross-validation
xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True, 'nthread': 4}
folds = KFold(n_splits=10, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist,
                    early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))
Finally, the out-of-fold predictions of the two models are stacked with a BayesianRidge meta-model:
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10  # 10 folds in total: 5 splits × 2 repeats
print("CV score: {:<8.8f}".format(mean_squared_error(oof_stack, target)))
2018 is about to end, and it also marks a full year since I started competing, from my first competition, 《天池精准医疗大赛——人工智能辅助糖尿病遗传风险预测》, to the 《OGeek算法挑战赛》. Having done more competitions, my thinking has changed: at first I couldn't engineer any features and only knew how to blend the outputs of various models; now I build all kinds of features, but my models lack diversity. So for competitions in 2019, my goal is to say goodbye to relying on a single kind of model.