铁柱在2018年11月底发了一篇关于 LSTM 回归预测模型的文章,现在改用 LightGBM 模型。本篇文章偏工程,需要读者了解 Python 中关于类(class)的语法;理论部分会在后续的文章中介绍。
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import pandas as pd
def loadData():
    """Load the meteorological and power tables and join them on timestamp.

    Reads the Excel file named by ``meteorological_data`` and the GBK-encoded
    CSV named by ``power_real_predict``, renames the meteorological columns,
    converts the ``时间`` column of both tables to datetimes and inner-joins
    the two tables on that column.

    NOTE(review): ``meteorological_data``, ``power_real_predict``,
    ``ori_mete``, ``Daxian_pre`` and ``Real_pow`` are assigned in the config
    section; no import of them is visible next to this function -- confirm
    they are in scope where this function lives.

    Returns:
        A 3-tuple of DataFrames taken from the merged table using the
        ``ori_mete``, ``Daxian_pre`` and ``Real_pow`` column lists, in that
        order.
    """
    # read_excel / read_csv already return DataFrames, so they are used
    # directly instead of being wrapped in pd.DataFrame() again.
    mete_data = pd.read_excel(meteorological_data)
    power_data = pd.read_csv(power_real_predict, encoding='gbk')

    mete_data.columns = ['文档时间', '时间', '风速', '风向', '气压', '温度', '湿度', '空气密度']

    # Both tables need a datetime key before they can be joined on it.
    mete_data = time2datetime(mete_data)
    power_data = time2datetime(power_data)

    data = datamerge(power_data, mete_data)
    return data[ori_mete], data[Daxian_pre], data[Real_pow]
def time2datetime(df):
    """Convert the ``时间`` column of *df* to pandas datetimes.

    The column is replaced on the DataFrame that was passed in (no copy is
    made), and that same DataFrame is returned.
    """
    df['时间'] = pd.to_datetime(df['时间'])
    return df
def datamerge(df1, df2):
    """Inner-join *df1* and *df2* on their shared ``时间`` column.

    Only rows whose ``时间`` value appears in both tables are kept.
    """
    return pd.merge(df1, df2, on='时间', how='inner')
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Describe: Preprocess Data
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
def Nanimputer(df):
    """Return the features of *df* with missing values replaced by the mean.

    The ``实际功率`` column is dropped before imputation, so the result
    contains the remaining columns only. The return value is whatever the
    imputer's ``fit_transform`` produces (an array, not a DataFrame).

    NOTE(review): the ``axis`` literals were missing in the original text.
    ``axis=1`` on ``drop`` follows from ``实际功率`` being a column label;
    ``axis=0`` (per-column mean) for the imputer is the library default and
    is assumed here -- confirm against the original source.
    """
    features = df.drop(['实际功率'], axis=1)

    try:
        # Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
        # SimpleImputer is its replacement and imputes per column.
        from sklearn.impute import SimpleImputer
    except ImportError:
        # Older scikit-learn: fall back to the class imported at module level.
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    else:
        imputer = SimpleImputer(strategy='mean')

    # NOTE(review): the module-level `from sklearn.preprocessing import Imputer`
    # itself raises ImportError on scikit-learn >= 0.22 and needs a guard there.
    return imputer.fit_transform(features)
def train_test_data(df, label, persent):
    """Split features and targets into a leading train part and a trailing test part.

    Args:
        df: feature rows, sliceable by position along the first axis.
        label: table whose ``实际功率`` column holds the targets.
        persent: fraction of ``len(df)`` to place in the trailing test part.

    Returns:
        ``(train, y, test, y_test)`` where ``train`` / ``test`` have been
        passed through ``standardScaler`` and ``y`` / ``y_test`` are the
        matching slices of ``label['实际功率'].values``.
    """
    # Negative index counted from the end; computed once and reused for all
    # four slices so features and targets are always cut at the same place.
    cut = int(-len(df) * persent)
    targets = label['实际功率'].values

    train, test = df[:cut], df[cut:]
    y, y_test = targets[:cut], targets[cut:]

    train, test = standardScaler(train, test)
    return train, y, test, y_test
def standardScaler(train, test):
    """Standardize *train* and *test* with one ``StandardScaler``.

    The scaler is fitted on *train* only and then applied to *test*, so the
    test rows do not influence the fitted statistics.

    Returns:
        ``(train, test)`` after transformation.
    """
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test
def pre_processing(df, label, persent):
    """Impute missing values in *df*, then split and scale it.

    Runs ``Nanimputer`` on *df* and forwards the result, together with
    *label* and *persent*, to ``train_test_data``.

    Returns:
        The ``(train, y, test, y_test)`` tuple produced by ``train_test_data``.
    """
    data = Nanimputer(df)
    return train_test_data(data, label, persent)
import pandas as pd
from config import items,items1,shift_time
# Add derived feature columns to `df` and return it: products of wind speed
# with other variables, shifted copies of every column named in `items`, and
# rolling statistics of every column named in `items1`.
# NOTE(review): this copy of the function has lost its indentation and several
# numeric literals (the exponents after `**`, the entries of `window`, and every
# `min_periods=` value). It does not parse until they are restored from the
# original source.
def feature_extrac(df):
# Interaction features built from wind speed.
# NOTE(review): the exponent after `**` is missing on the 风速_湿度 and
# 风速_空气密度 lines.
df['风速_湿度'] = df['风速'] * df['湿度']**
df['风速_风向'] = df['风速'] * df['风向']
df['风速_空气密度'] = df['风速'] * df['空气密度']**
df['风速_空气密度1'] = df['风速'] * df['空气密度']
# One new column per (item, offset) pair, named item + str(offset).
# Rows shifted past the edge of the table become NaN.
for item in items:
for t in shift_time:
df[item + str(t)] = df[item].shift(t)
# NOTE(review): the window sizes are missing from this list.
window =[,,]
# Rolling std / mean / sum / max / min / median for each column in items1.
# NOTE(review): the column names below do not include `wind`, so every window
# writes the same six columns and only the values of the last window remain.
for item in items1:
for wind in window:
df[item + '_std'] = df[item].rolling(wind,min_periods=).std()
df[item + '_mean'] = df[item].rolling(wind,min_periods=).mean()
df[item + '_sum'] = df[item].rolling(wind,min_periods=).sum()
df[item + '_max'] = df[item].rolling(wind,min_periods=).max()
df[item + '_min'] = df[item].rolling(wind,min_periods=).min()
df[item + '_median'] = df[item].rolling(wind,min_periods=).median()
return df
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Describe: config
# Input files read by loadData: the meteorological Excel sheet and the CSV
# with measured / predicted power.
meteorological_data = 'XXX.xls'
power_real_predict = 'YYYYYY.CSV'
# Column selections that loadData applies to the merged table.
ori_mete = ['实际功率','风速', '风向', '气压', '温度', '湿度', '空气密度']
Daxian_pre = ['预测功率']
Real_pow = ['实际功率']
# Row offsets passed to Series.shift in feature_extrac.
# NOTE(review): the entries after -12 are empty in this copy (the literals were
# lost); the list does not parse until they are restored.
shift_time = [-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,,,,,,,,,,,,]
# Columns that get shifted copies (items) and rolling statistics (items1) in
# feature_extrac.
items = [ '风向','气压', '温度', '湿度', '空气密度','风速_湿度','风速_风向','风速_空气密度','风速_空气密度1']
items1 = ['风向','湿度', '空气密度','风速_湿度','风速_风向','风速_空气密度','风速_空气密度1']
# Parameter dict passed to lgb.train in lgbmodel.
# NOTE(review): the values of num_leaves, min_child_samples, bagging_frequency
# and bagging_seed are missing in this copy.
# NOTE(review): LightGBM documents the bagging interval as `bagging_freq`
# (alias `subsample_freq`); confirm that `bagging_frequency` is recognized,
# otherwise bagging_fraction may have no effect.
params = {
"objective": "regression",
"metric": "mse",
"num_leaves": ,
"min_child_samples": ,
"learning_rate": 0.01,
"bagging_fraction": 0.8,
"feature_fraction": 0.8,
"bagging_frequency": ,
"bagging_seed": ,
"verbosity": -1
}
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# @Describe: Metrics
from math import sqrt
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
# Compares two sets of power predictions against the measured values and
# prints the results.
# NOTE(review): this copy of the class has lost its indentation and several
# numeric literals (flagged below); it does not parse as written.
class Metrics():
# wepre: predictions reported under the "Our model" / "Test RMSE" labels;
# otpre: predictions reported under the "Other model" labels;
# real: values both are compared against.
def __init__(self, wepre, otpre, real):
self.wepre = wepre
self.otpre = otpre
self.real = real
# Print the RMSE of each prediction set against `real` over the full series.
def RMSE(self):
rmse = sqrt(mean_squared_error(self.real,self.wepre))
rmse_other = sqrt(mean_squared_error(self.real, self.otpre))
print('Test RMSE: %.3f' % rmse)
print('某同行公司Test RMSE: %.3f' % rmse_other)
# For each window length: slide a window over the series, compute the RMSE of
# both prediction sets inside it, convert each RMSE to a score, and print the
# mean score per model.
def MulHourEval(self):
# NOTE(review): the three window lengths are missing from this list.
for metric_t in [,,]:
zhibiao_list = []
# NOTE(review): this list is created as `zhibiao_list_daxian`, but the code
# below appends to and reads `zhibiao_list_other`, which is never created --
# that raises NameError once the syntax is repaired.
zhibiao_list_daxian = []
# NOTE(review): the start argument of range() is missing.
for i in range(,len(self.real)):
# The slice i:i+metric_t-1 holds metric_t-1 samples, fewer near the end of the
# series.
rmse = sqrt(mean_squared_error(self.real[i:i+metric_t-1], self.wepre[i:i+metric_t-1]))#
rmse_other = sqrt(mean_squared_error(self.real[i:i+metric_t-1], self.otpre[i:i+metric_t-1]))
# NOTE(review): the divisor after `/` is missing on both lines, and an operand
# before `-` may be missing too (the sample output shows positive scores).
zhibiao = - rmse/
zhibiao_other = - rmse_other/
zhibiao_list.append(zhibiao)
zhibiao_list_other.append(zhibiao_other)
# Presumably executed once per window length, after the inner loop -- the
# nesting cannot be read from this copy.
zhibiao_list_pd = np.array(zhibiao_list)
zhibiao_list_other_pd = np.array(zhibiao_list_other)
# NOTE(review): the divisor that turns metric_t into hours is missing.
print('Our model %s hour eval:%s'%(metric_t/,np.mean(zhibiao_list_pd)))
print('Other model %s hour eval:%s'%(metric_t/,np.mean(zhibiao_list_other_pd)))
# Define the model.
# Trains a LightGBM model on (data_train, label_train), holding part of it out
# as the validation set passed to lgb.train, and returns the model's
# predictions for data_test.
# NOTE(review): this copy has lost its indentation and five numeric literals
# (flagged below); it does not parse as written.
def lgbmodel(data_train,label_train,data_test):
# NOTE(review): test_size (only `/` remains) and random_state are missing.
# shuffle=True mixes rows before the hold-out split.
X_train, X_test, y_train, y_test = train_test_split(data_train,label_train,test_size=/,shuffle=True, random_state=)
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label = y_test)
# NOTE(review): num_boost_round, early_stopping_rounds and verbose_eval are
# missing. The training log reproduced later in this document ("don't improve
# for 200 rounds", a line every 100 iterations, last iteration 1500) suggests
# 1500 / 200 / 100 -- confirm against the original source.
# NOTE(review): recent LightGBM releases (4.x) appear to have replaced the
# early_stopping_rounds / verbose_eval keywords with callbacks -- verify
# against the installed version.
lgbm = lgb.train(params,
lgb_train,
num_boost_round=,
valid_sets=lgb_eval,
early_stopping_rounds=,
verbose_eval= )
predict = lgbm.predict(data_test)
return predict
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from config import params # 从自创的congfig模块中 加载设定好的参数
from dataload import loadData #从自创的dataload 模块中加载 loadData以生产数据
from genFeature import feature_extrac #从自创的genFeature加载 fetature_extrac以用于特征提取
from Mulmetics import Metrics #从自创的多评估模块加载业内的评估方法
from DataPrepro import pre_processing #从自创的预处理模块加载数据预处理
# Entry point: load the data, build features, split and scale, train the model,
# and print the evaluation metrics.
# NOTE(review): this copy has lost its indentation and the value of Percent; it
# does not parse as written.
def main():
# NOTE(review): the value is missing (only `/` remains). Percent is passed to
# pre_processing as the split fraction and reused below to slice the reference
# predictions.
Percent = / # set the ratio between the training set and the validation set
data,pre,real = loadData()# load the data
data = feature_extrac(data)# extract features
train,y,test,label_test = pre_processing(data,real,Percent) # split the data
print(train.shape,y.shape,test.shape,label_test.shape) # print the data shapes
model_predict = lgbmodel(train,y,test) # train
# The reference predictions are cut with the same negative index expression
# that train_test_data uses, so they cover the same trailing rows as label_test.
metrics = Metrics(model_predict,pre['预测功率'].values[int(-len(data)*Percent):],label_test) # evaluate
metrics.RMSE()
metrics.MulHourEval()
if __name__ == '__main__':
main()
(, ) (,) (, ) (,)
Training until validation scores don't improve for 200 rounds.
[100] valid_0's l2: 77.9721
[200] valid_0's l2: 56.5769
[300] valid_0's l2: 49.1198
[400] valid_0's l2: 45.1405
[500] valid_0's l2: 42.4402
[600] valid_0's l2: 40.2658
[700] valid_0's l2: 38.4283
[800] valid_0's l2: 36.812
[900] valid_0's l2: 35.4044
[1000] valid_0's l2: 34.1903
[1100] valid_0's l2: 33.0945
[1200] valid_0's l2: 32.1998
[1300] valid_0's l2: 31.3365
[1400] valid_0's l2: 30.5228
[1500] valid_0's l2: 29.7129
Did not meet early stopping. Best iteration is:
[1500] valid_0's l2: 29.7129
Test RMSE: 10.263
某同行公司Test RMSE: 11.768
Our model 4.0 hour eval:0.8319254785233036
Other model 4.0 hour eval:0.8141609206646702
Our model 24.0 hour eval:0.810845265380047
Other model 24.0 hour eval:0.7942520524266607
Our model 72.0 hour eval:0.8029606299934267
Other model 72.0 hour eval:0.7796889844798093
本篇文章对整个 LightGBM 回归过程进行了封装,更接近工程上的应用,利于后期在模块中添加函数、方法。建议:本文案例不提供数据,大家可以自己创造随机数据,或者使用国能日新的竞赛数据,但特征名称要做相应改动。有问题可通过 E-mail 私聊:deepwind@aliyun.com