# 房产估值模型训练及预测结果

## 两个模型：MLPR 与 GB

`df_y = df['unitPrice']` 得到 DataFrame 的 unitPrice 字段数据，`y = df_y.values` 得到 shape 为 (21935,)、类型为 numpy.ndarray 的数组，即长度为 21935 的一维数组。`df_x = df.drop(['unitPrice'],axis=1)` 得到 DataFrame 中除 unitPrice 字段以外的其他字段，`x = df_x.values` 得到 shape 为 (21935,120)、类型为 numpy.ndarray 的数组，即大小为 21935×120 的二维数组。用 sklearn 中的预处理类 `preprocessing.StandardScaler()` 对数据做标准化处理：先用训练集 fit，再用同一组参数对测试集做标准化。调用 `MLPRegressor()` 获得多层感知器回归模型，用训练集进行训练，最后在测试集上计算得分。调用 `GradientBoostingRegressor()` 获得集成回归模型，用训练集进行训练，最后在测试集上计算得分。

```python
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

# Load the preprocessed table: 'unitPrice' is the regression target and every
# other column is used as a feature.
df = pd.read_excel("数据处理结果.xlsx")
df_y = df['unitPrice']
df_x = df.drop(columns=['unitPrice'])
x, y = df_x.values, df_y.values

# Keep 80% of the rows for training; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, train_size=0.8, random_state=33)

# Standardize the features with statistics learned from the training rows only,
# then apply the same transform to the test rows.
ss_x = preprocessing.StandardScaler().fit(train_x)
train_x1 = ss_x.transform(train_x)
test_x1 = ss_x.transform(test_x)

# Standardize the target the same way; the scaler expects a 2-D column.
ss_y = preprocessing.StandardScaler().fit(train_y.reshape(-1, 1))
train_y1 = ss_y.transform(train_y.reshape(-1, 1))
test_y1 = ss_y.transform(test_y.reshape(-1, 1))

# Multi-layer perceptron regressor: three hidden layers of 20 units each,
# trained with the L-BFGS solver.
model_mlp = MLPRegressor(
    solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
model_mlp.fit(train_x1, train_y1.ravel())
mlp_score = model_mlp.score(test_x1, test_y1.ravel())
print("sklearn多层感知器-回归模型得分",mlp_score)

# Gradient-boosting regressor with the library defaults. Its test score is
# left in gbr_score for the print statement that follows.
model_gbr = GradientBoostingRegressor()
model_gbr.fit(train_x1, train_y1.ravel())
gbr_score = model_gbr.score(test_x1, test_y1.ravel())
print("sklearn集成-回归模型得分",gbr_score)
```

## 异常值处理

image.png

```python
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

def cleanOutlier(data,column,mul=3):
    """Remove the rows of ``data`` whose value in ``column`` is an outlier.

    The rows are sorted ascending by ``column``. The lower and upper
    quartiles are read directly from the sorted column at positions
    ``int(len(data)/4)`` and ``int(len(data)/4*3)`` (no interpolation).
    Rows whose value lies below ``Q1 - mul*(Q3-Q1)`` or above
    ``Q3 + mul*(Q3-Q1)`` are dropped. Progress information is printed.

    Args:
        data: 2-D ``numpy.ndarray``; one row per sample.
        column: Index of the column to inspect.
        mul: Multiplier applied to the inter-quartile range when building
            the fences.

    Returns:
        The retained rows, sorted ascending by ``column``. Empty input is
        returned unchanged.
    """
    n_rows = len(data)
    if n_rows == 0:
        # Nothing to rank or trim; the quartile lookups below would fail.
        return data

    # Sort the rows ascending by the inspected column.
    data = data[data[:, column].argsort()]
    values = data[:, column]

    lowValue = values[int(n_rows / 4)]
    highValue = values[int(n_rows / 4 * 3)]
    print("下四分位数为{}  上四分位数{}".format(lowValue,highValue))

    # Fences, clamped to the observed minimum / maximum so the reported
    # bounds never extend past the data.
    spread = mul * (highValue - lowValue)
    delLowValue = max(lowValue - spread, values[0])
    delHighValue = min(highValue + spread, values[-1])
    print("删除第{}列中数值小于{}或者大于{}的部分".format(column,\
          delLowValue,delHighValue))

    # First row >= lower bound and last row <= upper bound. A binary search
    # on the sorted column always yields both indices, including for very
    # short inputs and zero inter-quartile range, where a bounded linear
    # scan could finish without finding a match.
    recordLow = int(values.searchsorted(delLowValue, side='left'))
    recordHigh = int(values.searchsorted(delHighValue, side='right')) - 1

    # Report what was kept and what was removed.
    print("原矩阵共有{}行".format(len(data)),end=',')
    print("保留{}到{}行".format(recordLow,recordHigh),end=',')
    data = data[recordLow:recordHigh+1]
    print("删除第{}列中的异常值后剩余{}行".format(column,\
          recordHigh+1-recordLow))
    return data

# Load the preprocessed table as a float matrix and drop the rows whose
# column-0 value is an outlier. Column 0 is the regression target
# (presumably the unitPrice column -- confirm against the spreadsheet).
df = pd.read_excel("数据处理结果.xlsx")
data = df.values.astype('float')
data = cleanOutlier(data, 0)
x = data[:, 1:]
y = data[:, 0]

train_x, test_x, train_y, test_y = train_test_split(
    x, y, train_size=0.8, random_state=33)

# Standardize the features with statistics learned from the training rows
# only. The other listings in this document do the same, so the scores stay
# comparable, and the MLP is sensitive to feature scale.
ss_x = preprocessing.StandardScaler()
train_x = ss_x.fit_transform(train_x)
test_x = ss_x.transform(test_x)

# Standardize the target exactly once. Re-fitting a second scaler on the
# already standardized target before scoring only adds confusion.
ss_y = preprocessing.StandardScaler()
train_y = ss_y.fit_transform(train_y.reshape(-1, 1)).ravel()
test_y = ss_y.transform(test_y.reshape(-1, 1)).ravel()

model_mlp = MLPRegressor(
    solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
model_mlp.fit(train_x, train_y)
mlp_score = model_mlp.score(test_x, test_y)
print("sklearn多层感知器-回归模型得分",mlp_score)

# The gradient-boosting test score is left in gbr_score for the print
# statement that follows.
model_gbr = GradientBoostingRegressor(learning_rate=0.1)
model_gbr.fit(train_x, train_y)
gbr_score = model_gbr.score(test_x, test_y)
print("sklearn集成-回归模型得分",gbr_score)
```

## 正态化

```python
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math

def cleanOutlier(data,column,mul=3):
    """Remove the rows of ``data`` whose value in ``column`` is an outlier.

    The rows are sorted ascending by ``column``. The lower and upper
    quartiles are read directly from the sorted column at positions
    ``int(len(data)/4)`` and ``int(len(data)/4*3)`` (no interpolation).
    Rows whose value lies below ``Q1 - mul*(Q3-Q1)`` or above
    ``Q3 + mul*(Q3-Q1)`` are dropped. Progress information is printed.

    Args:
        data: 2-D ``numpy.ndarray``; one row per sample.
        column: Index of the column to inspect.
        mul: Multiplier applied to the inter-quartile range when building
            the fences.

    Returns:
        The retained rows, sorted ascending by ``column``. Empty input is
        returned unchanged.
    """
    n_rows = len(data)
    if n_rows == 0:
        # Nothing to rank or trim; the quartile lookups below would fail.
        return data

    # Sort the rows ascending by the inspected column.
    data = data[data[:, column].argsort()]
    values = data[:, column]

    lowValue = values[int(n_rows / 4)]
    highValue = values[int(n_rows / 4 * 3)]
    print("下四分位数为{}  上四分位数{}".format(lowValue,highValue))

    # Fences, clamped to the observed minimum / maximum so the reported
    # bounds never extend past the data.
    spread = mul * (highValue - lowValue)
    delLowValue = max(lowValue - spread, values[0])
    delHighValue = min(highValue + spread, values[-1])
    print("删除第{}列中数值小于{}或者大于{}的部分".format(column,\
          delLowValue,delHighValue))

    # First row >= lower bound and last row <= upper bound. A binary search
    # on the sorted column always yields both indices, including for very
    # short inputs and zero inter-quartile range, where a bounded linear
    # scan could finish without finding a match.
    recordLow = int(values.searchsorted(delLowValue, side='left'))
    recordHigh = int(values.searchsorted(delHighValue, side='right')) - 1

    # Report what was kept and what was removed.
    print("原矩阵共有{}行".format(len(data)),end=',')
    print("保留{}到{}行".format(recordLow,recordHigh),end=',')
    data = data[recordLow:recordHigh+1]
    print("删除第{}列中的异常值后剩余{}行".format(column,\
          recordHigh+1-recordLow))
    return data

# Imported here because the vectorized log below needs it and this listing's
# import header does not bring numpy into scope.
import numpy as np

# Load the preprocessed table as a float matrix and drop the rows whose
# column-0 value is an outlier. Column 0 is the regression target
# (presumably the unitPrice column -- confirm against the spreadsheet).
df = pd.read_excel("数据处理结果.xlsx")
data = df.values.astype('float')
data = cleanOutlier(data, 0)
x = data[:, 1:]
# Log-transform the target in a single vectorized call instead of a
# per-element Python loop; this also leaves `data` itself unmodified.
y = np.log(data[:, 0])

train_x, test_x, train_y, test_y = train_test_split(
    x, y, train_size=0.8, random_state=33)

# Standardize the features with statistics learned from the training rows only.
ss_x = preprocessing.StandardScaler()
train_x = ss_x.fit_transform(train_x)
test_x = ss_x.transform(test_x)

# Standardize the (log) target the same way; the scaler expects a 2-D column.
ss_y = preprocessing.StandardScaler()
train_y = ss_y.fit_transform(train_y.reshape(-1, 1)).ravel()
test_y = ss_y.transform(test_y.reshape(-1, 1)).ravel()

model_mlp = MLPRegressor(
    solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
model_mlp.fit(train_x, train_y)
mlp_score = model_mlp.score(test_x, test_y)
print("sklearn多层感知器-回归模型得分",mlp_score)

# The gradient-boosting test score is left in gbr_score for the print
# statement that follows.
model_gbr = GradientBoostingRegressor(learning_rate=0.1)
model_gbr.fit(train_x, train_y)
gbr_score = model_gbr.score(test_x, test_y)
print("sklearn集成-回归模型得分",gbr_score)
```

## 交叉验证

```python
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import math
from sklearn.model_selection import KFold

def cleanOutlier(data,column,mul=3):
    """Remove the rows of ``data`` whose value in ``column`` is an outlier.

    The rows are sorted ascending by ``column``. The lower and upper
    quartiles are read directly from the sorted column at positions
    ``int(len(data)/4)`` and ``int(len(data)/4*3)`` (no interpolation).
    Rows whose value lies below ``Q1 - mul*(Q3-Q1)`` or above
    ``Q3 + mul*(Q3-Q1)`` are dropped. Progress information is printed.

    Args:
        data: 2-D ``numpy.ndarray``; one row per sample.
        column: Index of the column to inspect.
        mul: Multiplier applied to the inter-quartile range when building
            the fences.

    Returns:
        The retained rows, sorted ascending by ``column``. Empty input is
        returned unchanged.
    """
    n_rows = len(data)
    if n_rows == 0:
        # Nothing to rank or trim; the quartile lookups below would fail.
        return data

    # Sort the rows ascending by the inspected column.
    data = data[data[:, column].argsort()]
    values = data[:, column]

    lowValue = values[int(n_rows / 4)]
    highValue = values[int(n_rows / 4 * 3)]
    print("下四分位数为{}  上四分位数{}".format(lowValue,highValue))

    # Fences, clamped to the observed minimum / maximum so the reported
    # bounds never extend past the data.
    spread = mul * (highValue - lowValue)
    delLowValue = max(lowValue - spread, values[0])
    delHighValue = min(highValue + spread, values[-1])
    print("删除第{}列中数值小于{}或者大于{}的部分".format(column,\
          delLowValue,delHighValue))

    # First row >= lower bound and last row <= upper bound. A binary search
    # on the sorted column always yields both indices, including for very
    # short inputs and zero inter-quartile range, where a bounded linear
    # scan could finish without finding a match.
    recordLow = int(values.searchsorted(delLowValue, side='left'))
    recordHigh = int(values.searchsorted(delHighValue, side='right')) - 1

    # Report what was kept and what was removed.
    print("原矩阵共有{}行".format(len(data)),end=',')
    print("保留{}到{}行".format(recordLow,recordHigh),end=',')
    data = data[recordLow:recordHigh+1]
    print("删除第{}列中的异常值后剩余{}行".format(column,\
          recordHigh+1-recordLow))
    return data

# Imported here because the vectorized log below needs it and this listing's
# import header does not bring numpy into scope.
import numpy as np

# Load the preprocessed table as a float matrix and drop the rows whose
# column-0 value is an outlier. Column 0 is the regression target
# (presumably the unitPrice column -- confirm against the spreadsheet).
df = pd.read_excel("数据处理结果.xlsx")
data = df.values.astype('float')
data = cleanOutlier(data, 0)
x = data[:, 1:]
# Log-transform the target in a single vectorized call instead of a
# per-element Python loop.
y = np.log(data[:, 0])

# Seed the shuffled splitter so the folds, and therefore the scores, are
# reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=33)

mlp_scores = []
gbr_scores = []
for fold, (train_index, test_index) in enumerate(kf.split(x), start=1):
    train_x, test_x = x[train_index], x[test_index]
    train_y, test_y = y[train_index], y[test_index]

    # Fit the scalers on this fold's training rows only, so the held-out
    # rows do not influence the scaling statistics.
    ss_x = preprocessing.StandardScaler()
    train_x = ss_x.fit_transform(train_x)
    test_x = ss_x.transform(test_x)

    ss_y = preprocessing.StandardScaler()
    train_y = ss_y.fit_transform(train_y.reshape(-1, 1)).ravel()
    test_y = ss_y.transform(test_y.reshape(-1, 1)).ravel()

    model_mlp = MLPRegressor(
        solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
    model_mlp.fit(train_x, train_y)
    mlp_scores.append(model_mlp.score(test_x, test_y))
    print("第{}折 sklearn多层感知器-回归模型得分".format(fold), mlp_scores[-1])

    model_gbr = GradientBoostingRegressor(learning_rate=0.1)
    model_gbr.fit(train_x, train_y)
    gbr_scores.append(model_gbr.score(test_x, test_y))
    print("第{}折 sklearn集成-回归模型得分".format(fold), gbr_scores[-1])

# Summarize the cross-validation with the mean score of each model.
print("{}折交叉验证平均得分".format(len(mlp_scores)))
mlp_score = sum(mlp_scores) / len(mlp_scores)
print("sklearn多层感知器-回归模型得分",mlp_score)
# The mean gradient-boosting score is left in gbr_score for the print
# statement that follows.
gbr_score = sum(gbr_scores) / len(gbr_scores)
print("sklearn集成-回归模型得分",gbr_score)
```

120 篇文章26 人订阅

0 条评论

9920

4.4K80

31480

24060

49790

55160

11510

598120

34240

16220