# 机器学习实战（2）之预测房价

```python
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 21 14:37:15 2018

"""

# IPython line magics, not plain Python: `reset -f` wipes the interactive
# namespace without asking for confirmation and `clear` clears the console.
# NOTE(review): a plain `python` run rejects these lines, and the space after
# `%` may not be accepted by every IPython version — confirm the intended runner.
% reset -f
% clear

# In[*]
##########第一步  导入包
# In[*]
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import numpy as np
import seaborn as sns
import os
from scipy.stats import skew
from scipy.stats.stats import pearsonr

# In[*]
##########第二步  导入数据
# In[*]

all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
test.loc[:,'MSSubClass':'SaleCondition']))```

```python
# In[*]
# 第三步，将目标变量标准化

# Use a wide default figure so the two histograms sit side by side.
matplotlib.rcParams.update({'figure.figsize': (12.0, 6.0)})

# Compare the raw sale price with its log(1 + price) transform.
prices = pd.DataFrame({"price": train["SalePrice"]})
prices["log(price + 1)"] = np.log1p(prices["price"])
prices.hist()

# log transform the target (applied at the start of the next cell):
```
```python
# In[*]
# 第四步，将预测变量标准化
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))

skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])```

```python
# In[*]
# 第五步，处理字符型变量以及将填充缺失值
# In[*]
# One-hot encode the non-numeric columns, then fill every remaining NaN with
# the mean of its column (means are computed on the encoded frame).
all_data = pd.get_dummies(all_data).pipe(
    lambda frame: frame.fillna(frame.mean())
)
# In[*]
# 第六步，划分训练集和测试集
# In[*]
#creating matrices for sklearn:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice```

