# Kaggle初探--房价预测案例之数据分析

## 概述

```import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# import warnings
# warnings.filterwarnings('ignore')

%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline```
```train_df = pd.read_csv("../input/train.csv")

## 查看数据

`all_df = pd.concat((train_df.loc[:,'MSSubClass':'SaleCondition'], test_df.loc[:,'MSSubClass':'SaleCondition']), axis=0,ignore_index=True)`
`all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)`
```quantitative = [f for f in all_df.columns if all_df.dtypes[f] != 'object']
qualitative = [f for f in all_df.columns if all_df.dtypes[f] == 'object']```
`print("quantitative: {}, qualitative: {}" .format (len(quantitative),len(qualitative)))`
`quantitative: 35, qualitative: 44`

## 处理缺失数据

1. 缺失的行特别多，弃用该列
2. 缺失的值比较少，取均值
3. 缺失的比例适中，对于类别信息的列可以将缺失作为新的类别做 one-hot
`missing = all_df.isnull().sum()`
```missing.sort_values(inplace=True,ascending=False)
missing = missing[missing > 0]```
`types = all_df[missing.index].dtypes`
`percent = (all_df[missing.index].isnull().sum()/all_df[missing.index].isnull().count()).sort_values(ascending=False)`
```missing_data = pd.concat([missing, percent,types], axis=1, keys=['Total', 'Percent','Types'])
missing_data.sort_values('Total',ascending=False,inplace=True)
missing_data```

image.png

`missing.plot.bar()`
`<matplotlib.axes._subplots.AxesSubplot at 0x112096c88>`

output_14_1.png

## 数据统计分析

### 单变量分析

`train_df.describe()['SalePrice']`
```count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64```
```#skewness and kurtosis
print("Skewness: %f" % train_df['SalePrice'].skew())
print("Kurtosis: %f" % train_df['SalePrice'].kurt())
# 在统计学中，峰度（Kurtosis）衡量实数随机变量概率分布的峰态。峰度高就意味着方差增大是由低频度的大于或小于平均值的极端差值引起的。```
```Skewness: 1.882876
Kurtosis: 6.536282```

#### 相关性

`corrmat = train_df.corr()`
```#saleprice correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train_df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()```

output_21_0.png

```## 同时是相关性列，也是缺失数据的
missing_data.index.intersection(cols)```
`Index(['GarageCars', 'GarageArea', 'TotalBsmtSF'], dtype='object')`
`missing_data.loc[missing_data.index.intersection(cols)]`

image.png

```#dealing with missing data
all_df = all_df.drop((missing_data[missing_data['Total'] > 1]).index,1)
# df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
all_df.isnull().sum().max() #just checking that there's no missing data missing...
# 对于missing 1 的我们到时候已平均数填充```

```#histogram and normal probability plot
sns.distplot(train_df['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(train_df['SalePrice'], plot=plt)```

output_27_0.png

output_27_1.png

`train_df['SalePrice'] = np.log(train_df['SalePrice'])`
```#histogram and normal probability plot
sns.distplot(train_df['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(train_df['SalePrice'], plot=plt)```

output_30_0.png

output_30_1.png

```quantitative = [f for f in all_df.columns if all_df.dtypes[f] != 'object']
qualitative = [f for f in all_df.columns if all_df.dtypes[f] == 'object']
print("quantitative: {}, qualitative: {}" .format (len(quantitative),len(qualitative)))```
`quantitative: 30, qualitative: 26`
```f = pd.melt(all_df, value_vars=quantitative)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")```

output_33_0.png

`all_df[quantitative].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)`
```MiscVal          21.947195
PoolArea         16.898328
LotArea          12.822431
LowQualFinSF     12.088761
3SsnPorch        11.376065
KitchenAbvGr      4.302254
BsmtFinSF2        4.145323
EnclosedPorch     4.003891
ScreenPorch       3.946694
OpenPorchSF       2.535114
WoodDeckSF        1.842433
1stFlrSF          1.469604
BsmtFinSF1        1.424989
GrLivArea         1.269358
TotalBsmtSF       1.162285
BsmtUnfSF         0.919351
2ndFlrSF          0.861675
TotRmsAbvGrd      0.758367
Fireplaces        0.733495
HalfBath          0.694566
OverallCond       0.570312
BedroomAbvGr      0.326324
GarageArea        0.241176
OverallQual       0.197110
MoSold            0.195884
FullBath          0.167606
YrSold            0.132399
GarageCars       -0.218260
YearBuilt        -0.599806
dtype: float64```

#### 定量特征分析

```train = all_df.loc[train_df.index]
train['SalePrice'] = train_df.SalePrice```
```def anova(frame):
anv = pd.DataFrame()
anv['feature'] = qualitative
pvals = []
for c in qualitative:
samples = []
for cls in frame[c].unique():
s = frame[frame[c] == cls]['SalePrice'].values
samples.append(s)
pval = stats.f_oneway(*samples)[1]
pvals.append(pval)
anv['pval'] = pvals
return anv.sort_values('pval')

a = anova(train)
a['disparity'] = np.log(1./a['pval'].values)
sns.barplot(data=a, x='feature', y='disparity')
x=plt.xticks(rotation=90)```
```/Users/zhuanxu/anaconda/envs/linear_regression_demo/lib/python3.6/site-packages/scipy/stats/stats.py:2958: RuntimeWarning: invalid value encountered in double_scalars
ssbn += _square_of_sums(a - offset) / float(len(a))```

output_38_1.png

```def encode(frame, feature):
ordering = pd.DataFrame()
ordering['val'] = frame[feature].unique()
ordering.index = ordering.val
ordering['spmean'] = frame[[feature, 'SalePrice']].groupby(feature).mean()['SalePrice']
ordering = ordering.sort_values('spmean')
ordering['ordering'] = range(1, ordering.shape[0]+1)
ordering = ordering['ordering'].to_dict()

for cat, o in ordering.items():
frame.loc[frame[feature] == cat, feature+'_E'] = o

qual_encoded = []
for q in qualitative:
encode(train, q)
qual_encoded.append(q+'_E')
print(qual_encoded)```
`['MSSubClass_E', 'Street_E', 'LotShape_E', 'LandContour_E', 'LotConfig_E', 'LandSlope_E', 'Neighborhood_E', 'Condition1_E', 'Condition2_E', 'BldgType_E', 'HouseStyle_E', 'RoofStyle_E', 'RoofMatl_E', 'Exterior1st_E', 'Exterior2nd_E', 'ExterQual_E', 'ExterCond_E', 'Foundation_E', 'Heating_E', 'HeatingQC_E', 'CentralAir_E', 'Electrical_E', 'KitchenQual_E', 'PavedDrive_E', 'SaleType_E', 'SaleCondition_E']`
```# Select the rows that still contain missing data and inspect them
missing_data = all_df.isnull().sum()
missing_data = missing_data[missing_data>0]
ids = all_df[missing_data.index].isnull()
# any(axis=1): keep a row if any of its remaining columns is null — axis: index (0), columns (1)
all_df.loc[ids[ids.any(axis=1)].index][missing_data.index]```

image.png

```# 处理完后对于nan的数据，其值还是nan
train.loc[1379,'Electrical_E']```
`nan`

#### 相关性计算

```def spearman(frame, features):
spr = pd.DataFrame()
spr['feature'] = features
#Signature: a.corr(other, method='pearson', min_periods=None)
#Docstring:
#Compute correlation with `other` Series, excluding missing values
# 计算特征和 SalePrice的 斯皮尔曼 相关系数
spr['spearman'] = [frame[f].corr(frame['SalePrice'], 'spearman') for f in features]
spr = spr.sort_values('spearman')
plt.figure(figsize=(6, 0.25*len(features))) # width, height
sns.barplot(data=spr, y='feature', x='spearman', orient='h')

features = quantitative + qual_encoded
spearman(train, features)```

output_45_0.png

```plt.figure(1)
corr = train[quantitative+['SalePrice']].corr()
sns.heatmap(corr)
plt.figure(2)
corr = train[qual_encoded+['SalePrice']].corr()
sns.heatmap(corr)
plt.figure(3)
# [31,27]
corr = pd.DataFrame(np.zeros([len(quantitative)+1, len(qual_encoded)+1]), index=quantitative+['SalePrice'], columns=qual_encoded+['SalePrice'])
for q1 in quantitative+['SalePrice']:
for q2 in qual_encoded+['SalePrice']:
corr.loc[q1, q2] = train[q1].corr(train[q2])
sns.heatmap(corr)```
`<matplotlib.axes._subplots.AxesSubplot at 0x1172cb860>`

output_47_1.png

output_47_2.png

output_47_3.png

#### Pairplots

```def pairplot(x, y, **kwargs):
ax = plt.gca()
ts = pd.DataFrame({'time': x, 'val': y})
ts = ts.groupby('time').mean()
ts.plot(ax=ax)
plt.xticks(rotation=90)

f = pd.melt(train, id_vars=['SalePrice'], value_vars=quantitative+qual_encoded)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False, size=5)
g = g.map(pairplot, "value", "SalePrice")```
```IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.```

### 价格分段

```a = train['SalePrice']
a.plot.hist()```
`<matplotlib.axes._subplots.AxesSubplot at 0x11ed529b0>`

output_51_1.png

```features = quantitative

# split at log(200000): SalePrice was log-transformed earlier, so compare on the log scale
standard = train[train['SalePrice'] < np.log(200000)]
pricey = train[train['SalePrice'] >= np.log(200000)]

# relative difference of each feature's mean between the pricey and standard segments
diff = pd.DataFrame()
diff['feature'] = features
diff['difference'] = [(pricey[f].fillna(0.).mean() - standard[f].fillna(0.).mean())/(standard[f].fillna(0.).mean())
for f in features]

sns.barplot(data=diff, x='feature', y='difference')
x=plt.xticks(rotation=90)```

### 分类

```features = quantitative + qual_encoded
# 2-D t-SNE embedding of all features (for visualisation only)
model = TSNE(n_components=2, random_state=0, perplexity=50)
X = train[features].fillna(0.).values
tsne = model.fit_transform(X)

# standardise, reduce to 30 principal components, then KMeans into 5 clusters
std = StandardScaler()
s = std.fit_transform(X)
pca = PCA(n_components=30)
pca.fit(s)
pc = pca.transform(s)
kmeans = KMeans(n_clusters=5)
kmeans.fit(pc)

# scatter the t-SNE embedding coloured by the PCA-space cluster labels
fr = pd.DataFrame({'tsne1': tsne[:,0], 'tsne2': tsne[:, 1], 'cluster': kmeans.labels_})
sns.lmplot(data=fr, x='tsne1', y='tsne2', hue='cluster', fit_reg=False)
print(np.sum(pca.explained_variance_ratio_))```
`0.838557886152`

output_55_1.png

30个成分能覆盖83%的方差，整体看来，这种聚类方法不太好

78 篇文章18 人订阅

0 条评论

## 相关文章

### 详解 Kaggle 房价预测竞赛优胜方案：用 Python 进行全面数据探索

AI 研习社按：Kaggle 的房价预测竞赛从 2016 年 8 月开始，到 2017 年 2 月结束。这段时间内，超过 2000 多人参与比赛，选手采用高级回...

71070

21480

292100

31190

3K20

### 6.5编程实例-立方体透视投影

GLint winWidth = 600, winHeight = 600; //设置初始化窗口大小

23010

12530

26770

14620

797110