# 2.数据集

<https://www.kaggle.com/thomaskonstantin/bank-churn-data-exploration-and-churn-prediction/>

# 3.代码与分析

Windows环境下打开 Cmd (开始-运行-CMD)，苹果系统环境下请打开 Terminal (command+空格输入Terminal)，准备开始输入命令安装依赖。

```bash
pip install numpy
pip install pandas
pip install plotly
pip install scikit-learn
pip install scikit-plot
pip install matplotlib
pip install seaborn

# 最后模型预测需要用到 imbalanced-learn，推荐使用 conda 安装（也可以使用 pip install imbalanced-learn）
# 如果只是想探索性分析数据，可以不导入 imblearn
conda install -c conda-forge imbalanced-learn
```

## 3.1 导入需要的模块

```python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
pyo.init_notebook_mode()
sns.set_style('darkgrid')
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
import scikitplot as skplt

plt.rc('figure',figsize=(18,9))
%pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
```

## 3.2 加载数据

```python
c_data = pd.read_csv('./BankChurners.csv')
# Discard the final two columns of the CSV; only the remaining ones are analysed.
c_data = c_data.drop(columns=c_data.columns[-2:])
```

## 3.3 探索性数据分析

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Customer_Age'], name='Age Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Customer_Age'], name='Age Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Customer Ages")
fig.show()
```

```python
ex.pie(c_data,names='Gender',title='Propotion Of Customer Genders')
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Dependent_count'], name='Dependent count Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Dependent_count'], name='Dependent count Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Dependent counts (close family size)")
fig.show()
```

```python
ex.pie(c_data,names='Education_Level',title='Propotion Of Education Levels')
```

```python
ex.pie(c_data,names='Marital_Status',title='Propotion Of Different Marriage Statuses')
```

```python
ex.pie(c_data,names='Income_Category',title='Propotion Of Different Income Levels')
```
```python
ex.pie(c_data,names='Card_Category',title='Propotion Of Different Card Categories')
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Months_on_book'], name='Months on book Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Months_on_book'], name='Months on book Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of months the customer is part of the bank")
fig.show()
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Total_Relationship_Count'], name='Total no. of products Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Total_Relationship_Count'], name='Total no. of products Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of Total no. of products held by the customer")
fig.show()
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Months_Inactive_12_mon'], name='number of months inactive Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Months_Inactive_12_mon'], name='number of months inactive Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the number of months inactive in the last 12 months")
fig.show()
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Credit_Limit'], name='Credit_Limit Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Credit_Limit'], name='Credit_Limit Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the Credit Limit")
fig.show()
```

```python
fig = make_subplots(rows=2, cols=1)

# Box plot (with mean marker) and histogram of the same column.
tr1 = go.Box(x=c_data['Total_Trans_Amt'], name='Total_Trans_Amt Box Plot', boxmean=True)
tr2 = go.Histogram(x=c_data['Total_Trans_Amt'], name='Total_Trans_Amt Histogram')

# Attach the traces to the 2x1 grid; without this the figure renders empty.
fig.add_trace(tr1, row=1, col=1)
fig.add_trace(tr2, row=2, col=1)

fig.update_layout(height=700, width=1200, title_text="Distribution of the Total Transaction Amount (Last 12 months)")
fig.show()
```

```python
ex.pie(c_data,names='Attrition_Flag',title='Proportion of churn vs not churn customers')
```

## 3.4 数据预处理

```python
c_data.Attrition_Flag = c_data.Attrition_Flag.replace({'Attrited Customer':1,'Existing Customer':0})
# Encode gender as a binary flag (F -> 1, M -> 0).
c_data['Gender'] = c_data['Gender'].replace({'F': 1, 'M': 0})

# One-hot encode each categorical column, leaving out one level per column,
# and append the indicator columns in the order listed here.
_categorical_specs = (
    ('Education_Level', 'Unknown'),
    ('Income_Category', 'Unknown'),
    ('Marital_Status', 'Unknown'),
    ('Card_Category', 'Platinum'),
)
for _column, _dropped_level in _categorical_specs:
    _indicators = pd.get_dummies(c_data[_column]).drop(columns=[_dropped_level])
    c_data = pd.concat([c_data, _indicators], axis=1)

# The raw categorical columns and the client identifier are no longer needed.
c_data = c_data.drop(
    columns=['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category', 'CLIENTNUM']
)
```

```python
sns.heatmap(c_data.corr('pearson'),annot=True)
```

## 3.5 SMOTE模型采样

SMOTE模型经常用于解决数据不平衡的问题，它通过添加生成的少数类样本改变不平衡数据集的数据分布，是改善不平衡数据分类模型性能的流行方法之一。

```python
oversample = SMOTE()
# First column is the target; every remaining column is a feature.
target_column = c_data.columns[0]
feature_columns = c_data.columns[1:]
X, y = oversample.fit_resample(c_data[feature_columns], c_data[target_column])

# Re-attach the resampled target under the name 'Churn'.
usampled_df = X.assign(Churn=y)

# Columns from position 15 up to (not including) 'Churn' are set aside in
# ohe_data and removed from the main frame.
one_hot_columns = usampled_df.columns[15:-1]
ohe_data = usampled_df[one_hot_columns].copy()
usampled_df = usampled_df.drop(columns=one_hot_columns)

sns.heatmap(usampled_df.corr('pearson'), annot=True)
```

## 3.6 主成分分析

```python
N_COMPONENTS = 4

# Fit PCA on ohe_data and keep the projected components.
pca_model = PCA(n_components=N_COMPONENTS)
pc_matrix = pca_model.fit_transform(ohe_data)

# Per-component explained variance ratio and its running total.
evr = pca_model.explained_variance_ratio_
cumsum_evr = np.cumsum(evr)

# Both curves share the same x positions: one per retained component.
component_positions = np.arange(len(cumsum_evr))

ax = sns.lineplot(x=component_positions, y=cumsum_evr, label='Explained Variance Ratio')
ax.set_title('Explained Variance Ratio Using {} Components'.format(N_COMPONENTS))
ax = sns.lineplot(x=component_positions, y=evr, label='Explained Variance Of Component X')
ax.set_xticks(list(range(len(cumsum_evr))))
ax.set_xlabel('Component number #')
ax.set_ylabel('Explained Variance')
plt.show()
```
```python
usampled_df_with_pcs = pd.concat([usampled_df,pd.DataFrame(pc_matrix,columns=['PC-{}'.format(i) for i in range(0,N_COMPONENTS)])],axis=1)
usampled_df_with_pcs
```

```python
sns.heatmap(usampled_df_with_pcs.corr('pearson'),annot=True)
```

# 4.模型选择及测试

```python
X_features = ['Total_Trans_Ct','PC-3','PC-1','PC-0','PC-2','Total_Ct_Chng_Q4_Q1','Total_Relationship_Count']

# Target and model inputs are both taken from the oversampled frame.
y = usampled_df_with_pcs['Churn']
X = usampled_df_with_pcs[X_features]

# Fixed seed so the train/test partition is reproducible.
_split = train_test_split(X, y, random_state=42)
train_x, test_x, train_y, test_y = _split
```

## 4.1 交叉验证

```python
rf_pipe = Pipeline(steps =[ ('scale',StandardScaler()), ("RF",RandomForestClassifier(random_state=42)) ])
# Scaling + RBF-kernel SVC; the step is named "SVM" so the label matches the estimator.
svm_pipe = Pipeline(steps=[('scale', StandardScaler()), ('SVM', SVC(random_state=42, kernel='rbf'))])

# 5-fold cross-validated F1 on the training split for each pipeline.
f1_cross_val_scores = cross_val_score(rf_pipe, train_x, train_y, cv=5, scoring='f1')
svm_f1_cross_val_scores = cross_val_score(svm_pipe, train_x, train_y, cv=5, scoring='f1')
```
```python
plt.subplot(3,1,1)
# Random forest: one F1 score per cross-validation fold.
ax = sns.lineplot(x=range(0, len(f1_cross_val_scores)), y=f1_cross_val_scores)
ax.set_title('Random Forest Cross Val Scores')
ax.set_xticks(list(range(len(f1_cross_val_scores))))
ax.set_xlabel('Fold Number')
ax.set_ylabel('F1 Score')
plt.show()

# Only two models are cross-validated, so no middle panel is drawn: creating
# an empty subplot here would only re-label the axes that were already shown.

# SVM: one F1 score per cross-validation fold.
plt.subplot(3, 1, 3)
ax = sns.lineplot(x=range(0, len(svm_f1_cross_val_scores)), y=svm_f1_cross_val_scores)
ax.set_title('SVM Cross Val Scores')
ax.set_xticks(list(range(len(svm_f1_cross_val_scores))))
ax.set_xlabel('Fold Number')
ax.set_ylabel('F1 Score')
plt.show()
```

## 4.2 模型预测

```python
rf_pipe.fit(train_x,train_y)
# Predictions of the random forest pipeline on the held-out split.
rf_prediction = rf_pipe.predict(test_x)

# Fit the SVM pipeline on the training split and predict the same held-out rows.
svm_pipe.fit(train_x, train_y)
svm_prediction = svm_pipe.predict(test_x)

# f1_score takes the ground truth first and the predictions second.
print('F1 Score of Random Forest Model On Test Set - {}'.format(f1(test_y, rf_prediction)))
print('F1 Score of SVM Model On Test Set - {}'.format(f1(test_y, svm_prediction)))
```

## 4.3 对原始数据（采样前）进行模型预测

```python
ohe_data =c_data[c_data.columns[16:]].copy()
# Project with the already-fitted PCA instead of re-fitting it: the classifiers
# were trained on the 'PC-*' columns produced by that fit, and a new fit on
# different data would yield components that no longer match them.
pc_matrix = pca_model.transform(ohe_data)
pc_frame = pd.DataFrame(pc_matrix, columns=['PC-{}'.format(i) for i in range(0, N_COMPONENTS)])
original_df_with_pcs = pd.concat([c_data, pc_frame], axis=1)

# Score the original (non-oversampled) rows with both trained pipelines.
unsampled_data_prediction_RF = rf_pipe.predict(original_df_with_pcs[X_features])
unsampled_data_prediction_SVM = svm_pipe.predict(original_df_with_pcs[X_features])
```

F1最高的随机森林模型有0.63分，偏低，这也比较正常，毕竟在这种分布不均的数据集中，查全率是比较难拿到高分数的。

## 4.4 结果

```python
ax = sns.heatmap(confusion_matrix(unsampled_data_prediction_RF,original_df_with_pcs['Attrition_Flag']),annot=True,cmap='coolwarm',fmt='d')
# Title and tick labels for the confusion-matrix heatmap held in `ax`.
ax.set_title('Prediction On Original Data With Random Forest Model Confusion Matrix')
# NOTE(review): the matrix is built with the predictions as the first argument,
# so rows should be the predicted class and columns the actual class, which
# matches the labels below. Confirm before swapping the argument order.
ax.set_xticklabels(['Not Churn','Churn'],fontsize=18)
ax.set_yticklabels(['Predicted Not Churn','Predicted Churn'],fontsize=18)

plt.show()
```

-END-

0 条评论

• ### 大数据在金融行业的应用

数据显示，中国大数据IT应用投资规模以五大行业最高，其中以互联网行业占比最高，占大数据IT应用投资规模的28.9%，其次是电信领域（19.9%），第三为金融领...

• ### 2 个月如何从小白到 Python 高手，牛津大学博士带你飞！

Python 可以做任何事情。无论是从入门级选手到专业级数据挖掘、科学计算、图像处理、人工智能，Python 都可以胜任。或许是因为这种万能属性，周围好更多的小...

• ### 【笔记】CDA LEVEL II 数据建模师培训学习笔记（一）软件安装

? 写在前面：此笔记是PPV课学员张梦根据李玉玺老师在CDA LEVEL II 数据建模师培训的上课内容整理而成的。 ———————————–作者说明——...

• ### 欺诈、骗单、玩消失，如何用大数据解决银行这些痛点？

银行的问题总是循环往复地出现。打开任何一家新闻网站或者报纸，我们都能看到一篇又一篇关于银行问题的报道。欺诈、英国退欧引发的不良影响、各式各样的金融危机和违规行为...

• ### 【陆勤阅读】PyCon 2014：机器学习应用占据Python的半壁江山

今年的PyCon于4月9日在加拿大蒙特利尔召开，凭借快速的原型实现能力， Python在学术界得到了广泛应用。最近其官方网站发布了大会教程部分的视频和幻灯片，其...

• ### 一个资深数据人对 数据挖掘 的解读

数据分析网 http://www.afenxi.com/post/7348 在银行做了两年的数据分析和挖掘工作，较少接触互联网的应用场景，因此，一直都在思考一个...

• ### 【数据挖掘】互联网和金融，在数据挖掘上究竟存在什么样的区别？

---- 在银行做了两年的数据分析和挖掘工作，较少接触互联网的应用场景，因此，一直都在思考一个问题，“互联网和金融，在数据挖掘上，究竟存在什么样的区别”。在对这...