评分卡模型(一)评分卡建模实战
小P:我看你做的这些数据挖掘,虽然预测结果挺准的,但是完全不知道怎么来的啊 小H:其实在风控领域有个很流行的评分卡模型,可以很直观的告诉你什么特征加分,什么特征减分,每个样本有多少分 小P:这个可以啊,那它有什么缺点吗 小H:缺点,那自然是准确率可能会低一点~
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split,cross_val_score # 数据分区库
import xgboost as xgb
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, \
precision_score, recall_score, roc_curve, roc_auc_score, precision_recall_curve # 导入指标库
from imblearn.over_sampling import SMOTE # 过抽样处理库SMOTE
import matplotlib.pyplot as plt
import prettytable # 导入表格库
from pandas_profiling import ProfileReport # 自动eda
import sweetviz as sv # 自动eda
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
import os
import shutil
import toad
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from toad.plot import bin_plot, badrate_plot
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from scipy.stats import scoreatpercentile
from toad.scorecard import ScoreCard
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder # 数据预处理库
%matplotlib inline
# 风格设置
plt.rcParams['font.sans-serif'] = ['SimHei'] # 设置中文
sns.set(style="ticks") # 设置风格
# 导入自定义模块
import sys
sys.path.append("/Users/heinrich/Desktop/Heinrich-blog/数据分析使用手册")
from keyIndicatorMapping import *
# Load the raw modelling data
raw_data = pd.read_csv('train.csv')  # read the data file
raw_data.head()
image-20230206152523445
# Classify variables by role/type (var_class and get_key come from the
# custom keyIndicatorMapping module — semantics assumed from usage).
var_class_dic = var_class(raw_data, 'target')
y_col = get_key(var_class_dic, 'y')[0]         # target column name
date_col = get_key(var_class_dic, 'date')      # date columns
number_col = get_key(var_class_dic, 'number')  # numeric columns
object_col = get_key(var_class_dic, 'object')  # categorical columns
# Coerce dtypes: convert everything numeric-like to numbers; columns that
# cannot be converted are left untouched (errors='ignore').
raw_data = raw_data.apply(pd.to_numeric, errors='ignore')
# raw_data[date_col] = raw_data[date_col].apply(pd.to_datetime)  # convert date columns
# Data audit
na_count = raw_data.isnull().any().sum()  # number of COLUMNS containing missing values (any() is per column)
n_samples, n_features = raw_data.shape    # total samples, total features
print('samples: {0}| features: {1} | na count: {2}'.format(n_samples, n_features, na_count))
samples: 108940| features: 167 | na count: 8
# ID column(s)
ids_col = ['APP_ID_C']
# Columns excluded from modelling: typically IDs, dates and the target
ex_lis = ids_col+date_col+[y_col]
# Excluded columns without the target
ex_lis_noy = ids_col+date_col
# Per-variable summary statistics from toad
raw_data_detect=toad.detector.detect(raw_data)
raw_data_detect
image-20230206152613997
# Variable value information: IV, Gini index, entropy, unique values.
# 1. For large or high-dimensional data, iv_only=True is recommended.
# 2. Drop primary keys, dates and other high-cardinality columns not used for modelling.
toad.quality(raw_data.drop(ex_lis_noy, axis=1), y_col, iv_only=True)
image-20230206152641954
# Split samples by month: train (2019-03..05) vs. test (everything else)
train = raw_data.loc[raw_data.month.isin(['2019-03','2019-04','2019-05'])==True,:]
test = raw_data.loc[raw_data.month.isin(['2019-03','2019-04','2019-05'])==False,:]
# Method 1: initial screen by missing rate, IV and correlation
raw_data_1s, drop_lst= toad.selection.select(train, train[y_col],
                                             empty=0.7, iv=0.1,
                                             corr=0.7,
                                             return_drop=True,
                                             exclude=ex_lis)
# Method 2: keep important features via a machine-learning model
# Label-encode categorical variables (obj_label comes from the custom module)
df = obj_label(raw_data, ex_lis)
# Build the xgboost design matrix
X = df.drop(ex_lis, axis=1)
y = df[y_col]
def xgb_topN(X, y, n):
    """Return the names of the n most important features ranked by XGBoost.

    X: feature DataFrame; y: binary target; n: number of features to keep.
    """
    clf = xgb.XGBClassifier(
        eval_metric=['logloss', 'auc', 'error'],
        use_label_encoder=False,
        seed=11,  # fixed seed so the ranking is reproducible
    )
    clf.fit(X, y)
    # Rank features by importance and keep the top n names
    ranking = pd.Series(clf.feature_importances_, index=X.columns)
    top = ranking.sort_values(ascending=False).head(n)
    return list(top.index)
# Union of the two feature sets (toad screen + XGBoost top-N)
N=10
xgb_topN_feature = xgb_topN(X, y, N)
features = list(set(raw_data_1s.columns.to_list()+xgb_topN_feature))
# Build the new training set
train_s = train[features].copy()
print("drop empty:", len(drop_lst['empty']),
      "drop iv:", len(drop_lst['iv']),
      "drop corr:", len(drop_lst['corr']),
      "xgb keep:", N,
      "keep:", train_s.shape[1]
      )
drop empty: 0 drop iv: 131 drop corr: 14 xgb keep: 10 keep: 29
# Fit split points via chi-square binning (bins are left-closed, right-open by default)
combiner = toad.transform.Combiner()
combiner.fit(train_s, train_s[y_col], method='chi',
             min_samples=0.05, exclude=ex_lis)
# Export the bin boundaries
bins = combiner.export()
print(bins)
{'var_l_68': [0.000255689, 0.002045513, 0.013040143, 0.025313219], 'var_l_91': [0.000125945, 0.002141058], 'var_b17': [], 'var_l_107': [4.26e-05, 0.158232299], 'var_l_71': [0.00134285], 'var_l_67': [], 'var_d1': [['Hit-6+ Vintage'], ['Hit-lt 6 Vinta', 'nan']], 'var_b18': [-9999, 2], 'var_l_119': [0.0035087720000000003, 0.426829268], 'var_l_33': [], 'var_b15': [-8888, 1, 3], 'var_l_58': [0.000534759], 'var_b20': [], 'var_b12': [-8888, -999], 'var_l_89': [0.0005875440000000001, 0.004700353], 'var_l_19': [0.000328176], 'var_d7': [['LARGE FLEET OPERATOR', 'COMPANY', 'STRATEGIC TRANSPRTER', 'DOCTOR-SELF EMPLOYED', 'SALARIED', 'HOUSEWIFE', 'DOCTOR-SALARIED', 'CONSULTANT', 'SAL(RETIRAL AGE 58)', 'MEDIUM FLEETOPERATOR', 'STRATEGIC CAPTIVE'], ['RETAIL TRANSPORTER', 'SAL(RETIRAL AGE 60)', 'nan', 'STUDENT', 'SERVICES', 'OTHERS', 'MANUFACTURING', 'TRADER', 'PENSIONER', 'CARETAKER', 'AGENT', 'FIRST TIME USERS', 'DIRECTOR', 'SMALL RD TRANS.OPR', 'SERV-PRIVATE SECTOR', 'TRADING'], ['PROPRIETOR', 'SELF-EMPLOYED', 'BUSINESSMAN', 'RETIRED PERSONNEL', 'STRATEGIC S1', 'CONTRACTOR', 'GOVERNMENT SERVICE', 'AGRICULTURIST', 'MANAGER', 'RETAIL', 'TEACHER', 'SCHOOLS', 'BANKS SERVICE', 'OFFICER', 'ACCOUNTANT', 'RESTAURANT KEEPER', 'Salaried', 'GENARAL RETAILER', 'SERV-PUBLIC SECTOR', 'Defence (NCO)', 'POLICEMAN', 'ADVISOR', 'STRATEGIC S2', 'SALESMAN', 'BARRISTER', 'OTHERS NOT DEFINED', 'RETAIL CAPTIVE', 'SUP STRAT TRANSPORT', 'CARPENTER', 'SECRETARY', 'LECTURER', 'JEWELLER', 'DRIVER', 'ATTORNEY AT LAW', 'STRATEGIC S3', 'PROGRAMMER', 'TECHNICIAN', 'TAILOR', 'PLANTER', 'PRIEST', 'EXECUTIVE ASSISTANT', 'STRATEGIC BUS OP', 'HELPER', 'CLERK', 'PROOF READER', 'ASSOCIATION', 'TYPIST', 'DISPENSER', 'ADMINSTRATOR', 'BUS CONTROLLER', 'Trading', 'TAXI DRIVER', 'QUANTITY SURVEYOR', 'INDUSTRY', 'ENGINEERING', 'NURSE', 'PRIVATE TAILOR', 'GARAGIST', 'CHAIRMAN', 'STOCKBROKER(S)-COMMD', 'HAIRDRESSER', 'PHARMACIST', 'RETAIL BUS OPERATOR']], 'var_d2': [669.0, 748.0, 782.0, 818.0], 'var_l_125': [0.001808318, 
0.237654321, 0.401360544], 'var_b10': [-8888.0, 0.548229531], 'var_l_43': [], 'var_d11': [['N'], ['U']], 'var_d5': [['O', 'nan', 'F'], ['M']], 'var_l_60': [0.000429369, 0.001288107, 0.004723057, 0.014169171], 'var_b19': [-9999, 1, 2, 6], 'var_b9': [-9999, -8888, 9]}
通过Bivar图观察变量分箱后的单调趋势,若存在波动性,则进行手动分箱
# Variable distributions after binning
# Requires df, y_col and figure_save_path to be defined
train_t = combiner.transform(train_s)
figure_save_path = "var_bivar_file"
if os.path.exists(figure_save_path):
    shutil.rmtree(figure_save_path)  # start from an empty directory
    os.mkdir(figure_save_path)
else:
    os.mkdir(figure_save_path)
# Save one Bivar plot per binned variable
for x in bins.keys():
    bin_plot(train_t, x=x, target=y_col)
    plt.savefig('%s/%s.jpg' % (figure_save_path, x))
    plt.close()
# Compute a near-square grid layout large enough for all bin plots
num_plots = len(bins.keys())
num_cols = math.ceil(np.sqrt(num_plots))
num_rows = math.ceil(num_plots/num_cols)
# Assemble the saved per-variable images into one montage figure
fig = plt.figure(figsize = (80,60))  # each plt.figure() call opens a fresh figure
for i, x in enumerate(bins.keys()):
    path = figure_save_path+'/'+x+'.jpg'
    img = plt.imread(path)
    # subplot() takes (nrows, ncols, index) with a 1-based index.
    # Fixed: the arguments were previously passed as (num_cols, num_rows),
    # so the grid shape did not match the computed rows/cols.
    plt.subplot(num_rows, num_cols, i+1)
    plt.imshow(img)
    # Hide each tile's own axis ticks so the montage looks clean
    plt.xticks([])
    plt.yticks([])
fig.tight_layout()  # trim the overall whitespace
plt.subplots_adjust(wspace =0, hspace =0)  # no gaps between tiles
plt.show()
image-20221221220114239
# Find continuous variables whose badrate is not monotonic across bins
# (candidates for manual re-binning) and variables with only one bin (to delete)
adj_col = []
del_col = []
for x, v in bins.items():
    if len(v)==0:
        del_col.append(x)  # no split points -> single bin, useless for scoring
    elif x in object_col:
        pass  # categorical variables: monotonicity is not required
    else:
        df_temp = bin_plot(train_t, x=x, target=y_col, return_frame=True)
        plt.close()
        L = list(df_temp[1]['badrate'])
        # Keep the variable if badrate is monotonically non-decreasing OR non-increasing;
        # otherwise mark it for manual adjustment. (The x, y inside the generator
        # expressions are local to them and do not clobber the loop variable x.)
        if all(x<=y for x, y in zip(L, L[1:])) or all(x>=y for x, y in zip(L, L[1:])) :
            pass
        else:
            adj_col.append(x)
print('adj_col:',adj_col)
print('del_col:',del_col)
adj_col: ['var_l_68', 'var_l_107', 'var_b18', 'var_l_119', 'var_b12', 'var_d2', 'var_l_125', 'var_b10', 'var_b19', 'var_b9']
del_col: ['var_b17', 'var_l_67', 'var_l_33', 'var_b20', 'var_l_43']
# 手动调整分箱:部分变量分箱后,badrate没有呈现单调趋势
# 这里只调整var_b9(存在上下波动趋势)
print(bins['var_b9'])
[-9999, -8888, 9]
# Manual adjustment: merge bins 2 and 3 of var_b9 (its badrate oscillated)
adj_bin = {'var_b9': [-9999,9]}
combiner.set_rules(adj_bin)
train_t = combiner.transform(train_s)
# Re-draw the Bivar plot to inspect the adjusted trend
bin_plot(train_t, x='var_b9', target=y_col)
plt.show()
output_26_0
调整后的var_b9具有单调趋势
# Bin stability between the train and test samples
train_t = combiner.transform(train_s)
test_t = combiner.transform(test[train_s.columns])
data = pd.concat([train_t,test_t], join='inner', keys=['train', 'test'])\
        .reset_index(level=0).rename(columns={'level_0':'sample'})
figure_save_path = "var_badrate_file"
if os.path.exists(figure_save_path):
    shutil.rmtree(figure_save_path)  # start from an empty directory
    os.mkdir(figure_save_path)
else:
    os.mkdir(figure_save_path)
# Save one badrate plot per binned variable
for x in bins.keys():
    badrate_plot(data, x='sample', target=y_col, by=x)
    plt.title(x)
    plt.savefig('%s/%s.jpg' % (figure_save_path, x))
    plt.close()
# Compute a near-square grid layout large enough for all badrate plots
num_plots = len(bins.keys())
num_cols = math.ceil(np.sqrt(num_plots))
num_rows = math.ceil(num_plots/num_cols)
# Assemble the saved per-variable images into one montage figure
fig = plt.figure(figsize = (80,60))  # each plt.figure() call opens a fresh figure
for i, x in enumerate(bins.keys()):
    path = figure_save_path+'/'+x+'.jpg'
    img = plt.imread(path)
    # subplot() takes (nrows, ncols, index) with a 1-based index.
    # Fixed: the arguments were previously passed as (num_cols, num_rows).
    plt.subplot(num_rows, num_cols, i+1)
    plt.imshow(img)
    # Hide each tile's own axis ticks so the montage looks clean
    plt.xticks([])
    plt.yticks([])
fig.tight_layout()  # trim the overall whitespace
plt.subplots_adjust(wspace =0, hspace =0)  # no gaps between tiles
plt.show()
image-20221221220421865
# Bin stability across months
train_t = combiner.transform(train_s)
test_t = combiner.transform(test[train_s.columns])
data = pd.concat([train_t,test_t], join='inner', keys=['train', 'test'])\
        .reset_index(level=0).rename(columns={'level_0':'sample'})
figure_save_path = "var_badrate_file_month"
if os.path.exists(figure_save_path):
    shutil.rmtree(figure_save_path)  # start from an empty directory
    os.mkdir(figure_save_path)
else:
    os.mkdir(figure_save_path)
# Save one badrate-by-month plot per binned variable
for x in bins.keys():
    badrate_plot(data, x='month', target=y_col, by=x)
    plt.title(x)
    plt.savefig('%s/%s.jpg' % (figure_save_path, x))
    plt.close()
# Compute a near-square grid layout large enough for all monthly plots
num_plots = len(bins.keys())
num_cols = math.ceil(np.sqrt(num_plots))
num_rows = math.ceil(num_plots/num_cols)
# Assemble the saved per-variable images into one montage figure
fig = plt.figure(figsize = (80,60))  # each plt.figure() call opens a fresh figure
for i, x in enumerate(bins.keys()):
    path = figure_save_path+'/'+x+'.jpg'
    img = plt.imread(path)
    # subplot() takes (nrows, ncols, index) with a 1-based index.
    # Fixed: the arguments were previously passed as (num_cols, num_rows).
    plt.subplot(num_rows, num_cols, i+1)
    plt.imshow(img)
    # Hide each tile's own axis ticks so the montage looks clean
    plt.xticks([])
    plt.yticks([])
fig.tight_layout()  # trim the overall whitespace
plt.subplots_adjust(wspace =0, hspace =0)  # no gaps between tiles
plt.show()
image-20221221220514510
# 手动调整分箱:var_b19在不同月份间存在交叉,不具有稳定性
# var_l_68的1,2箱存在交叉,合并为一箱
print(bins['var_b19'])
[-9999, 1, 2, 6]
# Manual adjustment: merge the crossing bins of var_b19
adj_bin = {'var_b19': [-9999, 1, 6]}
combiner.set_rules(adj_bin)
train_t = combiner.transform(train_s)
test_t = combiner.transform(test[train_s.columns])
data = pd.concat([train_t,test_t], join='inner', keys=['train', 'test'])\
        .reset_index(level=0).rename(columns={'level_0':'sample'})
# Badrate plots to verify the adjusted bins are stable
badrate_plot(data, x='sample', target=y_col, by='var_b19')
badrate_plot(data, x='month', target=y_col, by='var_b19')
plt.show()
output_34_0
output_34_1
手动分箱后,在不同样本、月份间均稳定
# Drop the single-bin variables found earlier
train_t.drop(del_col, axis=1, inplace=True)
test_t.drop(del_col, axis=1, inplace=True)
print("keep:", train_t.shape[1])
keep: 24
目的是将特征的非线性关系转换为线性的,对异常值不敏感
# WOE encoding: linearise the features' relationship with the target;
# insensitive to outliers by construction.
w = toad.transform.WOETransformer()
# Map WOE values onto the data: fit_transform on train, transform on test.
train_w = w.fit_transform(train_t, train_t[y_col],
                          exclude=ex_lis)
test_w = w.transform(test_t[train_t.columns])
data = pd.concat([train_w, test_w])
# PSI screen: keep only stable variables
np.seterr(divide='ignore',invalid='ignore')  # silence invalid-value warnings from 0/0
psi_df = toad.metrics.PSI(train_w, test_w).sort_values(0)
psi_df = psi_df.reset_index()
psi_df = psi_df.rename(columns = {'index': 'feature', 0: 'psi'})
# Union of low-PSI features and the always-kept excluded columns
col_keep = list(set(list(psi_df[psi_df.psi<0.02].feature)).union(set(ex_lis)))
train_psi = train_w[col_keep]
print("keep:", train_psi.shape[1])
keep: 24
# After WOE encoding some IVs drop and overall correlation rises,
# so run the feature screen a second time.
train_psi_s2, drop_lst = toad.selection.select(train_psi,
                                               train_psi[y_col],
                                               empty=0.7,
                                               iv=0.1,
                                               corr=0.7,
                                               return_drop=True,
                                               exclude=ex_lis)
print("keep:", train_psi_s2.shape[1],
      "drop empty:", len(drop_lst['empty']),
      "drop iv:", len(drop_lst['iv']),
      "drop corr:", len(drop_lst['corr']))
keep: 13 drop empty: 0 drop iv: 9 drop corr: 2
# Stepwise (both-direction) variable selection by AIC on an OLS estimator
train_stp = toad.selection.stepwise(train_psi_s2,
                                    train_psi_s2[y_col],
                                    exclude=ex_lis,
                                    direction='both',
                                    criterion='aic',
                                    estimator='ols',
                                    intercept=False)
print("keep:", train_stp.shape[1])
keep: 10
# Restrict the test set to the selected columns and stack train + test
test_stp = test_w[train_stp.columns]
data_finall = pd.concat([train_stp, test_stp])
print(data_finall.shape)
(108940, 10)
# Build the train/test matrices
X, y = data_finall.drop(ex_lis, axis=1), data_finall[y_col]
X_train, y_train = train_stp.drop(ex_lis, axis=1), train_stp[y_col]
X_test, y_test = test_stp.drop(ex_lis, axis=1), test_stp[y_col]
# Class balancing: SMOTE oversampling with a fixed seed for reproducibility
model_smote = SMOTE(random_state=0)
X_train, y_train = model_smote.fit_resample(X_train,y_train)  # oversample the training set only
# Fit the logistic regression
model_lr = LogisticRegression(C=0.1, class_weight='balanced')
model_lr.fit(X_train, y_train)
LogisticRegression(C=0.1, class_weight='balanced')
# Confusion matrices (helper from the custom keyIndicatorMapping module)
model_confusion_metrics(model_lr, X_train, y_train, 'train')
model_confusion_metrics(model_lr, X_test, y_test, 'test')
confusion matrix for train
+----------+--------------+--------------+
| | prediction-0 | prediction-1 |
+----------+--------------+--------------+
| actual-0 | 45378 | 18566 |
| actual-1 | 23641 | 40303 |
+----------+--------------+--------------+
confusion matrix for test
+----------+--------------+--------------+
| | prediction-0 | prediction-1 |
+----------+--------------+--------------+
| actual-0 | 683 | 288 |
| actual-1 | 15786 | 26819 |
+----------+--------------+--------------+
# Core metrics: auc / accuracy / precision / recall / f1 / ks
model_core_metrics(model_lr, X_train, y_train, 'train')
model_core_metrics(model_lr, X_test, y_test, 'test')
core metrics for train
+------+----------+-----------+--------+-------+------+
| auc | accuracy | precision | recall | f1 | ks |
+------+----------+-----------+--------+-------+------+
| 0.74 | 0.67 | 0.657 | 0.71 | 0.683 | 0.35 |
+------+----------+-----------+--------+-------+------+
core metrics for test
+-------+----------+-----------+--------+-------+-------+
| auc | accuracy | precision | recall | f1 | ks |
+-------+----------+-----------+--------+-------+-------+
| 0.728 | 0.631 | 0.041 | 0.703 | 0.078 | 0.335 |
+-------+----------+-----------+--------+-------+-------+
# Discrimination and ranking ability of the LR model on the test set
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_roc(model_lr, X_test, y_test, name='test')
plt.subplot(222)
plot_ks(model_lr, X_test, y_test, name='test')
plt.subplot(223)
plot_pr(model_lr, X_test, y_test, name='test')
plt.subplot(224)
plot_lift(model_lr, X_test, y_test, name='test')
plt.tight_layout()
plt.show()
output_57_0
# Generalisation ability: cross-validation box plot and learning curve
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_cv_box(model_lr, X_test, y_test, name='test')
plt.subplot(222)
plot_learning_curve(model_lr, X_test, y_test, name='test')
plt.tight_layout()
plt.show()
output_60_0
# Model PSI: <10% no refresh needed; 10%-20% investigate and monitor more
# frequently; >20% the model should be re-trained.
mpsi = model_psi(model_lr, X_train, X_test)
print('模型PSI:',mpsi)
模型PSI: 0.3299012600361123
# Capture-rate report for the LR model
y_test_prob = model_lr.predict_proba(X_test)[:, 1]
df_capture = capture_table(y_test_prob, y_test)
df_capture.columns=['KS', '负样本个数', '正样本个数', '负样本累计个数', '正样本累计个数', '捕获率', '负样本占比']
df_capture
image-20230206152721898
逻辑回归评分卡拉伸规则:
# Compute the odds used for score scaling
bad_total=raw_data[y_col].sum()            # number of bad samples (target == 1)
good_total=raw_data.shape[0]-bad_total     # number of good samples
odds=round(bad_total/good_total,2)         # bad/good ratio
base_odds=round(good_total/bad_total,0)    # good/bad ratio, used as the scorecard base
print('bad_total:{0}\ngood_total:{1}\nodds:{2}\nbase_odds:{3}\n'.format(bad_total,good_total,odds,base_odds))
bad_total:2391
good_total:106549
odds:0.02
base_odds:45.0
# Build the scorecard. Note: ScoreCard solves A = P0 - B*log(odds),
# so base_odds here is the good/bad ratio, i.e. (1-p)/p.
card = ScoreCard(combiner=combiner,
                 transer=w, C=0.1,
                 class_weight='balanced',
                 base_score=600,
                 base_odds=45,
                 pdo=60,
                 rate=2)
card.fit(X_train, y_train)
# Export the standard scorecard rules as a DataFrame
final_card = card.export(to_frame=True)
final_card
image-20230206152753141
def get_score(X, card):
    """Score every row of X with a fitted scorecard.

    X: feature DataFrame.
    card: fitted scorecard-like object exposing predict().
    Returns a copy of X with an extra 'score' column appended.
    """
    scored = X.copy()
    scored["score"] = card.predict(X)
    return scored
# Score the test set with the scorecard
final_data_score=get_score(test, card)
# Histogram of the scores
sns.histplot(final_data_score['score'])
plt.show()
output_73_0
评分分布呈现正态,符合预期
# Discrimination of the scores: class-split histogram and Lorenz curve
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_score_hist(final_data_score, y_col, 'score')
plt.subplot(222)
plot_lorenz(final_data_score, y_col, 'score')
plt.tight_layout()
plt.show()
output_76_0
%%time
# Search for the cutoff score that maximises KS
print('{:*^60}'.format('cutoff search result'))
_, cutoff_score=search_cutoff(final_data_score,y_col,'score')
print('{:*^60}'.format('set cutoff result'))
# Evaluate rule effectiveness at the chosen cutoff
matrix_df=rule_verify(final_data_score,y_col,'score',cutoff_score)
********************cutoff search result********************
最大KS值:0.334
KS最大的分数:280
*********************set cutoff result**********************
拒绝准确率:0.04
查全率:0.742
误伤率:0.408
规则拒绝率:0.415
CPU times: user 4min 40s, sys: 50.6 s, total: 5min 30s
Wall time: 5min 31s
# Visualise the cutoff on the score distribution
plot_score_hist(final_data_score, y_col, 'score', cutoff=cutoff_score)
plt.show()
output_80_0
评分卡区分较差,误伤率过高。拒绝准确率过低
# Per-variable score detail for each sample (return_sub=True yields sub-scores)
final_data_score_detail = card.predict(test, return_sub=True)[1]
final_data_score_detail['score'] = final_data_score_detail[list(final_card['name'].unique())].sum(axis=1)
# Min-max normalise the per-variable scores (all columns except the total 'score')
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
final_data_score_detail_scaler = final_data_score_detail.copy()
final_data_score_detail_scaler.iloc[:,:-1] = final_data_score_detail_scaler.iloc[:,:-1].apply(max_min_scaler)
# Radar chart of one user's normalised per-variable scores
fig = plt.figure(figsize=(6,6))  # create the canvas
ax = fig.add_subplot(111, polar=True)  # polar axes are required for a radar chart
labels = final_card['name'].unique()  # variable names shown on the axes
cor_list = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']  # colours per series
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)  # one angle per variable
angles = np.concatenate((angles, [angles[0]]))  # repeat the first point so the polygon closes
labels = np.concatenate((labels,[labels[0]]))  # newer matplotlib also needs labels closed
# Draw the radar line for one user
i = 65367  # index of the user to display
score = int(final_data_score_detail_scaler.loc[i]['score'])
data_tmp = np.array(final_data_score_detail_scaler.loc[i])[0:-1]  # sub-scores only, drop the total
data = np.concatenate((data_tmp, [data_tmp[0]]))  # close the polygon
ax.plot(angles, data, 'o-', c=cor_list[0], label=f'score:{score}')
# Display formatting
ax.set_thetagrids(angles * 180 / np.pi, labels, fontproperties="SimHei")  # polar axis labels
ax.set_title(f"用户{i}得分雷达图", fontproperties="SimHei")  # chart title
ax.set_rlim(-0.2, 1.2)  # radial range
plt.legend(loc=0)  # legend placement
plt.show()
output_83_0
由于评分卡是基于LR模型训练的,虽然在特征处理过程较为严格,但本身模型准确性较低。因此可以考虑通过其他准确率高的模型进行训练,例如XGBoost。只需将odds的计算换为(1-p)/p
即可,这里的p为模型输出的概率值。
当然基于其他机器学习模型的评分卡虽然提高了准确性,也能得到最终得分,但由于缺乏系数支持,所以无法获得每个变量的单独得分。
# Train/test split on the binned (not WOE-encoded) data for XGBoost
X_train, y_train = train_t.drop(ex_lis, axis=1), train_t[y_col]
X_test, y_test = test_t.drop(ex_lis, axis=1), test_t[y_col]
# Class balancing: SMOTE oversampling with a fixed seed for reproducibility
model_smote = SMOTE(random_state=0)
X_train, y_train = model_smote.fit_resample(X_train,y_train)  # oversample the training set only
# Train the XGBoost classifier
param_dist = {'n_estimators': 10, 'subsample': 0.8, 'learning_rate':0.1,
              'max_depth': 10, 'n_jobs': -1,
              'eval_metric':'logloss', 'use_label_encoder':False}
model_xgb = xgb.XGBClassifier(**param_dist)
model_xgb.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=10, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=10, n_jobs=-1,
num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=0.8, tree_method='exact',
use_label_encoder=False, validate_parameters=1, verbosity=None)
# Core evaluation of the XGBoost model
model_confusion_metrics(model_xgb, X_train, y_train, 'train')
model_confusion_metrics(model_xgb, X_test, y_test, 'test')
model_core_metrics(model_xgb, X_train, y_train, 'train')
model_core_metrics(model_xgb, X_test, y_test, 'test')
confusion matrix for train
+----------+--------------+--------------+
| | prediction-0 | prediction-1 |
+----------+--------------+--------------+
| actual-0 | 56929 | 7015 |
| actual-1 | 21968 | 41976 |
+----------+--------------+--------------+
confusion matrix for test
+----------+--------------+--------------+
| | prediction-0 | prediction-1 |
+----------+--------------+--------------+
| actual-0 | 663 | 308 |
| actual-1 | 14824 | 27781 |
+----------+--------------+--------------+
core metrics for train
+-------+----------+-----------+--------+-------+-------+
| auc | accuracy | precision | recall | f1 | ks |
+-------+----------+-----------+--------+-------+-------+
| 0.863 | 0.773 | 0.722 | 0.89 | 0.797 | 0.551 |
+-------+----------+-----------+--------+-------+-------+
core metrics for test
+-------+----------+-----------+--------+-------+-------+
| auc | accuracy | precision | recall | f1 | ks |
+-------+----------+-----------+--------+-------+-------+
| 0.728 | 0.653 | 0.043 | 0.683 | 0.081 | 0.339 |
+-------+----------+-----------+--------+-------+-------+
# Discrimination and ranking ability of the XGBoost model
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_roc(model_xgb, X_test, y_test, name='test')
plt.subplot(222)
plot_ks(model_xgb, X_test, y_test, name='test')
plt.subplot(223)
plot_pr(model_xgb, X_test, y_test, name='test')
plt.subplot(224)
plot_lift(model_xgb, X_test, y_test, name='test')
plt.tight_layout()
plt.show()
output_92_0
# Generalisation ability of the XGBoost model
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_cv_box(model_xgb, X_test, y_test, name='test')
plt.subplot(222)
plot_learning_curve(model_xgb, X_test, y_test, name='test')
plt.tight_layout()
plt.show()
output_93_0
# Model stability (PSI) of the XGBoost model
mpsi = model_psi(model_xgb, X_train, X_test)
print('模型PSI:',mpsi)
模型PSI: 0.5933633709800615
# Capture-rate report for the XGBoost model
y_test_prob = model_xgb.predict_proba(X_test)[:, 1]
df_capture = capture_table(y_test_prob, y_test)
df_capture.columns=['KS', '负样本个数', '正样本个数', '负样本累计个数', '正样本累计个数', '捕获率', '负样本占比']
df_capture
image-20230206152823018
# Scorecard-style validation for XGBoost and other ML models. These only
# return a total score per sample — without per-variable coefficients the
# interpretability is lower than logistic regression.
def score_rule(p):
    """Convert a predicted bad probability p into a score.

    Base score 600, PDO 60: score = 600 + 60*log2((1-p)/p).
    p must lie strictly between 0 and 1 (log2 is undefined otherwise).
    """
    score = 600+60*(math.log2((1-p)/p))
    return score

def get_score_ml(model, X, y):
    """Score a dataset with a fitted probability model.

    model: fitted classifier exposing predict_proba()
    X: feature DataFrame
    y: label Series aligned with X
    Returns X joined with y plus 'prob' (bad probability) and 'score' columns.
    """
    # Fixed: the original concatenated the global X_test/y_test instead of the
    # X/y parameters, so scoring any other dataset silently used the wrong rows.
    df = pd.concat([X, y], axis=1)
    df["prob"] = model.predict_proba(X)[:, 1]
    df["score"] = df["prob"].map(score_rule)
    return df
# Score the test set with the ML-based scorecard
final_data_score=get_score_ml(model_xgb, X_test, y_test)
# Histogram of the scores
sns.histplot(final_data_score['score'])
plt.show()
output_97_0
# Discrimination of the ML scorecard output
fig = plt.figure(figsize=(18,12))
plt.subplot(221)
plot_score_hist(final_data_score, y_col, 'score')
plt.subplot(222)
plot_lorenz(final_data_score, y_col, 'score')
plt.tight_layout()
plt.show()
output_98_0
%%time
# Search for the cutoff score that maximises KS
print('{:*^60}'.format('cutoff search result'))
_, cutoff_score=search_cutoff(final_data_score,y_col,'score')
print('{:*^60}'.format('set cutoff result'))
# Evaluate rule effectiveness at the chosen cutoff
matrix_df=rule_verify(final_data_score,y_col,'score',cutoff_score)
********************cutoff search result********************
最大KS值:0.337
KS最大的分数:595
*********************set cutoff result**********************
拒绝准确率:0.045
查全率:0.648
误伤率:0.311
规则拒绝率:0.319
CPU times: user 1min 10s, sys: 1.29 s, total: 1min 12s
Wall time: 1min 12s
# Visualise the cutoff on the score distribution
plot_score_hist(final_data_score, y_col, 'score', cutoff=cutoff_score)
plt.show()
output_100_0
评分卡模型是具有完整且完善的建模流程,而且结果展示完全适用于业务运营,因此兼具高准确性、高解释性的优点,而且利用评分卡模型解释日常业务时,不需要考虑cutoff,也不必担心因为拒绝造成的幸存者偏差现象,因此也不必进行拒绝推断、迁移学习等。
共勉~
[1]
toad使用教程: https://toad.readthedocs.io/en/stable/tutorial_chinese.html
[2]
基于Xgboost的AI评分卡构建: https://blog.csdn.net/zzpl139/article/details/125968097?utm_medium=distribute.pc_feed_404.none-task-blog-2~default~BlogCommendFromBaidu~Rate-10-125968097-blog-null.pc_404_mixedpudn&depth_1-utm_source=distribute.pc_feed_404.none-task-blog-2~default~BlogCommendFromBaidu~Rate-10-125968097-blog-null.pc_404_mixedpud
[3]
评分卡模型的评估方法论: https://zhuanlan.zhihu.com/p/56738542