TVP

# 理解Scikit-Learn中分类性能度量指标

Understanding Data Science Classification Metrics in Scikit-Learn in Python

https://towardsdatascience.com/data-science-performance-metrics-for-everyone-4d68f4859eef

confusion_matrix

accuracy_score

recall_score

precision_score

f1_score

roc_curve

roc_auc_score

https://github.com/andrewwlong/classification_metrics_sklearn

import pandas as pd

thresh =0.5

df['predicted_RF'] = (df.model_RF >=0.5).astype('int')

df['predicted_LR'] = (df.model_LR >=0.5).astype('int')

confusion_matrix

True positive（真阳性） — 真实值（actual） = 1, 预测值（predicted） = 1

False positive（假阳性） — 真实值（actual） = 0, 预测值（predicted） = 1

False negative（假阴性） — 真实值（actual） = 1, 预测值（predicted） = 0

True negative（真阴性） — 真实值（actual） = 0, 预测值（predicted） = 0

https://en.wikipedia.org/wiki/Precision_and_recall#/media/File:Precisionrecall.svg

from sklearn.metrics import confusion_matrix

confusion_matrix(df.actual_label.values, df.predicted_RF.values)

def find_TP(y_true, y_pred):
    # counts the number of true positives (y_true = 1, y_pred = 1)
    return sum((y_true == 1) & (y_pred == 1))

def find_FN(y_true, y_pred):
    # counts the number of false negatives (y_true = 1, y_pred = 0)
    return sum((y_true == 1) & (y_pred == 0))

def find_FP(y_true, y_pred):
    # counts the number of false positives (y_true = 0, y_pred = 1)
    return sum((y_true == 0) & (y_pred == 1))

def find_TN(y_true, y_pred):
    # counts the number of true negatives (y_true = 0, y_pred = 0)
    return sum((y_true == 0) & (y_pred == 0))

print('TP:',find_TP(df.actual_label.values,df.predicted_RF.values))

print('FN:',find_FN(df.actual_label.values,df.predicted_RF.values))

print('FP:',find_FP(df.actual_label.values,df.predicted_RF.values))

print('TN:',find_TN(df.actual_label.values,df.predicted_RF.values))

import numpy as np

def find_conf_matrix_values(y_true, y_pred):
    # calculate TP, FN, FP, TN
    TP = find_TP(y_true, y_pred)
    FN = find_FN(y_true, y_pred)
    FP = find_FP(y_true, y_pred)
    TN = find_TN(y_true, y_pred)
    return TP, FN, FP, TN

def my_confusion_matrix(y_true, y_pred):
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return np.array([[TN, FP], [FN, TP]])

my_confusion_matrix(df.actual_label.values,df.predicted_RF.values)

assert np.array_equal(my_confusion_matrix(df.actual_label.values, df.predicted_RF.values),
                      confusion_matrix(df.actual_label.values, df.predicted_RF.values)), 'my_confusion_matrix() is not correct for RF'

assert np.array_equal(my_confusion_matrix(df.actual_label.values, df.predicted_LR.values),
                      confusion_matrix(df.actual_label.values, df.predicted_LR.values)), 'my_confusion_matrix() is not correct for LR'

accuracy_score

from sklearn.metrics import accuracy_score

accuracy_score(df.actual_label.values, df.predicted_RF.values)

def my_accuracy_score(y_true, y_pred):
    # calculates the fraction of samples predicted correctly
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return (TP + TN) / (TP + TN + FP + FN)

assert my_accuracy_score(df.actual_label.values, df.predicted_RF.values) == accuracy_score(df.actual_label.values, df.predicted_RF.values), 'my_accuracy_score failed on RF'

assert my_accuracy_score(df.actual_label.values, df.predicted_LR.values) == accuracy_score(df.actual_label.values, df.predicted_LR.values), 'my_accuracy_score failed on LR'

print('Accuracy RF:%.3f'%(my_accuracy_score(df.actual_label.values,

df.predicted_RF.values)))

print('Accuracy LR:%.3f'%(my_accuracy_score(df.actual_label.values,

df.predicted_LR.values)))

recall_score

Recall召回率是您正确预测的positive事件的分数，如下所示：

from sklearn.metrics import recall_score

recall_score(df.actual_label.values, df.predicted_RF.values)

def my_recall_score(y_true, y_pred):
    # calculates the fraction of positive samples predicted correctly
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return TP / (TP + FN)

assert my_recall_score(df.actual_label.values, df.predicted_RF.values) == recall_score(df.actual_label.values, df.predicted_RF.values), 'my_recall_score failed on RF'

assert my_recall_score(df.actual_label.values, df.predicted_LR.values) == recall_score(df.actual_label.values, df.predicted_LR.values), 'my_recall_score failed on LR'

print('Recall RF:%.3f'%(my_recall_score(df.actual_label.values,

df.predicted_RF.values)))

print('Recall LR:%.3f'%(my_recall_score(df.actual_label.values,

df.predicted_LR.values)))

precision_score

Precision（精确度）是实际为正的事件所占总的预测阳性事件的比，如下所示:

from sklearn.metrics import precision_score

precision_score(df.actual_label.values, df.predicted_RF.values)

def my_precision_score(y_true, y_pred):
    # calculates the fraction of predicted positive samples that are actually positive
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return TP / (TP + FP)

assert my_precision_score(df.actual_label.values, df.predicted_RF.values) == precision_score(df.actual_label.values, df.predicted_RF.values), 'my_precision_score failed on RF'

assert my_precision_score(df.actual_label.values, df.predicted_LR.values) == precision_score(df.actual_label.values, df.predicted_LR.values), 'my_precision_score failed on LR'

print('Precision RF:%.3f'%(my_precision_score(df.actual_label.values,

df.predicted_RF.values)))

print('Precision LR:%.3f'%(my_precision_score(df.actual_label.values,

df.predicted_LR.values)))

f1_score

f1 score是召回率和精确度的调和平均值，得分越高越好。f1 score的计算公式如下:

from sklearn.metrics import f1_score

f1_score(df.actual_label.values, df.predicted_RF.values)

def my_f1_score(y_true, y_pred):
    # calculates the F1 score (harmonic mean of precision and recall)
    recall = my_recall_score(y_true, y_pred)
    precision = my_precision_score(y_true, y_pred)
    return 2 * precision * recall / (precision + recall)

assert my_f1_score(df.actual_label.values, df.predicted_RF.values) == f1_score(df.actual_label.values, df.predicted_RF.values), 'my_f1_score failed on RF'

assert my_f1_score(df.actual_label.values, df.predicted_LR.values) == f1_score(df.actual_label.values, df.predicted_LR.values), 'my_f1_score failed on LR'

print('F1 RF:%.3f'%(my_f1_score(df.actual_label.values,

df.predicted_RF.values)))

print('F1 LR:%.3f'%(my_f1_score(df.actual_label.values,

df.predicted_LR.values)))

print('scores with threshold= 0.5')

print('Accuracy RF:%.3f'%(my_accuracy_score(df.actual_label.values,

df.predicted_RF.values)))

print('Recall RF:%.3f'%(my_recall_score(df.actual_label.values,

df.predicted_RF.values)))

print('Precision RF:%.3f'%(my_precision_score(df.actual_label.values,

df.predicted_RF.values)))

print('F1 RF:%.3f'%(my_f1_score(df.actual_label.values,

df.predicted_RF.values)))

print(' ')

print('scores with threshold = 0.25')

print('Accuracy RF:%.3f'%(my_accuracy_score(df.actual_label.values,

(df.model_RF >=0.25).astype('int').values)))

print('Recall RF:%.3f'%(my_recall_score(df.actual_label.values,

(df.model_RF >=0.25).astype('int').values)))

print('Precision RF: %.3f'%(my_precision_score(df.actual_label.values,

(df.model_RF >=0.25).astype('int').values)))

print('F1 RF:%.3f'%(my_f1_score(df.actual_label.values,

(df.model_RF >=0.25).astype('int').values)))

roc_curve 和 roc_auc_score

ROC曲线非常有助于理解真阳性率（true-positive rate）和假阳性率（false positive rates）之间的平衡。 Scikit learn为实现和分析ROC曲线构建了函数。这些函数的输入（roc_curve和roc_auc_score）是实际标签和预测概率（不是预测标签）。

roc_curve和roc_auc_score都是复杂的函数，所以我们不会让你从头开始编写这些函数。相反，我们将向您展示如何使用scikit learn中的函数来实现并解释关键点。让我们先用roc_curve来做ROC图。

from sklearn.metrics import roc_curve

fpr_RF, tpr_RF, thresholds_RF = roc_curve(df.actual_label.values, df.model_RF.values)

fpr_LR, tpr_LR, thresholds_LR = roc_curve(df.actual_label.values, df.model_LR.values)

roc_curve函数返回三个列表：

thresholds = 按降序排列的所有唯一预测概率

fpr = 每个阈值的假阳性率(FP/ (FP + TN))

tpr = 每个阈值的真阳性率(TP/ (TP + FN))

import matplotlib.pyplot as plt

plt.plot(fpr_RF, tpr_RF, 'r-', label='RF')

plt.plot(fpr_LR, tpr_LR, 'b-', label='LR')

plt.plot([0, 1], [0, 1], 'k-', label='random')

plt.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='perfect')

plt.legend()

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive Rate')

plt.show()

from sklearn.metrics import roc_auc_score

auc_RF = roc_auc_score(df.actual_label.values,df.model_RF.values)

auc_LR = roc_auc_score(df.actual_label.values,df.model_LR.values)

print('AUC RF:%.3f'% auc_RF)

print('AUC LR:%.3f'% auc_LR)

import matplotlib.pyplot as plt

plt.plot(fpr_RF, tpr_RF, 'r-', label='RF AUC: %.3f' % auc_RF)

plt.plot(fpr_LR, tpr_LR, 'b-', label='LR AUC: %.3f' % auc_LR)

plt.plot([0, 1], [0, 1], 'k-', label='random')

plt.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='perfect')

plt.legend()

plt.xlabel('False Positive Rate')

plt.ylabel('True Positive Rate')

plt.show()

https://towardsdatascience.com/understanding-data-science-classification-metrics-in-scikit-learn-in-python-3bc336865019

• 发表于:
• 原文链接https://kuaibao.qq.com/s/20180817B00EPQ00?refer=cp_1026
• 腾讯「腾讯云开发者社区」是腾讯内容开放平台帐号（企鹅号）传播渠道之一，根据《腾讯内容开放平台服务协议》转载发布内容。
• 如有侵权，请联系 cloudcommunity@tencent.com 删除。

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2023-02-05

2018-04-03

2023-02-05

2023-02-05

2023-02-05

10元无门槛代金券