第7章模型评估笔记

用户1075469

发布于 2022-03-04 11:22:49

88900

代码可运行

文章被收录于专栏：科技记者科技记者

运行总次数：0

代码可运行

7.2 k折交叉验证模型性能

这个方法可以解决过度适应的问题，

library(modeldata)
library(e1071)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- cut(1:nrow(churnTrain), breaks = 10, labels = F)
accuracies <- c()
for (i in 1:10) {
  fit <- svm(churn~.,churnTrain[ind!=i,])
  predictions <- predict(fit, churnTrain[ind==i,!names(churnTrain) %in% c("churn")])
  correct_count <- sum(as.data.frame(predictions) == churnTrain[ind==i,c("churn")])
  accuracies <- append(correct_count/nrow(churnTrain[ind==i,]),accuracies)
}
accuracies
[1] 0.920 0.874 0.906 0.902 0.874 0.890 0.884 0.904 0.922 0.902

7.3 e1071包进行交叉验证

# e1071 交叉验证
library(e1071)
churnTrain <- churn[,!names(churn) %in% c('state',
                                          'area_code', "account_length")]
set.seed(2)
ind <- sample(2,nrow(churnTrain),replace = TRUE,
              prob = c(0.7,0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
tuned <- tune.svm(churn~., data = trainset, gamma = 10^-2,
                  cost = 10^2, tunedcontrol = tune.control(cross = 10))
summary(tuned)
# Error estimation of ‘svm’ using 10-fold cross validation: 0.07473914
tuned$performances
  gamma cost      error dispersion
1  0.01  100 0.07473914 0.01605983
svmfit <- tuned$best.model
table(trainset[,c("churn")], predict(svmfit))
       yes   no
  yes  400   93
  no    14 2972

tune函数采用风格式搜索方法来完成参数优化，查看其他优化函数?tune

7.4 caret包进行交叉验证

# caret
library(caret)
# 重复3次10折交叉
control <- trainControl(method = "repeatedcv",number = 10,
                        repeats = 3)
model <- train(churn~.,data = trainset, method = "rpart",
               preProcess="scale", trControl=control)
model
CART 

3479 samples
  19 predictor
   2 classes: 'yes', 'no' 

Pre-processing: scaled (69) 
Resampling: Cross-Validated (10 fold, repeated 3 times) 
Summary of sample sizes: 3131, 3132, 3130, 3130, 3131, 3132, ... 
Resampling results across tuning parameters:

  cp          Accuracy   Kappa    
  0.04056795  0.9301563  0.6720876
  0.05578093  0.9147378  0.5774005
  0.07505071  0.8731365  0.2416613

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.04056795.

trainControl中可以设置重采样的参数，指定boot\boot632\cv\repeatdcv\LOOCV\LGOCV\non\oob\adaptive_cv\adaptive_boot\adaptive_LGOCV等。

7.5 caret包对变量重要程度排序

得到监督学习模型后，可以改变输入值，比较给定模型输出效果的变化敏感程度来评估不同特征对模型的重要性。

# 重要性排序
importance <- varImp(model, scale = FALSE)
importance
rpart variable importance

  only 20 most important variables shown (out of 69)

                              Overall
international_planyes         196.919
total_day_minutes             172.870
total_day_charge              161.582
number_customer_service_calls 155.274
total_intl_minutes            133.221
total_intl_charge             124.817
total_intl_calls               48.700
voice_mail_planyes             40.912
total_eve_charge               31.116
total_eve_minutes              31.116
...
plot(importance)

扩展rpart等一些分类算法包中从训练模型中产生的对象包含了变量重要性，可以查看和输出。

library(rpart)
model.rp <- rpart(churn~.,data = trainset)
model.rp$variable.importance
             total_day_charge             total_day_minutes                         state 
                   161.951011                    161.951011                     85.145010 
           total_intl_minutes number_customer_service_calls             total_intl_charge 
                    80.504234                     76.323349                     75.141041 
...

7.6 rimier包进行变量重要程度排序

这个费了好大劲，好像只有数值变量才行。

# rminier
install.packages("rminer")
library(rminer)
model <- fit(Species~., iris,model = "svm")
variable.importance <- Importance(model,iris,method = "sensv")
L=list(runs=1,sen=t(variable.importance$imp),sresponses=variable.importance$sresponses)
mgraph(L,graph = "IMP",leg = names(iris),col = "gray", Grid = 4)

7.7 caret包找到高度关联的特征

去掉非数值型属性，相关性计算获得一个关联度矩阵，将阈值设置为0.75，挑选高度关联的属性。

# findCorrelation
new_train <- trainset[,!names(trainset) %in% c('churn',
                                     'international_plan', "voice_mail_plan")]
cor_mat <- cor(new_train)
highlyCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
names(new_train)[highlyCorrelated]
[1] "total_night_minutes" "total_intl_charge"   "total_day_charge"    "total_eve_charge"

还可以使用subselect包中的leaps, genetic和anneal函数达到同样效果。

7.8 利用caret包选择特征

特征选择可以挑选出预测误差最低的属性子集，有助于我们判断究竟应该使用哪些特征才能建立一个精确的模型，递归特征排除函数rfe，自动选出符合要求的特征。

# caret选择特征
library(modeldata)
library(caret)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- sample(2,nrow(churnTrain),replace = TRUE,
              prob = c(0.7,0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
# 特征转换，应该就是yes no变1，0,因子变二元属性
intl_plan <- model.matrix(~ trainset.international_plan - 1,
                          data = data.frame(trainset$international_plan))
colnames(intl_plan) <- c("trainset.international_planno"="intl_no",
                         "trainset.international_planyes"="intl_yes")

voice_plan <- model.matrix(~ trainset.voice_mail_plan - 1,
                           data = data.frame(trainset$voice_mail_plan))
colnames(voice_plan) <- c("trainset.voice_mail_planno"="voice_no",
                          "trainset.voice_mail_planyes"="voice_yes")
trainset$international_plan <- NULL
trainset$voice_mail_plan <- NULL
trainset <- cbind(intl_plan, voice_plan,trainset)
# testset同样操作
intl_plan <- model.matrix(~ testset.international_plan - 1,
                          data = data.frame(testset$international_plan))
colnames(intl_plan) <- c("testset.international_planno"="intl_no",
                         "testset.international_planyes"="intl_yes")

voice_plan <- model.matrix(~ testset.voice_mail_plan - 1,
                           data = data.frame(testset$voice_mail_plan))
colnames(voice_plan) <- c("testset.voice_mail_planno"="voice_no",
                          "testset.voice_mail_planyes"="voice_yes")
testset$international_plan <- NULL
testset$voice_mail_plan <- NULL
testset <- cbind(intl_plan, voice_plan,testset)
# 线性判别分析创建一个特征筛选算法，交叉验证方法cv
ldaControl <- rfeControl(functions = ldaFuncs, method = "cv")
# 反向特征筛选
ldaProfile <- rfe(trainset[,names(trainset)!='churn'][,-c(5,6,7)],
                  trainset[,'churn'],sizes = c(1:18), rfeControl = ldaControl)
ldaProfile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold) 

Resampling performance over subset size:

 Variables Accuracy   Kappa AccuracySD KappaSD Selected
         1   0.8554 0.02245   0.009841 0.06008         
         2   0.8554 0.02245   0.009841 0.06008         
         3   0.8494 0.23064   0.013363 0.10679         
...
plot(ldaProfile, type = c("o","g"))
# 最佳变量子集
ldaProfile$optVariables
# 合适模型
ldaProfile$fit
Call:
lda(x, y)

Prior probabilities of groups:
      yes        no 
0.1417074 0.8582926 

Group means:
postResample(predict(ldaProfile, testset[,names(testset)!="churn"]), testset[,c("churn")])
 Accuracy     Kappa 
0.8520710 0.2523709

扩展

7.9 评测回归模型性能

均方根误差法RMSE，相对平方差RSE,可决系数R-Square。

# 回归模型性能评估
library(car)
data("Quartet")
plot(Quartet$x,Quartet$y3)
lmfit<- lm(Quartet$y3~Quartet$x)
abline(lmfit, col="red")
predicted <- predict(lmfit,newdata = Quartet[c("x")])
actual <- Quartet$y3
# 这里用的caret包的这个函数，这个包是个宝呀，啥都有
rmse <- RMSE(predicted, actual)
mu <- mean(actual)
rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
Rsquare <- 1-rse
c(rmse,rse,Rsquare)
[1] 1.118286 0.333676 0.666324
# MASS包rlm重新计算
library(MASS)
plot(Quartet$x,Quartet$y3)
rlmfit <- rlm(Quartet$y3~Quartet$x)
abline(rlmfit,col='red')
predicted <- predict(rlmfit, newdata = Quartet['x'])
rmse <- (mean((predicted-actual)^2))^0.5
rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
rsqure <- 1-rse
c(rmse,rse,Rsquare)
[1] 1.2790448 0.4365067 0.6663240

扩展

library(e1071)
tune(lm,y3~x,data = Quartet)
Error estimation of ‘lm’ using 10-fold cross validation: 2.351897

caret的train函数交叉验证，DAAG包的cv.lm可以达到同样效果

7.10 利用混淆矩阵评测模型的预测能力

模型的精确度、召回率、特异性以及准确率等性能指标

# 混淆矩阵
svm.model <- train(churn~., data=trainset,method = "svmRadial")
svm.pred <- predict(svm.model, testset[,names(testset)!="churn"])
table(svm.pred, testset[,"churn",drop=TRUE])
svm.pred  yes   no
     yes   12    1
     no   198 1290
confusionMatrix(svm.pred,testset[,"churn",drop=TRUE])
Confusion Matrix and Statistics

          Reference
Prediction  yes   no
       yes   12    1
       no   198 1290
                                          
               Accuracy : 0.8674          
                 95% CI : (0.8492, 0.8842)
    No Information Rate : 0.8601          
    P-Value [Acc > NIR] : 0.2183          
                                          
                  Kappa : 0.0928

7.11 ROCR评测模型的预测能力

受试者工作曲线ROC是一种常见的二元分类系统性能展示图形，曲线上分别标注了不同切点的真阳和假阳率。通常会基于曲线下面积AUC来衡量模型的分类性能。

install.packages("ROCR")
library(ROCR)
svmfit <- svm(churn~.,data = trainset, prob = TRUE)
pred <- predict(svmfit, testset[,names(testset)!="churn"],probability = TRUE)
# 标记yes的概率
pred.prob <- attr(pred,"probabilities")
pred.to.roc <- pred.prob[, 2]
# 预测
pred.rocr <- prediction(pred.to.roc, testset$churn)
# 性能评估
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.rocr, colorize = T, main = paste("AUC:", (perf.rocr@y.values)))

7.12 利用caret包比较ROC曲线

install.packages("pROC")
library(pROC)
# 重复3次10折交叉
control <- trainControl(method = "repeatedcv", number = 10,
                        repeats = 3,classProbs = TRUE,
                        summaryFunction = twoClassSummary)
# glm分类
glm.model <- train(churn ~., data = trainset,method = "glm",
                   metric = "ROC", trControl = control)
# svm分类
svm.model <- train(churn ~., data = trainset,method = "svmRadial",
                   metric = "ROC", trControl = control)
rpart.model <- train(churn ~., data = trainset,method = "rpart",
                     metric = "ROC", trControl = control)
# 分别预测
glm.probs <- predict(glm.model, testset[,names(testset) != "churn"], type = "prob")
svm.probs <- predict(svm.model, testset[,names(testset) != "churn"], type = "prob")
rpart.probs <- predict(rpart.model, testset[,names(testset) != "churn"], type = "prob")
# 生成ROC曲线，在一个图中
glm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
               predictor = glm.probs$yes,
               levels = levels(testset[,"churn",drop=TRUE]))
plot(glm.ROC, type = "S", col = 'red')

svm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
               predictor = svm.probs$yes,
               levels = levels(testset[,"churn",drop=TRUE]))
plot(svm.ROC, add = TRUE, col = 'green')
rpart.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
               predictor = rpart.probs$yes,
               levels = levels(testset[,"churn",drop=TRUE]))
plot(rpart.ROC, add = TRUE, col = 'blue')

在这里困扰了好久，一直报错，Error in colnames(data) : argument "data" is missing, with no default，最后发现由于tibble默认不降维引起的，加drop=TRUE解决。

7.13 caret包比较模型性能差异

# 模型重采样
cv.values <- resamples(list(glm=glm.model, svm=svm.model, rpart = rpart.model))
summary(cv.values)
Call:
summary.resamples(object = cv.values)

Models: glm, svm, rpart 
Number of resamples: 30 

ROC 
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.7483007 0.7977494 0.8187351 0.8200275 0.8357939 0.8953608    0
svm   0.8356721 0.8594691 0.8722842 0.8778421 0.9018812 0.9184929    0
rpart 0.6446078 0.7146965 0.7666979 0.7773052 0.8459407 0.9173395    0

Sens 
           Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
glm   0.1600000 0.2088235 0.2672549 0.2667320 0.3184314 0.40    0
svm   0.2600000 0.3879412 0.4454902 0.4460654 0.5049020 0.66    0
rpart 0.2352941 0.3800000 0.4411765 0.4622876 0.5637255 0.76    0

Spec 
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.9509804 0.9639639 0.9673203 0.9666242 0.9705882 0.9771242    0
svm   0.9346405 0.9542484 0.9672131 0.9641094 0.9729749 0.9836601    0
rpart 0.9705882 0.9836601 0.9901639 0.9885492 0.9934641 1.0000000    0
dotplot(cv.values, metric = "ROC")
bwplot(cv.values, layout=c(3,1))

扩展还可以使用densityplot.splom和xyplot函数可视化

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2021-12-26，如有侵权请联系 cloudcommunity@tencent.com 删除

编程算法

本文分享自科技记者微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

编程算法

登录后参与评论

0 条评论

热度