前几天,我answered my own question关于如何循环遍历数据,以获得2,3,4和5集合中我的变量的所有组合,计算量表得分,并评估一些心理测量学。
它可以工作,但对于我的实际用例来说,它非常慢,它包含了20个变量的全部616,645个组合,集合为2-10。
我没有太多并行运行的经验,但我认为可能的解决方案是使用foreach和doParallel包,如this SO answer中所描述的那样。不幸的是,我不太明白如何使这个想法适应我的用例。
下面是我的实际代码,其中有一个小得多的玩具示例,只需几秒钟即可运行:
library(gtools)
library(OptimalCutpoints)
# new packages to run loop in parallel
library(foreach)
library(doParallel)
registerDoParallel(detectCores())
# create fake data
df <- data.frame(class=sample(0:1, 50, replace=T),
v01=sample(0:3, 50, replace=T),
v02=sample(0:3, 50, replace=T),
v03=sample(0:3, 50, replace=T),
v04=sample(0:3, 50, replace=T),
v05=sample(0:3, 50, replace=T))
# combinations
dfoc <- as.data.frame(NULL)
ri <- 1
# I think the outer loop should somehow use
# foreach(i=2:(length(df)-1)) %:%
# and then the inner loop use
# foreach(r=1:nrow(p)) %dopar%
# but I'm not sure of the assignment in either case
# I want to build dfoc each iteration
for (i in 2:(length(df)-1)) {
p <- combinations(n = length(df)-1, r = i, v = names(df[2:(length(df))]))
for (r in 1:nrow(p)) {
keep <- c("class", p[r,])
v <- keep[-1]
df_ <- df[, keep]
df_$T <- rowSums(df_[,2:length(keep)])
oc <- summary(optimal.cutpoints(X = "T",
status = "class",
tag.healthy = 0,
methods = "SpEqualSe",
data = df_,
control = control.cutpoints(),
ci.fit = TRUE,
conf.level = 0.95,
trace = FALSE))
dfoc[ri,1] <- i # number vars in set
dfoc[ri,2] <- r # permutation number
dfoc[ri,3] <- paste(v, collapse=",") # var names in set
dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1] # cutoff
dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2] # sen
dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3] # spe
dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4] # ppv
dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5] # npv
dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2] # sen l95
dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3] # sen u95
dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2] # spe l95
dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3] # spe u95
dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2] # ppv l95
dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3] # ppv u95
dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2] # npv l95
dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3] # npv u95
dfoc[ri,17] <- oc$p.table$Global$AUC_CI # auc
ri <- ri+1
remove(df_)
remove(keep)
remove(v)
remove(oc)
}
}发布于 2016-03-10 18:35:31
这种方法将我的实际用例(>600 k组合)的运行时从2+天数减少到了2+小时。
# packages
library(gtools)
library(OptimalCutpoints)
library(foreach)
library(doParallel)
registerDoParallel(detectCores())
# create fake data
df <- data.frame(class=sample(0:1, 50, replace=T),
v01=sample(0:3, 50, replace=T),
v02=sample(0:3, 50, replace=T),
v03=sample(0:3, 50, replace=T),
v04=sample(0:3, 50, replace=T),
v05=sample(0:3, 50, replace=T))
# combinations
dfoc <- as.data.frame(NULL)
ri <- 1
# outer function
outer <- function(s, d) {
p <- combinations(n = length(d)-1, r = s, v = names(d[2:(length(d))]))
return(p)
}
# inner function
combo <- function(i, r, p, d) {
keep <- c("class", p[r,])
v <- keep[-1]
d_ <- d[, keep]
d_$T <- rowSums(d_[,2:length(keep)])
oc <- summary(optimal.cutpoints(X = "T",
status = "class",
tag.healthy = 0,
methods = "SpEqualSe",
data = d_,
control = control.cutpoints(),
ci.fit = TRUE,
conf.level = 0.95,
trace = FALSE))
dfoc[ri,1] <- i # number vars in set
dfoc[ri,2] <- r # permutation number
dfoc[ri,3] <- paste(v, collapse=",") # var names in set
dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1] # cutoff
dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2] # sen
dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3] # spe
dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4] # ppv
dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5] # npv
dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2] # sen l95
dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3] # sen u95
dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2] # spe l95
dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3] # spe u95
dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2] # ppv l95
dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3] # ppv u95
dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2] # npv l95
dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3] # npv u95
dfoc[ri,17] <- oc$p.table$Global$AUC_CI # auc
ri <- ri+1
remove(d_)
remove(keep)
remove(v)
remove(oc)
return(dfoc)
}
# parallel
system.time(
y <- foreach(s=2:5) %do% {
p <- outer(s, df)
x <- foreach(r=1:nrow(p), .combine=rbind) %dopar% combo(s, r, p, df)
}
)
# convert to data frame
finalDF <- do.call(rbind.data.frame, y)https://stackoverflow.com/questions/35902123
复制相似问题