首页
学习
活动
专区
圈层
工具
发布
首页
学习
活动
专区
圈层
工具
MCP广场
社区首页 >问答首页 >并行循环构建数据帧

并行循环构建数据帧
EN

Stack Overflow用户
提问于 2016-03-09 20:40:32
回答 2查看 672关注 0票数 1

前几天,我answered my own question关于如何循环遍历数据,以获得2,3,4和5集合中我的变量的所有组合,计算量表得分,并评估一些心理测量学。

它可以工作,但对于我的实际用例来说,它非常慢,它包含了20个变量的全部616,645个组合,集合为2-10。

我没有太多并行运行的经验,但我认为可能的解决方案是使用foreachdoParallel包,如this SO answer中所描述的那样。不幸的是,我不太明白如何使这个想法适应我的用例。

下面是我的实际代码,其中有一个小得多的玩具示例,只需几秒钟即可运行:

代码语言:javascript
运行
复制
library(gtools)
library(OptimalCutpoints)

# new packages to run loop in parallel
library(foreach)
library(doParallel)
registerDoParallel(detectCores())  

# create fake data
  df <- data.frame(class=sample(0:1, 50, replace=T),
                   v01=sample(0:3, 50, replace=T),
                   v02=sample(0:3, 50, replace=T),
                   v03=sample(0:3, 50, replace=T),
                   v04=sample(0:3, 50, replace=T),
                   v05=sample(0:3, 50, replace=T))

# combinations
  dfoc <- as.data.frame(NULL)
  ri <- 1

  # I think the outer loop should somehow use 
  #    foreach(i=2:(length(df)-1)) %:%
  # and then the inner loop use
  #    foreach(r=1:nrow(p)) %dopar%
  # but I'm not sure of the assignment in either case
  # I want to build dfoc each iteration

  for (i in 2:(length(df)-1)) {  
    p <- combinations(n = length(df)-1, r = i, v = names(df[2:(length(df))]))
    for (r in 1:nrow(p)) {
      keep <- c("class", p[r,])
      v <- keep[-1]
      df_ <- df[, keep]
      df_$T <- rowSums(df_[,2:length(keep)])
      oc <- summary(optimal.cutpoints(X = "T", 
                                      status = "class",
                                      tag.healthy = 0,
                                      methods = "SpEqualSe",
                                      data = df_,
                                      control = control.cutpoints(),
                                      ci.fit = TRUE,
                                      conf.level = 0.95, 
                                      trace = FALSE))
      dfoc[ri,1] <- i                                    # number vars in set
      dfoc[ri,2] <- r                                    # permutation number
      dfoc[ri,3] <- paste(v, collapse=",")               # var names in set
      dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1]     # cutoff
      dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2]     # sen
      dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3]     # spe
      dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4]     # ppv
      dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5]     # npv
      dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2]   # sen l95
      dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3]  # sen u95
      dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2]  # spe l95
      dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3]  # spe u95
      dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2]  # ppv l95
      dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3]  # ppv u95
      dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2]  # npv l95
      dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3]  # npv u95
      dfoc[ri,17] <- oc$p.table$Global$AUC_CI               # auc
      ri <- ri+1
      remove(df_)
      remove(keep)
      remove(v)
      remove(oc)
    }
  }
EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2016-03-11 11:20:45

一位同事向我展示了如何向量化和简化:

代码语言:javascript
运行
复制
## packages
library(gtools)
library(OptimalCutpoints)
library(foreach)
library(doParallel)
registerDoParallel(detectCores())

## create fake data
df <- data.frame(class=sample(0:1, 50, replace=T),
                 v01=sample(0:3, 50, replace=T),
                 v02=sample(0:3, 50, replace=T),
                 v03=sample(0:3, 50, replace=T),
                 v04=sample(0:3, 50, replace=T),
                 v05=sample(0:3, 50, replace=T))

## all combinations in one data frame
## 2:5 is the number of items
combos <- do.call(rbind, lapply(2:5, function(s) {
  data.frame(
    NItems = s,
    Vars = apply(combinations(
    n = length(df)-1,
    r = s,
    v = names(df[2:(length(df))])
  ), 1, paste, collapse = ","), stringsAsFactors=FALSE)
}))

## function
combo <- function(p, d) {
  keep <- c("class", unlist(strsplit(p[1, "Vars"], ",")))
  v <- keep[-1]
  d_ <- d[, keep]
  d_$T <- rowSums(d_[,2:length(keep)])
  oc <- summary(optimal.cutpoints(X = "T",
                                  status = "class",
                                  tag.healthy = 0,
                                  methods = "SpEqualSe",
                                  data = d_,
                                  control = control.cutpoints(),
                                  ci.fit = TRUE,
                                  conf.level = 0.95,
                                  trace = FALSE))

  out <- oc$p.table$Global$SpEqualSe[[1]]

  data.frame(
    cutoff = out[1],     # cutoff
    sen = out[2],     # sen
    spe = out[3],     # spe
    ppv = out[4],     # ppv
    npv = out[5],     # npv
    senl95 = out[2,2],   # sen l95
    senu95 = out[2,3],  # sen u95
    spel95 = out[3,2],  # spe l95
    speu95 = out[3,3],  # spe u95
    ppvl95 = out[4,2],  # ppv l95
    ppvu95 = out[4,3],  # ppv u95
    npvl95 = out[5,2],  # npv l95
    npvu95 = out[5,3],  # npv u95
    auc = oc$p.table$Global$AUC_CI, # auc
    stringsAsFactors = FALSE)
}


## not parallel
system.time(
  y <- foreach(r=1:nrow(combos), .combine=rbind) %do% combo(combos[r, , drop=FALSE], df)
)
finalDF <- cbind(combos, y)

## parallel
system.time(
  y2 <- foreach(r=1:nrow(combos), .combine=rbind) %dopar% combo(combos[r, , drop=FALSE], df)
)
finalDF2 <- cbind(combos, y2)

## test equal
all.equal(y, y2)
票数 0
EN

Stack Overflow用户

发布于 2016-03-10 18:35:31

这种方法将我的实际用例(>600 k组合)的运行时从2+天数减少到了2+小时。

代码语言:javascript
运行
复制
# packages  
  library(gtools)
  library(OptimalCutpoints)
  library(foreach)
  library(doParallel)
  registerDoParallel(detectCores())  

# create fake data
  df <- data.frame(class=sample(0:1, 50, replace=T),
                   v01=sample(0:3, 50, replace=T),
                   v02=sample(0:3, 50, replace=T),
                   v03=sample(0:3, 50, replace=T),
                   v04=sample(0:3, 50, replace=T),
                   v05=sample(0:3, 50, replace=T))

# combinations
  dfoc <- as.data.frame(NULL)
  ri <- 1

# outer function
  outer <- function(s, d) {
    p <- combinations(n = length(d)-1, r = s, v = names(d[2:(length(d))]))
    return(p)
  } 

# inner function
  combo <- function(i, r, p, d) {
    keep <- c("class", p[r,])
    v <- keep[-1]
    d_ <- d[, keep]
    d_$T <- rowSums(d_[,2:length(keep)])
    oc <- summary(optimal.cutpoints(X = "T", 
                                    status = "class",
                                    tag.healthy = 0,
                                    methods = "SpEqualSe",
                                    data = d_,
                                    control = control.cutpoints(),
                                    ci.fit = TRUE,
                                    conf.level = 0.95, 
                                    trace = FALSE))
    dfoc[ri,1] <- i                                    # number vars in set
    dfoc[ri,2] <- r                                    # permutation number
    dfoc[ri,3] <- paste(v, collapse=",")               # var names in set
    dfoc[ri,4] <- oc$p.table$Global$SpEqualSe[[1]][1]     # cutoff
    dfoc[ri,5] <- oc$p.table$Global$SpEqualSe[[1]][2]     # sen
    dfoc[ri,6] <- oc$p.table$Global$SpEqualSe[[1]][3]     # spe
    dfoc[ri,7] <- oc$p.table$Global$SpEqualSe[[1]][4]     # ppv
    dfoc[ri,8] <- oc$p.table$Global$SpEqualSe[[1]][5]     # npv
    dfoc[ri,9] <- oc$p.table$Global$SpEqualSe[[1]][2,2]   # sen l95
    dfoc[ri,10] <- oc$p.table$Global$SpEqualSe[[1]][2,3]  # sen u95
    dfoc[ri,11] <- oc$p.table$Global$SpEqualSe[[1]][3,2]  # spe l95
    dfoc[ri,12] <- oc$p.table$Global$SpEqualSe[[1]][3,3]  # spe u95
    dfoc[ri,13] <- oc$p.table$Global$SpEqualSe[[1]][4,2]  # ppv l95
    dfoc[ri,14] <- oc$p.table$Global$SpEqualSe[[1]][4,3]  # ppv u95
    dfoc[ri,15] <- oc$p.table$Global$SpEqualSe[[1]][5,2]  # npv l95
    dfoc[ri,16] <- oc$p.table$Global$SpEqualSe[[1]][5,3]  # npv u95
    dfoc[ri,17] <- oc$p.table$Global$AUC_CI               # auc
    ri <- ri+1
    remove(d_)
    remove(keep)
    remove(v)
    remove(oc)
    return(dfoc)
  }

# parallel
  system.time(
  y <- foreach(s=2:5) %do% {
    p <- outer(s, df)
    x <- foreach(r=1:nrow(p), .combine=rbind) %dopar% combo(s, r, p, df)
  }
  )

# convert to data frame
  finalDF <- do.call(rbind.data.frame, y)
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/35902123

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档