文章/答案/技术大牛

发布

社区首页 >问答首页 >从数据帧中随机迭代抽样

问从数据帧中随机迭代抽样
EN

Stack Overflow用户

提问于 2022-05-24 12:05:41

回答 2查看 55关注 0票数 2

我有以下数据，它模拟一个面板数据集(即，每个单位(unit)有多个观测值)。

# Example panel data; the literal looks like pasted dput() output (TODO confirm).
# It builds a grouped tibble (class "grouped_df") with 60 rows and four columns:
#   x, y  - numeric values
#   unit  - unit identifier 1..20, three consecutive rows per unit
#   wave  - wave number 1..3 within each unit
# The `groups` attribute records the grouping by `unit` (rows 1:3, 4:6, ...).
dat <- structure(list(x = c(-0.32, -0.26, 0.05, -0.37, -0.37, -0.08, 
-0.01, 0.05, 0.19, -0.48, 0.37, 0.05, -0.58, -0.18, -0.04, -0.28, 
-0.44, -0.48, 1.05, 0.62, 0.85, 0.42, 0.7, 0.64, -0.19, -0.11, 
-0.65, -0.01, 0.39, -0.02, -0.23, -0.6, -0.1, 0.39, 0.33, 0.39, 
-0.09, -0.16, 0.26, -0.62, -0.44, -0.6, -0.17, -0.27, -0.12, 
-0.53, -0.38, -0.33, -0.17, -0.11, -0.25, -0.92, -0.6, -0.81, 
0.75, 0.52, 0.57, 1.32, 1.21, 1.21), y = c(-0.42, -2.01, -1.19, 
0.7, 1.28, 1.37, 0.52, 2.04, 2.34, -1.45, 2.84, 0.1, -3.12, 0.22, 
-0.06, -1.65, -0.9, -1.5, -0.98, -0.69, 0.15, 1.7, 1.47, 0.15, 
0.26, 0.84, 0.35, 0.86, -1.23, -0.74, -1.79, -0.56, -2.15, 2.11, 
2.34, 0.57, 0.38, 0.57, 0.97, 0.32, -1.71, -0.8, 1.45, -0.12, 
1.93, 2.76, 0.08, -2.8, -0.06, 1.09, -0.4, 0.41, 0.02, -1.61, 
1.75, 1.6, -0.19, 0.13, -0.89, -1.1), unit = c(1, 1, 1, 2, 2, 
2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 
9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 
20, 20), wave = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 
1, 2, 3)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(unit = c(1, 
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20), .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 
    19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 43:45, 
    46:48, 49:51, 52:54, 55:57, 58:60), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))

我现在想在这个数据集中模拟自然损耗(attrition)：一些单位以一定的概率在第二波(wave)中退出，剩余单位中的一些在第三波中退出，依此类推直到第n波；每一步的概率保持不变。注意，这种方法对于波次的数量应该是灵活的。

这是我想出来的。虽然可行，但我觉得很慢。然而，由于波次的数量不固定，我不知道如何避免循环。

# Simulate wave-by-wave attrition on the panel `dat` (overwritten in place).
# Requires dplyr (`%>%`, `filter`) to be attached.

# number of units and number of observations (waves) per unit:
n <- 20
n_perunit <- 3

# share of the currently remaining units that is KEPT at each wave
# (the per-wave dropout share is therefore 1 - attrition):
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# Loop through waves beginning with 2. seq_len() yields an empty sequence for
# a single-wave panel, whereas 2:n_perunit would count down (2, 1) and run.
system.time(for (i in seq_len(n_perunit - 1) + 1) {
  n_remaining <- round(length(remaining) * attrition)
  # Index via sample.int() instead of sample(remaining, ...): once a single
  # numeric unit id x is left, sample(x, ...) would draw from 1:x and could
  # bring back a unit that has already dropped out.
  remaining <- remaining[sample.int(length(remaining), n_remaining)]
  # Keep every row before wave i; from wave i onward keep only the rows of
  # units that survived this draw.
  dat <- dat %>%
    filter(wave < i | unit %in% remaining)
})

效率：

   user  system elapsed 
  0.016   0.000   0.016

有什么办法可以改进吗？

编辑：

基于@jpsmith的回答(据我所见，它对于没有人退出的组不起作用，因为min(which(dropout == "Yes"))会对这些组返回Inf)，我得出了以下结论：

# Vectorised attrition: flag dropouts for all waves > 1 at once, then keep only
# the rows that come before each unit's first dropout.
# Requires dplyr; relies on rows being ordered by wave within each unit.
set.seed(1234)
system.time(if (!is.null(attrition)) {
  # assign a 1 or 0 indicating dropout (wave 1 is never a dropout);
  # 0 = stays, drawn with probability `attrition`
  dat <- dat %>%
    mutate(dropout = ifelse(
      wave > 1, sample(
        0:1, n(), prob = c(attrition, 1-attrition), replace = TRUE), 0))
  # Within each unit keep rows while no dropout has occurred yet. cumsum() is 0
  # up to (but excluding) the first dropout row, so units without any dropout
  # keep all rows. The previous slice(1:min) passed a whole column to `:`,
  # which uses only the first element and warns for every group; the helper
  # column `min` is no longer created.
  dat <- dat %>%
    group_by(unit) %>%
    filter(cumsum(dropout) == 0)
})

效率：

user  system elapsed
0.01    0.00    0.01

然而，slice 会产生一些恼人的警告——知道为什么吗？

回答 2

Stack Overflow用户

发布于 2022-05-24 12:36:38

也许我理解有误，但看起来在第一波之后，损耗本质上与波次无关：随后的每一波都有相同的退出概率——因此，如果某个单位坚持到了第三波，这个概率并不以之前的任何事情为条件(类似于前两次都是正面时，第三次掷出正面的概率)。如果我的理解正确，你可以对所有 wave > 1 的观测一次性指定是否退出，然后删除每个单位自第一次“退出”起的所有观测。这样一切都被向量化，速度也会快得多。

代码

# Vectorised attrition: draw a dropout flag for every row after wave 1, then
# keep only the rows before each unit's first "Yes". Requires dplyr.
set.seed(123)
# NOTE(review): here `attrition` is the per-wave probability of dropping out,
# whereas the question's loop multiplies the remaining count by `attrition`
# (i.e. treats it as the share kept) - confirm which meaning is intended.
attrition <- 2/3

# Assign "dropout" position. size/replace are required: without them sample()
# returns one permutation of c("Yes", "No") that ifelse() recycles, giving a
# fixed alternating pattern instead of independent draws.
dat$dropout <- ifelse(
  dat$wave > 1,
  sample(c("Yes", "No"), size = nrow(dat), replace = TRUE,
         prob = c(attrition, 1 - attrition)),
  "No")

# Drop all observations from the first recorded dropout onward. The originally
# posted slice(seq_len(min(which(dropout == "Yes") - 1))) fails for units with
# no dropout (min() of an empty vector is Inf); the cumulative count is 0 for
# every row before the first "Yes" and keeps such units whole.
# Results will differ from output printed with the originally posted code.
dat %>% group_by(unit) %>% filter(cumsum(dropout == "Yes") == 0)

输出：

# Groups:   unit [20]
#       x     y  unit  wave dropout
# <dbl> <dbl> <dbl> <dbl> <chr>  
# 1 -0.32 -0.42     1     1 No     
# 2 -0.26 -2.01     1     2 No     
# 3 -0.37  0.7      2     1 No     
# 4 -0.01  0.52     3     1 No     
# 5  0.05  2.04     3     2 No     
# 6 -0.48 -1.45     4     1 No     
# 7 -0.58 -3.12     5     1 No     
# 8 -0.18  0.22     5     2 No     
# 9 -0.28 -1.65     6     1 No     
# 10  1.05 -0.98     7     1 No     
# # … with 20 more rows

由于您没有设置种子或提供所需的输出数据集，我无法比较，但是如果您提供的话，我很乐意测试这一点。

   user  system elapsed 
  0.008   0.001   0.009

票数 1

Stack Overflow用户

发布于 2022-05-24 13:52:56

由于每个波后剩余的单位数是确定性的，我们可以一次采样。

library(dplyr)
# Fix the RNG seed so the random draws below are reproducible.
set.seed(5)
# Number of units and number of waves (observations) per unit.
n <- 20
n_perunit <- 3

# Attrition rate. The functions below multiply the number of remaining units
# by this value to get how many stay for the next wave, so it acts as the
# share of units kept per wave.
attrition <- 2/3

# Start with a vector of all units
remaining <- 1:n

# fOriginal loops through waves beginning with 2

# Baseline implementation: the wave-by-wave loop from the question, wrapped in
# a function so it can be benchmarked against fNew(). Requires dplyr.
#
# Args:
#   df:        data frame with columns `unit` and `wave`.
#   remaining: vector of the unit ids still in the panel before wave 2.
# Reads the globals `n_perunit` (number of waves) and `attrition` (share of
# the remaining units kept at each wave).
# Returns: `df` without the rows of dropped units from their dropout wave on.
fOriginal <- function(df, remaining) {
  # seq_len() gives an empty loop for a single-wave panel; 2:n_perunit would
  # count down (2, 1) in that case.
  for (i in seq_len(n_perunit - 1) + 1) {
    n_remaining <- round(length(remaining)*attrition)
    # Index with sample.int() rather than sample(remaining, ...): when only one
    # numeric unit id x is left, sample(x, ...) would draw from 1:x instead.
    remaining <- remaining[sample.int(length(remaining), n_remaining)]
    # The mutate/filter/mutate pipeline is kept as in the question because
    # this function serves as the benchmark baseline.
    df <- df %>% 
      mutate(drop = ifelse(
        wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
      filter(drop == FALSE) %>%
      mutate(drop = NULL)
  }
  df
}

# Loop-free attrition: because the number of units left after each wave is
# deterministic, draw every unit's last observed wave in a single sample.
#
# Args:
#   df:         data frame with columns `unit` and `wave`.
#   n_units:    number of units in the panel (defaults to the global `n`).
#   n_waves:    number of waves per unit (defaults to the global `n_perunit`).
#   keep_share: share of the remaining units kept at each wave (defaults to
#               the global `attrition`).
# Returns: the rows of `df` whose wave is not later than the unit's last wave.
fNew <- function(df, n_units = n, n_waves = n_perunit, keep_share = attrition) {
  # nleft[w] = units present in wave w; the trailing 0 makes -diff() also
  # count the units that complete every wave.
  nleft <- numeric(n_waves + 1)
  nleft[1] <- n_units
  # seq_len() yields an empty loop for a single wave; 2:n_waves would count
  # down (2, 1) and fail on nleft[0].
  for (i in seq_len(n_waves - 1) + 1) nleft[i] <- round(nleft[i - 1] * keep_share)

  # One "last wave" value per unit, shuffled. Shuffle by index so a length-one
  # vector is not expanded to 1:x the way sample(x) would.
  last_wave <- rep.int(seq_len(n_waves), -diff(nleft))
  last_wave <- last_wave[sample.int(length(last_wave))]

  # Map unit ids to positions 1..k so ids need not be the integers 1..n_units;
  # for ids 1..n_units the sorted mapping is the identity.
  unit_ids <- sort(unique(df$unit))
  if (length(unit_ids) > n_units) {
    stop("df contains more distinct units than n_units")
  }
  df[df$wave <= last_wave[match(df$unit, unit_ids)], ]
}

# Run both implementations on the same data. They draw from the RNG one after
# the other, so they do not consume the same random numbers.
dfOrig <- fOriginal(dat, remaining)
dfNew <- fNew(dat)
# the resulting data.frames are not identical due to different random sampling
# methods, but they both have the same number of rows and same wave counts
identical(tabulate(dfOrig$wave), tabulate(dfNew$wave))
#> [1] TRUE

# Time both implementations (microbenchmark package, called via `::`).
microbenchmark::microbenchmark(fOriginal = fOriginal(dat, remaining),
                               fNew = fNew(dat))
#> Unit: milliseconds
#>       expr     min       lq     mean   median      uq     max neval
#>  fOriginal 12.0433 13.24815 14.52889 14.02410 15.0525 23.5338   100
#>       fNew  1.2956  1.41915  1.73176  1.56935  1.7398  5.0738   100

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/72362592

复制

相似问题

问从数据帧中随机迭代抽样
EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问从数据帧中随机迭代抽样EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问从数据帧中随机迭代抽样
EN