我有以下数据,它模拟一个面板数据集(即,每个unit
多个unit
)。
dat <- structure(list(x = c(-0.32, -0.26, 0.05, -0.37, -0.37, -0.08,
-0.01, 0.05, 0.19, -0.48, 0.37, 0.05, -0.58, -0.18, -0.04, -0.28,
-0.44, -0.48, 1.05, 0.62, 0.85, 0.42, 0.7, 0.64, -0.19, -0.11,
-0.65, -0.01, 0.39, -0.02, -0.23, -0.6, -0.1, 0.39, 0.33, 0.39,
-0.09, -0.16, 0.26, -0.62, -0.44, -0.6, -0.17, -0.27, -0.12,
-0.53, -0.38, -0.33, -0.17, -0.11, -0.25, -0.92, -0.6, -0.81,
0.75, 0.52, 0.57, 1.32, 1.21, 1.21), y = c(-0.42, -2.01, -1.19,
0.7, 1.28, 1.37, 0.52, 2.04, 2.34, -1.45, 2.84, 0.1, -3.12, 0.22,
-0.06, -1.65, -0.9, -1.5, -0.98, -0.69, 0.15, 1.7, 1.47, 0.15,
0.26, 0.84, 0.35, 0.86, -1.23, -0.74, -1.79, -0.56, -2.15, 2.11,
2.34, 0.57, 0.38, 0.57, 0.97, 0.32, -1.71, -0.8, 1.45, -0.12,
1.93, 2.76, 0.08, -2.8, -0.06, 1.09, -0.4, 0.41, 0.02, -1.61,
1.75, 1.6, -0.19, 0.13, -0.89, -1.1), unit = c(1, 1, 1, 2, 2,
2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9,
9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14,
15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20,
20, 20), wave = c(1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3,
1, 2, 3)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -60L), groups = structure(list(unit = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20), .rows = structure(list(1:3, 4:6, 7:9, 10:12, 13:15, 16:18,
19:21, 22:24, 25:27, 28:30, 31:33, 34:36, 37:39, 40:42, 43:45,
46:48, 49:51, 52:54, 55:57, 58:60), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -20L), .drop = TRUE))
我现在想在这个数据集中模拟自然损耗:一些单位在第二波中有一定的概率退出,有些单位在第三波中退出,等到第n波时,概率在每一步都保持不变。注意,这种方法对于波浪的数量应该是灵活的。
这是我想出来的。虽然很管用,但我觉得很慢。然而,由于波浪的数量不同,我不知道如何避免循环。
# number of units and number of observations per unit:
n = 20
n_perunit = 3
# define attrition probability:
attrition = 2/3
# Start with a vector of all units
remaining <- 1:n
# loop through waves beginning with 2
system.time(for (i in 2:n_perunit) {
n_remaining <- round(length(remaining)*attrition)
remaining <- sample(remaining, n_remaining)
dat <- dat %>%
mutate(drop = ifelse(
wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
filter(drop == FALSE) %>%
mutate(drop = NULL)
})
效率:
user system elapsed
0.016 0.000 0.016
有什么办法可以改进吗?
编辑:
基于@jpsmith的回答(据我所见,对于没有人退出的组不起作用,因为min(which(dropout == "yes")
将为这些组返回一个Inf
值),我得出了以下结论:
set.seed(1234)
system.time(if (!is.null(attrition)) {
# assign a 1 or 0 indicating dropout
dat <- dat %>%
mutate(dropout = ifelse(
wave > 1, sample(
0:1, n(), prob = c(attrition, 1-attrition), replace = TRUE), 0))
# first get the first (minimum) dropout in each unit...
dat <- dat %>%
group_by(unit) %>%
mutate(min = ifelse(
length(which(dropout == 1) > 0), min(which(dropout == 1)), n_perunit)) %>%
# ... then slice out rows up to that row
slice(1:min) %>%
# as this also includes the first dropout rows, drop that one
filter(dropout == 0)
})
效率:
user system elapsed
0.01 0.00 0.01
然而,切片产生的一些恼人的警告--知道为什么吗?
发布于 2022-05-24 12:36:38
也许我错了,但实际上,在第一波之后,自然损耗是第一波之后的:随后的每一波都有辍学的概率--所以如果你到达第三波,那么这个概率就不取决于任何因素(类似于如果前两个是头的话,那么第三个头被翻转的概率)。如果我正确地阅读了这篇文章,你可以在第一次“辍学”之后,在波涛>1的同时指定辍学,然后删除所有的观察结果。这将把所有的东西矢量化,而且速度更快。
代码
set.seed(123) ), row.names = c(NA, -20L), .drop = TRUE))
attrition <- 2/3
# Assign "dropout" position
dat$dropout <- ifelse(dat$wave > 1, sample(c("Yes","No"), prob = c(attrition, 1-attrition)), "No")
# Drop all observations after first dropout recorded
dat %>% group_by(unit) %>% slice(seq_len(min(which(dropout == "Yes") - 1)))
输出:
# Groups: unit [20]
# x y unit wave dropout
# <dbl> <dbl> <dbl> <dbl> <chr>
# 1 -0.32 -0.42 1 1 No
# 2 -0.26 -2.01 1 2 No
# 3 -0.37 0.7 2 1 No
# 4 -0.01 0.52 3 1 No
# 5 0.05 2.04 3 2 No
# 6 -0.48 -1.45 4 1 No
# 7 -0.58 -3.12 5 1 No
# 8 -0.18 0.22 5 2 No
# 9 -0.28 -1.65 6 1 No
# 10 1.05 -0.98 7 1 No
# # … with 20 more rows
由于您没有设置种子或提供所需的输出数据集,我无法比较,但是如果您提供的话,我很乐意测试这一点。
user system elapsed
0.008 0.001 0.009
发布于 2022-05-24 13:52:56
由于每个波后剩余的单位数是确定性的,我们可以一次采样。
library(dplyr)
set.seed(5)
n <- 20
n_perunit <- 3
# define attrition probability:
attrition <- 2/3
# Start with a vector of all units
remaining <- 1:n
# loop through waves beginning with 2
fOriginal <- function(df, remaining) {
for (i in 2:n_perunit) {
n_remaining <- round(length(remaining)*attrition)
remaining <- sample(remaining, n_remaining)
df <- df %>%
mutate(drop = ifelse(
wave >= i & !(unit %in% remaining), TRUE, FALSE)) %>%
filter(drop == FALSE) %>%
mutate(drop = NULL)
}
df
}
fNew <- function(df) {
nleft <- numeric(n_perunit + 1)
nleft[1] <- n
for (i in 2:n_perunit) nleft[i] <- round(nleft[i - 1]*attrition)
df[df$wave <= sample(rep.int(1:n_perunit, -diff(nleft)))[df$unit],]
}
dfOrig <- fOriginal(dat, remaining)
dfNew <- fNew(dat)
# the resulting data.frames are not identical due to different random sampling
# methods, but they both have the same number of rows and same wave counts
identical(tabulate(dfOrig$wave), tabulate(dfNew$wave))
#> [1] TRUE
microbenchmark::microbenchmark(fOriginal = fOriginal(dat, remaining),
fNew = fNew(dat))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fOriginal 12.0433 13.24815 14.52889 14.02410 15.0525 23.5338 100
#> fNew 1.2956 1.41915 1.73176 1.56935 1.7398 5.0738 100
https://stackoverflow.com/questions/72362592
复制相似问题