我想用下面的例子从一个数据集中计算臭氧的平均暴露量。平均值应该是从出生年份到5岁期间各年臭氧值的平均。
final = data.frame(ID = c(1, 2, 3, 4, 5, 6),
Zone = c("A", "B", "C", "D", "A", "B"),
dob = c(1993, 1997, 1994, 2001, 1999, 1993),
Ozone_1993 = c(0.12, 0.01, 0.36, 0.78, 0.12, 0.01),
Ozone_1994 = c(0.75, 0.23, 0.14, 0.98, 0.75, 0.23),
Ozone_1995 = c(1.38, 0.45, -0.08, 1.18, 1.38, 0.45),
Ozone_1996 = c(2.01, 0.67, -0.3, 1.38, 2.01, 0.67),
Ozone_1997 = c(2.64, 0.89, -0.52, 1.58, 2.64, 0.89),
Ozone_1998 = c(3.27, 1.11, -0.74, 1.78, 3.27, 1.11),
Ozone_1999 = c(3.9, 1.33, -0.96, 1.98, 3.9, 1.33),
Ozone_2000 = c(4.53, 1.55, -1.18, 2.18, 4.53, 1.55),
Ozone_2001 = c(5.16, 1.77, -1.4, 2.38, 5.16, 1.77),
Ozone_2002 = c(5.79, 1.99, -1.62, 2.58, 5.79, 1.99),
Ozone_2003 = c(6.42, 2.21, -1.84, 2.78, 6.42, 2.21),
Ozone_2004 = c(7.05, 2.43, -2.06, 2.98, 7.05, 2.43),
mean_under5_ozone = c(0.85, 1.33, -0.3, 2.68, 5.16, 0.45))
其中,列(变量)mean_under5_ozone 是从出生到5岁或5岁以下臭氧暴露的平均分数。例如,ID 1 的 mean_under5_ozone 是从 Ozone_1993 到 Ozone_1997 的行平均数。
从新手那里,
发布于 2020-10-02 19:58:06
这里有一种方法可以用for循环来实现。(这并不是很优雅,但它避免了过多的dplyr和rlang语法细节。)
1)(基于下面的 dob_yr)定义一个列(use_vars),其中包含用于计算自定义平均值的变量名。2)遍历各行,对每一行按 use_vars 提取相关变量,并计算自定义均值。
library(dplyr)
# Mock data: five rows in zone 'A' with birth years 1991-1995, plus one
# column of random non-negative values per year (x_1991 ... x_1995).
birth_years <- 1991:1995
df <- tibble(id = 1:5)
df$zone <- rep('A', 5)
df$dob_yr <- birth_years
for (yr in birth_years) {
  col_name <- paste0('x_', yr)
  df[[col_name]] <- abs(rnorm(5))
}
df  # check mock data
#' Attach, per row, the names of the yearly columns to average
#'
#' Adds a list column `use_vars` to `df`. Element `i` holds the names of the
#' columns `<prefix><year>` whose year lies in
#' `[dob_yr[i], dob_yr[i] + yr_within]` (both ends inclusive).
#'
#' The prefix is matched literally and case-sensitively, and only columns
#' whose remainder after the prefix is all digits are treated as year
#' columns, so unrelated columns that merely share the prefix are ignored.
#'
#' @param df A data frame or tibble with one column per year.
#' @param dob_yr_varname Name of the column holding each row's birth year.
#' @param prefix Literal prefix of the yearly columns.
#' @param yr_within Number of years after the birth year to include.
#' @return `df` with an added list column `use_vars`.
add_use_vars <- function(df, dob_yr_varname = 'dob_yr', prefix = 'x_', yr_within = 3) {
  stopifnot(dob_yr_varname %in% names(df))

  col_names <- names(df)
  # Whatever follows the prefix; only meaningful where the prefix matches.
  suffix <- substring(col_names, nchar(prefix) + 1L)
  is_year_col <- startsWith(col_names, prefix) & grepl("^[0-9]+$", suffix)

  vars <- col_names[is_year_col]
  vars_yr <- as.integer(suffix[is_year_col])

  # One character vector of column names per row; also works for zero rows.
  df$use_vars <- lapply(df[[dob_yr_varname]], function(yr) {
    vars[(vars_yr >= yr) & (vars_yr <= yr + yr_within)]
  })
  df
}
# Attach the per-row column selections, then inspect the first row's.
df <- add_use_vars(df)
df$use_vars[[1]]  # see the first row in use_vars
#' Row-wise mean over a per-row selection of columns
#'
#' For each row `i`, reads the character vector of column names stored in
#' element `i` of the list column `varname_varlist`, takes those columns'
#' values in row `i`, and stores their mean in a new numeric column
#' `custom_mean`. A row with an empty selection gets `NaN`.
#'
#' @param df A data frame or tibble containing the list column.
#' @param varname_varlist Name of the list column holding, per row, the
#'   names of the columns to average.
#' @return `df` with an added numeric column `custom_mean`.
custom_mean <- function(df, varname_varlist = 'use_vars') {
  stopifnot(varname_varlist %in% names(df))

  var_lists <- df[[varname_varlist]]
  # vapply yields a double vector of the right length, including for zero rows.
  df$custom_mean <- vapply(
    seq_along(var_lists),
    function(i) mean(as.numeric(df[i, var_lists[[i]]])),
    numeric(1)
  )
  df
}
df <- custom_mean(df)
df # see results
注意,对于这个模拟数据,我对每一行计算的是出生年份起第0到3年所对应各列的平均值。
发布于 2020-10-02 18:55:38
(完全重写)
我想我不明白mean_under5_ozone的意思,因为我不能复制你的数字。例如,对于出生于1993年的ID==1,这意味着我们需要从1993年到1998年(包括5岁)或1997年(直到但不包括在内)的数据,但这两个平均值都不是0.85:
# Sanity check for ID 1: average the values in columns 4:9, then in 4:8.
id1_row <- final[1, ]
mean(unlist(id1_row[, 4:9]))
# [1] 1.695
mean(unlist(id1_row[, 4:8]))
# [1] 1.38
忽略这一点,我将给出我认为对 final 数据正确的答案。
tidyverse
library(dplyr)
library(tidyr) # pivot_longer
# Drop the hand-computed column so it can be recomputed from the yearly values.
final <- final %>% select(-mean_under5_ozone)

# One row per (ID, year): split "Ozone_<year>" into a type and an integer year.
ozone_long <- final %>%
  pivot_longer(
    starts_with("Ozone"),
    names_pattern = "(.*)_(.*)",
    names_to = c("type", "year")
  ) %>%
  mutate(year = as.integer(year))

# Per ID, average the values whose year lies in [dob, dob + 5].
ozone_long %>%
  group_by(ID) %>%
  summarize(
    mean_under5_ozone = mean(value[ between(year, dob, dob + 5) ]),
    .groups = "drop"
  )
# # A tibble: 6 x 2
# ID mean_under5_ozone
# <dbl> <dbl>
# 1 1 1.70
# 2 2 1.44
# 3 3 -0.41
# 4 4 2.68
# 5 5 5.48
# 6 6 0.56
data.table
library(data.table)
library(magrittr) # %>%, not required but used for improved readability
# Build the data.table version of `final`; nothing else in this snippet
# creates `finalDT`.
finalDT <- as.data.table(final)
# Drop the hand-computed column if it is still present, so that only the
# three id columns and the yearly Ozone_* columns remain for melt().
if ("mean_under5_ozone" %in% names(finalDT)) {
  finalDT[, mean_under5_ozone := NULL]
}
# Melt with columns 1:3 as id variables, parse the year out of the column
# name, then average per ID the values whose year lies in [dob, dob + 5].
melt(finalDT, 1:3) %>%
  .[, year := as.integer(gsub("[^0-9]", "", variable))] %>%
  .[ year >= dob, ] %>%
  .[, .(mean_under5_ozone = mean(value[ between(year, dob, dob + 5) ])), by = .(ID)] %>%
  .[order(ID),]
# ID mean_under5_ozone
# 1: 1 1.695
# 2: 2 1.440
# 3: 3 -0.410
# 4: 4 2.680
# 5: 5 5.475
# 6: 6 0.560
一些想法,使用随机数据。
# Reproducible random example: 1000 birth years drawn from 1990-2020 and
# three uniform(0, 1) ozone columns.
set.seed(42)
n_obs <- 1000
dat <- data.frame(
  dob = sample(1990:2020, size = n_obs, replace = TRUE),
  Ozone_1993 = runif(n_obs),
  Ozone_1994 = runif(n_obs),
  Ozone_1995 = runif(n_obs)
)
head(dat)
# dob Ozone_1993 Ozone_1994 Ozone_1995
# 1 2006 0.37383448 0.68624969 0.1681480
# 2 1994 0.46496563 0.29309851 0.8198724
# 3 1990 0.04660819 0.41994895 0.7501070
# 4 2014 0.98751620 0.73526105 0.2899959
# 5 1999 0.90845233 0.84982125 0.1798130
# 6 1993 0.97939015 0.07746459 0.6172919
tidyverse
library(dplyr)
# Per-year mean of every Ozone_* column among rows with dob >= 2015.
# across() replaces the superseded summarize_at().
dat %>%
  filter(dob >= 2015) %>%
  summarize(across(starts_with("Ozone"), mean))
# Ozone_1993 Ozone_1994 Ozone_1995
# 1 0.5242029 0.4852803 0.4864364
这是每年的平均数。如果您需要单个汇总统计值,那么:
# library(tidyr) # pivot_longer
# Single overall mean: keep rows with dob >= 2015, stack all Ozone_* values
# into one `value` column, and average it.
recent_births <- filter(dat, dob >= 2015)
recent_births %>%
  tidyr::pivot_longer(starts_with("Ozone")) %>%
  summarize(value = mean(value))
# # A tibble: 1 x 1
# value
# <dbl>
# 1 0.499
data.table
library(data.table)
datDT <- as.data.table(dat)
# Per-year means of the Ozone_* columns among rows with dob >= 2015.
datDT[dob >= 2015, lapply(.SD, mean), .SDcols = patterns("^Ozone")]
# Ozone_1993 Ozone_1994 Ozone_1995
# 1: 0.5242029 0.4852803 0.4864364
# Single overall mean: melt the same rows to long form and average `value`.
melt(datDT[dob >= 2015], id.vars = "dob")[, .(value = mean(value))]
# value
# 1: 0.4986398
base R
# Rows with dob >= 2015, restricted to the three ozone columns.
recent_ozone <- subset(dat, dob >= 2015, select = Ozone_1993:Ozone_1995)
# Per-year means; colMeans() replaces apply(..., 2, mean) on a data frame.
colMeans(recent_ozone)
# Ozone_1993 Ozone_1994 Ozone_1995
# 0.5242029 0.4852803 0.4864364
# Single overall mean across all three columns.
mean(unlist(recent_ozone))
# [1] 0.4986398
https://stackoverflow.com/questions/64176590
复制相似问题