我正在尝试在一个非常大的数据框内用学生t检验（Student's t-test）计算p.values。因为我的原始数据帧行数非常多，所以p.values的计算花费了很长时间（大约100分钟）。
我正在尝试加快这一过程,但我不确定数据帧是否是提高速度的最佳格式,或者我是否应该重塑数据并可能使用matrix
。
下面是一个可复现（reproducible）的示例，最后是一个小数据帧和一个基准测试。
library(dplyr)
# Stripped-down Welch two-sample t test: returns only the two-sided
# p value, skipping all of stats::t.test()'s argument checking and
# extra output for speed.
#
# x, y: numeric observation vectors for the two groups.
# Value: two-sided p value of the Welch (unequal-variance) t test.
my.t.test <- function (x, y = NULL) {
  n_x <- length(x)
  mean_x <- mean(x)
  var_x <- var(x)
  n_y <- length(y)
  mean_y <- mean(y)
  var_y <- var(y)
  # Standard errors of each group mean.
  se_x <- sqrt(var_x / n_x)
  se_y <- sqrt(var_y / n_y)
  se_diff <- sqrt(se_x^2 + se_y^2)
  # Welch-Satterthwaite degrees of freedom.
  dof <- se_diff^4 / (se_x^4 / (n_x - 1) + se_y^4 / (n_y - 1))
  t_stat <- (mean_x - mean_y) / se_diff
  2 * pt(-abs(t_stat), dof)
}
# Contrast to test: column A vs column B.
cont <- c("A", "B")
# Fixed seed so the simulated data (and thus the benchmark) is reproducible.
# NOTE: the order of the two rnorm() calls below matters for reproducibility.
set.seed(1)
# Example data: 1000 ids x 8 rows each (replicate recycles over 1:4);
# the two measurement columns are drawn from normals with different
# means and standard deviations.
df1 <- data.frame(id=rep(1:1000, each=8),
replicate=1:4,
A=rnorm(8000, mean=26, sd=5),
B=rnorm(8000, mean=25, sd=7))
# Per-id comparison using the canonical stats::t.test() -- the slow
# reference implementation being optimized.
# Fix: use the `.data` pronoun instead of get(). get() falls back to the
# enclosing environment when the column is absent and can silently pick
# up an unrelated global variable; .data[[...]] errors on a missing
# column instead.
completeDF <- function() {
  df1 %>%
    group_by(id) %>%
    summarise(
      Comparison = paste(cont, collapse = ' - '),
      p.value = t.test(.data[[cont[1]]], .data[[cont[2]]])$p.value,
      log10.p.value = -log10(p.value),
      log2.foldchange = mean(.data[[cont[1]]], na.rm = TRUE) -
        mean(.data[[cont[2]]], na.rm = TRUE)
    )
}
# Baseline without any p value: isolates the cost of grouping plus the
# fold-change computation, so the t-test cost can be measured by
# difference against completeDF()/myPvalue().
# Fix: `.data[[...]]` instead of get() -- get() can silently resolve to
# a global variable when the column is missing.
noPvalue <- function() {
  df1 %>%
    group_by(id) %>%
    summarise(
      Comparison = paste(cont, collapse = ' - '),
      log2.foldchange = mean(.data[[cont[1]]], na.rm = TRUE) -
        mean(.data[[cont[2]]], na.rm = TRUE)
    )
}
# As completeDF(), but with the stripped-down my.t.test() instead of
# stats::t.test().
# Fix: `.data[[...]]` instead of get() -- get() can silently resolve to
# a global variable when the column is missing.
myPvalue <- function() {
  df1 %>%
    group_by(id) %>%
    summarise(
      Comparison = paste(cont, collapse = ' - '),
      p.value = my.t.test(.data[[cont[1]]], .data[[cont[2]]]),
      log10.p.value = -log10(p.value),
      log2.foldchange = mean(.data[[cont[1]]], na.rm = TRUE) -
        mean(.data[[cont[2]]], na.rm = TRUE)
    )
}
# Time the full t.test() pipeline against the no-p-value baseline and
# the stripped-down my.t.test() variant (microbenchmark default: 100
# runs of each expression).
microbenchmark::microbenchmark(
completeDF(), noPvalue(), myPvalue()
)
我的基准:
Unit: milliseconds
expr min lq mean median uq max neval
completeDF() 358.38330 365.09423 424.60255 369.20453 377.40354 655.2009 100
noPvalue() 57.42996 58.89978 81.86222 59.66851 60.96582 337.2346 100
myPvalue() 216.04812 220.98277 318.09568 224.19516 493.74908 609.4516 100
因此,使用我非常精简(无需测试等)的t.test函数,我已经节省了一些时间。但我想知道这是否可以通过矢量化以某种方式进一步改善。
发布于 2018-08-22 22:01:24
均值和方差计算需要按组进行,但t检验和p值计算可以向量化。
# Vectorized Welch t test: per-group counts/means/variances are computed
# with split() + vapply(), then a single vectorized expression yields
# the p values for all groups at once.
#
# grp: grouping vector (coerced to factor); x, y: the two measurement
# columns, each grouped by grp.
# Value: named numeric vector of two-sided p values, one per group level.
my.t.test.2 <- function(grp, x, y) {
  grp <- factor(grp)
  # Per-group count, mean and variance of one measurement column.
  group_stats <- function(v) {
    pieces <- split(v, grp)
    list(
      n = lengths(pieces),
      mean = vapply(pieces, mean, numeric(1)),
      var = vapply(pieces, var, numeric(1))
    )
  }
  sx <- group_stats(x)
  sy <- group_stats(y)
  # Squared standard errors of the group means.
  sem2_x <- sx$var / sx$n
  sem2_y <- sy$var / sy$n
  pooled <- sqrt(sem2_x + sem2_y)
  t_val <- (sx$mean - sy$mean) / pooled
  # Welch-Satterthwaite degrees of freedom.
  dof <- pooled^4 / (sem2_x^2 / (sx$n - 1L) + sem2_y^2 / (sy$n - 1L))
  2 * pt(-abs(t_val), dof)
}
人们可以尝试通过避免S3方法分派（这正是 mean() 较慢的“原因”）以及减少冗余计算（例如每组的长度只计算一次）来变得更聪明。
# As my.t.test.2, but avoids S3 dispatch (split.default, mean.default),
# drops names (USE.NAMES = FALSE), and computes group sizes only once.
# NOTE: assumes x and y have identical per-group counts (true for this
# example's balanced layout); with equal counts the simplified df below
# equals the full Welch-Satterthwaite formula exactly.
my.t.test.2.1 <- compiler::cmpfun(function(grp, x, y) {
  grp <- factor(grp)
  by_x <- split.default(x, grp)
  by_y <- split.default(y, grp)
  sizes <- lengths(by_x)
  dof_n <- sizes - 1L
  # Apply a scalar summary f to every group of g, returning a bare vector.
  col_stat <- function(g, f) vapply(g, f, numeric(1), USE.NAMES = FALSE)
  mean_x <- col_stat(by_x, mean.default)
  mean_y <- col_stat(by_y, mean.default)
  sem2_x <- col_stat(by_x, var) / sizes
  sem2_y <- col_stat(by_y, var) / sizes
  pooled <- sqrt(sem2_x + sem2_y)
  t_val <- (mean_x - mean_y) / pooled
  # Simplified (equal-n) Welch degrees of freedom.
  2 * pt(-abs(t_val), pooled^4 / ((sem2_x^2 + sem2_y^2) / dof_n))
})
可以包装规范解决方案和其他解决方案,以提供相同的输出
# Wrappers giving every approach the same interface/output for benchmarking.
# Reference: dplyr grouping + stats::t.test() per id.
f0 <- function(df) {
  df %>%
    group_by(id) %>%
    summarize(p.value = t.test(A, B)$p.value)
}
# dplyr grouping + the stripped-down scalar my.t.test() per id.
f1 <- function(df) {
  df %>%
    group_by(id) %>%
    summarize(p.value = my.t.test(A, B))
}
# Fully vectorized: one call computes the p values for every id at once.
f2 <- function(df) {
  tibble(id = unique(df$id), p.value = my.t.test.2(df$id, df$A, df$B))
}
# As f2, using the byte-compiled, dispatch-avoiding variant.
f2.1 <- function(df) {
  tibble(id = unique(df$id), p.value = my.t.test.2.1(df$id, df$A, df$B))
}
f2.1() 产生与规范实现相同的结果，并且速度大约是标准实现的两倍；而担心 mean() 的分派速度之类的细节（比较 f2() 与 f2.1()）似乎基本上是多余的。
> all.equal.default(f0(df1), f2.1(df1))
[1] TRUE
> microbenchmark(f0(df1), f1(df1), f2(df1), f2.1(df1), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval
f0(df1) 374.2819 379.7749 380.8365 380.0094 381.2368 388.8794 5
f1(df1) 249.6502 250.2525 251.8813 252.1965 253.3444 253.9630 5
f2(df1) 154.1152 158.3243 159.8277 159.1076 162.7602 164.8311 5
f2.1(df1) 151.0032 151.0149 152.3900 152.8105 153.2840 153.8373 5
对我来说,C++实现
# Welch t test with the sum and variance kernels replaced by C++
# helpers. sum_cpp(x) and var_cpp(x, mean) are compiled elsewhere (not
# shown here); var_cpp takes the precomputed mean as its second argument.
# Value: two-sided Welch p value.
my.t.test.cpp <- function (x, y = NULL) {
  len_x <- length(x)
  mean_x <- sum_cpp(x) / len_x
  var_x <- var_cpp(x, mean_x)
  len_y <- length(y)
  mean_y <- sum_cpp(y) / len_y
  var_y <- var_cpp(y, mean_y)
  # Standard errors of each group mean.
  se_x <- sqrt(var_x / len_x)
  se_y <- sqrt(var_y / len_y)
  se_diff <- sqrt(se_x^2 + se_y^2)
  # Welch-Satterthwaite degrees of freedom.
  dof <- se_diff^4 / (se_x^4 / (len_x - 1) + se_y^4 / (len_y - 1))
  t_stat <- (mean_x - mean_y) / se_diff
  2 * pt(-abs(t_stat), dof)
}
# Wrapper: dplyr per-group loop with the C++-kernel t test inside.
fcpp <- function(df) {
  df %>%
    group_by(id) %>%
    summarize(p.value = my.t.test.cpp(A, B))
}
产生与规范实现相等的结果，耗时大约100毫秒。
分析2.1解决方案显示,大部分时间都花在var()
内部,其中有一个对stopifnot()
的调用以及一个参数匹配调用
> var
function (x, y = NULL, na.rm = FALSE, use)
{
...
na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
"everything", "na.or.complete"))
...
if (is.data.frame(x))
x <- as.matrix(x)
else stopifnot(is.atomic(x))
...
.Call(C_cov, x, y, na.method, FALSE)
}
<bytecode: 0x5e1a440>
<environment: namespace:stats>
> Rprof(); x <- my.t.test.2.1(df1$id, df1$A, df1$B); Rprof(NULL); summaryRprof()
$by.self
self.time self.pct total.time total.pct
"withCallingHandlers" 0.04 28.57 0.08 57.14
"tryCatchList" 0.04 28.57 0.04 28.57
"vapply" 0.02 14.29 0.14 100.00
"stopifnot" 0.02 14.29 0.12 85.71
"match.call" 0.02 14.29 0.02 14.29
$by.total
total.time total.pct self.time self.pct
"vapply" 0.14 100.00 0.02 14.29
"my.t.test.2.1" 0.14 100.00 0.00 0.00
"stopifnot" 0.12 85.71 0.02 14.29
"FUN" 0.12 85.71 0.00 0.00
"withCallingHandlers" 0.08 57.14 0.04 28.57
"tryCatchList" 0.04 28.57 0.04 28.57
"tryCatch" 0.04 28.57 0.00 0.00
"match.call" 0.02 14.29 0.02 14.29
$sample.interval
[1] 0.02
$sampling.time
[1] 0.14
因此,在追求速度时,人们可能会避免参数检查,而直接调用C函数
# As my.t.test.2.1, but bypasses stats::var()'s argument checking
# (profiling showed stopifnot()/pmatch() dominate) by calling the
# underlying C routine directly.
# NOTE(review): stats:::C_cov is an unexported internal symbol and may
# change between R versions -- this is a deliberate speed hack.
# Assumes x and y share identical per-group counts (balanced design).
my.t.test.2.2 <- compiler::cmpfun(function(grp, x, y) {
  # Bare-bones variance; 4L selects the "everything" NA method.
  var <- function(x)
    .Call(stats:::C_cov, x, NULL, 4L, FALSE)
  grp <- factor(grp)
  by_x <- split.default(x, grp)
  by_y <- split.default(y, grp)
  sizes <- lengths(by_x)
  dof_n <- sizes - 1L
  col_stat <- function(g, f) vapply(g, f, numeric(1), USE.NAMES = FALSE)
  mean_x <- col_stat(by_x, mean.default)
  mean_y <- col_stat(by_y, mean.default)
  sem2_x <- col_stat(by_x, var) / sizes
  sem2_y <- col_stat(by_y, var) / sizes
  pooled <- sqrt(sem2_x + sem2_y)
  t_val <- (mean_x - mean_y) / pooled
  # Simplified (equal-n) Welch degrees of freedom.
  2 * pt(-abs(t_val), pooled^4 / ((sem2_x^2 + sem2_y^2) / dof_n))
})
# Wrapper: vectorized variant with the direct C_cov variance call.
f2.2 <- function(df) {
  tibble(id = unique(df$id), p.value = my.t.test.2.2(df$id, df$A, df$B))
}
事实证明,这是相当好的表现。
> all.equal.default(f0(df1), f2.2(df1))
[1] TRUE
> microbenchmark(
+ f0(df1), f1(df1), f2(df1), f2.1(df1), f2.2(df1), fcpp(df1),
+ times = 5
+ )
Unit: milliseconds
expr min lq mean median uq max neval
f0(df1) 378.61985 379.25525 393.38371 379.56797 386.2806 443.19488 5
f1(df1) 250.99802 252.45281 253.55140 253.34249 255.2801 255.68362 5
f2(df1) 156.76073 158.63126 159.63693 160.33446 161.2260 161.23216 5
f2.1(df1) 146.64555 148.28773 151.17250 151.38536 153.9363 155.60751 5
f2.2(df1) 25.24441 25.62982 27.50898 26.11755 30.0836 30.46951 5
fcpp(df1) 104.20851 104.50396 105.19383 104.62905 104.7876 107.84006 5
我们也可以用自己的C++实现来计算方差，而不是直接调用R内部的C例程：
# As my.t.test.2.2, but the per-group variance comes from the C++
# kernel var_cpp (compiled elsewhere, not shown; takes the values and a
# precomputed mean) instead of R's internal C_cov routine.
# Assumes x and y share identical per-group counts (balanced design).
my.t.test.2.2.cpp <- compiler::cmpfun(function(grp, x, y) {
  grp <- factor(grp)
  by_x <- split.default(x, grp)
  by_y <- split.default(y, grp)
  sizes <- lengths(by_x)
  dof_n <- sizes - 1L
  mean_x <- vapply(by_x, mean.default, numeric(1), USE.NAMES = FALSE)
  mean_y <- vapply(by_y, mean.default, numeric(1), USE.NAMES = FALSE)
  # var_cpp applied pairwise to each group and its mean.
  sem2_x <- unlist(Map(var_cpp, by_x, mean_x)) / sizes
  sem2_y <- unlist(Map(var_cpp, by_y, mean_y)) / sizes
  pooled <- sqrt(sem2_x + sem2_y)
  t_val <- (mean_x - mean_y) / pooled
  # Simplified (equal-n) Welch degrees of freedom.
  2 * pt(-abs(t_val), pooled^4 / ((sem2_x^2 + sem2_y^2) / dof_n))
})
# Wrapper: vectorized variant with the C++ variance kernel.
f2.2.cpp <- function(df) {
  tibble(id = unique(df$id), p.value = my.t.test.2.2.cpp(df$id, df$A, df$B))
}
为了获得可比较的性能
> microbenchmark(f2.2(df1), f2.2.cpp(df1), times = 20)
Unit: milliseconds
expr min lq mean median uq max neval
f2.2(df1) 25.11237 25.69622 30.27956 26.35570 29.81884 87.34955 20
f2.2.cpp(df1) 24.88787 25.25171 26.80836 25.43498 29.06338 30.80012 20
我不确定哪一个更像黑客--编写自己的C++代码来处理变化,还是直接调用R的C代码。
更快的C++解决方案在一次调用中计算组平均值和方差
# Single-pass C++ kernel: per-group counts, means and variances for a
# vector x grouped by positive integer codes in `group`.
# Assumes group ids are contiguous 1..max(group) and each group has at
# least 2 observations. The variance uses the sum-of-squares shortcut,
# which can lose precision when mean^2 >> variance (cancellation).
# Fix: return the full count vector `n`, not just n[0] -- the scalar
# was only correct when every group had the same size. With equal-sized
# groups the result (and everything downstream) is unchanged; with
# unequal groups the counts are now per-group correct.
cppFunction('List doit(IntegerVector group, NumericVector x) {
    int n_grp = 0;
    for (int i = 0; i < group.size(); ++i)
        n_grp = group[i] > n_grp ? group[i] : n_grp;
    std::vector<int> n(n_grp);
    std::vector<double> sum(n_grp), sumsq(n_grp);
    for (int i = 0; i < group.size(); ++i) {
        n[ group[i] - 1 ] += 1;
        sum[ group[i] - 1 ] += x[i];
        sumsq[ group[i] - 1 ] += x[i] * x[i];
    }
    NumericVector mean(n_grp), var(n_grp);
    for (size_t i = 0; i < n.size(); ++i) {
        mean[i] = sum[i] / n[i];
        var[i] = (sumsq[i] - sum[i] * mean[i]) / (n[i] - 1);
    }
    return List::create(_["n"]=wrap(n), _["mean"]=mean, _["var"]=var);
}')
# Fastest R-side variant: doit() (C++ kernel defined above) computes
# n/mean/var for every group in one pass per column; the rest is a
# vectorized Welch p-value computation.
# Uses only x's group sizes for the df -- assumes y shares them
# (balanced design, as in this example).
my.t.test.2.3.cpp <- compiler::cmpfun(function(grp, x, y) {
  sx <- doit(grp, x)
  sy <- doit(grp, y)
  # Squared standard errors of the group means.
  sem2_x <- sx$var / sx$n
  sem2_y <- sy$var / sy$n
  pooled <- sqrt(sem2_x + sem2_y)
  t_val <- (sx$mean - sy$mean) / pooled
  # Simplified (equal-n) Welch degrees of freedom.
  dof <- pooled^4 / ((sem2_x^2 + sem2_y^2) / (sx$n - 1L))
  2 * pt(-abs(t_val), dof)
})
# Wrapper: single-pass C++ group-statistics variant.
f2.3.cpp <- function(df) {
  tibble(
    id = unique(df$id),
    p.value = my.t.test.2.3.cpp(df$id, df$A, df$B)
  )
}
这是很快的
> all.equal.default(f0(df1), f2.3.cpp(df1))
[1] TRUE
> microbenchmark(f2.2(df1), f2.2.cpp(df1), f2.3.cpp(df1), times = 50)
Unit: milliseconds
expr min lq mean median uq max
f2.2(df1) 24.743364 25.445833 28.032135 25.873117 29.191020 88.642771
f2.2.cpp(df1) 24.122380 24.867212 26.012985 25.369963 25.897866 30.783544
f2.3.cpp(df1) 2.831635 2.946094 3.101408 2.992049 3.073788 7.191572
neval
50
50
50
>
另一种选择是Bioconductor package genefilter::rowttests()
,它需要一个矩阵
# Matrix layout for genefilter::rowttests(): 1000 rows (one per id) x 16
# columns -- columns 1-8 from the N(26, 5) draws, columns 9-16 from the
# N(25, 7) draws. byrow = TRUE so each row takes 8 consecutive draws.
# NOTE(review): reuses seed 1 with the same draw order as df1; intended
# to carry the same simulated values in matrix form -- confirm the
# row/column correspondence before comparing results element-wise.
set.seed(1)
m1 <- cbind(
matrix(rnorm(8000, mean = 26, sd = 5), ncol=8, byrow = TRUE),
matrix(rnorm(8000, mean = 25, sd = 7), ncol=8, byrow = TRUE)
)
# Row-wise t tests via Bioconductor's genefilter package; the factor
# assigns columns 1-8 and 9-16 to the two groups.
f4 <- function(m1) {
  genefilter::rowttests(m1, factor(rep(1:2, each=8)))
}
而且速度也很快
> microbenchmark(f2.3.cpp(df1), f4(m1), times=50)
Unit: milliseconds
expr min lq mean median uq max neval
f2.3.cpp(df1) 2.760877 2.796542 2.877030 2.845795 2.895441 3.286143 50
f4(m1) 1.335288 1.359007 1.397601 1.377544 1.412606 1.693340 50
(一些不同之处在于创建tibble)。
https://stackoverflow.com/questions/51962691
复制相似问题