方程:
假定
宁超在他的公众号“Pythn与数量遗传学”的“方差组分估计之约束最大似然”文章中,给出了下面两种计算公式,公式一是直接似然函数(direct REML),公式二是间接的似然函数(MME based REML)。
下面这个示例是张勤老师在2019年统计遗传学暑假学校教材的数据
# zhangqin laoshi PPT codeherd <- factor(c(1,2,2,1,1,2,1,2,2))
sire <- factor(c(1,1,1,2,2,3,4,4,4))
y <- c(240,190,170,180,200,140,170,100,130)
REML_data <- data.frame(herd,sire,y)
X <- model.matrix(y~herd-1)
Z <- model.matrix(y~sire-1)
A <- matrix(c(1,0.25,0,0,0.25,1,0,0,0,0,1,0,0,0,0,1),nr=4)
书写似然函数
mixmodel.loglik<-function(theta,y){
sigmag2<-theta[1]
sigmae2<-theta[2]
n = length(y)
G = A*sigmag2
R = diag(n)*sigmae2
V = Z %*% G %*% t(Z) + R
Vi = solve(V)
P = Vi - Vi %*% X %*% solve(t(X) %*% Vi %*% X) %*% t(X) %*% Vi
logl = -0.5*(log(det(V)) + log(det(t(X) %*% Vi %*% X)) + t(y) %*% P %*% y) return(-logl)
}
optim(c(1,1),mixmodel.loglik,y=y)
mixmodel.loglik2 <-function(theta,y){
sigmag2<-theta[1]
sigmae2<-theta[2]
n = length(y)
G = A*sigmag2
R = diag(n)*sigmae2
V = Z %*% G %*% t(Z) + R
Vi = solve(V)
P = Vi - Vi %*% X %*% solve(t(X) %*% Vi %*% X) %*% t(X) %*% Vi
C11 = t(X) %*% solve(R) %*% X
C12 = t(X) %*% solve(R) %*% Z
C21 = t(Z) %*% solve(R) %*% X
C22 = t(Z) %*% solve(R) %*% Z + solve(G)
C = rbind(cbind(C11,C12),cbind(C21,C22))
logl2 = -0.5*(log(det(C)) + log(det(R)) + log(det(G)) + t(y) %*% P %*% y) return(-logl2)
}
optim(c(1,1),mixmodel.loglik2,y=y)
两者结果一样。
herd <- factor(c(1,2,2,1,1,2,1,2,2))
sire <- factor(c(1,1,1,2,2,3,4,4,4))
y <- c(240,190,170,180,200,140,170,100,130)
REML_data <- data.frame(herd,sire,y)
X <- model.matrix(y~herd-1)
Z <- model.matrix(y~sire-1)
XX <- crossprod(X)
XZ <- crossprod(X,Z)
ZX <- t(XZ)
ZZ <- crossprod(Z)
LHS <- rbind(cbind(XX,XZ),cbind(ZX,ZZ))
Xy <- crossprod(X,y)
Zy <- crossprod(Z,y)
RHS <- rbind(Xy,Zy)
A <- matrix(c(1,0.25,0,0,0.25,1,0,0,0,0,1,0,0,0,0,1),nr=4)
Ainv <- solve(A)
yy <- crossprod(y)
N <- length(y)
rankX <-qr(X)$rank
q <- length(unique(sire))
nh <- length(unique(herd))
k0 <- 12threshold <- 0.00000001write.table(t(c("sigmaE","sigmaS","k")),file="results.txt",row.names = FALSE, col.names = FALSE)repeat{
LHS1 <- LHS
k <- k0
LHS1[(nh+1):(nh+q), (nh+1):(nh+q)] <- LHS1[(nh+1):(nh+q), (nh+1):(nh+q)]+Ainv*k
sol <- solve(LHS1,RHS)
C <- solve(LHS1)
Czz <- C[(nh+1):(nh+q), (nh+1):(nh+q)]
sigmaE <- (yy - crossprod(sol,RHS))/(N-rankX)
sAs <- t(sol[(nh+1):(nh+q)]) %*% Ainv %*% sol[(nh+1):(nh+q)]
trCA <- sum(diag(Czz %*% Ainv))
sigmaS <- (sAs + trCA*sigmaE)/q
k0 <- as.numeric(sigmaE/sigmaS)
out <- c(sigmaE,sigmaS,k0)
write.table(t(out), file="results.txt", append = TRUE, row.names = FALSE, col.names = FALSE)
if(abs(k-k0) < threshold) break}
results <- read.table("results.txt",header = TRUE)
results
sigmaE | sigmaS | k |
---|---|---|
925.6148 | 91.78376 | 10.0847344 |
897.9372 | 108.19047 | 8.2995954 |
863.6173 | 129.55950 | 6.6657968 |
820.9699 | 157.69430 | 5.2060850 |
768.2845 | 194.87483 | 3.9424508 |
704.4117 | 243.56957 | 2.8920351 |
629.8507 | 305.53381 | 2.0614764 |
548.0643 | 380.16717 | 1.4416402 |
465.8970 | 462.97884 | 1.0063030 |
391.7085 | 546.12170 | 0.7172550 |
331.7683 | 621.64909 | 0.5336907 |
287.8838 | 684.73480 | 0.4204312 |
258.0564 | 734.15339 | 0.3515020 |
238.7261 | 770.93255 | 0.3096589 |
226.5165 | 797.12077 | 0.2841684 |
218.8922 | 815.07790 | 0.2685538 |
214.1497 | 827.02910 | 0.2589385 |
211.2014 | 834.81106 | 0.2529931 |
209.3676 | 839.80212 | 0.2493058 |
208.2261 | 842.97109 | 0.2470145 |
207.5152 | 844.97003 | 0.2455888 |
207.0721 | 846.22564 | 0.2447009 |
206.7960 | 847.01226 | 0.2441476 |
206.6238 | 847.50423 | 0.2438027 |
206.5165 | 847.81160 | 0.2435877 |
206.4496 | 848.00351 | 0.2434537 |
206.4078 | 848.12328 | 0.2433701 |
206.3818 | 848.19801 | 0.2433179 |
206.3655 | 848.24463 | 0.2432854 |
206.3554 | 848.27371 | 0.2432651 |
206.3491 | 848.29185 | 0.2432525 |
206.3452 | 848.30317 | 0.2432446 |
206.3427 | 848.31022 | 0.2432397 |
206.3412 | 848.31462 | 0.2432366 |
206.3402 | 848.31737 | 0.2432347 |
206.3396 | 848.31908 | 0.2432335 |
206.3392 | 848.32015 | 0.2432328 |
206.3390 | 848.32081 | 0.2432323 |
206.3389 | 848.32123 | 0.2432320 |
206.3388 | 848.32149 | 0.2432318 |
206.3387 | 848.32165 | 0.2432317 |
206.3387 | 848.32175 | 0.2432316 |
206.3387 | 848.32181 | 0.2432316 |
206.3387 | 848.32185 | 0.2432316 |
206.3386 | 848.32188 | 0.2432315 |
206.3386 | 848.32189 | 0.2432315 |
206.3386 | 848.32190 | 0.2432315 |
最终的结果是,加性方差组分为206.3386,残差的方差组分为848.3219,结果一致
公式中的log,也可以写为ln,是自然对数,在R中log默认的就是自然对数
# 自然对数的3次方exp(3)
20.0855369231877
# 对上面结果求自然对数log(exp(3))
3
在R中,如果想计算10的对数,用函数log10(x)
log(10)
2.30258509299405
log10(10)
1
http://people.math.aau.dk/~rw/Undervisning/Mat6F12/Handouts/2.hand.pdf http://www2.stat.duke.edu/~sayan/Sta613/2018/lec/LMM.pdf 张勤老师 《统计遗传学暑期学校教材》 中国农业大学 2019.07 宁超公众号:Python与数量遗传学 方差组分估计之约束最大似然