对应原版教程第14章http://bioconductor.org/books/release/OSCA/overview.html
单细胞的多组对照设计(例如正常组与给药组)可以为细胞类型水平比较提供以往Bulk RNA-seq分析所不能达到的精度。对此一般有两种进阶分析思路:
笔记要点
#已完成质控、聚类、批次矫正、细胞类型注释
merged
table(merged$tomato,merged$pool)
# 3 4 5
# FALSE 1047 3097 6829
# TRUE 2411 3007 4544
library(scater)
#同一聚类图谱中两组样本的细胞数目分布
head(table(colLabels(merged), merged$tomato))
# FALSE TRUE
# 1 431 322
# 2 541 404
# 3 335 250
# 4 1257 979
# 5 610 456
# 6 504 433
#观察细胞类型注释情况
by.label <- table(colLabels(merged), merged$celltype.mapped)
pheatmap::pheatmap(log2(by.label+1), color=viridis::viridis(101))
summed <- aggregateAcrossCells(merged,
id=colData(merged)[,c("celltype.mapped", "sample")])
summed
#15179 192 即6个样本的32种细胞类型的15179个基因表达情况
#以 Mesenchyme 基因为例
label <- "Mesenchyme" #间质细胞
current <- summed[,label==summed$celltype.mapped]
head(counts(current))
# [,1] [,2] [,3] [,4] [,5] [,6]
# Xkr4 2 0 0 0 3 0
# Rp1 0 0 1 0 0 0
# Sox17 7 0 3 0 14 9
# Mrpl15 1420 271 1009 379 1578 749
# Gm37988 1 0 0 0 0 0
# Rgs20 3 0 1 1 0 0
colData(current)[,c("sample","tomato","pool","celltype.mapped")]
# DataFrame with 6 rows and 4 columns
# sample tomato pool celltype.mapped
# <integer> <logical> <integer> <character>
# 1 5 TRUE 3 Mesenchyme
# 2 6 FALSE 3 Mesenchyme
# 3 7 TRUE 4 Mesenchyme
# 4 8 FALSE 4 Mesenchyme
# 5 9 TRUE 5 Mesenchyme
# 6 10 FALSE 5 Mesenchyme
# follow edgeR的差异分析流程,构建DGElist对象
library(edgeR)
y <- DGEList(counts(current), samples=colData(current))
y
3 advantages of "pseudo-bulk"
接下面的流程均以6个样本的Mesenchyme
间质细胞 Bulk RNA-seq表达矩阵为例进行分析。
discarded <- current$ncells < 10
y <- y[,!discarded]
summary(discarded) #如下没有样本被过滤掉
# Mode FALSE
# logical 6
然后进一步过滤低表达的基因,其在大部分样本里均低于最低阈值的表达水平。
keep <- filterByExpr(y, group=current$tomato)
y <- y[keep,]
summary(keep)
# Mode FALSE TRUE
#logical 83 5815
y <- calcNormFactors(y)
y
# tomato 为分组情况
# pool 为批次情况
y$samples[,c("tomato","pool")]
# tomato pool
#Sample1 TRUE 3
#Sample2 FALSE 3
#Sample3 TRUE 4
#Sample4 FALSE 4
#Sample5 TRUE 5
#Sample6 FALSE 5
design <- model.matrix(~factor(pool) + factor(tomato), y$samples)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
res <- glmQLFTest(fit, coef=ncol(design))
topTags(res)
DEGs are defined as those with non-zero log-fold changes at a false discovery rate of 5%.
scran
包提供了自动计算所有细胞类型的差异分析的函数pseudoBulkDGE()
,结果保存为list对象。# Removing all pseudo-bulk samples with 'insufficient' cells.
summed.filt <- summed[,summed$ncells >= 10]
library(scran)
de.results <- pseudoBulkDGE(summed.filt,
label=summed.filt$celltype.mapped,
design=~factor(pool) + tomato,
coef="tomatoTRUE",
condition=summed.filt$tomato
)
#查看某一种细胞类型的差异分析结果
cur.results <- de.results[["Allantois"]]
cur.results[order(cur.results$PValue),]
#因为没有对比样本而未能差异分析的失败的细胞类型(之前的过滤步骤)
metadata(de.results)$failed
#[1] "Blood progenitors 1" "Caudal epiblast" "Caudal neurectoderm" "ExE ectoderm"
#[5] "Parietal endoderm" "Stripped"
is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
如下图,举例来说Mid1
基因在91%的细胞类型中均上调差异表达
#同理计算下调的common deg
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
# Finding all genes that are not remotely DE in all other labels.
remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
not.de <- remotely.de==0 | is.na(remotely.de)
not.de.other <- rowMeans(not.de[,colnames(not.de)!="Allantois"])==1
# Intersecting with genes that are DE inthe allantois.
unique.degs <- is.de[,"Allantois"]!=0 & not.de.other
unique.degs <- names(which(unique.degs))
# Inspecting the results.
de.allantois <- de.results$Allantois
de.allantois <- de.allantois[unique.degs,]
de.allantois <- de.allantois[order(de.allantois$PValue),]
de.allantois
另一种思路就是改变差异分析的零假设,直接寻找celltype-specific的基因
de.specific <- pseudoBulkSpecific(summed.filt,
label=summed.filt$celltype.mapped,
design=~factor(pool) + tomato,
coef="tomatoTRUE",
condition=summed.filt$tomato
)
cur.specific <- de.specific[["Allantois"]]
cur.specific <- cur.specific[order(cur.specific$PValue),]
cur.specific
# sizeFactors(summed.filt) <- NULL
plotExpression(logNormCounts(summed.filt),
features="Rbp4",
x="tomato", colour_by="tomato",
other_fields="celltype.mapped") +
facet_wrap(~celltype.mapped)
Tal1ChimeraData
,按照是否敲除Tal1
基因分为WT与KO两组。在已经完成细胞类型注释的基础上,对neural crest
神经嵴细胞进行差异分析。但是根据差异分析结果,发现包含大量的hemoglobin血红蛋白基因(下调)。这是有问题的,因为该细胞不会表达相关基因,即可能在其中一组(WT)中存在明显的ambient RNA干扰。library(MouseGastrulationData)
sce.tal1 <- Tal1ChimeraData()
library(scuttle)
rownames(sce.tal1) <- uniquifyFeatureNames(
rowData(sce.tal1)$ENSEMBL,
rowData(sce.tal1)$SYMBOL
)
summed.tal1 <- aggregateAcrossCells(sce.tal1,
ids=DataFrame(sample=sce.tal1$sample,
label=sce.tal1$celltype.mapped)
)
summed.tal1$block <- summed.tal1$sample %% 2 == 0 # Add blocking factor.
# Subset to our neural crest cells.
summed.neural <- summed.tal1[,summed.tal1$label=="Neural crest"]
# Standard edgeR analysis, as described above.
res.neural <- pseudoBulkDGE(summed.neural,
label=summed.neural$label,
design=~factor(block) + tomato,
coef="tomatoTRUE",
condition=summed.neural$tomato)
summarizeTestsPerLabel(decideTestsPerLabel(res.neural))
# Summary of the direction of log-fold changes.
tab.neural <- res.neural[[1]]
tab.neural <- tab.neural[order(tab.neural$PValue),]
head(tab.neural, 10)
library(DropletUtils)
#定义一个空列表,用于储存每个样本的sample ambient RNA
ambient <- vector("list", ncol(summed.neural))
# Looping over all raw (unfiltered) count matrices and
# computing the ambient profile based on its low-count barcodes.
# Turning off rounding, as we know this is count data.
for (s in seq_along(ambient)) {
#获取每一个样本还没有过滤空液滴的最原始表达矩阵
raw.tal1 <- Tal1ChimeraData(type="raw", samples=s)[[1]]
#预估每一个样本的ambient情况
ambient[[s]] <- estimateAmbience(counts(raw.tal1),
good.turing=FALSE, round=FALSE)
}
# Cleaning up the output for pretty printing.
#合并所有样本的ambient effect
ambient <- do.call(cbind, ambient)
colnames(ambient) <- seq_len(ncol(ambient))
rownames(ambient) <- uniquifyFeatureNames(
rowData(raw.tal1)$ENSEMBL,
rowData(raw.tal1)$SYMBOL
)
head(ambient)
## 1 2 3 4
## Xkr4 1 0 0 0
## Gm1992 0 0 0 0
## Gm37381 1 0 1 0
## Rp1 0 1 0 1
## Sox17 76 76 31 53
## Gm37323 0 0 0 0
#然后据此预测过滤后的表达矩阵的可能受污染的基因表达比例
max.ambient <- maximumAmbience(counts(summed.neural),
ambient, mode="proportion")
head(max.ambient)
## [,1] [,2] [,3] [,4]
## Xkr4 NaN NaN NaN NaN
## Gm1992 NaN NaN NaN NaN
## Gm37381 NaN NaN NaN NaN
## Rp1 NaN NaN NaN NaN
## Sox17 0.1775 0.1833 0.468 1
## Gm37323 NaN NaN NaN NaN
#最后将样本平均污染比例超过10%的基因认为是不合格基因,将其从DEG列表中过滤掉
contamination <- rowMeans(max.ambient, na.rm=TRUE)
non.ambient <- contamination <= 0.1
okay.genes <- names(non.ambient)[which(non.ambient)]
tab.neural2 <- tab.neural[rownames(tab.neural) %in% okay.genes,]
table(Direction=tab.neural2$logFC > 0, Significant=tab.neural2$FDR <= 0.05)
head(tab.neural2, 10)
#根据结果可以看到原本sig DEG的血红蛋白基因已经被过滤掉了。
在上述方法中,得到ambient
后,如果知道其中某些基因在样本细胞中一定是不表达的,作为阴性对照参考,可提高预估的精度。
#血红蛋白基因一定是不表达的
is.hbb <- grep("^Hb[ab]-", rownames(summed.neural))
alt.prop <- controlAmbience(counts(summed.neural), ambient,
features=is.hbb, mode="proportion")
alt.non.ambient <- rowMeans(alt.prop, na.rm=TRUE) <= 0.1
okay.genes <- names(alt.non.ambient)[which(alt.non.ambient)]
tab.neural4 <- tab.neural[rownames(tab.neural) %in% okay.genes,]
head(tab.neural4)
第二种思路:上述的方法的前提是提供有未过滤空液滴的原始表达矩阵,但对于挖掘公共来源的单细胞表达矩阵一般都是过滤后的,不能够提供可以参考的ambient profile。所以再推测ambient profile是很难的。一种想法是,即假设ambient RNA对所有细胞类型的影响都是相同的,所以specific-common DEG是很值得被怀疑的,但也存在很多问题。具体可参看原教程,这里不多阐述了。
细胞类型丰度的差异分析是指对于同一种细胞类型的数目在不同分组中是否有显著的差异。基本流程类似上面的DE pipeline,只是表达矩阵(列为样本细胞类型,行名为基因,值为基因表达水平)变成了细胞丰度矩阵(列为样本,行为细胞类型,值为细胞组成数目),同样采用 edgeR
pipeline。
#计算细胞类型数目总和
abundances <- table(merged$celltype.mapped, merged$sample)
abundances <- unclass(abundances)
head(abundances)
##
## 5 6 7 8 9 10
## Allantois 97 15 139 127 318 259
## Blood progenitors 1 6 3 16 6 8 17
## Blood progenitors 2 31 8 28 21 43 114
## Cardiomyocytes 85 21 79 31 174 211
## Caudal Mesoderm 10 10 9 3 10 29
## Caudal epiblast 2 2 0 0 22 45
#构建DEGlist对象
extra.info <- colData(merged)[match(colnames(abundances), merged$sample),]
y.ab <- DGEList(abundances, samples=extra.info)
#y.ab$samples$sizeFactor
#过滤低丰度的细胞类型
keep <- filterByExpr(y.ab, group=y.ab$samples$tomato)
y.ab <- y.ab[keep,]
#差异分析
design <- model.matrix(~factor(pool) + factor(tomato), y.ab$samples)
y.ab <- estimateDisp(y.ab, design, trend="none")
fit.ab <- glmQLFit(y.ab, design, robust=TRUE, abundance.trend=FALSE)
res <- glmQLFTest(fit.ab, coef=ncol(design))
summary(decideTests(res))
## factor(tomato)TRUE
## Down 1
## NotSig 22
## Up 1
topTags(res)
上述流程中,未执行DE里的
calcNormFactoors()
这一步骤;仅根据DEGlist计算library size进行统一文库大小。这可能会引起composition effect,即一种细胞类型的“高表达”在同一文库大小情况下,势必会引起其它细胞类型丰度的非正常减小。当然可以使用calcNormFactoors()
进行矫正,但这需要假设大部分细胞类型的丰度没有显著差异,这对于DA分析是比较困难的。可以通过设定较高的DEG阈值、不考虑明显高丰度的细胞类型进行一定程度的消减。