require(devtools)
devtools::install_version('flexmix', '2.3-13')
devtools::install_github('hms-dbmi/scde', build_vignettes = FALSE)
library(scde)
data(pollen)
# remove poor cells and genes
cd <- clean.counts(pollen)
# check the final dimensions of the read count matrix
dim(cd)
将样本和组别转换成颜色
x <- gsub("^Hi_(.*)_.*", "\\1", colnames(cd))
l2cols <- c("coral4", "olivedrab3", "skyblue2", "slateblue3")[as.integer(factor(x, levels = c("NPC", "GW16", "GW21", "GW21+3")))]
为每个细胞构建错误模型。使用由knn.error.models()
实现的k最近邻模型拟合过程。这个knn在测试数据集已经完成,这里并不运行,可直接导入
# EVALUATION NOT NEEDED
#knn <- knn.error.models(cd, k = ncol(cd)/4, n.cores = 1, min.count.threshold = 2, min.nonfailed = 5, max.model.plots = 10)
data(knn)
varinfo <- pagoda.varnorm(knn, counts = cd, trim = 3/ncol(cd), max.adj.var = 5, n.cores = 1, plot = TRUE)
# list top overdispersed genes
sort(varinfo$arv, decreasing = TRUE)[1:10]
varinfo <- pagoda.subtract.aspect(varinfo, colSums(cd[, rownames(knn)]>0))
为了检测单个细胞群体中显著的差异通路,pagoda识别在统计上显著表现协同变化的通路和基因集。对于每个基因集,测试了由第一个主成分解释的方差是否显著超过背景期望。既可以测试预先定义的基因集,也可以测试de novo基因集。
library(org.Hs.eg.db)
# translate gene names to ids
ids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))
rids <- names(ids); names(rids) <- ids
# convert GO lists from ids to gene names
gos.interest <- unique(c(ls(org.Hs.egGO2ALLEGS)[1:100],"GO:0022008","GO:0048699", "GO:0000280"))
go.env <- lapply(mget(gos.interest, org.Hs.egGO2ALLEGS), function(x) as.character(na.omit(rids[x])))
go.env <- clean.gos(go.env) # remove GOs with too few or too many genes
go.env <- list2env(go.env) # convert to an environment
#Now, we can calculate weighted first principal component magnitudes for each GO gene set in the provided environment.
pwpca <- pagoda.pathway.wPCA(varinfo, go.env, n.components = 1, n.cores = 1)
# evaluate the statistical significance of the observed overdispersion for each GO gene set.
df <- pagoda.top.aspects(pwpca, return.table = TRUE, plot = TRUE, z.score = 1.96)
image.png
还可以测试在给定数据集内其表达高度相关的“de novo”基因集。接下来的步骤将在数据中确定“de novo”基因簇,并建立基因集加权主成分大小期望的背景模型。并去掉离群值
clpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 1, plot = TRUE)
image.png
df <- pagoda.top.aspects(pwpca, clpca, return.table = TRUE, plot = TRUE, z.score = 1.96)
head(df)
image.png
image.png
# get full info on the top aspects
tam <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
# determine overall cell clustering
hc <- pagoda.cluster.cells(tam, varinfo)
# combine pathways that are driven by the same sets of genes:
tamr <- pagoda.reduce.loading.redundancy(tam, pwpca, clpca)
#combine aspects that show similar patterns
tamr2 <- pagoda.reduce.redundancy(tamr, distance.threshold = 0.9, plot = TRUE, cell.clustering = hc, labRow = NA, labCol = NA, box = TRUE, margins = c(0.5, 0.5), trim = 0)
image.png
列代表细胞,行聚类代表相似的通路,绿色到橘色代表从低到高的加权PCA的分数,
#While each row here represents a cluster of pathways, the row names are assigned to be the top overdispersed aspect in each cluster.
col.cols <- rbind(groups = cutree(hc, 3))
pagoda.view.aspects(tamr2, cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20), col.cols = rbind(l2cols))
image.png
欢迎关注微信公众号~
参考:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4772672/