这篇文章的目的是演示Seurat+ORA+GSEA的分析流程,重点是了解每一步分析对象的数据结构,正确的数据结构才能保证函数正常运行,用的是pbmc3k数据,下载地址是:https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz。
本文侧重于利用别人的测序数据进行分析,查找自己感兴趣的marker,或者看自己感兴趣的marker在某类细胞中的功能注释或通路分析。
下载表达矩阵后,用Seurat流程拿到差异gene,用于后续ORA和GSEA分析。
#加载R包
library(dplyr)
library(Seurat)
library(patchwork)
#读入数据,文件夹中有三个文件:cellbarcode(细胞名),features(基因名),counts(原始表达矩阵)
pbmc.data <- Read10X(data.dir = "/home/lei/share/download/filtered_gene_bc_matrices/hg19/")
#查看读入的数据格式,是一个包含了基因名,细胞名,表达矩阵等信息的list
View(pbmc.data)
head(pbmc.data)[1:5,1:5]
注意表达矩阵结构,行是基因,列是细胞
#查看给定基因在前30个细胞中的表达情况,其中的“.”表示没有在该细胞中检测到该基因
pbmc.data[c("CD3D","TCL1A","MS4A1"),1:30]
## 3 x 30 sparse Matrix of class "dgCMatrix"
## [[ suppressing 30 column names 'AAACATACAACCAC-1', 'AAACATTGAGCTAC-1', 'AAACATTGATCAGC-1' ... ]]
##
## CD3D 4 . 10 . . 1 2 3 1 . . 2 7 1 . . 1 3 . 2 3 . . . . . 3 4 1 5
## TCL1A . . . . . . . . 1 . . . . . . . . . . . . 1 . . . . . . . .
## MS4A1 . 6 . . . . . . 1 1 1 . . . . . . . . . 36 1 2 . . 2 . . . .
#Seurat使用稀疏矩阵结构来存储表达矩阵,可以显著减少数据所占内存和运行速度
#使用object.size()函数查看对象所占内存
object.size(as.matrix(pbmc.data))#普通矩阵
## 709591472 bytes
object.size(pbmc.data)#稀疏矩阵
## 29905192 bytes
object.size(as.matrix(pbmc.data))/object.size(pbmc.data)#比值,可以看到是20多倍
## 23.7 bytes
#创建seurat对象
pbmc.data@Dimnames[[1]] <- gsub('_','-',pbmc.data@Dimnames[[1]])
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
#查看seurat对象
pbmc
#质控,计算每个细胞中线粒体基因比例
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
#质控指标存储在meta.data中
head(pbmc@meta.data,5)#查看前5个细胞的信息
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759
## AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958
## AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363
## AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845
## AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898
#小提琴图展示每个细胞的基因数量,counts数及线粒体基因的比例
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
#散点图查看counts数与线粒体基因比例和基因数量的相关性
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
#设定筛选条件,每个细胞表达的基因数量在200到2500之间,线粒体基因比例小于5%
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc)
pbmc <- ScaleData(pbmc,vars.to.regress = c('nCount_RNA','percent.mt'))
pbmc <- RunPCA(pbmc)
pbmc <- FindNeighbors(pbmc,dims = 1:15)
pbmc <- FindClusters(pbmc,resolution = 0.5)
pbmc <- RunUMAP(pbmc,dims = 1:15)
pbmc <- RunTSNE(pbmc)
#counts数据结构
pbmc@assays$RNA@counts[10:15,10:12]
## 6 x 3 sparse Matrix of class "dgCMatrix"
## AAACGCTGTTTCTG-1 AAACTTGAAAAACG-1 AAACTTGATCCAGA-1
## HES4 . . .
## RP11-54O7.11 . . .
## ISG15 3 . .
## AGRN . . .
## C1orf159 . . .
## TNFRSF18 . . .
#归一化后的counts数据结构
pbmc@assays$RNA@data[10:15,10:12]
## 6 x 3 sparse Matrix of class "dgCMatrix"
## AAACGCTGTTTCTG-1 AAACTTGAAAAACG-1 AAACTTGATCCAGA-1
## HES4 . . .
## RP11-54O7.11 . . .
## ISG15 3.339271 . .
## AGRN . . .
## C1orf159 . . .
## TNFRSF18 . . .
#标准化后的数据结构
pbmc@assays$RNA@scale.data[10:15,10:12]
## AAACGCTGTTTCTG-1 AAACTTGAAAAACG-1 AAACTTGATCCAGA-1
## TNFRSF25 -0.18365965 -0.31205438 -0.27244539
## TNFRSF9 -0.02616392 -0.06534775 -0.08915883
## CTNNBIP1 -0.17422289 -0.38556780 -0.21931155
## RBP7 -0.40269546 -0.27038105 -0.12693721
## SRM -0.22321864 -0.49858375 -0.42106807
## UBIAD1 -0.18551867 -0.21113706 -0.21219280
#查看分群情况
DimPlot(pbmc,label = T,group.by = "seurat_clusters")
#查看各群细胞数量
table(pbmc$seurat_clusters)
##
## 0 1 2 3 4 5 6 7
## 1165 473 342 287 164 161 32 14
#查找每群细胞的markers
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
#查找两种细胞类型的差异基因
cluster3.markers <- FindMarkers(pbmc, ident.1 = 1, ident.2 = 4, min.pct = 0.25)
#查看markers列表
head(pbmc.markers)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## LDHB 1.106642e-244 1.6969018 0.929 0.475 1.517649e-240 0 LDHB
## RPS12 6.869965e-230 0.8624665 1.000 0.988 9.421470e-226 0 RPS12
## RPS25 1.473786e-200 0.9020696 0.998 0.967 2.021150e-196 0 RPS25
## CD3D 1.083324e-198 1.5522326 0.876 0.239 1.485670e-194 0 CD3D
## RPS6 6.829972e-187 0.7239363 1.000 0.993 9.366624e-183 0 RPS6
## RPS27 1.043972e-186 0.7912906 0.999 0.989 1.431704e-182 0 RPS27
head(cluster3.markers)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## FCGR3A 4.014016e-102 -3.830078 0.121 0.970 5.504821e-98
## LYZ 1.728178e-74 2.515900 1.000 0.988 2.370023e-70
## S100A9 6.419857e-66 3.271117 0.996 0.884 8.804192e-62
## S100A8 2.200374e-65 3.732977 0.975 0.524 3.017593e-61
## IFITM2 6.957396e-65 -2.148189 0.674 0.994 9.541373e-61
## RHOC 1.079357e-64 -2.293654 0.159 0.841 1.480230e-60
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
#为每个基因添加对应的ENTREZID
cluster3.markers$gene <- rownames(cluster3.markers)
ids=bitr(cluster3.markers$gene,'SYMBOL','ENTREZID','org.Hs.eg.db')
#合并数据,cluser3.markers中没有ENTREZID的基因将被过虑掉
cluster3.markers=merge(cluster3.markers,ids,by.x='gene',by.y='SYMBOL')
#查看数据结构
head(cluster3.markers)
## gene p_val avg_log2FC pct.1 pct.2 p_val_adj ENTREZID
## 1 ABI3 1.244277e-32 -1.4849311 0.218 0.726 1.706402e-28 51225
## 2 ABRACL 1.164678e-10 -0.7666557 0.211 0.494 1.597239e-06 58527
## 3 ACAA1 1.066596e-07 -0.7775647 0.163 0.390 1.462730e-03 30
## 4 ACO2 3.976174e-07 -0.4265744 0.087 0.250 5.452925e-03 50
## 5 ACP1 2.016174e-01 0.2548473 0.195 0.262 1.000000e+00 52
## 6 ACTB 5.300991e-23 -0.5262499 0.998 0.994 7.269779e-19 60
#将基因按照avg_log2FC的大小进行降序排列
cluster3.markers <- cluster3.markers[order(cluster3.markers$avg_log2FC,decreasing = T),]
#生成仅含有ENTREZID名字和avg_log2FC值的gene list
cluster3.markers_list <- as.numeric(cluster3.markers$avg_log2FC)
names(cluster3.markers_list) <- cluster3.markers$ENTREZID
head(cluster3.markers_list)
## 6279 6280 3957 4069 929 64231
## 3.732977 3.271117 2.834796 2.515900 2.343899 2.306412
#筛选差异较大的基因集
cluster3_de <- names(cluster3.markers_list)[abs(cluster3.markers_list) > 1]
head(cluster3_de)
## [1] "6279" "6280" "3957" "4069" "929" "64231"
#go富集
cluster3_ego <- enrichGO(cluster3_de, OrgDb = "org.Hs.eg.db", ont="BP", readable=TRUE)
head(cluster3_ego)
## ID Description GeneRatio
## GO:0050727 GO:0050727 regulation of inflammatory response 15/52
## GO:0050729 GO:0050729 positive regulation of inflammatory response 10/52
## GO:1990266 GO:1990266 neutrophil migration 8/52
## GO:0097530 GO:0097530 granulocyte migration 8/52
## GO:0097529 GO:0097529 myeloid leukocyte migration 9/52
## GO:0030593 GO:0030593 neutrophil chemotaxis 7/52
## BgRatio pvalue p.adjust qvalue
## GO:0050727 425/18866 3.199629e-13 5.868120e-10 3.930492e-10
## GO:0050729 158/18866 1.489985e-11 1.366316e-08 9.151645e-09
## GO:1990266 122/18866 1.438531e-09 8.794218e-07 5.890405e-07
## GO:0097530 150/18866 7.409416e-09 3.199843e-06 2.143269e-06
## GO:0097529 222/18866 8.723673e-09 3.199843e-06 2.143269e-06
## GO:0030593 103/18866 1.286258e-08 3.931663e-06 2.633444e-06
## geneID
## GO:0050727 S100A8/S100A9/CCL3/S100A12/GPX1/GSTP1/GRN/IL1B/NFKBIA/SOD1/FCER1G/RPS19/CTSC/SIGLEC10/ADA
## GO:0050729 S100A8/S100A9/CCL3/S100A12/GRN/IL1B/NFKBIA/FCER1G/RPS19/CTSC
## GO:1990266 S100A8/S100A9/CCL3/S100A12/CD99/CSF3R/IL1B/FCER1G
## GO:0097530 S100A8/S100A9/CCL3/S100A12/CD99/CSF3R/IL1B/FCER1G
## GO:0097529 S100A8/S100A9/CCL3/S100A12/CD99/CSF3R/IL1B/FCER1G/RPS19
## GO:0030593 S100A8/S100A9/CCL3/S100A12/CSF3R/IL1B/FCER1G
## Count
## GO:0050727 15
## GO:0050729 10
## GO:1990266 8
## GO:0097530 8
## GO:0097529 9
## GO:0030593 7
#气泡图
dotplot(cluster3_ego, showCategory=10,title="cluster3 vs cluster5 GO")
#KEGG富集
cluster3_ekg <- enrichKEGG(gene= cluster3_de, organism = "hsa",pvalueCutoff = 0.05)
head(cluster3_ekg)
## ID Description GeneRatio BgRatio
## hsa04380 hsa04380 Osteoclast differentiation 7/37 128/8093
## hsa04662 hsa04662 B cell receptor signaling pathway 5/37 82/8093
## hsa04657 hsa04657 IL-17 signaling pathway 5/37 94/8093
## hsa05418 hsa05418 Fluid shear stress and atherosclerosis 5/37 139/8093
## hsa05140 hsa05140 Leishmaniasis 4/37 77/8093
## hsa04145 hsa04145 Phagosome 5/37 152/8093
## pvalue p.adjust qvalue
## hsa04380 1.457246e-06 0.0002142152 0.0001825393
## hsa04662 3.191591e-05 0.0023458191 0.0019989436
## hsa04657 6.171025e-05 0.0030238023 0.0025766737
## hsa05418 3.895623e-04 0.0115922330 0.0098780933
## hsa05140 3.942936e-04 0.0115922330 0.0098780933
## hsa04145 5.872082e-04 0.0143866016 0.0122592595
## geneID Count
## hsa04380 653361/3727/3553/4792/10859/11026/2214 7
## hsa04662 4792/10859/11026/8519/974 5
## hsa04657 6279/6280/3727/3553/4792 5
## hsa05418 653361/2950/3553/1514/3162 5
## hsa05140 653361/3553/4792/2214 4
## hsa04145 929/653361/1514/10376/2214 5
#气泡图
dotplot(cluster3_ekg, showCategory=10,title="cluster3 vs cluster5 KEGG")
GSEA需要使用整个gene list,这里进行kegg分析
cluster3_gsekg <- gseKEGG(cluster3.markers_list,organism = "hsa",pvalueCutoff = 0.05)
head(cluster3_gsekg)
#将富集结果按照NES绝对值降序排列
cluster3_gsekg_arrange <- arrange(cluster3_gsekg,desc(abs(NES)))
head(cluster3_gsekg_arrange) 60/695/2773/10627/8773/103910/10235/5499/7408/4067/5908/2207
## ID Description setSize enrichmentScore
## hsa03010 hsa03010 Ribosome 49 0.5923311
## hsa05171 hsa05171 Coronavirus disease - COVID-19 59 0.4951543
## hsa04657 hsa04657 IL-17 signaling pathway 10 0.8292810
## hsa04620 hsa04620 Toll-like receptor signaling pathway 10 0.7461506
## hsa04611 hsa04611 Platelet activation 17 -0.5652228
## NES pvalue p.adjust qvalues rank
## hsa03010 2.812372 1.656349e-08 1.556968e-06 1.481996e-06 212
## hsa05171 2.472121 8.968208e-07 4.215058e-05 4.012093e-05 212
## hsa04657 2.437697 3.126564e-06 9.796566e-05 9.324839e-05 38
## hsa04620 2.193332 2.185934e-04 5.136945e-03 4.889589e-03 38
## hsa04611 -1.927841 2.370955e-03 4.457395e-02 4.242762e-02 168
## leading_edge
## hsa03010 tags=92%, list=34%, signal=66%
## hsa05171 tags=81%, list=34%, signal=60%
## hsa04657 tags=70%, list=6%, signal=67%
## hsa04620 tags=50%, list=6%, signal=48%
## hsa04611 tags=71%, list=27%, signal=53%
## core_enrichment
## hsa03010 6137/6176/6205/6159/6161/6128/6222/6158/6175/6187/6202/6168/55052/6234/6143/23521/6138/6122/6188/6165/6142/6203/6209/6135/7311/6139/6193/6201/6181/6130/6217/6208/9349/6192/6227/25873/6146/6136/6134/2197/6125/6152/6191/11224/6169
## hsa05171 3553/4792/3725/6137/6176/6205/6159/6161/6128/6222/6158/6175/6187/6202/6168/6234/6143/23521/6138/6122/6188/6165/6142/6203/6209/6135/7311/6139/6193/6201/6181/6130/6217/6208/9349/6192/6227/25873/6146/6136/2353/6134/2197/6125/6152/6191/11224/6169
## hsa04657 6279/6280/3727/3553/4792/2354/3725
## hsa04620 929/6348/3553/4792/3725
## hsa04611 60/695/2773/10627/8773/103910/10235/5499/7408/4067/5908/2207
#作图
color <- c("#f7ca64", "#43a5bf", "#86c697", "#a670d6", "#ef998a")
gsekp1 <- gseaplot2(cluster3_gsekg_arrange, 1:5, color = color, pvalue_table=F, base_size=14)
gsekp2 <- upsetplot(cluster3_gsekg_arrange, n=5)
cowplot::plot_grid(gsekp1, gsekp2, rel_widths=c(1, .6), labels=c("A", "B"))
参考资料:
https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
(Wu et al. 2021)
Wu, Tianzhi, Erqiang Hu, Shuangbin Xu, Meijun Chen, Pingfan Guo, Zehan Dai, Tingze Feng, et al. 2021. “clusterProfiler 4.0: A Universal Enrichment Tool for Interpreting Omics Data.” The Innovation 2 (3): 100141. https://doi.org/10.1016/j.xinn.2021.100141.
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。