使用KEGG通路的基因列表进行单细胞GSEA GSVA分析的过程,我们需要遵循以下步骤:
KEGGREST
或biomaRt
,来查询KEGG数据库并检索特定通路的基因列表。GSVA
包对单细胞数据执行基因集变异分析(GSVA),根据KEGG通路的基因列表评估每个单细胞样本的通路活性。今天我们主要关注第一步,如何获取KEGG通路的基因列表?
1. 表达矩阵
2. 基因集
表达矩阵容易获得,但是如果我们想做kegg数据库的通路分析怎么办?如何获取kegg的通路列表?
library(msigdbr)
genesets = msigdbr(species = "Homo sapiens" ) #msigdbr提供多个物种的基因集数据
# View(msigdbr_collections()) #查看msigdbr包中所有的基因集
unique(genesets$gs_subcat) # 有多个数据库来源的基因集可选,这里选用KEGG
genesets <- subset(genesets, gs_subcat=="CP:KEGG", select = c("gs_name", "gene_symbol"))
unique(genesets$gs_name) #查看有多少条通路(186个)
方法二 使用 KEGGREST
#BiocManager::install("KEGGREST")
#BiocManager::install("EnrichmentBrowser")
library("KEGGREST")
library("EnrichmentBrowser")
KEGGREST:: listDatabases()
KEGGREST::keggList(database ='kegg')
keggList("organism") ## returns the list of KEGG organisms with
#step2: check and obtain a list of entry identifiers (in this case: sar) and associated definition for a given database or a given set of database entries.
res <- keggList("pathway", "hsa") ## returns the list of human pathways
length(res)
res=as.data.frame(res)
head(res)
#step 3: download the pathways of that organism:
hsapathway <- downloadPathways("hsa")
head(hsapathway)
idTypes(org = 'hsa' )
#step 4: retrieve gene sets for an organism from databases such as GO and KEGG:
hsa_kegg_genesets <- getGenesets(org = "hsa", db = "kegg",
gene.id.type = "SYMBOL",
cache = TRUE, return.type="list")
#step5: Parse and write the gene sets to a flat text file in GMT format for other pathway enrichment analysis programs (e.g., GSEA):
writeGMT(hsa_kegg_genesets, gmt.file = "kegg_hsa_kegg_genesets_gmt")
save(hsa_kegg_genesets,file = "~/heart_muscle/hsa_kegg_genesets.rds")
我们可以看到这里的kegg数据集合有357个
library(GSVA);print(getwd()) load("~/heart_muscle/hsa_kegg_genesets.rds") genesets_kegg=hsa_kegg_genesets print(length(hsa_kegg_genesets)) #kegg--- gssea.res <- gsva(expr, genesets_kegg [1:50], method="ssgsea",kcdf="Poisson",min.sz > 3,max.sz=200 ,parallel.sz=10 ) saveRDS(gssea.res, paste0(new_dir,file_name,"_gssea.res_kegg.rds" ) ) gssea.df <- data.frame(Genesets=rownames(gssea.res),gssea.res, check.names = F) write.csv(gssea.df, paste0(new_dir,file_name,"gssea_res_kegg.csv" ), row.names = F) # ssgsea----- #library(GSVA);print(getwd()) #gssea.res <- gsva(expr, genesets_GO [1:50], method="ssgsea",kcdf="Poisson",min.sz > 3,max.sz=200 ,parallel.sz=10 ) #,parallel.sz=10 #saveRDS(gssea.res, paste0(new_dir,file_name,"_gssea.res_go.rds" ) ) #gssea.df <- data.frame(Genesets=rownames(gssea.res), gssea.res, check.names = F) #write.csv(gssea.df, paste0(new_dir,file_name,"gssea_res_go.csv"), row.names = F) #print("done-------");print(getwd())
这里的expr是行为基因列为样本的 表达矩阵。
参考:https://biobeat.wordpress.com/category/r/https://www.researchgate.net/post/How_i_can_get_a_list_of_KEGG_pathways_and_its_list_of_genes
分析完成之后,可以使用新得到的通路矩阵进行差异分析。下期见~
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。