从我们生信技能树历年的几千个马拉松授课学员里面募集了一些优秀的创作者,某种意义来说是传承了我们生信技能树的知识整理和分享的思想!
今天的是三周合计15天的数据挖掘授课学员一点一滴整理的授课知识点笔记哦,还有互动练习题哈,欢迎大家点击文末的阅读原文去关注我们学员的公众号哦!
基因需要转录为RNA和翻译为蛋白质才能发挥其功能,因此,所谓基因的功能实际上是基因产物的功能。
其实除了GO、KEGG还有很多其他的数据库,如pubmed、wiki等等,可以在GSEA上浏览
功能注释:查询感兴趣的基因/基因集合参与哪些可能的生命过程,起到了什么作用
KEGG实际上为一个数据库的集合,包括以下四大类:
其中Systems information中的KEGG Pathway又包括七个子库:
一组基因的功能注释会得到大量的功能节点,需要知道哪些是真正功能被影响了的通路,就需要进行富集分析
富集分析方法通常是分析一组基因在某个功能结点上是否过出现(overpresentation)。如果某个通路出现了overpresentation,说明该通路的很多基因表达大大高于正常
统计学方法包括:
可视化结果可以用条形图、气泡图等表示
rm(list = ls())
options(stringsAsFactors = F)
library(clusterProfiler)
library(org.Hs.eg.db)
library(GSEABase)
library(ggplot2)
library(tidyverse)
## Error in download.KEGG.Path(species)
# https://github.com/YuLab-SMU/clusterProfiler/pull/471
getOption("clusterProfiler.download.method")
## [1] "wininet"
#R.utils::setOption("clusterProfiler.download.method",'auto')
options(clusterProfiler.download.method = "wininet")
#options(clusterProfiler.download.method = "wget")
getOption("clusterProfiler.download.method")
## [1] "wininet"
# 读取差异分析结果
load(file = "data/Step03-edgeR_nrDEG.Rdata")
ls()
## [1] "DEG_edgeR_symbol"
# 提取所有差异表达的基因名
DEG <- DEG_edgeR_symbol[DEG_edgeR_symbol$regulated!="normal",2]
head(DEG)
## [1] "CFTR" "KLHL13" "CYP26B1" "CFLAR" "MTMR7" "PDK4"
## ===GO数据库, 输出所有结果,后续可根据pvalue挑选结果
ego_CC <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="CC", pvalueCutoff= 1,qvalueCutoff= 1)
ego_MF <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="MF", pvalueCutoff= 1,qvalueCutoff= 1)
ego_BP <- enrichGO(gene=DEG, OrgDb= 'org.Hs.eg.db', keyType='SYMBOL', ont="BP", pvalueCutoff= 1,qvalueCutoff= 1)
p_BP <- barplot(ego_BP,showCategory = 10, label_format = 100) + ggtitle("Biological process")
p_CC <- barplot(ego_CC,showCategory = 10, label_format = 100) + ggtitle("Cellular component")
p_MF <- barplot(ego_MF,showCategory = 10, label_format = 100) + ggtitle("Molecular function")
plotc <- p_BP/p_CC/p_MF
plotc
ggsave('result/6.enrichGO.png', plotc, width = 10,height = 16)
ego_BP <- data.frame(ego_BP)
ego_CC <- data.frame(ego_CC)
ego_MF <- data.frame(ego_MF)
write.csv(ego_BP,'result/6.enrichGO_BP.csv')
write.csv(ego_CC,'result/6.enrichGO_CC.csv')
write.csv(ego_MF,'result/6.enrichGO_MF.csv')
## === KEGG
genelist <- bitr(gene=DEG, fromType="SYMBOL", toType="ENTREZID", OrgDb='org.Hs.eg.db')
genelist <- pull(genelist,ENTREZID)
ekegg <- enrichKEGG(gene = genelist, organism = 'hsa', pvalueCutoff = 1, qvalueCutoff = 1)
p1 <- barplot(ekegg, showCategory=10, label_format = 100)
p2 <- dotplot(ekegg, showCategory=10, label_format = 100)
plotc = p1/p2
plotc
ggsave('result/6.enrichKEGG.png', plot = plotc, width = 8, height = 10)
ekegg <- data.frame(ekegg)
write.csv(ekegg,'result/6.enrichKEGG.csv')
library(patchwork)
p <- (p_BP/p_CC)|(p_MF/p1)
p
ggsave('result/6.enrichKEGG_GO.png',plot = p,width = 16, height = 9)
## === 其他数据库通路
geneset <- read.gmt("data/MsigDB/v7.4/h.all.v7.4.symbols.gmt")
table(geneset$term)
##
## HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_HYPOXIA
## 200 200
## HALLMARK_CHOLESTEROL_HOMEOSTASIS HALLMARK_MITOTIC_SPINDLE
## 74 199
## HALLMARK_WNT_BETA_CATENIN_SIGNALING HALLMARK_TGF_BETA_SIGNALING
## 42 54
## HALLMARK_IL6_JAK_STAT3_SIGNALING HALLMARK_DNA_REPAIR
## 87 150
## HALLMARK_G2M_CHECKPOINT HALLMARK_APOPTOSIS
## 200 161
## HALLMARK_NOTCH_SIGNALING HALLMARK_ADIPOGENESIS
## 32 200
## HALLMARK_ESTROGEN_RESPONSE_EARLY HALLMARK_ESTROGEN_RESPONSE_LATE
## 200 200
## HALLMARK_ANDROGEN_RESPONSE HALLMARK_MYOGENESIS
## 100 200
## HALLMARK_PROTEIN_SECRETION HALLMARK_INTERFERON_ALPHA_RESPONSE
## 96 97
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_APICAL_JUNCTION
## 200 200
## HALLMARK_APICAL_SURFACE HALLMARK_HEDGEHOG_SIGNALING
## 44 36
## HALLMARK_COMPLEMENT HALLMARK_UNFOLDED_PROTEIN_RESPONSE
## 200 113
## HALLMARK_PI3K_AKT_MTOR_SIGNALING HALLMARK_MTORC1_SIGNALING
## 105 200
## HALLMARK_E2F_TARGETS HALLMARK_MYC_TARGETS_V1
## 200 200
## HALLMARK_MYC_TARGETS_V2 HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
## 58 200
## HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_XENOBIOTIC_METABOLISM
## 200 200
## HALLMARK_FATTY_ACID_METABOLISM HALLMARK_OXIDATIVE_PHOSPHORYLATION
## 158 200
## HALLMARK_GLYCOLYSIS HALLMARK_REACTIVE_OXYGEN_SPECIES_PATHWAY
## 200 49
## HALLMARK_P53_PATHWAY HALLMARK_UV_RESPONSE_UP
## 200 158
## HALLMARK_UV_RESPONSE_DN HALLMARK_ANGIOGENESIS
## 144 36
## HALLMARK_HEME_METABOLISM HALLMARK_COAGULATION
## 200 138
## HALLMARK_IL2_STAT5_SIGNALING HALLMARK_BILE_ACID_METABOLISM
## 199 112
## HALLMARK_PEROXISOME HALLMARK_ALLOGRAFT_REJECTION
## 104 200
## HALLMARK_SPERMATOGENESIS HALLMARK_KRAS_SIGNALING_UP
## 135 200
## HALLMARK_KRAS_SIGNALING_DN HALLMARK_PANCREAS_BETA_CELLS
## 200 40
geneset$term <- gsub(pattern = "HALLMARK_","", geneset$term)
geneset$term <- str_to_title(geneset$term)
my_path <- enricher(gene=DEG, pvalueCutoff = 1, qvalueCutoff = 1, TERM2GENE=geneset)
p1 <- barplot(my_path, showCategory=15, label_format = 100, color = "pvalue")
p1
ggsave("result/6.enrich_HALLMARK.png", plot = p1, width = 8, height = 7)
my_path <- data.frame(my_path)
write.csv(my_path,"result/6.enrich_HALLMARK.csv")
也可以自己定义基因集,做成GMT格式
原理:
Broad研究所在提出GSEA方法的同时还提供了一个基因集数据库——MSigdb。它从位置,功能,代谢途径,靶标结合等多种角度出发,构建出了许多的基因集合,并将其保存在MSigDB。
https://www.gsea-msigdb.org/gsea/msigdb/index.jsp
GSEA分析要求输入:
# 清空当前环境变量
rm(list = ls())
options(stringsAsFactors = F)
# 加载包
library(GSEABase)
library(clusterProfiler)
library(enrichplot)
library(ggplot2)
library(stats)
# 加载数据
load("data/Step03-edgeR_nrDEG.Rdata")
DEG <- DEG_edgeR_symbol
## 构造GSEA分析数据
# 去掉没有配对上symbol的行
DEG <- DEG[!is.na(DEG$SYMBOL),]
# 去掉重复行
DEG <- DEG[!duplicated(DEG$SYMBOL),]
geneList <- DEG$logFC
names(geneList) <- DEG$SYMBOL
head(geneList)
## TSPAN6 DPM1 SCYL3 FIRRM CFH FUCA2
## -0.38435667 0.19874192 0.02784654 -0.12127561 0.43597022 -0.24682279
geneList <- sort(geneList,decreasing = T)
head(geneList)
## ZBTB16 ANGPTL7 STEAP4 PRODH LGI3 FAM107A
## 7.149822 5.772975 5.266528 4.903309 4.750359 4.683636
tail(geneList)
## ADCY8 RAMP3 GRIN2A LRRTM2 CYP24A1 LINC00906
## -3.916104 -3.962113 -4.104942 -4.108347 -4.295652 -4.472767
# 选择gmt文件
geneset <- read.gmt("data/MsigDB/h.all.v7.5.1.symbols.gmt")
# 运行,输出全部结果
egmt <- GSEA(geneList, TERM2GENE=geneset, pvalueCutoff = 1)
#出点图
dotplot(egmt, label_format = 100)
#按p值出点图
dotplot(egmt,color="pvalue")
# 单个通路图
# 按照通路名
gseaplot2(egmt, "HALLMARK_ADIPOGENESIS",
title = "HALLMARK_ADIPOGENESIS")
# 如果enrichment score高的部分都集中在红色(上调)区域,则说明该通路是上调的
# 按照行数
gseaplot2(egmt, 5, color="red", pvalue_table = T)
#按第一到第十个出图,不显示p值
gseaplot2(egmt, 1:6, color="red")
# 保存结果
go_gsea <- as.data.frame(egmt@result)
write.csv(go_gsea,"result/6.gsea_go_fc.csv",row.names = F)
from 生信技能树