前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >RNA-seq 231023

RNA-seq 231023

原创
作者头像
素素
发布2023-10-31 21:28:02
3950
发布2023-10-31 21:28:02
举报
文章被收录于专栏:生信课程note+实验知识

title: "RNAseq231026"

output: html_document

date: "2023-10-31"


参考https://www.zhihu.com/people/gu_chen/posts?page=2

R Markdown

代码语言:text
复制
###环境设置
rm(list=ls())
options(stringsAsFactors = F) 
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(data.table) #多核读取文件
setwd("D:/huage1/rnaseq1026")
#### 对counts进行处理筛选得到表达矩阵 ####
a1 <- fread('./counts.txt',
            header = T,data.table = F)#载入counts,第一列设置为列名
colnames(a1)
代码语言:txt
复制
##  [1] "Geneid"                                                                      
##  [2] "Chr"                                                                         
##  [3] "Start"                                                                       
##  [4] "End"                                                                         
##  [5] "Strand"                                                                      
##  [6] "Length"                                                                      
##  [7] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207279_sorted.bam"
##  [8] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207280_sorted.bam"
##  [9] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207283_sorted.bam"
## [10] "/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/SRR12207284_sorted.bam"
代码语言:text
复制
counts <- a1[,7:ncol(a1)] #截取样本基因表达量的counts部分作为counts 
rownames(counts) <- a1$Geneid
colnames(counts) <- gsub('/public/home/jiezhang_gibh/hqn1/rnaseq1026/test/align/','', #删除样品名前缀
                         gsub('_sorted.bam','',  colnames(counts))) #删除样品名后缀


#### 导入或构建样本信息,  进行列样品名的重命名和分组####
b <- read.csv('./SraRunTable.txt')
b
代码语言:txt
复制
##           Run Assay.Type AvgSpotLen      Bases  BioProject
## 1 SRR12207279    RNA-Seq        295 4512474269 PRJNA645812
## 2 SRR12207280    RNA-Seq        296 6542603004 PRJNA645812
## 3 SRR12207283    RNA-Seq        300 4108168500 PRJNA645812
## 4 SRR12207284    RNA-Seq        300 4536921000 PRJNA645812
##      BioSample      Bytes cell_type_source Center.Name Consent
## 1 SAMN15517122 1855257229       stem cells         GEO  public
## 2 SAMN15517121 2616595592       stem cells         GEO  public
## 3 SAMN15517118 1777556258       stem cells         GEO  public
## 4 SAMN15517117 1699519747       stem cells         GEO  public
##   DATASTORE.filetype DATASTORE.provider               DATASTORE.region
## 1   sra,run.zq,fastq         s3,gs,ncbi gs.US,s3.us-east-1,ncbi.public
## 2   sra,run.zq,fastq         s3,gs,ncbi s3.us-east-1,ncbi.public,gs.US
## 3   sra,fastq,run.zq         gs,s3,ncbi s3.us-east-1,gs.US,ncbi.public
## 4   sra,fastq,run.zq         ncbi,s3,gs gs.US,ncbi.public,s3.us-east-1
##   Experiment genotype GEO_Accession..exp.  Instrument LibraryLayout
## 1 SRX8718088       WT          GSM4668511 HiSeq X Ten        PAIRED
## 2 SRX8718089       WT          GSM4668512 HiSeq X Ten        PAIRED
## 3 SRX8718092       WT          GSM4668515 HiSeq X Ten        PAIRED
## 4 SRX8718093       WT          GSM4668516 HiSeq X Ten        PAIRED
##   LibrarySelection  LibrarySource     Organism Platform
## 1             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 2             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 3             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
## 4             cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA
##            ReleaseDate          create_date version Sample.Name
## 1 2021-02-19T00:00:00Z 2020-07-13T09:00:00Z       1  GSM4668511
## 2 2021-02-19T00:00:00Z 2020-07-13T09:01:00Z       1  GSM4668512
## 3 2021-02-19T00:00:00Z 2020-07-13T08:58:00Z       1  GSM4668515
## 4 2021-02-19T00:00:00Z 2020-07-13T09:03:00Z       1  GSM4668516
##   source_name SRA.Study chip_antibody
## 1   RNA_mESCs SRP271532            NA
## 2   RNA_mESCs SRP271532            NA
## 3  RNA_EpiSCs SRP271532            NA
## 4  RNA_EpiSCs SRP271532            NA
代码语言:text
复制
name_list <- b$source_name[match(colnames(counts),b$Run)]; name_list  #选择所需要的样品信息列
代码语言:txt
复制
## [1] "RNA_mESCs"  "RNA_mESCs"  "RNA_EpiSCs" "RNA_EpiSCs"
代码语言:text
复制
class(name_list)
代码语言:txt
复制
## [1] "character"
代码语言:text
复制
##以上做完
nlgl <- data.frame(row.names=colnames(counts),
                   name_list=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2"),
                   group_list=c("naive","naive","primed","primed"))
name_list <- nlgl$name_list
group_list <- nlgl$group_list
gl <- data.frame(row.names=colnames(counts), #构建样品名与分组对应的数据框
                 group_list=group_list)
class(nlgl)
代码语言:txt
复制
## [1] "data.frame"
代码语言:text
复制
geneid_efflen <- subset(a1,select = c("Geneid","Length"))
colnames(geneid_efflen) <- c("geneid","efflen")  
efflen <- geneid_efflen[match(rownames(counts),
                              geneid_efflen$geneid),
                        "efflen"]
class(efflen)
代码语言:txt
复制
## [1] "integer"
代码语言:text
复制
counts2TPM <- function(count=count, efflength=efflen){
  RPK <- count/(efflength/1000)   #每千碱基reads (Reads Per Kilobase) 长度标准化
  PMSC_rpk <- sum(RPK)/1e6        #RPK的每百万缩放因子 (“per million” scaling factor ) 深度标准化
  RPK/PMSC_rpk              
}  

tpm <- as.data.frame(apply(counts,2,counts2TPM))
colSums(tpm)
代码语言:txt
复制
## SRR12207279 SRR12207280 SRR12207283 SRR12207284 
##       1e+06       1e+06       1e+06       1e+06
代码语言:text
复制
##新方法 合并
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(data.table) #多核读取文件

g2s <- fread('g2s_vm33_gencode.txt',header = F,data.table = F) #载入从gencode的gtf文件中提取的信息文件
colnames(g2s) <- c("geneid","symbol")

symbol <- g2s[match(rownames(counts),g2s$geneid),"symbol"] #匹配counts行名对应的symbol
table(duplicated(symbol))
代码语言:txt
复制
## 
## FALSE  TRUE 
## 56791   150
代码语言:text
复制
#FALSE  TRUE 
#55492   770 

##使用aggregate根据symbol列中的相同基因进行合并
代码语言:text
复制
counts <- aggregate(counts, by=list(symbol), FUN=sum)
counts <- column_to_rownames(counts,'Group.1')
#在 aggregate 函数的使用过程中, Group.1 列是通过对 counts 数据框中的计数进行求和而生成的临时列。 
dim(counts)
代码语言:txt
复制
## [1] 56791     4
代码语言:text
复制
#[1] 55492     4
tpm <- aggregate(tpm, by=list(symbol), FUN=sum)
tpm <- column_to_rownames(tpm,'Group.1')
#在第一句代码 tpm <- aggregate(tpm, by=list(symbol), FUN=sum) 中, aggregate 函数会根据 symbol 列对 tpm 数据框进行聚合操作,并将每个组的TPM值求和。但是,聚合操作会生成一个新的数据框,其中 symbol 列被重命名为 Group.1 ,而原始的行名可能会丢失。 
#为了保留原始的行名,第二句代码 tpm <- column_to_rownames(tpm,'Group.1') 将 Group.1 列的值作为新的行名,以确保每个基因的标识信息仍然保留在结果中。 
#因此,两句代码的组合将按照 symbol 分组并计算总和,然后使用 Group.1 列的值作为新的行名。 

#初步过滤低表达基因与保存counts数据
keep_feature <- rowSums(counts>1) >= 2
table(keep_feature)
代码语言:txt
复制
## keep_feature
## FALSE  TRUE 
## 42689 14102
代码语言:text
复制
counts_filt <- counts[keep_feature, ] 
tpm_filt <- tpm[keep_feature, ]
counts_raw=counts
counts=counts_filt
tpm=tpm_filt
save(counts_raw,counts,tpm,group_list,gl,file='./1.counts.Rdata')

#四、差异分析前的准备工作
#1.数据预处理,使样本间具有可比性
options(stringsAsFactors = F)
library(FactoMineR)
library(factoextra)  
library(tidyverse) # ggplot2 stringer dplyr tidyr readr purrr  tibble forcats
library(pheatmap)
library(DESeq2)
library(RColorBrewer)
#数据预处理  # (任选以下一种作为dat即可,主要是进行样本间归一化,使得样本具有可比性)
dat <- log2(tpm+1)
colnames(dat) <- c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
#2.boxplot查看样本的基因整体表达情况
### boxplot 查看样本的基因整体表达情况
p=boxplot(dat, col="blue", ylab="dat", main=" normalized data ",
        outline = F, notch = F)
代码语言:text
复制
dev.off()
代码语言:txt
复制
## null device 
##           1
代码语言:text
复制
#3查看聚类情况 hclust 图、距离热图、PCA图、差异基因热图、相关性热图
#聚类
sampleDists <- dist(t(dat))   #dist默认计算矩阵行与行的距离, 因此需要转置
sampleDistMatrix <- as.matrix(sampleDists)  
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)  #选取热图的颜色
p0 <- pheatmap::pheatmap(sampleDistMatrix,
                         fontsize=7,
                         clustering_distance_rows=sampleDists,
                         clustering_distance_cols=sampleDists,
                         angle_col=45,
                         col=colors)
代码语言:text
复制
ggsave(p0,filename = 'check_dist.png',width = 7.5,height =6)
dev.off()
代码语言:txt
复制
## null device 
##           1
代码语言:text
复制
#PCA图
dat.pca <- PCA(t(dat) , graph = F) 
percentVar <- get_eigenvalue(dat.pca)
pca <- fviz_pca_ind(dat.pca,
                    title = "Principal Component Analysis",
                    legend.title = "Groups",
                    geom.ind = c("point", "text"), 
                    pointsize = 1.5,
                    labelsize = 3,
                    col.ind = group_list, # 分组上色
                    axes.linetype=NA,  # remove axeslines
                    mean.point=F#去除分组中心点
) + 
  coord_fixed(ratio = 1)+  #坐标轴的纵横比
  xlab(paste0("PC1 (",round(percentVar[1,'variance.percent'],1),"%)")) +
  ylab(paste0("PC2 (",round(percentVar[2,'variance.percent'],1),"%)"))
pca
代码语言:text
复制
#heatmap检测——取500高表达基因 看一下样本相关性 
dat_500 <- dat[names(sort(apply(dat,1,mad),decreasing = T)[1:500]),]#取高表达量前500基因
M <- cor(dat_500)
gl$name_list <-nlgl$name_list
rownames(gl)<-gl$name_list
p2 <-pheatmap::pheatmap(M,
                        show_rownames = T,
                        angle_col=45,
                        fontsize=7,
                        annotation_col = gl) 
p2
代码语言:text
复制
ggsave(p2,filename = 'check_cor_top500new.png',width = 7.5,height =6)
代码语言:text
复制
#
####################### heatmap检测——取500差异大的基因 差异基因热图##########################################
cg <- names(tail(sort(apply(dat,1,sd)),500)) #取每一行的方差,从小到大排序,取最大的500个
p1 <- pheatmap::pheatmap(dat[cg, ],
                         show_colnames =T,show_rownames = F,
                         fontsize=7,
                         legend_breaks = -3:3,
                         #scale = "row",
                         angle_col=45,
                         annotation_col=gl) 
代码语言:text
复制
ggsave(p1,filename = 'check_heatmap_top500_sdnew.png',width = 7.5,height =6)
代码语言:text
复制
dev.off()
代码语言:txt
复制
## null device 
##           1
代码语言:text
复制
#差异分析 DESeq2
exp="primed"
ctr="naive"
colnames(counts)=c("mESCs_1","mESCs_2","EpiSCs_1","EpiSCs_2")
gl1 <-gl$group_list
group_list <- gl$name_list
dds <- DESeqDataSetFromMatrix(countData = counts,
                              colData = gl,
                              design = ~ group_list)

dds$group_list <- relevel(dds$group_list, ref = ctr)
keep <- rowSums(counts(dds)) >= 1.5*ncol(counts)
dds <- dds[keep,]
dds <- DESeq(dds,quiet = F) 
res <- results(dds,contrast=c("group_list", exp, ctr))
resOrdered <- res[order(res$padj),]
class(resOrdered)
代码语言:txt
复制
## [1] "DESeqResults"
## attr(,"package")
## [1] "DESeq2"
代码语言:text
复制
tempDEG <- as.data.frame(resOrdered)
DEG_DEseq2 <- na.omit(tempDEG)

#差异分析绘制 火山图和热图
#1.火山图的绘制
library(ggplot2)
library(pheatmap)
##筛选条件设置
log2FC_cutoff = log2(10)
padj_cutoff = 0.001
##选取差异分析结果
need_DEG <- DEG_DEseq2[,c(2,6)] #选取log2FoldChange, padj信息
colnames(need_DEG) <- c('log2FoldChange','padj') 

need_DEG$significance  <- as.factor(ifelse(need_DEG$padj < padj_cutoff & abs(need_DEG$log2FoldChange) > log2FC_cutoff,
                                           ifelse(need_DEG$log2FoldChange > log2FC_cutoff ,'UP','DOWN'),'NOT'))

title <- paste0(' Up :  ',nrow(need_DEG[need_DEG$significance =='UP',]) ,
                '\n Down : ',nrow(need_DEG[need_DEG$significance =='DOWN',]),
                '\n FoldChange >= ',round(2^log2FC_cutoff,3))

g <- ggplot(data=need_DEG, 
            aes(x=log2FoldChange, y=-log10(padj), 
                color=significance)) +
  #点和背景
  geom_point(alpha=0.4, size=1) +
  theme_classic()+ #无网格线
  #坐标轴
  xlab("log2 ( FoldChange )") + 
  ylab("-log10 ( P.adjust )") +
  #标题文本
  ggtitle( title ) +
  #分区颜色                  
  scale_colour_manual(values = c('blue','grey','red'))+ 
  #辅助线
  geom_vline(xintercept = c(-log2FC_cutoff,log2FC_cutoff),lty=4,col="grey",lwd=0.8) +
  geom_hline(yintercept = -log10(padj_cutoff),lty=4,col="grey",lwd=0.8) +
  #图例标题间距等设置
  theme(plot.title = element_text(hjust = 0.5), 
        plot.margin=unit(c(2,2,2,2),'lines'), #上右下左
        legend.title = element_blank(), #不显示图例标题
        legend.position="right")  #图例位置

ggsave(g,filename = 'volcano_padj.png',width =8,height =7.5)
代码语言:text
复制
#2.热图的绘制
gene_up <- rownames(need_DEG[with(need_DEG,log2FoldChange>log2FC_cutoff & padj<padj_cutoff),])
gene_down <- rownames(need_DEG[with(need_DEG,log2FoldChange< -log2FC_cutoff & padj<padj_cutoff),])
cg1 <- c(head(gene_up, 50),   #取前50 padj上下调基因名
        head(gene_down, 50))
cg2 <- na.omit(match(cg1,rownames(dat))) 
cg2
代码语言:txt
复制
##   [1]  3740 10121   131  2383  8147  8036  2357  9217  1899  3244  5806
##  [12]  5732  8893  4012  4852  8462 13189  6102  4017  5444  1409 12960
##  [23] 13436 10641  3974 10090 12177  7734  7873 10060 10127   588   889
##  [34] 13432  3576  8306  2405 14063 13735 12994  7930  3363  2389  6126
##  [45] 11482 10539  8824  7887  2394 13154  8254  9052  1243 12776  1067
##  [56]  1076  9567 11852  2080  3805  3050  2180   482  1537  9713  8113
##  [67] 10540  2994  9188   470  7715   242 13609  3232  9276   878  2107
##  [78]  8736 10270   562 11424  1848 12751   931  9172 12255  3542  3726
##  [89]  8097   380 12296  7640  4097  2818  3838  3921 13630  8898  5832
## [100] 13874
代码语言:text
复制
pheatmap::pheatmap(dat[cg2,],
                   show_colnames =T,
                   show_rownames = F,
                   #scale = "row",
                   fontsize = 7 ,
                   cluster_cols = T,
                   annotation_col=gl,
                   filename = 'heatmap_top50up&down_DEG.png')
代码语言:text
复制
#3.GO KEGG富集分析与enrichplot 设定pvalue是防止其被过度矫正,能够提供更多关于富集结果显著性的信息
log2FC_cutoff = log2(10)
pvalue_cutoff = 0.001
padj_cutoff = 0.001
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
library(ggnewscale)
# library(org.Hs.eg.db)
#BiocManager::install("org.Mm.eg.db")
library(org.Mm.eg.db)
library(DOSE)
library(pathview) 
class(DEG_DEseq2)
代码语言:txt
复制
## [1] "data.frame"
代码语言:text
复制
need_DEG18 <- DEG_DEseq2[,c(2,5,6)]  
head(need_DEG18)
代码语言:txt
复制
##               log2FoldChange        pvalue          padj
## Flnc                8.985284 3.700584e-168 4.733047e-164
## Peg10               6.585819 2.187517e-153 1.398917e-149
## 2010310C07Rik       7.431042 7.463596e-130 3.181980e-126
## Col1a2              9.694994 9.271884e-120 2.964685e-116
## Kcnj10              6.039922 3.968897e-118 1.015244e-114
## Klf4               -6.287716 1.096622e-114 2.337633e-111
代码语言:text
复制
colnames(need_DEG18) <- c('log2FoldChange','pvalue','padj')
gene_up=rownames(need_DEG18[with(need_DEG18,log2FoldChange > log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])
gene_down=rownames(need_DEG18[with(need_DEG18,log2FoldChange < -log2FC_cutoff & pvalue<pvalue_cutoff & padj<padj_cutoff),])

#### 转化基因名为entrez ID ####
gene_up_entrez <- as.character(na.omit(bitr(gene_up, #数据集
                                            fromType="SYMBOL", #输入格式
                                            toType="ENTREZID", # 转为ENTERZID格式
                                            OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_down_entrez <- as.character(na.omit(bitr(gene_down, #数据集
                                              fromType="SYMBOL", #输入格式
                                              toType="ENTREZID", # 转为ENTERZID格式
                                              OrgDb="org.Mm.eg.db")[,2])) #"org.Hs.eg.db" "org.Mm.eg.db"
gene_diff_entrez <- unique(c(gene_up_entrez ,gene_down_entrez ))
#利用clusterProfiler进行KEGG与GO富集
kegg_enrich_results <- enrichKEGG(gene  = gene_up_entrez,
                                  organism  = "mmu", #物种人类 hsa 小鼠mmu
                                  pvalueCutoff = 0.05,
                                  qvalueCutoff = 0.2)
kk_read <- DOSE::setReadable(kegg_enrich_results, 
                             OrgDb="org.Mm.eg.db", 
                             keyType='ENTREZID')#ENTREZID to gene Symbol
write.csv(kk_read@result,'KEGG_gene_up_enrichresults.csv') 
save(kegg_enrich_results, file ='KEGG_gene_up_enrichresults.Rdata')
go_enrich_results <- enrichGO(gene = gene_up_entrez,
                              OrgDb = "org.Mm.eg.db",
                              ont   = "ALL"  ,     #One of "BP", "MF"  "CC"  "ALL" 
                              pvalueCutoff  = 0.05,
                              qvalueCutoff  = 0.2,
                              readable      = TRUE)
write.csv(go_enrich_results@result, 'GO_gene_up_BP_enrichresults.csv') 
save(go_enrich_results, file ='GO_gene_up_enrichresults.Rdata')

genelist <- as.numeric(DEG_DEseq2[,2]) 
names(genelist) <- row.names(DEG_DEseq2)
# genelist ID转化
genelist_entrez <- genelist
names(genelist_entrez) <- bitr(names(genelist),
                               fromType="SYMBOL",toType="ENTREZID", 
                               OrgDb="org.Mm.eg.db")[,2]  
genelist_entrez <- genelist_entrez[!is.na(names(genelist_entrez))]

##查看与选择所需通路
kk_read@result$Description[1:10] #查看前10通路
代码语言:txt
复制
##  [1] "Proteoglycans in cancer - Mus musculus (house mouse)"         
##  [2] "Protein digestion and absorption - Mus musculus (house mouse)"
##  [3] "Focal adhesion - Mus musculus (house mouse)"                  
##  [4] "ECM-receptor interaction - Mus musculus (house mouse)"        
##  [5] "Amphetamine addiction - Mus musculus (house mouse)"           
##  [6] "Dopaminergic synapse - Mus musculus (house mouse)"            
##  [7] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"      
##  [8] "Hedgehog signaling pathway - Mus musculus (house mouse)"      
##  [9] "Relaxin signaling pathway - Mus musculus (house mouse)"       
## [10] "JAK-STAT signaling pathway - Mus musculus (house mouse)"
代码语言:text
复制
i=3 
select_pathway <- kk_read@result$ID[i] #选择所需通路的ID号

pathview(gene.data     = genelist_entrez,
         pathway.id    = select_pathway,
         species       = 'mmu' ,      # 人类hsa 小鼠mmu 
         kegg.native   = T,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件 
         new.signature = F, #pdf是否显示pathway标注
         limit         = list(gene=2.5, cpd=1) #图例color bar范围调整 
)
代码语言:text
复制
pathview(gene.data     = genelist_entrez,
         pathway.id    = select_pathway,
         species       = 'mmu' ,      # 人类hsa 小鼠mmu 
         kegg.native   = F,# TRUE输出完整pathway的png文件,F输出基因列表的pdf文件 
         new.signature = F, #pdf是否显示pathway标注
         limit         = list(gene=2.5, cpd=1) #图例color bar范围调整 
)
# kegg.native = T : 这个参数设置为 TRUE ,表示输出为KEGG原生的可视化图像,通常是PNG格式的图像。 
#kegg.native = F : 这个参数设置为 FALSE ,表示输出为基因列表的PDF文件,而不是完整的pathway的PNG文件。
代码语言:text
复制
#dotplot与barplot
barp <- barplot(go_enrich_results, font.size=14, showCategory=12)+
  theme(plot.margin=unit(c(1,1,1,1),'lines')) 
barp
代码语言:text
复制
ggsave(barp,filename = 'KEGGnew.png', width=10, height=10)
代码语言:text
复制
barp1 <- barplot(go_enrich_results, split= "ONTOLOGY", font.size =12,showCategory=6)+
  facet_grid(ONTOLOGY~., scale="free")+     
  theme(plot.margin=unit(c(1,1,1,1),'lines'))
barp1
代码语言:text
复制
ggsave(barp1,filename = 'GO.png', width=10, height=10)
dev.off
代码语言:txt
复制
## function (which = dev.cur()) 
## {
##     if (which == 1) 
##         stop("cannot shut down device 1 (the null device)")
##     .External(C_devoff, as.integer(which))
##     dev.cur()
## }
## <bytecode: 0x000001f5738bcfe0>
## <environment: namespace:grDevices>
代码语言:text
复制
#
dotp <- enrichplot::dotplot(go_enrich_results,font.size =14,showCategory = 12)+
  theme(legend.key.size = unit(10, "pt"),#调整图例大小
        plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
dotp
代码语言:text
复制
ggsave(dotp,filename = 'KEGGdot.png', width=10, height=10)
代码语言:text
复制
dotp1 <- enrichplot::dotplot(go_enrich_results,font.size =12,split = 'ONTOLOGY',showCategory = 6)+
  facet_grid(ONTOLOGY~., scale="free")+     
  theme(legend.key.size = unit(10, "pt"),#调整图例大小
        plot.margin=unit(c(1,1,1,1),'lines'))#调整四周留白大小
ggsave(dotp1,filename = 'GOdot.png', width=10, height=10)
代码语言:text
复制
#构建各通路下的基因网络
genelist <- as.numeric(DEG_DEseq2[,2]) 
names(genelist) <- row.names(DEG_DEseq2)

cnetp1 <- cnetplot(go_enrich_results,  foldChange = genelist,
                   showCategory = 8,
                   colorEdge = T,
                   node_label = 'all',
                   color_category ='steelblue')+
                   labs(x = "")+
                   labs(y = "")+
                   theme_bw()+
                    theme(panel.grid=element_blank())+
                    theme(panel.border = element_blank())+
                    theme(axis.text = element_blank()) +   ## 删去刻度标签
                    theme(axis.ticks = element_blank()) +   ## 删去刻度线
                    theme(panel.border = element_blank())   ## 删去外层边框
代码语言:text
复制
cnetp2 <- cnetplot(go_enrich_results,  foldChange = genelist,
                   showCategory = 8,
                   node_label = 'gene',
                   circular = T, 
                   colorEdge = T)+
                   labs(x = "")+
                   labs(y = "")+
                   theme_bw()+
                   theme(panel.grid=element_blank())+
                   theme(panel.border = element_blank())+
                   theme(axis.text = element_blank()) +   ## 删去刻度标签
                   theme(axis.ticks = element_blank()) +   ## 删去刻度线
                   theme(panel.border = element_blank())
ggsave(cnetp1,filename ='cnetplot.png', width =12,height =10)
ggsave(cnetp2,filename = 'cnetplot_cir.png', width =15,height =10)
代码语言:text
复制
#emapplot-Enrichment Map
pt <- pairwise_termsim(go_enrich_results)
emapp <- emapplot(pt,
                  showCategory = 30, 
                  node_label   = 'category')  # 'category', 'group', 'all' and 'none'.)
ggsave(emapp,filename = 'emapplot.png',width =10,height =10)
代码语言:text
复制
#GSEA分析
library(org.Mm.eg.db)
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(ggstatsplot)
## 物种设置
organism = 'mmu'    #  人类'hsa' 小鼠'mmu'   
OrgDb = 'org.Mm.eg.db'#人类"org.Hs.eg.db" 小鼠"org.Mm.eg.db"

#### 按照需要可选择不同的DEG方法数据集 ####
need_DEG <- DEG_DEseq2
need_DEG <- need_DEG[,c(2,5)] #选择log2FoldChange和pvalue(凑成数据框)

colnames(need_DEG) <- c('log2FoldChange','pvalue')
need_DEG$SYMBOL <- rownames(need_DEG)

##### 创建gsea分析的geneList(包含从大到小排列的log2FoldChange和ENTREZID信息)####
#转化id  
df <- bitr(rownames(need_DEG), 
           fromType = "SYMBOL",
           toType =  "ENTREZID",
           OrgDb = OrgDb) #人数据库org.Hs.eg.db 小鼠org.Mm.eg.db
need_DEG <- merge(need_DEG, df, by='SYMBOL')  #按照SYMBOL合并注释信息
geneList <- need_DEG$log2FoldChange
names(geneList) <- need_DEG$ENTREZID
geneList <- sort(geneList, decreasing = T)   #从大到小排序

##### gsea富集 ####
KEGG_kk_entrez <- gseKEGG(geneList     = geneList,
                          organism     = organism, #人hsa 鼠mmu
                          pvalueCutoff = 0.25)  #实际为padj阈值,可调整 
KEGG_kk <- DOSE::setReadable(KEGG_kk_entrez, 
                             OrgDb=OrgDb,
                             keyType='ENTREZID')#转化id             

GO_kk_entrez <- gseGO(geneList     = geneList,
                      ont          = "ALL",  # "BP"、"MF"和"CC"或"ALL"
                      OrgDb        = OrgDb,#人类org.Hs.eg.db 鼠org.Mm.eg.db
                      keyType      = "ENTREZID",
                      pvalueCutoff = 0.25)   #实际为padj阈值可调整
GO_kk <- DOSE::setReadable(GO_kk_entrez, 
                           OrgDb=OrgDb,
                           keyType='ENTREZID')#转化id 

save(KEGG_kk_entrez, GO_kk_entrez, file = "GSEA_result.RData")
代码语言:text
复制
##选取富集结果
kk_gse <- KEGG_kk
kk_gse_entrez <- KEGG_kk_entrez

###条件筛选 
#一般认为|NES|>1,NOM pvalue<0.05,FDR(padj)<0.25的通路是显著富集的
kk_gse_cut <- kk_gse[kk_gse$pvalue<0.05 & kk_gse$p.adjust<0.25 & abs(kk_gse$NES)>1]
kk_gse_cut_down <- kk_gse_cut[kk_gse_cut$NES < 0,]
kk_gse_cut_up <- kk_gse_cut[kk_gse_cut$NES > 0,]

#选择展现NES前几个通路 
down_gsea <- kk_gse_cut_down[tail(order(kk_gse_cut_down$NES,decreasing = T),10),]
up_gsea <- kk_gse_cut_up[head(order(kk_gse_cut_up$NES,decreasing = T),10),]
diff_gsea <- kk_gse_cut[head(order(abs(kk_gse_cut$NES),decreasing = T),10),]


#### 经典的GSEA图 
up_gsea$Description
代码语言:txt
复制
## [1] "ECM-receptor interaction - Mus musculus (house mouse)"        
## [2] "Protein digestion and absorption - Mus musculus (house mouse)"
## [3] "Hedgehog signaling pathway - Mus musculus (house mouse)"      
## [4] "Focal adhesion - Mus musculus (house mouse)"                  
## [5] "Proteoglycans in cancer - Mus musculus (house mouse)"         
## [6] "PI3K-Akt signaling pathway - Mus musculus (house mouse)"      
## [7] "Human papillomavirus infection - Mus musculus (house mouse)"
代码语言:text
复制
i=2
gseap1 <- gseaplot2(kk_gse,
                    up_gsea$ID[i],#富集的ID编号
                    title = up_gsea$Description[i],#标题
                    color = "red", #GSEA线条颜色
                    base_size = 18,#基础字体大小
                    rel_heights = c(1.5, 0.5, 1),#副图的相对高度
                    subplots = 1:3,   #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
                    ES_geom = "line", #enrichment score用线还是用点"dot"
                    pvalue_table = T) #显示pvalue等信息
ggsave(gseap1, filename = 'GSEA_up_1.pdf', width =10, height =8)
代码语言:text
复制
#### 合并 GSEA通路 
gseap2 <- gseaplot2(kk_gse,
                    up_gsea$ID,#富集的ID编号
                    title = "UP_GSEA_all",#标题
                    color = "red",#GSEA线条颜色
                    base_size = 18,#基础字体大小
                    rel_heights = c(1.4, 0.5, 1),#副图的相对高度
                    subplots = 1:3, #要显示哪些副图 如subplots=c(1,3) #只要第一和第三个图
                    ES_geom = "line",#enrichment score用线还是用点"dot"
                    pvalue_table = T) #显示pvalue等信息
ggsave(gseap2, filename = "GSEA_up_all.png",width =13,height =12)

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • R Markdown
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档