前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >生物信息数据分析教程视频——13-3种R包(DESeq2、edgeR和limma)进行RNAseq的差异表达分析与比较

生物信息数据分析教程视频——13-3种R包(DESeq2、edgeR和limma)进行RNAseq的差异表达分析与比较

作者头像
DoubleHelix
发布2022-12-15 15:57:57
1.2K0
发布2022-12-15 15:57:57
举报
文章被收录于专栏:生物信息云生物信息云

视频地址:http://mpvideo.qpic.cn/0bc3zeaakaaalqalwhjtmzrvbsodaxeqabia.f10002.mp4?

参考文章:

超详细的DESeq2和edgeR包的基本原理和实战案例

一文就会TCGA数据库基因表达差异分析【过后付费当赞赏】

基于count数据的基因差异表达分析万能代码【和本文代码差不多】

代码:

代码语言:javascript
复制

# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
rm(list = ls())
options(stringsAsFactors = F)

library(TCGAbiolinks)
library(DESeq2)
library(limma)
library(edgeR)
library(reshape2)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(UpSetR)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

opt <- "output/006-DEG-tumor-vs-normal/Unpaired/"

source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###差异分析的函数
source("H:/MedBioInfoCloud/analysis/fun/countsDEAnalysis.R")
###火山图绘制函数
source("H:/MedBioInfoCloud/analysis/fun/plotDEGvolcanoFig.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
norn <- 10 ###正常样本最少数量
cutFC <- 2 ###abs(log2(FC))
cutFDR <- 0.01 ###
colour = c("#DC143C","#00008B", "#808080") ##火山图的3种颜色:Down,ns,Up

for(proj in project){
  message("============================")
  message(proj)
  
  ###删除所有文件===========
  record_files <- dir(paste0(opt,proj,"/"),"*.*",full.names = T)
  ifelse(length(record_files)>0,file.remove(record_files),
         "There is no record file that can be removed")
  
  load(FilePath[grep(proj,FilePath)])#STARdata
  count <- STARdata[["count"]]
  
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(count)[-c(1:3)],
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  if(length(SamN) >= norn){
    
    ##输出路径
    opt_deg <- paste0(opt,proj,"/")
    ifelse(dir.exists(opt_deg),"The folder already exists.",dir.create(opt_deg,recursive = T))
    
    anoinfo <- count[,1:3]
    
    ###过滤数据
    count <- filterGeneTypeExpr(expr = count,
                              fil_col = "gene_type",
                              filter = FALSE)
    
    ##过滤不表达的基因
    count <- count[apply(count,1,var) !=0,]
    
    ##肿瘤组织样本ID
    SamT <- setdiff(colnames(count),SamN)
    
    ###去除重复样本
    nor_exp <- del_dup_sample(count[,SamN],col_rename = F)
    tur_exp <- del_dup_sample(count[,SamT],col_rename = F)
    
    data <- cbind(nor_exp,tur_exp)
    group <- c(rep("Normal",ncol(nor_exp)),rep("Tumor",ncol(tur_exp)))
    names(group) <- colnames(data)
    
    # method = "limma"
    vn_pcDEG <- list()
    vn_lncRNA_DEG <- list()
    for(method in c("DESeq2","edgeR","limma")){
      
      # filter = "filtered" 
      
      for(filter in c("unfiltered","filtered")){
        DEG <- countsDEAnalysis(counts = data, 
                                group = group, 
                                comparison = "Tumor-Normal",
                                method = method, 
                                filter = ifelse(filter == "unfiltered",FALSE,TRUE))
   
        # save(DEG,file = paste0(opt_deg,method,"-all.Rdata"))
        # load(paste0(opt_deg,method,"-all.Rdata"))
        head(DEG)
        
        DEG <- na.omit(DEG)
        
        DEG <- DEG %>%
          mutate(direction = factor(ifelse(FDR < cutFDR & abs(logFC) > cutFC,#添加direction一列
                                           ifelse(logFC > cutFC, "Up", "Down"),"Ns"),
                                    levels=c('Up','Down','Ns')))
        dim(DEG)
        DEG <- merge(anoinfo,DEG,by.x = "gene_name",by.y = "symbol")
        
        save(DEG,file = paste0(opt_deg,method,"-",filter,".Rdata"))
        
        ###=========写出数据============
        write.csv(DEG,file = paste0(opt_deg,method,"-",filter,".csv"))
        
        ###--------------编码蛋白的基因----------
        pcDEG <- DEG[DEG$gene_type == "protein_coding",]
        ###添加labs
        degstat <- table(pcDEG$direction)
        pcDEG$lab <- ""
        if(length(degstat)==3 & min(degstat)>=3){
          pcUpGene <- pcDEG[pcDEG$direction == 'Up',]
          pcUpGene <- arrange(pcUpGene,desc(logFC))
          UpTop3 <- pcUpGene$gene_name[1:3]
          
          pcDownGene <- pcDEG[pcDEG$direction == 'Down',]
          pcDownGene <- arrange(pcDownGene,logFC)
          DownTop3 <- pcDownGene$gene_name[1:3]
          
          labgene <- c(UpTop3,DownTop3)
          pcDEG$lab[match(labgene,pcDEG$gene_name)] <- labgene
        }

        
        ###保存数据,用于后续的富集分析
        save(pcDEG,file = paste0(opt_deg,method,"-",filter,"-pcDEG.Rdata"))
        ###--------------编码lnRNA的基因----------
        lncRNA_DEG <- DEG[DEG$gene_type == "lncRNA",]
        ###添加labs
        degstat <- table(lncRNA_DEG$direction)
        lncRNA_DEG$lab <- ""
        if(length(degstat)==3 & min(degstat)>=3){
          lncRNA_UpGene <- lncRNA_DEG[lncRNA_DEG$direction == 'Up',]
          lncRNA_UpGene <- arrange(lncRNA_UpGene,desc(logFC))
          UpTop3 <- lncRNA_UpGene$gene_name[1:3]
          
          lncRNA_DownGene <- lncRNA_DEG[lncRNA_DEG$direction == 'Down',]
          lncRNA_DownGene <- arrange(lncRNA_DownGene,logFC)
          DownTop3 <- lncRNA_DownGene$gene_name[1:3]
          
          labgene <- c(UpTop3,DownTop3)
          lncRNA_DEG$lab[match(labgene,lncRNA_DEG$gene_name)] <- labgene
          
        }
        
        ###=================记录差异基因
        if(filter == "filtered"){
          vn_pcDEG[[paste0(method,"-Up")]] <- pcDEG$gene_name[pcDEG$direction == 'Up']
          vn_pcDEG[[paste0(method,"-Down")]] <- pcDEG$gene_name[pcDEG$direction == 'Down']
          
          vn_lncRNA_DEG[[paste0(method,"-Up")]] <- lncRNA_DEG$gene_name[lncRNA_DEG$direction == 'Up']
          vn_lncRNA_DEG[[paste0(method,"-Down")]] <- lncRNA_DEG$gene_name[lncRNA_DEG$direction == 'Down']
        }
        
        
        ####----------绘制火山图-----------
        p1 <- plotDEGvolcanoFig(data = pcDEG,
                                x="logFC",
                                y="FDR",
                                cut_pvalue = cutFDR,
                                cutFC = cutFC,
                                title="Tumor vs. Normal\nProtein",
                                group="direction",
                                colour= colour,
                                label="lab")
        
        p2 <- plotDEGvolcanoFig(data = lncRNA_DEG,
                                x= "logFC",
                                y= "FDR",
                                cut_pvalue = cutFDR,
                                cutFC = cutFC,
                                title= "Tumor vs. Normal\nlncRNA",
                                group= "direction",
                                colour= colour,
                                label="lab")
        p <- ggarrange(p1,p2,ncol = 2) # 图形排版为2列
        
        ggsave(filename = paste0(opt_deg,method,"-",filter,"-volcano.pdf"),
               plot = p,width = 11,height = 5)
        ##记录分析的样本信息
        opinfo1 <- paste0(method,":Normal(n=",length(SamN),") vs.Tumor(n =",length(SamT),")")
        opinfo2 <- paste0("\t",filter)
        opinfo3 <- "\t\tprotein_coding:"
        opinfo4 <- paste0("\t\t\tUp:",sum(pcDEG$direction == "Up"))
        opinfo5 <- paste0("\t\t\tDown:",sum(pcDEG$direction == "Down"))
        opinfo6 <- "\t\tlncRNA:"
        opinfo7 <- paste0("\t\t\tUp:",sum(lncRNA_DEG$direction == "Up"))
        opinfo8 <- paste0("\t\t\tDown:",sum(lncRNA_DEG$direction == "Down"))
        
        output <- file(paste0(opt_deg,"00-notesStats.txt"), open="a+b")
        writeLines(c(opinfo1,opinfo2,opinfo3,opinfo4,opinfo5,opinfo6,opinfo7,opinfo8), con=output)
        close(output)
      }
      
     
      
    }
    
    save(vn_pcDEG,vn_lncRNA_DEG,file = paste0(opt_deg,"all-DEG-DESeq2-edgeR-limma.Rdata"))
    ###===========3种方法的差异分析结果比较=============
    upset_data <- list(vn_pcDEG=vn_pcDEG,vn_lncRNA_DEG = vn_lncRNA_DEG)
    upset_p <- lapply(c("vn_pcDEG","vn_lncRNA_DEG"), function(x){
      y <- fromList(upset_data[[x]])#Upset 自带函数转化数据结构
      p <- upset(y,
                 nsets=length(y),
                 nintersects = 1000,
                 sets = names(y),
                 keep.order = TRUE,
                 point.size = 3,
                 line.size = 1.5,
                 number.angles = 0,
                 text.scale = c(2.5, 2, 2, 1.5,2.5),
                 order.by= "freq",
                 matrix.color="#E31A1CFF",
                 sets.bar.color =  paletteer::paletteer_d("RColorBrewer::Paired")[1:ncol(y)],
                 main.bar.color = paletteer::paletteer_d("RColorBrewer::Paired")[1])
      pdf(paste0(opt_deg,"all-method-unset-",x,".pdf"),width = 7,height = 6)
      print(p)
      dev.off()
    })
    ##记录差异基因
    DEG1 <- intersect(vn_pcDEG[["DESeq2-Up"]],vn_pcDEG[["edgeR-Up"]])
    DEG2 <- intersect(vn_pcDEG[["DESeq2-Down"]],vn_pcDEG[["edgeR-Down"]])
    
    DEG3 <- intersect(vn_pcDEG[["limma-Up"]],DEG1)
    DEG4 <- intersect(vn_pcDEG[["limma-Down"]],DEG2)
    
    output1 <- file(paste0(opt_deg,"01-DESeq2-edgeR-Up.txt"), open="a+b")
    writeLines(DEG1, con=output)
    close(output1)
    output2 <- file(paste0(opt_deg,"02-DESeq2-edgeR-Down.txt"), open="a+b")
    writeLines(DEG2, con=output2)
    close(output2)
    
    output3 <- file(paste0(opt_deg,"03-DESeq2-edgeR-limma-Up.txt"), open="a+b")
    writeLines(DEG2, con=output3)
    close(output3)
    
    output4 <- file(paste0(opt_deg,"04-DESeq2-edgeR-limma-Down.txt"), open="a+b")
    writeLines(DEG2, con=output4)
    close(output4)
    # upset_p[["vn_pcDEG"]]
    # upset_p[["vn_lncRNA_DEG"]]
  }
}

差异分析的函数:该函数在前面文章【基于count数据的基因差异表达分析万能代码】中有提到,获取方式在最早的差异分析教程文章中获取【一文就会TCGA数据库基因表达差异分析】,现在分享一下这个函数。后期付费上面文章当是对我的赞赏!

代码语言:javascript
复制
#### 差异分析函数 ####
##DESeq2: baseMean log2FoldChange     lfcSE      stat      pvalue        padj
##method :"DESeq2","edgeR","limma"
countsDEAnalysis <- function (counts, group, comparison, method = "DESeq2", filter = TRUE){
  dge = DGEList(counts = counts)
  keep <- rowSums(cpm(dge) > 1) >= 0.5 * length(group)
  if (method == "DESeq2") {
    coldata <- data.frame(group)
    dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~group)
    dds$group <- factor(dds$group, levels = rev(strsplit(comparison, "-", fixed = TRUE)[[1]]))
    if (filter == TRUE) {
      dds <- dds[keep, ]
    }
    dds <- DESeq(dds)
    res <- results(dds)
    DEGAll <- data.frame(res)
    colnames(DEGAll) <- c("baseMean", "logFC", "lfcSE", "stat", "PValue", "FDR")
  }
  else if (method %in% c("edgeR", "limma")) {
    group <- factor(group)
    design <- model.matrix(~0 + group)
    colnames(design) <- levels(group)
    contrast.matrix <- makeContrasts(contrasts = comparison,
                                     levels = design)
    if (filter == TRUE) {
      dge <- dge[keep, , keep.lib.sizes = TRUE]
    }
    dge <- calcNormFactors(dge)
    if (method == "edgeR") {
      dge <- estimateDisp(dge, design)
      fit <- glmFit(dge, design)
      lrt <- glmLRT(fit, contrast = contrast.matrix)
      DEGAll <- lrt$table
      DEGAll$FDR <- p.adjust(DEGAll$PValue, method = "fdr")
    }
    else if (method == "limma") {
      v <- voom(dge, design = design, plot = FALSE)
      fit <- lmFit(v, design)
      fit2 <- contrasts.fit(fit, contrast.matrix)
      fit2 <- eBayes(fit2)
      DEGAll <- topTable(fit2, coef = 1, n = Inf)
      colnames(DEGAll) <- c("logFC", "AveExpr", "t", "PValue", "FDR", "B")
    }
  }
  DEGAll$symbol <- rownames(DEGAll)
  DEGAll <- dplyr::select(DEGAll, symbol, everything())
  return(DEGAll)
}

绘制火山图的函数:

代码语言:javascript
复制
####--------------------------绘制火山图
plotDEGvolcanoFig <- function(data,x,y,cut_pvalue,cutFC,title,group,colour,label){
  data <- data[,c(x,y,group,label)]
  colnames(data) <- c("log2FC","FDR","group","label")
  p <-  ggplot(data = data, aes(x = log2FC, y = -log10(FDR), colour = group)) + #数据映射
    geom_point(alpha = 1,size=2) +#散点图,alpha就是点的透明度
    scale_color_manual(values = colour) + #手动调颜色c("#DC143C","#00008B", "#808080")
    theme_bw() +#设定主题
    geom_text_repel(label = data$label,#便签就是基因名 geom_text_repel:adds text directly to the plot.
                    size = 4,
                    segment.color = "black", #连接线的颜色,就是名字和点之间的线
                    show.legend = FALSE) +
    theme(axis.title=element_text(size=15,face="plain",color="black"),
          axis.text = element_text(size=12,face="plain",color="black"),
          legend.title = element_blank(),
          legend.background = element_blank(),
          panel.background = element_rect(fill = "transparent",colour = "black"),
          plot.background = element_blank(),
          plot.title = element_text(size=15, lineheight=.8,hjust=0.5, face="plain"),
          legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"))+
    labs(title = title)+ #"Tumor vs. Normal"
    ylab(ifelse(y == "PValue",expression(-log[10]("P Value")),expression(-log[10]("FDR")))) +#expression的作用就是让log10的10下标
    xlab(expression(log[2]("Fold Change"))) +
    geom_vline(xintercept = c(-cutFC, cutFC), #加垂直线,在-1和+1之间划线
               lty = 2,
               col = "black",
               lwd = 0.3) +
    geom_hline(yintercept = -log10(cut_pvalue),
               lty = 2,
               col = "black",
               lwd = 0.3)
  return(p)
}
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2022-10-04,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 MedBioInfoCloud 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
相关产品与服务
数据库
云数据库为企业提供了完善的关系型数据库、非关系型数据库、分析型数据库和数据库生态工具。您可以通过产品选择和组合搭建,轻松实现高可靠、高可用性、高性能等数据库需求。云数据库服务也可大幅减少您的运维工作量,更专注于业务发展,让企业一站式享受数据上云及分布式架构的技术红利!
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档