前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >比较不同流程(limma/voom,edgeR,DESeq2 )差异分析的区别

比较不同流程(limma/voom,edgeR,DESeq2 )差异分析的区别

作者头像
生信技能树
发布2021-01-05 10:06:19
4.7K0
发布2021-01-05 10:06:19
举报
文章被收录于专栏:生信技能树生信技能树
下面是9月学员“冰吉林”的投稿

前言:

距离第一次听说生信已经十几年了,现在是邋遢大叔重新开始学代码,精力确实已不像从前,各位入坑还是要乘早。后来约莫在5年前,课题组当时有个RNA-Seq数据,lab meeting时听瑞典小哥在汇报DEGs筛选,当时感觉好是神奇。其实陆陆续续也有过学习的念头,但在对自己的各种纵容下,想法又逐渐隐没。直到2月前,机缘巧合参加了生信技能树培训,才进一步强化了自己学习生信技术的信念。

几天前,曾老师在群里给我布置了一份学徒作业,比较不同流程(limma/voom,edgeR,DESeq2 )差异分析的区别,拟使用的数据集是TCGA-BRCA的counts值矩阵。作为非肿瘤口的生信新人,秉着无知者无畏的态度试了一试。以下是具体过程。

代码主要来源于小洁老师(不是我吹,听了小洁老师的课,傻子也能学会R代码)。

R包安装

代码语言:javascript
复制
# R包太多,这里略了。
for (pkg in cran_packages){
  if (! require(pkg,character.only=T) ) {
    install.packages(pkg,ask = F,update = F)
    require(pkg,character.only=T) 
  }
}

# first prepare BioManager on CRAN
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)

# use BiocManager to install
for (pkg in Biocductor_packages){
  if (! require(pkg,character.only=T) ) {
    BiocManager::install(pkg,ask = F,update = F)
    require(pkg,character.only=T) 
  }
}

#前面的报错都先不要管。主要看这里
for (pkg in c(Biocductor_packages,cran_packages)){
  require(pkg,character.only=T) 
}

#哪个报错,就回去安装哪个。如果你没有安装xx包,却提示你xx包不存在,这也正常,是因为复杂的依赖关系,缺啥补啥。

if(!require(tinyarray))devtools::install_local("tinyarray-master.zip",upgrade = F)

数据下载

1.从网页选择数据,下载manifest文件

数据存放网站:https://portal.gdc.cancer.gov/

在Repository勾选自己需要的case和file类型。

2.使用gdc-client工具下载

因使用的是Rstudio-Server Rstudio_3.6.3_CentOS7,gdc-client的安装有点波折,解决方法参考https://my.oschina.net/shenweiyan/blog/4538119

代码语言:javascript
复制
#step1
options(stringsAsFactors = F)
library(stringr)
cancer_type="TCGA-BRCA"
if(!dir.exists("clinical"))dir.create("clinical")
if(!dir.exists("expdata"))dir.create("expdata")
dir()
代码语言:javascript
复制
#此代码块是在Terminal中使用
conda create -n gdc python=3.7
source activate gdc
git clone https://github.com/NCI-GDC/gdc-client
cd gdc-client
pip install -r requirements.txt
python setup.py install 2>&1 | tee -a install.log
###安装完毕后开始下载数据
gdc-client download -m gdc_manifest.2020-12-17_clinic.txt -d clinical
gdc-client download -m gdc_manifest.2020-12-17_exp.txt -d expdata
代码语言:javascript
复制
#下载结束后
length(dir("./clinical/"))
length(dir("./expdata/"))

表达矩阵获得

1.整理临床信息

代码语言:javascript
复制
library(XML)
result <- xmlParse("./clinical/00049989-fa21-48fb-8dda-710c0dd5932e/nationwidechildrens.org_clinical.TCGA-A2-A0CT.xml")   #单个运行
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmlToDataFrame(rootnode[2])))

xmls = dir("clinical/",pattern = "*.xml$",recursive = T)   #可层级

td = function(x){
  result <- xmlParse(file.path("clinical/",x))
  rootnode <- xmlRoot(result)
  xmldataframe <- xmlToDataFrame(rootnode[2])
  return(t(xmldataframe))
}

cl = lapply(xmls,td)
cl_df <- t(do.call(cbind,cl))
cl_df[1:3,1:3]
clinical = data.frame(cl_df)
clinical[1:4,1:4]

2.整理表达矩阵

批量读取所有的counts.gz文件。

代码语言:javascript
复制
count_files = dir("expdata/",pattern = "*.htseq.counts.gz$",recursive = T)

ex = function(x){
  result <- read.table(file.path("expdata/",x),row.names = 1,sep = "\t")
  return(result)
}
#  head(ex("03aee74e-4e37-4a58-a720-c90e807d2f40/be5bc6a0-9720-47ac-953e-fa8d0c32cd82.htseq.counts.gz"))

exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp)
exp[1:4,1:4]

下载cart-json文件已将文件名与样本ID一一对应。

代码语言:javascript
复制
meta <- jsonlite::fromJSON("metadata.cart.2020-12-17.json")
colnames(meta)
ids <- meta$associated_entities;class(ids)
ids[[1]]
class(ids[[1]][,1])
ID = sapply(ids,function(x){x[,2]})
file2id = data.frame(file_name = meta$file_name,
                     ID = ID)
head(file2id$file_name)
head(count_files)
count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2]
count_files2[1] %in% file2id$file_name
file2id = file2id[match(count_files2,file2id$file_name),]
colnames(exp) = file2id$ID
exp[1:4,1:4]

3.过滤表达矩阵

代码语言:javascript
复制
dim(exp)
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 9), ]
dim(exp)
exp[1:4,1:4]

# 过滤在至少在75%的样本中都有表达的基因
keep_exp <- rowSums(exp>0) >= floor(0.75*ncol(exp))
table(keep_exp)
exp <- exp[keep_exp,]
exp[1:4,1:4]
dim(exp)

4.添加分组信息

根据样本ID的第14-15位,给样本分组(tumor和normal)

代码语言:javascript
复制
table(str_sub(colnames(exp),14,15))
group_list = ifelse(as.numeric(str_sub(colnames(exp),14,15)) < 10,'tumor','normal')
group_list = factor(group_list,levels = c("normal","tumor"))
table(group_list)
save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata"))

DEGs分析及对比

代码语言:javascript
复制
#三大R包差异分析
if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(DESeq2))BiocManager::install('DESeq2')
if(!require(edgeR))BiocManager::install('edgeR')
if(!require(limma))BiocManager::install('limma')

rm(list = ls())
load("TCGA-BRCAgdc.Rdata")
table(group_list)

#deseq2----
library(DESeq2)
colData <- data.frame(row.names =colnames(exp), 
                      condition=group_list)
if(!file.exists(paste0(cancer_type,"dd.Rdata"))){
  dds <- DESeqDataSetFromMatrix(
    countData = exp,
    colData = colData,
    design = ~ condition)
  dds <- DESeq(dds)
  save(dds,file = paste0(cancer_type,"dd.Rdata"))
}
load(paste0(cancer_type,"dd.Rdata"))

# 两两比较
res <- results(dds, contrast = c("condition",rev(levels(group_list))))
resOrdered <- res[order(res$pvalue),] # 按照P值排序
DEG <- as.data.frame(resOrdered)
head(DEG)

#添加change列标记基因上调下调
logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
#logFC_cutoff <- 2
k1 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange < -logFC_cutoff)
k2 = (DEG$pvalue < 0.05)&(DEG$log2FoldChange > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
head(DEG)
DESeq2_DEG <- DEG

#edgeR----
library(edgeR)

dge <- DGEList(counts=exp,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~0+group_list)
rownames(design)<-colnames(dge)
colnames(design)<-levels(group_list)

dge <- estimateGLMCommonDisp(dge,design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
fit2 <- glmLRT(fit, contrast=c(-1,1)) 

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )

#logFC_cutoff <- 2
k1 = (DEG$PValue < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$PValue < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))

head(DEG)
table(DEG$change)
edgeR_DEG <- DEG


###limma----
library(limma)

design <- model.matrix(~0+group_list)
colnames(design)=levels(group_list)
rownames(design)=colnames(exp)

dge <- DGEList(counts=exp)
dge <- calcNormFactors(dge)

v <- voom(dge,design, normalize="quantile")
fit <- lmFit(v, design)

constrasts = paste(rev(levels(group_list)),collapse = "-")
cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) 
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)

DEG = topTable(fit2, coef=constrasts, n=Inf)
DEG = na.omit(DEG)

logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )

#logFC_cutoff <- 2
k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff)
k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff)
DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG$change)
head(DEG)
limma_voom_DEG <- DEG

tj = data.frame(deseq2 = as.integer(table(DESeq2_DEG$change)),
                edgeR = as.integer(table(edgeR_DEG$change)),
                limma_voom = as.integer(table(limma_voom_DEG$change)),
                row.names = c("down","not","up")
);tj

save(DESeq2_DEG,edgeR_DEG,limma_voom_DEG,group_list,tj,file = paste0(cancer_type,"DEG.Rdata"))


#可视化
rm(list = ls())
load("TCGA-BRCAgdc.Rdata")
load("TCGA-BRCADEG.Rdata")
if(!require(tinyarray))devtools::install_local("tinyarray-master.zip",upgrade = F)
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
dat = log2(exp+1)
pca.plot = draw_pca(dat,group_list);pca.plot
save(pca.plot,file = paste0(cancer_type,"pcaplot.Rdata"))

cg1 = rownames(DESeq2_DEG)[DESeq2_DEG$change !="NOT"]
cg2 = rownames(edgeR_DEG)[edgeR_DEG$change !="NOT"]
cg3 = rownames(limma_voom_DEG)[limma_voom_DEG$change !="NOT"]

h1 = draw_heatmap(dat[cg1,],group_list,scale_before = T)
h2 = draw_heatmap(dat[cg2,],group_list,scale_before = T)
h3 = draw_heatmap(dat[cg3,],group_list,scale_before = T)

m2d = function(x){
  mean(abs(x))+2*sd(abs(x))
}

v1 = draw_volcano(DESeq2_DEG,pkg = 1,logFC_cutoff = m2d(DESeq2_DEG$log2FoldChange))
v2 = draw_volcano(edgeR_DEG,pkg = 2,logFC_cutoff = m2d(edgeR_DEG$logFC))
v3 = draw_volcano(limma_voom_DEG,pkg = 3,logFC_cutoff = m2d(limma_voom_DEG$logFC))


library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(cancer_type,"heat_vo.png"),width = 15,height = 10)

#三大R包差异基因对比之相关性
#for test correlation plot between the logFC for the top 500 DE genes
head(edgeR_DEG)
head(limma_voom_DEG)
head(DESeq2_DEG)

td1=edgeR_DEG[,1:2]
head(td1)
colnames(td1)=c("logFC_edgeR","logCPM_edgeR")
td1$ID=rownames(td1)


td2=limma_voom_DEG[,1:2]
head(td2)
colnames(td2)=c("logFC_limma","logCPM_limma")
td2$ID=rownames(td2)

td3=DESeq2_DEG[,1:2]
head(td3)
colnames(td3)=c("baseMean","log2FC_DESeq2")
td3$ID=rownames(td3)

td_edgeR_limma=inner_join(td1,td2,by="ID")
td_edgeR_limma=td_edgeR_limma[,c(3,1,4)]
head(td_edgeR_limma)

td_edgeR_DESeq2=inner_join(td1,td3,by="ID")
td_edgeR_DESeq2=td_edgeR_DESeq2[,c(3,1,5)]
head(td_edgeR_DESeq2)

td_DESeq2_limma=inner_join(td3,td2,by="ID")
td_DESeq2_limma=td_DESeq2_limma[,c(3,2,4)]
head(td_DESeq2_limma)

  #选出500个logFC绝对值最大的
# for edgeR vs limma
a1=head(td_edgeR_limma[order(td_edgeR_limma$logFC_edgeR),],250)
a2=tail(td_edgeR_limma[order(td_edgeR_limma$logFC_edgeR),],250)
a=rbind(a1,a2)
head(a)
ggplot(a,aes(y=logFC_limma,x=logFC_edgeR))+geom_point()
p1=ggplot(data=a, aes(y=logFC_limma, x=logFC_edgeR))+geom_point(color="red")+stat_smooth(method="lm",se=FALSE)+stat_cor(data=a, method = "pearson")
png("cor_edgeR_limma.png")
p1
dev.off()

# for edgeR vs DESeq2
a1=head(td_edgeR_DESeq2[order(td_edgeR_DESeq2$logFC_edgeR),],250)
a2=tail(td_edgeR_DESeq2[order(td_edgeR_DESeq2$logFC_edgeR),],250)
a=rbind(a1,a2)
head(a)
ggplot(a,aes(y=log2FC_DESeq2,x=logFC_edgeR))+geom_point()
p2=ggplot(data=a, aes(y=log2FC_DESeq2, x=logFC_edgeR))+geom_point(color="red")+stat_smooth(method="lm",se=FALSE)+stat_cor(data=a, method = "pearson")
png("cor_edgeR_DESeq2.png")
p2
dev.off()

# for limma vs DESeq2
a1=head(td_DESeq2_limma[order(td_DESeq2_limma$logFC_limma),],250)
a2=tail(td_DESeq2_limma[order(td_DESeq2_limma$logFC_limma),],250)
a=rbind(a1,a2)
head(a)
ggplot(a,aes(y=log2FC_DESeq2,x=logFC_limma))+geom_point()
p3=ggplot(data=a, aes(y=log2FC_DESeq2, x=logFC_limma))+geom_point(color="red")+stat_smooth(method="lm",se=FALSE)+stat_cor(data=a, method = "pearson")
png("cor_limma_DESeq2.png")
p3
dev.off()

library(patchwork)
p1+p2+p3+ plot_layout(guides = "collect")
png("cor_limma_DESeq2_edgeR.png")
p1+p2+p3+ plot_layout(guides = "collect")
dev.off()


#三大R包差异基因对比之韦恩图
rm(list = ls())
load("TCGA-BRCAgdc.Rdata")
load("TCGA-BRCADEG.Rdata")
load("TCGA-BRCApcaplot.Rdata")
UP=function(df){
  rownames(df)[df$change=="UP"]
}
DOWN=function(df){
  rownames(df)[df$change=="DOWN"]
}

up = intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))
down = intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG))
dat = log2(exp+1)
hp = draw_heatmap(dat[c(up,down),],group_list,scale_before = T)



#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DESeq2_DEG),
                edgeR = UP(edgeR_DEG),
                limma = UP(limma_voom_DEG))

down_genes = list(Deseq2 = DOWN(DESeq2_DEG),
                  edgeR = DOWN(edgeR_DEG),
                  limma = DOWN(limma_voom_DEG))


up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")

#维恩图拼图,终于搞定

library(patchwork)
#up.plot + down.plot
# 拼图
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(cancer_type,"heat_ve_pca.png"),width = 15,height = 10)


#以下又为test #for upset图  参考https://www.jianshu.com/p/b702f644cad3
#install.packages("UpSetR")
library(UpSetR)
library(dplyr)
library(tidyr)
up_DESeq2=UP(DESeq2_DEG)
up_edgeR=UP(edgeR_DEG)
up_limma=UP(limma_voom_DEG)
down_DESeq2=DOWN(DESeq2_DEG)
down_edgeR=DOWN(edgeR_DEG)
down_limma=DOWN(limma_voom_DEG)

input <- c(
  'DESeq2_up'=  length(up_DESeq2),
  'edgeR_up' =  length(up_edgeR),
  'limma_up' = length(up_limma),
  'DESeq2_down'  =length(down_DESeq2),
  'edgeR_down'  = length(down_edgeR),
  'limma_down'  =length(down_limma),
'DESeq2_up&edgeR_up'  =length(intersect(UP(DESeq2_DEG),UP(edgeR_DEG))),
'DESeq2_up&limma_up'  = length(intersect(UP(DESeq2_DEG),UP(limma_voom_DEG))),
'edgeR_up&limma_up'  =length(intersect(UP(limma_voom_DEG),UP(edgeR_DEG))),
'DESeq2_up&edgeR_up&limma_up'=length(intersect(intersect(UP(DESeq2_DEG),UP(edgeR_DEG)),UP(limma_voom_DEG))),
'DESeq2_down&edgeR_down'  =length(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG))),
'DESeq2_down&limma_down'  = length(intersect(DOWN(DESeq2_DEG),DOWN(limma_voom_DEG))),
'edgeR_down&limma_down'  =length(intersect(DOWN(limma_voom_DEG),DOWN(edgeR_DEG))),
'DESeq2_down&edgeR_down&limma_down'=length(intersect(intersect(DOWN(DESeq2_DEG),DOWN(edgeR_DEG)),DOWN(limma_voom_DEG)))
)

data <- fromExpression(input)
p1 <- upset(data, nsets = 6, 
            sets = c('DESeq2_up',
                     'edgeR_up',
                     'limma_up',
                     'DESeq2_down',
                     'edgeR_down',
                     'limma_down'),
            keep.order = TRUE,
            # number.angles = 30, 
            point.size = 5, 
            line.size = 1.3, 
            mainbar.y.label = "DEGs Intersection", 
            sets.x.label = "",
            mb.ratio = c(0.60, 0.40),
            text.scale = c(2, 3, 4, 1.5,1.5, 2))
png("upset.png")
p1
dev.off()

第一个是3大R包的火山图和如图:

TCGA-BRCAheat_vo

然后是3大R包的各自的上下调基因的韦恩图:

TCGA-BRCAheat_ve_pca

跟韦恩图一个意思的upset图

upset

最后是3个R包各自计算的logFC的相关性:

cor_limma_DESeq2_edgeR

本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2020-12-29,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信技能树 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 前言:
  • R包安装
  • 数据下载
  • 表达矩阵获得
    • 1.整理临床信息
      • 2.整理表达矩阵
        • 3.过滤表达矩阵
          • 4.添加分组信息
          • DEGs分析及对比
          领券
          问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档