视频地址:http://mpvideo.qpic.cn/0bc3zeaakaaalqalwhjtmzrvbsodaxeqabia.f10002.mp4?
参考文章:
一文就会TCGA数据库基因表达差异分析【过后付费当赞赏】
基于count数据的基因差异表达分析万能代码【和本文代码差不多】
代码:
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
rm(list = ls())
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(DESeq2)
library(limma)
library(edgeR)
library(reshape2)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(UpSetR)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
"STARdata.Rdata$",full.names = T)
opt <- "output/006-DEG-tumor-vs-normal/Unpaired/"
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")
###差异分析的函数
source("H:/MedBioInfoCloud/analysis/fun/countsDEAnalysis.R")
###火山图绘制函数
source("H:/MedBioInfoCloud/analysis/fun/plotDEGvolcanoFig.R")
###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
norn <- 10 ###正常样本最少数量
cutFC <- 2 ###abs(log2(FC))
cutFDR <- 0.01 ###
colour = c("#DC143C","#00008B", "#808080") ##火山图的3种颜色:Down,ns,Up
for(proj in project){
message("============================")
message(proj)
###删除所有文件===========
record_files <- dir(paste0(opt,proj,"/"),"*.*",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
"There is no record file that can be removed")
load(FilePath[grep(proj,FilePath)])#STARdata
count <- STARdata[["count"]]
##正常组织样本ID
SamN <- TCGAquery_SampleTypes(barcode = colnames(count)[-c(1:3)],
typesample = c("NT","NB","NBC","NEBV","NBM"))
if(length(SamN) >= norn){
##输出路径
opt_deg <- paste0(opt,proj,"/")
ifelse(dir.exists(opt_deg),"The folder already exists.",dir.create(opt_deg,recursive = T))
anoinfo <- count[,1:3]
###过滤数据
count <- filterGeneTypeExpr(expr = count,
fil_col = "gene_type",
filter = FALSE)
##过滤不表达的基因
count <- count[apply(count,1,var) !=0,]
##肿瘤组织样本ID
SamT <- setdiff(colnames(count),SamN)
###去除重复样本
nor_exp <- del_dup_sample(count[,SamN],col_rename = F)
tur_exp <- del_dup_sample(count[,SamT],col_rename = F)
data <- cbind(nor_exp,tur_exp)
group <- c(rep("Normal",ncol(nor_exp)),rep("Tumor",ncol(tur_exp)))
names(group) <- colnames(data)
# method = "limma"
vn_pcDEG <- list()
vn_lncRNA_DEG <- list()
for(method in c("DESeq2","edgeR","limma")){
# filter = "filtered"
for(filter in c("unfiltered","filtered")){
DEG <- countsDEAnalysis(counts = data,
group = group,
comparison = "Tumor-Normal",
method = method,
filter = ifelse(filter == "unfiltered",FALSE,TRUE))
# save(DEG,file = paste0(opt_deg,method,"-all.Rdata"))
# load(paste0(opt_deg,method,"-all.Rdata"))
head(DEG)
DEG <- na.omit(DEG)
DEG <- DEG %>%
mutate(direction = factor(ifelse(FDR < cutFDR & abs(logFC) > cutFC,#添加direction一列
ifelse(logFC > cutFC, "Up", "Down"),"Ns"),
levels=c('Up','Down','Ns')))
dim(DEG)
DEG <- merge(anoinfo,DEG,by.x = "gene_name",by.y = "symbol")
save(DEG,file = paste0(opt_deg,method,"-",filter,".Rdata"))
###=========写出数据============
write.csv(DEG,file = paste0(opt_deg,method,"-",filter,".csv"))
###--------------编码蛋白的基因----------
pcDEG <- DEG[DEG$gene_type == "protein_coding",]
###添加labs
degstat <- table(pcDEG$direction)
pcDEG$lab <- ""
if(length(degstat)==3 & min(degstat)>=3){
pcUpGene <- pcDEG[pcDEG$direction == 'Up',]
pcUpGene <- arrange(pcUpGene,desc(logFC))
UpTop3 <- pcUpGene$gene_name[1:3]
pcDownGene <- pcDEG[pcDEG$direction == 'Down',]
pcDownGene <- arrange(pcDownGene,logFC)
DownTop3 <- pcDownGene$gene_name[1:3]
labgene <- c(UpTop3,DownTop3)
pcDEG$lab[match(labgene,pcDEG$gene_name)] <- labgene
}
###保存数据,用于后续的富集分析
save(pcDEG,file = paste0(opt_deg,method,"-",filter,"-pcDEG.Rdata"))
###--------------编码lnRNA的基因----------
lncRNA_DEG <- DEG[DEG$gene_type == "lncRNA",]
###添加labs
degstat <- table(lncRNA_DEG$direction)
lncRNA_DEG$lab <- ""
if(length(degstat)==3 & min(degstat)>=3){
lncRNA_UpGene <- lncRNA_DEG[lncRNA_DEG$direction == 'Up',]
lncRNA_UpGene <- arrange(lncRNA_UpGene,desc(logFC))
UpTop3 <- lncRNA_UpGene$gene_name[1:3]
lncRNA_DownGene <- lncRNA_DEG[lncRNA_DEG$direction == 'Down',]
lncRNA_DownGene <- arrange(lncRNA_DownGene,logFC)
DownTop3 <- lncRNA_DownGene$gene_name[1:3]
labgene <- c(UpTop3,DownTop3)
lncRNA_DEG$lab[match(labgene,lncRNA_DEG$gene_name)] <- labgene
}
###=================记录差异基因
if(filter == "filtered"){
vn_pcDEG[[paste0(method,"-Up")]] <- pcDEG$gene_name[pcDEG$direction == 'Up']
vn_pcDEG[[paste0(method,"-Down")]] <- pcDEG$gene_name[pcDEG$direction == 'Down']
vn_lncRNA_DEG[[paste0(method,"-Up")]] <- lncRNA_DEG$gene_name[lncRNA_DEG$direction == 'Up']
vn_lncRNA_DEG[[paste0(method,"-Down")]] <- lncRNA_DEG$gene_name[lncRNA_DEG$direction == 'Down']
}
####----------绘制火山图-----------
p1 <- plotDEGvolcanoFig(data = pcDEG,
x="logFC",
y="FDR",
cut_pvalue = cutFDR,
cutFC = cutFC,
title="Tumor vs. Normal\nProtein",
group="direction",
colour= colour,
label="lab")
p2 <- plotDEGvolcanoFig(data = lncRNA_DEG,
x= "logFC",
y= "FDR",
cut_pvalue = cutFDR,
cutFC = cutFC,
title= "Tumor vs. Normal\nlncRNA",
group= "direction",
colour= colour,
label="lab")
p <- ggarrange(p1,p2,ncol = 2) # 图形排版为2列
ggsave(filename = paste0(opt_deg,method,"-",filter,"-volcano.pdf"),
plot = p,width = 11,height = 5)
##记录分析的样本信息
opinfo1 <- paste0(method,":Normal(n=",length(SamN),") vs.Tumor(n =",length(SamT),")")
opinfo2 <- paste0("\t",filter)
opinfo3 <- "\t\tprotein_coding:"
opinfo4 <- paste0("\t\t\tUp:",sum(pcDEG$direction == "Up"))
opinfo5 <- paste0("\t\t\tDown:",sum(pcDEG$direction == "Down"))
opinfo6 <- "\t\tlncRNA:"
opinfo7 <- paste0("\t\t\tUp:",sum(lncRNA_DEG$direction == "Up"))
opinfo8 <- paste0("\t\t\tDown:",sum(lncRNA_DEG$direction == "Down"))
output <- file(paste0(opt_deg,"00-notesStats.txt"), open="a+b")
writeLines(c(opinfo1,opinfo2,opinfo3,opinfo4,opinfo5,opinfo6,opinfo7,opinfo8), con=output)
close(output)
}
}
save(vn_pcDEG,vn_lncRNA_DEG,file = paste0(opt_deg,"all-DEG-DESeq2-edgeR-limma.Rdata"))
###===========3种方法的差异分析结果比较=============
upset_data <- list(vn_pcDEG=vn_pcDEG,vn_lncRNA_DEG = vn_lncRNA_DEG)
upset_p <- lapply(c("vn_pcDEG","vn_lncRNA_DEG"), function(x){
y <- fromList(upset_data[[x]])#Upset 自带函数转化数据结构
p <- upset(y,
nsets=length(y),
nintersects = 1000,
sets = names(y),
keep.order = TRUE,
point.size = 3,
line.size = 1.5,
number.angles = 0,
text.scale = c(2.5, 2, 2, 1.5,2.5),
order.by= "freq",
matrix.color="#E31A1CFF",
sets.bar.color = paletteer::paletteer_d("RColorBrewer::Paired")[1:ncol(y)],
main.bar.color = paletteer::paletteer_d("RColorBrewer::Paired")[1])
pdf(paste0(opt_deg,"all-method-unset-",x,".pdf"),width = 7,height = 6)
print(p)
dev.off()
})
##记录差异基因
DEG1 <- intersect(vn_pcDEG[["DESeq2-Up"]],vn_pcDEG[["edgeR-Up"]])
DEG2 <- intersect(vn_pcDEG[["DESeq2-Down"]],vn_pcDEG[["edgeR-Down"]])
DEG3 <- intersect(vn_pcDEG[["limma-Up"]],DEG1)
DEG4 <- intersect(vn_pcDEG[["limma-Down"]],DEG2)
output1 <- file(paste0(opt_deg,"01-DESeq2-edgeR-Up.txt"), open="a+b")
writeLines(DEG1, con=output)
close(output1)
output2 <- file(paste0(opt_deg,"02-DESeq2-edgeR-Down.txt"), open="a+b")
writeLines(DEG2, con=output2)
close(output2)
output3 <- file(paste0(opt_deg,"03-DESeq2-edgeR-limma-Up.txt"), open="a+b")
writeLines(DEG2, con=output3)
close(output3)
output4 <- file(paste0(opt_deg,"04-DESeq2-edgeR-limma-Down.txt"), open="a+b")
writeLines(DEG2, con=output4)
close(output4)
# upset_p[["vn_pcDEG"]]
# upset_p[["vn_lncRNA_DEG"]]
}
}
差异分析的函数:该函数在前面文章【基于count数据的基因差异表达分析万能代码】中有提到,获取方式在最早的差异分析教程文章中获取【一文就会TCGA数据库基因表达差异分析】,现在分享一下这个函数。后期付费上面文章当是对我的赞赏!
#### 差异分析函数 ####
##DESeq2: baseMean log2FoldChange lfcSE stat pvalue padj
##method :"DESeq2","edgeR","limma"
countsDEAnalysis <- function (counts, group, comparison, method = "DESeq2", filter = TRUE){
dge = DGEList(counts = counts)
keep <- rowSums(cpm(dge) > 1) >= 0.5 * length(group)
if (method == "DESeq2") {
coldata <- data.frame(group)
dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~group)
dds$group <- factor(dds$group, levels = rev(strsplit(comparison, "-", fixed = TRUE)[[1]]))
if (filter == TRUE) {
dds <- dds[keep, ]
}
dds <- DESeq(dds)
res <- results(dds)
DEGAll <- data.frame(res)
colnames(DEGAll) <- c("baseMean", "logFC", "lfcSE", "stat", "PValue", "FDR")
}
else if (method %in% c("edgeR", "limma")) {
group <- factor(group)
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
contrast.matrix <- makeContrasts(contrasts = comparison,
levels = design)
if (filter == TRUE) {
dge <- dge[keep, , keep.lib.sizes = TRUE]
}
dge <- calcNormFactors(dge)
if (method == "edgeR") {
dge <- estimateDisp(dge, design)
fit <- glmFit(dge, design)
lrt <- glmLRT(fit, contrast = contrast.matrix)
DEGAll <- lrt$table
DEGAll$FDR <- p.adjust(DEGAll$PValue, method = "fdr")
}
else if (method == "limma") {
v <- voom(dge, design = design, plot = FALSE)
fit <- lmFit(v, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
DEGAll <- topTable(fit2, coef = 1, n = Inf)
colnames(DEGAll) <- c("logFC", "AveExpr", "t", "PValue", "FDR", "B")
}
}
DEGAll$symbol <- rownames(DEGAll)
DEGAll <- dplyr::select(DEGAll, symbol, everything())
return(DEGAll)
}
绘制火山图的函数:
####--------------------------绘制火山图
plotDEGvolcanoFig <- function(data,x,y,cut_pvalue,cutFC,title,group,colour,label){
data <- data[,c(x,y,group,label)]
colnames(data) <- c("log2FC","FDR","group","label")
p <- ggplot(data = data, aes(x = log2FC, y = -log10(FDR), colour = group)) + #数据映射
geom_point(alpha = 1,size=2) +#散点图,alpha就是点的透明度
scale_color_manual(values = colour) + #手动调颜色c("#DC143C","#00008B", "#808080")
theme_bw() +#设定主题
geom_text_repel(label = data$label,#便签就是基因名 geom_text_repel:adds text directly to the plot.
size = 4,
segment.color = "black", #连接线的颜色,就是名字和点之间的线
show.legend = FALSE) +
theme(axis.title=element_text(size=15,face="plain",color="black"),
axis.text = element_text(size=12,face="plain",color="black"),
legend.title = element_blank(),
legend.background = element_blank(),
panel.background = element_rect(fill = "transparent",colour = "black"),
plot.background = element_blank(),
plot.title = element_text(size=15, lineheight=.8,hjust=0.5, face="plain"),
legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"))+
labs(title = title)+ #"Tumor vs. Normal"
ylab(ifelse(y == "PValue",expression(-log[10]("P Value")),expression(-log[10]("FDR")))) +#expression的作用就是让log10的10下标
xlab(expression(log[2]("Fold Change"))) +
geom_vline(xintercept = c(-cutFC, cutFC), #加垂直线,在-1和+1之间划线
lty = 2,
col = "black",
lwd = 0.3) +
geom_hline(yintercept = -log10(cut_pvalue),
lty = 2,
col = "black",
lwd = 0.3)
return(p)
}
本文分享自 MedBioInfoCloud 微信公众号,前往查看
如有侵权,请联系 cloudcommunity@tencent.com 删除。
本文参与 腾讯云自媒体同步曝光计划 ,欢迎热爱写作的你一起参与!