前些天我布置了一个学徒作业:这一个图背后是12个差异分析的综合
作业参考的文献:Integrated analysis reveals five potential ceRNA biomarkers in human lung adenocarcinoma
所以我设置的学徒作业是:下载TCGA数据库中LUSC的转录组信号值矩阵,LUSC病人分成了4类T1-4亚型分别与Normal组做差异分析,就是3*4=12个表达矩阵,12次差异分析,画PCA图,热图,火山图,以及用于差异分析结果比较的Venn图。
下面让我们一起看看一个优秀学徒的表演,该学徒很久以前在我们这里分享过他跨专业进入生信学习圈子的感悟:在华大工作五年还不如生信技能树3天?
下载数据
紧跟群主的TCGA视频课程,从UCSC的XENA下载LUSC表达矩阵,临床信息,探针注释GMT文件!
视频课程见:4个小时TCGA肿瘤数据库知识图谱视频教程又有学习笔记啦
rm(list = ls())
pd=read.table("TCGA-LUSC.GDC_phenotype.tsv.gz",header=T,sep = "\t", quote = "\"",dec = ".", fill = TRUE, comment.char = "",row.names = 1)
gset_mRNA=read.table("TCGA-LUSC.htseq_counts.tsv.gz",header=T,sep = "\t", quote = "\"",dec = ".", fill = TRUE, comment.char = "",row.names = 1)
gset_miRNA=read.table("TCGA-LUSC.mirna.tsv.gz",header=T,sep = "\t",row.names = 1)
#由于导入数据列名自动把“-”变为了“.”,可用gsub替换回来。
colnames(gset_miRNA)=gsub("\\.","-", colnames(gset_miRNA))
colnames(gset_mRNA)=gsub("\\.","-", colnames(gset_mRNA))
# 去掉mRNA和miRNA表达矩阵不一致的样本
table(colnames(gset_mRNA) %in% colnames(gset_miRNA))
gset_mRNA=gset_mRNA[,colnames(gset_mRNA) %in% colnames(gset_miRNA)]
gset_miRNA=gset_miRNA[,colnames(gset_miRNA) %in% colnames(gset_mRNA)]
table(substring(colnames(gset_mRNA),14,16))
table(pd$pathologic_T)
# TCGA数据库的样本分组规律(感谢Jimmy大神提供):Tumor types range from 01 - 09, normal types from 10 - 19 and control samples from 20 - 29.
# 分期里T1分为了T1a,T1b,T1;T2分为了T2a,T2b,T2;这里就不细分了,分别合在一起组成T1和T2期亚群
group_list_mRNA=ifelse(substring(colnames(gset_mRNA),14,15) == "11","Normal",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T2",]),"T2",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T3",]),"T3",ifelse(colnames(gset_mRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T4",]),"T4","q"))))))
group_list_miRNA=ifelse(substring(colnames(gset_miRNA),14,15) == "11","Normal",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T1",]),"T1",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T2",]),"T2",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T3",]),"T3",ifelse(colnames(gset_miRNA) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == "T4",]),"T4","q"))))))
table(group_list_mRNA)
table(group_list_miRNA)
# 查下TCGA官网数据的说明,mRNA表达矩阵取了log2(fpkm+1),miRNA表达矩阵取了log2(RPM+1),所以这里要还原回去
head(gset_miRNA)
head(gset_mRNA)
gset_miRNA=(2^gset_miRNA -1)*1000 #差异分析用的RPKM,所以要乘以1000
gset_mRNA=2^gset_mRNA -1
# mRNA表达矩阵包括codingRNA和lncRNA,需要拆分下
library(rtracklayer)
library(SummarizedExperiment)
gtf1 <- rtracklayer::import('gencode.v22.annotation.gtf/gencode.v22.annotation.gtf')
gtf_df <- as.data.frame(gtf1)
head(gtf_df)
ensem2symbol <- gtf_df[gtf_df$type == 'gene',c('gene_id', 'gene_type', 'gene_name', 'source')]
rownames(ensem2symbol) <- ensem2symbol$gene_id
head(sort(table(ensem2symbol$gene_type),decreasing = T))
gset_cdRNA=gset_mRNA[rownames(gset_mRNA) %in% rownames(ensem2symbol[ensem2symbol$gene_type == "protein_coding",]),]
gset_lncRNA=gset_mRNA[rownames(gset_mRNA) %in% rownames(ensem2symbol[ensem2symbol$gene_type == "lincRNA",]),]
#保存表达矩阵和分组信息
save(pd,ensem2symbol,gset_miRNA,gset_cdRNA,gset_lncRNA,group_list_mRNA,group_list_miRNA,file = "step1_output.Rdata")
DESeq2差异分析
## 全部肿瘤样本及正常样本表达矩阵的PCA图,热图
rm(list = ls())
load(file = "step1_output.Rdata")
dat=gset_cdRNA
group_list=ifelse(substring(colnames(gset_cdRNA),14,15) == "11","Normal","Tumor")
### 去掉低质量的探针行
dat=dat[apply(dat,1, function(x) sum(x>1) > 50),]
### 检查数据
table(group_list)
source("run_check_h_pca.R")
pro = "cdRNA_Tumor_vs_Normal"
run_check_h_pca(pro)
## T1-T4亚型组与正常组表达矩阵分别差异分析
### 去掉低质量的探针行
gset=gset_cdRNA
gset=gset[apply(gset,1, function(x) sum(x>1) > 50),]
### for循环批量差异分析
source("function.R")
for (Typel in c("T1","T2","T3","T4")) {
pro=paste0("cdRNA_",Typel,"_vs_Normal_2")
run_filter_check_DESeq2(Typel,pro)
}
## 比较T1-4分别与正常样本差异基因情况
rm(list=ls())
load("cdRNA_T1_vs_Normal_deseq2_DEG.Rdata")
p0=p1 # 一个赋值小错误,由于包装函数里是p1,所以导致赋值时p1和p4一样,所以暂且改成p0赋值了
DEG_T1=DEG
load("cdRNA_T2_vs_Normal_deseq2_DEG.Rdata")
p2=p1
DEG_T2=DEG
load("cdRNA_T3_vs_Normal_deseq2_DEG.Rdata")
p3=p1
DEG_T3=DEG
load("cdRNA_T4_vs_Normal_deseq2_DEG.Rdata")
p4=p1
DEG_T4=DEG
library(ggplot2)
library(gridExtra)
plots <- list(p0,p2,p3,p4)
p= grid.arrange(grobs = plots, ncol = 2,
top = "compare")
ggsave(p,filename = "cdRNA_compare_volcano.png",width = 20,height = 15)
venn <- function(x,y,z,a,name){
if(!require(VennDiagram))install.packages('VennDiagram')
library (VennDiagram)
venn.diagram(x= list(T1_vs_Normal = x,T2_vs_Normal = y,T3_vs_Normal = z,T4_vs_Normal = a),filename = "cdRNA_DEGgene.png",
height = 1000, width = 1000,resolution =300, imagetype="tiff", col="transparent",fill=c("cornflowerblue","green","yellow","darkorchid1"),alpha = 0.5, cex=0.45, cat.cex=0.45)
}
DEG_T1$change!="NOT"
DEGs=function(df){
rownames(df)[df$change!="NOT"]
}
library(VennDiagram)
venn(DEGs(DEG_T1),DEGs(DEG_T2),DEGs(DEG_T3),DEGs(DEG_T4),
"DEGgene"
)
DEGsl = intersect(intersect(intersect(DEGs(DEG_T1),DEGs(DEG_T2)),DEGs(DEG_T3)),DEGs(DEG_T4))
length(DEGsl)
总结
函数代码
run_check_h_pca <- function(pro = "T1_vs_Normal"){
rm(list = ls()) ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
table(group_list)
# 每次都要检测数据
dat[1:4,1:4]
## 下面是画PCA的必须操作,需要看说明书。
dat=t(dat)#画PCA图时要求是行名时样本名,列名时探针名,因此此时需要转换
dat=as.data.frame(dat)#将matrix转换为data.frame
dat=cbind(dat,group_list) #cbind横向追加,即将分组信息追加到最后一列
library("FactoMineR")#画主成分分析图需要加载这两个包
library("factoextra")
# The variable group_list (index = 54676) is removed
# before PCA analysis
dat.pca <- PCA(dat[,-ncol(dat)], graph = FALSE)#现在dat最后一列是group_list,需要重新赋值给一个dat.pca,这个矩阵是不含有分组信息的
fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = dat$group_list, # color by groups
# palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
ggsave(filename = paste0(pro,'_all_samples_PCA.png'))
rm(list = ls()) ## 魔幻操作,一键清空~
load(file = 'step1_output.Rdata')
dat[1:4,1:4]
cg=names(tail(sort(apply(dat,1,sd)),1000))#apply按行('1'是按行取,'2'是按列取)取每一行的方差,从小到大排序,取最大的1000个
library(pheatmap)
#pheatmap(dat[cg,],show_colnames =F,show_rownames = F) #对那些提取出来的1000个基因所在的每一行取出,组合起来为一个新的表达矩阵
n=t(scale(t(dat[cg,]))) # 'scale'可以对log-ratio数值进行归一化
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]
pheatmap(n,show_colnames =F,show_rownames = F)
ac=data.frame(g=group_list)
rownames(ac)=colnames(n) #把ac的行名给到n的列名,即对每一个探针标记上分组信息(是'noTNBC'还是'TNBC')
## 可以看到TNBC具有一定的异质性,拿它来区分乳腺癌亚型指导临床治疗还是略显粗糙。
pheatmap(n,show_colnames =F,show_rownames = F,
annotation_col=ac,filename = paste0(pro,"_heatmap_top1000_sd.png"))
rm(list = ls())
load(file = 'step1_output.Rdata')
dat[1:4,1:4]
exprSet=dat
pheatmap::pheatmap(cor(exprSet))
# 组内的样本的相似性应该是要高于组间的!
colD=data.frame(group_list=group_list)
rownames(colD)=colnames(exprSet)
pheatmap::pheatmap(cor(exprSet),
annotation_col = colD,
show_rownames = F,
filename = paste0(pro,"_cor_all.png"))
dim(exprSet)
exprSet=exprSet[apply(exprSet,1, function(x) sum(x>1) > 5),]
dim(exprSet)
exprSet=log(edgeR::cpm(exprSet)+1)
dim(exprSet)
exprSet=exprSet[names(sort(apply(exprSet, 1,mad),decreasing = T)[1:500]),]
dim(exprSet)
M=cor(log2(exprSet+1))
pheatmap::pheatmap(M,
show_rownames = F,
annotation_col = colD,
filename = paste0(pro,"_cor_top500.png"))
}
run_filter_check_DESeq2 <- function(Typel,pro){
#按照T1-T4分类肺癌组织样本的得到四个表达矩阵
RNA_Normal=gset[,substring(colnames(gset),14,15) == "11"]
RNA_LUSC=gset[,substring(colnames(gset),14,15) == "01"]
colnames(RNA_Normal)
colnames(RNA_LUSC)
# T1-4期亚型的表达矩阵
RNA_T=RNA_LUSC[,colnames(RNA_LUSC) %in% rownames(pd[substring(pd[,"pathologic_T"],1,2) == Typel,])]
dat=cbind(RNA_Normal,RNA_T)
colnames(dat)
group_list=c(rep("Normal",38),rep(Typel,ncol(RNA_T)))
table(group_list)
## 差异分析 DESeq2
if(!require(DESeq2))BiocManager::install("DESeq2")
library(DESeq2)
#需修改results()的contrast参数
#输入:表达矩阵和分组信息
#输出:差异分析结果、火山图
#构建colData (condition存在于colData中,是表示分组的因子型变量)
countData <- floor(dat)
colData <- data.frame(row.names =colnames(countData),
condition=group_list)
dds <- DESeqDataSetFromMatrix(
countData = countData,
colData = colData,
design = ~ condition)
dds <- DESeq(dds)
# 两两比较
res <- results(dds, contrast = c("condition",Typel,"Normal"))
resOrdered <- res[order(res$padj),] # 按照P值排序
DEG <- as.data.frame(resOrdered)
DEG <- na.omit(DEG)
#添加change列标记基因上调下调,在DESeq里FDR就是pdaj,所以要把pvalue改成pdaj
#logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
logFC_cutoff <- 1
DEG$change = as.factor(
ifelse(DEG$padj < 0.01 & abs(DEG$log2FoldChange) > logFC_cutoff,
ifelse(DEG$log2FoldChange >= logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
boxplot(as.numeric(countData[rownames(DEG)[1],])~group_list)
# Heatmap热图
choose_gene=head(rownames(DEG),100) ## 选定部分差异基因
choose_matrix=countData[choose_gene,]
pheatmap::pheatmap(choose_matrix)
annotation_col = data.frame(
group = factor(group_list))
rownames(annotation_col) = colnames(countData)
pheatmap::pheatmap(choose_matrix,
scale = "row",
annotation_col = annotation_col)
# 火山图
library(ggplot2)
this_tile <- paste0('Cutoff for log2FoldChange is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,
'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',])
)
p1 = ggplot(data=DEG,
aes(x=log2FoldChange, y=-log10(padj),color=change)) +
geom_point(alpha=0.4, size=3.5) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 padj") +
geom_vline(xintercept=c(-logFC_cutoff,logFC_cutoff),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(0.05),lty=4,col="black",lwd=0.8) +
ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','grey','red')) ## corresponding to the levels(res$change)
p1
ggsave(p1, filename = paste0(pro,"deseq2_vocalno.png"), width = 10, height = 7 )
save(DEG,p1,file = paste0(pro,"_deseq2_DEG.Rdata"))
}文末友情宣传
强烈建议你推荐给身边的博士后以及年轻生物学PI,多一点数据认知,让他们的科研上一个台阶: