options("repos" = c(CRAN="http://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
options(BioC_mirror="http://mirrors.tuna.tsinghua.edu.cn/bioconductor/")
cran_packages <- c('tidyr',
'tibble',
'dplyr',
'stringr',
'ggplot2',
'ggpubr',
'factoextra',
'FactoMineR',
'pheatmap',
"survival",
"survminer",
"patchwork",
"statsExpressions",
"ggstatsplot",
"ggplotify",
"cowplot",
"glmnet",
"ROCR",
"caret",
"randomForest",
"survminer",
"Hmisc",
"e1071",
"deconstructSigs",
"AnnoProbe",
"timeROC",
"circlize",
"VennDiagram",
"tinyarray")
Biocductor_packages <- c("limma",
"clusterProfiler",
"org.Hs.eg.db",
"SummarizedExperiment",
"DESeq2",
"edgeR",
"ggpubr",
"rtracklayer",
"genefilter",
"maftools",
"ComplexHeatmap")
for (pkg in cran_packages){
if (! require(pkg,character.only=T) ) {
install.packages(pkg,ask = F,update = F)
require(pkg,character.only=T)
}
}
# use BiocManager to install
for (pkg in Biocductor_packages){
if (! require(pkg,character.only=T) ) {
BiocManager::install(pkg,ask = F,update = F)
require(pkg,character.only=T)
}
}
#前面的任何信息都先不要管。主要看这里
for (pkg in c(Biocductor_packages,cran_packages)){
require(pkg,character.only=T)
}
#没有任何提示就是成功了,如果有warningxx包不存在,用library检查一下。
#报错就回去重新安装。如果你没有安装xx包,却提示你xx包不存在,这也正常,是因为依赖关系,缺啥补啥。
选定癌症--下载数据(途径:1.Xena 2.TCGAbiolinks 3.gdc-client. 需要下载的数据有:表达数据,临床信息)
差异分析 (常用的方法:edgeR / DESeq2 / limma。 结果:差异基因热图,火山图 ,PCA图 , 韦恩图)
生存分析(KM-plot / log-rank test / 单因素cox回归)
构建生存模型(方法:Lasso回归 / cox多因素回归 / /随机森林 / 支持向量机。 目的:选出关键基因 / 风险分数计算)
模型预测和评估(ROC曲线 / C-index)
counts矩阵,行名是symbol-reads计数数据
拿不到count数据如何让做差异分析?
tpm:用limma做差异分析(迫不得已)
fpkm、rpkm:转换为tpm,用limma做差异分析(迫不得已) https://mp.weixin.qq.com/s/_DtkxSfLGQHcRju66J4yTQ
RSEM:三大R包都可。 https://www.jianshu.com/p/46b048220b88
只需认准count数据即可,自己的数据还是公共数据、数据库、背景知识均不影响差异分析。
整理输入数据的过程不同,差异分析无差别。数据下载方式不同,是否是count矩阵,行名需要是基因名,分组信息如何获取。
不同的数据,数据整理的终点不同,但起点不同,所以代码不同,不通用。
在GEO官网网页下载数据:表达数据 - counts.csv.gz文件(点击ftp);临床信息表格:series matrix.txt.gz
proj = "cov"
#1.获取表达矩阵
dat = data.table::fread("GSE150392_Cov_Mock_Raw_COUNTS.csv.gz")
####保留symbol ,去重复,再设为行名
library(stringr)
b = dat$V1 %>% str_split("_",simplify = T)
#24行是异常数据,检查它
dat$V1[24]
#解决办法:删除PAR_Y_
dat$V1 = str_remove(dat$V1,"PAR_Y_")
dat$V1[24]
b = dat$V1 %>% str_split("_",simplify = T)
#36850以后是异常数据,检查
dat$V1[36850]
# 删除ERCC开头的行
k = !str_starts(dat$V1,"ERCC-");table(k)
dat = dat[k,]
b = dat$V1 %>% str_split("_",simplify = T)
# 按照symbol去重复
dat = cbind(b[,2],dat[,-1])
library(dplyr)
dat = distinct(dat,V1,.keep_all = T)
# 把symbol设为行名
##方法1:
exp = dat[,-1]
rownames(exp) = dat$V1
exp = as.matrix(exp)
##方法2:
library(tibble)
exp2 = column_to_rownames(dat,"V1")
rm(list = ls())
proj = "cov"
#1.获取表达矩阵
dat = data.table::fread("GSE150392_Cov_Mock_Raw_COUNTS.csv.gz")
# 保留ensemblid ,行名转换
# 删除ERCC开头的行
k = !str_starts(dat$V1,"ERCC-");table(k)
dat = dat[k,]
library(stringr)
b = dat$V1 %>% str_split("_",simplify = T)
exp = as.matrix(dat[,-1])
rownames(exp) = b[,1]
library(tinyarray)
exp = trans_exp_new(exp)
colnames(exp)
Group = str_remove(colnames(exp),"\\d")
Group = factor(Group,levels = c("Mock","Cov"))
###基因过滤(具体方法参考TCGA数据整理代码中数据过滤的方法)
###此处使用过滤标准2:仅保留在一半以上样本里表达的基因
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
library(GEOquery)
eSet = getGEO("GSE150392",destdir = ".",getGPL = F)
eSet = eSet[[1]]
clinical = pData(eSet)
# 顺便看下表达矩阵,空的
dim(exprs(eSet))
save(exp,Group,proj,clinical,file = paste0(proj,".Rdata"))
下载两个文件。 临床信息表格:series matrix.txt.gz
表达数据:count-with-symbol.xls.gz文件(点击ftp)
rm(list = ls())
proj = "DHA"
#1.获取表达矩阵
dat = data.table::fread("GSE162550_gene_sample_count_with_symbol.xls.gz" data.table = F)
# 保留ensembl id ,行名转换
exp = as.matrix(dat[,4:9])
rownames(exp) = dat[,1]
library(tinyarray)
exp = trans_exp_new(exp)
colnames(exp)
Group = rep(c("DMSO","DHA"),each = 3)
Group = factor(Group,levels = c("DMSO","DHA"))
###基因过滤(具体方法参考TCGA数据整理代码中数据过滤的方法)
###此处使用过滤标准2:仅保留在一半以上样本里表达的基因
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
library(GEOquery)
eSet = getGEO("GSE162550",destdir = ".",getGPL = F)
eSet = eSet[[1]]
clinical = pData(eSet)
# 顺便看下表达矩阵,空的
dim(exprs(eSet))
save(exp,Group,proj,clinical,file = paste0(proj,".Rdata"))
TCGA差异分析的输入数据整理
rm(list = ls())
library(TCGAbiolinks)
library(stringr)
library(SummarizedExperiment)
projs <- getGDCprojects()$project_id %>%
str_subset("TCGA")
projs
方法一:去这个链接,找到你要的癌症的count和临床信息数据,下载下来放在工作目录下
https://share.weiyun.com/ZMQdPBLC 密码:xjlshh
proj = "TCGA-CHOL"
load("chol_exp.Rdata")
exp = chol
方法二:
proj = "TCGA-CHOL"
f1 = paste0(proj,"expf.Rdata")
if(!file.exists(f1)){
query = GDCquery(project = proj,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts",)
GDCdownload(query)
dat = GDCprepare(query)
exp = assay(dat)
#tpm = assay(dat,4)
save(exp,file = f1)
}
load(f1)
方法一:和上一步的方法一对应
load("chol_clinical.Rdata")
clinical = chol_clinical
方法二:和上一步的方法二对应:
f2 = paste0(proj,"clf.Rdata")
if(!file.exists(f2)){
query = GDCquery(project = proj,
data.category = "Clinical",
data.type = "Clinical Supplement",
file.type = "xml" )
GDCdownload(query)
dat = GDCprepare_clinic(query,clinical.info = "patient")
k = apply(dat, 2, function(x){!all(is.na(x))});table(k)
clinical = dat[,k]
save(clinical,file = f2)
}
load(f2)
library(tinyarray)
exp = trans_exp_new(exp)
exp[1:4,1:4]
查看过滤之前基因数量:
nrow(exp)
常用过滤标准1:仅去除在所有样本里表达量都为零的基因
exp1 = exp[rowSums(exp)>0,]
nrow(exp1)
常用过滤标准2(推荐):仅保留在一半以上样本里表达的基因
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)
根据样本ID的第14-15位,给样本分组(tumor和normal)
Group = make_tcga_group(exp)
table(Group)
save(exp,Group,proj,clinical,file = paste0(proj,".Rdata"))
1.和Gtex联合分析;2.不做T-N差异分析;3.从GEO数据库中找T-N的数据做差异分析,差异基因在TCGA里面继续分析。
输入数据都是count矩阵和对应的分组信息。
rm(list = ls())
load("TCGA-CHOL.Rdata")
table(Group)
library(DESeq2)
colData <- data.frame(row.names =colnames(exp),
condition=Group)
if(!file.exists(paste0(proj,"_dd.Rdata"))){
dds <- DESeqDataSetFromMatrix(
countData = exp,
colData = colData,
design = ~ condition)
dds <- DESeq(dds)
save(dds,file = paste0(proj,"_dd.Rdata"))}
###以上步骤是限速步骤,设置了存在即跳过
load(file = paste0(proj,"_dd.Rdata"))
class(dds)
res <- results(dds, contrast = c("condition",rev(levels(Group))))
#constrast
c("condition",rev(levels(Group))) ###换了分组,也同样适用,不需要自己修改代码。
class(res)
DEG1 <- as.data.frame(res)
library(dplyr)
DEG1 <- arrange(DEG1,pvalue)
DEG1 = na.omit(DEG1)
head(DEG1)
#添加change列标记基因上调下调
logFC_t = 2
pvalue_t = 0.05
k1 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange < -logFC_t);table(k1)
k2 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange > logFC_t);table(k2)
DEG1$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
##up/down大小写,单词什么的都不重要,只要前后统一就行。
table(DEG1$change)
head(DEG1)
####⚠️注意列名是什么,logFC和log2FoldChange,写代码时注意。log2FoldChange是DESeq2中的列名,
logFC是limma中的列名。
library(edgeR)
dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)
design <- model.matrix(~Group)
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)
fit <- glmLRT(fit)
DEG2=topTags(fit, n=Inf)
class(DEG2)
DEG2=as.data.frame(DEG2)
head(DEG2)
k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t)
k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t)
DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
head(DEG2)
table(DEG2$change)
#####limma做转录组差异分析,多了一步voom标准化。和芯片数据差异分析的列名是一样的,但是内容不一样
library(limma)
dge <- edgeR::DGEList(counts=exp)
dge <- edgeR::calcNormFactors(dge)
design <- model.matrix(~Group)
v <- voom(dge,design, normalize="quantile") ###标准化操作
fit <- lmFit(v, design)
fit= eBayes(fit)
DEG3 = topTable(fit, coef=2, n=Inf)
DEG3 = na.omit(DEG3)
k1 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC < -logFC_t)
k2 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC > logFC_t)
DEG3$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG3$change)
head(DEG3)
tj = data.frame(deseq2 = as.integer(table(DEG1$change)),
edgeR = as.integer(table(DEG2$change)),
limma_voom = as.integer(table(DEG3$change)),
row.names = c("down","not","up")
);tj
save(DEG1,DEG2,DEG3,Group,tj,file = paste0(proj,"_DEG.Rdata"))
UP=function(df){rownames(df)[df$change=="UP"]}
###3个数据,上下调分开提取,写6行代码,产生6个变量-可以有上面的代码。
###参数是一个数据框,对他的行名取子集,取出change列是UP的行名。
###三个R包差异分析结果都有统一的change列,所以可以用相同的函数取子集。
###不出错的前提:行名是基因名,有change列,change列有UP的取值。
###有了这个函数,提取上调基因的代码就变成UP(DEG1),起到简化代码的作用。
DOWN=function(df){
rownames(df)[df$change=="DOWN"]}
up = intersect(intersect(UP(DEG1),UP(DEG2)),UP(DEG3))
down = intersect(intersect(DOWN(DEG1),DOWN(DEG2)),DOWN(DEG3))
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group)
#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DEG1),
edgeR = UP(DEG2),
limma = UP(DEG3))
down_genes = list(Deseq2 = DOWN(DEG1),
edgeR = DOWN(DEG2),
limma = DOWN(DEG3))
up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")
#维恩图拼图,终于搞定
library(patchwork)
#up.plot + down.plot
# 拼图
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
# cpm 去除文库大小的影响
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
cg1 = rownames(DEG1)[DEG1$change !="NOT"]
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
cg3 = rownames(DEG3)[DEG3$change !="NOT"]
###画热图
h1 = draw_heatmap(dat[cg1,],Group)
h2 = draw_heatmap(dat[cg2,],Group)
h3 = draw_heatmap(dat[cg3,],Group)
###画火山图
v1 = draw_volcano(DEG1,pkg = 1,logFC_cutoff = logFC_t)
###想让火山图对称的话加上一个参数:symmetry=T
v2 = draw_volcano(DEG2,pkg = 2,logFC_cutoff = logFC_t)
v3 = draw_volcano(DEG3,pkg = 3,logFC_cutoff = logFC_t)
library(patchwork)(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")
ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
###画图后会出现分组与聚类不匹配的问题,没有错误,但是不好解释
期待值:tumor和normal各成一簇,但是实际上不一定是这样的。
成一簇:说明画热图的基因在两个分组间有明显的表达模式;
不成一簇:说明画热图的基因在两个分组间表达模式不是特别明显;
换一组基因或者增删基因,可能改变聚类的结果。
分组和聚类是两件独立的事情,聚类以样本为单位,而不是以分组为单位,每个样本属于那个分组的信息是已知的。
希望各成一簇,解决办法:
1、增删、换基因;
2、取消聚类 cluster_cols = F
a、前提:矩阵的顺序是先tumor后normal,或者先normal后tumor,不聚类时,热图列的顺序与矩阵的顺序完全匹配。
b、如果取消聚类后没有各成一簇,说明表达矩阵的顺序是乱的。
load("TCGA-CHOL.Rdata")
load("TCGA-CHOL_DEG.Rdata")
cg1 = rownames(DEG1)[DEG1$change !="NOT"]
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
cg3 = rownames(DEG3)[DEG3$change !="NOT"]
library(tinyarray)
cg = intersect_all(cg1,cg2,cg3)
gs = sample(cg,100)
dat = log2(cpm(exp)+1)
draw_heatmap(dat[gs,],Group)
以上代码输出的结果出现分组与聚类不匹配的问题,先试试取消聚类的效果
draw_heatmap(dat[gs,],Group,cluster_cols = F )
取消聚类后,没有各成一簇,说明表达矩阵的顺序是乱的。需要先排序,再画图
如何调整表达矩阵的顺序,让tumor和normal各成一簇。根据Group
colData = data.frame(col = colnames(dat),Group =Group )
colData = arrange(colData,Group)
n = dat[gs,colData$col]
draw_heatmap(n,colData$Group,cluster_cols = F )
分组聚类
注意事项:group因子水平不能反 ;没有scale参数,需要自行scale。
library(ComplexHeatmap)
library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), c("#2fa1dd", "white", "#f87669"))
top_annotation = HeatmapAnnotation(
cluster = anno_block(gp = gpar(fill = c("#f87669","#2fa1dd")),
labels = levels(Group),
labels_gp = gpar(col = "white", fontsize = 12)))
m = Heatmap(t(scale(t(exp[c(up,down),]))),name = " ",
col = col_fun,
top_annotation = top_annotation,
column_split = Group,
show_heatmap_legend = T,
border = F,
show_column_names = F,
show_row_names = F,
use_raster = F,
cluster_column_slices = F,
column_title = NULL)
m
生信技能树学习之TCGA数据库挖掘
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。