前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >批量COX回归生存分析图,指定挑选lncRNA基因,森林图,ROC曲线打包给你

批量COX回归生存分析图,指定挑选lncRNA基因,森林图,ROC曲线打包给你

作者头像
生信技能树
发布2019-05-13 18:55:37
6.4K0
发布2019-05-13 18:55:37
举报
文章被收录于专栏:生信技能树生信技能树

大家好,我是生信技能树学徒,昨天我们绘制了基因全景图,今天来做一下COX回归分析

数据准备

回归分析需要用到表达矩阵和样本信息,文件的下载仍旧是来源于XENA

(点评:其实也可以是突变与否的信息,或者其它组学信息,生存分析重点是研究分组,需要表达量,也是根据表达量高低进行分组而已)

代码语言:javascript
复制
# Step1 download TCGA dateset ---------------------------------------------
tumor_type <- "LIHC"
Rdata_file <- paste( './data/TCGA.', tumor_type, '.Rdata', sep = '' )
if (!file.exists( Rdata_file )) {
  raw_data <- fread( destfile, sep = '\t', header = T)
  raw_data <- as.data.frame( raw_data )
  raw_data[1:5, 1:5] 
  rownames( raw_data ) <- raw_data[, 1]
  raw_data <- raw_data[, -1]
  raw_data[1:5, 1:5]
  raw_data <- 2^raw_data - 1
  raw_data <- ceiling( raw_data )
  raw_data[1:5, 1:5]
  dim( raw_data )
  save( raw_data, file = Rdata_file )
}else{
  load( Rdata_file )
}
# 上面的代码是基于我们已经全部下载好了TCGA数据文件

# Step2 clinical information --------------------------

Rdata_file <- paste('./data/', tumor_type, '.phenoData.Rdata', sep = '')
if (!file.exists( Rdata_file )) {
  phenoData <- read.table( destfile,
                           header = T,
                           sep = '\t',
                           quote = '' )
  rownames( phenoData ) <- phenoData[ , 1]
  ## name <- gsub(pattern = "-", replacement = ".", name)
  colnames( phenoData )[1] <- "Tumor_Sample_Barcode"
  phenoData[1:5, 1:5]
  save( phenoData, file = Rdata_file )
}else{
  load( Rdata_file )
}

回归分析前数据准备

代码语言:javascript
复制
# step1 Keep only tumor samples -------------------------------------------

ncol(raw_data)
group_list <- factor(
  ifelse( as.numeric( substr( 
    colnames( raw_data ),14,15)) < 10, 'tumor', 'normal'))
table(group_list)
AssayData <- na.omit(raw_data)

AssayData <- AssayData[, group_list == 'tumor']
dim(AssayData)
AssayData[1:3, 1:3]



# step2 Only retain lncRNA ------------------------------------------------

## ENSEMBLTO SYMBOL
library( "clusterProfiler" )
library( "org.Hs.eg.db" )
keytypes(org.Hs.eg.db)
library("stringr")
rownames( AssayData ) <- str_sub(rownames( AssayData ), start = 1, end = 15)
AssayData$ENSEMBL <- rownames( AssayData )
gene_name <- bitr( AssayData$ENSEMBL, fromType = "ENSEMBL", toType = "SYMBOL", 
                   OrgDb = org.Hs.eg.db )
head( gene_name )
AssayData <- AssayData[gene_name$ENSEMBL, ]
dim(AssayData)
AssayData[1:3, 1:3]

## Pick the gene with the highest expression
gene_name$max <- apply(AssayData, 1, max)
gene_name <- gene_name[order(gene_name$SYMBOL,
                             gene_name$max,
                             decreasing = T), ]
gene_name <- gene_name[!duplicated(gene_name$SYMBOL), ]
dim( gene_name )

## pick lncRNA
{
  gene2type = read.table( './raw_data/gencode.v25lift37.annotation.gtf.gene2type' )
  colnames( gene2type ) = c( "gene", "type" )
}

dim( gene2type )
sort( table( gene2type$type ) )
gene2type = gene2type[ gene2type[,2] == 'lincRNA', ]
length( unique( gene2type$gene ) )

gene_name <- gene_name[gene_name$SYMBOL %in% gene2type$gene, ]
length(gene_name$SYMBOL)

AssayData <- AssayData[gene_name$ENSEMBL, ]

rownames(AssayData) <- gene_name$SYMBOL



# step3 Random grouping ---------------------------------------------------

set.seed(566)
k <- sample(1:ncol(AssayData), ncol(AssayData) / 2 )

t_exp <- AssayData[,k]
v_exp <- AssayData[,-k]



## Based on a data set: t_exp, v_exp, AssayData


# step4 Extract clinical information related to risk factors --------------

pheno <- phenoData[colnames(t_exp), ]
head(pheno)
colnames(pheno)
pheno[1:10, 137:138]

## race
pheno$race <- str_split(pheno$race, ' ', simplify = T)[, 1]
table(pheno$race)

## OS.time
pheno[, 137][is.na(pheno[,137])] = 0
pheno[, 138][is.na(pheno[, 138])] = 0
pheno$OS.days <- as.numeric(pheno[, 137]) + as.numeric(pheno[, 138])
pheno$time <- pheno$OS.days / 30
boxplot(pheno$time)

## status
pheno$status <- ifelse(pheno$vital_status.diagnoses == "alive", 0, 1)

## age
pheno$year_of_death.demographic[is.na(pheno$year_of_death.demographic)] <- '2016'
table(pheno$year_of_death.demographic)
pheno$age <- as.numeric(pheno$year_of_death.demographic) - as.numeric(pheno$year_of_birth.demographic)
boxplot(pheno$age)
median_age <- sort(pheno$age)[length(pheno$age) / 2]
pheno$age_group <- ifelse( pheno$age >= median_age, 'older', 'younger' )
table(pheno$age_group)

## stage
library(stringr) 
pheno$tumor_stage <- str_split(pheno$tumor_stage.diagnoses, ' ', simplify = T)[, 2]
pheno$tumor_stage <- ifelse(pheno$tumor_stage %in% c("i", "ii"),
                            "early_stage", "later_stage")
table(pheno$tumor_stage)

## bind your clinical data
pheno <- pheno[,c("Tumor_Sample_Barcode",
                  "race",
                  "gender.demographic",
                  "tumor_stage",
                  "status",
                  "time",
                  "age_group")]
colnames(pheno) <- c('ID', 'race', 'gender', 'tumor_stage', 'status', 'time', 'age_group')

pheno$ID <- toupper(pheno$ID)
dim(pheno)
head(pheno)
t_exp[1:4, 1:4]

COX回归分析

首先挑选出感兴趣的基因

代码语言:javascript
复制
# step5 COX analysis ------------------------------------------------------
## http://www.sthda.com/english/wiki/cox-proportional-hazards-model

library(survival)
## Eliminate more than half of the gene data not expressed
t_exp <- t_exp[apply(t_exp, 1, function(x) sum(x == 0)) < (ncol(t_exp)*0.49), ]
dim(t_exp)
## [1] 445 187

## single
group_data <- apply(t_exp , 1 , function(gene){
  name <- rownames(gene)
  gene <- unlist(gene)
  group <- ifelse(gene >= median(gene), 'high', 'low')
  names(group) <- name
  return(group)
})
group_data <- as.data.frame(group_data, stringsAsFactors = F)
survival_dat <- data.frame(race = pheno$race,
                           gender = pheno$gender,
                           stage = pheno$tumor_stage, 
                           age_group = pheno$age_group,
                           status = pheno$status,
                           time = pheno$time,
                           stringsAsFactors = F)
survival_dat <- cbind(group_data, survival_dat)
colnames(survival_dat) <- sub("\\-", "", colnames(survival_dat))
covariates <- as.character(colnames(survival_dat))
univ_formulas <- sapply(covariates,
                        function(x){
                          ##print(x)
                          as.formula(paste('Surv(time, status)~', x))
                        })

univ_models <- lapply( univ_formulas, function(x){coxph(x, data = survival_dat)})
# Extract data 
univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         p.value <- signif(x$wald["pvalue"], digits = 2)
                         beta <- signif(x$coef[1], digits = 2)
                         HR <- signif(x$coef[2], digits = 2)
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"], 2)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"], 2)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", HR.confint.upper, ")")
                         res <- c(beta, HR, p.value)
                         names(res) <- c("coef", "HR (95% CI for HR)", "p.value")
                         return(res)
                       })
res_single <- as.data.frame(t(do.call(cbind, univ_results)))
table(res_single$p.value <= 0.01)
res_single <- res_single[res_single$p.value <= 0.01, ]
res_single <- res_single[order(res_single$p.value), ]
single_pick <- rownames(res_single)

## multi
multi_results <- apply(t_exp , 1 , function(gene){
  ## gene <- t_exp[1, ]
  gene <- unlist(gene)
  group <- ifelse(gene >= median(gene), 'high', 'low')
  survival_dat <- data.frame(group = group,
                             race = pheno$race,
                             gender = pheno$gender,
                             stage = pheno$tumor_stage, 
                             age_group = pheno$age_group,
                             status = pheno$status,
                             time = pheno$time,
                             stringsAsFactors = F)
  res.cox <- coxph(Surv(time, status) ~ age_group + stage + gender + race + group, 
                   data =  survival_dat)
  ## summary(res.cox)
  beta <- coef(res.cox)
  se <- sqrt(diag(vcov(res.cox)))
  HR <- exp(beta)
  HRse <- HR * se

  #summary(m)
  res <- as.data.frame(round(cbind(coef = beta,
                     se = se,
                     z = beta/se,
                     p.value = 1 - pchisq((beta/se)^2, 1),
                     HR = HR,
                     HRse = HRse,
                     HRz = (HR - 1) / HRse,
                     HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
                     HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                     HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 5))
  return(res['grouplow',])
})
multi_results <- do.call(rbind, multi_results)
table(multi_results$p.value <= 0.01)
res_multi <- multi_results[multi_results$p.value <= 0.01, ]
res_multi <- res_multi[order(res_multi$p.value), ]
multi_pick <- rownames(res_multi)

overgene <- intersect(multi_pick, single_pick)

森林图

代码语言:javascript
复制
model_exp <- t(log2(t_exp[overgene,] + 1))
colnames(model_exp) <- overgene
dat <- cbind(pheno, model_exp)

dat$gender <- factor(dat$gender)
dat$tumor_stage <- factor(dat$tumor_stage)

library("survminer")
colnames(dat)
s <- Surv(time, status) ~ race + gender + tumor_stage + age_group + LINC01136 + LINC00346 + C2orf48 + LINC01139 + LINC01559
s <- Surv(time, status) ~ LINC01136 + LINC00346 + C2orf48 + LINC01139 + LINC01559

model <- coxph(s, data = dat )
summary(model, data = dat)
options(scipen = 1)
ggforest(model, data = dat, 
         main = "Hazard ratio", 
         cpositions = c(0.10, 0.22, 0.4), 
         fontsize = 1.0, 
         refLabel = "1", noDigits = 4)

生存曲线图

代码语言:javascript
复制
model_exp <- t_exp[overgene, ]
for (i in 1:nrow(model_exp)) {
  gene <- model_exp[i, ]
  name <- rownames(gene)
  gene <- unlist(gene)
  group <- ifelse(gene >= median(gene), 'high', 'low')
  survival_dat <- data.frame(group = group,
                             status = pheno$status,
                             time = pheno$time,
                             stringsAsFactors = F)
  fit <- survfit(Surv(time, status) ~ group, data = survival_dat)
  ggsurvplot(fit, data = survival_dat,
             surv.median.line = "hv",
             legend.title = "Group",
             legend.labs = c("High", "Low"),
             pval = TRUE,
             conf.int = TRUE,
             palette = "jco",
             ggtheme = theme_bw()
  )
  ggsave(filename = paste('./fig/', tumor_type, name, '.png', sep = ''))
}

ROC曲线

代码语言:javascript
复制
library(lars) 
library(glmnet) 
x <- t(log2(t_exp[overgene,] + 1))
y <- survival_dat$status 
cv_fit <- cv.glmnet(x = x, y = y, alpha = 1, nlambda = 1000) 
lasso.prob <- predict(cv_fit,
                      newx = x , 
                      s = c(cv_fit$lambda.min, cv_fit$lambda.1se) )


library(timeROC)
pheno$fp <- as.numeric(lasso.prob[,1])
with(pheno,
     ROC <<- timeROC(T = time,
                     delta = status,
                     marker = fp,
                     cause = 1,
                     weighting = "marginal",
                     times = c(20, 60),
                     ROC = TRUE,
                     iid = TRUE)
)

plot(ROC, time = 60, col = "blue", add = F)
plot(ROC, time = 20, col = "red", add = T)
本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2019-04-26,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 生信技能树 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档