前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >单基因生信分析流程(3)一文解决生存分析和临床参数相关分析

单基因生信分析流程(3)一文解决生存分析和临床参数相关分析

作者头像
用户1359560
发布2019-05-15 10:43:31
2.9K0
发布2019-05-15 10:43:31
举报
文章被收录于专栏:生信小驿站生信小驿站

本文目的

(1)绘制生存分析图 (2)临床参数相关分析

  • 加载所必须的包
代码语言:javascript
复制
# ===============================================================

#load  package

# ===============================================================

library(TCGAbiolinks)
library(SummarizedExperiment)
setwd('D:\\train\\single_gene')
library(dplyr)
library(survival)
library(survminer)

rm(list=ls())
  • 通过TCGAbiolinks下载TCGA生存相关信息(将其中的OS和OStime留作后用)
代码语言:javascript
复制
# ===============================================================

#download survival data

# ===============================================================

query <- GDCquery(project = "TCGA-KIRC", 
                  data.category = "Clinical", 
                  file.type = "xml")
GDCdownload(query)
clinical <- GDCprepare_clinic(query, clinical.info = "patient")



clinical_trait <- clinical  %>%
  dplyr::select(bcr_patient_barcode,gender,vital_status,                            
                days_to_death,days_to_last_followup,race_list,
                person_neoplasm_cancer_status,age_at_initial_pathologic_diagnosis,
                laterality,neoplasm_histologic_grade,stage_event_pathologic_stage,             
                stage_event_tnm_categories  ) %>%
  distinct( bcr_patient_barcode, .keep_all = TRUE)  


#整理死亡患者的临床信息
dead_patient <- clinical_trait  %>%
  dplyr::filter(vital_status == 'Dead') %>%
  dplyr::select(-days_to_last_followup) %>%
  reshape::rename(c(bcr_patient_barcode = 'Barcode',
                    gender = 'Gender',
                    vital_status = 'OS',
                    days_to_death='OS.Time',
                    race_list = 'Race',
                    person_neoplasm_cancer_status='cancer_status',
                    age_at_initial_pathologic_diagnosis = 'Age',
                    neoplasm_histologic_grade = 'Grade',
                    stage_event_pathologic_stage = 'Stage',
                    stage_event_tnm_categories = 'TNM' )) %>%
  mutate(OS=ifelse(OS=='Dead',1,0))%>%
  mutate(OS.Time=OS.Time/365)



#整理生存患者的临床信息
alive_patient <- clinical_trait %>%
  dplyr::filter(vital_status == 'Alive') %>%
  dplyr::select(-days_to_death) %>%
  reshape::rename(c(bcr_patient_barcode = 'Barcode',
                    gender = 'Gender',
                    vital_status = 'OS',
                    days_to_last_followup='OS.Time',
                    race_list = 'Race',
                    person_neoplasm_cancer_status='cancer_status',
                    age_at_initial_pathologic_diagnosis = 'Age',
                    neoplasm_histologic_grade = 'Grade',
                    stage_event_pathologic_stage = 'Stage',
                    stage_event_tnm_categories = 'TNM' )) %>%
  mutate(OS=ifelse(OS=='Dead',1,0))%>%
  mutate(OS.Time=OS.Time/365)

#合并两类患者,得到肾透明细胞癌的临床信息
survival_data <- rbind(dead_patient,alive_patient)

write.csv(survival_data , file = 'KIRC_survival.csv')



survival_data <- subset(survival_data,select=c(Barcode,OS ,OS.Time))
  • 读取表达量数据(前文所得)
代码语言:javascript
复制
# ===============================================================

#load expression data

# ===============================================================



data <- read.csv('KIRC_mRNA_exprSet.csv',header = T,row.names = 1)

marker <- 'ERBB2'


data1 <- data[which(rownames(data) %in% marker),]

data1 <- as.data.frame(t(data1))

row.names(data1) <- substr(row.names(data1),start = 1,stop = 12)
row.names(data1) <- chartr(old='.',new = '-',x=row.names(data1))
data1$Barcode <- row.names(data1)


data1 <- subset(data1,select= c(ERBB2,Barcode))


dt <- merge(data1,survival_data ,by='Barcode')
  • 将患者根据ERBB2表达量分为高低两组(高于中位值和不高于中位值),通过KM法绘制生存曲线
代码语言:javascript
复制
# ===============================================================
#plot survival figure
# ===============================================================

dt_OS <-  dt %>%
  dplyr::select(OS,OS.Time,ERBB2)
dt_OS <- na.omit(dt_OS)
dt_OS['ERBB2'] <- ifelse(dt_OS['ERBB2'] > median(dt_OS[,'ERBB2']), 'high', 'low' )
fit <- survfit(Surv(OS.Time,OS)~ERBB2,data=dt_OS)
getwd()
pdf("ERBB2_os.pdf",height = 8,width = 8)
ggpar( ggsurvplot(
  fit,   data = dt_OS,   
  pval = TRUE,           
  conf.int = TRUE, 
  legend.title = "ERBB2",
  xlim = c(0,5),  
  xlab = "Time in years",  
  break.time.by = 1,  
  pval.size = 8,
  risk.table = "absolute", 
  risk.table.y.text.col = T,
  risk.table.y.text = FALSE,
  risk.table.fontsize = 5,
  legend.labs =  c("high", "Low"),   
  palette = c("#E41A1C","#377EB8")),
  font.y  = c(16, "bold"), 
  font.x  = c(16, "bold"),
  legend = "top",
  font.legend = c(16, "bold"))
dev.off()
  • 临床参数相关分析
代码语言:javascript
复制
# ===============================================================

#clinical parameters

# ===============================================================

survival_data <- rbind(dead_patient,alive_patient)



data <- merge(data1,survival_data,by='Barcode')



library(stringr)


data$Te2  <-   str_extract(data$TNM, "T\\d")


data$Te1  <-   str_extract(data$TNM, "T\\d[a-z]+")
data$Te1 <- ifelse(is.na(data$Te1),data$Te2,data$Te1)


data$N  <-   str_extract(data$TNM, "N\\d")
data$N  <-   str_extract(data$N, "\\d")

data$M <-   str_extract(data$TNM, "M\\d")
data$M <-   str_extract(data$M, "\\d")


data$Age <- ifelse(data$Age > 60, 'old','yong')



data$stage1  <- str_trim(str_extract(data$Stage, "\\s[H-Z]+"),
                         side = c("both", "left", "right"))






data$stage1 


data$stage2  <- str_extract(data$Stage, "\\s[A-Z]+")
data$stage2

data$expression <- data$ERBB2 


#Age
data1 <- na.omit(data)
t.test(expression~Age,data = data1)

table(data$Age)

mean(na.omit(data[data$Age=='old','expression']))

sd(na.omit(data[data$Age=='old','expression']))


mean(na.omit(data[data$Age=='yong','expression']))
sd(na.omit(data[data$Age=='yong','expression']))



#Gender
t.test(expression~Gender,data = data)

mean(na.omit(data[data$Gender=='FEMALE','expression']))
sd(na.omit(data[data$Gender =='FEMALE','expression']))


mean(na.omit(data[data$Gender=='MALE','expression']))
sd(na.omit(data[data$Gender =='MALE','expression']))



#Tumor location


data1 <- subset(data,data$Location == 'Central Lung' | data$Location=='Peripheral Lung')
table(data1$Location)

t.test(expression~Location,data = data1)

mean(na.omit(data[data$Location=='Central Lung','expression']))
sd(na.omit(data[data$Location =='Central Lung','expression']))


mean(na.omit(data[data$Location=='Peripheral Lung','expression']))
sd(na.omit(data[data$Location =='Peripheral Lung','expression']))






#stage
table(data$stage1)

table(data$stage1)
data$stage1[data$stage1  == "I"] <-  'Stage I-II'
data$stage1[data$stage1  == "II"] <-  'Stage I-II'
data$stage1[data$stage1  == "III"] <-  'Stage III-IV'
data$stage1[data$stage1  == "IV"] <-  'Stage III-IV'
table(data$stage1)


data1 <- subset(data,data$stage1=='Stage I-II' | data$stage1 =='Stage III-IV')
table(data$stage1)



t.test(expression~stage1,data = data1)

mean( na.omit(data[data$stage1=='Stage I-II','expression']))
sd( na.omit(data[data$stage1 =='Stage I-II','expression']))


mean( na.omit(data[data$stage1=='Stage III-IV','expression']))
sd( na.omit(data[data$stage1 =='Stage III-IV','expression']))



#T
table(data$Te1)
table(data$Te2)


data$Te2[data$Te2  == "T1"] <-  'T1-T2'
data$Te2[data$Te2  == "T2"] <-  'T1-T2'
data$Te2[data$Te2  == "T3"] <-  'T3-T4'
data$Te2[data$Te2  == "T4"] <-  'T3-T4'
table(data$Te2)



t.test(expression~Te2,data = data)

mean(na.omit(data[data$Te2=='T1-T2','expression']))
sd( na.omit(data[data$Te2 =='T1-T2','expression']))
mean(na.omit(data[data$Te2=='T3-T4','expression']))
sd( na.omit(data[data$Te2 =='T3-T4','expression']))




#N
table(data$N)


data$N[data$N == "0"] <-  'NO'
data$N[data$N  == "1"] <-  'Yes'
data$N[data$N  == "2"] <-  'Yes'
data$N[data$N  == "3"] <-  'Yes'
table(data$N)



t.test(expression~N,data = data)

mean(na.omit(data[data$N=='NO','expression']))
sd( na.omit(data[data$N =='NO','expression']))
mean(na.omit(data[data$N=='Yes','expression']))
sd( na.omit(data[data$N =='Yes','expression']))




#M
table(data$M)


data$M[data$M== "0"] <-  'NO'
data$M[data$M  == "1"] <-  'Yes'

table(data$M)



t.test(expression~M,data = data)

mean(na.omit(data[data$M  =='NO','expression']))
sd(na.omit(data[data$M    =='NO','expression']))
mean(na.omit(data[data$M  =='Yes','expression']))
sd(na.omit(data[data$M    =='Yes','expression']))

基于上述的计算过程,我们得到这张表格所需的所有信息:

本文参与 腾讯云自媒体分享计划,分享自作者个人站点/博客。
原始发表:2019.05.10 ,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 作者个人站点/博客 前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 本文目的
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档