前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >生物信息数据分析教程视频——07-TCGA数据库:基因的表达探索

生物信息数据分析教程视频——07-TCGA数据库:基因的表达探索

作者头像
DoubleHelix
发布2022-12-15 15:29:44
6570
发布2022-12-15 15:29:44
举报
文章被收录于专栏:生物信息云生物信息云

视频地址:http://mpvideo.qpic.cn/0b2ewiaakaaahmalygztmbrvbmwdawzaabia.f10002.mp4?

参考文章:

【0代码】单基因泛癌分析教程

视频中的代码:

代码语言:javascript
复制
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(reshape2)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(RColorBrewer)
library(ggsignif)

FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)
##可以是编码蛋白或非编码蛋白的基因
genes <- c("PTEN","EGFR","TP53","ATF1")

opt <- "output/001-RNASeq_ExpInNorAndTur/"

record_files <- dir(opt,".txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]

# ###在TCGA中没有正常样本,但可以匹配GTEx数据库中的正常样本的癌症类型
# proj4 <- c("TCGA-ACC","TCGA-LGG","TCGA-OV","TCGA-SKCM","TCGA-TGCT","TCGA-UCS" )
# ###在TCGA中正常样本大于0小于10,但可以匹配GTEx数据库中的正常样本的癌症类型
# proj5 <- c("TCGA-CESC","TCGA-GBM","TCGA-PAAD")
# 
# ###在TCGA中正常样本大于0小于10,在GTEx数据库中也没有正常样本的癌症类型
# proj6 <- c("TCGA-PCPG","TCGA-SARC","TCGA-CHOL","TCGA-THYM")

# proj = "TCGA-BLCA"

norn <- 10 #正常样本数最小数量

geneexpdata <- data.frame()
for(proj in project){
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  
  ##正常组织样本ID
  SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm)[-c(1:3)],
                                typesample = c("NT","NB","NBC","NEBV","NBM"))
  
  if(length(SamN) >= norn){
    
    ###只要mRNA表达的数据
    tpm <- filterGeneTypeExpr(expr = tpm,
                              fil_col = "gene_type",
                              filter = FALSE)
    
    ##过滤不表达的基因
    tpm <- tpm[apply(tpm,1,var) !=0,]
    
    ##判断基因是否在表达矩阵中
    gs <- intersect(genes,rownames(tpm))
    if(length(gs) != 0) {
      
      ##肿瘤组织样本ID
      SamT <- setdiff(colnames(tpm),SamN)
      
      ###去除重复样本
      nor_exp <- del_dup_sample(tpm[,SamN],col_rename = T)
      tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
      
      ###============非配对样本==
      ##构建数据框
      nor_gsExp <- log2(data.frame(nor_exp[gs,],check.names = T) + 1) %>% t() %>% 
        as.data.frame() %>% mutate(Sample = rep(paste0("Normal(n=",ncol(nor_exp),")"),ncol(nor_exp)),
                                   .before = 1)%>%  melt(id.vars = "Sample")
      
      tur_gsExp <- log2(data.frame(tur_exp[gs,],check.names = T) + 1) %>% t() %>% 
        as.data.frame() %>% mutate(Sample = rep(paste0("Tumor(n=",ncol(tur_exp),")"),ncol(tur_exp)),
                                   .before = 1)%>%  melt(id.vars = "Sample")
      data <- rbind(nor_gsExp,tur_gsExp)
      head(data)
      colnames(data) <- c("Sample","gene","exp")
      
      compaired <- list(unique(data$Sample))
      data <- mutate(data,cancer = proj,.before = 1)
      geneexpdata <- rbind(geneexpdata,data)
      ##======绘图===
      ##输出路径
      # opt <- "output/001-RNASeq_ExpInNorAndTur/"
      fp_boxplot <- paste0(opt,proj,"/boxplot")
      ifelse(dir.exists(fp_boxplot),"The folder already exists.",dir.create(fp_boxplot,recursive = T))
      
      
      ###==============多个基因在同一个图中
      if(length(gs)> 1){
        p <- ggplot(data, aes(x= gene, y = exp,color = Sample)) +
          #geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))+
          geom_boxplot(aes(color = Sample),alpha =1,
                       lwd = 0.1, outlier.size = 1,
                       outlier.colour = "white")+ #color = c("red", "blue"),
          # geom_violin(aes(fill = Sample),trim = FALSE)+
          # geom_boxplot(width = 0.1,fill = "white")+
          
          theme_bw()+ 
          labs(y= expression(log[2](TPM + 1)))+
          scale_color_manual(values = c(brewer.pal(3,"Dark2")))+ #c("#3300CC","#CC0000")
          stat_compare_means(label = "p.signif") +
          #labs(title = 'ImmuneCellAI') +
          theme(legend.position = "top",
                axis.text.x = element_text(face = "bold",colour = "#1A1A1A"),
                axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
                axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
                legend.title = element_blank(),
                axis.title.x = element_blank(),
                legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
          ) 
        ggsave(filename = paste0(fp_boxplot,"/all_gene.pdf"),
               plot = p,height=5,width= length(gs)*0.9)
      }
      # g <- gs[1]
      ####================单一绘制==============
      lapply(gs, function(g){
        ldat <- data[data$gene == g,]
        p = ggplot(ldat,aes(Sample,exp,fill= Sample))+
          geom_boxplot(aes(fill = Sample),notch = FALSE,
                       position = position_dodge(width =0.01,preserve = "single"),
                       outlier.alpha  =1,width=0.4) +
          scale_fill_manual(values=c(brewer.pal(5,"Set1")))+
          geom_signif(comparisons = compaired,
                      step_increase = 0.1,
                      map_signif_level = T,
                      margin_top = 0.2,
                      test = "wilcox.test")+
          labs(y= paste0("The expression of ",g,"\nlog2(TPM + 1)"),title= proj)+
          theme_classic()+
          theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                plot.title = element_text(hjust = 0.5),
                axis.line=element_line(colour="black",size=0.25),
                axis.title=element_text(size=10,face="plain",color="black"),
                axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
                axis.title.x=element_text(size=12,face="plain",color="black"),
                axis.title.y=element_text(size=12,face="plain",color="black"),
                axis.text = element_text(size=12,color="black"),
                legend.position="none"
          ) 
        #p
        ggsave(filename = paste0(fp_boxplot,"/",g,"-Unpaired .pdf"),plot = p,width = 2.5,height = 5)
      })
      
      ###===============
      fp_violin <- paste0(opt,proj,"/violin")
      ifelse(dir.exists(fp_violin),"The folder already exists.",dir.create(fp_violin,recursive = T))
      
      lapply(gs, function(g){
        ldat <- data[data$gene == g,]
        p = ggplot(ldat, aes(Sample,exp,fill= Sample))+ 
          geom_violin(aes(fill = Sample),trim = FALSE)+
          geom_signif(comparisons = compaired,
                      step_increase = 0.1,
                      map_signif_level = T,
                      margin_top=0.2,
                      tip_length =0.02,
                      test = "wilcox.test")+
          geom_boxplot(width = 0.1,fill = "white")+
          scale_fill_manual(values=c(brewer.pal(3,"Dark2")))+
          theme_classic()+
          labs(y= paste0("The expression of ",g,"\nlog2(TPM + 1)"),title= proj)+
          theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                plot.title = element_text(hjust = 0.5),
                axis.line=element_line(colour="black",size=0.25),
                axis.title.x = element_blank(),
                axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
                axis.text = element_text(size=12,face="plain",color="black"),
                legend.position="none"
          )
        p
        ggsave(filename = paste0(fp_violin,"/",g,"-Unpaired .pdf"),plot = p,width = 3,height = 5)
      })
      
      ###===========配对样本=========
      id <- intersect(colnames(nor_exp),colnames(tur_exp))
      
      if(length(id) > 3){
        paired_nor_gsExp <- nor_exp[gs,id]
        paired_tur_gsExp <- tur_exp[gs,id]
        
        paired_nor_gsExp <- log2(data.frame(paired_nor_gsExp,check.names = T) + 1) %>% t() %>% 
          as.data.frame() %>% mutate(Sample = rep("Normal",ncol(paired_nor_gsExp)),
                                     id = id,
                                     .before = 1)%>%  melt(id.vars = c("Sample","id"))
        
        paired_tur_gsExp <- log2(data.frame(paired_tur_gsExp,check.names = T) + 1) %>% t() %>% 
          as.data.frame() %>% mutate(Sample = rep("Tumor",ncol(paired_tur_gsExp)),
                                     id = id,
                                     .before = 1)%>%  melt(id.vars =c("Sample","id"))
        paired_data <- rbind(paired_nor_gsExp, paired_tur_gsExp)
        head(paired_data)
        colnames(paired_data) <- c("Sample","id","gene","exp")
        paired_compaired <- list(unique(paired_data$Sample))
        
        ###===============
        lapply(gs, function(g){
          ldat <- paired_data[paired_data$gene == g,]
          
          
          #为了防止配对样本信息错乱,先构造一个配对样本的数据集
          dat1_paried <- reshape2::dcast(ldat[c('Sample', 'id', 'exp')], Sample~id)
          rownames(dat1_paried) <- dat1_paried$Sample
          dat1_paried <- t(dat1_paried[,-1] )%>%as.data.frame()
          head(dat1_paried)
          #配对 t 检验,双侧检验
          p.value <- t.test(dat1_paried$Normal, dat1_paried$Tumor, paired = TRUE, alternative = 'two.sided', conf.level = 0.95)
          pv <- p.value[["p.value"]]
          stasig <- ifelse(pv >= 0.001,paste0('p value = ',round(pv,3)),"p value < 0.01")
          
          p1 <- ggplot(ldat, aes(x = Sample, y = exp,colour = Sample)) +
            # geom_boxplot(aes(fill = Sample), show.legend = FALSE, width = 0.6) +  #绘制箱线图展示肿瘤组织和正常组织的两组基因表达整体分布
            geom_point(size = 3) +  #绘制散点表示单个样本的基因表达信息
            geom_line(aes(group = id), color = 'black', lwd = 0.05) +  #绘制样本连线,通过 aes(group) 参数指定配对样本信息
            scale_fill_manual(values = c('#FE7280', '#AC88FF')) +  #以下是颜色分配、主题更改等,不再多说
            theme_classic()+ 
            labs(y= paste0("The expression of ",g,"\nlog2(TPM + 1)"),title= proj)+
            theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                  plot.title = element_text(hjust = 0.5),
                  axis.line=element_line(colour="black",size=0.25),
                  axis.title.x = element_blank(),
                  axis.text.x = element_text(face = "plain",colour = "black"),
                  axis.text = element_text(size=12,face="plain",color="black"),
                  legend.position="none"
            )+ annotate('text', 
                        x= ifelse(min(dat1_paried$Normal) > min(dat1_paried$Tumor),1,2),
                        y= min(ldat$exp + 0.5),
                        label = stasig ,size = 4)
          ggsave(filename = paste0(fp_boxplot,"/",g,"-paired .pdf"),plot = p1,width = 3,height = 3)
        })

      }
      ##记录分析的样本信息
      opinfo <- paste0(proj,":\n\t","Normal(n)=",length(SamN),";Tumor(n)=",length(SamT))
      output <- file(paste0(opt,"annotation.txt"), open="a+b")
      writeLines(opinfo, con=output)
      close(output)
      
    }
    
  }
  else if(length(SamN) == 0){
    output <- file(paste0(opt," zeroNormalSamples_CancerType.txt"), open="a+b")
    writeLines(proj, con=output)
    close(output)
  }else {
    opinfo <- paste0(proj,":\n\t","Normal(n)=",length(SamN))
    output <- file(paste0(opt,"lessNormalSamples_CancerType.txt"), open="a+b")
    writeLines(opinfo, con=output)
    close(output)
  }
}
###============单基因泛癌=============
###===============
fp_pan <- paste0(opt,"/pan-cancer")
ifelse(dir.exists(fp_pan),"The folder already exists.",dir.create(fp_pan,recursive = T))
lapply(unique(geneexpdata$gene), function(g){
  # g = unique(geneexpdata$gene)[1]
  pan_exp <- geneexpdata[geneexpdata$gene == g,]
  head(pan_exp)
  pan_exp$Sample[grep("Normal",pan_exp$Sample)] <- "Normal"
  pan_exp$Sample[grep("Tumor",pan_exp$Sample)] <- "Tumor"
  # c(brewer.pal(3,"Dark2"))
  
  p <- ggplot(pan_exp, aes(x=cancer, y = exp,color = Sample)) +
    #geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))+
    geom_boxplot(aes(color = Sample),alpha =1,
                 lwd = 0.1, outlier.size = 1,
                 outlier.colour = "white")+ #color = c("red", "blue"),
    # geom_violin(aes(fill = Sample),trim = FALSE)+
    # geom_boxplot(width = 0.1,fill = "white")+
    
    theme_bw()+ 
    labs(y= paste0("The expression of ",g,"\nlog2(TPM + 1)"))+
    scale_color_manual(values = c(brewer.pal(3,"Dark2")))+ #c("#3300CC","#CC0000")
    stat_compare_means(label = "p.signif") +
    #labs(title = 'ImmuneCellAI') +
    theme(legend.position = "top",
          axis.text.x = element_text(angle = 45,face = "bold",colour = "#1A1A1A",hjust=1,vjust=1),
          axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
          axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
          legend.title = element_blank(),
          axis.title.x = element_blank(),
          legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
    ) 
  
  ggsave(filename = paste0(fp_pan,"/",g,"_in_pan-cancer.pdf"),
         plot = p,height=5,width=9)

})

filterGeneTypeExpr函数代码:

代码语言:javascript
复制
#filter :"protein_coding","lncRNA","miRNA","misc_RNA","snRNA","miRNA","scRNA",....
filterGeneTypeExpr <- function(expr,fil_col = "gene_type",filter = FALSE){
  
  ##Delete unexpressed genes(rows)in all samples from expr.
  dat <- expr[apply(expr[,-c(1:3)], 1, var)!=0,]
  ##rowSums
  dat <- dplyr::mutate(dat,Sums = rowSums(dat[,-c(1:3)]),.before = 4)
  dat <- dplyr::arrange(dat,gene_name,desc(Sums))
  dat <- dat[!duplicated(dat$gene_name),]
  rownames(dat) <- dat$gene_name
  
  if (filter == FALSE) {
    return(dat[,-c(1:4)])
    
  }else{
    dat <- dat[dat[,fil_col] == filter,-c(1:4)]
    return(dat)
  }
}

del_dup_sample函数代码:

代码语言:javascript
复制
##TCGA肿瘤样本去重
del_dup_sample <- function(data,col_rename =T){
  
  data <- data[,sort(colnames(data))]
  pid = gsub("(.*?)-(.*?)-(.*?)-.*","\\1-\\2-\\3",colnames(data))
  library(dplyr)
  
  reps = pid[duplicated(pid)]##重复样本
  if(length(reps)== 0 ){
    message("There were no duplicate patient samples for this data")
  }else{
    data <- data[,!duplicated(pid)]
  }
  if(col_rename == T){
    colnames(data) <- gsub("(.*?)-(.*?)-(.*?)-.*","\\1-\\2-\\3",colnames(data))
  }
  return(data)
}
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2022-09-28,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 MedBioInfoCloud 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档