前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >生物信息数据分析教程视频——10-TCGA数据库:miRNA的表达探索

生物信息数据分析教程视频——10-TCGA数据库:miRNA的表达探索

作者头像
DoubleHelix
发布2022-12-15 15:40:29
6980
发布2022-12-15 15:40:29
举报
文章被收录于专栏:生物信息云

视频地址:http://mpvideo.qpic.cn/0bc3ueaacaaagqalujrtqfrvbiodagqqaaia.f10002.mp4?

代码:

代码语言:javascript
复制
library(TCGAbiolinks)
library(reshape2)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(RColorBrewer)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-miRNA_Isoform_Exp",
                "Isoform_Expression_data.Rdata$",
                full.names = T)
mirs <- c("hsa-miR-5590-3p","hsa-miR-142-5p","hsa-let-7f-5p")

opt <- "output/002-miR_ExpInNorAndTur/"

record_files <- dir(opt,".txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")

###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-BLCA"
norn <- 10 #正常样本数最小数量
geneexpdata <- data.frame()
for(proj in project){
  message(proj)
  load(FilePath[grep(proj,FilePath)])#
  rpm <- miRNA_data[["RPM"]]
  conmirs <- intersect(rownames(rpm),mirs)
  if(length(conmirs)> 0){
    ##正常组织样本
    SamN <- TCGAquery_SampleTypes(barcode = colnames(rpm),
                                  typesample = c("NT","NB","NBC","NEBV","NBM"))
    if(length(SamN)>= norn){
      
      ##肿瘤组织样本
      SamT <- setdiff(colnames(rpm),SamN)

      # dim(rpm[,SamT])
      #过滤重复样本
      nor_exp = del_dup_sample(rpm[,sort(SamN)],col_rename = T)
      tur_exp = del_dup_sample(rpm[,sort(SamT)],col_rename = T)
      
      
      ###============非配对样本==
      ##构建数据框
      nor_gsExp <- log2(data.frame(nor_exp[conmirs,],check.names = T) + 1) %>% t() %>% 
        as.data.frame() %>% mutate(Sample = rep(paste0("Normal(n=",ncol(nor_exp),")"),ncol(nor_exp)),
                                   .before = 1)%>%  melt(id.vars = "Sample")
      
      tur_gsExp <- log2(data.frame(tur_exp[conmirs,],check.names = T) + 1) %>% t() %>% 
        as.data.frame() %>% mutate(Sample = rep(paste0("Tumor(n=",ncol(tur_exp),")"),ncol(tur_exp)),
                                   .before = 1)%>%  melt(id.vars = "Sample")
      data <- rbind(nor_gsExp,tur_gsExp)
      head(data)
      colnames(data) <- c("Sample","miR","exp")
      compaired <- list(unique(data$Sample))
      data <- mutate(data,cancer = proj,.before = 1)
      geneexpdata <- rbind(geneexpdata,data)
      ##======绘图===
      ##输出路径
      # opt <- "output/001-RNASeq_ExpInNorAndTur/"
      fp_boxplot <- paste0(opt,proj,"/boxplot")
      ifelse(dir.exists(fp_boxplot),"The folder already exists.",dir.create(fp_boxplot,recursive = T))
      
      
      ###==============多个基因在同一个图中
      if(length(conmirs)> 1){
        p <- ggplot(data, aes(x= miR, y = exp,color = Sample)) +
          #geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))+
          geom_boxplot(aes(color = Sample),alpha =1,
                       lwd = 0.1, outlier.size = 1,
                       outlier.colour = "white")+ #color = c("red", "blue"),
          # geom_violin(aes(fill = Sample),trim = FALSE)+
          # geom_boxplot(width = 0.1,fill = "white")+
          
          theme_bw()+ 
          labs(y= expression(log[2](RPM + 1)))+
          scale_color_manual(values = c(brewer.pal(3,"Dark2")))+ #c("#3300CC","#CC0000")
          stat_compare_means(label = "p.signif") +
          #labs(title = 'ImmuneCellAI') +
          theme(legend.position = "top",
                axis.text.x = element_text(angle = 45,face = "bold",colour = "#1A1A1A",hjust = 1,vjust = 1),
                axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
                axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
                legend.title = element_blank(),
                axis.title.x = element_blank(),
                legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
          ) 
        ggsave(filename = paste0(fp_boxplot,"/all_miR.pdf"),
               plot = p,height=5,width= ifelse(length(conmirs)< 5,5,length(conmirs)*0.9))
      }
      # g <- conmirs[1]
      ####================单一绘制==============
      f2 <- lapply(conmirs, function(g){
        ldat <- data[data$miR == g,]
        p = ggplot(ldat,aes(Sample,exp,fill= Sample))+
          geom_boxplot(aes(fill = Sample),notch = FALSE,
                       position = position_dodge(width =0.01,preserve = "single"),
                       outlier.alpha  =1,width=0.4) +
          scale_fill_manual(values = c(brewer.pal(5,"Set1")))+
          geom_signif(comparisons = compaired,
                      step_increase = 0.1,
                      map_signif_level = T,
                      margin_top = 0.2,
                      test = "wilcox.test")+
          labs(y= paste0("The expression of ",g,"\nlog2(RPM + 1)"),title= proj)+
          theme_classic()+
          theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                plot.title = element_text(hjust = 0.5),
                axis.line=element_line(colour="black",size=0.25),
                axis.title=element_text(size=10,face="plain",color="black"),
                axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
                axis.title.x=element_text(size=12,face="plain",color="black"),
                axis.title.y=element_text(size=12,face="plain",color="black"),
                axis.text = element_text(size=12,color="black"),
                legend.position="none"
          ) 
        #p
        ggsave(filename = paste0(fp_boxplot,"/",g,"-Unpaired .pdf"),
               plot = p,width = 2.5,height = 5)
      })
      
      ###===============
      fp_violin <- paste0(opt,proj,"/violin")
      ifelse(dir.exists(fp_violin),"The folder already exists.",dir.create(fp_violin,recursive = T))
      
      lapply(conmirs, function(g){
        ldat <- data[data$miR == g,]
        p = ggplot(ldat, aes(Sample,exp,fill= Sample))+ 
          geom_violin(aes(fill = Sample),trim = FALSE)+
          geom_signif(comparisons = compaired,
                      step_increase = 0.1,
                      map_signif_level = T,
                      margin_top=0.2,
                      tip_length =0.02,
                      test = "wilcox.test")+
          geom_boxplot(width = 0.1,fill = "white")+
          scale_fill_manual(values=c(brewer.pal(3,"Dark2")))+
          theme_classic()+
          labs(y= paste0("The expression of ",g,"\nlog2(RPM + 1)"),title= proj)+
          theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                plot.title = element_text(hjust = 0.5),
                axis.line=element_line(colour="black",size=0.25),
                axis.title.x = element_blank(),
                axis.text.x = element_text(angle = 45,face = "plain",colour = "black",hjust=1,vjust=1),
                axis.text = element_text(size=12,face="plain",color="black"),
                legend.position="none"
          )
        p
        ggsave(filename = paste0(fp_violin,"/",g,"-Unpaired .pdf"),plot = p,width = 3,height = 5)
      })
      
      ###===========配对样本=========
      id <- intersect(colnames(nor_exp),colnames(tur_exp))
      if(length(id) > 3){
        paired_nor_gsExp <- nor_exp[conmirs,id]
        paired_tur_gsExp <- tur_exp[conmirs,id]
        
        paired_nor_gsExp <- log2(data.frame(paired_nor_gsExp,check.names = T) + 1) %>% t() %>% 
          as.data.frame() %>% mutate(Sample = rep("Normal",ncol(paired_nor_gsExp)),
                                     id = id,
                                     .before = 1)%>%  melt(id.vars = c("Sample","id"))
        
        paired_tur_gsExp <- log2(data.frame(paired_tur_gsExp,check.names = T) + 1) %>% t() %>% 
          as.data.frame() %>% mutate(Sample = rep("Tumor",ncol(paired_tur_gsExp)),
                                     id = id,
                                     .before = 1)%>%  melt(id.vars =c("Sample","id"))
        paired_data <- rbind(paired_nor_gsExp, paired_tur_gsExp)
        head(paired_data)
        colnames(paired_data) <- c("Sample","id","miR","exp")
        paired_compaired <- list(unique(paired_data$Sample))
        
        ###===============
        lapply(conmirs, function(g){
          ldat <- paired_data[paired_data$miR == g,]
          #为了防止配对样本信息错乱,先构造一个配对样本的数据集
          dat1_paried <- reshape2::dcast(ldat[c('Sample', 'id', 'exp')], Sample~id)
          rownames(dat1_paried) <- dat1_paried$Sample
          dat1_paried <- t(dat1_paried[,-1] )%>%as.data.frame()
          head(dat1_paried)
          #配对 t 检验,双侧检验
          p.value <- t.test(dat1_paried$Normal, dat1_paried$Tumor, paired = TRUE, alternative = 'two.sided', conf.level = 0.95)
          pv <- p.value[["p.value"]]
          stasig <- ifelse(pv >= 0.001,paste0('p value = ',round(pv,3)),"p value < 0.01")
          
          p1 <- ggplot(ldat, aes(x = Sample, y = exp,colour = Sample)) +
            # geom_boxplot(aes(fill = Sample), show.legend = FALSE, width = 0.6) +  #绘制箱线图展示肿瘤组织和正常组织的两组基因表达整体分布
            geom_point(size = 3) +  #绘制散点表示单个样本的基因表达信息
            geom_line(aes(group = id), color = 'black', lwd = 0.05) +  #绘制样本连线,通过 aes(group) 参数指定配对样本信息
            scale_fill_manual(values = c('#FE7280', '#AC88FF')) +  #以下是颜色分配、主题更改等,不再多说
            theme_classic()+ 
            labs(y= paste0("The expression of ",g,"\nlog2(TPM + 1)"),title= proj)+
            theme(panel.background=element_rect(fill="white",colour="black",size=0.25),
                  plot.title = element_text(hjust = 0.5),
                  axis.line=element_line(colour="black",size=0.25),
                  axis.title.x = element_blank(),
                  axis.text.x = element_text(face = "plain",colour = "black"),
                  axis.text = element_text(size=12,face="plain",color="black"),
                  legend.position="none"
            )+ annotate('text', 
                        x= ifelse(min(dat1_paried$Normal) > min(dat1_paried$Tumor),1,2),
                        y= min(ldat$exp + 0.5),
                        label = stasig ,size = 4)
          ggsave(filename = paste0(fp_boxplot,"/",g,"-paired .pdf"),plot = p1,width = 3,height = 3)
        })
        
      }
      ##记录分析的样本信息,只记录一次
      opinfo <- paste0(proj,":\n\t","Normal(n)=",length(SamN),";Tumor(n)=",length(SamT))
      output <- file(paste0(opt,"annotation.txt"), open="a+b")
      writeLines(opinfo, con=output)
      close(output)
      
    }
    else if(length(SamN) == 0){
      output <- file(paste0(opt," zeroNormalSamples_CancerType.txt"), open="a+b")
      writeLines(proj, con=output)
      close(output)
    }else {
      opinfo <- paste0(proj,":\n\t","Normal(n)=",length(SamN))
      output <- file(paste0(opt,"lessNormalSamples_CancerType.txt"), open="a+b")
      writeLines(opinfo, con=output)
      close(output)
    }
    
  }
 
}
###============单基因泛癌=============
###===============
fp_pan <- paste0(opt,"/pan-cancer")
ifelse(dir.exists(fp_pan),"The folder already exists.",dir.create(fp_pan,recursive = T))
lapply(unique(geneexpdata$miR), function(g){
  # g = unique(geneexpdata$miR)[1]
  pan_exp <- geneexpdata[geneexpdata$miR == g,]
  head(pan_exp)
  pan_exp$Sample[grep("Normal",pan_exp$Sample)] <- "Normal"
  pan_exp$Sample[grep("Tumor",pan_exp$Sample)] <- "Tumor"
  # c(brewer.pal(3,"Dark2"))
  
  p <- ggplot(pan_exp, aes(x=cancer, y = exp,color = Sample)) +
    #geom_jitter(size = 0.5,aes(x=variable, y=value,color = Group))+
    geom_boxplot(aes(color = Sample),alpha =1,
                 lwd = 0.1, outlier.size = 1,
                 outlier.colour = "white")+ #color = c("red", "blue"),
    # geom_violin(aes(fill = Sample),trim = FALSE)+
    # geom_boxplot(width = 0.1,fill = "white")+
    
    theme_bw()+ 
    labs(y= paste0("The expression of ",g,"\nlog2(RPM + 1)"))+
    scale_color_manual(values = c(brewer.pal(3,"Dark2")))+ #c("#3300CC","#CC0000")
    stat_compare_means(label = "p.signif") +
    #labs(title = 'ImmuneCellAI') +
    theme(legend.position = "top",
          axis.text.x = element_text(angle = 45,face = "bold",colour = "#1A1A1A",hjust=1,vjust=1),
          axis.text.y = element_text(face = "bold",colour = "#1A1A1A"),
          axis.title = element_text(size = 12,face = "bold", colour = "#1A1A1A"),
          legend.title = element_blank(),
          axis.title.x = element_blank(),
          legend.text = element_text(size = 12, face = "bold",colour = "#1A1A1A")
    ) 
  
  ggsave(filename = paste0(fp_pan,"/",g,"_in_pan-cancer.pdf"),
         plot = p,height=5,width=9)
  
})
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2022-10-01,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 MedBioInfoCloud 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档