生物信息数据分析教程视频——12-基因之间的相关性分析及可视化

DoubleHelix
发布于 2022-12-15 15:56:30
1.2K1
发布于 2022-12-15 15:56:30
文章被收录于专栏：生物信息云
视频地址：http://mpvideo.qpic.cn/0bc3niaasaaatialkbbtenrva2wdbfvaacia.f10002.mp4?
参考：
如何合理的展示相关性分析结果？？
代码：

# https://mp.weixin.qq.com/s/X2kybql--KPgjUumWX4z9Q
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(WGCNA)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(corrplot)
library(pheatmap)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

geneset1 <- c("EGFR","PTEN","ATF1","ATF3","ADORA1")
geneset2 <- c("MVP","MGLL","TNS3","ANXA6","EHD2","TGM2","DPYSL2","TWF2","EHD1","RRBP1","RRAS")

opt <- "output/003-CorrelationsBetween_mRNAs/"

record_files <- dir(opt,"RecordInformation.txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
pandata <- data.frame()

for(proj in project){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  if(length(intersect(rownames(tpm),geneset1))> 0& length(intersect(rownames(tpm),geneset2))> 0 ){
    
    ##正常组织样本ID
    SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                  typesample = c("NT","NB","NBC","NEBV","NBM"))
    
    ##肿瘤组织样本ID
    SamT <- setdiff(colnames(tpm),SamN)
    
    ###去除重复样本
    tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
    ###long2转换
    tur_exp <- log2(tur_exp + 1)
    
    gs1 <- intersect(rownames(tpm),geneset1)
    gs2 <- intersect(rownames(tpm),geneset2)
    g1exp <- data.frame(t(tur_exp[gs1,]))
    g2exp <- data.frame(t(tur_exp[gs2,]))
    
    # method = "pearson"
    for(method in c("pearson","spearman")){
      
   
      cor_coef <- cor(g1exp,g2exp,method = method )
      cor_p <- corPvalueStudent(cor_coef,nrow(g1exp))

      col2 <- colorRampPalette(c("#3300CC","#3399FF","white","#FF3333","#CC0000"),alpha = TRUE)
  
      # 相关性热图，基因数量太少，不绘制
      if(min(length(gs1),length(gs2)) >= 3){
        fp_corrplot <- paste0(opt,proj,"/corrplot/")
        ifelse(dir.exists(fp_corrplot),"The folder already exists.",dir.create(fp_corrplot,recursive = T))
        
        display_numbers = matrix(ifelse(cor_p > 0.01 | is.na(cor_p), "×", ""),
                                 nrow(cor_p))
        
        pdf(paste0(fp_corrplot,method,"-corrplot.pdf"),
            width = ifelse(length(gs1)>=4,length(gs1)*0.9,3 ),
            height = ifelse(length(gs2)>=4,length(gs1)*0.9,3 ))
        corrplot(cor_coef, col = col2(100),method = 'square',cl.length=5,
                 tl.col="black",tl.cex = 1,cl.pos = "r",cl.ratio = 0.2,
                 cl.align.text = "l",
                 p.mat = cor_p, sig.level = c(.001, .01, .05),outline="white",
                 insig = "label_sig",pch.cex = 1.2, pch.col = "black")%>% print()
        
        pheatmap(cor_coef,
                 #annotation_col =annotation_col,
                 # annotation_row = annotation_row,
                 color = col2(50),
                 cluster_cols =F,
                 fontsize=6,
                 cluster_rows = F,
                 fontsize_row=6,
                 cellwidth =10,
                 cellheight =10,
                 fontsize_number = 16,
                 display_numbers= display_numbers,
                 #scale="row",
                 show_colnames=T,
                 fontsize_col=8,
                 main = paste0(method," correlation")) %>% print()
        
        dev.off()
      }
   
      ###
      # g1 = gs1[1]
      # g2 = gs2[1]
      fp_point <- paste0(opt,proj,"/point/")
      ifelse(dir.exists(fp_point),"The folder already exists.",dir.create(fp_point,recursive = T))
      
      for(g1 in gs1){
        for(g2 in gs2){
          coefficient = round(cor_coef[g1,g2],2)
          pvalue <- round(cor_p[g1,g2],3)
          pv <- ifelse(pvalue < 0.001,"p value < 0.001",paste0("p value = ",pvalue))
          
          txt <- ifelse(method == "pearson",paste0("Pearson correlation coefficient:",coefficient,"\n",pv),
                        paste0(paste0("Spearman correlation coefficient:",coefficient,"\n",pv)))
          data <- data.frame(g1exp[,g1],g2exp[,g2])
          colnames(data) <- c("x","y")
          p <- ggplot(data = data, aes(x = x, y = y)) + #数据映射
            geom_point(alpha = 0.6,shape = 19,size=3,color="#DC143C") +#散点图，alpha就是点的透明度
            #geom_abline()+
            labs(title = txt)+
            geom_smooth(method = lm, formula = y ~ x,aes(colour = "lm"), size = 1.2,se = T)+
            scale_color_manual(values = c("#808080")) + #手动调颜色c("#DC143C","#00008B", "#808080")
            theme_bw() +#设定主题
            theme(axis.title=element_text(size=15,face="plain",color="black"),
                  axis.text = element_text(size=12,face="plain",color="black"),
                  legend.position =  "none",
                  panel.background = element_rect(fill = "transparent",colour = "black"),
                  plot.background = element_blank(),
                  plot.title = element_text(size=15, lineheight=.8,hjust=0.5, face="plain"),
                  legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"))+
            ylab(paste0("The expression of ",g2,"\nlog2(TPM +1)")) + #expression的作用就是让log10的10下标
            xlab(paste0("The expression of ",g1,"\nlog2(TPM +1)"))
          ggsave(paste0(fp_point,g1,"-",g2,"-",method,"-point.pdf"),
                 plot = p,
                 width = 5,height = 5)
          
          onedata <- data.frame(gene1 = g1,
                                gene2 = g2,
                                coef = coefficient,p = pvalue,method = method,
                                cancer = proj)
          pandata <- rbind(pandata,onedata)
        }
      }
    }
 
  }
}

###==========泛癌=======
fp_pan <- paste0(opt,"pan-cancer/")
ifelse(dir.exists(fp_pan),"The folder already exists.",dir.create(fp_pan,recursive = T))

record_files <- dir(fp_pan,".txt$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


cut_ceof <- 0.3
cut_pval <- 0.01
method = "pearson"
for(method in c("pearson","spearman")){
  subdata <- pandata[pandata$method == method,]
  # g1 <- unique(subdata$gene1)[1]
  for(g1 in unique(subdata$gene1)){
    subdata1 <- subdata[subdata$gene1 == g1,]
    # g2 <- unique(subdata$gene2)[1]
    for(g2 in unique(subdata$gene2)){
      subdata2 <- subdata1[subdata1$gene2 == g2,]
 
      subdata2$significance <- ifelse(abs(subdata2$coef)> cut_ceof &subdata2$p < cut_pval,
                                      ifelse(subdata2$coef > 0,"positive","negative"),"ns")
      
      if(length(unique(subdata2$significance)) > 1){
        labslogi <- abs(subdata2$coef) > cut_ceof & subdata2$p < cut_pval
        
        ###筛选要在散点图中展现的癌症类型标签：前3个===============
        if(length(unique(subdata2$significance)) == 3){
          cl <- c("#00008B", "#808080","#DC143C")
          ###正相关的最多显示3个
          posdat <- subdata2[subdata2$significance == "positive",] %>% arrange(desc(coef))
          if(length(posdat$significance)> 3){
            poscancer <- posdat$cancer[1:3]
          }else{poscancer <- posdat$cancer}
          ###负相关的最多显示3个
          negdat <- subdata2[subdata2$significance == "negative",] %>% arrange(coef)
          if(length(negdat$significance)> 3){
            negcancer <- negdat$cancer[1:3]
          }else{negcancer <- negdat$cancer}

          labscancer <- c(poscancer,negcancer)
        }else if(sum(subdata2$significance == "positive")!=0){
          cl <- c("#808080","#DC143C")
          posdat <- subdata2[subdata2$significance == "positive",] %>% arrange(desc(coef))
          if(length(posdat$significance)> 3){
            labscancer <- posdat$cancer[1:3]
          }else{labscancer <- posdat$cancer}
          
        }else{
          cl <- c("#DC143C", "#808080")
          negdat <- subdata2[subdata2$significance == "negative",] %>% arrange(coef)
          if(length(negdat$significance)> 3){
            labscancer <- negdat$cancer[1:3]
          }else{labscancer <- negdat$cancer}
        }
        subdata2$labs <- ""
        subdata2$labs[match(labscancer,subdata2$cancer)] <- labscancer
        subdata2$labs <- gsub("TCGA-","",subdata2$labs)
        ###^^^^^^^^^^^^^^^^^^^
        ###如果p值为0.随机生成极小p值
        subdata2$nlog10p <- -log10(subdata2$p)
        if(sum(subdata2$p == 0)>0){
          nInf <- length(grep("Inf",subdata2$nlog10p))
          subdata2$nlog10p[grep("Inf",subdata2$nlog10p)] <- -log10(runif(nInf,0.000000000001,0.0000001) )
        }
        
        p <-  ggplot(data = subdata2, aes(x = coef,
                                          y = nlog10p,
                                          colour = significance)) + #数据映射
          geom_point(alpha = 0.9,shape = 19,size=3) +#散点图，alpha就是点的透明度
          scale_color_manual(values = cl) +
          theme_bw() +#设定主题
          geom_text_repel(label = subdata2$labs,
                          size = 5,
                          segment.color = "black", #连接线的颜色，就是名字和点之间的线
                          show.legend = FALSE) +
          labs(title = paste0())+
          theme(axis.title=element_text(size=15,face="plain",color="black"),
                axis.text = element_text(size=12,face="plain",color="black"),
                legend.position =  "none",
                panel.background = element_rect(fill = "transparent",colour = "black"),
                plot.background = element_blank(),
                plot.title = element_text(size=15, lineheight=.8,hjust=0.5, face="plain"),
                legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "pt"))+
          ylab(expression(-log[10]("p.value"))) +#expression的作用就是让log10的10下标
          xlab(paste0(method," correlation coefficient of\ngenes ",g1, " and ", g2))+
          geom_vline(xintercept = c(-cut_ceof,cut_ceof),
                     lty = 2,
                     col = "black",
                     lwd = 0.3) +
          geom_hline(yintercept = -log10(cut_pval),
                     lty = 2,
                     col = "black",
                     lwd = 0.3)
        # p
        ggsave(filename = paste0(fp_pan,g1,"-",g2,"-",method,"_in_pan-cancer.pdf"),
               plot = p,height=5,width=5)
        
        ##记录分析的样本信息,只记录一次
        pos <- subdata2$cancer[subdata2$significance == "positive"]
        neg <- subdata2$cancer[subdata2$significance == "negative"]
        opinfo <- paste0(method,":",g1," and ",g2,":")
        output <- file(paste0(fp_pan,"00-RecordInformation.txt"), open="a+b")
        writeLines(c(opinfo,"\tpositive:",paste0("\t\t",pos),"\tnegative:",paste0("\t",neg)), 
                   con=output)
        close(output)
      }
    }
  }
}
本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。
原始发表：2022-10-03，如有侵权请联系 cloudcommunity@tencent.com 删除
视频
本文分享自 MedBioInfoCloud 微信公众号，前往查看
如有侵权，请联系 cloudcommunity@tencent.com 删除。
本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！
视频
登录后参与评论
0 条评论
热度
生物信息数据分析教程视频——12-基因之间的相关性分析及可视化

生物信息数据分析教程视频——12-基因之间的相关性分析及可视化

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐