生物信息数据分析教程视频——11-筛选相关性基因

DoubleHelix

发布于 2022-12-15 15:53:09

7010

发布于 2022-12-15 15:53:09

文章被收录于专栏：生物信息云生物信息云

视频地址：http://mpvideo.qpic.cn/0bc3tiabgaaaneakrfjtfvrvbgwdconaaeya.f10002.mp4?

参考：

如何合理的展示相关性分析结果？？

代码：


rm(list = ls())
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(WGCNA)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(corrplot)
library(pheatmap)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

gene <- "EGFR"
type <- "protein_coding"

opt <- "output/003-CorrelationsBetween_mRNAs/01-screening/"
ifelse(dir.exists(opt),"The folder already exists.",dir.create(opt,recursive = T))


record_files <- dir(opt,".csv$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
cut_coef <- 0.3
cut_p <- 0.01

allcancercor <- data.frame()
for(proj in project){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  anoinfo <- tpm[,1:3]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  if((gene %in% rownames(tpm)) & length(gene) == 1){
    ##正常组织样本ID
    SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                  typesample = c("NT","NB","NBC","NEBV","NBM"))
    
    ##肿瘤组织样本ID
    SamT <- setdiff(colnames(tpm),SamN)
    
    ###去除重复样本
    tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
    ###long2转换
    tur_exp <- log2(tur_exp + 1)
    
    gs2 <- intersect(anoinfo$gene_name[anoinfo$gene_type == type],rownames(tur_exp))
    gs2 <- setdiff(gs2,gene)
  
    g1exp <- data.frame(t(tur_exp[gene,]))
    g2exp <- data.frame(t(tur_exp[gs2,]))
    
    ####皮尔森相关性
    pearson_coef <- cor(g2exp,g1exp,method = "pearson" )
    colnames(pearson_coef) <- "pearson"
    pearson_p <- corPvalueStudent(pearson_coef,nrow(g1exp))
    colnames(pearson_p) <- "pearson_p.value"
    pear_dat <- cbind(pearson_coef,pearson_p)
    
    
    ####spearman相关性
    spearman_coef <- cor(g2exp,g1exp,method = "spearman" )
    colnames(spearman_coef) <- "spearman"
    spearman_p <- corPvalueStudent(spearman_coef,nrow(g1exp))
    colnames(spearman_p) <- "spearman_p.value"
    spear_dat <- cbind(spearman_coef,spearman_p)
    
    cor_data <- cbind(pear_dat,spear_dat) %>% as.data.frame()
    head(cor_data)
    
    screening_cor <- cor_data[((abs(cor_data[,1]) > cut_coef & (cor_data[,2] < cut_p)) &  (abs(cor_data[,3]) > cut_coef & cor_data[,4] < cut_p)),] 
    
    screening_cor$cancer <- proj
    screening_cor <- screening_cor %>% mutate(cor_gene = rownames(screening_cor),.before = 1)
    rownames(screening_cor) <- c(1:nrow(screening_cor))
    
    write.csv(screening_cor,file = paste0(opt,gene,"-",proj,"-correlation.csv"))
    allcancercor <- rbind(allcancercor,screening_cor)
  }
}


###统计相关性在所有癌症中的情况
library(tidyr)

stat <- table(allcancercor$cor_gene) %>% as.data.frame()
stat <- arrange(stat,desc(Freq))


top <- stat$Var1[1:10]

index <- lapply(top, function(x)grep(paste0("^",x,"$"),allcancercor$cor_gene)) %>% unlist()
pdata <- allcancercor[index,]
unique(pdata$cor_gene)

pea <- pdata[,c(1,2,6)]
head(pea)
pea <- spread(pea,cancer,pearson)
rownames(pea) <- pea$cor_gene
pea <- pea[,-1]

spe_p <- pdata[,c(1,4,6)]
head(spe_p)
spe_p <- spread(spe_p,cancer,spearman)
rownames(spe_p) <- spe_p$cor_gene
spe_p <- spe_p[,-1]

col2 <- colorRampPalette(c("#3300CC","#3399FF","white","#FF3333","#CC0000"),alpha = TRUE)


pheatmap_dat <- list(pearson= pea,spearman = spe_p)

d <- pheatmap_dat[[1]]
# 相关性热图
pdf(paste0(opt,gene,"-top30-in-All-cancer.pdf"),width = 7,height = 6)
for(m in names(pheatmap_dat)){
  d <- pheatmap_dat[[m]]
  pheatmap(d,
           #annotation_col =annotation_col,
           # annotation_row = annotation_row,
           color = col2(50),
           cluster_cols =T,
           fontsize=6,
           cluster_rows = T,
           fontsize_row=6,
           cellwidth =10,
           cellheight =10,
           fontsize_number = 16,
           #scale="row",
           show_colnames=T,
           fontsize_col=8,
           main = paste0(m," correlation")) %>% print()
}
dev.off()

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2022-10-02，如有侵权请联系 cloudcommunity@tencent.com 删除

视频

本文分享自 MedBioInfoCloud 微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

视频

登录后参与评论

0 条评论

热度

生物信息数据分析教程视频——11-筛选相关性基因

生物信息数据分析教程视频——11-筛选相关性基因

社区

活动

资源

关于

腾讯云开发者

热门产品

热门推荐

更多推荐