前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >生物信息数据分析教程视频——11-筛选相关性基因

生物信息数据分析教程视频——11-筛选相关性基因

作者头像
DoubleHelix
发布2022-12-15 15:53:09
7010
发布2022-12-15 15:53:09
举报
文章被收录于专栏:生物信息云生物信息云

视频地址:http://mpvideo.qpic.cn/0bc3tiabgaaaneakrfjtfvrvbgwdconaaeya.f10002.mp4?

参考:

如何合理的展示相关性分析结果??

代码:

代码语言:javascript
复制

rm(list = ls())
# setwd("H:/MedBioInfoCloud/analysis/TCGA/new/conventionalAnalysis")
options(stringsAsFactors = F)
library(TCGAbiolinks)
library(WGCNA)
library(ggplot2)
library(ggpubr)
library(ggrepel)
library(corrplot)
library(pheatmap)
FilePath <- dir("H:/MedBioInfoCloud/analysis/TCGA/new/processedTCGAdata/TCGA-STAR_Exp/",
                "STARdata.Rdata$",full.names = T)

gene <- "EGFR"
type <- "protein_coding"

opt <- "output/003-CorrelationsBetween_mRNAs/01-screening/"
ifelse(dir.exists(opt),"The folder already exists.",dir.create(opt,recursive = T))


record_files <- dir(opt,".csv$",full.names = T)
ifelse(length(record_files)>0,file.remove(record_files),
       "There is no record file that can be removed")


source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/filterGeneTypeExpr.R")
source("H:/MedBioInfoCloud/analysis/TCGA/new/00-fun/del_dup_sample.R")


###TCGA数据库中33中癌症类型
project <- getGDCprojects()$project_id
project <- project[grep("TCGA-",project)]
# proj = "TCGA-LUAD"
cut_coef <- 0.3
cut_p <- 0.01

allcancercor <- data.frame()
for(proj in project){
  message("===============================")
  message(proj)
  load(FilePath[grep(proj,FilePath)])#STARdata
  tpm <- STARdata[["tpm"]]
  anoinfo <- tpm[,1:3]
  tpm <- filterGeneTypeExpr(expr = tpm,
                            fil_col = "gene_type",
                            filter = FALSE)
  ##过滤不表达的基因
  tpm <- tpm[apply(tpm,1,var) !=0,]
  if((gene %in% rownames(tpm)) & length(gene) == 1){
    ##正常组织样本ID
    SamN <- TCGAquery_SampleTypes(barcode = colnames(tpm),
                                  typesample = c("NT","NB","NBC","NEBV","NBM"))
    
    ##肿瘤组织样本ID
    SamT <- setdiff(colnames(tpm),SamN)
    
    ###去除重复样本
    tur_exp <- del_dup_sample(tpm[,SamT],col_rename = T)
    ###long2转换
    tur_exp <- log2(tur_exp + 1)
    
    gs2 <- intersect(anoinfo$gene_name[anoinfo$gene_type == type],rownames(tur_exp))
    gs2 <- setdiff(gs2,gene)
  
    g1exp <- data.frame(t(tur_exp[gene,]))
    g2exp <- data.frame(t(tur_exp[gs2,]))
    
    ####皮尔森相关性
    pearson_coef <- cor(g2exp,g1exp,method = "pearson" )
    colnames(pearson_coef) <- "pearson"
    pearson_p <- corPvalueStudent(pearson_coef,nrow(g1exp))
    colnames(pearson_p) <- "pearson_p.value"
    pear_dat <- cbind(pearson_coef,pearson_p)
    
    
    ####spearman相关性
    spearman_coef <- cor(g2exp,g1exp,method = "spearman" )
    colnames(spearman_coef) <- "spearman"
    spearman_p <- corPvalueStudent(spearman_coef,nrow(g1exp))
    colnames(spearman_p) <- "spearman_p.value"
    spear_dat <- cbind(spearman_coef,spearman_p)
    
    cor_data <- cbind(pear_dat,spear_dat) %>% as.data.frame()
    head(cor_data)
    
    screening_cor <- cor_data[((abs(cor_data[,1]) > cut_coef & (cor_data[,2] < cut_p)) &  (abs(cor_data[,3]) > cut_coef & cor_data[,4] < cut_p)),] 
    
    screening_cor$cancer <- proj
    screening_cor <- screening_cor %>% mutate(cor_gene = rownames(screening_cor),.before = 1)
    rownames(screening_cor) <- c(1:nrow(screening_cor))
    
    write.csv(screening_cor,file = paste0(opt,gene,"-",proj,"-correlation.csv"))
    allcancercor <- rbind(allcancercor,screening_cor)
  }
}


###统计相关性在所有癌症中的情况
library(tidyr)

stat <- table(allcancercor$cor_gene) %>% as.data.frame()
stat <- arrange(stat,desc(Freq))


top <- stat$Var1[1:10]

index <- lapply(top, function(x)grep(paste0("^",x,"$"),allcancercor$cor_gene)) %>% unlist()
pdata <- allcancercor[index,]
unique(pdata$cor_gene)

pea <- pdata[,c(1,2,6)]
head(pea)
pea <- spread(pea,cancer,pearson)
rownames(pea) <- pea$cor_gene
pea <- pea[,-1]

spe_p <- pdata[,c(1,4,6)]
head(spe_p)
spe_p <- spread(spe_p,cancer,spearman)
rownames(spe_p) <- spe_p$cor_gene
spe_p <- spe_p[,-1]

col2 <- colorRampPalette(c("#3300CC","#3399FF","white","#FF3333","#CC0000"),alpha = TRUE)


pheatmap_dat <- list(pearson= pea,spearman = spe_p)

d <- pheatmap_dat[[1]]
# 相关性热图
pdf(paste0(opt,gene,"-top30-in-All-cancer.pdf"),width = 7,height = 6)
for(m in names(pheatmap_dat)){
  d <- pheatmap_dat[[m]]
  pheatmap(d,
           #annotation_col =annotation_col,
           # annotation_row = annotation_row,
           color = col2(50),
           cluster_cols =T,
           fontsize=6,
           cluster_rows = T,
           fontsize_row=6,
           cellwidth =10,
           cellheight =10,
           fontsize_number = 16,
           #scale="row",
           show_colnames=T,
           fontsize_col=8,
           main = paste0(m," correlation")) %>% print()
}
dev.off()
本文参与 腾讯云自媒体同步曝光计划,分享自微信公众号。
原始发表:2022-10-02,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 MedBioInfoCloud 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体同步曝光计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档