前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >加权基因共表达网络分析(WGCNA)实例

加权基因共表达网络分析(WGCNA)实例

作者头像
用户7010445
发布2020-03-23 16:32:25
2.3K0
发布2020-03-23 16:32:25
举报
获取表达数据矩阵

这里运行R语言包GDCRNATools的帮助文档中的例子获得胆管癌的rna表达矩阵

library(GDCRNATools)
project<-'TCGA-CHOL'
rnadir<-paste(project,'RNAseq',sep='/')
clinicaldir<-paste(project,'Clinical',sep='/')
gdcRNADownload(project.id = 'TCGA-CHOL',
               data.type = 'RNAseq',
               write.manifest = F,
               method = 'gdc-client',
               directory = rnadir)

gdcClinicalDownload(project.id = 'TCGA-CHOL',
                    write.manifest = F,
                    method='gdc-client',
                    directory = clinicaldir)
metaMatrix.RNA<-gdcParseMetadata(project.id = 'TCGA-CHOL',
                                 data.type = 'RNAseq',
                                 write.meta = F)
metaMatrix.RNA<-gdcFilterDuplicate(metaMatrix.RNA)
metaMatrix.RNA<-gdcFilterSampleType(metaMatrix.RNA)
print("样本比例:")
table(metaMatrix.RNA$sample_type)
rnaCounts<-gdcRNAMerge(metadata = metaMatrix.RNA,
                       path = rnadir,
                       organized = FALSE,
                       data.type = 'RNAseq')
使用R语言包edgeR做差异表达分析

这部分代码完全来自公众号生信星球文章TCGA(转录组)差异分析三大R包及其结果对比

library(stringr)
group_list<-ifelse(str_sub(colnames(rnaCounts),14,15)=="01","tumor","normal")
group_list
table(group_list)
library(edgeR)
deg<-DGEList(counts=rnaCounts,group=group_list)
deg$samples$lib.size<-colSums(deg$counts)
deg<-calcNormFactors(deg)
design<-model.matrix(~0+group_list)
rownames(design)<-colnames(deg)
colnames(design)<-levels(group_list)
dge<-estimateGLMCommonDisp(deg,design)
dge<-estimateGLMTrendedDisp(dge,design)
dge<-estimateGLMTagwiseDisp(dge,design)
fit<-glmFit(dge,design)
fit2<-glmLRT(fit,contrast = c(-1,1))
DEG<-topTags(fit2,n=nrow(rnaCounts))
DEG<-as.data.frame(DEG)
logFC_cutoff<-with(DEG,mean(abs(logFC))+2*sd(abs(logFC)))
logFC_cutoff
DEG$change<-as.factor(ifelse(DEG$PValue<0.05&abs(DEG$logFC)>logFC_cutoff,
                             ifelse(DEG$logFC>logFC_cutoff,"UP","DOWN"),"NOT"))
head(DEG)
table(DEG$change)
绘制差异基因聚类热图和火山图
source("../3-plotfunction.R")
cg1<-rownames(DEG)[DEG$change!="NOT"]
h1<-draw_heatmap(rnaCounts[cg1,],group_list)
v1<-draw_volcano(test=DEG[,c(1,4,6)])
png("edgeR_DEG.png",width=1000)
ggpubr::ggarrange(h1,v1,ncol=2,nrow = 1)
dev.off()

image.png

接下来利用差异表达基因做加权基因共表达网络(WGCNA)

WGCNA分析用到的代码是我在腾讯课堂上购买的一门课程,课程内容是介绍WGCNA分析在植物上的应用的。课程的内容包括基本原理的介绍,讲解文献,实例操作。我自己感觉内容还是非常棒的。课程名字我就不再这里放了,大家如果感兴趣可以给我的公众号留言。课程中用到的代码和数据如果大家需要的话也可以给我的公众号留言。另外还有一大部分代码来自生信技能树公众号文章七步走纯R代码通过数据挖掘复现一篇实验文章(第七步WGCNA)

#标准化基因表达矩阵,这一步是因为我之前尝试WGCNA分析的时候使用原始的counts会遇到报错
#提示输入数据不能为整数,我还不知道WGCNA应该用什么作为输入数据
expCPM<-cpm(rnaCounts, normalized.lib.sizes=TRUE)
expCPM[1:3,1:3]
#将用到的文件保存下来
write.csv(rnaCounts,file="TCGA-CHOL-rnaCounts.csv",quote = F)
write.csv(expCPM,file="TCGA-CHOL-expCPM.csv",quote=F)
#只选用差异表达基因做分析,我这么做事为了减小数据量,缩短运算时间
#我现在还不知道,只用差异表达基因与用完整的基因集有什么区别
expCPM<-expCPM[cg1,]
dim(expCPM)
library(WGCNA)
enableWGCNAThreads()
datExpr<-as.data.frame(t(expCPM))
group_list=factor(ifelse(as.numeric(substr(rownames(datExpr),14,15)) < 10,'tumor','normal'))
table(group_list)
datTraits<-data.frame(row.names = rownames(datExpr),
                      subtype=group_list)
#根据层次聚类绘制样本树
tree=hclust(dist(datExpr),method = 'average')
png("hclusttree.png",width=800)
plot(tree)
dev.off()

image.png

选择软阈值
powers1<-c(1:10,seq(12,30,by=2))
sft = pickSoftThreshold(datExpr,
                        RsquaredCut = 0.9,
                        powerVector = powers1,
                        verbose = 5)
png("pickSoftThres.png",width=1000)
par(mfrow = c(1,2));
cex1 = 0.9;
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers1,cex=cex1,col="red");
abline(h=0.90,col="red")
abline(h=0.85,col="blue")
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers1, cex=cex1,col="red")
dev.off()

image.png

sft$powerEstimate
setwd("WGCNA_example/")
net = blockwiseModules(
  datExpr,
  power = sft$powerEstimate,
  maxBlockSize = 10000,
  TOMType = "unsigned",
  deepSplit = 2, minModuleSize = 30,
  mergeCutHeight = 0.5,
  numericLabels = TRUE,
  pamRespectsDendro = FALSE,
  saveTOMs = TRUE,
  saveTOMFileBase = "FPKM-TOM",
  loadTOMs = FALSE,
  verbose = 3
)
table(net$colors)
moduleColors = labels2colors(net$colors)
table(moduleColors)
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
for(module in substring(colnames(MEs),3)){
  if(module == "grey") next
  ME=as.data.frame(MEs[,paste("ME",module,sep="")])
  colnames(ME)=module
  datModExpr=datExpr[,moduleColors==module]
  datKME = signedKME(datModExpr, ME)
  datKME=cbind(datKME,rep(module,length(datKME)))
  write.table(datKME,quote = F,row.names = T,append = T,file = "All_Gene_KME.txt",col.names = F)
}
png("1.png",width=1000)
plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()

image.png

design=model.matrix(~0+ datTraits$subtype)
design = as.data.frame(design)
colnames(design)=levels(datTraits$subtype)

moduleTraitCor = cor(MEs, design , use = "p")
nSamples = nrow(datExpr)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)

# Display the correlation values within a heatmap plot
png("2.png")
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = colnames(design),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()

image.png

nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]];
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 14);
plotTOM = dissTOM^7;
diag(plotTOM) = NA;
png("3.png")
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
dev.off()

image.png

MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
tumor = as.data.frame(design[,2])
names(tumor) = "tumor"
MET = orderMEs(cbind(MEs, tumor))
png("4.png")
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2),
                      marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)
dev.off()

image.png

第一次完整走完加权基因共表达网络的流程!可是如何解读结果还需要多看文章呀!

本文参与 腾讯云自媒体分享计划,分享自微信公众号。
原始发表:2020-03-19,如有侵权请联系 cloudcommunity@tencent.com 删除

本文分享自 小明的数据分析笔记本 微信公众号,前往查看

如有侵权,请联系 cloudcommunity@tencent.com 删除。

本文参与 腾讯云自媒体分享计划  ,欢迎热爱写作的你一起参与!

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 接下来利用差异表达基因做加权基因共表达网络(WGCNA)
领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档