前往小程序,Get更优阅读体验!
立即前往
首页
学习
活动
专区
工具
TVP
发布
社区首页 >专栏 >转录组差异分析—基本流程

转录组差异分析—基本流程

原创
作者头像
sheldor没耳朵
发布2024-07-29 18:18:48
1140
发布2024-07-29 18:18:48
举报
文章被收录于专栏:GEO数据挖掘

转录组差异分析—基本流程

1 背景知识

抓住主要矛盾

只需要认准count数据即可

自己的数据、公共数据、数据库、背景知识均不影响差异分析

2 读取与整理

2.1 表达矩阵

读取RawCounts.csv文件,其文件形式如下图行名为ensembleid,列名为样本名称。

代码语言:r
复制
dat = read.csv("RawCounts.csv",
               check.names = F,
               row.names = 1)
#read.csv读取后是数据框,需要转换为矩阵
exp = as.matrix(dat)

本次是采用自己的数据作为测试的,如果从GEO上下载数据可以参考如下:

代码语言:r
复制
library(tinyarray)
get_count_txt("GSE193861")
dat = read.delim("GSE193861_raw_counts_GRCh38.p13_NCBI.tsv.gz",row.names = 1)

dat[1:4,1:4]
          GSM5822748 GSM5822749 GSM5822750 GSM5822751
100287102          3          2          1          2
653635            53         39         15         45
102466751          4          1          0          6
107985730          0          0          0          1
# 转换为整数矩阵
exp = round(dat)
# 进行ID转化
exp = trans_entrezexp(exp)
exp[1:4,1:4]
            GSM5822748 GSM5822749 GSM5822750 GSM5822751
DDX11L1              3          2          1          2
WASH7P              53         39         15         45
MIR6859-1            4          1          0          6
MIR1302-2HG          0          0          0          1

还有一种常见的格式是每一个GSM数据集被单独做成一个txt.gz文件(如GSE193861)

代码语言:r
复制
r1 = function(b){
  read.delim(paste0("GSE193861_RAW/",b),header = F,row.names = 1)
}

bs = dir("GSE193861_RAW/")
[1] "GSM5822748_con1.txt.gz" "GSM5822749_con2.txt.gz" "GSM5822750_con3.txt.gz"
 [4] "GSM5822751_con4.txt.gz" "GSM5822752_con5.txt.gz" "GSM5822753_DOX1.txt.gz"
 [7] "GSM5822754_DOX2.txt.gz" "GSM5822755_DOX3.txt.gz" "GSM5822756_DOX4.txt.gz"
[10] "GSM5822757_DOX6.txt.gz"
#lapply返回的是一个列表
dat = lapply(bs, r1)
#新函数 do.call 对列表进行批量操作,对dat中每个元素按照列拼接在一起
exp = do.call(cbind,dat)

在额外添加列名,获得完整的表达矩阵

代码语言:r
复制
library(stringr)
colnames(exp) = str_split_i(bs,"_|\\.",2)

2.2 临床信息

代码语言:r
复制
clinical = 1 #有临床信息就读取,没有临床信息就写1,占位置,不用改后面代码

2.3 表达矩阵行名ID转换

注意:trans_exp_new ()仅用于ensemble 转symbol

代码语言:r
复制
library(tinyarray)
exp = trans_exp_new(exp,species = "human")#注意物种
exp[1:4,1:4]
#输出,检查数据
KO1-H1299-SETDB1-KO1 KO2-H1299-SETDB1-KO2 KO3-H1299-SETDB1-KO3
DDX11L1                      37                   37                   45
WASH7P                      200                  242                  286
AL627309.1                   27                   22                   16
AL627309.6                  219                  222                  227
           KOC1-H1299-KO-control1
DDX11L1                        23
WASH7P                        247
AL627309.1                     22
AL627309.6                    289

2.4 基因过滤

需要过滤一下那些在很多样本里表达量都为0或者表达量很低的基因。过滤标准不唯一。这里使用常用过滤标准2

代码语言:r
复制
#过滤之前基因数量
nrow(exp)
#常用过滤标准1:仅去除在所有样本里表达量都为零的基因
exp1 = exp[rowSums(exp)>0,]
nrow(exp1)
# 常用过滤标准2(推荐):仅保留在一半以上样本里表达的基因
exp = exp[apply(exp, 1, function(x) sum(x > 0) > 0.5*ncol(exp)), ]
nrow(exp)

2.5 分组信息的获取

TCGA的数据,直接用make_tcga_group给样本分组(tumor和normal),其他地方的数据分组方式参考芯片数据。

代码语言:r
复制
colnames(exp)
library(stringr)
Group = ifelse(str_detect(colnames(exp),"control"),"control","KO")
Group = factor(Group,levels = c("control","KO"))
table(Group)
Group
control      KO 
      3       3 
#保存为Rdata
#proj <- pip
save(exp,Group,proj,clinical,file = paste0(proj,".Rdata"))

3 基因差异分析

使用三种包差异分析后取交集

代码语言:r
复制
rm(list = ls())
load("pip.Rdata")

3.1 DEG2

代码语言:r
复制
library(DESeq2)
colData <- data.frame(row.names =colnames(exp), 
                      condition=Group)
#注意事项:如果多次运行,表达矩阵改动过的话,需要从工作目录下删除下面if括号里的文件
if(!file.exists(paste0(proj,"_dd.Rdata"))){
  dds <- DESeqDataSetFromMatrix(
  countData = exp,
  colData = colData,
  design = ~ condition)
  dds <- DESeq(dds)
  save(dds,file = paste0(proj,"_dd.Rdata"))
}
load(file = paste0(proj,"_dd.Rdata"))
class(dds)
res <- results(dds, contrast = c("condition",rev(levels(Group))))
#constrast
c("condition",rev(levels(Group)))
class(res)
DEG1 <- as.data.frame(res)
library(dplyr)
DEG1 <- arrange(DEG1,pvalue)
DEG1 = na.omit(DEG1)
head(DEG1)

#添加change列标记基因上调下调
logFC_t = 2
pvalue_t = 0.05

k1 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange < -logFC_t);table(k1)
FALSE  TRUE 
18213   225
k2 = (DEG1$pvalue < pvalue_t)&(DEG1$log2FoldChange > logFC_t);table(k2)
FALSE  TRUE 
17941   497
DEG1$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG1$change)
 DOWN   NOT    UP 
  225 17716   497 
head(DEG1)
              baseMean log2FoldChange      lfcSE      stat pvalue padj change
ALCAM      4577.774       2.330250 0.06006937  38.79265      0    0     UP
MYO10      2236.271       4.048245 0.07620441  53.12350      0    0     UP
MAP1B      1975.327      -2.601941 0.06879953 -37.81917      0    0   DOWN
HSPB1      6026.809       2.555337 0.06596062  38.74034      0    0     UP
LAMC3      1162.593       3.512105 0.08905396  39.43794      0    0     UP
AL161431.1 2288.839       5.465526 0.10476661  52.16859      0    0     UP

3.2 edgeR

代码语言:r
复制
library(edgeR)

dge <- DGEList(counts=exp,group=Group)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~Group)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
fit <- glmLRT(fit) 

DEG2=topTags(fit, n=Inf)
class(DEG2)
DEG2=as.data.frame(DEG2)
head(DEG2)

k1 = (DEG2$PValue < pvalue_t)&(DEG2$logFC < -logFC_t);table(k1)
FALSE  TRUE 
18219   219 
k2 = (DEG2$PValue < pvalue_t)&(DEG2$logFC > logFC_t);table(k2)
FALSE  TRUE 
17947   491 
DEG2$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG2$change)
DOWN   NOT    UP 
  219 17728   491 
#pValue太小,显示为0了
head(DEG2)
          logFC   logCPM       LR PValue FDR change
FSTL5  10.057039 4.222769 2564.032      0   0     UP
BST2    9.350844 3.520750 2140.220      0   0     UP
CTCFL   9.228345 6.052543 4721.104      0   0     UP
PCDH15  8.658005 2.835447 1507.044      0   0     UP
FLRT2   8.595937 4.851412 3462.456      0   0     UP
PRSS21  8.017133 3.502738 2164.292      0   0     UP

3.3 limma

代码语言:r
复制
library(limma)
dge <- edgeR::DGEList(counts=exp)
dge <- edgeR::calcNormFactors(dge)
design <- model.matrix(~Group)
v <- voom(dge,design, normalize="quantile")

fit <- lmFit(v, design)
fit= eBayes(fit)

DEG3 = topTable(fit, coef=2, n=Inf)
DEG3 = na.omit(DEG3)

k1 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC < -logFC_t)
k2 = (DEG3$P.Value < pvalue_t)&(DEG3$logFC > logFC_t)
DEG3$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT"))
table(DEG3$change)
 DOWN   NOT    UP 
  309 17726   403 
head(DEG3)
         logFC  AveExpr        t      P.Value    adj.P.Val        B change
DPEP3 4.631296 5.555757 60.13280 7.628686e-21 5.507727e-17 37.62656     UP
NEFH  4.326803 4.906266 60.14049 7.612615e-21 5.507727e-17 37.38817     UP
MYO10 4.045039 5.019841 59.54911 8.961483e-21 5.507727e-17 37.37704     UP
FHL1  2.930173 5.822318 49.83428 1.690593e-19 5.195192e-16 34.98868     UP
NTSR1 3.379769 5.688196 49.88634 1.661755e-19 5.195192e-16 34.97232     UP
DHRS2 6.873437 4.445805 51.64176 9.398283e-20 4.332139e-16 33.89399     UP

三种方法做差异分析得出来的结果对比

代码语言:r
复制
tj = data.frame(deseq2 = as.integer(table(DEG1$change)),
           edgeR = as.integer(table(DEG2$change)),
           limma_voom = as.integer(table(DEG3$change)),
           row.names = c("down","not","up")
          );tj
     deseq2 edgeR limma_voom
down    225   219        309
not   17716 17728      17726
up      497   491        403
save(DEG1,DEG2,DEG3,Group,tj,file = paste0(proj,"_DEG.Rdata"))

4 差异基因可视化

在这一步做个处理去除文库大小的影响

代码语言:r
复制
library(ggplot2)
library(tinyarray)
exp[1:4,1:4]
# cpm 去除文库大小的影响
dat = log2(cpm(exp)+1)
pca.plot = draw_pca(dat,Group);pca.plot
save(pca.plot,file = paste0(proj,"_pcaplot.Rdata"))
代码语言:r
复制
cg1 = rownames(DEG1)[DEG1$change !="NOT"]
cg2 = rownames(DEG2)[DEG2$change !="NOT"]
cg3 = rownames(DEG3)[DEG3$change !="NOT"]

h1 = draw_heatmap(dat[cg1,],Group)
h2 = draw_heatmap(dat[cg2,],Group)

h3 = draw_heatmap(dat[cg3,],Group)

v1 = draw_volcano(DEG1,pkg = 1,logFC_cutoff = logFC_t)
v2 = draw_volcano(DEG2,pkg = 2,logFC_cutoff = logFC_t)
v3 = draw_volcano(DEG3,pkg = 3,logFC_cutoff = logFC_t)

library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) +plot_layout(guides = 'collect') &theme(legend.position = "none")

ggsave(paste0(proj,"_heat_vo.png"),width = 15,height = 10)
代码语言:r
复制
UP=function(df){
  rownames(df)[df$change=="UP"]
}
DOWN=function(df){
  rownames(df)[df$change=="DOWN"]
}

up = intersect(intersect(UP(DEG1),UP(DEG2)),UP(DEG3))
down = intersect(intersect(DOWN(DEG1),DOWN(DEG2)),DOWN(DEG3))
dat = log2(cpm(exp)+1)
hp = draw_heatmap(dat[c(up,down),],Group)

#上调、下调基因分别画维恩图
up_genes = list(Deseq2 = UP(DEG1),
          edgeR = UP(DEG2),
          limma = UP(DEG3))

down_genes = list(Deseq2 = DOWN(DEG1),
          edgeR = DOWN(DEG2),
          limma = DOWN(DEG3))

up.plot <- draw_venn(up_genes,"UPgene")
down.plot <- draw_venn(down_genes,"DOWNgene")

#维恩图拼图
library(patchwork)
#up.plot + down.plot
# 拼图
pca.plot + hp+up.plot +down.plot+ plot_layout(guides = "collect")
ggsave(paste0(proj,"_heat_ve_pca.png"),width = 15,height = 10)
代码语言:r
复制
#分组聚类的热图
library(ComplexHeatmap)
library(circlize)
col_fun = colorRamp2(c(-2, 0, 2), c("#2fa1dd", "white", "#f87669"))
top_annotation = HeatmapAnnotation(
  cluster = anno_block(gp = gpar(fill = c("#f87669","#2fa1dd")),
                       labels = levels(Group),
                       labels_gp = gpar(col = "white", fontsize = 12)))

m = Heatmap(t(scale(t(exp[c(up,down),]))),name = " ",
            col = col_fun,
        top_annotation = top_annotation,
        column_split = Group,
        show_heatmap_legend = T,
        border = F,
        show_column_names = F,
        show_row_names = F,
        use_raster = F,
        cluster_column_slices = F,
        column_title = NULL)
m

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。

如有侵权,请联系 cloudcommunity@tencent.com 删除。

评论
登录后参与评论
0 条评论
热度
最新
推荐阅读
目录
  • 转录组差异分析—基本流程
    • 1 背景知识
      • 2 读取与整理
        • 2.1 表达矩阵
        • 2.2 临床信息
        • 2.3 表达矩阵行名ID转换
        • 2.4 基因过滤
        • 2.5 分组信息的获取
      • 3 基因差异分析
        • 3.1 DEG2
        • 3.2 edgeR
        • 3.3 limma
      • 4 差异基因可视化
      领券
      问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档