two kinds of data.png
> library(tidyverse)
> library(DESeq2)
> #import data
> setwd("F:/rna_seq/data/matrix")
> mycounts<-read.csv("readcount.csv")
> head(mycounts)
X control1 control2 treat1 treat2
1 ENSMUSG00000000001 1648 2306 2941 2780
2 ENSMUSG00000000003 0 0 0 0
3 ENSMUSG00000000028 835 950 1366 1051
4 ENSMUSG00000000031 65 83 52 53
5 ENSMUSG00000000037 70 53 94 66
6 ENSMUSG00000000049 0 3 4 5
#这里有个x,需要去除,先把第一列当作行名来处理
> rownames(mycounts)<-mycounts[,1]
#把带X的列删除
> mycounts<-mycounts[,-1]
> head(mycounts)
control1 control2 treat1 treat2
ENSMUSG00000000001 1648 2306 2941 2780
ENSMUSG00000000003 0 0 0 0
ENSMUSG00000000028 835 950 1366 1051
ENSMUSG00000000031 65 83 52 53
ENSMUSG00000000037 70 53 94 66
ENSMUSG00000000049 0 3 4 5
# 这一步很关键,要明白condition这里是因子,不是样本名称;小鼠数据有对照组和处理组,各两个重复
> condition <- factor(c(rep("control",2),rep("treat",2)), levels = c("control","treat"))
> condition
[1] control control treat treat
Levels: control treat
#colData也可以自己在excel做好另存为.csv格式,再导入即可
> colData <- data.frame(row.names=colnames(mycounts), condition)
> colData
condition
control1 control
control2 control
treat1 treat
treat2 treat
> dds <- DESeqDataSetFromMatrix(mycounts, colData, design= ~ condition)
> dds <- DESeq(dds)
> # 查看一下dds的内容
> dds
显示为
class: DESeqDataSet
dim: 6 4
metadata(1): version
assays(3): counts mu cooks
rownames(6): ENSMUSG00000000001 ENSMUSG00000000003 ... ENSMUSG00000000037 ENSMUSG00000000049
rowData names(21): baseMean baseVar ... deviance maxCooks
colnames(4): control1 control2 treat1 treat2
colData names(2): condition sizeFactor
接下来,我们要查看treat versus control的总体结果,并根据p-value进行重新排序。利用summary
命令统计显示一共多少个genes上调和下调(FDR0.1)
> res = results(dds, contrast=c("condition", "control", "treat"))
#或下面命令
> res= results(dds)
> res = res[order(res$pvalue),]
> head(res)
> summary(res)
#所有结果先进行输出
> write.csv(res,file="All_results.csv")
> table(res$padj<0.05)
> res = results(dds2, contrast=c("condition", "control", "treat"))
> res = res[order(res$pvalue),]
> head(res)
log2 fold change (MLE): condition control vs treat
Wald test p-value: condition control vs treat
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000003309 548.1926 3.231611 0.2658125 12.157485 5.234568e-34 8.193146e-30
ENSMUSG00000046323 404.1894 3.067050 0.2628220 11.669687 1.820923e-31 1.425055e-27
ENSMUSG00000001123 341.8542 2.797485 0.2766499 10.112004 4.887441e-24 2.549941e-20
ENSMUSG00000023906 951.9460 2.382307 0.2510718 9.488551 2.342684e-21 9.116395e-18
ENSMUSG00000018569 485.4839 3.136031 0.3312999 9.465836 2.912214e-21 9.116395e-18
ENSMUSG00000000184 601.0842 -2.827750 0.3154171 -8.965112 3.099648e-19 8.085948e-16
> summary(res)
out of 28335 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 445, 1.6%
LFC < 0 (down) : 625, 2.2%
outliers [1] : 0, 0%
low counts [2] : 12683, 45%
(mean count < 18)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
> write.csv(res,file="All_results.csv")
> table(res$padj<0.05)
FALSE TRUE
14909 743
差异表达基因的界定很不统一,但log2FC是用的最广泛同时也是最不精确的方式,但因为其好理解所以广泛被应用尤其芯片数据处理中,记的是havard universit做过一个统计,FC=2相对比较可靠。但无论怎么,这种界定人为因素太大,过于武断。所以GSEA,WGCNA是拿全部表达数据(可以进行初步过滤)来进行分析,并且WGCNA采取软阈值的方式来挑选genes更加合理。既然挑选差异表达基因,还是采取log2FC和padj来进行。
> diff_gene_deseq2 <-subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
#或
#> diff_gene_deseq2 <-subset(res,padj < 0.05 & (log2FoldChange > 1 | log2FoldChange < -1))
> dim(diff_gene_deseq2)
> head(diff_gene_deseq2)
> write.csv(diff_gene_deseq2,file= "DEG_treat_vs_control.csv")
结果显示如下:
> diff_gene_deseq2 <-subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
> dim(diff_gene_deseq2)
[1] 431 6
> head(diff_gene_deseq2)
log2 fold change (MLE): condition control vs treat
Wald test p-value: condition control vs treat
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000003309 548.1926 3.231611 0.2658125 12.157485 5.234568e-34 8.193146e-30
ENSMUSG00000046323 404.1894 3.067050 0.2628220 11.669687 1.820923e-31 1.425055e-27
ENSMUSG00000001123 341.8542 2.797485 0.2766499 10.112004 4.887441e-24 2.549941e-20
ENSMUSG00000023906 951.9460 2.382307 0.2510718 9.488551 2.342684e-21 9.116395e-18
ENSMUSG00000018569 485.4839 3.136031 0.3312999 9.465836 2.912214e-21 9.116395e-18
ENSMUSG00000000184 601.0842 -2.827750 0.3154171 -8.965112 3.099648e-19 8.085948e-16
library('biomaRt')
library("curl")
mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
my_ensembl_gene_id<-row.names(diff_gene_deseq2)
#listAttributes(mart)
mms_symbols<- getBM(attributes=c('ensembl_gene_id','external_gene_name',"description"),
filters = 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart)
> library('biomaRt')
> library("curl")
> mart <- useDataset("mmusculus_gene_ensembl", useMart("ensembl"))
> my_ensembl_gene_id<-row.names(diff_gene_deseq2)
> mms_symbols<- getBM(attributes=c('ensembl_gene_id','external_gene_name',"description"),
+ filters = 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart)
> head(mms_symbols)
ensembl_gene_id external_gene_name description
1 ENSMUSG00000000120 Ngfr nerve growth factor receptor (TNFR superfamily, member 16) [Source:MGI Symbol;Acc:MGI:97323]
2 ENSMUSG00000000184 Ccnd2 cyclin D2 [Source:MGI Symbol;Acc:MGI:88314]
3 ENSMUSG00000000276 Dgke diacylglycerol kinase, epsilon [Source:MGI Symbol;Acc:MGI:1889276]
4 ENSMUSG00000000308 Ckmt1 creatine kinase, mitochondrial 1, ubiquitous [Source:MGI Symbol;Acc:MGI:99441]
5 ENSMUSG00000000320 Alox12 arachidonate 12-lipoxygenase [Source:MGI Symbol;Acc:MGI:87998]
6 ENSMUSG00000000708 Kat2b K(lysine) acetyltransferase 2B [Source:MGI Symbol;Acc:MGI:1343094]
合并的话两个数据必须有共同的列名,我们先看一下
> head(diff_gene_deseq2)
log2 fold change (MLE): condition control vs treat
Wald test p-value: condition control vs treat
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSMUSG00000003309 548.1926 3.231611 0.2658125 12.157485 5.234568e-34 8.193146e-30
ENSMUSG00000046323 404.1894 3.067050 0.2628220 11.669687 1.820923e-31 1.425055e-27
ENSMUSG00000001123 341.8542 2.797485 0.2766499 10.112004 4.887441e-24 2.549941e-20
ENSMUSG00000023906 951.9460 2.382307 0.2510718 9.488551 2.342684e-21 9.116395e-18
ENSMUSG00000018569 485.4839 3.136031 0.3312999 9.465836 2.912214e-21 9.116395e-18
ENSMUSG00000000184 601.0842 -2.827750 0.3154171 -8.965112 3.099648e-19 8.085948e-16
> head(mms_symbols)
ensembl_gene_id external_gene_name
1 ENSMUSG00000000120 Ngfr
2 ENSMUSG00000000184 Ccnd2
3 ENSMUSG00000000276 Dgke
4 ENSMUSG00000000308 Ckmt1
5 ENSMUSG00000000320 Alox12
6 ENSMUSG00000000708 Kat2b
可见,两个文件没有共同的列名,所以要先给'diff_gene_deseq2'添加一个‘ensembl_gene_id’的列名,方法如下:(应该有更简便的方法)
> ensembl_gene_id<-rownames(diff_gene_deseq2)
> diff_gene_deseq2<-cbind(ensembl_gene_id,diff_gene_deseq2)
> colnames(diff_gene_deseq2)[1]<-c("ensembl_gene_id")
> diff_name<-merge(diff_gene_deseq2,mms_symbols,by="ensembl_gene_id")
>head(diff_name)
#查看Akap8的情况
Akap8 <- diff_name[diff_name$external_gene_name=="Akap8",]
> ensembl_gene_id<-rownames(diff_gene_deseq2)
> diff_gene_deseq2<-cbind(ensembl_gene_id,diff_gene_deseq2)
> colnames(diff_gene_deseq2)[1]<-c("ensembl_gene_id")
> diff_name<-merge(diff_gene_deseq2,mms_symbols,by="ensembl_gene_id")
> head(diff_name)
DataFrame with 6 rows and 9 columns
ensembl_gene_id baseMean log2FoldChange lfcSE stat pvalue padj external_gene_name
<character> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> <character>
1 ENSMUSG00000000120 434.04177 -1.293648 0.2713146 -4.768072 1.859970e-06 2.064698e-04 Ngfr
2 ENSMUSG00000000184 601.08417 -2.827750 0.3154171 -8.965112 3.099648e-19 8.085948e-16 Ccnd2
3 ENSMUSG00000000276 668.12500 -1.071362 0.2445381 -4.381168 1.180446e-05 9.603578e-04 Dgke
4 ENSMUSG00000000308 207.46719 1.944949 0.3427531 5.674489 1.391035e-08 3.819733e-06 Ckmt1
5 ENSMUSG00000000320 61.96266 1.451927 0.4637101 3.131109 1.741473e-03 4.105051e-02 Alox12
6 ENSMUSG00000000708 1070.03203 -1.046546 0.2049062 -5.107440 3.265530e-07 5.056107e-05 Kat2b
description
<character>
1 nerve growth factor receptor (TNFR superfamily, member 16) [Source:MGI Symbol;Acc:MGI:97323]
2 cyclin D2 [Source:MGI Symbol;Acc:MGI:88314]
3 diacylglycerol kinase, epsilon [Source:MGI Symbol;Acc:MGI:1889276]
4 creatine kinase, mitochondrial 1, ubiquitous [Source:MGI Symbol;Acc:MGI:99441]
5 arachidonate 12-lipoxygenase [Source:MGI Symbol;Acc:MGI:87998]
6 K(lysine) acetyltransferase 2B [Source:MGI Symbol;Acc:MGI:1343094]
> Akap8 <- diff_name[diff_name$external_gene_name=="Akap8",]
> Akap8
DataFrame with 1 row and 9 columns
ensembl_gene_id baseMean log2FoldChange lfcSE stat pvalue padj external_gene_name
<character> <numeric> <numeric> <numeric> <numeric> <numeric> <numeric> <character>
1 ENSMUSG00000024045 2281.053 -1.276329 0.2428315 -5.256028 1.471996e-07 2.775865e-05 Akap8
description
<character>
1 A kinase (PRKA) anchor protein 8 [Source:MGI Symbol;Acc:MGI:1928488]