下面是去年实习生的分享
# BiocManager::install('methylKit')
# library(methylKit)
file.list=list( system.file("extdata",
"test1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"test2.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control1.myCpG.txt", package = "methylKit"),
system.file("extdata",
"control2.myCpG.txt", package = "methylKit") )
# read the files to a methylRawList object: myobj
myobj=methRead(file.list,
sample.id=list("test1","test2","ctrl1","ctrl2"),
assembly="hg18", # the genome
# 0:control,1:test samples
treatment=c(1,1,0,0),
context="CpG",
mincov = 10 # minimum read coverage,defaults to 10
)
class(myobj)
## [1] "methylRawList"
## attr(,"package")
## [1] "methylKit"
head(myobj[[1]])
## chr start end strand coverage numCs numTs
## 1 chr21 9764539 9764539 - 12 3 9
## 2 chr21 9764513 9764513 - 12 0 12
## 3 chr21 9820622 9820622 + 13 0 13
## 4 chr21 9837545 9837545 + 11 0 11
## 5 chr21 9849022 9849022 + 124 90 34
## 6 chr21 9853326 9853326 + 17 12 5
每一行是一个甲基化位点,
coverage
代表覆盖这个位点的reads数,numCs
代表甲基化C的比例,numTs
代表非甲基化C的比例。文件大小相比bam文件小很多,读取的速度更快。
######### bam文件读取 #########
# reading multiple files
file.list2=list(system.file("extdata", "test.fastq_bismark.sorted.min.sam",
package = "methylKit"),
system.file("extdata", "test.fastq_bismark.sorted.min.sam",
package = "methylKit"),
system.file("extdata", "test.fastq_bismark.sorted.min.sam",
package = "methylKit"),
system.file("extdata", "test.fastq_bismark.sorted.min.sam",
package = "methylKit"))
objs=processBismarkAln(location=file.list2,
sample.id=list("test1","test2","ctrl1","ctrl1"),assembly="hg18",
save.folder=NULL,save.context=NULL,read.context="CpG",
nolap=FALSE,mincov=10,minqual=20,phred64=FALSE,
treatment=c(1,1,0,0))
## Conversion Statistics:
##
## total otherC considered (>95% C+T): 10
## average conversion rate = 100
## total otherC considered (Forward) (>95% C+T): 10
## average conversion rate (Forward) = 100
## total otherC considered (Reverse) (>95% C+T): 0
## average conversion rate (Reverse) = 0
##
## Reading methylation percentage per base for sample: test1
##
## Conversion Statistics:
##
## total otherC considered (>95% C+T): 10
## average conversion rate = 100
## total otherC considered (Forward) (>95% C+T): 10
## average conversion rate (Forward) = 100
## total otherC considered (Reverse) (>95% C+T): 0
## average conversion rate (Reverse) = 0
##
## Reading methylation percentage per base for sample: test2
##
## Conversion Statistics:
##
## total otherC considered (>95% C+T): 10
## average conversion rate = 100
## total otherC considered (Forward) (>95% C+T): 10
## average conversion rate (Forward) = 100
## total otherC considered (Reverse) (>95% C+T): 0
## average conversion rate (Reverse) = 0
##
## Reading methylation percentage per base for sample: ctrl1
##
## Conversion Statistics:
##
## total otherC considered (>95% C+T): 10
## average conversion rate = 100
## total otherC considered (Forward) (>95% C+T): 10
## average conversion rate (Forward) = 100
## total otherC considered (Reverse) (>95% C+T): 0
## average conversion rate (Reverse) = 0
##
## Reading methylation percentage per base for sample: ctrl1
# nolap
# TRUE:SAM/BAM file has paired-end reads双端测序, the one read of the overlapping paired-end read pair will be ignored for methylation calling.
# mincov:minimum read coverage
# minqual:minimum phred quality score
getMethylationStats(myobj[[2]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 20.00 82.79 63.17 94.74 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 48.38710 70.00000 82.78556 90.00000 93.33333
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 96.42857 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
getMethylationStats(myobj[[2]],plot=TRUE,both.strands=FALSE)
getMethylationStats(myobj[[2]],plot=TRUE,both.strands=TRUE)
getCoverageStats(myobj[[2]],plot=TRUE,both.strands=FALSE)
3.过滤
filtered.myobj=filterByCoverage(myobj,lo.count=10,lo.perc=NULL,
hi.count=NULL,hi.perc=99.9)
getMethylationStats(filtered.myobj[[2]],plot=FALSE,both.strands=FALSE)
## methylation statistics per base
## summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 20.00 82.76 63.13 94.74 100.00
## percentiles:
## 0% 10% 20% 30% 40% 50% 60% 70%
## 0.00000 0.00000 0.00000 48.23226 70.00000 82.75862 90.00000 93.33333
## 80% 90% 95% 99% 99.5% 99.9% 100%
## 96.42857 100.00000 100.00000 100.00000 100.00000 100.00000 100.00000
meth=unite(myobj, destrand=FALSE)
## uniting...
head(meth)
## chr start end strand coverage1 numCs1 numTs1 coverage2 numCs2 numTs2
## 1 chr21 9853296 9853296 + 17 10 7 333 268 65
## 2 chr21 9853326 9853326 + 17 12 5 329 249 79
## 3 chr21 9860126 9860126 + 39 38 1 83 78 5
## 4 chr21 9906604 9906604 + 68 42 26 111 97 14
## 5 chr21 9906616 9906616 + 68 52 16 111 104 7
## 6 chr21 9906619 9906619 + 68 59 9 111 109 2
## coverage3 numCs3 numTs3 coverage4 numCs4 numTs4
## 1 18 16 2 395 341 54
## 2 16 14 2 379 284 95
## 3 83 83 0 41 40 1
## 4 23 18 5 37 33 4
## 5 23 14 9 37 27 10
## 6 22 18 4 37 29 8
min.per.group
参数的值,该参数的值代表每组中至少有多少个样本覆盖该位点时才保留,如果设置为1,就是取并集。# creates a methylBase object,
# where only CpGs covered with at least 1 sample per group will be returned
# there were two groups defined by the treatment vector,
# given during the creation of myobj: treatment=c(1,1,0,0)
meth.min=unite(myobj,min.per.group=1L)
## uniting...
getCorrelation(meth,plot=TRUE)
## test1 test2 ctrl1 ctrl2
## test1 1.0000000 0.9252530 0.8767865 0.8737509
## test2 0.9252530 1.0000000 0.8791864 0.8801669
## ctrl1 0.8767865 0.8791864 1.0000000 0.9465369
## ctrl2 0.8737509 0.8801669 0.9465369 1.0000000
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
{width="768"}
##
## Call:
## hclust(d = d, method = HCLUST.METHODS[hclust.method])
##
## Cluster method : ward.D
## Distance : pearson
## Number of objects: 4
PCASamples(meth, screeplot=TRUE)
PCASamples(meth)
3.去批次化
# make some batch data frame
# this is a bogus data frame
# we don't have batch information
# for the example data
sampleAnnotation=data.frame(batch_id=c("a","a","b","b"),
age=c(19,34,23,40))
sampleAnnotation
## batch_id age
## 1 a 19
## 2 a 34
## 3 b 23
## 4 b 40
as=assocComp(mBase=meth,sampleAnnotation)
as
## $pcs
## PC1 PC2 PC3 PC4
## test1 -0.4978699 -0.5220504 0.68923849 -0.06737363
## test2 -0.4990924 -0.4805506 -0.71827964 0.06365693
## ctrl1 -0.5016543 0.4938800 0.08068700 0.70563101
## ctrl2 -0.5013734 0.5026102 -0.05014261 -0.70249091
##
## $vars
## [1] 92.271885 4.525328 1.870950 1.331837
##
## $association
## PC1 PC2 PC3 PC4
## batch_id 0.3333333 0.3333333 1.0000000 1.0000000
## age 0.5864358 0.6794346 0.3140251 0.3467957
# construct a new object by removing the first pricipal component
# from percent methylation value matrix
newObj=removeComp(meth,comp=1)
# 甲基化百分比
mat=percMethylation(meth)
head(mat)
## test1 test2 ctrl1 ctrl2
## 1 58.82353 80.48048 88.88889 86.32911
## 2 70.58824 75.91463 87.50000 74.93404
## 3 97.43590 93.97590 100.00000 97.56098
## 4 61.76471 87.38739 78.26087 89.18919
## 5 76.47059 93.69369 60.86957 72.97297
## 6 86.76471 98.19820 81.81818 78.37838
# do some changes in the matrix
# this is just a toy example
# 手动进行修改百分比(究竟如何判断还不太清楚???)
mat[mat==100]=80
#将修改后的百分比和原来的meth形成新的矩阵
newobj=reconstruct(mat,meth)
myDiff=calculateDiffMeth(meth)
## two groups detected:
## will calculate methylation difference as the difference of
## treatment (group: 1) - control (group: 0)
# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
head(myDiff25p)
## chr start end strand pvalue qvalue meth.diff
## 11 chr21 9906681 9906681 + 2.324256e-08 2.172846e-07 44.26808
## 12 chr21 9906694 9906694 + 2.983628e-04 1.006776e-03 28.85090
## 13 chr21 9906700 9906700 + 8.405880e-06 4.097059e-05 40.28016
## 18 chr21 9906873 9906873 + 4.976364e-06 2.603518e-05 47.62876
## 23 chr21 9927527 9927527 + 1.120126e-07 9.257475e-07 -46.10984
## 24 chr21 9944505 9944505 + 7.907909e-20 7.515975e-18 -51.01794
myDiffall=getMethylDiff(myDiff,difference=1,qvalue=1)
df=myDiffall
df$type=ifelse(df$pvalue>0.05,'none',
ifelse( df$meth.diff >25,'hyperDMP',
ifelse( df$meth.diff< -25,'hypoDMP','none') )
)
head(df)
## chr start end strand pvalue qvalue meth.diff type
## 1 chr21 9853296 9853296 + 0.009908142 0.021565813 -7.012107 none
## 3 chr21 9860126 9860126 + 0.041604489 0.069780839 -4.111581 none
## 4 chr21 9906604 9906604 + 0.210651680 0.259453090 -7.346369 none
## 5 chr21 9906616 9906616 + 0.001576498 0.004322201 18.817505 none
## 6 chr21 9906619 9906619 + 0.002824594 0.007321638 14.193732 none
## 7 chr21 9906634 9906634 + 0.034326524 0.059268755 13.888889 none
table(df$type)
##
## hyperDMP hypoDMP none
## 78 60 518
this_tile <- paste0('Cutoff for meth.diff is ',25,
'\nThe number of hyperDMP is ',nrow(df[df$type == 'hyperDMP',]) ,
'\nThe number of hypoDMP is ',nrow(df[df$type == 'hypoDMP',])
)
#火山图
library(ggplot2)
p5 <- ggplot(data = df,
aes(x = meth.diff,
y = -log10(pvalue))) +
geom_point(alpha=0.6, size=1.5,
aes(color=type)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("#34bfb5", "#828586","#ff6633"))+
geom_vline(xintercept= c(-25,25),lty=4,col="grey",lwd=0.8) +
geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.8)+
#xlim(-0.5, 0.5)+
theme_classic()+
ggtitle(this_tile )+
theme(plot.title = element_text(size=12,hjust = 0.5),
legend.title = element_blank()
)
p5
染色体
可视化每条染色体的低/高甲基化碱基/区域的分布(本例只包含一条)
diffMethPerChr(myDiff,plot=T,qvalue.cutoff=0.01, meth.cutoff=25)