导语
GUIDE ╲
ggtree是ggplot2的拓展包,可以应用于进化树的绘制,还能对进化树丰富的注释分析。
背景介绍
最近小编在阅读文献发现了文献中使用了一些精美的树状图,觉得非常漂亮,随后又去网上学习树状图的画法,顺便还学习了一种有趣的圆形树状图,在这里小编一起分享给大家,并且介绍今天的主角,树状图绘制R包:ggtree。
atlas of colonic CD8+ T cells in ulcerative colitis.Nature Medicine.2020
系统发育树用于描述一组生物之间的家谱关系,可以根据这些生物的遗传序列进行构建。 传统的系统发育树代表了一种进化史的模型,该树由树节点之间的祖先后代关系和处于不同相关程度的“sister”或“cousin”的聚类描绘而成。
ggtree是一个功能强大的系统发育树可视化及注释R语言软件包,在Bioconductor中发布,是ggplot2的扩展包。ggtree可以读取多种数据格式的系统发育树,并对其进行注释分析。
ggtree的安装
首先通过bioconductor安装ggtree包(在接下来的绘图展示中,还需要安装其他依赖包,也可以用此命令安装)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
#下载安装
BiocManager::install("ggtree")
library(ggplot2) # 加载ggplot2
library(ggtree)
ggtree的使用
01
函数介绍
首先介绍一下ggtree函数以及内部的参数
ggtree(
tr,##数据
mapping = NULL,
layout = "rectangular",##树状图的形状(分为矩形,扇形,倾斜等)
open.angle = 0,##开放角度,当layout为扇形时可设置
mrsd = NULL,##most recent sampling date
as.Date = FALSE,##是否在决策树中使用日期
yscale = "none",#y标尺
yscale_mapping = NULL,
ladderize = TRUE,##逻辑变量,是否以阶梯型展示树
right = FALSE,##阶梯的右边是否应该有最小的分支
branch.length = "branch.length",##用于缩放分支的变量
root.position = 0,##根节点的位置
...
)
02
ggtree可实现的一些绘图展示
在这里,小编给大家分享一些ggtree可以实现的绘图结果展示(当然只是一部分,ggtree还有非常多的功能可以满足大家的需求)
基本树状图绘制
绘制SNP和特征的数据
对具有多个序列比对进行可视化
圆形树状图
多维数据的树状图可视化
03
ggtree绘图操作示例
系统发育树可视化物种丰富度分布
物种丰富度是连续的数值数据,通常可以表现为箱线图,小提琴图或密度曲线,在这一部分展示中使用了微生物组数据进行绘图。在phyloseq包中,采用密度脊线对丰富度数据可视化。
library(phyloseq)
library(ggridges)
library(dplyr)
library(ggtree)
##数据的读取及处理
data("GlobalPatterns")
GP <- GlobalPatterns
GP <- prune_taxa(taxa_sums(GP) > 1000, GP)
sample_data(GP)$human <- get_variable(GP, "SampleType") %in% c("Feces", "Skin")
mergedGP <- merge_samples(GP, "SampleType")
mergedGP <- rarefy_even_depth(mergedGP,rngseed=394582)
mergedGP <- tax_glom(mergedGP,"Order")
melt_simple <- psmelt(mergedGP) %>%
filter(Abundance < 120) %>%
select(OTU, val = Abundance)
##ggtree绘图
p <- ggtree(mergedGP) +
geom_tippoint(aes(color = Phylum), size = 1.5)
##facet_plot根据树结构自动重新排列丰富度数据
##geom_density_ridges函数将密度曲线与树对齐
facet_plot(p, panel="Abundance", data=melt_simple,
geom_density_ridges, mapping = aes(x = val, group = label,
fill = Phylum),
color = 'grey80', lwd = .3)
系统发育树可视化成对核苷酸序列距离
本示例还原了(Chen et al.2017)的Fig1,通过HPV58树计算成对的核苷酸序列距离,演示了向特定面板添加多个图层的功能。facet_plot函数将序列距离表示为点图,然后在同一面板添加一个线图层。另外,facet_plot中的树可以添加多层注释(clade,bootstrap等)
library(tibble)
library(tidyr)
library(Biostrings)
library(treeio)
library(ggplot2)
library(ggtree)
library(muscle)
tree <- read.tree("C:\\Users\\Administrator\\Desktop\\plotting_tree_with_data-master\\HPV58.tree")
clade <- c(A3 = 92, A1 = 94, A2 = 108, B1 = 156, B2 = 159, C = 163, D1 = 173, D2 = 176)
tree <- groupClade(tree, clade)
cols <- c(A1 = "#EC762F", A2 = "#CA6629", A3 = "#894418", B1 = "#0923FA",
B2 = "#020D87", C = "#000000", D1 = "#9ACD32",D2 = "#08630A")
## 使用标签和比例可视化树
p <- ggtree(tree, aes(color = group), ladderize = FALSE) +
geom_tiplab(aes(label = paste0("italic('", label, "')")), parse = TRUE, size = 2.5) +
geom_treescale(x = 0, y = 1, width = 0.002) +
scale_color_manual(values = c(cols, "black"), na.value = "black", name = "Lineage",
breaks = c("A1", "A2", "A3", "B1", "B2", "C", "D1", "D2")) +
guides(color = guide_legend(override.aes = list(size = 5, shape = 15))) +
theme_tree2(legend.position = c(.1, .88))
## 为单系(A,C和 D)和副系(B)组添加标签
p <- p + geom_cladelabel(94, "italic(A1)", color = cols[["A1"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(108, "italic(A2)", color = cols[["A2"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = 0.5, parse = TRUE) +
geom_cladelabel(131, "italic(A3)", color = cols[["A3"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_cladelabel(92, "italic(A)", color = "darkgrey", offset = .00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE) +
geom_cladelabel(156, "italic(B1)", color = cols[["B1"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(159, "italic(B2)", color = cols[["B2"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_strip(65, 71, "italic(B)", color = "darkgrey", offset = 0.00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE) +
geom_cladelabel(163, "italic(C)", color = "darkgrey", offset = .0031, align = TRUE,
offset.text = 0.0002, barsize = 3.2, fontsize = 5, parse = TRUE) +
geom_cladelabel(173, "italic(D1)", color = cols[["D1"]], offset = .003, align = TRUE,
offset.text = -.001, barsize = 1.2, extend = c(0, 0.5), parse = TRUE) +
geom_cladelabel(176, "italic(D2)", color = cols[["D2"]], offset = .003, align = TRUE,offset.text = -.001, barsize = 1.2, extend = c(0.5, 0), parse = TRUE) +
geom_cladelabel(172, "italic(D)", color = "darkgrey", offset = .00315, align = TRUE,
offset.text = 0.0002, barsize = 2, fontsize = 5, parse = TRUE)
##显示支持values
p <- p + geom_nodelab(aes(subset = (node == 92), label = "*"),
color = "black", nudge_x = -.001, nudge_y = 1) +
geom_nodelab(aes(subset = (node == 155), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1) +
geom_nodelab(aes(subset = (node == 158), label = "95/92/1.00"),
color = "black", nudge_x = -0.0001, nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 162), label = "98/97/1.00"),
color = "black", nudge_x = -0.0001, nudge_y = -1, hjust = 1) +
geom_nodelab(aes(subset = (node == 172), label = "*"),
color = "black", nudge_x = -.0003, nudge_y = -1)
## 从提示标签中提取accession numbers
tl <- tree$tip.label
acc <- sub("\\w+\\|", "", tl)
names(tl) <- acc
## 将序列从GenBank直接读取到R中,并将对象转换为DNAStringSet
tipseq <- ape::read.GenBank(acc) %>% as.character %>%
lapply(., paste0, collapse = "") %>% unlist %>%
DNAStringSet
## 比对序列
tipseq_aln <- muscle::muscle(tipseq)
tipseq_aln <- DNAStringSet(tipseq_aln)
## 计算成对距离
tipseq_dist <- stringDist(tipseq_aln, method = "hamming")
## 计算差异百分比
tipseq_d <- as.matrix(tipseq_dist) / width(tipseq_aln[1]) * 100
## 将矩阵转换为facet_plot的data.frame
dd <- as_data_frame(tipseq_d)
dd$seq1 <- rownames(tipseq_d)
td <- gather(dd,seq2, dist, -seq1)
td$seq1 <- tl[td$seq1]
td$seq2 <- tl[td$seq2]
g <- p$data$group
names(g) <- p$data$label
td$clade <- g[td$seq2]
##使用点图和线图可视化序列差异
## 使用facet_plot将序列差异图与树对齐
p2 <- facet_plot(p, panel = "Sequence Distance", data = td, geom_point,
mapping = aes(x = dist, color = clade, shape = clade), alpha = .6) %>%
facet_plot(panel = "Sequence Distance", data = td, geom = geom_path,
mapping=aes(x = dist, group = seq2, color = clade), alpha = .6) +
scale_shape_manual(values = 1:8, guide = FALSE)
print(p2)
使用phylobase绘制关联数据
phylobase包定义了phylo4d,将树与数据框组合在一起并绘图,内部调用treePlot函数。但是它只能将与树相关的数据的数字值绘制为气泡,并且无法生成图例。Phylobase还不支持将关联数据改变例如颜色,大小和形状等特征。这些特征需要大家手动添加。下面是使用phylobase绘制关联数据的示例:
library(phylobase)
data(geospiza_raw)
g1 <- as(geospiza_raw$tree, "phylo4")
g2 <- phylo4d(g1, geospiza_raw$data, missing.data="warn")
plot(g2)
d1 <- data.frame(x = seq(0.93, 1.15, length.out = 5),
lab = names(geospiza_raw$data))
## 使用存储在“g2”中的数据直接绘制气泡图
p1 <- ggtree(g2) + geom_tippoint(aes(size = wingL), x = d1$x[1], shape = 1) +
geom_tippoint(aes(size = tarsusL), x = d1$x[2], shape = 1) +
geom_tippoint(aes(size = culmenL), x = d1$x[3], shape = 1) +
geom_tippoint(aes(size = beakD), x = d1$x[4], shape = 1) +
geom_tippoint(aes(size = gonysW), x = d1$x[5], shape = 1) +
scale_size_continuous(range = c(3,12), name="") +
geom_text(aes(x = x, y = 0, label = lab), data = d1, angle = 90) +
geom_tiplab(offset = .3) + xlim(0, 1.3) +
theme(legend.position = c(.1, .75)) + labs(tag = "A")
library(dplyr)
library(tidyr)
## 从“ g2”提取提示数据并使用“facet_plot”可视化数据
d <- tipData(g2)
d$tip <- rownames(d)
dd <- gather(d, feature, value, -tip)
cat <- seq(ncol(d))
names(cat) <- names(d)
dd$cat <- cat[dd$feature]
d2 <- select(dd, -value) %>% filter(tip == 'fuliginosa')
p <- ggtree(g2) + geom_tiplab() + xlim_tree(c(0, 1.2))
p2 <- facet_plot(p, "Morphometric data", dd, geom_point, aes(x = cat, size = value), shape = 1) %>%
facet_plot("Morphometric data", d2, geom_text, aes(x = cat, y = 0, label = feature), angle = 90) +
scale_size_continuous(range = c(3, 12)) +
theme(legend.position = "right") +
coord_cartesian(clip = "off") + labs(tag = "B")
cowplot::plot_grid(p1, p2)
diet = data.frame(species=tipLabels(g2),
Diet = c("Seeds","Seeds", "Seeds", "Cacti", "Cacti", "Seeds","Insects",
"Insects", "Insects", "Insects", "Fruits", "Insects", "Insects", "Insects"))
##添加一些个性化的设置
p <- ggtree(g2) %<+% diet + geom_tiplab(aes(color = Diet), offset = .05) +
geom_tippoint(aes(color = Diet), size = 5, alpha = .5) +
xlim_tree(c(0, 1.2))
p2 <- gheatmap(ggtree(g2), tipData(g2), colnames_angle = 90) %<+% diet
p3 <- facet_plot(p, "Morphometric data", dd, geom_tile, aes(x = cat, fill = value)) %>%
facet_plot("Morphometric data", d2, geom_text,
aes(x = cat, y = 0, label = feature), angle = 90) +
scale_fill_viridis_c(na.value = "white") + labs(tag = "A") + theme(legend.position="right")
p4 <- facet_plot(p2, "Morphometric data", dd, geom_point,
aes(x = cat, size = value, color = Diet)) %>%
facet_plot("Morphometric data", d2, geom_text,
aes(x = cat, y = 0, label = feature), angle = 90) +
scale_size_continuous(range = c(3, 12)) + labs(tag = "B") +
theme(legend.position = "right") + coord_cartesian(clip = "off")
cowplot::plot_grid(p3, p4)
ggtree对关联矩阵进行可视化
在本示例中,可视化了具有相关基因型的H3流感病毒树
beast_file <- system.file("examples/MCC_FluA_H3.tree", package="ggtree")
beast_tree <- read.beast(beast_file)
#####读取type数据
genotype_file <- system.file("examples/Genotype.txt", package="ggtree")
genotype <- read.table(genotype_file, sep="\t", stringsAsFactor=F)
colnames(genotype) <- sub("\\.$", "", colnames(genotype))
p <- ggtree(beast_tree, mrsd="2013-01-01") +
geom_treescale(x=2008, y=1, offset=2) +
geom_tiplab(size=2)
######gheatmap绘制热图
gheatmap(p, genotype, offset=5, width=0.5, font.size=3,
colnames_angle=-45, hjust=0) +
scale_fill_manual(breaks=c("HuH3N2", "pdm", "trig"),
values=c("steelblue", "firebrick", "darkgreen"), name="genotype")
p <- ggtree(beast_tree, mrsd="2013-01-01") +
geom_tiplab(size=2, align=TRUE, linesize=.5) +
theme_tree2()
gheatmap(p, genotype, offset=8, width=0.6,
colnames=FALSE, legend_title="genotype") +
scale_x_ggtree() +
scale_y_continuous(expand=c(0, 0.3))
ggtree还可以对多个关联矩阵进行可视化
用多个gheatmap函数将多个关联的矩阵与树对齐,但是ggplot2不允许使用多个填充比例。要解决此问题,可以使用ggnewscale创建新的填充比例。
nwk <- system.file("extdata", "sample.nwk", package="treeio")
tree <- read.tree(nwk)
circ <- ggtree(tree, layout = "circular")
##设置level
df <- data.frame(first=c("a", "b", "a", "c", "d", "d", "a", "b", "e", "e", "f", "c", "f"),
second= c("z", "z", "z", "z", "y", "y", "y", "y", "x", "x", "x", "a", "a"))
rownames(df) <- tree$tip.label
df2 <- as.data.frame(matrix(rnorm(39), ncol=3))
rownames(df2) <- tree$tip.label
colnames(df2) <- LETTERS[1:3]
##gheatmap绘制热图
p1 <- gheatmap(circ, df, offset=.8, width=.2,
colnames_angle=95, colnames_offset_y = .25) +
scale_fill_viridis_d(option="D", name="discrete\nvalue")
library(ggnewscale)
###加入注释
p2 <- p1 + new_scale_fill()
gheatmap(p2, df2, offset=15, width=.3,
colnames_angle=90, colnames_offset_y = .25) +
scale_fill_viridis_c(option="A", name="continuous\nvalue")
复合图的绘制
除了使用geom_facet将图形对齐到树,大家还可以使用Cowplot,patchwork,gtable或其他软件包来创建复合图。为了更便捷的实现这个功能,在这里推荐一个R包aplot,可以重新排列ggplot对象的内部数据,并创建与树正确对齐的复合图。
library(ggplot2)
library(ggtree)
set.seed(2019-10-31)
tr <- rtree(10)
d1 <- data.frame(
# only some labels match
label = c(tr$tip.label[sample(5, 5)], "A"),
value = sample(1:6, 6))
d2 <- data.frame(
label = rep(tr$tip.label, 5),
category = rep(LETTERS[1:5], each=10),
value = rnorm(50, 0, 3))
g <- ggtree(tr) + geom_tiplab(align=TRUE)
p1 <- ggplot(d1, aes(label, value)) + geom_col(aes(fill=label)) +
geom_text(aes(label=label, y= value+.1)) +
coord_flip() + theme_tree2() + theme(legend.position='none')
p2 <- ggplot(d2, aes(x=category, y=label)) +
geom_tile(aes(fill=value)) + scale_fill_viridis_c() +
theme_tree2()
cowplot::plot_grid(g, p2, p1, ncol=3)
library(aplot)
p2 %>% insert_left(g) %>% insert_right(p1, width=.5)
用子图对树进行注释
ggtree提供了geom_inset,用于将子图添加到系统树中。输入是ggplot图形对象的命名列表(可以是任何类型的图表)。还可以使用ggplotify将其他功能生成的图转换为ggplot对象,然后在geom_inset中使用该对象。
#####
library(phytools)
data(anoletree)
x <- getStates(anoletree,"tips")
tree <- anoletree
cols <- setNames(palette()[1:length(unique(x))],sort(unique(x)))
fitER <- ape::ace(x,tree,model="ER",type="discrete")
ancstats <- as.data.frame(fitER$lik.anc)
ancstats$node <- 1:tree$Nnode+Ntip(tree)
## cols参数指示哪些列存储统计信息
bars <- nodebar(ancstats, cols=1:6)
bars <- lapply(bars, function(g) g+scale_fill_manual(values = cols))
library(dplyr)
tree2 <- full_join(tree, data.frame(label = names(x), stat = x ), by = 'label')
p <- ggtree(tree2) + geom_tiplab() +
geom_tippoint(aes(color = stat)) +
scale_color_manual(values = cols) +
theme(legend.position = "right") +
xlim(NA, 8)
p + geom_inset(bars, width = .08, height = .05, x = "branch")
文章参考:
1、https://yulab-smu.top/treedata-book/
2、https://github.com/GuangchuangYu/plotting_tree_with_data/
小编总结
ggtree作为一个功能丰富的树状图绘制R包,一定能够成为大家科研路上的助力!小伙伴们是不是对使用ggtree绘制树状图增加了许多了解呢?大家可以进一步去开发ggtree的其他功能,关于ggtree更详细的信息也可以参考:https://yulab-smu.top/treedata-book