构造示例数据
首先我们需要构造一个示例数据集用于接下来的演示,这里我使用的是我的微信好友数据里面的省份、城市、性别变量。这个数据可以用下面的 Python 脚本获取:
import itchat
import pandas as pd
itchat.auto_login(hotReload = True)
friends = itchat.get_friends(update = True)
friends = pd.DataFrame(friends)
friends.to_csv("friends.csv")
如果你的微信无法通过这种方式导出好友数据,可以直接使用我的 friends.csv
数据集:
# 使用微信好友数据
library(hrbrthemes)
library(tidyverse)
library(ggplot2)
library(usethis)
library(magrittr) # 管道操作符
# 使用 Python 脚本 下载微信好友数据
df <- read_csv("friends.csv") %>%
mutate(
gender = case_when(
Sex == 0 ~ "未知",
Sex == 1 ~ "男孩",
Sex == 2 ~ "女孩"
)
) %>%
dplyr::filter(!is.na(City)) %>%
dplyr::filter(stringr::str_detect(City, "[一-龥]+")) %>%
select(prov = Province, city = City, gender)
df 是这样的:
❝是否记得这是我之前介绍的一个小技巧,为 datatable 表格控件添加下载按钮。 ❞
df %>%
DT::datatable(
extensions = 'Buttons',
options = list(dom = 'Blfrtip',
buttons = c('copy', 'csv', 'excel',
'pdf', 'print'),
lengthMenu = list(c(10, 25, 50 ,-1),
c(10, 25, 50, "All"))))
介绍桑基图画法的教程当然首先要介绍我自己写的 R 包啦,我有一个 sankeywheel 包可以用来绘制桑基图,其实我在 探索微信好友数据 的那个教程里面已经介绍过了这个 R 包的使用,这里重复下:
library(sankeywheel)
df %>%
group_by(prov, gender) %>%
count() %>%
ungroup() -> df_count
df_count
sankeywheel(
from = df_count$prov, to = df_count$gender,
weight = df_count$n, type = "sankey",
title = "我的微信好友分布",
subtitle = "TidyFriday Project",
seriesName = "", width = "100%", height = "600px"
)
但是这样是不是有点太“长”了?我们可以把省份分开成左右两部分:
df_count <- rbind(
df_count %>%
slice(1:25) %>%
`colnames<-`(c("from", "to", "n")),
df_count %>%
slice(26:61) %>%
select(gender, prov, n) %>%
`colnames<-`(c("from", "to", "n"))
)
sankeywheel(
from = df_count$from, to = df_count$to,
weight = df_count$n, type = "sankey",
title = "我的微信好友分布",
subtitle = "TidyFriday Project",
seriesName = "", width = "100%", height = "400px"
)
用过 sankeywheel 包的同学都知道这个包还有另外一个功能,就是它也可以绘制和弦图。是绘制桑基图还是和弦图是有 type 参数决定的,type 参数的默认值是 "dependencywheel",也就是说默认绘制的就是和弦图,之所以这样设置,是因为我觉得这个单词不好写:
sankeywheel(
from = df_count$from, to = df_count$to,
weight = df_count$n,
title = "我的微信好友分布",
subtitle = "TidyFriday Project",
seriesName = "", width = "100%", height = "400px"
)
组合多个 HTML 控件可以使用 manipulateWidget 包:
library(manipulateWidget)
combineWidgets(
sankeywheel(
from = df_count$from, to = df_count$to,
weight = df_count$n, type = "sankey",
title = "我的微信好友分布",
subtitle = "TidyFriday Project",
seriesName = ""
),
sankeywheel(
from = df_count$from, to = df_count$to,
weight = df_count$n,
title = "我的微信好友分布",
subtitle = "TidyFriday Project",
seriesName = ""
),
byrow = TRUE, ncol = 2, width = "100%", height = "400px"
)
这个方法就非常重要了,大家一定要掌握。
首先我们导入 ggalluvial 包,对 df 变量进行分组计数并把返回的结果保存到 pg 数据框里面:
library(ggalluvial)
pg <- df %>%
count(prov, city, gender)
然后就可以绘制一幅基于 ggplot2 的桑基图了:
ggplot(pg, aes(
axis1 = prov, axis2 = gender,
axis3 = city, y = n
), size = 0.001) +
geom_stratum(width = 0.5) +
geom_alluvium(aes(fill = gender), width = 0.5) +
theme_ipsum(cnfont) +
scale_fill_manual(values = c(
"男孩" = "#019875",
"女孩" = "#E84A5F",
"未知" = "#2A363B"
)) +
geom_text(
stat = "stratum",
infer.label = TRUE,
family = cnfont, size = 3.5,
color = "#2A2A2A"
) +
scale_x_continuous(
breaks = 1:3,
labels = c("省份", "性别", "城市")
) +
labs(
y = "人数", title = "我的微信好友分布",
subtitle = "TidyFriday Project"
) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "none",
panel.grid.minor = element_blank()
) -> p
p
我们可以通过下面的方式自定义 y 轴的标签:
# 修改 y 轴的标签
df %>%
count(prov) %>%
group_by(prov) %>%
summarise(value = sum(n)) %>%
pull(value) %>%
rev() %>%
cumsum() -> breaks
for (i in 2:length(breaks)) {
if (breaks[i] - breaks[i - 1] < 15) {
breaks[i - 1] <- NA
}
}
breaks <- breaks[!is.na(breaks)]
p +
scale_y_continuous(breaks = breaks)
大家一定注意到这个图存在很严重的标签重叠问题,有两个解决办法:
ggplot(pg, aes(
axis1 = prov, axis2 = gender,
axis3 = city, y = n
), size = 0.001) +
geom_stratum(width = 0.5) +
geom_alluvium(aes(fill = gender), width = 0.5) +
theme_ipsum(cnfont) +
scale_fill_manual(values = c(
"男孩" = "#019875",
"女孩" = "#E84A5F",
"未知" = "#2A363B"
)) +
ggrepel::geom_text_repel(
stat = "stratum",
infer.label = TRUE,
family = cnfont, size = 3.5,
color = "#2A2A2A"
) +
scale_x_continuous(
breaks = 1:3,
labels = c("省份", "性别", "城市")
) +
labs(
y = "人数", title = "我的微信好友分布",
subtitle = "TidyFriday Project"
) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "none",
panel.grid.minor = element_blank()
) +
scale_y_continuous(breaks = breaks)
ggplot(pg, aes(
axis1 = prov, axis2 = gender,
axis3 = city, y = n
), size = 0.001) +
geom_stratum(width = 0.5) +
geom_alluvium(aes(fill = gender), width = 0.5) +
theme_ipsum(cnfont) +
scale_fill_manual(values = c(
"男孩" = "#019875",
"女孩" = "#E84A5F",
"未知" = "#2A363B"
)) +
ggfittext::geom_fit_text(
stat = "stratum",
infer.label = TRUE,
family = cnfont, min.size = 1,
color = "#2A2A2A"
) +
scale_x_continuous(
breaks = 1:3,
labels = c("省份", "性别", "城市")
) +
labs(
y = "人数", title = "我的微信好友分布",
subtitle = "TidyFriday Project"
) +
theme(
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "none",
panel.grid.minor = element_blank()
) +
scale_y_continuous(breaks = breaks)
ggalluvial 包的详细用法可以参考作者给出的参考文档:
vignette("ggalluvial")
alluvial 包是基于基础绘图系统封装的,似乎不容易解决文本标签相互重叠的问题:
library(alluvial)
pars$family <- cnfont
basetheme(pars)
pg %>%
`colnames<-`(c("省份", "城市", "性别", "数量")) %>%
arrange(数量) -> pg
alluvial(pg[,1:3], freq = pg$`数量`,
col = ifelse(pg$`性别` == "男孩",
"#019875",
ifelse(pg$`性别` == "女孩",
"#E84A5F", "#2A363B")),
border = "grey",
alpha = 0.7)
这个也蛮好用的:
library(echarts4r)
df_count
df_count %>%
e_charts(width = "100%", height = "400px") %>%
e_sankey(from, to, n) %>%
e_title("我的微信好友分布",
textStyle = list("fontSize" = 20,
"fontFamily" = "STSong"),
textAlign = "middle", left = "50%") %>%
e_title("TidyFriday Project",
textStyle = list("fontSize" = 18,
"fontFamily" = "STSong"),
textAlign = "middle",
left = "50%", top = "8%") %>%
e_theme("infographic")
以后搜集到其他好用又好看的方法再分享给大家哈。