由于一个知识星球的小伙伴急需学习如何从 PDF 文档中提取表格,所以先插这个课,「使用 R 语言处理 netCDF 数据」系列的课程下次再发新的哈。本课程介绍了如何使用 R 语言从 WHO(世界卫生组织)的官网上下载新冠疫情的每日报告以及如何从这些报告中的表格里面提取数据。
报告的下载地址为:https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports/
这个非常简单,我的思路是直接获取网页中的所有 <a>
标签的 href 属性,然后过滤出链接中含 .pdf
的,最后再用一个循环下载所有的 PDF 文件即可。
library(rvest)
library(tidyverse)
read_html('https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports/') -> html
# 创建 pdf 文件夹
dir.create("pdf")
html %>%
html_nodes("a") %>%
html_attr("href") %>%
as_tibble() %>%
dplyr::filter(str_detect(value, "\\.pdf")) %>%
mutate(value = paste0("https://www.who.int", value)) %>%
mutate(dest = str_match(value, "situation-reports/(.*)\\?")[,2],
dest = paste0("pdf/", dest)) %>%
distinct() -> links
for(i in 1:nrow(links)) {
download.file(links$value[i], links$dest[i])
}
运行上面的代码就可以把所有的报告下载到工作目录下面的 pdf 文件夹里面了。
我选择最新的一个 PDF 做演示:20200523-covid-19-sitrep-124.pdf
,下面使用 tabulizer 包进行数据提取,不过这个包依赖于 rJava
包,因此在使用这个包之前你需要在电脑上安装 Java 和在 R 里面安装 rJava 包。因为电脑系统的关系,我不好演示,大家可以自己研究下,不难的。我这里提供几个 tips,Java 安装之后可能还需要进行环境变量的配置。
install.packages('rJava')
install.packages("tabulizer")
数据提取,我就不在这里说了,直播的时候再一一讲解。
# 使用 tabulizer 包
library(tabulizer)
library(purrr)
library(tidyr)
library(tidyverse)
f <- "pdf/20200523-covid-19-sitrep-124.pdf"
# Table1 的第一页需要手动选择区域
extract_areas(f, pages = 4, output = "data.frame") -> df1
df1 %>%
as.data.frame() %>%
as_tibble() %>%
slice(-1) %>%
set_names(c("Reporting_Country_Territory_Area",
"Total_confirmed_cases",
"Total_confirmed_new_cases",
"Total_deaths",
"Total_new_deaths",
"Transmission_classification",
"Days_since_last_reported_case")) %>%
mutate(continent = ifelse(Total_confirmed_cases == "",
Reporting_Country_Territory_Area, NA)) %>%
select(continent, everything()) %>%
fill(continent) %>%
dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>%
dplyr::filter(!is.na(Total_confirmed_cases)) %>%
dplyr::filter(!is.na(Total_confirmed_new_cases)) %>%
dplyr::filter(!str_detect(Total_confirmed_new_cases,
"[a-z]")) %>%
mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
" ", ""),
Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
Total_deaths = str_remove_all(Total_deaths, " "),
Total_new_deaths = str_remove_all(Total_new_deaths, " "),
Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " ")) %>%
type_convert() %>%
dplyr::filter(!is.na(Total_confirmed_cases)) -> page1
# 剩下的页就好办了
extract_tables(f, pages = 5:13,
guess = FALSE, output = "data.frame") -> dfl
rbind(
dfl[[1]],
dfl[[2]],
dfl[[3]],
dfl[[4]],
dfl[[5]],
dfl[[6]],
dfl[[7]],
dfl[[8]],
dfl[[9]]
) %>%
as_tibble() %>%
set_names(c("Reporting_Country_Territory_Area",
"Total_confirmed_cases",
"Total_confirmed_new_cases",
"Total_deaths",
"Total_new_deaths",
"Transmission_classification",
"Days_since_last_reported_case")) %>%
mutate(continent = ifelse(Total_confirmed_cases == "",
Reporting_Country_Territory_Area, NA)) %>%
select(continent, everything()) %>%
fill(continent) %>%
type_convert() %>%
dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>%
dplyr::filter(!str_detect(Total_confirmed_new_cases,
"[a-z]")) %>%
mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
" ", ""),
Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
Total_deaths = str_remove_all(Total_deaths, " "),
Total_new_deaths = str_remove_all(Total_new_deaths, " "),
Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " ")) %>%
type_convert() %>%
dplyr::filter(!is.na(Total_confirmed_cases)) %>%
mutate(Total_confirmed_new_cases = as.numeric(Total_confirmed_new_cases)) -> page2
# 爬取最后一页
# Table1 的最一页需要手动选择区域
extract_areas(f, pages = 14, output = "data.frame") -> df2
df2 %>%
as.data.frame() %>%
as_tibble() %>%
slice(-1) %>%
set_names(c("Reporting_Country_Territory_Area",
"Total_confirmed_cases",
"Total_confirmed_new_cases",
"Total_deaths",
"Total_new_deaths",
"Transmission_classification",
"Days_since_last_reported_case")) %>%
mutate(Reporting_Country_Territory_Area = case_when(
Reporting_Country_Territory_Area == "(Commonwealth of the)" ~ "Northern Mariana Islands(Commonwealth of the)",
T ~ Reporting_Country_Territory_Area
)) %>%
slice(-1) %>%
mutate(continent = ifelse(Total_confirmed_cases == "",
Reporting_Country_Territory_Area, NA)) %>%
select(continent, everything()) %>%
fill(continent) %>%
dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>%
dplyr::filter(!is.na(Total_confirmed_cases)) %>%
dplyr::filter(!is.na(Total_confirmed_new_cases)) %>%
dplyr::filter(!str_detect(Total_confirmed_new_cases,
"[a-z]")) %>%
mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
" ", ""),
Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
Total_deaths = str_remove_all(Total_deaths, " "),
Total_new_deaths = str_remove_all(Total_new_deaths, " "),
Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " "),
Total_confirmed_new_cases = str_remove_all(Total_confirmed_new_cases, " ")) %>%
type_convert() %>%
slice(1:2) %>%
dplyr::filter(!is.na(Total_confirmed_cases)) -> page3
bind_rows(page1, page2, page3) %>%
fill(continent) -> fulldf
可以保存成 xlsx 文件:
fulldf %>%
writexl::write_xlsx('20200524.xlsx')
fulldf 是这样的: