使用 R 语言从 PDF 文档中提取表格

王诗翔呀

发布于 2020-07-02 17:25:19

4.4K0

文章被收录于专栏：优雅R优雅R

由于一个知识星球的小伙伴急需学习如何从 PDF 文档中提取表格，所以先插这个课，「使用 R 语言处理 netCDF 数据」系列的课程下次再发新的哈。本课程介绍了如何使用 R 语言从 WHO（世界卫生组织）的官网上下载新冠疫情的每日报告以及如何从这些报告中的表格里面提取数据。

下载所有的 PDF 报告

报告的下载地址为：https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports/

这个非常简单，我的思路是直接获取网页中的所有 <a> 标签的 href 属性，然后过滤出链接中含 .pdf 的，最后再用一个循环下载所有的 PDF 文件即可。

library(rvest)
library(tidyverse)
read_html('https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports/') -> html

# 创建 pdf 文件夹
dir.create("pdf")

html %>% 
  html_nodes("a") %>% 
  html_attr("href") %>% 
  as_tibble() %>% 
  dplyr::filter(str_detect(value, "\\.pdf")) %>% 
  mutate(value = paste0("https://www.who.int", value)) %>% 
  mutate(dest = str_match(value, "situation-reports/(.*)\\?")[,2],
         dest = paste0("pdf/", dest)) %>% 
  distinct() -> links

for(i in 1:nrow(links)) {
  download.file(links$value[i], links$dest[i])
}

运行上面的代码就可以把所有的报告下载到工作目录下面的 pdf 文件夹里面了。

从 PDF 里面提取表格数据

我选择最新的一个 PDF 做演示：20200523-covid-19-sitrep-124.pdf，下面使用 tabulizer 包进行数据提取，不过这个包依赖于 rJava 包，因此在使用这个包之前你需要在电脑上安装 Java 和在 R 里面安装 rJava 包。因为电脑系统的关系，我不好演示，大家可以自己研究下，不难的。我这里提供几个 tips，Java 安装之后可能还需要进行环境变量的配置。

Java下载：https://www.java.com/zh_CN/
rJava 包的安装：

install.packages('rJava')

tabulizer 包的安装：

install.packages("tabulizer")

数据提取，我就不在这里说了，直播的时候再一一讲解。

# 使用 tabulizer 包
library(tabulizer)
library(purrr)
library(tidyr)
library(tidyverse)
f <- "pdf/20200523-covid-19-sitrep-124.pdf"
# Table1 的第一页需要手动选择区域
extract_areas(f, pages = 4, output = "data.frame") -> df1

df1 %>% 
  as.data.frame() %>% 
  as_tibble() %>% 
  slice(-1) %>% 
  set_names(c("Reporting_Country_Territory_Area", 
              "Total_confirmed_cases", 
              "Total_confirmed_new_cases",
              "Total_deaths",
              "Total_new_deaths",
              "Transmission_classification",
              "Days_since_last_reported_case")) %>% 
  mutate(continent = ifelse(Total_confirmed_cases == "", 
                            Reporting_Country_Territory_Area, NA)) %>% 
  select(continent, everything()) %>% 
  fill(continent) %>% 
  dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>% 
  dplyr::filter(!is.na(Total_confirmed_cases)) %>% 
  dplyr::filter(!is.na(Total_confirmed_new_cases)) %>% 
  dplyr::filter(!str_detect(Total_confirmed_new_cases,
                            "[a-z]")) %>% 
  mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
                                                 " ", ""),
         Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
         Total_deaths = str_remove_all(Total_deaths, " "),
         Total_new_deaths = str_remove_all(Total_new_deaths, " "),
         Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " ")) %>% 
  type_convert() %>% 
  dplyr::filter(!is.na(Total_confirmed_cases)) -> page1

# 剩下的页就好办了
extract_tables(f, pages = 5:13, 
               guess = FALSE, output = "data.frame") -> dfl

rbind(
  dfl[[1]],
  dfl[[2]],
  dfl[[3]],
  dfl[[4]],
  dfl[[5]],
  dfl[[6]],
  dfl[[7]],
  dfl[[8]],
  dfl[[9]]
) %>% 
  as_tibble() %>% 
  set_names(c("Reporting_Country_Territory_Area", 
              "Total_confirmed_cases", 
              "Total_confirmed_new_cases",
              "Total_deaths",
              "Total_new_deaths",
              "Transmission_classification",
              "Days_since_last_reported_case")) %>% 
  mutate(continent = ifelse(Total_confirmed_cases == "", 
                            Reporting_Country_Territory_Area, NA)) %>% 
  select(continent, everything()) %>% 
  fill(continent) %>% 
  type_convert() %>% 
  dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>% 
  dplyr::filter(!str_detect(Total_confirmed_new_cases,
                            "[a-z]")) %>% 
  mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
                                                 " ", ""),
         Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
         Total_deaths = str_remove_all(Total_deaths, " "),
         Total_new_deaths = str_remove_all(Total_new_deaths, " "),
         Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " ")) %>% 
  type_convert() %>% 
  dplyr::filter(!is.na(Total_confirmed_cases)) %>% 
  mutate(Total_confirmed_new_cases = as.numeric(Total_confirmed_new_cases)) -> page2

# 爬取最后一页
# Table1 的最一页需要手动选择区域
extract_areas(f, pages = 14, output = "data.frame") -> df2
df2 %>% 
  as.data.frame() %>% 
  as_tibble() %>% 
  slice(-1) %>% 
  set_names(c("Reporting_Country_Territory_Area", 
              "Total_confirmed_cases", 
              "Total_confirmed_new_cases",
              "Total_deaths",
              "Total_new_deaths",
              "Transmission_classification",
              "Days_since_last_reported_case")) %>% 
  mutate(Reporting_Country_Territory_Area = case_when(
    Reporting_Country_Territory_Area == "(Commonwealth of the)" ~ "Northern Mariana Islands(Commonwealth of the)",
    T ~ Reporting_Country_Territory_Area
  )) %>% 
  slice(-1) %>% 
  mutate(continent = ifelse(Total_confirmed_cases == "", 
                            Reporting_Country_Territory_Area, NA)) %>% 
  select(continent, everything()) %>% 
  fill(continent) %>% 
  dplyr::filter(!is.na(Reporting_Country_Territory_Area)) %>% 
  dplyr::filter(!is.na(Total_confirmed_cases)) %>% 
  dplyr::filter(!is.na(Total_confirmed_new_cases)) %>% 
  dplyr::filter(!str_detect(Total_confirmed_new_cases,
                            "[a-z]")) %>% 
  mutate(Total_confirmed_cases = str_replace_all(Total_confirmed_cases,
                                                 " ", ""),
         Total_confirmed_cases = str_remove_all(Total_confirmed_cases, " "),
         Total_deaths = str_remove_all(Total_deaths, " "),
         Total_new_deaths = str_remove_all(Total_new_deaths, " "),
         Days_since_last_reported_case = str_remove_all(Days_since_last_reported_case, " "),
         Total_confirmed_new_cases = str_remove_all(Total_confirmed_new_cases, " ")) %>% 
  type_convert() %>% 
  slice(1:2) %>% 
  dplyr::filter(!is.na(Total_confirmed_cases)) -> page3

bind_rows(page1, page2, page3) %>% 
  fill(continent) -> fulldf

可以保存成 xlsx 文件：

fulldf %>% 
  writexl::write_xlsx('20200524.xlsx')

fulldf 是这样的：

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2020-05-28，如有侵权请联系 cloudcommunity@tencent.com 删除

java

https

网络安全

本文分享自优雅R 微信公众号，前往查看

如有侵权，请联系 cloudcommunity@tencent.com 删除。

本文参与腾讯云自媒体同步曝光计划，欢迎热爱写作的你一起参与！

java

https

网络安全

登录后参与评论

0 条评论

热度