This is the data I need:
https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0
I have already imported the table into R:
library(tidyverse)
library(rvest)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
tbls <- html_nodes(webpage, "table")
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)
data <- as_tibble(tbls_ls[[1]])
However, there is one more thing I need to add to the table. For some meteorites, oxygen isotope values are available. These can be seen by clicking the meteorite's entry under the "plot" section; clicking that plot redirects to a page with three isotope values. What I would like to do is add three columns to my table containing each meteorite's respective isotope values. I tried writing code for each "plot" section separately, but I feel there must be a more elegant solution.
Posted on 2021-02-05 06:33:33
You can grab the table without the isotopes, then mimic the page's POST request for the isotope view, and left join the two results on the Name column. You will end up with more rows than in the left (no-isotope) table, because some meteorites have multiple isotope entries; this matches what you see when inspecting the isotopes by hand, where the plot page shows a comma-separated list of isotope values rather than splitting them across rows. A sketch for collapsing those duplicates back into one row per meteorite follows the first script below.
I went with a more selective CSS selector to target the specific table of interest from the start, rather than indexing into a list of tables.
I use write_excel_csv() to preserve the character encoding of the headers when writing out (an idea I got from @stefan).
You can drop the columns you don't want in the output before writing out (subset/select, etc.); a small sketch follows.
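For example, a minimal sketch of subsetting before the write-out, using the joint_table built by the script below (the selected columns other than Name are placeholders; check names(joint_table) for the real ones):

# Keep only the columns of interest before writing out (placeholder selection)
joint_table %>%
  dplyr::select(Name, dplyr::starts_with("Mass")) %>%
  readr::write_excel_csv(file = "joint_subset.csv", na = "")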
library(dplyr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
data <- list(
  'sfor' = "names",
  'stype' = "contains",
  'country' = "All",
  'categ' = "Ungrouped achondrites",
  'page' = "0",
  'map' = "ge",
  'srt' = "name",
  'lrec' = "200",
  'pnt' = "Oxygen isotopes",
  'mblist' = "All",
  'snew' = "0",
  'sea' = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
write_excel_csv(x = joint_table, file = "joint.csv", col_names = TRUE, na = "")
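If you would rather keep one row per meteorite, here is a minimal sketch for collapsing the duplicated isotope rows back into comma-separated values, assuming Name is the only grouping key you need (note that every other column becomes character in the process):

# Collapse duplicate rows per meteorite: paste the distinct values of every
# other column together, comma separated (all columns are coerced to character)
collapsed <- joint_table %>%
  dplyr::group_by(Name) %>%
  dplyr::summarise(
    dplyr::across(dplyr::everything(), ~ paste(unique(.x), collapse = ", ")),
    .groups = "drop"
  )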
Edit:
Adding the additional information from the other URLs, as requested in the comments. I had to dynamically determine which table number to extract, and also handle the case where a page has no table.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(readr)
library(stringr)
get_table <- function(url) {
  page <- read_html(url)
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text() # left-hand column of row labels
  # the details table sits in the row following the "Data from:" label
  index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
    html_table() %>%
    as_tibble()
  # transpose so the row labels become column names
  # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1]))
  return(temp)
}
start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
data <- list(
  "sfor" = "names",
  "stype" = "contains",
  "country" = "All",
  "categ" = "Ungrouped achondrites",
  "page" = "0",
  "map" = "ge",
  "srt" = "name",
  "lrec" = "200",
  "pnt" = "Oxygen isotopes",
  "mblist" = "All",
  "snew" = "0",
  "sea" = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))
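# Optional sanity check (assumes lookups has at least one row): fetch a single
# detail page first to confirm the selectors in get_table() work before
# launching the full parallel scrape below.
# get_table(lookups$Link[1])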
# fallback row returned for detail pages that have no table
error_df <- tibble(
  `State/Prov/County:` = NA_character_,
  `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_,
  `Latitude:` = NA_character_,
  `Longitude:` = NA_character_,
  `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_,
  `Class:` = NA_character_,
  `Shock stage:` = NA_character_,
  `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_,
  `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_,
  `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_,
  `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_,
  `Finder:` = NA_character_,
  `Comments:` = NA_character_
)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))
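# Sequential alternative (a sketch, if you prefer to avoid the furrr/future
# dependency; slower but produces the same result):
# df <- purrr::map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))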
colnames(df) <- sub(":", "", colnames(df))
df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )
if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", col_names = TRUE, na = "")
}
Created on 2021-02-27 by the reprex package (v0.3.0)
https://stackoverflow.com/questions/66048005