This is the data I need:
https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0
I have already imported the table into R:
library(tidyverse)
library(rvest)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
tbls <- html_nodes(webpage, "table")
tbls_ls <- webpage %>%
  html_nodes("table") %>%
  .[5] %>%
  html_table(fill = TRUE)
data <- as_tibble(tbls_ls[[1]])
However, there is one more thing I need to add to the table. For some meteorites, oxygen isotope values are available. These can be seen by clicking the meteorite's entry under the "plot" section; clicking that plot redirects to a page with three isotope values. What I would like to do is add three columns to my table containing each meteorite's respective isotope values. I tried writing code for each "plot" section separately, but I feel there must be a more elegant solution.
Posted on 2021-02-05 06:33:33
You can grab the table without the isotopes, then mimic the page's POST request for the isotope view, and left join the two results on the Name column. You will end up with more rows than in the left (no-isotope) table, because some meteorites have multiple isotope entries; this matches what you see when inspecting the isotopes by hand, where the plot page shows a comma-separated list of isotope values rather than splitting them across rows. A sketch for collapsing those duplicates back into one row per meteorite follows the first script below.
I went with a more selective CSS selector to target the specific table of interest from the start, rather than indexing into a list of tables.
I use write_excel_csv() to preserve the character encoding of the headers when writing out (an idea I got from @stefan).
You can drop the columns you don't want in the output before writing out (subset/select, etc.); a small sketch follows.
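For example, a minimal sketch of subsetting before the write-out, using the joint_table built by the script below (the selected columns other than Name are placeholders; check names(joint_table) for the real ones):

# Keep only the columns of interest before writing out (placeholder selection)
joint_table %>%
  dplyr::select(Name, dplyr::starts_with("Mass")) %>%
  readr::write_excel_csv(file = "joint_subset.csv", na = "")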
library(dplyr)
library(httr)
library(rvest)
library(readr)
library(magrittr)
library(stringr)
webpage <- read_html("https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0")
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
data <- list(
  'sfor' = "names",
  'stype' = "contains",
  'country' = "All",
  'categ' = "Ungrouped achondrites",
  'page' = "0",
  'map' = "ge",
  'srt' = "name",
  'lrec' = "200",
  'pnt' = "Oxygen isotopes",
  'mblist' = "All",
  'snew' = "0",
  'sea' = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
write_excel_csv(x = joint_table, file = "joint.csv", col_names = TRUE, na = "")
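If you would rather keep one row per meteorite, here is a minimal sketch for collapsing the duplicated isotope rows back into comma-separated values, assuming Name is the only grouping key you need (note that every other column becomes character in the process):

# Collapse duplicate rows per meteorite: paste the distinct values of every
# other column together, comma separated (all columns are coerced to character)
collapsed <- joint_table %>%
  dplyr::group_by(Name) %>%
  dplyr::summarise(
    dplyr::across(dplyr::everything(), ~ paste(unique(.x), collapse = ", ")),
    .groups = "drop"
  )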
Edit:
Adding the additional information from the other URLs, as requested in the comments. I had to dynamically determine which table number to extract, and also handle the case where a page has no table.
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
library(readr)
library(stringr)
get_table <- function(url) {
  page <- read_html(url)
  test_list <- page %>%
    html_nodes("#maintable tr > .inside:nth-child(odd)") %>%
    html_text() # left-hand column of row labels
  # the details table sits in the row following the "Data from:" label
  index <- match(TRUE, stringr::str_detect(test_list, "Data from:")) + 1
  table <- page %>%
    html_node(paste0("#maintable tr:nth-of-type(", index, ") table")) %>%
    html_table() %>%
    as_tibble()
  # transpose so the row labels become column names
  # https://www.nesono.com/node/456 ; https://stackoverflow.com/a/7970267/6241235
  temp <- set_names(data.frame(t(table[, -1]), row.names = c()), t(table[, 1]))
  return(temp)
}
start_url <- "https://www.lpi.usra.edu/meteor/metbull.php?sea=%2A&sfor=names&ants=&nwas=&falls=&valids=&stype=contains&lrec=200&map=ge&browse=&country=All&srt=name&categ=Ungrouped+achondrites&mblist=All&rect=&phot=&strewn=&snew=0&pnt=Normal%20table&dr=&page=0"
base <- "https://www.lpi.usra.edu"
webpage <- read_html(start_url)
no_isotopes <- webpage %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
data <- list(
  "sfor" = "names",
  "stype" = "contains",
  "country" = "All",
  "categ" = "Ungrouped achondrites",
  "page" = "0",
  "map" = "ge",
  "srt" = "name",
  "lrec" = "200",
  "pnt" = "Oxygen isotopes",
  "mblist" = "All",
  "snew" = "0",
  "sea" = "*"
)
r <- httr::POST(url = "https://www.lpi.usra.edu/meteor/metbull.php", body = data)
isotopes <- content(r, "text") %>%
  read_html(encoding = "UTF-8") %>%
  html_node("#maintable") %>%
  html_table(fill = TRUE)
joint_table <- dplyr::left_join(no_isotopes, isotopes, by = "Name", copy = FALSE)
lookups <- webpage %>%
  html_node("#maintable") %>%
  html_nodes("td:nth-of-type(1) a") %>%
  map_df(~ c(html_text(.), html_attr(., "href")) %>%
    set_names("Name", "Link")) %>%
  mutate(Link = paste0(base, gsub("\\s+", "%20", Link)))
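# Optional sanity check (assumes lookups has at least one row): fetch a single
# detail page first to confirm the selectors in get_table() work before
# launching the full parallel scrape below.
# get_table(lookups$Link[1])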
# fallback row returned for detail pages that have no table
error_df <- tibble(
  `State/Prov/County:` = NA_character_,
  `Origin or pseudonym:` = NA_character_,
  `Date:` = NA_character_,
  `Latitude:` = NA_character_,
  `Longitude:` = NA_character_,
  `Mass (g):` = NA_character_,
  `Pieces:` = NA_character_,
  `Class:` = NA_character_,
  `Shock stage:` = NA_character_,
  `Fayalite (mol%):` = NA_character_,
  `Ferrosilite (mol%):` = NA_character_,
  `Wollastonite (mol%):` = NA_character_,
  `Magnetic suscept.:` = NA_character_,
  `Classifier:` = NA_character_,
  `Type spec mass (g):` = NA_character_,
  `Type spec location:` = NA_character_,
  `Main mass:` = NA_character_,
  `Finder:` = NA_character_,
  `Comments:` = NA_character_
)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- furrr::future_map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))
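# Sequential alternative (a sketch, if you prefer to avoid the furrr/future
# dependency; slower but produces the same result):
# df <- purrr::map_dfr(lookups$Link, ~ tryCatch(get_table(.x), error = function(e) error_df))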
colnames(df) <- sub(":", "", colnames(df))
df2 <- df %>%
  mutate(
    `Mass (g)` = gsub(",", "", `Mass (g)`),
    across(c(`Mass (g)`, `Magnetic suscept.`), as.numeric)
  )
if (nrow(df2) == nrow(no_isotopes)) {
  additional_info <- cbind(lookups, df2)
  joint_table$Name <- gsub(" \\*\\*", "", joint_table$Name)
  final_table <- dplyr::left_join(joint_table, additional_info, by = "Name", copy = FALSE)
  write_excel_csv(x = final_table, file = "joint.csv", col_names = TRUE, na = "")
}
Created on 2021-02-27 by the reprex package (v0.3.0)
https://stackoverflow.com/questions/66048005