Extracts scripts from an html page
Usage
cas_extract_script(
html_document,
script_type = NULL,
match = NULL,
accessors = NULL,
remove_from_script = NULL
)
Arguments
- html_document
An html document parsed with
xml2::read_html()
or rvest::read_html().
- script_type
Defaults to NULL. Type of script. Common script types include
application/ld+json, text/template, etc.
- match
Defaults to NULL. If given, used to filter extracted scripts. Must be a named vector in the format
c(`@type` = "NewsArticle")
for a script of type "NewsArticle".
- accessors
Defaults to NULL. If given, a vector of accessors passed to
purrr::pluck
in order to extract sub-components of the list obtained by reading, with jsonlite,
the result of the previous steps and filter.
- remove_from_script
Defaults to NULL. If given, it is removed from the script after the script has been extracted but before the json is processed.
Examples
if (FALSE) { # \dontrun{
if (interactive()) {
url <- "https://www.digi24.ro/stiri/externe/casa-alba-pune-capat-isteriei-globale-nu-exista-indicii-ca-obiectele-zburatoare-doborate-de-rachetele-sua-ar-fi-extraterestre-2250863"
html_document <- rvest::read_html(x = url)
cas_extract_script(
html_document = html_document,
script_type = "application/ld+json"
)
# get date published
cas_extract_script(
html_document = html_document,
script_type = "application/ld+json",
match = c(`@type` = "NewsArticle"),
accessors = "datePublished"
)
# get title
cas_extract_script(
html_document = html_document,
script_type = "application/ld+json",
match = c(`@type` = "NewsArticle"),
accessors = "headline"
)
# get nested element, e.g. url of the logo of the publisher
cas_extract_script(
html_document = html_document,
script_type = "application/ld+json",
match = c(`@type` = "NewsArticle"),
accessors = c("publisher", "logo", "url")
)
}
} # }