Skip to contents

This may or may not work, but it may be worth giving it a quick try before looking for alternatives. The parameters returned first should work best.

Usage

cas_find_extractor(
  html_document,
  pattern,
  containers = c("h1", "h2", "h3", "h4", "span", "td", "p", "div"),
  exclude_css_path = NULL
)

Arguments

html_document

An html document parsed with xml2::read_html() or rvest::read_html().

pattern

A text string to be matched.

containers

Containers to be parsed for best matches. By default: c("h1", "h2", "h3", "h4", "span", "td", "p", "div"). The order matters, as results are returned in this order (e.g. if a match of the same length is found both in a "h1" and in a "div", "h1" is returned first).

exclude_css_path

Defaults to NULL. To exclude script elements, for example, pass "script", which is transformed to :not(script). May cause issues; use with caution.

Value

A data frame listing the container and the class or id values that should work when passed to cas_extract_html().

Examples

if (FALSE) {
# `interactive` is a function: it must be called, otherwise `if ()` receives a
# closure and fails with "argument is not interpretable as logical".
if (interactive()) {
  # Not an ideal example, but you'll get the gist; see the additional example
  # below.
  library("castarter")

  ## Example 1: look up candidate extractors for several strings in one page

  url <- "https://www.nasa.gov/news-release/nasa-sets-coverage-for-roscosmos-spacewalk-outside-space-station/"

  # Download and parse the page once; every call below reuses the parsed
  # document.
  html_page <- rvest::read_html(url)

  # A long string (it matches the URL slug, so presumably the page headline)
  cas_find_extractor(
    html_document = html_page,
    pattern = "NASA Sets Coverage for Roscosmos Spacewalk Outside Space Station"
  )

  # A date string
  cas_find_extractor(
    html_document = html_page,
    pattern = "Oct 23, 2023"
  )

  # A person's name
  cas_find_extractor(
    html_document = html_page,
    pattern = "Roxana Bardan"
  )

  # A short, generic string
  cas_find_extractor(
    html_document = html_page,
    pattern = "RELEASE"
  )

  ## Example 2: use this information to extract contents
  ##
  ## Each `cas_find_extractor()` call is followed by `cas_extract_html()`
  ## calls using a container/class combination (presumably taken from the
  ## candidates returned by the preceding call).

  url <- "https://www.state.gov/designating-russian-virtual-currency-money-launderer/"

  html_page <- rvest::read_html(url)

  cas_find_extractor(
    html_document = html_page,
    pattern = "Designating Russian Virtual Currency Money Launderer"
  )

  cas_extract_html(
    html_document = html_page,
    container = "span",
    container_class = "bc_current collapse"
  )

  cas_extract_html(
    html_document = html_page,
    container = "h1",
    container_class = "featured-content__headline stars-above"
  )


  cas_find_extractor(
    html_document = html_page,
    pattern = "Press Statement"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta doctype-meta"
  )


  cas_find_extractor(
    html_document = html_page,
    pattern = "Matthew Miller, Department Spokesperson"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta__author-bureau"
  )

  cas_find_extractor(
    html_document = html_page,
    pattern = "November 3, 2023"
  )

  cas_extract_html(
    html_document = html_page,
    container = "p",
    container_class = "article-meta__publish-date"
  )

  # `exclude_css_path = "script"` is transformed to `:not(script)`, so script
  # elements are left out of the search and of the extraction.
  cas_find_extractor(
    html_document = html_page,
    pattern = "The United States is sanctioning Ekaterina Zhdanova",
    exclude_css_path = "script"
  )


  cas_extract_html(
    html_document = html_page,
    container = "div",
    container_class = "entry-content",
    exclude_css_path = "script"
  )
}
}