How to process datasets efficiently while limiting memory use
Source: vignettes/memory-efficient-reading-of-datasets.Rmd
Get the dataset
First, let’s take a dataset to use as an example.
library("piggyback")
dataset_path <- fs::path(fs::path_temp(), "castarter_dataset")
fs::dir_create(dataset_path)
piggyback::pb_download(
file = "kremlin.ru_en.csv.gz",
dest = dataset_path,
repo = "giocomai/tadadit",
tag = "kremlin.ru_en",
overwrite = FALSE
)
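The downloaded file is not read explicitly in this vignette, but the steps below rely on an in-memory data frame named original_df. A minimal sketch of how it could be created, assuming the compressed CSV has a date column (the year column needed for partitioning below is derived from it if missing):
# Read the compressed CSV into memory; readr reads .gz files transparently
original_df <- readr::read_csv(
  file = fs::path(dataset_path, "kremlin.ru_en.csv.gz"),
  show_col_types = FALSE
)
# The partitioned write below needs a `year` column; derive it from `date`
# if the dataset does not already include one
original_df <- original_df %>%
  dplyr::mutate(year = lubridate::year(date))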
Create dataset variations
As a single parquet file:
dataset_parquet_path <- fs::path(fs::path_temp(), "castarter_dataset_parquet")
fs::dir_create(path = dataset_parquet_path)
original_df %>%
arrow::write_dataset(path = dataset_parquet_path)
As a parquet dataset partitioned by year:
dataset_parquet_year_path <- fs::path(fs::path_temp(), "castarter_dataset_year_parquet")
fs::dir_create(path = dataset_parquet_year_path)
original_df %>%
dplyr::group_by(year) %>%
arrow::write_dataset(path = dataset_parquet_year_path)
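Since the data frame is grouped by year, arrow::write_dataset() writes one Hive-style year=... subfolder per year; the layout can be checked quickly (the exact folder names depend on the years present in the dataset):
# List the partition folders created by the grouped write above
fs::dir_ls(dataset_parquet_year_path)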
Benchmark with str_detect
ds_py <- arrow::open_dataset(sources = dataset_parquet_year_path)
count <- function(input) {
input %>%
dplyr::filter(stringr::str_detect(
string = text,
pattern = stringr::regex(
pattern = "Ukrain",
ignore_case = TRUE
)
)) %>%
dplyr::mutate(
n = stringr::str_count(
string = text,
pattern = "Ukrain"
)
) %>%
dplyr::group_by(date) %>%
dplyr::summarise(
n = sum(n, na.rm = TRUE),
.groups = "drop"
) %>%
dplyr::arrange(dplyr::desc(date)) %>%
dplyr::collect()
}
results <- bench::mark(
dataset_in_memory = original_df %>% count(),
parquet_single = arrow::open_dataset(sources = dataset_parquet_path) %>% count(),
parquet_by_year = arrow::open_dataset(sources = dataset_parquet_year_path) %>% count(),
min_iterations = 10
)
results
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dataset_in_memory 440ms 447ms 2.23 2.42MB 0
#> 2 parquet_single 393ms 479ms 2.06 10.13MB 0.515
#> 3 parquet_by_year 150ms 156ms 6.19 250.09KB 2.65
summary(object = results, relative = TRUE)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 dataset_in_memory 2.92 2.87 1.08 9.93 NaN
#> 2 parquet_single 2.61 3.08 1 41.5 Inf
#> 3 parquet_by_year 1 1 3.00 1 Inf
plot(results) +
ggplot2::scale_y_continuous(limits = c(0, NA)) +
ggplot2::labs(title = "str_detect, str_count, group_by, sum")
#> Loading required namespace: tidyr
ggplot2::ggsave("arrow.png")
#> Saving 7.29 x 4.51 in image
Conclusions
Processing the data from parquet files is more efficient than processing the whole dataset from memory (even without considering the much smaller memory footprint). Processing the dataset from parquet files partitioned by year is much faster still.
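Partitioning should pay off even more when a query filters on the partition column, since arrow can then skip whole files. A minimal sketch reusing the count() helper defined above (the cut-off year is arbitrary):
# Only the files under the matching `year=...` partitions are scanned;
# the rest of the dataset is never read from disk
arrow::open_dataset(sources = dataset_parquet_year_path) %>%
  dplyr::filter(year >= 2022) %>%
  count()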
Relying on castarter's own internal functions
library("castarter")
cas_set_options(
base_folder = fs::path(
fs::path_home_r(),
"R",
"castarter_tadadit"
),
project = "Russian institutions",
website = "kremlin.ru_ru"
)
count <- function(corpus = cas_read_dataset(),
pattern = "Ukrain") {
corpus %>%
dplyr::filter(stringr::str_detect(
string = text,
pattern = stringr::regex(pattern = pattern, ignore_case = TRUE)
)) %>%
dplyr::mutate(n = stringr::str_count(
string = text,
pattern = pattern
))
# dplyr::summarise({{ n_column_name }} := sum({{ n_column_name }}, na.rm = TRUE),
# .by = {{ group_by }}
# )
}
corpus <- cas_read_corpus()
cas_count(corpus = corpus, pattern = "ukrain")
#> # A tibble: 7,867 × 3
#> date pattern n
#> <chr> <chr> <int>
#> 1 2008-09-26 ukrain 0
#> 2 2008-09-25 ukrain 0
#> 3 2008-09-24 ukrain 0
#> 4 2008-10-01 ukrain 0
#> 5 2008-09-30 ukrain 0
#> 6 2008-09-29 ukrain 0
#> 7 2008-09-28 ukrain 0
#> 8 2008-10-03 ukrain 0
#> 9 2008-10-02 ukrain 0
#> 10 2008-10-07 ukrain 0
#> # ℹ 7,857 more rows
test_count <- function(text_column, n_column_name, pattern = "Ukrain") {
cas_read_corpus() %>%
dplyr::mutate({{ n_column_name }} := stringr::str_count(
string = {{ text_column }},
pattern = !!pattern
))
}
test_count(text_column = text, n_column_name = n) %>%
dplyr::collect()
#> # A tibble: 51,797 × 14
#> doc_id text id url title date time datetime location description
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 kremlin.ru… Дмит… 1 http… Указ… 2008… 11:00 2008-06… NA ""
#> 2 kremlin.ru… През… 2 http… Указ… 2008… 10:50 2008-06… NA ""
#> 3 kremlin.ru… През… 3 http… Указ… 2008… 19:00 2008-06… NA ""
#> 4 kremlin.ru… Ю.Ба… 4 http… Указ… 2008… 18:45 2008-06… NA ""
#> 5 kremlin.ru… През… 5 http… Указ… 2008… 15:30 2008-05… NA ""
#> 6 kremlin.ru… Возг… 6 http… Утве… 2008… 16:15 2008-05… NA "Дмитрий М…
#> 7 kremlin.ru… Госу… 7 http… Объя… 2008… 12:00 2008-05… Москва,… "По традиц…
#> 8 kremlin.ru… В св… 8 http… Сове… 2008… 13:30 2008-05… Москва,… "Борьбу с …
#> 9 kremlin.ru… В Ро… 9 http… Наци… 2008… 11:00 2008-07… NA ""
#> 10 kremlin.ru… Указ… 10 http… Указ… 2008… 17:30 2008-07… NA ""
#> # ℹ 51,787 more rows
#> # ℹ 4 more variables: keywords <chr>, tags <chr>, tags_links <chr>, n <int>
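The aggregation left commented out in the count() draft above could be completed on top of test_count() along these lines; this is only a sketch, not a finished castarter interface:
test_count(text_column = text, n_column_name = n) %>%
  dplyr::group_by(date) %>%
  dplyr::summarise(n = sum(n, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(date)) %>%
  dplyr::collect()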
Storing tokenised dataset as parquet
library("castarter")
cas_set_options(
base_folder = fs::path(
fs::path_home_r(),
"R",
"castarter_tadadit"
),
project = "Russian institutions",
website = "kremlin.ru_ru"
)
cas_write_corpus(token = "sentences", partition = "year")
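The sentence-level corpus written above is again a year-partitioned parquet dataset, so it can be queried lazily just like the datasets used earlier. The path below is a placeholder: where cas_write_corpus() stores the tokenised corpus depends on the options set above.
# `sentences_path` is hypothetical: point it at the folder where
# cas_write_corpus() stored the sentence-level, year-partitioned corpus
sentences_path <- "path/to/corpus/sentences"
# Count sentences mentioning the pattern, per year, without loading the
# full corpus into memory (assumes the corpus keeps a `text` column)
arrow::open_dataset(sources = sentences_path) %>%
  dplyr::filter(stringr::str_detect(
    string = text,
    pattern = stringr::regex("Ukrain", ignore_case = TRUE)
  )) %>%
  dplyr::group_by(year) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::collect()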