From 0523cd6950efbdecd3ad9808466f0442301f934e Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Sat, 30 Sep 2023 16:28:21 +0100
Subject: [PATCH 1/3] Add cache to cr_works

---
 R/cr_works.R                   | 109 ++++++++++++++++++++++-----------
 tests/testthat/test-cr_works.R |  27 +++++++-
 2 files changed, 100 insertions(+), 36 deletions(-)

diff --git a/R/cr_works.R b/R/cr_works.R
index 5caed27..9a2770c 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -18,6 +18,8 @@
 #' @param parse (logical) Whether to output json `FALSE` or parse to
 #' list `TRUE`. Default: `FALSE`
 #' @param async (logical) use async HTTP requests. Default: `FALSE`
+#' @param cache (logical) Should requests be cached and/or retrieved from
+#' the cache? Note that the cache only persists while the package is loaded.
 #'
 #' @section Beware:
 #' The API will only work for CrossRef DOIs.
@@ -100,7 +102,7 @@
 #' cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100,
 #'   facet = TRUE)
 #' ## with optional progress bar
-#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200, 
+#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
 #'   .progress = TRUE)
 #'
 #' # Low level function - does no parsing to data.frame, get json or a list
@@ -145,53 +147,90 @@
 #' system.time(lapply(queries, function(z) cr_works(query = z)))
 #' }
 
-cr_works <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
-  limit = NULL, sample = NULL, sort = NULL, order = NULL, facet=FALSE,
-  cursor = NULL, cursor_max = 5000, .progress="none", flq = NULL,
-  select = NULL, async = FALSE, ...) {
-
+cr_works <- function(
+    dois = NULL, query = NULL, filter = NULL, offset = NULL,
+    limit = NULL, sample = NULL, sort = NULL, order = NULL, facet = FALSE,
+    cursor = NULL, cursor_max = 5000, .progress = "none", flq = NULL,
+    select = NULL, async = FALSE, cache = FALSE, ...) {
   if (cursor_max != as.integer(cursor_max)) {
     stop("cursor_max must be an integer", call. = FALSE)
   }
-  args <- prep_args(query, filter, offset, limit, sample, sort, order,
-    facet, cursor, flq, select)
+
+  args <- prep_args(
+    query, filter, offset, limit, sample, sort, order,
+    facet, cursor, flq, select
+  )
   stopifnot(is.logical(async))
-  if (async) {
-    return(cr_async("works", c(dois, args), ...))
-  }
-  if (length(dois) > 1) {
-    res <- llply(dois, cr_get_cursor, args = args, cursor = cursor,
-      cursor_max = cursor_max, .progress = .progress, ...)
-    res <- lapply(res, "[[", "message")
-    res <- lapply(res, parse_works)
-    df <- tibble::as_tibble(bind_rows(res))
-    #exclude rows with empty DOI value until CrossRef API supports
-    #input validation
-    if (nrow(df[df$doi == "", ]) > 0) {
-      warning("only data with valid CrossRef DOIs returned", call. = FALSE)
+  works_do <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
+    limit = NULL, sample = NULL, sort = NULL, order = NULL, facet = FALSE,
+    cursor = NULL, cursor_max = 5000, .progress = "none", flq = NULL,
+    select = NULL, async = FALSE, cache = FALSE, ...) {
+    if (async) {
+      return(cr_async("works", c(dois, args), ...))
     }
-    df <- df[!df$doi == "", ]
-    list(meta = NULL, data = df, facets = NULL)
-  } else {
-    tmp <- cr_get_cursor(dois, args = args, cursor = cursor,
-      cursor_max = cursor_max, .progress, ...)
-    if (is.null(dois)) {
-      if (!is.null(cursor) || is.null(tmp$message)) {
-        tmp
-      } else {
-        meta <- parse_meta(tmp)
-        list(meta = meta,
-          data = tibble::as_tibble(bind_rows(lapply(tmp$message$items, parse_works))),
-          facets = parse_facets(tmp$message$facets))
+
+    if (length(dois) > 1) {
+      res <- llply(dois, cr_get_cursor,
+        args = args, cursor = cursor,
+        cursor_max = cursor_max, .progress = .progress, ...
+      )
+      res <- lapply(res, "[[", "message")
+      res <- lapply(res, parse_works)
+      df <- tibble::as_tibble(bind_rows(res))
+      # exclude rows with empty DOI value until CrossRef API supports
+      # input validation
+      if (nrow(df[df$doi == "", ]) > 0) {
+        warning("only data with valid CrossRef DOIs returned", call. = FALSE)
       }
+      df <- df[!df$doi == "", ]
+      list(meta = NULL, data = df, facets = NULL)
+    } else {
+      tmp <- cr_get_cursor(dois,
+        args = args, cursor = cursor,
+        cursor_max = cursor_max, .progress, ...
+      )
+      if (is.null(dois)) {
+        if (!is.null(cursor) || is.null(tmp$message)) {
+          tmp
+        } else {
+          meta <- parse_meta(tmp)
+          list(
+            meta = meta,
+            data = tibble::as_tibble(bind_rows(lapply(tmp$message$items, parse_works))),
+            facets = parse_facets(tmp$message$facets)
+          )
+        }
+      } else {
+        list(meta = NULL, data = tibble::as_tibble(parse_works(tmp$message)), facets = NULL)
+      }
     }
   }
+
+  if (cache == TRUE) {
+    req <- paste("cr_works", paste(dois, collapse = "__"), query, filter, offset, limit, sample,
+      sort, order, facet, cursor, cursor_max, .progress, flq, select,
+      paste(unlist(list(...)), collapse = "__"),
+      sep = "__"
+    )
+    rlang::env_cache(
+      env = cr_cache_env, nm = req,
+      default = works_do(dois, query, filter, offset, limit, sample,
+        sort, order, facet, cursor, cursor_max, .progress, flq,
+        select, async, ...)
+    )
+  } else {
+    works_do(
+      dois, query, filter, offset, limit, sample,
+      sort, order, facet, cursor, cursor_max, .progress, flq, select, async,
+      ...
+    )
+  }
 }
+
+
 #' @export
 #' @rdname cr_works
 cr_works_ <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
diff --git a/tests/testthat/test-cr_works.R b/tests/testthat/test-cr_works.R
index 68d8ace..1a9bae7 100644
--- a/tests/testthat/test-cr_works.R
+++ b/tests/testthat/test-cr_works.R
@@ -230,7 +230,7 @@ test_that("cr_works fails well: arguments that dont require http requests", {
 
   expect_error(cr_works(offset = 'foo'), "offset value illegal")
   expect_error(cr_works(sample = 'foo'), "sample value illegal")
-  
+
   expect_error(cr_works(async = 5), "is not TRUE")
 })
 
@@ -241,3 +241,28 @@ test_that("content domain parsing fix", {
     expect_is(cr_works(dois = "10.7287/peerj.3819v0.1/reviews/2"), "list")
   })
 })
+
+test_that("cr_works cache works", {
+  #vcr::use_cassette("cr_works_cache", { # Error with HTTP request here, but apart from that, test works
+  ## With query
+  # reset cache
+  rm(list = ls(cr_cache_env), envir = cr_cache_env)
+  t1 <- system.time(b1 <- cr_works(query="NSF", cache = TRUE))
+  t2 <- system.time(b2 <- cr_works(query="NSF", cache = TRUE))
+  # compare timing to ensure that caching actually happened
+  expect_gt(t1[3], t2[3])
+  expect_identical(b1, b2)
+  expect_is(b1, "list")
+
+  ## With multiple DOIs
+  # reset cache
+  rm(list = ls(cr_cache_env), envir = cr_cache_env)
+  t1 <- system.time(b1 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
+  t2 <- system.time(b2 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
+  # compare timing to ensure that caching actually happened
+  expect_gt(t1[3], t2[3])
+  expect_identical(b1, b2)
+  expect_is(b1, "list")
+  expect_match(b1$data$`container.title`[1], "Journal of Personality and Social Psychology")
+  # })
+})
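A note on the caching pattern patch 1 introduces: rlang::env_cache() looks up nm in env and only evaluates default (here, the actual Crossref request) when that name is not yet bound, storing the result for the rest of the session. Below is a minimal, self-contained sketch of that behaviour, assuming cr_cache_env is an environment created once at package load (e.g. in R/zzz.R, which this series does not show); slow_lookup() is a hypothetical stand-in for the HTTP round trip.

library(rlang)

# Stand-in for the package-level cache environment (`cr_cache_env`).
cr_cache_env <- new.env(parent = emptyenv())

# Hypothetical stand-in for the Crossref HTTP round trip.
slow_lookup <- function(key) {
  Sys.sleep(1)
  paste("result for", key)
}

cached_lookup <- function(key) {
  # env_cache() returns the value bound to `nm` if it exists in `env`;
  # otherwise it evaluates `default`, binds it to `nm`, and returns it.
  rlang::env_cache(
    env = cr_cache_env,
    nm = paste("lookup", key, sep = "__"),
    default = slow_lookup(key)
  )
}

system.time(cached_lookup("NSF"))  # ~1s elapsed: computed and stored
system.time(cached_lookup("NSF"))  # ~0s elapsed: served from the cache

This is also why the new tests above compare elapsed times: the second call never touches the network.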
From 36dff5cc6de898056f27cf1f7e3aa42092fb7768 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Wed, 4 Oct 2023 12:07:07 +0100
Subject: [PATCH 2/3] Implement cache for cr_abstract, fix issue with large
 cache requests for cr_works

---
 NEWS.md         |  7 ++++++
 R/cr_abstract.R | 67 ++++++++++++++++++++++++++++++-------------------
 R/cr_works.R    | 10 ++++++++
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c7a4643..dd49478 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+rcrossref 1.2.1
+===============
+
+### MINOR IMPROVEMENTS
+* Requests can now be cached (for a single session) with `cr_works` and `cr_abstract` by setting `cache = TRUE`
+
+
 rcrossref 1.2.0
 ===============
 
diff --git a/R/cr_abstract.R b/R/cr_abstract.R
index 2c7ee15..2d07d03 100644
--- a/R/cr_abstract.R
+++ b/R/cr_abstract.R
@@ -1,44 +1,59 @@
 #' Get abstract
-#' 
+#'
 #' @export
 #' @param doi (character) a DOI, required.
+#' @param cache (logical) Should requests be cached and/or retrieved from
+#' the cache? Note that the cache only persists while the package is loaded.
 #' @param ... Named parameters passed on to \code{\link[crul]{HttpClient}}
 #' @examples \dontrun{
 #' # abstract found
 #' cr_abstract('10.1109/TASC.2010.2088091')
 #' cr_abstract("10.1175//2572.1")
 #' cr_abstract("10.1182/blood.v16.1.1039.1039")
-#' 
+#'
 #' # doi not found
 #' # cr_abstract(doi = '10.5284/1011335')
-#' 
+#'
 #' # abstract not found, throws error
 #' # cr_abstract(doi = '10.1126/science.169.3946.635')
-#' 
+#'
 #' # a random DOI
 #' # cr_abstract(cr_r(1))
 #' }
-cr_abstract <- function(doi, ...) {
-  url <- paste0('https://api.crossref.org/works/', doi, '/transform/application/vnd.crossref.unixsd+xml')
-  cli <- crul::HttpClient$new(
-    url = url,
-    opts = list(followlocation = 1),
-    headers = list(
-      `User-Agent` = rcrossref_ua(),
-      `X-USER-AGENT` = rcrossref_ua()
+cr_abstract <- function(doi, cache = FALSE, ...) {
+  if (cache == TRUE) {
+    req <- paste("cr_abstract", doi,
+      paste(unlist(list(...)), collapse = "__"),
+      sep = "__"
     )
-  res <- cli$get(...)
-  res$raise_for_status()
-  txt <- res$parse("UTF-8")
-  xml <- tryCatch(read_xml(txt), error = function(e) e)
-  if (inherits(xml, "error")) {
-    stop(doi, " not found ", call. = FALSE)
-  }
-  tt <- tryCatch(xml_find_first(xml, "//jats:abstract"),
-    warning = function(w) w)
-  if (inherits(tt, "warning")) {
-    stop("no abstract found for ", doi, call. = FALSE)
+
+    rlang::env_cache(
+      env = cr_cache_env, nm = req,
+      default = cr_abstract(doi, cache = FALSE, ...)
+    )
+  } else {
+    url <- paste0("https://api.crossref.org/works/", doi, "/transform/application/vnd.crossref.unixsd+xml")
+    cli <- crul::HttpClient$new(
+      url = url,
+      opts = list(followlocation = 1),
+      headers = list(
+        `User-Agent` = rcrossref_ua(),
+        `X-USER-AGENT` = rcrossref_ua()
+      )
+    )
+    res <- cli$get(...)
+    res$raise_for_status()
+    txt <- res$parse("UTF-8")
+    xml <- tryCatch(read_xml(txt), error = function(e) e)
+    if (inherits(xml, "error")) {
+      stop(doi, " not found ", call. = FALSE)
+    }
+    tt <- tryCatch(xml_find_first(xml, "//jats:abstract"),
+      warning = function(w) w
+    )
+    if (inherits(tt, "warning")) {
+      stop("no abstract found for ", doi, call. = FALSE)
+    }
+    xml_text(tt)
   }
-  xml_text(tt)
-}
+}
\ No newline at end of file
diff --git a/R/cr_works.R b/R/cr_works.R
index 9a2770c..aeec680 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -214,6 +214,16 @@ cr_works <- function(
       paste(unlist(list(...)), collapse = "__"),
       sep = "__"
     )
+
+    if (object.size(req) > 10000) {
+      warning("Request call is too large to be cached (likely because many DOIs at once are requested). Request will be executed with cache = FALSE. Split into multiple calls to cache.")
+      return(works_do(
+        dois, query, filter, offset, limit, sample,
+        sort, order, facet, cursor, cursor_max, .progress, flq, select, async,
+        ...
+      ))
+    }
+
     rlang::env_cache(
       env = cr_cache_env, nm = req,
       default = works_do(dois, query, filter, offset, limit, sample,
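Why patch 2 guards the size of the cache key: rlang::env_cache() binds the result to the name held in req inside cr_cache_env, and R limits variable names to 10,000 bytes, so a key concatenated from hundreds of DOIs cannot be bound at all. The utils::object.size(req) > 10000 test is a conservative proxy for that limit (it measures the whole string object, header included; nchar(req, type = "bytes") would measure the name length directly). A small sketch of the failure mode the guard avoids, using a hypothetical oversized key:

# A cache key concatenated from many DOIs quickly exceeds R's limit on
# the byte length of variable names.
long_key <- paste(rep("10.1037/0022-3514.45.2.357", 500), collapse = "__")
nchar(long_key, type = "bytes")  # roughly 14,000 bytes, above the 10,000 limit

cache <- new.env(parent = emptyenv())
# Binding a value under such a name fails, so cr_works() now warns and
# falls back to an uncached request instead:
try(assign(long_key, "value", envir = cache))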
From cc2226d5316a30cd6a5754059f300723fd28f3d0 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Wed, 4 Oct 2023 14:38:38 +0100
Subject: [PATCH 3/3] Document and fix missing utils::

---
 R/cr_works.R       | 2 +-
 man/cr_abstract.Rd | 5 ++++-
 man/cr_journals.Rd | 2 +-
 man/cr_works.Rd    | 6 +++++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/R/cr_works.R b/R/cr_works.R
index aeec680..b1b03b1 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -215,7 +215,7 @@ cr_works <- function(
       sep = "__"
     )
 
-    if (object.size(req) > 10000) {
+    if (utils::object.size(req) > 10000) {
       warning("Request call is too large to be cached (likely because many DOIs at once are requested). Request will be executed with cache = FALSE. Split into multiple calls to cache.")
       return(works_do(
diff --git a/man/cr_abstract.Rd b/man/cr_abstract.Rd
index 3f7818c..a81f35f 100644
--- a/man/cr_abstract.Rd
+++ b/man/cr_abstract.Rd
@@ -4,11 +4,14 @@
 \alias{cr_abstract}
 \title{Get abstract}
 \usage{
-cr_abstract(doi, ...)
+cr_abstract(doi, cache = FALSE, ...)
 }
 \arguments{
 \item{doi}{(character) a DOI, required.}
 
+\item{cache}{(logical) Should requests be cached and/or retrieved from
+the cache? Note that the cache only persists while the package is loaded.}
+
 \item{...}{Named parameters passed on to \code{\link[crul]{HttpClient}}}
 }
 \description{
diff --git a/man/cr_journals.Rd b/man/cr_journals.Rd
index 180ea1d..84ac681 100644
--- a/man/cr_journals.Rd
+++ b/man/cr_journals.Rd
@@ -226,7 +226,7 @@ cr_journals_("2167-8359", works = TRUE, cursor = "*",
 cr_journals("2167-8359", works = TRUE, flq = c(`query.author` = 'Jane'))
 
 # select only certain fields to return
-res <- cr_journals('2167-8359', works = TRUE, 
+res <- cr_journals('2167-8359', works = TRUE,
   select = c('DOI', 'title'))
 names(res$data)
 }
diff --git a/man/cr_works.Rd b/man/cr_works.Rd
index e530773..9334026 100644
--- a/man/cr_works.Rd
+++ b/man/cr_works.Rd
@@ -21,6 +21,7 @@ cr_works(
   flq = NULL,
   select = NULL,
   async = FALSE,
+  cache = FALSE,
   ...
 )
 
@@ -137,6 +138,9 @@ are returned)}
 
 \item{async}{(logical) use async HTTP requests. Default: \code{FALSE}}
 
+\item{cache}{(logical) Should requests be cached and/or retrieved from
+the cache? Note that the cache only persists while the package is loaded.}
+
 \item{...}{Named parameters passed on to \code{\link[crul]{verb-GET}}}
 
 \item{parse}{(logical) Whether to output json \code{FALSE} or parse to
@@ -246,7 +250,7 @@ cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100)
 cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100,
   facet = TRUE)
 ## with optional progress bar
-x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200, 
+x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
   .progress = TRUE)
 
 # Low level function - does no parsing to data.frame, get json or a list
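Taken together, the series enables the following session-level workflow. This is a usage sketch assuming all three patches are applied; the timing behaviour mirrors what the new tests assert.

library(rcrossref)

# First call hits the Crossref API and stores the parsed result.
system.time(a1 <- cr_works(query = "NSF", cache = TRUE))
# The repeat call is answered from cr_cache_env, so elapsed time drops.
system.time(a2 <- cr_works(query = "NSF", cache = TRUE))
identical(a1, a2)  # TRUE, as the new expect_identical() tests require

# cr_abstract() gains the same parameter in patch 2.
cr_abstract("10.1109/TASC.2010.2088091", cache = TRUE)

# The cache only lives for the current session: unloading the package or
# restarting R starts over with an empty cr_cache_env.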