From 0523cd6950efbdecd3ad9808466f0442301f934e Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Sat, 30 Sep 2023 16:28:21 +0100
Subject: [PATCH 1/3] Add cache to cr_works

---
 R/cr_works.R                   | 109 ++++++++++++++++++++++-----------
 tests/testthat/test-cr_works.R |  27 +++++++-
 2 files changed, 100 insertions(+), 36 deletions(-)

diff --git a/R/cr_works.R b/R/cr_works.R
index 5caed27..9a2770c 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -18,6 +18,8 @@
 #' @param parse (logical) Whether to output json `FALSE` or parse to
 #' list `TRUE`. Default: `FALSE`
 #' @param async (logical) use async HTTP requests. Default: `FALSE`
+#' @param cache (logical) Should requests be cached and/or retrieved from
+#' the cache? Note that the cache only persists while the package is loaded.
 #'
 #' @section Beware:
 #' The API will only work for CrossRef DOIs.
@@ -100,7 +102,7 @@
 #' cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100,
 #'   facet = TRUE)
 #' ## with optional progress bar
-#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200, 
+#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
 #'   .progress = TRUE)
 #'
 #' # Low level function - does no parsing to data.frame, get json or a list
@@ -145,53 +147,90 @@
 #' system.time(lapply(queries, function(z) cr_works(query = z)))
 #' }
 
-cr_works <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
-  limit = NULL, sample = NULL, sort = NULL, order = NULL, facet=FALSE,
-  cursor = NULL, cursor_max = 5000, .progress="none", flq = NULL,
-  select = NULL, async = FALSE, ...) {
-
+cr_works <- function(
+    dois = NULL, query = NULL, filter = NULL, offset = NULL,
+    limit = NULL, sample = NULL, sort = NULL, order = NULL, facet = FALSE,
+    cursor = NULL, cursor_max = 5000, .progress = "none", flq = NULL,
+    select = NULL, async = FALSE, cache = FALSE, ...) {
   if (cursor_max != as.integer(cursor_max)) {
     stop("cursor_max must be an integer", call. = FALSE)
   }
-  args <- prep_args(query, filter, offset, limit, sample, sort, order,
-    facet, cursor, flq, select)
+
+  args <- prep_args(
+    query, filter, offset, limit, sample, sort, order,
+    facet, cursor, flq, select
+  )
   stopifnot(is.logical(async))
-  if (async) {
-    return(cr_async("works", c(dois, args), ...))
-  }
-  if (length(dois) > 1) {
-    res <- llply(dois, cr_get_cursor, args = args, cursor = cursor,
-      cursor_max = cursor_max, .progress = .progress, ...)
-    res <- lapply(res, "[[", "message")
-    res <- lapply(res, parse_works)
-    df <- tibble::as_tibble(bind_rows(res))
-    #exclude rows with empty DOI value until CrossRef API supports
-    #input validation
-    if (nrow(df[df$doi == "", ]) > 0) {
-      warning("only data with valid CrossRef DOIs returned", call. = FALSE)
+  works_do <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
+    limit = NULL, sample = NULL, sort = NULL, order = NULL, facet = FALSE,
+    cursor = NULL, cursor_max = 5000, .progress = "none", flq = NULL,
+    select = NULL, async = FALSE, cache = FALSE, ...) {
+    if (async) {
+      return(cr_async("works", c(dois, args), ...))
     }
-    df <- df[!df$doi == "", ]
-    list(meta = NULL, data = df, facets = NULL)
-  } else {
-    tmp <- cr_get_cursor(dois, args = args, cursor = cursor,
-      cursor_max = cursor_max, .progress, ...)
-    if (is.null(dois)) {
-      if (!is.null(cursor) || is.null(tmp$message)) {
-        tmp
-      } else {
-        meta <- parse_meta(tmp)
-        list(meta = meta,
-          data = tibble::as_tibble(bind_rows(lapply(tmp$message$items, parse_works))),
-          facets = parse_facets(tmp$message$facets))
+
+    if (length(dois) > 1) {
+      res <- llply(dois, cr_get_cursor,
+        args = args, cursor = cursor,
+        cursor_max = cursor_max, .progress = .progress, ...
+      )
+      res <- lapply(res, "[[", "message")
+      res <- lapply(res, parse_works)
+      df <- tibble::as_tibble(bind_rows(res))
+      # exclude rows with empty DOI value until CrossRef API supports
+      # input validation
+      if (nrow(df[df$doi == "", ]) > 0) {
+        warning("only data with valid CrossRef DOIs returned", call. = FALSE)
       }
+      df <- df[!df$doi == "", ]
+      list(meta = NULL, data = df, facets = NULL)
+    } else {
+      tmp <- cr_get_cursor(dois,
+        args = args, cursor = cursor,
+        cursor_max = cursor_max, .progress, ...
+      )
+      if (is.null(dois)) {
+        if (!is.null(cursor) || is.null(tmp$message)) {
+          tmp
+        } else {
+          meta <- parse_meta(tmp)
+          list(
+            meta = meta,
+            data = tibble::as_tibble(bind_rows(lapply(tmp$message$items, parse_works))),
+            facets = parse_facets(tmp$message$facets)
+          )
+        }
+      } else {
+        list(meta = NULL, data = tibble::as_tibble(parse_works(tmp$message)), facets = NULL)
+      }
     }
   }
+
+  if (cache == TRUE) {
+    req <- paste("cr_works", paste(dois, collapse = "__"), query, filter, offset, limit, sample,
+      sort, order, facet, cursor, cursor_max, .progress, flq, select,
+      paste(unlist(list(...)), collapse = "__"),
+      sep = "__"
+    )
+    rlang::env_cache(
+      env = cr_cache_env, nm = req,
+      default = works_do(dois, query, filter, offset, limit, sample,
+        sort, order, facet, cursor, cursor_max, .progress, flq,
+        select, async, ...)
+    )
+  } else {
+    works_do(
+      dois, query, filter, offset, limit, sample,
+      sort, order, facet, cursor, cursor_max, .progress, flq, select, async,
+      ...
+    )
+  }
 }
+
+
 #' @export
 #' @rdname cr_works
 cr_works_ <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
diff --git a/tests/testthat/test-cr_works.R b/tests/testthat/test-cr_works.R
index 68d8ace..1a9bae7 100644
--- a/tests/testthat/test-cr_works.R
+++ b/tests/testthat/test-cr_works.R
@@ -230,7 +230,7 @@ test_that("cr_works fails well: arguments that dont require http requests", {
 
   expect_error(cr_works(offset = 'foo'), "offset value illegal")
   expect_error(cr_works(sample = 'foo'), "sample value illegal")
-  
+
   expect_error(cr_works(async = 5), "is not TRUE")
 })
 
@@ -241,3 +241,28 @@ test_that("content domain parsing fix", {
     expect_is(cr_works(dois = "10.7287/peerj.3819v0.1/reviews/2"), "list")
   })
 })
+
+test_that("cr_works cache works", {
+  #vcr::use_cassette("cr_works_cache", { # Error with HTTP request here, but apart from that, test works
+  ## With query
+  # reset cache
+  rm(list = ls(cr_cache_env), envir = cr_cache_env)
+  t1 <- system.time(b1 <- cr_works(query="NSF", cache = TRUE))
+  t2 <- system.time(b2 <- cr_works(query="NSF", cache = TRUE))
+  # compare timing to ensure that caching actually happened
+  expect_gt(t1[3], t2[3])
+  expect_identical(b1, b2)
+  expect_is(b1, "list")
+
+  ## With multiple DOIs
+  # reset cache
+  rm(list = ls(cr_cache_env), envir = cr_cache_env)
+  t1 <- system.time(b1 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
+  t2 <- system.time(b2 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
+  # compare timing to ensure that caching actually happened
+  expect_gt(t1[3], t2[3])
+  expect_identical(b1, b2)
+  expect_is(b1, "list")
+  expect_match(b1$data$`container.title`[1], "Journal of Personality and Social Psychology")
+  # })
+})
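A note on the caching pattern patch 1 introduces: rlang::env_cache() looks up nm in env and only evaluates default (here, the actual Crossref request) when that name is not yet bound, storing the result for the rest of the session. Below is a minimal, self-contained sketch of that behaviour, assuming cr_cache_env is an environment created once at package load (e.g. in R/zzz.R, which this series does not show); slow_lookup() is a hypothetical stand-in for the HTTP round trip.

library(rlang)

# Stand-in for the package-level cache environment (`cr_cache_env`).
cr_cache_env <- new.env(parent = emptyenv())

# Hypothetical stand-in for the Crossref HTTP round trip.
slow_lookup <- function(key) {
  Sys.sleep(1)
  paste("result for", key)
}

cached_lookup <- function(key) {
  # env_cache() returns the value bound to `nm` if it exists in `env`;
  # otherwise it evaluates `default`, binds it to `nm`, and returns it.
  rlang::env_cache(
    env = cr_cache_env,
    nm = paste("lookup", key, sep = "__"),
    default = slow_lookup(key)
  )
}

system.time(cached_lookup("NSF"))  # ~1s elapsed: computed and stored
system.time(cached_lookup("NSF"))  # ~0s elapsed: served from the cache

This is also why the new tests above compare elapsed times: the second call never touches the network.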
From 36dff5cc6de898056f27cf1f7e3aa42092fb7768 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Wed, 4 Oct 2023 12:07:07 +0100
Subject: [PATCH 2/3] Implement cache for cr_abstract, fix issue with large
 cache requests for cr_works

---
 NEWS.md         |  7 ++++++
 R/cr_abstract.R | 67 ++++++++++++++++++++++++++++++-------------------
 R/cr_works.R    | 10 ++++++++
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index c7a4643..dd49478 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+rcrossref 1.2.1
+===============
+
+### MINOR IMPROVEMENTS
+* Requests can now be cached (for a single session) with `cr_works` and `cr_abstract` by setting `cache = TRUE`
+
+
 rcrossref 1.2.0
 ===============
 
diff --git a/R/cr_abstract.R b/R/cr_abstract.R
index 2c7ee15..2d07d03 100644
--- a/R/cr_abstract.R
+++ b/R/cr_abstract.R
@@ -1,44 +1,59 @@
 #' Get abstract
-#' 
+#'
 #' @export
 #' @param doi (character) a DOI, required.
+#' @param cache (logical) Should requests be cached and/or retrieved from
+#' the cache? Note that the cache only persists while the package is loaded.
 #' @param ... Named parameters passed on to \code{\link[crul]{HttpClient}}
 #' @examples \dontrun{
 #' # abstract found
 #' cr_abstract('10.1109/TASC.2010.2088091')
 #' cr_abstract("10.1175//2572.1")
 #' cr_abstract("10.1182/blood.v16.1.1039.1039")
-#' 
+#'
 #' # doi not found
 #' # cr_abstract(doi = '10.5284/1011335')
-#' 
+#'
 #' # abstract not found, throws error
 #' # cr_abstract(doi = '10.1126/science.169.3946.635')
-#' 
+#'
 #' # a random DOI
 #' # cr_abstract(cr_r(1))
 #' }
-cr_abstract <- function(doi, ...) {
-  url <- paste0('https://api.crossref.org/works/', doi, '/transform/application/vnd.crossref.unixsd+xml')
-  cli <- crul::HttpClient$new(
-    url = url,
-    opts = list(followlocation = 1),
-    headers = list(
-      `User-Agent` = rcrossref_ua(),
-      `X-USER-AGENT` = rcrossref_ua()
+cr_abstract <- function(doi, cache = FALSE, ...) {
+  if (cache == TRUE) {
+    req <- paste("cr_abstract", doi,
+      paste(unlist(list(...)), collapse = "__"),
+      sep = "__"
     )
-  res <- cli$get(...)
-  res$raise_for_status()
-  txt <- res$parse("UTF-8")
-  xml <- tryCatch(read_xml(txt), error = function(e) e)
-  if (inherits(xml, "error")) {
-    stop(doi, " not found ", call. = FALSE)
-  }
-  tt <- tryCatch(xml_find_first(xml, "//jats:abstract"),
-    warning = function(w) w)
-  if (inherits(tt, "warning")) {
-    stop("no abstract found for ", doi, call. = FALSE)
+
+    rlang::env_cache(
+      env = cr_cache_env, nm = req,
+      default = cr_abstract(doi, cache = FALSE, ...)
+    )
+  } else {
+    url <- paste0("https://api.crossref.org/works/", doi, "/transform/application/vnd.crossref.unixsd+xml")
+    cli <- crul::HttpClient$new(
+      url = url,
+      opts = list(followlocation = 1),
+      headers = list(
+        `User-Agent` = rcrossref_ua(),
+        `X-USER-AGENT` = rcrossref_ua()
+      )
+    )
+    res <- cli$get(...)
+    res$raise_for_status()
+    txt <- res$parse("UTF-8")
+    xml <- tryCatch(read_xml(txt), error = function(e) e)
+    if (inherits(xml, "error")) {
+      stop(doi, " not found ", call. = FALSE)
+    }
+    tt <- tryCatch(xml_find_first(xml, "//jats:abstract"),
+      warning = function(w) w
+    )
+    if (inherits(tt, "warning")) {
+      stop("no abstract found for ", doi, call. = FALSE)
+    }
+    xml_text(tt)
   }
-  xml_text(tt)
-}
+}
\ No newline at end of file
diff --git a/R/cr_works.R b/R/cr_works.R
index 9a2770c..aeec680 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -214,6 +214,16 @@ cr_works <- function(
       paste(unlist(list(...)), collapse = "__"),
       sep = "__"
     )
+
+    if (object.size(req) > 10000) {
+      warning("Request call is too large to be cached (likely because many DOIs at once are requested). Request will be executed with cache = FALSE. Split into multiple calls to cache.")
+      return(works_do(
+        dois, query, filter, offset, limit, sample,
+        sort, order, facet, cursor, cursor_max, .progress, flq, select, async,
+        ...
+      ))
+    }
+
     rlang::env_cache(
       env = cr_cache_env, nm = req,
       default = works_do(dois, query, filter, offset, limit, sample,
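Why patch 2 guards the size of the cache key: rlang::env_cache() binds the result to the name held in req inside cr_cache_env, and R limits variable names to 10,000 bytes, so a key concatenated from hundreds of DOIs cannot be bound at all. The utils::object.size(req) > 10000 test is a conservative proxy for that limit (it measures the whole string object, header included; nchar(req, type = "bytes") would measure the name length directly). A small sketch of the failure mode the guard avoids, using a hypothetical oversized key:

# A cache key concatenated from many DOIs quickly exceeds R's limit on
# the byte length of variable names.
long_key <- paste(rep("10.1037/0022-3514.45.2.357", 500), collapse = "__")
nchar(long_key, type = "bytes")  # roughly 14,000 bytes, above the 10,000 limit

cache <- new.env(parent = emptyenv())
# Binding a value under such a name fails, so cr_works() now warns and
# falls back to an uncached request instead:
try(assign(long_key, "value", envir = cache))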
From cc2226d5316a30cd6a5754059f300723fd28f3d0 Mon Sep 17 00:00:00 2001
From: Lukas Wallrich
Date: Wed, 4 Oct 2023 14:38:38 +0100
Subject: [PATCH 3/3] Document and fix missing utils::

---
 R/cr_works.R       | 2 +-
 man/cr_abstract.Rd | 5 ++++-
 man/cr_journals.Rd | 2 +-
 man/cr_works.Rd    | 6 +++++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/R/cr_works.R b/R/cr_works.R
index aeec680..b1b03b1 100644
--- a/R/cr_works.R
+++ b/R/cr_works.R
@@ -215,7 +215,7 @@ cr_works <- function(
       sep = "__"
     )
 
-    if (object.size(req) > 10000) {
+    if (utils::object.size(req) > 10000) {
       warning("Request call is too large to be cached (likely because many DOIs at once are requested). Request will be executed with cache = FALSE. Split into multiple calls to cache.")
       return(works_do(
diff --git a/man/cr_abstract.Rd b/man/cr_abstract.Rd
index 3f7818c..a81f35f 100644
--- a/man/cr_abstract.Rd
+++ b/man/cr_abstract.Rd
@@ -4,11 +4,14 @@
 \alias{cr_abstract}
 \title{Get abstract}
 \usage{
-cr_abstract(doi, ...)
+cr_abstract(doi, cache = FALSE, ...)
 }
 \arguments{
 \item{doi}{(character) a DOI, required.}
 
+\item{cache}{(logical) Should requests be cached and/or retrieved from
+the cache? Note that the cache only persists while the package is loaded.}
+
 \item{...}{Named parameters passed on to \code{\link[crul]{HttpClient}}}
 }
 \description{
diff --git a/man/cr_journals.Rd b/man/cr_journals.Rd
index 180ea1d..84ac681 100644
--- a/man/cr_journals.Rd
+++ b/man/cr_journals.Rd
@@ -226,7 +226,7 @@ cr_journals_("2167-8359", works = TRUE, cursor = "*",
 cr_journals("2167-8359", works = TRUE, flq = c(`query.author` = 'Jane'))
 
 # select only certain fields to return
-res <- cr_journals('2167-8359', works = TRUE, 
+res <- cr_journals('2167-8359', works = TRUE,
   select = c('DOI', 'title'))
 names(res$data)
 }
diff --git a/man/cr_works.Rd b/man/cr_works.Rd
index e530773..9334026 100644
--- a/man/cr_works.Rd
+++ b/man/cr_works.Rd
@@ -21,6 +21,7 @@ cr_works(
   flq = NULL,
   select = NULL,
   async = FALSE,
+  cache = FALSE,
   ...
 )
 
@@ -137,6 +138,9 @@ are returned)}
 
 \item{async}{(logical) use async HTTP requests. Default: \code{FALSE}}
 
+\item{cache}{(logical) Should requests be cached and/or retrieved from
+the cache? Note that the cache only persists while the package is loaded.}
+
 \item{...}{Named parameters passed on to \code{\link[crul]{verb-GET}}}
 
 \item{parse}{(logical) Whether to output json \code{FALSE} or parse to
@@ -246,7 +250,7 @@ cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100)
 cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100,
   facet = TRUE)
 ## with optional progress bar
-x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200, 
+x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
   .progress = TRUE)
 
 # Low level function - does no parsing to data.frame, get json or a list
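Taken together, the series enables the following session-level workflow. This is a usage sketch assuming all three patches are applied; the timing behaviour mirrors what the new tests assert.

library(rcrossref)

# First call hits the Crossref API and stores the parsed result.
system.time(a1 <- cr_works(query = "NSF", cache = TRUE))
# The repeat call is answered from cr_cache_env, so elapsed time drops.
system.time(a2 <- cr_works(query = "NSF", cache = TRUE))
identical(a1, a2)  # TRUE, as the new expect_identical() tests require

# cr_abstract() gains the same parameter in patch 2.
cr_abstract("10.1109/TASC.2010.2088091", cache = TRUE)

# The cache only lives for the current session: unloading the package or
# restarting R starts over with an empty cr_cache_env.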