Skip to content

Further cache support (cr_abstract and cr_works) #241

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
rcrossref 1.2.1
===============

### MINOR IMPROVEMENTS
* Requests can now be cached (for a single session) with `cr_works`, `cr_abstract` and `cr_cn` by setting `cache = TRUE`


rcrossref 1.2.0
===============

Expand Down
67 changes: 41 additions & 26 deletions R/cr_abstract.R
Original file line number Diff line number Diff line change
@@ -1,44 +1,59 @@
#' Get abstract
#'
#'
#' @export
#' @param doi (character) a DOI, required.
#' @param cache (logical) Should requests be cached and/or retrieved from
#' the cache? Note that the cache only persists while the package is loaded.
#' @param ... Named parameters passed on to \code{\link[crul]{HttpClient}}
#' @examples \dontrun{
#' # abstract found
#' cr_abstract('10.1109/TASC.2010.2088091')
#' cr_abstract("10.1175//2572.1")
#' cr_abstract("10.1182/blood.v16.1.1039.1039")
#'
#'
#' # doi not found
#' # cr_abstract(doi = '10.5284/1011335')
#'
#'
#' # abstract not found, throws error
#' # cr_abstract(doi = '10.1126/science.169.3946.635')
#'
#'
#' # a random DOI
#' # cr_abstract(cr_r(1))
#' }
cr_abstract <- function(doi, cache = FALSE, ...) {
  if (isTRUE(cache)) {
    # Build a single-string cache key: function name, the DOI, and any
    # extra curl options passed through `...`. `collapse` (not `sep`) is
    # required so a multi-element `...` still yields a length-1 key —
    # rlang::env_cache() needs `nm` to be a single string.
    req <- paste("cr_abstract", doi,
      paste(unlist(list(...)), collapse = "__"),
      sep = "__"
    )

    # env_cache() evaluates `default` lazily, so the uncached request is
    # only performed on a cache miss. The cache lives in `cr_cache_env`
    # and persists only for the current session.
    rlang::env_cache(
      env = cr_cache_env, nm = req,
      default = cr_abstract(doi, cache = FALSE, ...)
    )
  } else {
    url <- paste0(
      "https://api.crossref.org/works/", doi,
      "/transform/application/vnd.crossref.unixsd+xml"
    )
    cli <- crul::HttpClient$new(
      url = url,
      opts = list(followlocation = 1),
      headers = list(
        `User-Agent` = rcrossref_ua(),
        `X-USER-AGENT` = rcrossref_ua()
      )
    )
    res <- cli$get(...)
    res$raise_for_status()
    txt <- res$parse("UTF-8")
    # An unparseable response is treated as "DOI not found".
    xml <- tryCatch(read_xml(txt), error = function(e) e)
    if (inherits(xml, "error")) {
      stop(doi, " not found ", call. = FALSE)
    }
    # xml_find_first() warns (rather than errors) when the node is
    # missing; promote that to an informative error.
    tt <- tryCatch(xml_find_first(xml, "//jats:abstract"),
      warning = function(w) w
    )
    if (inherits(tt, "warning")) {
      stop("no abstract found for ", doi, call. = FALSE)
    }
    xml_text(tt)
  }
}
119 changes: 84 additions & 35 deletions R/cr_works.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#' @param parse (logical) Whether to output json `FALSE` or parse to
#' list `TRUE`. Default: `FALSE`
#' @param async (logical) use async HTTP requests. Default: `FALSE`
#' @param cache (logical) Should requests be cached and/or retrieved from
#' the cache? Note that the cache only persists while the package is loaded.
#'
#' @section Beware:
#' The API will only work for CrossRef DOIs.
Expand Down Expand Up @@ -100,7 +102,7 @@
#' cr_works(query="NSF", cursor = "*", cursor_max = 300, limit = 100,
#' facet = TRUE)
#' ## with optional progress bar
#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
#' x <- cr_works(query="NSF", cursor = "*", cursor_max = 1200, limit = 200,
#' .progress = TRUE)
#'
#' # Low level function - does no parsing to data.frame, get json or a list
Expand Down Expand Up @@ -145,53 +147,100 @@
#' system.time(lapply(queries, function(z) cr_works(query = z)))
#' }

cr_works <- function(
    dois = NULL, query = NULL, filter = NULL, offset = NULL,
    limit = NULL, sample = NULL, sort = NULL, order = NULL, facet = FALSE,
    cursor = NULL, cursor_max = 5000, .progress = "none", flq = NULL,
    select = NULL, async = FALSE, cache = FALSE, ...) {
  if (cursor_max != as.integer(cursor_max)) {
    stop("cursor_max must be an integer", call. = FALSE)
  }

  args <- prep_args(
    query, filter, offset, limit, sample, sort, order,
    facet, cursor, flq, select
  )

  stopifnot(is.logical(async))

  # Performs the actual request(s). A zero-argument closure: every input
  # (dois, args, cursor, cursor_max, .progress, async, ...) is captured
  # from the enclosing frame, so the cache branch below can re-run the
  # identical call lazily without re-threading all arguments.
  works_do <- function() {
    if (async) {
      return(cr_async("works", c(dois, args), ...))
    }

    if (length(dois) > 1) {
      res <- llply(dois, cr_get_cursor,
        args = args, cursor = cursor,
        cursor_max = cursor_max, .progress = .progress, ...
      )
      res <- lapply(res, "[[", "message")
      res <- lapply(res, parse_works)
      df <- tibble::as_tibble(bind_rows(res))
      # exclude rows with empty DOI value until CrossRef API supports
      # input validation
      if (nrow(df[df$doi == "", ]) > 0) {
        warning("only data with valid CrossRef DOIs returned", call. = FALSE)
      }
      df <- df[!df$doi == "", ]
      list(meta = NULL, data = df, facets = NULL)
    } else {
      tmp <- cr_get_cursor(dois,
        args = args, cursor = cursor,
        cursor_max = cursor_max, .progress, ...
      )
      if (is.null(dois)) {
        if (!is.null(cursor) || is.null(tmp$message)) {
          tmp
        } else {
          meta <- parse_meta(tmp)
          list(
            meta = meta,
            data = tibble::as_tibble(
              bind_rows(lapply(tmp$message$items, parse_works))
            ),
            facets = parse_facets(tmp$message$facets)
          )
        }
      } else {
        list(
          meta = NULL,
          data = tibble::as_tibble(parse_works(tmp$message)),
          facets = NULL
        )
      }
    }
  }

  if (!isTRUE(cache)) {
    return(works_do())
  }

  # Build a single-string cache key from every input that affects the
  # response. Multi-element inputs (dois, filter, flq, select, ...) are
  # collapsed first so the key always has length 1 — rlang::env_cache()
  # requires `nm` to be a single string.
  req <- paste(
    "cr_works", paste(dois, collapse = "__"), query,
    paste(unlist(filter), collapse = "__"), offset, limit, sample,
    sort, order, facet, cursor, cursor_max, .progress,
    paste(unlist(flq), collapse = "__"),
    paste(select, collapse = "__"),
    paste(unlist(list(...)), collapse = "__"),
    sep = "__"
  )

  if (utils::object.size(req) > 10000) {
    warning("Request call is too large to be cached (likely because many DOIs at once are requested). Request will be executed with cache = FALSE. Split into multiple calls to cache.")
    # Must return here: without it the request would run a second time
    # below and be cached despite the warning.
    return(works_do())
  }

  # `default` is evaluated lazily, so the request only runs on a miss.
  rlang::env_cache(env = cr_cache_env, nm = req, default = works_do())
}



#' @export
#' @rdname cr_works
cr_works_ <- function(dois = NULL, query = NULL, filter = NULL, offset = NULL,
Expand Down
5 changes: 4 additions & 1 deletion man/cr_abstract.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/cr_journals.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/cr_works.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 26 additions & 1 deletion tests/testthat/test-cr_works.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ test_that("cr_works fails well: arguments that dont require http requests", {
expect_error(cr_works(offset = 'foo'), "offset value illegal")

expect_error(cr_works(sample = 'foo'), "sample value illegal")

expect_error(cr_works(async = 5), "is not TRUE")
})

Expand All @@ -241,3 +241,28 @@ test_that("content domain parsing fix", {
expect_is(cr_works(dois = "10.7287/peerj.3819v0.1/reviews/2"), "list")
})
})

# Integration test for session-level caching in cr_works(): the second
# identical call should be served from cr_cache_env rather than hitting
# the CrossRef API.
# NOTE(review): this test makes live HTTP requests (the vcr cassette is
# commented out) and asserts on wall-clock timing, which is inherently
# flaky on slow or loaded machines — consider asserting on cache-env
# contents instead; confirm before relying on it in CI.
test_that("cr_works cache works", {
  #vcr::use_cassette("cr_works_cache", { # Error with HTTP request here, but apart from that, test works
  ## With query
  # reset cache
  rm(list = ls(cr_cache_env), envir = cr_cache_env)
  t1 <- system.time(b1 <- cr_works(query="NSF", cache = TRUE))
  t2 <- system.time(b2 <- cr_works(query="NSF", cache = TRUE))
  # compare timing to ensure that caching actually happened
  # (t[3] is elapsed wall-clock time; the cached call must be faster)
  expect_gt(t1[3], t2[3])
  expect_identical(b1, b2)
  expect_is(b1, "list")

  ## With multiple DOIs
  # reset cache
  rm(list = ls(cr_cache_env), envir = cr_cache_env)
  t1 <- system.time(b1 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
  t2 <- system.time(b2 <- cr_works(dois = c("10.1037/0022-3514.45.2.357", "10.1002/9781118719244"), cache = TRUE))
  # compare timing to ensure that caching actually happened
  expect_gt(t1[3], t2[3])
  expect_identical(b1, b2)
  expect_is(b1, "list")
  # spot-check parsed content of the first DOI
  expect_match(b1$data$`container.title`[1], "Journal of Personality and Social Psychology")
  # })
})