diff --git a/DESCRIPTION b/DESCRIPTION index 2f09736..eeb2383 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,6 @@ Remotes: nationalparkservice/QCkit Imports: EML, - sf, dplyr, httr, XML, @@ -33,8 +32,6 @@ Imports: readr, magrittr, crayon, - leaflet, - lifecycle, EMLeditor (>= 0.1.5), DPchecker (>= 0.3.4), QCkit (>= 0.1.4), @@ -42,12 +39,16 @@ Imports: jsonlite, cli, purrr, - tibble + tibble, + lifecycle RoxygenNote: 7.3.2 Suggests: knitr, rmarkdown, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + sf, + leaflet, + stringr VignetteBuilder: knitr URL: https://nationalparkservice.github.io/NPSutils/ BugReports: https://github.com/nationalparkservice/NPSutils/issues diff --git a/NEWS.md b/NEWS.md index f312b22..08637be 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# NPSutils 0.3.3 (under development) + +## 2024-10-21 + * Bug fixes to `load_data_package()` + * Bug fixes to `.get_authors()` + * `.get_authors()` now adds a period (.) after given names with a single character and can handle an unlimited number of given names. + * Moved sf, leaflet, and stringr from imports to suggests. + # NPSutils 0.3.2 "Lost Coast" * Add new functions, `load_data_packages()` and `load_data_package()`, which can load data packages (EML in .xml and data in .csv) similarly to the deprecated `load_data_package_deprecated()` function but also allows the data types in the tibbles loaded to be specified based on the information in the metadata. * Deprecate `load_data_package()` and rename it to `load_data_package_deprecated()`. diff --git a/R/load_core_metadata.R b/R/load_core_metadata.R index b102711..5ab2cfd 100644 --- a/R/load_core_metadata.R +++ b/R/load_core_metadata.R @@ -5,7 +5,7 @@ #' #' #' @details The returned dataframe has three columns, EML_element, EML_data and EML_data2. EML_element describes the EML element that was extracted. EML_data and EML_data2 contain the data from that element. In the case of EML_elements with only one piece of data (e.g. 
the data package title), the data is repeated in the EML_data and EML_data2 columns. In cases where the element contains two related pieces of data (e.g. author), those items are held in EML_data (e.g. the author's name) and EML_data2 (e.g. the author's email address). #' -#' Currently this function is under development and may have issues if an author has more than two givenNames (it will only use the first givenName), an author has not givenNames (only a surName) or an author is an organization and does not have any individualName. If you have a data package with these issues, please contact [robert_baker@nps.gov](mailto:robert_baker@nps.gov). +#' Currently this function is under development and may have issues if an author is an organization. If you have a data package with these issues, please contact [robert_baker@nps.gov](mailto:robert_baker@nps.gov). #' #' The fields that should be returned in the dataframe include: title, publication date, authors (and emails), contacts (and emails), publisher, DOI, publisher city, publisher state, content begin date, content end date, the abstract, notes, "for or by NPS", the license name (e.g. "Public Domain", "CC0"), and a list of each data file in the data package by name. #' @@ -126,7 +126,7 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){ #' #' @description `.get_authors()` extracts the "creators" element from EML metadata and returns it as a dataframe with three columsn, first a column indicating that each row is an author. Second, and column with the author's name (first last). Third, the author's email address. #' -#' @details There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName, authors with more than two givenNames (e.g. multiple middle names), organizations as authors where there is no individualName. 
+#' @details There are some known issues with this function; unfortunately at this time we do not have example data packages to test them. These include: authors without a givenName and organizations as authors where there is no individualName. #' #' @param metadata an EML formatted R object #' @@ -144,29 +144,40 @@ load_core_metadata <- function(ds_ref, path = paste0(getwd(), "/data")){ #set up empty dataframe to hold creator info: individual <- data.frame(author = as.character(), contact = as.character()) - for(i in 1:length(seq_along(creators))){ + + #if single creator, nest it so that it behaves the same as when there are + #multiple creators: + if ("organizationName" %in% names(creators) | + "individualName" %in% names(creators)) { + creators <- list(creators) + } + + for (i in 1:length(seq_along(creators))) { creator <- unlist(creators[[i]], recursive = FALSE) #if there is an individual name: - if(!is.null(creator$individualName.surName)){ + if (!is.null(creator$ + individualName.surName)) { #if there is a given name: - if(!is.null(creator$individualName.givenName)){ - #if there are two given names (e.g. first and middle) - if(length(seq_along(creator$individualName.givenName)) == 2){ - given <- paste(creator$individualName.givenName[[1]], - creator$individualName.givenName[[2]], - sep = " ") - #if there is only one given name (first) - } else if(length(seq_along(creator$individualNAme.givenName)) == 1){ - given <- creator$individualName.givenName - } else { - #More than 2 given names (e.g. first, middle, middle), use only the first given name: - given <- creator$individualName.givenName[[1]] + if (!is.null(creator$individualName.givenName)) { + given <- NULL + for (i in 1:length(seq_along(creator$individualName.givenName))) { + if (nchar(creator$individualName.givenName[[i]]) == 1) { + given <- paste0(given, + paste0(creator$individualName.givenName[[i]], + ". 
")) + } else { + given <- paste0(given, + paste0(creator$individualName.given[[i]], + " ")) + } } - } else { #if there is no given name: given <- NA } + #get rid of extra whitespaces and trailing whitespaces: + given <- stringr::str_squish(given) + #get last name sur <- creator$individualName.surName #generate full name as first (first) last diff --git a/R/load_data_packages.R b/R/load_data_packages.R index 6ad7b03..308150c 100644 --- a/R/load_data_packages.R +++ b/R/load_data_packages.R @@ -105,11 +105,21 @@ load_data_packages <- function(reference_id, nom4 <- nom3[["codeDefinition"]] #get factors factors <- NULL + #if (length(seq_along(nom4)) > 1) { + #nom4 <- unlist(nom4, recursive = FALSE) + #} #handle case where there is only one code definition if ("code" %in% names(nom4)) { nom4 <- list(nom4) } + # for(k in 1:length(seq_along(nom4))) { + # if("code" %in% names(nom4[k])) { + # factors <- append(factors, nom5[[k]]) + # } + #} + for (k in 1:length(seq_along(nom4))) { + #print(paste0("i=",i, ", j=", j, " k=, ", k, ".")) factors <- append(factors, nom4[[k]][["code"]]) } #set column type: @@ -159,9 +169,9 @@ load_data_package <- function(reference_id, simplify = TRUE) { x <- load_data_packages(reference_id, - directory = here::here("data"), - assign_attributes = FALSE, - simplify = TRUE) + directory, + assign_attributes, + simplify) return(x) } diff --git a/R/load_pgk_metadata.R b/R/load_pgk_metadata.R index c5a2845..0ad5c0e 100644 --- a/R/load_pgk_metadata.R +++ b/R/load_pgk_metadata.R @@ -51,9 +51,10 @@ load_pkg_metadata <- function(holding_id, directory = here::here("data")) { #load metadata eml_object <- EML::read_eml(meta_location, from = "xml") - attributeList <- EML::get_attributes(workingEMLfile$dataset$dataTable$attributeList) - attributes <- attributeList$attributes - factors <- attributeList$factors + #attributeList <- EML::get_attributes(eml_object) + attribute_list <- eml_object$dataset$dataTable$attributeList + attributes <- attribute_list$attributes 
+ factors <- attribute_list$factors # Figure out column classes based on attribute table (character, numeric, integer, logical, or complex) attributes$columnclass <- "character" diff --git a/docs/news/index.html b/docs/news/index.html index 3d53506..ce14266 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -50,6 +50,19 @@
NEWS.md
+ load_data_package()
+.get_authors()
+get_authors
now adds a period (.) after given names with a single character and can handle an unlimited number of given names.load_data_packages()
and load_data_package()
, which can load data packages (EML in .xml and data in .csv) similarly to the deprecated load_data_package_deprecated()
function but also allows the data types in the tibbles loaded to be specified based on the information in the metadata.