Skip to content

Commit

Permalink
Merge pull request #62 from RobLBaker/master
Browse files Browse the repository at this point in the history
fix load_pkg_metadata; add metanalysis functions
  • Loading branch information
RobLBaker authored Dec 20, 2024
2 parents cded127 + dc23032 commit c87682f
Show file tree
Hide file tree
Showing 16 changed files with 398 additions and 133 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ export(get_park_taxon_citations)
export(get_park_taxon_refs)
export(get_park_taxon_url)
export(get_ref_info)
export(get_ref_list)
export(get_refs_info)
export(get_unit_code)
export(get_unit_code_info)
export(get_unit_info)
Expand All @@ -22,6 +24,7 @@ export(load_domains)
export(load_pkg_metadata)
export(map_wkt)
export(rm_local_packages)
export(summarize_packages)
export(validate_data_package)
importFrom(lifecycle,deprecated)
importFrom(magrittr,"%>%")
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# NPSutils 0.3.3 (under development)

## 2024-12-19
* updated `load_pkg_metadata` to be simpler and essentially call `DPchecker::load_metadata` but with a preset default directory structure that works well with the default settings for `get_data_package`.
* Add meta-analysis functions for finding multiple data packages and producing summary statistics for them, including `get_ref_list()`, `get_refs_info()`, and `summarize_packages()`.
## 2024-10-24
* fix how `get_data_package` aliases `get_data_packages`, specifically now allows users to adjust parameters to non-default settings.
## 2024-10-21
Expand Down
2 changes: 1 addition & 1 deletion R/load_data_package.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#'
#' @examples
#' \dontrun{
#' load_data_package(2272461)
#' load_data_package_deprecated(2272461)
#' }
load_data_package_deprecated <- function(reference_id) {
data_package_directory <- paste("data/", reference_id, sep = "")
Expand Down
2 changes: 1 addition & 1 deletion R/load_data_packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,4 @@ extract_tbl <- function(x) {
if (!is.list(x))
return(NULL)
unlist(lapply(x, extract_tbl), FALSE)
}
}
118 changes: 10 additions & 108 deletions R/load_pgk_metadata.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
#' Read contents of data package file and construct a data frame based on the
#' metadata file summarizing the fields and their types/definitions.
#' Loads EML-formatted metadata into R for inspection and/or editing
#'
#' @description `load_pkg_metadata()` is essentially a wrapper around `DPchecker::load_metadata()` with the directory structure pre-set to work well with the default location where `get_data_package()` stores downloaded data packages. If you did not use the default settings for `get_data_package()` (or downloaded a data package manually) you may find it easier to adjust the directory structure pointing to your data package and load the metadata using `DPchecker::load_metadata()`. Much like `load_metadata()`, `load_pkg_metadata()` requires that there be a single .xml file in the data package directory, that the metadata file name end in *_metadata.xml, and that the file contain schema-valid EML metadata.
#'
#' @description `load_pkg_metadata()` reads the metadata file from a previously
#' downloaded package and loads a list of fields and their attributes into a
#' dataframe.
#'
#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file.
#' @param directory String. Path to the data package
#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file. Your data should be in a directory that has the holding ID as its name.
#' @param directory String. Path to the data package directory, defaults to "data".
#'
#' @return the metadata object returned by `DPchecker::load_metadata()`, returned invisibly.
#'
Expand All @@ -16,105 +13,10 @@
#' \dontrun{
#' load_pkg_metadata(2266200)
#' }
load_pkg_metadata <- function(holding_id, directory = here::here("data")) {
data_package_directory <- paste(directory, "/", holding_id, sep = "")

metadata_file <- list.files(
path = data_package_directory,
pattern = "metadata.xml"
)

# Look for a metadatafile and let the user know about the results of the search.
if (length(metadata_file) == 0) {
cli::cli_abort(c(
"No metadata file found in: {.path {data_package_directory}}.",
"i" = "The filename must end in _metadata.xml"))
return(invisible())
}
if (length(metadata_file) > 1) {
cli::cli_abort(c(
"Multiple metadata files found.",
"i" = "{.path {data_package_directory}} can contain only one
{.file *_metadata.xml}."))
return(invisible())
}
load_pkg_metadata <- function(holding_id, directory = "data") {

meta_location <- paste0(data_package_directory, "/", metadata_file)
if (!file.exists(meta_location)) {
cli::cli_abort(c(
"The data package for: {.var {holding_id}} was not found.",
"i" = "Make sure {.path {data_package_directory}} is the correct location",
"i" = "Make sure you downloaded the correct data package using {.fn get_data_package}."
))
return(invisible())
}

#load metadata
eml_object <- EML::read_eml(meta_location, from = "xml")
#attributeList <- EML::get_attributes(eml_object)
attribute_list <- eml_object$dataset$dataTable$attributeList
attributes <- attribute_list$attributes
factors <- attribute_list$factors

# Figure out column classes based on attribute table (character, numeric, integer, logical, or complex)
attributes$columnclass <- "character"
if (!"numberType" %in% colnames(attributes)) {
attributes$numberType <- as.character(NA)
}
if (!"formatString" %in% colnames(attributes)) {
attributes$formatString <- as.character(NA)
}
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "natural", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "whole", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "integer", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "real", "numeric", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "date" & attributes$formatString == "YYYY-MM-DD", "Date", attributes$columnclass)

# return the field table to the workspace.
return(attributes)

if (metaformat == "fgdc") {
# xmlFilename <- metalocation
workingXMLfile <- EML::read_eml(metalocation, from = "xml")

# Build attributes table from the xml file
attributes <- data.frame(
id = numeric(),
attribute = character(),
attributeDefinition = character(),
attributeType = character(),
attributeFactors = numeric(),
stringsAsFactors = FALSE
)
for (i in 1:length(workingXMLfile$ea$detailed$attr)) {
attributes <- rbind(
attributes,
cbind(
id = i,
attribute = workingXMLfile$ea$detailed$attr[[i]]$attrlabl,
attributeDefinition = workingXMLfile$ea$detailed$attr[[i]]$attrdef,
attributeType = workingXMLfile$ea$detailed$attr[[i]]$attrtype,
attributeFactors = length(workingXMLfile$ea$detailed$attr[[i]]$attrdomv)
)
)
}

attributes$id <- as.integer(as.character(attributes$id))
attributes$attribute <- as.character(attributes$attribute)
attributes$attributeDefinition <- as.character(attributes$attributeDefinition)
# attributes$attributeType<-as.character(attributes$attributeType)
attributes$attributeFactors <- as.integer(as.character(attributes$attributeFactors))

attributes$columnclass <- "character"
# attributes$columnclass<-ifelse(attributes$attributeType=="OID","integer",attributes$columnclass)
# attributes$columnclass<-ifelse(attributes$attributeType=="Date","Date",attributes$columnclass)
# attributes$columnclass<-ifelse(attributes$attributeType=="Double","numeric",attributes$columnclass)

cat("Found ", crayon::blue$bold(nrow(attributes)), " fields.", sep = "")
meta <- DPchecker::load_metadata(directory = here::here("data", holding_id))

return(invisible(meta))
}

# return the field table to the workspace.
return(attributes)
} else {
print("data/metadata format combination not supported")
}
}
Loading

0 comments on commit c87682f

Please sign in to comment.