updates to build_ functions

mjwestgate · mjwestgate · commit 61737d7f01f1 · 2024-12-12T10:47:58.000+11:00
- switch to `source` and `destination` argument syntax
- use `corella::darwin_core_terms` to add links to schema
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,9 +17,9 @@ Authors@R:
              role = c("aut")))
 Description: galaxias helps users describe, package and share biodiversity 
   information using the 'Darwin Core' data standard, which is the format used
-  and accepted by the Global Biodiversity Information Facility (GBIF) and it's
+  and accepted by the Global Biodiversity Information Facility (GBIF) and its
   partner nodes. It is functionally similar to `devtools`, but with a focus on
-  building Darwin Core Archives (DwCA's) rather than R packages.
+  building 'Darwin Core Archives' rather than R packages.
 Depends: 
   R (>= 4.3.0),
   corella
diff --git a/R/build_archive.R b/R/build_archive.R
@@ -4,54 +4,51 @@
 #' and metadata. This function assumes that all of these file types have been
 #' pre-constructed, and can be found inside a single folder, with no additional
 #' or redundant information. This function is similar to `devtools::build()`,
-#' in the sense that it takes a repository and wraps it for publication, without
-#' assessing the contents in any meaningful way. It differs from 
-#' `devtools::build()` in that it builds a Darwin Core Archive, rather than an 
-#' R package.
+#' in the sense that it takes a repository and wraps it for publication. It 
+#' differs from `devtools::build()` in that it builds a Darwin Core Archive, 
+#' rather than an R package.
 #' @details
 #' This function looks for three types of objects in the specified `directory`:
 #' 
 #'  * One or more `csv` files such as `occurrences.csv` &/or `events.csv`. 
 #'    These will be manipulated versions of the raw dataset, which have been
-#'    altered to use Darwin Core terms as column headers. See the `corella`
-#'    package for details.
-#'  * A metadata statement, stored in xml using the filename `eml.xml`. The
-#'    function `use_metadata()` from the `paperbark` package is a good starting 
-#'    point here, followed by `build_metadata()` to save it in xml.
+#'    altered to use Darwin Core terms as column headers. See 
+#'    [corella::corella-package()] for details.
+#'  * A metadata statement, stored in `EML` using the filename `eml.xml`. The
+#'    function [use_metadata()] is a good starting point here, followed by 
+#'    [build_metadata()] once you have populated your metadata statement.
 #'  * A 'schema' document, also stored in xml, called `meta.xml`. This is 
-#'    usually constructed using `build_schema()`.
+#'    usually constructed using [build_schema()].
 #'
 #' You will get an error if these files are not present. The resulting file
 #' shares the name of the working directory (with a .zip file extension),
 #' and is placed in the parent directory
-#' @param x (string) A directory containing all the files to be stored in the
-#' archive. Defaults to the `data` folder within the current working directory.
-#' @param file (string) A file name to save the resulting zip file.
+#' @param source (string) A directory containing all the files to be stored in 
+#' the archive. Defaults to the `data` folder within the current working 
+#' directory.
+#' @param destination (string) A file name to save the resulting zip file.
 #' @return Invisibly returns the location of the built zip file; but typically
 #' called for the side-effect of building a 'Darwin Core Archive' (i.e. a zip 
 #' file).
 #' @importFrom zip zip
 #' @export
-build_archive <- function(x = "data", file) {
-  x <- get_default_directory(x)
-  
-  progress_update("Retrieving metadata...")
-  files_in <- find_data(x)
+build_archive <- function(source = "data", destination) {
+  progress_update("Retrieving data...")
+  files_in <- get_default_directory(source) |>
+    find_data()
   
   progress_update("Creating zip folder...")
-  file_out <- get_default_file(file)
+  file_out <- get_default_file(destination)
   
   progress_update("Building Darwin Core Archive...")
   zip::zip(zipfile = file_out, 
            files = files_in,
            mode = "cherry-pick")
   
-  cli::cli_alert_success("Darwin Core Archive successfully built. \nSaved as {.file {file_out}}.")
+  cli::cli_alert_success("Darwin Core Archive successfully built. \nSaved as `{.file {file_out}}`.")
   cli::cli_progress_done()
   
-  # invisible(return(file_out)) # might need this to save
-  
-
+  invisible(file_out)
 }
 
 #' Simple function to specify a zip file if no arg given
diff --git a/R/build_metadata.R b/R/build_metadata.R
@@ -1,34 +1,35 @@
 #' Create a metadata statement for a Darwin Core Archive
 #' 
 #' A metadata statement lists the owner of the dataset, how it was collected,
-#' and how it may used (i.e. its' licence). This function simply converts
-#' metadata stored in a markdown file to xml, and stores it in the folder 
-#' specified using the `directory` argument.
+#' and how it can be used (i.e. its licence). This function simply reads 
+#' metadata stored in a markdown file, converts it to xml, and saves it 
+#' in the `destination` file.
 #' 
 #' This function is a fairly shallow wrapper on top of functionality build
 #' in the `paperbark` package, particularly `read_md()` and `write_eml()`. You can 
 #' use that package to gain greater control, or to debug problems, should you 
 #' wish.
-#' @param path Path to a metadata statement stored in markdown format (.md).
-#' @param file A file where the result should be saved. Defaults to 
+#' @param source A metadata file stored in markdown format (`.md`). Defaults
+#' to `metadata.md`, which is the file created by [use_metadata()].
+#' @param destination A file where the result should be saved. Defaults to 
 #' `data/eml.xml`.
 #' @returns Does not return an object to the workspace; called for the side
 #' effect of building a file named `meta.xml` in the `data` directory.
 #' @importFrom paperbark read_md
 #' @importFrom paperbark write_eml
 #' @export
-build_metadata <- function(x = "data", 
-                           file = "./data/eml.xml"){
-  if(!file.exists(x)){
-    cli::cli_abort("{.file {x}} doesn't exist in specified location.")
+build_metadata <- function(source = "metadata.md", 
+                           destination = "./data/eml.xml"){
+  if(!file.exists(source)){
+    cli::cli_abort("`{source}` doesn't exist in specified location.")
   }
   # import file, ensure EML metadata is added, convert to XML
   progress_update("Reading file...")
-  metadata_file <- read_md(x)
+  metadata_tbl <- read_md(source)
 
   progress_update("Writing file...")
-  write_eml(built_file, file = file)
+  write_eml(metadata_tbl, file = destination)
   
-  cli::cli_alert_success("Metadata successfully built. Saved as {.file /data/eml.xml}.")
+  cli::cli_alert_success("Metadata successfully built. Saved as `{destination}`.")
   cli::cli_progress_done()
 }
diff --git a/R/build_schema.R b/R/build_schema.R
@@ -4,27 +4,26 @@
 #' It works by detecting column names on csv files in a specified directory;
 #' these should all be Darwin Core terms for this function to produce reliable
 #' results.
-#' @param x (string) A directory containing all the files to be stored in the
-#' archive. Defaults to the `data` folder within the current working directory.
-#' @param file (string) A file name for the resulting schema document.
+#' @param source A directory (**not** a file) containing files to be documented 
+#' in the schema document. Defaults to the `data` folder within the current 
+#' working directory. Note that files that do not match the Darwin Core naming 
+#' convention and/or do not end in `.csv` are ignored.
+#' @param destination A file name for the resulting schema document. Defaults
+#' to `./data/meta.xml` for consistency with the Darwin Core standard.
 #' @returns Does not return an object to the workspace; called for the side
 #' effect of building a file named `meta.xml` in the specified directory.
 #' @importFrom paperbark write_eml
 #' @importFrom glue glue
 #' @importFrom rlang abort
 #' @export
-build_schema <- function(x = "data", 
-                         file = "./data/meta.xml") {
-  x <- get_default_directory(x)
-  
-  files <- detect_dwc_files(x)
-  fields <- detect_dwc_fields(files)
-  result <- add_front_matter(fields)
-  
-  progress_update("Writing file...")
-  write_eml(result, file = file)
-  
-  cli::cli_alert_success("Schema successfully built. Saved as {.file /data/meta.xml}.")
+build_schema <- function(source = "data", 
+                         destination = "./data/meta.xml") {
+  get_default_directory(source) |>
+    detect_dwc_files() |>
+    detect_dwc_fields() |>
+    add_front_matter() |>
+    write_eml(file = destination)
+  cli::cli_alert_success("Schema successfully built. Saved as {destination}.")
   cli::cli_progress_done()
 }
 
@@ -195,7 +194,17 @@ create_field_rows <- function(x){
   index_list <- as.list(seq_along(field_names))
   names(index_list) <- rep("index", n_fields)
   # get sequence of urls
-  term_list <- as.list(glue("http://rs.tdwg.org/dwc/terms/{field_names}"))
+  dwc_df <- corella::darwin_core_terms
+  term_list <- map(field_names, 
+      .f = \(a){
+        term_lookup <- dwc_df$term == a
+        if(any(term_lookup)){
+          dwc_df$url[which(term_lookup)[1]]
+        }else{
+          "no-dwc-term-found"
+        }
+      })
+  # term_list <- as.list(glue("http://rs.tdwg.org/dwc/terms/{field_names}")) # obsolete
   names(term_list) <- rep("term", n_fields)
   # combine
   tibble(level = 3,
diff --git a/man/build_archive.Rd b/man/build_archive.Rd
diff --git a/man/build_metadata.Rd b/man/build_metadata.Rd
diff --git a/man/build_schema.Rd b/man/build_schema.Rd
diff --git a/tests/testthat/test-build.R b/tests/testthat/test-build.R
@@ -8,11 +8,14 @@ test_that("build_ functions work correctly in sequence", {
  
   # add data
   # add events.csv
-  tibble(eventID = 1, eventDate = "2024-01-01") |>
+  tibble(eventID = 1, 
+         eventDate = "2024-01-01") |>
     write.csv(file = "data/events.csv",
               row.names = FALSE)
   # add occurrences.csv
-  tibble(basisOfRecord = "humanObservation", individualCount = 1) |>
+  tibble(basisOfRecord = "humanObservation", 
+         individualCount = 1,
+         scientificName = "Litoria peronii") |>
     write.csv(file = "data/occurrences.csv",
               row.names = FALSE)
   # expect_error(build_archive()) # no schema or metadata
@@ -22,7 +25,7 @@ test_that("build_ functions work correctly in sequence", {
   build_schema()
   expect_true(file.exists("data/meta.xml"))
   result <- readLines("data/meta.xml")
-  expect_equal(length(result), 15) # correct number of entries
+  expect_equal(length(result), 16) # correct number of entries
   expect_true(all(grepl("^\\s*<", result))) # all open with `<`
   # NOTE: still has problems with attributes containing `amp` instead of `&`
   # expect_error(build_archive()) # no metadata yet
diff --git a/vignettes/quick_start_guide.Rmd b/vignettes/quick_start_guide.Rmd
@@ -135,7 +135,7 @@ Darwin Core may be an unfamiliar format, so it can be useful to 'check' your
 data for common issues. We suggest first using `check_archive()`:
 
 
-Alternatively, you can use the GBIF 'validate' API to check your data (not functional!)
+Alternatively, you can use the GBIF 'validate' API to check your data:
 
 ```{r, eval=FALSE}
 validate_archive()