update to new version for new filter requirements

btskinner · Jul 12, 2021 · 924e861 · 924e861
1 parent 9c5881d
commit 924e861
Show file tree

Hide file tree

Showing 33 changed files with 328 additions and 103 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rscorecard
 Title: A Method to Download Department of Education College Scorecard Data
-Version: 0.18.0
+Version: 0.19.0
 Authors@R: person("Benjamin", "Skinner",
 	          email = "btskinner@coe.ufl.edu",
     	          role = c("aut", "cre"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# v0.19.0
+
+- update to only allow filtering on variables allowed due to API
+  changes on 4 April 2021
+
 # v0.18.0
 
 - update dictionary for 12 January 2021 release of data

diff --git a/R/dictionary.R b/R/dictionary.R
@@ -24,6 +24,12 @@
 #' @param return_df Return a tibble of the subset data dictionary.
 #' @param print_off Do not print to console; useful if you only want
 #'     to return a tibble of dictionary values.
+#' @param can_filter Use to confirm that a variable can be used as a
+#'     filtering variable. Returns \code{TRUE} or \code{FALSE}
+#' @param filter_vars Use to print variables that can be used to
+#'     filter calls. Use with argument \code{return_df = TRUE} to
+#'     return a tibble of these variables in addition to console
+#'     output.
 #'
 #' @examples
 #' ## simple search for 'state' in any part of the dictionary
@@ -35,6 +41,9 @@
 #' ## return full dictionary (only recommended if not printing and
 #' ## storing in object)
 #' df <- sc_dict('.', limit = Inf, print_off = TRUE, return_df = TRUE)
+#'
+#' ## print list of variables that can be used to filter
+#' df <- sc_dict('.', filter_vars = TRUE, return_df = TRUE)
 
 #' @export
 sc_dict <- function(search_string,
@@ -47,13 +56,43 @@ sc_dict <- function(search_string,
                                    'source'),
                     ignore_case = TRUE, limit = 10, confirm = FALSE,
                     print_dev = FALSE, print_notes = FALSE,
-                    return_df = FALSE, print_off = FALSE) {
+                    return_df = FALSE, print_off = FALSE,
+                    can_filter = FALSE, filter_vars = FALSE) {
 
     ## only for confirm
     if (confirm) {
         return(!is.null(sc_hash[[search_string]]))
     }
 
+    ## only for can_filter
+    if (can_filter) {
+        ## NB: using any() to be TRUE if any TRUE
+        return(any(dict[["can_filter"]][dict[["varname"]] == search_string] == 1))
+    }
+
+    ## print filter variables
+    if (filter_vars) {
+        out <- dict[dict[["can_filter"]] == 1,]
+        uniqv <- sort(unique(out[["varname"]]))
+        ## console table
+        cat('\n' %+% paste(rep('', 70), collapse = '-') %+% '\n')
+        cat('The following variables can be used in sc_filter():')
+        cat('\n' %+% paste(rep('', 70), collapse = '-') %+% '\n\n')
+        for (i in 1:length(uniqv)) {
+            cat(' - ' %+% uniqv[i] %+% '\n')
+        }
+        if (return_df) {
+            cat('\n')
+            var_order <- c('varname', 'value', 'label', 'description', 'source',
+                           'dev_friendly_name', 'dev_category', 'notes', 'can_filter')
+            out <- tidyr::as_tibble(out) %>%
+                dplyr::select(dplyr::one_of(var_order))
+            return(out)
+        } else {
+            return(cat('\n'))
+        }
+    }
+
     ## get values
     if (match.arg(search_col) == 'all') {
         rows <- rep(FALSE, nrow(dict))
@@ -126,7 +165,12 @@ sc_dict <- function(search_string,
                 }
                 cat('\n')
             }
-
+            cat('CAN FILTER? ')
+            if (d[['can_filter']][[1]] == 1) {
+                cat('Yes\n\n')
+            } else {
+                cat('No\n\n')
+            }
         }
 
         cat(paste(rep('', 70), collapse = '-') %+% '\n')
@@ -138,10 +182,12 @@ sc_dict <- function(search_string,
 
     ## return_df ? return (out) : <>
     if (return_df) {
-        var_order <- c('varname', 'value', 'label', 'description', 'source',
-                       'dev_friendly_name', 'dev_category', 'notes')
-        out <- tidyr::as_tibble(out) %>%
-            dplyr::select(dplyr::one_of(var_order))
-        return(out)
+         var_order <- c('varname', 'value', 'label', 'description', 'source',
+                        'dev_friendly_name', 'dev_category', 'notes', 'can_filter')
+         out <- tidyr::as_tibble(out) %>%
+             dplyr::select(dplyr::one_of(var_order))
+         return(out)
     }
+
+
 }
diff --git a/R/filter.R b/R/filter.R
@@ -72,6 +72,13 @@ sc_filter_ <- function(sccall, filter_string) {
             stop('Must use either \"==\", \"!=\", or \"%in%\" in sc_filter.',
                  call. = FALSE)
         }
+        if (!sc_dict(tolower(as.character(filter[[i]][[2]])), can_filter = TRUE)) {
+            stop('The variable \"' %+% filter[[i]][[2]]
+                 %+% '\" cannot be used as filter. '
+                 %+% 'Revise your call to remove this filtering variable.\n'
+                 %+% 'Use sc_dict(filter_vars = TRUE) to see available filters.',
+                 call. = FALSE)
+        }
         if (!sc_dict(tolower(as.character(filter[[i]][[2]])), confirm = TRUE)) {
             stop('Variable \"' %+% filter[[i]][[2]]
                  %+% '\" not found in dictionary. '

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/README.md b/README.md
@@ -55,12 +55,12 @@ key in your R environment at the start of your R session using
     ## # A tibble: 6 x 4
     ##   unitid instnm                                                  stabbr year  
     ##    <int> <chr>                                                   <chr>  <chr> 
-    ## 1 214625 Pennsylvania State University-Penn State New Kensington PA     latest
-    ## 2 196051 SUNY Morrisville                                        NY     latest
-    ## 3 194392 Paul Smiths College of Arts and Science                 NY     latest
-    ## 4 214643 Pennsylvania State University-Penn State Wilkes-Barre   PA     latest
-    ## 5 191676 Houghton College                                        NY     latest
-    ## 6 197230 Wells College                                           NY     latest
+    ## 1 191676 Houghton College                                        NY     latest
+    ## 2 194392 Paul Smiths College of Arts and Science                 NY     latest
+    ## 3 196051 SUNY Morrisville                                        NY     latest
+    ## 4 197230 Wells College                                           NY     latest
+    ## 5 214625 Pennsylvania State University-Penn State New Kensington PA     latest
+    ## 6 214643 Pennsylvania State University-Penn State Wilkes-Barre   PA     latest
 
 For more example calls and information about other package commands, see
 the [extended
@@ -91,6 +91,8 @@ function.
     ## 3 = Private for-profit
     ## 4 = Foreign
     ## 
+    ## CAN FILTER? Yes
+    ## 
     ## 
     ## ---------------------------------------------------------------------
     ## varname: schtype                                          source: FSA
@@ -105,6 +107,8 @@ function.
     ## 2 = Private, Nonprofit
     ## 3 = Proprietary
     ## 
+    ## CAN FILTER? No
+    ## 
     ## ---------------------------------------------------------------------
     ## Printed information for 2 of out 2 variables.
 

diff --git a/data-raw/make_dict_hash.R b/data-raw/make_dict_hash.R
@@ -14,9 +14,9 @@ sheets <- c("institution_data_dictionary",
 
 ## read in each sheet, munge, and bind
 df <- purrr::map(sheets,
-                 ~ read_excel(file, sheet = .x) %>%
+                 ~ read_excel(file, sheet = .x) |>
                      ## lower names
-                     setNames(tolower(names(.))) %>%
+                     rename_all(tolower) |>
                      ## subset/rename
                      select(description = `name of data element`,
                             dev_category = `dev-category`,
@@ -25,27 +25,39 @@ df <- purrr::map(sheets,
                             value,
                             label,
                             source,
-                            notes) %>%
+                            notes,
+                            can_filter = index) |>
                      ## lower name values for varname column
-                     mutate(varname = tolower(varname)) %>%
+                     mutate(varname = tolower(varname)) |>
                      ## remove extra \r\n from description column
-                     mutate(description = gsub("^(.+)\r\n 0.*$", "\\1", description)) %>%
+                     mutate(description = gsub("^(.+)\r\n 0.*$", "\\1", description)) |>
                      ## remove trailing characters from dev_friendly_name
                      mutate(dev_friendly_name = gsub("^(.+):$", "\\1",
-                                                     dev_friendly_name)) %>%
+                                                     dev_friendly_name)) |>
+                     ## convert can_filter column to 1 if there's text there
+                     mutate(can_filter = ifelse(!is.na(can_filter), 1, can_filter),
+                            ## fix unitid can_filter to 1 since it works for filtering
+                            can_filter = ifelse(varname == "unitid", 1, can_filter),
+                            ## convert to integer
+                            can_filter = as.integer(can_filter)) |>
                      ## roll values forward to fill NA
                      mutate(description = na.locf(description),
                             dev_category = na.locf(dev_category),
                             dev_friendly_name = na.locf(dev_friendly_name),
                             varname = na.locf(varname),
                             source = na.locf(source),
-                            notes = na.locf(notes))
-                 ) %>%
+                            notes = na.locf(notes)) |>
+                     ## roll values forward in can_filter, grouped by variable name
+                     group_by(varname) |>
+                     mutate(can_filter = na.locf(can_filter, na.rm = FALSE)) |>
+                     ungroup() |>
+                     mutate(can_filter = ifelse(is.na(can_filter), 0, can_filter))
+                 ) |>
     ## bind together
-    bind_rows
+    bind_rows()
 
 ## make dictionary data frame
-dict <- df %>% data.frame(.)
+dict <- df |> data.frame()
 
 ## create hash environment for quick conversion between varnames
 ## and developer-friendly names
@@ -55,7 +67,7 @@ sc_hash <- new.env(parent = emptyenv())
 ## (1) varname <- dev_friendly
 ## (2) varname_c <- root
 
-tmp <- df %>% distinct(varname, .keep_all = TRUE)
+tmp <- df |> distinct(varname, .keep_all = TRUE)
 
 for (i in 1:nrow(tmp)) {
     ## key/value pair (1)
@@ -72,7 +84,7 @@ for (i in 1:nrow(tmp)) {
 ## (3) dev_friendly <- varname
 ## (4) dev_friendly_c <- root
 
-tmp <- df %>% distinct(dev_friendly_name, .keep_all = TRUE)
+tmp <- df |> distinct(dev_friendly_name, .keep_all = TRUE)
 
 for (i in 1:nrow(tmp)) {
     ## key/value pair (3)
@@ -96,8 +108,8 @@ for (i in 1:nrow(tmp)) {
 ## the information will continue to come from the institution data
 both_sources <- c("unitid", "opeid6", "control", "main", "instnm")
 
-tmp <- df %>%
-    filter(varname %in% both_sources) %>%
+tmp <- df |>
+    filter(varname %in% both_sources) |>
     distinct(dev_category, varname, .keep_all = TRUE)
 
 for (i in 1:nrow(tmp)) {

diff --git a/docs/404.html b/docs/404.html
diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html
diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html
diff --git a/docs/articles/commands.html b/docs/articles/commands.html