File adrios (#118)

Adds new ADRIOMaker class capable of producing ADRIOs to fetch data attributes from CSV files in N, TxN, or NxN format.
NAU-CCL · Jun 20, 2024 · e399d5c · e399d5c
1 parent db79f8c
commit e399d5c
Show file tree

Hide file tree

Showing 8 changed files with 581 additions and 39 deletions.
diff --git a/doc/devlog/2024-06-12.ipynb b/doc/devlog/2024-06-12.ipynb
@@ -0,0 +1,255 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# devlog 2024-06-12\n",
+    "\n",
+    "_author: Trevor Johnson_\n",
+    "\n",
+    "We have a new class of ADRIO capable of loading in data from CSV files with shapes N, TxN, and NxN. CSV files used must have a column (or two in the NxN case) to identify geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included if loading in time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).\n",
+    "\n",
+    "The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates geos with CSV ADRIOs to load the data into NDArrays. These geos also contain Census ADRIOs that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import date\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from numpy import array_equal\n",
+    "\n",
+    "from epymorph.data_shape import Shapes\n",
+    "from epymorph.geo.adrio import adrio_maker_library\n",
+    "from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus\n",
+    "from epymorph.geo.adrio.file.adrio_csv import (CSVSpec, CSVSpecMatrix,\n",
+    "                                               CSVSpecTime)\n",
+    "from epymorph.geo.dynamic import DynamicGeo\n",
+    "from epymorph.geo.spec import DynamicGeoSpec, Year\n",
+    "from epymorph.geography.us_census import (STATE, CountyScope, StateScope,\n",
+    "                                          get_us_counties, get_us_states)\n",
+    "from epymorph.simulation import geo_attrib\n",
+    "from pandas import DataFrame, concat\n",
+    "\n",
+    "# create and store 'pei_population.csv'\n",
+    "census_maker = ADRIOMakerCensus()\n",
+    "states_list = ['AZ', 'FL', 'GA', 'MD', 'NY', 'NC', 'SC', 'VA']\n",
+    "population = census_maker.make_adrio(geo_attrib(\n",
+    "    'population', int, Shapes.N), StateScope.in_states_by_code(states_list), Year(2015))\n",
+    "df = DataFrame({'label': states_list, 'population': population.get_value()})\n",
+    "df.sort_values(by='population', inplace=True)\n",
+    "df.to_csv(\"./scratch/pei_population.csv\", header=False, index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = DynamicGeoSpec(\n",
+    "    attributes=[\n",
+    "        geo_attrib('label', str, Shapes.N),\n",
+    "        geo_attrib('population', int, Shapes.N),\n",
+    "        geo_attrib('population_census', int, Shapes.N)\n",
+    "    ],\n",
+    "    time_period=Year(2015),\n",
+    "    scope=StateScope.in_states(['12', '13', '24', '37', '45', '51']),\n",
+    "    source={\n",
+    "        'label': 'Census:name',\n",
+    "        'population': CSVSpec(file_path=Path(\"./scratch/pei_population.csv\"),\n",
+    "                              key_col=0, data_col=1, key_type=\"state_abbrev\", skiprows=None),\n",
+    "        'population_census': 'Census:population'\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
+    "\n",
+    "# validate geo and ensure both ADRIOs fetched identical data\n",
+    "geo.validate()\n",
+    "if not array_equal(geo['population'], geo['population_census']):\n",
+    "    raise Exception(\"Data not equal.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create and store 'us_sw_counties_population.csv'\n",
+    "\n",
+    "# get population by age data from acs5\n",
+    "states_list = ['04', '08', '49', '35', '32']\n",
+    "population_2015 = census_maker.make_adrio(geo_attrib(\n",
+    "    'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2015)).get_value()\n",
+    "population_2016 = census_maker.make_adrio(geo_attrib(\n",
+    "    'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2016)).get_value()\n",
+    "\n",
+    "# get county and state info from shapefiles and convert to dataframes\n",
+    "counties_info = get_us_counties(2010)\n",
+    "states_info = get_us_states(2010)\n",
+    "counties_info_df = DataFrame({'state_geoid': [STATE.extract(\n",
+    "    county_id) for county_id in counties_info.geoid], 'geoid': counties_info.geoid, 'name': counties_info.name})\n",
+    "states_info_df = DataFrame(\n",
+    "    {'state_geoid': states_info.geoid, 'state_name': states_info.name})\n",
+    "\n",
+    "# merge dataframes and create \"County, State\" name column\n",
+    "merged_df = counties_info_df.merge(states_info_df, on='state_geoid')\n",
+    "merged_df['county_name'] = merged_df['name'] + \", \" + merged_df['state_name']\n",
+    "merged_df = merged_df.loc[merged_df['state_geoid'].isin(states_list)]\n",
+    "\n",
+    "# create and merge dataframes to be converted to csvs\n",
+    "df_2015 = DataFrame({'Date': [date(2015, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n",
+    "                    pop[0] for pop in population_2015], 'Adult': [pop[1] for pop in population_2015], 'Elderly': [pop[2] for pop in population_2015]})\n",
+    "df_2016 = DataFrame({'Date': [date(2016, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n",
+    "                    pop[0] for pop in population_2016], 'Adult': [pop[1] for pop in population_2016], 'Elderly': [pop[2] for pop in population_2016]})\n",
+    "df = concat([df_2015, df_2016])\n",
+    "\n",
+    "# sort incorrectly and store as csv\n",
+    "df.sort_values('Young', inplace=True)\n",
+    "df.to_csv(\"./scratch/us_sw_counties_population.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = DynamicGeoSpec(\n",
+    "    attributes=[\n",
+    "        geo_attrib('label', str, Shapes.N),\n",
+    "        geo_attrib('population', int, Shapes.N),\n",
+    "        geo_attrib('population_0-19', int, Shapes.N),\n",
+    "        geo_attrib('population_20-64', int, Shapes.N),\n",
+    "        geo_attrib('population_65+', int, Shapes.N),\n",
+    "        geo_attrib('population_by_age', int, Shapes.NxA(3))\n",
+    "    ],\n",
+    "    time_period=Year(2015),\n",
+    "    scope=CountyScope.in_states(['04', '08', '49', '35', '32']),\n",
+    "    source={\n",
+    "        'label': 'Census:name',\n",
+    "        'population': 'Census',\n",
+    "        'population_0-19': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
+    "                                       time_col=0, key_col=1, data_col=2, key_type=\"county_state\", skiprows=1),\n",
+    "        'population_20-64': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
+    "                                        time_col=0, key_col=1, data_col=3, key_type=\"county_state\", skiprows=1),\n",
+    "        'population_65+': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
+    "                                      time_col=0, key_col=1, data_col=4, key_type=\"county_state\", skiprows=1),\n",
+    "        'population_by_age': 'Census'\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
+    "\n",
+    "geo.validate()\n",
+    "\n",
+    "census_df = DataFrame({'Young': [pop[0] for pop in geo['population_by_age']], 'Adult': [\n",
+    "                      pop[1] for pop in geo['population_by_age']], 'Elderly': [pop[2] for pop in geo['population_by_age']]})\n",
+    "if not array_equal(geo['population_0-19'], census_df['Young']):\n",
+    "    raise Exception(\"Young data not equal.\")\n",
+    "if not array_equal(geo['population_20-64'], census_df['Adult']):\n",
+    "    raise Exception(\"Adult data not equal.\")\n",
+    "if not array_equal(geo['population_65+'], census_df['Elderly']):\n",
+    "    raise Exception(\"Elderly data not equal.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create and store 'counties_commuters_2020.csv'\n",
+    "counties_list = ['08001', '35001', '04013', '04017']\n",
+    "df = census_maker.fetch_commuters(CountyScope.in_counties(counties_list), 2020)\n",
+    "df['res_geoid'] = df['res_state_code'] + df['res_county_code']\n",
+    "df['wrk_geoid'] = df['wrk_state_code'].apply(lambda x: x[1:]) + df['wrk_county_code']\n",
+    "\n",
+    "df.to_csv('./scratch/counties_commuters_2020.csv',\n",
+    "          columns=['res_geoid', 'wrk_geoid', 'workers'], index=False)\n",
+    "\n",
+    "df.sort_values(by='workers', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = DynamicGeoSpec(\n",
+    "    attributes=[\n",
+    "        geo_attrib('label', str, Shapes.N),\n",
+    "        geo_attrib('population', int, Shapes.N),\n",
+    "        geo_attrib('commuters', int, Shapes.NxN),\n",
+    "        geo_attrib('commuters_census', int, Shapes.NxN)\n",
+    "    ],\n",
+    "    time_period=Year(2020),\n",
+    "    scope=CountyScope.in_counties(['35001', '04013', '04017']),\n",
+    "    source={\n",
+    "        'label': 'Census:name',\n",
+    "        'population': 'Census',\n",
+    "        'commuters': CSVSpecMatrix(file_path=Path(\"./scratch/counties_commuters_2020.csv\"),\n",
+    "                                   from_key_col=0, to_key_col=1, data_col=2, key_type=\"geoid\", skiprows=1),\n",
+    "        'commuters_census': 'Census:commuters'\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
+    "\n",
+    "geo.validate()\n",
+    "if not array_equal(geo['commuters'], geo['commuters_census']):\n",
+    "    raise Exception(\"Data not equal.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/doc/devlog/README.md b/doc/devlog/README.md
@@ -54,6 +54,7 @@ This folder is a handy place to put Jupyter notebooks or other documents which h
 | 2024-05-03.ipynb | Tyler | ✓ | Integration test: loading US Census geography from TIGER |
 | 2024-05-22.ipynb | Sachin |  | Integrating particle filter with epymorph. Propagating the particles using epymorph simulation and plot the infection rates |
 | 2024-06-03.ipynb | Trevor |  | Integration test: using dynamic geos to fetch Census data |
+| 2024-06-12.ipynb | Trevor |  | Integration test: CSV file ADRIOs |
 
 ## Contributing
 

diff --git a/epymorph/geo/adrio/__init__.py b/epymorph/geo/adrio/__init__.py
@@ -1,7 +1,9 @@
 """AdrioMaker library."""
 from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus
+from epymorph.geo.adrio.file.adrio_csv import ADRIOMakerCSV
 from epymorph.geo.dynamic import ADRIOMaker
 
 adrio_maker_library: dict[str, type[ADRIOMaker]] = {
     'Census': ADRIOMakerCensus,
+    'CSV': ADRIOMakerCSV
 }
diff --git a/epymorph/geo/adrio/adrio.py b/epymorph/geo/adrio/adrio.py
@@ -3,7 +3,7 @@
 and ADRIOMakers create ADRIOs for a data source and specialized for a geo's purposes.
 """
 from abc import ABC, abstractmethod
-from typing import Callable
+from typing import Any, Callable
 
 from numpy.typing import NDArray
 
@@ -40,8 +40,13 @@ class ADRIOMaker(ABC):
     """Abstract class to serve as an outline for ADRIO makers for specific data sources."""
     attributes: list[AttributeDef]
 
+    @staticmethod
     @abstractmethod
-    def make_adrio(self, attrib: AttributeDef, scope: GeoScope, time_period: TimePeriod) -> ADRIO:
+    def accepts_source(source: Any) -> bool:
+        """Checks whether the ADRIOMaker accepts a given source type and returns the result as a boolean."""
+
+    @abstractmethod
+    def make_adrio(self, attrib: AttributeDef, scope: GeoScope, time_period: TimePeriod, source: Any | None = None) -> ADRIO:
         """Creates an ADRIO to fetch the specified attribute for the specified time and place."""
 
 

diff --git a/epymorph/geo/adrio/census/adrio_census.py b/epymorph/geo/adrio/census/adrio_census.py
@@ -1,6 +1,7 @@
 import os
 from collections import defaultdict
 from functools import partial
+from typing import Any
 
 import numpy as np
 from census import Census
@@ -130,6 +131,10 @@ class ADRIOMakerCensus(ADRIOMaker):
     census: Census
     """Census API interface object."""
 
+    @staticmethod
+    def accepts_source(source: Any):
+        return False
+
     def __init__(self) -> None:
         """Initializer to create Census object."""
         api_key = os.environ.get('CENSUS_API_KEY')