Skip to content

Commit

Permalink
File adrios (#118)
Browse files Browse the repository at this point in the history
Adds new ADRIOMaker class capable of producing ADRIOs to fetch data attributes from CSV files in N, TxN, or NxN format.
  • Loading branch information
TJohnsonAZ authored Jun 20, 2024
1 parent db79f8c commit e399d5c
Show file tree
Hide file tree
Showing 8 changed files with 581 additions and 39 deletions.
255 changes: 255 additions & 0 deletions doc/devlog/2024-06-12.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# devlog 2024-06-12\n",
"\n",
"_author: Trevor Johnson_\n",
"\n",
"We have a new class of ADRIO capable of loading in data from CSV files with shapes N, TxN, and NxN. CSV files used must have a column (or two in the NxN case) to identify geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included if loading in time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).\n",
"\n",
"The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates geos with CSV ADRIOs to load the data into NDArrays. These geos also contain Census ADRIOs that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from datetime import date\n",
"from pathlib import Path\n",
"\n",
"from numpy import array_equal\n",
"\n",
"from epymorph.data_shape import Shapes\n",
"from epymorph.geo.adrio import adrio_maker_library\n",
"from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus\n",
"from epymorph.geo.adrio.file.adrio_csv import (CSVSpec, CSVSpecMatrix,\n",
" CSVSpecTime)\n",
"from epymorph.geo.dynamic import DynamicGeo\n",
"from epymorph.geo.spec import DynamicGeoSpec, Year\n",
"from epymorph.geography.us_census import (STATE, CountyScope, StateScope,\n",
" get_us_counties, get_us_states)\n",
"from epymorph.simulation import geo_attrib\n",
"from pandas import DataFrame, concat\n",
"\n",
"# create and store 'pei_population.csv'\n",
"census_maker = ADRIOMakerCensus()\n",
"states_list = ['AZ', 'FL', 'GA', 'MD', 'NY', 'NC', 'SC', 'VA']\n",
"population = census_maker.make_adrio(geo_attrib(\n",
" 'population', int, Shapes.N), StateScope.in_states_by_code(states_list), Year(2015))\n",
"df = DataFrame({'label': states_list, 'population': population.get_value()})\n",
"df.sort_values(by='population', inplace=True)\n",
"df.to_csv(\"./scratch/pei_population.csv\", header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"spec = DynamicGeoSpec(\n",
" attributes=[\n",
" geo_attrib('label', str, Shapes.N),\n",
" geo_attrib('population', int, Shapes.N),\n",
" geo_attrib('population_census', int, Shapes.N)\n",
" ],\n",
" time_period=Year(2015),\n",
" scope=StateScope.in_states(['12', '13', '24', '37', '45', '51']),\n",
" source={\n",
" 'label': 'Census:name',\n",
" 'population': CSVSpec(file_path=Path(\"./scratch/pei_population.csv\"),\n",
" key_col=0, data_col=1, key_type=\"state_abbrev\", skiprows=None),\n",
" 'population_census': 'Census:population'\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
"\n",
"# validate geo and ensure both ADRIOs fetched identical data\n",
"geo.validate()\n",
"if not array_equal(geo['population'], geo['population_census']):\n",
" raise Exception(\"Data not equal.\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# create and store 'us_sw_counties_population.csv'\n",
"\n",
"# get population-by-age data from ACS5\n",
"states_list = ['04', '08', '49', '35', '32']\n",
"population_2015 = census_maker.make_adrio(geo_attrib(\n",
" 'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2015)).get_value()\n",
"population_2016 = census_maker.make_adrio(geo_attrib(\n",
" 'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2016)).get_value()\n",
"\n",
"# get county and state info from shapefiles and convert to dataframes\n",
"counties_info = get_us_counties(2010)\n",
"states_info = get_us_states(2010)\n",
"counties_info_df = DataFrame({'state_geoid': [STATE.extract(\n",
" county_id) for county_id in counties_info.geoid], 'geoid': counties_info.geoid, 'name': counties_info.name})\n",
"states_info_df = DataFrame(\n",
" {'state_geoid': states_info.geoid, 'state_name': states_info.name})\n",
"\n",
"# merge dataframes and create \"County, State\" name column\n",
"merged_df = counties_info_df.merge(states_info_df, on='state_geoid')\n",
"merged_df['county_name'] = merged_df['name'] + \", \" + merged_df['state_name']\n",
"merged_df = merged_df.loc[merged_df['state_geoid'].isin(states_list)]\n",
"\n",
"# create and merge dataframes to be converted to csvs\n",
"df_2015 = DataFrame({'Date': [date(2015, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n",
" pop[0] for pop in population_2015], 'Adult': [pop[1] for pop in population_2015], 'Elderly': [pop[2] for pop in population_2015]})\n",
"df_2016 = DataFrame({'Date': [date(2016, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n",
" pop[0] for pop in population_2016], 'Adult': [pop[1] for pop in population_2016], 'Elderly': [pop[2] for pop in population_2016]})\n",
"df = concat([df_2015, df_2016])\n",
"\n",
"# sort incorrectly and store as csv\n",
"df.sort_values('Young', inplace=True)\n",
"df.to_csv(\"./scratch/us_sw_counties_population.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"spec = DynamicGeoSpec(\n",
" attributes=[\n",
" geo_attrib('label', str, Shapes.N),\n",
" geo_attrib('population', int, Shapes.N),\n",
" geo_attrib('population_0-19', int, Shapes.N),\n",
" geo_attrib('population_20-64', int, Shapes.N),\n",
" geo_attrib('population_65+', int, Shapes.N),\n",
" geo_attrib('population_by_age', int, Shapes.NxA(3))\n",
" ],\n",
" time_period=Year(2015),\n",
" scope=CountyScope.in_states(['04', '08', '49', '35', '32']),\n",
" source={\n",
" 'label': 'Census:name',\n",
" 'population': 'Census',\n",
" 'population_0-19': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
" time_col=0, key_col=1, data_col=2, key_type=\"county_state\", skiprows=1),\n",
" 'population_20-64': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
" time_col=0, key_col=1, data_col=3, key_type=\"county_state\", skiprows=1),\n",
" 'population_65+': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n",
" time_col=0, key_col=1, data_col=4, key_type=\"county_state\", skiprows=1),\n",
" 'population_by_age': 'Census'\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
"\n",
"geo.validate()\n",
"\n",
"census_df = DataFrame({'Young': [pop[0] for pop in geo['population_by_age']], 'Adult': [\n",
" pop[1] for pop in geo['population_by_age']], 'Elderly': [pop[2] for pop in geo['population_by_age']]})\n",
"if not array_equal(geo['population_0-19'], census_df['Young']):\n",
" raise Exception(\"Young data not equal.\")\n",
"if not array_equal(geo['population_20-64'], census_df['Adult']):\n",
" raise Exception(\"Adult data not equal.\")\n",
"if not array_equal(geo['population_65+'], census_df['Elderly']):\n",
" raise Exception(\"Elderly data not equal.\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# create and store 'counties_commuters_2020.csv'\n",
"counties_list = ['08001', '35001', '04013', '04017']\n",
"df = census_maker.fetch_commuters(CountyScope.in_counties(counties_list), 2020)\n",
"df['res_geoid'] = df['res_state_code'] + df['res_county_code']\n",
"df['wrk_geoid'] = df['wrk_state_code'].apply(lambda x: x[1:]) + df['wrk_county_code']\n",
"\n",
"# sort incorrectly BEFORE storing, so the CSV on disk is actually out of order\n",
"# and the CSV ADRIO has to re-sort it (matches the other data-creation cells)\n",
"df.sort_values(by='workers', inplace=True)\n",
"\n",
"df.to_csv('./scratch/counties_commuters_2020.csv',\n",
" columns=['res_geoid', 'wrk_geoid', 'workers'], index=False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"spec = DynamicGeoSpec(\n",
" attributes=[\n",
" geo_attrib('label', str, Shapes.N),\n",
" geo_attrib('population', int, Shapes.N),\n",
" geo_attrib('commuters', int, Shapes.NxN),\n",
" geo_attrib('commuters_census', int, Shapes.NxN)\n",
" ],\n",
" time_period=Year(2020),\n",
" scope=CountyScope.in_counties(['35001', '04013', '04017']),\n",
" source={\n",
" 'label': 'Census:name',\n",
" 'population': 'Census',\n",
" 'commuters': CSVSpecMatrix(file_path=Path(\"./scratch/counties_commuters_2020.csv\"),\n",
" from_key_col=0, to_key_col=1, data_col=2, key_type=\"geoid\", skiprows=1),\n",
" 'commuters_census': 'Census:commuters'\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n",
"\n",
"geo.validate()\n",
"if not array_equal(geo['commuters'], geo['commuters_census']):\n",
" raise Exception(\"Data not equal.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
1 change: 1 addition & 0 deletions doc/devlog/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ This folder is a handy place to put Jupyter notebooks or other documents which h
| 2024-05-03.ipynb | Tyler || Integration test: loading US Census geography from TIGER |
| 2024-05-22.ipynb | Sachin | | Integrating particle filter with epymorph. Propagating the particles using epymorph simulation and plot the infection rates |
| 2024-06-03.ipynb | Trevor | | Integration test: using dynamic geos to fetch Census data |
| 2024-06-12.ipynb | Trevor | | Integration test: CSV file ADRIOs |

## Contributing

Expand Down
2 changes: 2 additions & 0 deletions epymorph/geo/adrio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""AdrioMaker library."""
from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus
from epymorph.geo.adrio.file.adrio_csv import ADRIOMakerCSV
from epymorph.geo.dynamic import ADRIOMaker

# Library of the available ADRIOMaker classes, keyed by maker name.
adrio_maker_library: dict[str, type[ADRIOMaker]] = {
    'Census': ADRIOMakerCensus,
    'CSV': ADRIOMakerCSV,
}
9 changes: 7 additions & 2 deletions epymorph/geo/adrio/adrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
and ADRIOMakers create ADRIOs for a data source, specialized for a geo's purposes.
"""
from abc import ABC, abstractmethod
from typing import Callable
from typing import Any, Callable

from numpy.typing import NDArray

Expand Down Expand Up @@ -40,8 +40,13 @@ class ADRIOMaker(ABC):
"""Abstract class to serve as an outline for ADRIO makers for specific data sources."""
attributes: list[AttributeDef]

@staticmethod
@abstractmethod
def make_adrio(self, attrib: AttributeDef, scope: GeoScope, time_period: TimePeriod) -> ADRIO:
def accepts_source(source: Any) -> bool:
"""Checks whether the ADRIOMaker accepts a given source type and returns the result as a boolean."""

@abstractmethod
def make_adrio(self, attrib: AttributeDef, scope: GeoScope, time_period: TimePeriod, source: Any | None = None) -> ADRIO:
"""Creates an ADRIO to fetch the specified attribute for the specified time and place."""


Expand Down
5 changes: 5 additions & 0 deletions epymorph/geo/adrio/census/adrio_census.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from collections import defaultdict
from functools import partial
from typing import Any

import numpy as np
from census import Census
Expand Down Expand Up @@ -130,6 +131,10 @@ class ADRIOMakerCensus(ADRIOMaker):
census: Census
"""Census API interface object."""

@staticmethod
def accepts_source(source: Any) -> bool:
    """Report whether this maker accepts the given source; always False here."""
    return False

def __init__(self) -> None:
"""Initializer to create Census object."""
api_key = os.environ.get('CENSUS_API_KEY')
Expand Down
Loading

0 comments on commit e399d5c

Please sign in to comment.