-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds new ADRIOMaker class capable of producing ADRIOs to fetch data attributes from CSV files in N, TxN, or NxN format.
- Loading branch information
1 parent
db79f8c
commit e399d5c
Showing
8 changed files
with
581 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# devlog 2024-06-12\n", | ||
"\n", | ||
"_author: Trevor Johnson_\n", | ||
"\n", | ||
"We have a new class of ADRIO capable of loading data from CSV files with shapes N, TxN, and NxN. CSV files used must have a column (or two in the NxN case) to identify geographic location and a column containing the relevant data. A time column in YYYY-MM-DD format must also be included when loading time-series data. Available formats for geographic identifiers are state_abbrev (AZ), county_state (Maricopa, Arizona), and geoid (04013).\n", | ||
"\n", | ||
"The following notebook creates a series of incorrectly sorted CSV files with various data formats and geographic identifiers, then creates geos with CSV ADRIOs to load the data into NDArrays. These geos also contain Census ADRIOs that fetch identical data and are used as a source of truth as to whether the CSV ADRIOs fetched, filtered, and sorted their data correctly." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from datetime import date\n", | ||
"from pathlib import Path\n", | ||
"\n", | ||
"from numpy import array_equal\n", | ||
"\n", | ||
"from epymorph.data_shape import Shapes\n", | ||
"from epymorph.geo.adrio import adrio_maker_library\n", | ||
"from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus\n", | ||
"from epymorph.geo.adrio.file.adrio_csv import (CSVSpec, CSVSpecMatrix,\n", | ||
" CSVSpecTime)\n", | ||
"from epymorph.geo.dynamic import DynamicGeo\n", | ||
"from epymorph.geo.spec import DynamicGeoSpec, Year\n", | ||
"from epymorph.geography.us_census import (STATE, CountyScope, StateScope,\n", | ||
" get_us_counties, get_us_states)\n", | ||
"from epymorph.simulation import geo_attrib\n", | ||
"from pandas import DataFrame, concat\n", | ||
"\n", | ||
"# create and store 'pei_population.csv'\n", | ||
"census_maker = ADRIOMakerCensus()\n", | ||
"states_list = ['AZ', 'FL', 'GA', 'MD', 'NY', 'NC', 'SC', 'VA']\n", | ||
"population = census_maker.make_adrio(geo_attrib(\n", | ||
" 'population', int, Shapes.N), StateScope.in_states_by_code(states_list), Year(2015))\n", | ||
"df = DataFrame({'label': states_list, 'population': population.get_value()})\n", | ||
"df.sort_values(by='population', inplace=True)\n", | ||
"df.to_csv(\"./scratch/pei_population.csv\", header=False, index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spec = DynamicGeoSpec(\n", | ||
" attributes=[\n", | ||
" geo_attrib('label', str, Shapes.N),\n", | ||
" geo_attrib('population', int, Shapes.N),\n", | ||
" geo_attrib('population_census', int, Shapes.N)\n", | ||
" ],\n", | ||
" time_period=Year(2015),\n", | ||
" scope=StateScope.in_states(['12', '13', '24', '37', '45', '51']),\n", | ||
" source={\n", | ||
" 'label': 'Census:name',\n", | ||
" 'population': CSVSpec(file_path=Path(\"./scratch/pei_population.csv\"),\n", | ||
" key_col=0, data_col=1, key_type=\"state_abbrev\", skiprows=None),\n", | ||
" 'population_census': 'Census:population'\n", | ||
" }\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n", | ||
"\n", | ||
"# validate geo and ensure both ADRIOs fetched identical data\n", | ||
"geo.validate()\n", | ||
"if not array_equal(geo['population'], geo['population_census']):\n", | ||
" raise Exception(\"Data not equal.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create and store 'us_sw_counties_population.csv'\n", | ||
"\n", | ||
"# get population-by-age data from ACS5\n", | ||
"states_list = ['04', '08', '49', '35', '32']\n", | ||
"population_2015 = census_maker.make_adrio(geo_attrib(\n", | ||
" 'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2015)).get_value()\n", | ||
"population_2016 = census_maker.make_adrio(geo_attrib(\n", | ||
" 'population_by_age', int, Shapes.NxA(3)), CountyScope.in_states(states_list), Year(2016)).get_value()\n", | ||
"\n", | ||
"# get county and state info from shapefiles and convert to dataframes\n", | ||
"counties_info = get_us_counties(2010)\n", | ||
"states_info = get_us_states(2010)\n", | ||
"counties_info_df = DataFrame({'state_geoid': [STATE.extract(\n", | ||
" county_id) for county_id in counties_info.geoid], 'geoid': counties_info.geoid, 'name': counties_info.name})\n", | ||
"states_info_df = DataFrame(\n", | ||
" {'state_geoid': states_info.geoid, 'state_name': states_info.name})\n", | ||
"\n", | ||
"# merge dataframes and create \"County, State\" name column\n", | ||
"merged_df = counties_info_df.merge(states_info_df, on='state_geoid')\n", | ||
"merged_df['county_name'] = merged_df['name'] + \", \" + merged_df['state_name']\n", | ||
"merged_df = merged_df.loc[merged_df['state_geoid'].isin(states_list)]\n", | ||
"\n", | ||
"# create and merge dataframes to be converted to csvs\n", | ||
"df_2015 = DataFrame({'Date': [date(2015, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n", | ||
" pop[0] for pop in population_2015], 'Adult': [pop[1] for pop in population_2015], 'Elderly': [pop[2] for pop in population_2015]})\n", | ||
"df_2016 = DataFrame({'Date': [date(2016, 1, 1) for i in merged_df.index], 'County': merged_df['county_name'], 'Young': [\n", | ||
" pop[0] for pop in population_2016], 'Adult': [pop[1] for pop in population_2016], 'Elderly': [pop[2] for pop in population_2016]})\n", | ||
"df = concat([df_2015, df_2016])\n", | ||
"\n", | ||
"# sort incorrectly and store as csv\n", | ||
"df.sort_values('Young', inplace=True)\n", | ||
"df.to_csv(\"./scratch/us_sw_counties_population.csv\", index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spec = DynamicGeoSpec(\n", | ||
" attributes=[\n", | ||
" geo_attrib('label', str, Shapes.N),\n", | ||
" geo_attrib('population', int, Shapes.N),\n", | ||
" geo_attrib('population_0-19', int, Shapes.N),\n", | ||
" geo_attrib('population_20-64', int, Shapes.N),\n", | ||
" geo_attrib('population_65+', int, Shapes.N),\n", | ||
" geo_attrib('population_by_age', int, Shapes.NxA(3))\n", | ||
" ],\n", | ||
" time_period=Year(2015),\n", | ||
" scope=CountyScope.in_states(['04', '08', '49', '35', '32']),\n", | ||
" source={\n", | ||
" 'label': 'Census:name',\n", | ||
" 'population': 'Census',\n", | ||
" 'population_0-19': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n", | ||
" time_col=0, key_col=1, data_col=2, key_type=\"county_state\", skiprows=1),\n", | ||
" 'population_20-64': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n", | ||
" time_col=0, key_col=1, data_col=3, key_type=\"county_state\", skiprows=1),\n", | ||
" 'population_65+': CSVSpecTime(file_path=Path(\"./scratch/us_sw_counties_population.csv\"),\n", | ||
" time_col=0, key_col=1, data_col=4, key_type=\"county_state\", skiprows=1),\n", | ||
" 'population_by_age': 'Census'\n", | ||
" }\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n", | ||
"\n", | ||
"geo.validate()\n", | ||
"\n", | ||
"census_df = DataFrame({'Young': [pop[0] for pop in geo['population_by_age']], 'Adult': [\n", | ||
" pop[1] for pop in geo['population_by_age']], 'Elderly': [pop[2] for pop in geo['population_by_age']]})\n", | ||
"if not array_equal(geo['population_0-19'], census_df['Young']):\n", | ||
" raise Exception(\"Young data not equal.\")\n", | ||
"if not array_equal(geo['population_20-64'], census_df['Adult']):\n", | ||
" raise Exception(\"Adult data not equal.\")\n", | ||
"if not array_equal(geo['population_65+'], census_df['Elderly']):\n", | ||
" raise Exception(\"Elderly data not equal.\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# create and store 'counties_commuters_2020.csv'\n", | ||
"counties_list = ['08001', '35001', '04013', '04017']\n", | ||
"df = census_maker.fetch_commuters(CountyScope.in_counties(counties_list), 2020)\n", | ||
"df['res_geoid'] = df['res_state_code'] + df['res_county_code']\n", | ||
"df['wrk_geoid'] = df['wrk_state_code'].apply(lambda x: x[1:]) + df['wrk_county_code']\n", | ||
"\n", | ||
"df.to_csv('./scratch/counties_commuters_2020.csv',\n", | ||
" columns=['res_geoid', 'wrk_geoid', 'workers'], index=False)\n", | ||
"\n", | ||
"df.sort_values(by='workers', inplace=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"spec = DynamicGeoSpec(\n", | ||
" attributes=[\n", | ||
" geo_attrib('label', str, Shapes.N),\n", | ||
" geo_attrib('population', int, Shapes.N),\n", | ||
" geo_attrib('commuters', int, Shapes.NxN),\n", | ||
" geo_attrib('commuters_census', int, Shapes.NxN)\n", | ||
" ],\n", | ||
" time_period=Year(2020),\n", | ||
" scope=CountyScope.in_counties(['35001', '04013', '04017']),\n", | ||
" source={\n", | ||
" 'label': 'Census:name',\n", | ||
" 'population': 'Census',\n", | ||
" 'commuters': CSVSpecMatrix(file_path=Path(\"./scratch/counties_commuters_2020.csv\"),\n", | ||
" from_key_col=0, to_key_col=1, data_col=2, key_type=\"geoid\", skiprows=1),\n", | ||
" 'commuters_census': 'Census:commuters'\n", | ||
" }\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"geo = DynamicGeo.from_library(spec, adrio_maker_library)\n", | ||
"\n", | ||
"geo.validate()\n", | ||
"if not array_equal(geo['commuters'], geo['commuters_census']):\n", | ||
" raise Exception(\"Data not equal.\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": ".venv", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
"""AdrioMaker library.""" | ||
from epymorph.geo.adrio.census.adrio_census import ADRIOMakerCensus | ||
from epymorph.geo.adrio.file.adrio_csv import ADRIOMakerCSV | ||
from epymorph.geo.dynamic import ADRIOMaker | ||
|
||
adrio_maker_library: dict[str, type[ADRIOMaker]] = { | ||
'Census': ADRIOMakerCensus, | ||
'CSV': ADRIOMakerCSV | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.