GeoScopes and an encoding of the main US Census geographic hierarchy. (#108)

Removes pygris dependency from epymorph.plots
NAU-CCL · May 6, 2024 · 008c435 · 008c435
1 parent 5544701
commit 008c435
Show file tree

Hide file tree

Showing 13 changed files with 1,298 additions and 58 deletions.
diff --git a/doc/demo/02-states-GEO.ipynb b/doc/demo/02-states-GEO.ipynb
diff --git a/doc/demo/03-counties-GEO.ipynb b/doc/demo/03-counties-GEO.ipynb
diff --git a/doc/devlog/2024-05-03.ipynb b/doc/devlog/2024-05-03.ipynb
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# devlog 2024-05-03\n",
+    "\n",
+    "_Author: Tyler Coles_\n",
+    "\n",
+    "Testing our us_census functions for loading canonical sets of IDs for Census granularities from state to block group for the years 2000, 2010, and 2020.\n",
+    "\n",
+    "Since this is our source of truth for these delineations, we want to make sure we're getting complete data. One thing we can test is that at each level of granularity (above block group) each node should contain at least one child node. That is, every state should contain at least one county, every county at least one tract, and every tract at least one block group. Otherwise we know something is missing.\n",
+    "\n",
+    "(This may seem like a trivial test, but in fact it discovered that my original assumptions about how TIGER provides the data were invalid and has already saved us from bugs!)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import epymorph.geography.us_census as c\n",
+    "\n",
+    "\n",
+    "class Fail(Exception):\n",
+    "    pass\n",
+    "\n",
+    "\n",
+    "def test_year(year: c.CensusYear) -> None:\n",
+    "    # 1. test that we have 56 states\n",
+    "    states = c.get_us_states(year).geoid\n",
+    "\n",
+    "    if len(states) != 56:\n",
+    "        raise Fail(\"There weren't 56 states!\")\n",
+    "\n",
+    "    # 2. test that each state contains at least one county\n",
+    "    counties = c.get_us_counties(year).geoid\n",
+    "    counties_by_state = c.STATE.grouped(counties)\n",
+    "\n",
+    "    exs = []\n",
+    "    for x in states:\n",
+    "        if x not in counties_by_state or len(counties_by_state[x]) == 0:\n",
+    "            exs.append(Fail(f\"State {x} does not have at least one county.\"))\n",
+    "    if len(exs) > 0:\n",
+    "        raise ExceptionGroup(\"Failed checking counties.\", exs)\n",
+    "\n",
+    "    # 3. test that each county contains at least one tract\n",
+    "    tracts = c.get_us_tracts(year).geoid\n",
+    "    tracts_by_county = c.COUNTY.grouped(tracts)\n",
+    "\n",
+    "    exs = []\n",
+    "    for x in counties:\n",
+    "        if x not in tracts_by_county or len(tracts_by_county[x]) == 0:\n",
+    "            exs.append(Fail(f\"County {x} does not have at least one tract.\"))\n",
+    "    if len(exs) > 0:\n",
+    "        raise ExceptionGroup(\"Failed checking tracts.\", exs)\n",
+    "\n",
+    "    # 4. test that each tract contains at least one block group\n",
+    "    cbgs = c.get_us_block_groups(year).geoid\n",
+    "    cbgs_by_tract = c.TRACT.grouped(cbgs)\n",
+    "\n",
+    "    exs = []\n",
+    "    for x in tracts:\n",
+    "        if x not in cbgs_by_tract or len(cbgs_by_tract[x]) == 0:\n",
+    "            exs.append(Fail(f\"Tract {x} does not have at least one block group.\"))\n",
+    "    if len(exs) > 0:\n",
+    "        raise ExceptionGroup(\"Failed checking block groups.\", exs)\n",
+    "\n",
+    "    print(f\"Census year {year} passed!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Census year 2020 passed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_year(2020)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Census year 2010 passed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_year(2010)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Census year 2000 passed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_year(2000)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/doc/devlog/README.md b/doc/devlog/README.md
@@ -49,6 +49,7 @@ This folder is a handy place to put Jupyter notebooks or other documents which h
 | 2024-04-04-draw-demo.ipynb | Izaac | | Showing the new draw module for visualising IPMs (NEW!) |
 | 2024-04-16.ipynb | Izaac | | Showing error handling for common ipm errors (NEW!)|
 | 2024-04-25.ipynb | Tyler | | Integration test: epymorph cache utilities |
+| 2024-05-03.ipynb | Tyler | | Integration test: loading US Census geography from TIGER |
 
 ## Contributing
 

diff --git a/epymorph/error.py b/epymorph/error.py
@@ -6,6 +6,12 @@
 from typing import Self
 
 
+class GeographyError(Exception):
+    """Exception raised for errors in working with geographic system representations."""
+    # NOTE: this is *not* for general errors related to the epymorph GEO module;
+    # it is for things like the utility functions that work with US Census delineations.
+
+
 class UnknownModel(Exception):
     """Exception for the inability to load a model as specified."""
     model_type: str

diff --git a/epymorph/geography/__init__.py b/epymorph/geography/__init__.py
diff --git a/epymorph/geography/scope.py b/epymorph/geography/scope.py
@@ -0,0 +1,13 @@
+from abc import abstractmethod
+from typing import Protocol
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+class GeoScope(Protocol):
+    """The common (structural, `typing.Protocol`) interface expected of all geo scopes."""
+
+    @abstractmethod
+    def get_node_ids(self) -> NDArray[np.str_]:
+        """Return the complete list of node IDs included in this scope as a numpy string array."""
diff --git a/epymorph/geography/test/__init__.py b/epymorph/geography/test/__init__.py
diff --git a/epymorph/geography/test/us_census_test.py b/epymorph/geography/test/us_census_test.py
@@ -0,0 +1,119 @@
+# pylint: disable=missing-docstring
+import unittest
+
+import numpy as np
+
+from epymorph.error import GeographyError
+from epymorph.geography.us_census import (BLOCK, BLOCK_GROUP,
+                                          CENSUS_GRANULARITY, COUNTY, STATE,
+                                          TRACT)
+
+
+class TestCensusGranularity(unittest.TestCase):
+
+    def test_is_nested(self):
+        # np.tri(5) is lower-triangular: entry i is nested in entry j exactly when j <= i
+        expected = np.tri(5)
+        for i, test in enumerate(CENSUS_GRANULARITY):
+            for j, outer in enumerate(CENSUS_GRANULARITY):
+                if expected[i, j] == 1:
+                    self.assertTrue(test.is_nested(outer.name))
+                else:
+                    self.assertFalse(test.is_nested(outer.name))
+
+    def test_matches(self):
+        self.assertTrue(STATE.matches("04"))
+        self.assertTrue(COUNTY.matches("04003"))
+        self.assertTrue(TRACT.matches("04003999999"))
+        self.assertTrue(BLOCK_GROUP.matches("040039999998"))
+        self.assertTrue(BLOCK.matches("040039999998777"))
+
+        self.assertFalse(STATE.matches("0"))
+        self.assertFalse(STATE.matches(""))
+        self.assertFalse(STATE.matches("04003"))
+        self.assertFalse(STATE.matches("AZ"))
+        self.assertFalse(COUNTY.matches("04"))
+        self.assertFalse(COUNTY.matches("04003999999"))
+
+    def test_extract(self):
+        self.assertEqual("04", STATE.extract("04"))
+        self.assertEqual("04", STATE.extract("04003"))
+        self.assertEqual("04", STATE.extract("04003999999"))
+        self.assertEqual("04", STATE.extract("040039999998"))
+        self.assertEqual("04", STATE.extract("040039999998777"))
+
+        self.assertEqual("003", COUNTY.extract("04003"))
+        self.assertEqual("003", COUNTY.extract("04003999999"))
+        self.assertEqual("003", COUNTY.extract("040039999998"))
+        self.assertEqual("003", COUNTY.extract("040039999998777"))
+        with self.assertRaises(GeographyError):
+            COUNTY.extract("04")
+
+        self.assertEqual("999999", TRACT.extract("04003999999"))
+        self.assertEqual("999999", TRACT.extract("040039999998"))
+        self.assertEqual("999999", TRACT.extract("040039999998777"))
+        with self.assertRaises(GeographyError):
+            TRACT.extract("04")
+
+        self.assertEqual("8", BLOCK_GROUP.extract("040039999998"))
+        self.assertEqual("8", BLOCK_GROUP.extract("040039999998777"))
+        with self.assertRaises(GeographyError):
+            BLOCK_GROUP.extract("04")
+
+        self.assertEqual("8777", BLOCK.extract("040039999998777"))  # block part starts with the block group digit
+        with self.assertRaises(GeographyError):
+            BLOCK.extract("04")
+
+    def test_truncate(self):
+        self.assertEqual("04", STATE.truncate("04"))
+        self.assertEqual("04", STATE.truncate("04003"))
+        self.assertEqual("04", STATE.truncate("04003999999"))
+        self.assertEqual("04", STATE.truncate("040039999998"))
+        self.assertEqual("04", STATE.truncate("040039999998777"))
+
+        self.assertEqual("04003", COUNTY.truncate("04003"))
+        self.assertEqual("04003", COUNTY.truncate("04003999999"))
+        self.assertEqual("04003", COUNTY.truncate("040039999998"))
+        self.assertEqual("04003", COUNTY.truncate("040039999998777"))
+
+        self.assertEqual("04003999999", TRACT.truncate("04003999999"))
+        self.assertEqual("04003999999", TRACT.truncate("040039999998"))
+        self.assertEqual("04003999999", TRACT.truncate("040039999998777"))
+
+        self.assertEqual("040039999998", BLOCK_GROUP.truncate("040039999998"))
+        self.assertEqual("040039999998", BLOCK_GROUP.truncate("040039999998777"))
+
+        self.assertEqual("040039999998777", BLOCK.truncate("040039999998777"))
+
+    def test_truncate_list(self):
+        exp = ["08", "35", "04"]  # unique state IDs, in order of first appearance
+        act = STATE.truncate_list(["08001", "35", "04003", "08002", "04005", "35005"])
+        self.assertEqual(exp, act)
+
+    def test_decompose(self):
+        self.assertEqual(("04",), STATE.decompose("04"))
+        self.assertEqual(("04", "003"), COUNTY.decompose("04003"))
+        self.assertEqual(("04", "003", "999999"), TRACT.decompose("04003999999"))
+        self.assertEqual(("04", "003", "999999", "8"),
+                         BLOCK_GROUP.decompose("040039999998"))
+        self.assertEqual(("04", "003", "999999", "8", "8777"),
+                         BLOCK.decompose("040039999998777"))
+
+        with self.assertRaises(GeographyError):
+            STATE.decompose("04013")
+
+        with self.assertRaises(GeographyError):
+            TRACT.decompose("04013")
+
+    def test_grouped(self):
+        expected = {
+            "04004": np.array(["04004111111", "04004222222", "04004333333"]),
+            "04013": np.array(["04013444444", "04013555555", "04013666666"]),
+        }
+        actual = COUNTY.grouped(np.array([
+            "04004111111", "04004222222", "04004333333",
+            "04013444444", "04013555555", "04013666666",
+        ]))
+        self.assertSetEqual(set(expected.keys()), set(actual.keys()))
+        np.testing.assert_array_equal(expected["04004"], actual["04004"])
+        np.testing.assert_array_equal(expected["04013"], actual["04013"])