diff --git a/doc/devlog/2024-08-26-prism-adrio-demo.ipynb b/doc/devlog/2024-08-26-prism-adrio-demo.ipynb index a5685e14..35b66f31 100644 --- a/doc/devlog/2024-08-26-prism-adrio-demo.ipynb +++ b/doc/devlog/2024-08-26-prism-adrio-demo.ipynb @@ -47,9 +47,22 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ADRIO data usage estimation:\n", + "- epymorph.adrio.us_tiger.GeometricCentroid will download 0 Bytes and write 0 Bytes to disk\n", + "- epymorph.adrio.prism.Precipitation will download 1.2 MB and write 1.2 MB to disk\n", + "In total we will:\n", + "- Download 1.2 MB, taking a second (assuming 1.0 MB/s)\n", + "- Write 1.2 MB to disk cache (you have 249.1 GB free space)\n" + ] + } + ], "source": [ "from epymorph.geography.us_census import CountyScope\n", "from epymorph import *\n", @@ -77,12 +90,22 @@ " },\n", ")\n", "\n", - "precipitation = evaluate_param(rume, \"precipitation\")" + "rume.estimate_data()\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "with sim_messaging():\n", + " precipitation = evaluate_param(rume, \"precipitation\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -164,9 +187,22 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ADRIO data usage estimation:\n", + "- epymorph.adrio.prism.Temperature will download 51.0 MB and write 51.0 MB to disk\n", + "- epymorph.adrio.us_tiger.GeometricCentroid will download 0 Bytes and write 0 Bytes to disk\n", + "In total we will:\n", + "- Download 51.0 MB, taking 51 seconds (assuming 1.0 MB/s)\n", + "- Write 51.0 MB to disk cache (you have 249.1 GB free space)\n" + ] + } + ], "source": [ "from epymorph.geography.us_census import StateScope\n", "from epymorph import *\n", @@ -186,25 +222,46 @@ " scope=state_scope,\n", " time_frame=time_period,\n", " params={\n", + " \"temperature\": prism.Temperature(time_period, \"Mean\"),\n", " \"population\": acs5.Population(),\n", " \"centroid\": us_tiger.GeometricCentroid(),\n", - " \"temperature\": prism.Temperature(time_period, \"Mean\"),\n", " },\n", ")\n", "\n", - "temperature = evaluate_param(rume, \"temperature\")" + "rume.estimate_data()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "States: ['04']\n", + "Loading epymorph.adrio.us_tiger.GeometricCentroid:\n", + " |####################| 100% (4.787s)\n", + "Loading epymorph.adrio.prism.Temperature:\n", + " |####################| 100% (21.742s)\n" + ] + } + ], + "source": [ + "with sim_messaging():\n", + " temperature = evaluate_param(rume, \"temperature\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Counties: ['04']\n", "\n", "Temperature in Celsius:\n", " [[18.10199928]\n", @@ -242,7 +299,7 @@ } ], "source": [ - "print(f\"States: {geoids}\\n\")\n", + "print(f\"Counties: {geoids}\\n\")\n", "\n", "print(f\"Temperature in Celsius:\\n {temperature}\\n\")" ] diff --git a/epymorph/adrio/prism.py b/epymorph/adrio/prism.py index 0d8188e9..67ceb6d4 100644 --- a/epymorph/adrio/prism.py +++ b/epymorph/adrio/prism.py @@ -10,20 +10,55 @@ from numpy.typing import NDArray from typing_extensions import override -from epymorph.adrio.adrio import Adrio -from epymorph.cache import load_or_fetch_url, module_cache_path +from epymorph.adrio.adrio import Adrio, ProgressCallback, adrio_cache +from epymorph.cache import check_file_in_cache, load_or_fetch_url, module_cache_path from epymorph.data_shape import Shapes from epymorph.data_type import CentroidType +from epymorph.data_usage import DataEstimate from epymorph.error import DataResourceException from epymorph.geography.scope import GeoScope from epymorph.geography.us_census import STATE, CensusScope +from epymorph.geography.us_tiger import CacheEstimate from epymorph.simulation import AttributeDef, TimeFrame _PRISM_CACHE_PATH = module_cache_path(__name__) +def _generate_file_name( + attribute: str, + latest_date: datetype, + last_completed_month: datetype, + date: datetype, +) -> tuple[str, str]: + """ + Generates the url for the given date and climate attribute. Returns a tuple + of strings with the url and the name of the bil file within the zip file. + """ + + if date.year == latest_date.year and date.month == latest_date.month: + stability = "early" + + # if it is before the last finished month + elif date > last_completed_month: + stability = "provisional" + + # if it is older than 6 completed months + else: + stability = "stable" + + # format the date for the url + formatted_date = date.strftime("%Y%m%d") + year = date.year + + url = f"https://ftp.prism.oregonstate.edu/daily/{attribute}/{year}/PRISM_{attribute}_{stability}_4kmD2_{formatted_date}_bil.zip" + + bil_name = f"PRISM_{attribute}_{stability}_4kmD2_{formatted_date}_bil.bil" + + return url, bil_name + + def _fetch_raster( - attribute: str, date_range: TimeFrame + attribute: str, date_range: TimeFrame, progress: ProgressCallback ) -> Generator[BytesIO, None, None]: """ Fetches the raster values at the url with the given attribute and date range. @@ -43,26 +78,13 @@ def _fetch_raster( six_months_ago = datetype.today() + relativedelta(months=-6) last_completed_month = six_months_ago.replace(day=1) - timedelta(days=1) - for single_date in date_list: - if ( - single_date.year == latest_date.year - and single_date.month == latest_date.month - ): - stability = "early" - - # if it is before the last finished month - elif single_date > last_completed_month: - stability = "provisional" + # for progress tracking + processing_steps = len(date_list) + 1 - # if it is older than 6 completed months - else: - stability = "stable" - - # format the date for the url - formatted_date = single_date.strftime("%Y%m%d") - year = single_date.year - - url = f"https://ftp.prism.oregonstate.edu/daily/{attribute}/{year}/PRISM_{attribute}_{stability}_4kmD2_{formatted_date}_bil.zip" + for i, single_date in enumerate(date_list): + url, bil_name = _generate_file_name( + attribute, latest_date, last_completed_month, single_date + ) # load/fetch the url for the file try: @@ -71,18 +93,23 @@ def _fetch_raster( except Exception as e: raise DataResourceException("Unable to fetch PRISM data.") from e - file.name = f"PRISM_{attribute}_{stability}_4kmD2_{formatted_date}_bil.bil" + # if the progress isnt None + if progress is not None: + # progress by one, increasing percentage done + progress((i + 1) / processing_steps, None) + + file.name = bil_name yield file def _make_centroid_strategy_adrio( - attribute: str, date: TimeFrame, centroids: NDArray + attribute: str, date: TimeFrame, centroids: NDArray, progress: ProgressCallback ) -> NDArray[np.float64]: """ Retrieves the raster value at a centroid of a granularity. """ - raster_files = _fetch_raster(attribute, date) + raster_files = _fetch_raster(attribute, date, progress) results = [] # read in each file @@ -128,6 +155,56 @@ def _validate_scope(scope: GeoScope) -> CensusScope: return scope +def _estimate_prism( + self, file_size: int, date_range: TimeFrame, attribute: str +) -> DataEstimate: + """ + Calculate estimates for downloading PRISM files. + """ + est_file_size = file_size + total_files = date_range.duration_days + + # setup urls as list to check if theyre in the cache + + # setup date variables + first_day = date_range.start_date + last_day = date_range.end_date + latest_date = datetype.today() - timedelta(days=1) + six_months_ago = datetype.today() + relativedelta(months=-6) + last_completed_month = six_months_ago.replace(day=1) - timedelta(days=1) + date_list = [ + first_day + timedelta(days=x) for x in range((last_day - first_day).days + 1) + ] + + # get url names to check in cache + urls = [ + _generate_file_name(attribute, latest_date, last_completed_month, day)[0] + for day in date_list + ] + + # sum the files needed to download + missing_files = total_files - sum( + 1 for u in urls if check_file_in_cache(_PRISM_CACHE_PATH / Path(u).name) + ) + + # calculate the cache estimate + est = CacheEstimate( + total_cache_size=total_files * est_file_size, + missing_cache_size=missing_files * est_file_size, + ) + + key = f"prism:{attribute}:{date_range}" + return DataEstimate( + name=self.full_name, + cache_key=key, + new_network_bytes=est.missing_cache_size, + new_cache_bytes=est.missing_cache_size, + total_cache_bytes=est.total_cache_size, + max_bandwidth=None, + ) + + +@adrio_cache class Precipitation(Adrio[np.float64]): """ Creates an TxN matrix of floats representing the amount of precipitation in an area, @@ -141,12 +218,19 @@ class Precipitation(Adrio[np.float64]): def __init__(self, date_range: TimeFrame): self.date_range = _validate_dates(date_range) + def estimate_data(self) -> DataEstimate: + file_size = 1_200_000 # no significant change in size, average to about 1.2MB + est = _estimate_prism(self, file_size, self.date_range, "ppt") + return est + @override def evaluate_adrio(self) -> NDArray[np.float64]: scope = self.scope scope = _validate_scope(scope) centroids = self.data("centroid") - raster_vals = _make_centroid_strategy_adrio("ppt", self.date_range, centroids) + raster_vals = _make_centroid_strategy_adrio( + "ppt", self.date_range, centroids, self.progress + ) return raster_vals @@ -163,13 +247,23 @@ class DewPoint(Adrio[np.float64]): def __init__(self, date_range: TimeFrame): self.date_range = _validate_dates(date_range) + def estimate_data(self) -> DataEstimate: + year = self.date_range.end_date.year + + # file sizes are larger after the year 2020 + if year > 2020: + file_size = 1_800_000 # average to 1.8MB after 2020 + else: + file_size = 1_400_000 # average to 1.4MB 2020 and before + return _estimate_prism(self, file_size, self.date_range, "tdmean") + @override def evaluate_adrio(self) -> NDArray[np.float64]: scope = self.scope scope = _validate_scope(scope) centroids = self.data("centroid") raster_vals = _make_centroid_strategy_adrio( - "tdmean", self.date_range, centroids + "tdmean", self.date_range, centroids, self.progress ) return raster_vals @@ -177,7 +271,7 @@ def evaluate_adrio(self) -> NDArray[np.float64]: class Temperature(Adrio[np.float64]): """ Creates an TxN matrix of floats representing the temperature in an area, represented - in degrees Celsius (°C). + in degrees Celsius (°C). """ date_range: TimeFrame @@ -198,6 +292,17 @@ def __init__(self, date_range: TimeFrame, temp_var: TemperatureType): self.temp_var = temp_var self.date_range = _validate_dates(date_range) + def estimate_data(self) -> DataEstimate: + year = self.date_range.end_date.year + temp_var = self.temp_variables[self.temp_var] + + # file sizes are larger after the year 2020 + if year > 2020: + file_size = 1_700_000 # average to 1.7MB after 2020 + else: + file_size = 1_400_000 # average to 1.4MB 2020 and before + return _estimate_prism(self, file_size, self.date_range, temp_var) + @override def evaluate_adrio(self) -> NDArray[np.float64]: scope = self.scope @@ -205,7 +310,7 @@ def evaluate_adrio(self) -> NDArray[np.float64]: temp_var = self.temp_variables[self.temp_var] centroids = self.data("centroid") raster_vals = _make_centroid_strategy_adrio( - temp_var, self.date_range, centroids + temp_var, self.date_range, centroids, self.progress ) return raster_vals @@ -214,7 +319,7 @@ def evaluate_adrio(self) -> NDArray[np.float64]: class VaporPressureDeficit(Adrio[np.float64]): """ Creates an TxN matrix of floats representing the vapor pressure deficit in an area, - represented in hectopascals (hPa). + represented in hectopascals (hPa). """ date_range: TimeFrame @@ -231,11 +336,23 @@ def __init__(self, date_range: TimeFrame, vpd_var: VPDType): self.vpd_var = vpd_var self.date_range = _validate_dates(date_range) + def estimate_data(self) -> DataEstimate: + year = self.date_range.end_date.year + + # file sizes are larger after the year 2020 + if year > 2020: + file_size = 1_700_000 # average to 1.7MB after 2020 + else: + file_size = 1_300_000 # average to 1.3MB 2020 and before + return _estimate_prism(self, file_size, self.date_range, self.vpd_var) + @override def evaluate_adrio(self) -> NDArray[np.float64]: scope = self.scope scope = _validate_scope(scope) vpd_var = self.vpd_variables[self.vpd_var] centroids = self.data("centroid") - raster_vals = _make_centroid_strategy_adrio(vpd_var, self.date_range, centroids) + raster_vals = _make_centroid_strategy_adrio( + vpd_var, self.date_range, centroids, self.progress + ) return raster_vals