Skip to content

Commit 41f9060

Browse files
Coerce model dimension dtypes to numeric (#575)
1 parent 0240bcc commit 41f9060

File tree

4 files changed

+101
-6
lines changed

4 files changed

+101
-6
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ Both the index and the values of the timeseries (both being date strings) should
2222

2323
|changed| `inbuilt` math -> `pre-defined` math and `custom` math -> `user-defined` math in the documentation.
2424

25+
|fixed| Dimensions with numeric data can be defined in tabular data _or_ YAML and will appear as numeric in the processed Calliope model input dataset.
26+
If all dimension data can be coerced to a numeric data type (e.g., `["10", 100, "-1"]`), then it _will_ be coerced (e.g., `[10, 100, -1]`).
27+
2528
## 0.7.0.dev2 (2024-01-26)
2629

2730
v0.7 includes a major change to how Calliope internally operates.

docs/creating/data_sources.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,11 @@ E.g.,
400400
data_sources:
401401
...
402402
```
403+
6. After loading dimension data, we process it according to a limited set of heuristics:
404+
1. We assume any dimension with the suffix `steps` (e.g., `timesteps`, `monthsteps`) is timeseries data, and attempt to convert the data type of the dimension values accordingly.
405+
2. We will attempt to convert dimension data to numeric values.
406+
Therefore, dimensions with the data `[1, 2]`, `["1", "2"]`, `[1, "2"]`, and `["1.0", 2.0]` will all be converted to having a numeric data type (integer or float).
407+
`["foo", "1"]` and `["foo", 1]` will _not_ be converted, as not all dimension data entries are convertible to numeric data types.
403408

404409
### Data you _cannot_ load in tabular format
405410

src/calliope/preprocess/model_data.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -695,18 +695,23 @@ def _links_to_node_format(self, active_node_dict: AttrDict) -> AttrDict:
695695

696696
return link_tech_dict
697697

698-
def _add_to_dataset(self, to_add: xr.Dataset, id: str):
698+
def _add_to_dataset(self, to_add: xr.Dataset, id_: str):
699699
"""Add new data to the central class dataset.
700700
701701
Before being added, any dimensions with the `steps` suffix will be cast to datetime dtype.
702702
703703
Args:
704704
to_add (xr.Dataset): Dataset to merge into the central dataset.
705-
id (str): ID of dataset being added, to use in log messages
705+
id_ (str): ID of dataset being added, to use in log messages
706706
"""
707-
to_add = time.timeseries_to_datetime(to_add, self.config["time_format"], id)
707+
to_add_numeric_dims = self._update_numeric_dims(to_add, id_)
708+
to_add_numeric_ts_dims = time.timeseries_to_datetime(
709+
to_add_numeric_dims, self.config["time_format"], id_
710+
)
708711
self.dataset = xr.merge(
709-
[to_add, self.dataset], combine_attrs="no_conflicts", compat="override"
712+
[to_add_numeric_ts_dims, self.dataset],
713+
combine_attrs="no_conflicts",
714+
compat="override",
710715
).fillna(self.dataset)
711716

712717
def _log_param_updates(self, param_name: str, param_da: xr.DataArray):
@@ -747,6 +752,33 @@ def _update_one_way_links(node_from_data: dict, node_to_data: dict):
747752
node_from_data.pop("carrier_out") # cannot import carriers at the `from` node
748753
node_to_data.pop("carrier_in") # cannot export carrier at the `to` node
749754

755+
@staticmethod
756+
def _update_numeric_dims(ds: xr.Dataset, id_: str) -> xr.Dataset:
757+
"""Try coercing all dimension data of the input dataset to a numeric data type.
758+
759+
Any dimensions where _all_ its data is potentially numeric will be returned with all data coerced to numeric.
760+
All other dimensions will be returned as they were in the input dataset.
761+
No changes are made to data variables in the dataset.
762+
763+
Args:
764+
ds (xr.Dataset): Dataset possibly containing numeric dimensions.
765+
id_ (str): Identifier for `ds` to use in logging.
766+
767+
Returns:
768+
xr.Dataset: Input `ds` with numeric coordinates.
769+
"""
770+
771+
for dim_name in ds.dims:
772+
try:
773+
ds.coords[dim_name] = pd.to_numeric(ds.coords[dim_name].to_index())
774+
LOGGER.debug(
775+
f"{id_} | Updating `{dim_name}` dimension index values to numeric type."
776+
)
777+
except ValueError:
778+
continue
779+
780+
return ds
781+
750782
def _raise_error_on_transmission_tech_def(
751783
self, tech_def_dict: AttrDict, node_name: str
752784
):

tests/test_preprocess_model_data.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -702,12 +702,67 @@ def test_add_to_dataset_no_timeseries(
702702
new_param = simple_da.copy().to_dataset(name="non_ts_data")
703703
model_data_factory._add_to_dataset(new_param, "foo")
704704

705-
assert "foo | Updating" not in my_caplog.text
706-
assert "datetime format" not in my_caplog.text
705+
assert "dimension index values to datetime format" not in my_caplog.text
707706
# make sure nothing has changed in the array
708707
assert "non_ts_data" in model_data_factory.dataset
709708
assert model_data_factory.dataset["non_ts_data"].equals(simple_da)
710709

710+
@pytest.mark.parametrize(
711+
["data", "kind"],
712+
[
713+
([1, 2], "i"),
714+
(["1", "2"], "i"),
715+
(["1", 2], "i"),
716+
([1, "2"], "i"),
717+
([1.0, 2.0], "f"),
718+
(["1.0", "2.0"], "f"),
719+
([1, "2.0"], "f"),
720+
(["1", 2.0], "f"),
721+
],
722+
)
723+
def test_update_numeric_dims(
724+
self, my_caplog, model_data_factory: ModelDataFactory, data, kind
725+
):
726+
new_idx = pd.Index(data, name="bar")
727+
new_param = pd.DataFrame({"my_data": [True, False]}, index=new_idx).to_xarray()
728+
updated_ds = model_data_factory._update_numeric_dims(new_param, "foo")
729+
730+
assert (
731+
"foo | Updating `bar` dimension index values to numeric type"
732+
in my_caplog.text
733+
)
734+
assert updated_ds.coords["bar"].dtype.kind == kind
735+
736+
@pytest.mark.parametrize(["data", "kind"], [(["1", 2], "i"), ([1.0, "2.0"], "f")])
737+
def test_update_numeric_dims_in_model_data(
738+
self, my_caplog, model_data_factory: ModelDataFactory, data, kind
739+
):
740+
new_idx = pd.Index(data, name="bar")
741+
new_param = pd.DataFrame({"num_data": [True, False]}, index=new_idx).to_xarray()
742+
model_data_factory._add_to_dataset(new_param, "foo")
743+
744+
assert (
745+
"foo | Updating `bar` dimension index values to numeric type"
746+
in my_caplog.text
747+
)
748+
assert model_data_factory.dataset.coords["bar"].dtype.kind == kind
749+
750+
@pytest.mark.parametrize(
751+
"data", [["foo", 2], [1.0, "foo"], ["foo", "bar"], ["Y1", "Y2"]]
752+
)
753+
def test_update_numeric_dims_no_update(
754+
self, my_caplog, model_data_factory: ModelDataFactory, data
755+
):
756+
new_idx = pd.Index(data, name="bar")
757+
new_param = pd.DataFrame({"ts_data": [True, False]}, index=new_idx).to_xarray()
758+
updated_ds = model_data_factory._update_numeric_dims(new_param, "foo")
759+
760+
assert (
761+
"foo | Updating `bar` dimension index values to numeric type"
762+
not in my_caplog.text
763+
)
764+
assert updated_ds.coords["bar"].dtype.kind not in ["f", "i"]
765+
711766
@pytest.mark.parametrize(
712767
["coords", "new_coords"],
713768
[(["foobar", "baz"], ["baz"]), (["bazfoo", "baz"], ["bazfoo", "baz"])],

0 commit comments

Comments
 (0)