From a434664dd1db0a7bbd5160120390883297f059cc Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:06:16 +0000 Subject: [PATCH 01/17] add to __all__ missing function --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index f4dcdf4c8..0d8df0d74 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -14,7 +14,12 @@ # limitations under the License. """Generates a JSON file representing the genome metadata from a core database.""" -__all__ = ["get_genome_metadata", "filter_genome_meta", "check_assembly_version"] +__all__ = [ + "get_genome_metadata", + "filter_genome_meta", + "check_assembly_version", + "check_genebuild_version", +] import json from typing import Any, Dict From 2443c14bba5dcfaaad98d1a122df062f6c7c117a Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:09:17 +0000 Subject: [PATCH 02/17] update check_assembly_version and check_genebuild_version --- .../io/genomio/genome_metadata/dump.py | 57 +++++++++---------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 0d8df0d74..e3f7c1615 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -125,58 +125,57 @@ def filter_genome_meta(gmeta: Dict[str, Any]) -> Dict[str, Any]: return gmeta_out -def check_assembly_version(gmeta_out: Dict[str, Any]) -> None: - """Update the assembly version of the genome metadata provided to use an integer. - Get the version from the assembly accession as alternative. +def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: + """Updates the assembly version of the genome metadata provided. + + If `version` meta key is not and integer or it is not available, the assembly accession's version + will be used instead. Args: - gmeta (Dict[str, Any]): Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. + + Raises: + ValueError: If both `version` and the assembly accession's version are not integers or are missing. """ - assembly = gmeta_out["assembly"] + assembly = genome_metadata["assembly"] version = assembly.get("version") - # Check the version is an integer try: assembly["version"] = int(version) except (ValueError, TypeError) as exc: # Get the version from the assembly accession accession = assembly["accession"] - parts = accession.split(".") - if len(parts) == 2 and parts[1].isdigit(): - version = parts[1] + version = accession.partition(".")[2] + try: assembly["version"] = int(version) - logging.info( - f'Asm version [v{version}] obtained from: assembly accession ({assembly["accession"]}).' - ) - else: + except ValueError: raise ValueError(f"Assembly version is not an integer in {assembly}") from exc + logging.info(f"Assembly version [v{version}] obtained from assembly accession ({accession}).") else: - logging.info(f"Located version [v{int(version)}] info from meta data.") + logging.info(f'Located version [v{assembly["version"]}] info from meta data.') -def check_genebuild_version(metadata: Dict[str, Any]) -> None: +def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: """Updates the genebuild version (if not present) from the genebuild ID, removing the latter. Args: - metadata: Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. Raises: ValueError: If there is no genebuild version or ID available. """ - genebuild = metadata.get("genebuild") - if genebuild is None: + try: + genebuild = genome_metadata["genebuild"] + except KeyError: return - version = genebuild.get("version") - - # Check there is a version - if version is None: - gb_id = genebuild.get("id") - if gb_id is None: - raise ValueError("No genebuild version or id") - metadata["genebuild"]["version"] = str(gb_id) - - if "id" in genebuild: - del metadata["genebuild"]["id"] + if "version" not in genebuild: + try: + genebuild_id = genebuild["id"] + except KeyError: + raise ValueError("No genebuild version or ID found") + genome_metadata["genebuild"]["version"] = str(genebuild_id) + # Drop genebuild ID since there is a genebuild version + genome_metadata["genebuild"].pop("id", None) def main() -> None: From 852fb6a8ab6337cb1c959ef16481a6312dca5dee Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:10:02 +0000 Subject: [PATCH 03/17] add genome_metadata.dump.check_*_version() unit tests --- src/python/tests/genome_metadata/test_dump.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 src/python/tests/genome_metadata/test_dump.py diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py new file mode 100644 index 000000000..57fddad17 --- /dev/null +++ b/src/python/tests/genome_metadata/test_dump.py @@ -0,0 +1,117 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.genome_metadata.dump` module. + +Typical usage example:: + $ pytest test_dump.py + +""" + +from contextlib import nullcontext as does_not_raise +from typing import Any, ContextManager, Dict + +from deepdiff import DeepDiff +import pytest +from pytest import param + +from ensembl.io.genomio.genome_metadata import dump + + +@pytest.mark.dependency(name="test_check_assembly_version") +@pytest.mark.parametrize( + "genome_metadata, output, expectation", + [ + param( + {"assembly": {"version": "1"}}, + {"assembly": {"version": 1}}, + does_not_raise(), + id="Version is '1'" + ), + param( + {"assembly": {"accession": "GCA_00000001.1", "version": "a"}}, + {"assembly": {"accession": "GCA_00000001.1", "version": 1}}, + does_not_raise(), + id="Version is 'a', accession is '1'", + ), + param( + {"assembly": {"accession": "GCA_00000001.1"}}, + {"assembly": {"accession": "GCA_00000001.1", "version": 1}}, + does_not_raise(), + id="No version, accession with version", + ), + param( + {"assembly": {"accession": "GCA_00000001"}}, + {}, + pytest.raises(ValueError), + id="No version, accession without version", + ), + ], +) +def test_check_assembly_version( + genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager +) -> None: + """Tests the `dump.check_assembly_version()` method. + + Args: + genome_metadata: Nested metadata key values from the core metadata table. + output: Expected change in the genome metadata dictionary. + expectation: Context manager for the expected exception, i.e. the test will only pass if that + exception is raised. Use `~contextlib.nullcontext` if no exception is expected. + """ + with expectation: + dump.check_assembly_version(genome_metadata) + assert not DeepDiff(genome_metadata, output) + + +@pytest.mark.dependency(name="test_check_genebuild_version") +@pytest.mark.parametrize( + "genome_metadata, output, expectation", + [ + param({}, {}, does_not_raise(), id="No 'genebuild' entry"), + param( + {"genebuild": {"version": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="Version is 'v1', no ID", + ), + param( + {"genebuild": {"version": "v1", "id": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="Version is 'v1', ID dropped", + ), + param( + {"genebuild": {"id": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="No version, ID moved to version", + ), + param({"genebuild": {}}, {}, pytest.raises(ValueError), id="No version or ID"), + ], +) +def test_check_genebuild_version( + genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager +) -> None: + """Tests the `dump.check_genebuild_version()` method. + + Args: + genome_metadata: Nested metadata key values from the core metadata table. + output: Expected change in the genome metadata dictionary. + expectation: Context manager for the expected exception, i.e. the test will only pass if that + exception is raised. Use `~contextlib.nullcontext` if no exception is expected. + """ + with expectation: + dump.check_genebuild_version(genome_metadata) + assert not DeepDiff(genome_metadata, output) From b776c7a17a0f5604db23635f83500d17cb7e96d7 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:18:42 +0000 Subject: [PATCH 04/17] simplify docstring --- src/python/tests/genome_metadata/test_dump.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 57fddad17..34f40cb4e 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -67,8 +67,7 @@ def test_check_assembly_version( Args: genome_metadata: Nested metadata key values from the core metadata table. output: Expected change in the genome metadata dictionary. - expectation: Context manager for the expected exception, i.e. the test will only pass if that - exception is raised. Use `~contextlib.nullcontext` if no exception is expected. + expectation: Context manager for the expected exception (if any). """ with expectation: dump.check_assembly_version(genome_metadata) @@ -109,8 +108,7 @@ def test_check_genebuild_version( Args: genome_metadata: Nested metadata key values from the core metadata table. output: Expected change in the genome metadata dictionary. - expectation: Context manager for the expected exception, i.e. the test will only pass if that - exception is raised. Use `~contextlib.nullcontext` if no exception is expected. + expectation: Context manager for the expected exception (if any). """ with expectation: dump.check_genebuild_version(genome_metadata) From 809cfc2943d130260d4e56343e89fd6bc964298a Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:31:11 +0000 Subject: [PATCH 05/17] ignore pylint warning since we want to change the exception raised --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index e3f7c1615..6158912d1 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -172,6 +172,7 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: try: genebuild_id = genebuild["id"] except KeyError: + # pylint: disable=raise-missing-from raise ValueError("No genebuild version or ID found") genome_metadata["genebuild"]["version"] = str(genebuild_id) # Drop genebuild ID since there is a genebuild version From 4d5d9218e867dd348244f0fa9114f7a7db1d6377 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:31:32 +0000 Subject: [PATCH 06/17] remove unneeded script functionality --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 6158912d1..f42fbc6d6 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -195,7 +195,3 @@ def main() -> None: genome_meta = filter_genome_meta(genome_meta) print(json.dumps(genome_meta, indent=2, sort_keys=True)) - - -if __name__ == "__main__": - main() From 083fa3869da87d0eabc9cacf9cee3a8ecc169310 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 12:31:44 +0000 Subject: [PATCH 07/17] make black happy --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 34f40cb4e..fb5199d20 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -37,7 +37,7 @@ {"assembly": {"version": "1"}}, {"assembly": {"version": 1}}, does_not_raise(), - id="Version is '1'" + id="Version is '1'", ), param( {"assembly": {"accession": "GCA_00000001.1", "version": "a"}}, From 88cdaa964d640ab4fcccabe77ea8339a677d07f9 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 15:24:40 +0000 Subject: [PATCH 08/17] update dump.filter_genome_metadata() code and add unit test --- .../io/genomio/genome_metadata/dump.py | 97 +++++++++---------- src/python/tests/genome_metadata/test_dump.py | 36 ++++++- 2 files changed, 82 insertions(+), 51 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index f42fbc6d6..6ed2f085c 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -34,6 +34,32 @@ from ensembl.utils.logging import init_logging_with_args +METADATA_FILTER = { + "added_seq": {"region_name": str}, + "annotation": {"provider_name": str, "provider_url": str}, + "assembly": { + "accession": str, + "date": str, + "name": str, + "provider_name": str, + "provider_url": str, + "version": int, + }, + "BRC4": {"organism_abbrev": str, "component": str}, + "genebuild": {"id": str, "method": str, "method_display": str, "start_date": str, "version": str}, + "species": { + "alias": str, + "annotation_source": str, + "display_name": str, + "division": str, + "production_name": str, + "scientific_name": str, + "strain": str, + "taxonomy_id": int, + }, +} + + def get_genome_metadata(session: Session) -> Dict[str, Any]: """Retrieve a select list of metadata from the core database. @@ -70,59 +96,30 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: return gmeta -def filter_genome_meta(gmeta: Dict[str, Any]) -> Dict[str, Any]: - """Returns a filtered metadata dict with only predefined keys. - Also converts expected numbers to integers (to follow the genome json schema). +def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. + + Also converts to expected data types (to follow the genome JSON schema). Args: - gmeta (Dict[str, Any]): Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. """ - meta_list = { - "species": { - "taxonomy_id", - "production_name", - "scientific_name", - "strain", - "display_name", - "division", - "alias", - "annotation_source", - }, - "assembly": {"accession", "date", "name", "version", "provider_name", "provider_url"}, - "genebuild": {"version", "method", "start_date", "method_display", "id"}, - "annotation": {"provider_name", "provider_url"}, - "BRC4": {"organism_abbrev", "component"}, - "added_seq": {"region_name"}, - } - is_integer = {"species": {"taxonomy_id"}, "assembly": {"version"}} - - gmeta_out: Dict[str, Any] = {} - for key1, subkeys in meta_list.items(): - if key1 not in gmeta: - continue - if subkeys: - gmeta_out[key1] = {} - for key2 in subkeys: - if key2 not in gmeta[key1]: - continue - value = gmeta[key1][key2] - if len(value) == 1: - value = value[0] - if key2 in is_integer.get(key1, {}): - value = int(value) - gmeta_out[key1][key2] = value - else: - value = gmeta[key1] - if len(value) == 1: - value = value[0] - if is_integer.get(key1): - value = int(value) - gmeta_out[key1] = value - - check_assembly_version(gmeta_out) - check_genebuild_version(gmeta_out) - - return gmeta_out + filtered_metadata: Dict[str, Any] = {} + for key, subfilter in METADATA_FILTER.items(): + if key in genome_metadata: + filtered_metadata[key] = {} + for subkey, value_type in subfilter.items(): + if subkey in genome_metadata[key]: + value = genome_metadata[key][subkey] + if isinstance(value, list): + value = [value_type(x) for x in value] + else: + value = value_type(value) + filtered_metadata[key][subkey] = value + # Check assembly and genebuild versions + check_assembly_version(filtered_metadata) + check_genebuild_version(filtered_metadata) + return filtered_metadata def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index fb5199d20..4f07d2cac 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -21,6 +21,7 @@ from contextlib import nullcontext as does_not_raise from typing import Any, ContextManager, Dict +from unittest.mock import Mock, patch from deepdiff import DeepDiff import pytest @@ -106,10 +107,43 @@ def test_check_genebuild_version( """Tests the `dump.check_genebuild_version()` method. Args: - genome_metadata: Nested metadata key values from the core metadata table. + genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. expectation: Context manager for the expected exception (if any). """ with expectation: dump.check_genebuild_version(genome_metadata) assert not DeepDiff(genome_metadata, output) + + +@patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version") +@patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version") +@pytest.mark.parametrize( + "genome_metadata, output", + [ + ({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}), + ({"species": {"display_name": "Dog"}}, {"species": {"display_name": "Dog"}}), + ({"genebuild": {"new_key": "_"}}, {"genebuild": {}}), + ({"BRC5": "new_value"}, {}), + ({"meta": "key", "species": {"alias": "woof"}}, {"species": {"alias": "woof"}}), + ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), + ], +) +def test_filter_genome_meta( + mock_check_assembly_version: Mock, + mock_check_genebuild_version: Mock, + genome_metadata: Dict[str, Any], + output: Dict[str, Any], +) -> None: + """Tests the `dump.check_genebuild_version()` method. + + Args: + mock_check_assembly_version: A mock of `dump.check_assembly_version()` method. + mock_check_genebuild_version: A mock of `dump.check_genebuild_version()` method. + genome_metadata: Nested genome metadata key values. + output: Expected change in the genome metadata dictionary. + """ + mock_check_assembly_version.return_value = None + mock_check_genebuild_version.return_value = None + result = dump.filter_genome_meta(genome_metadata) + assert not DeepDiff(result, output) From fadedb8c3d05ecf0f1910e2481448b6337ec0720 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 15:25:01 +0000 Subject: [PATCH 09/17] fix schema tabulation --- src/python/ensembl/io/genomio/data/schemas/genome.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/data/schemas/genome.json b/src/python/ensembl/io/genomio/data/schemas/genome.json index 3b14160c2..54985bb7a 100644 --- a/src/python/ensembl/io/genomio/data/schemas/genome.json +++ b/src/python/ensembl/io/genomio/data/schemas/genome.json @@ -25,7 +25,7 @@ }, "required": [ "taxonomy_id" - ] + ] }, "assembly_info": { "type": "object", @@ -73,7 +73,7 @@ }, "required": [ "version" - ] + ] }, "provider_info": { "description" : "legacy. use (annotation|assembly).provider_(name|url) instead", @@ -85,7 +85,7 @@ }, "required": [ "name" - ] + ] }, "BRC4_info": { "type": "object", @@ -97,7 +97,7 @@ "required": [ "component", "organism_abbrev" - ] + ] }, "added_sequence_info" : { "type": "object", @@ -128,7 +128,7 @@ "required" : [ "species", "assembly" - ] + ] } }, From f65bdbd0d3fa5c40c6ff036f7a76d98c8c677102 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 15:26:08 +0000 Subject: [PATCH 10/17] update docstring --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 4f07d2cac..3406949a3 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -66,7 +66,7 @@ def test_check_assembly_version( """Tests the `dump.check_assembly_version()` method. Args: - genome_metadata: Nested metadata key values from the core metadata table. + genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. expectation: Context manager for the expected exception (if any). """ From f20f63265815e2550938ebaec67a3241a74a03fe Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 16:00:32 +0000 Subject: [PATCH 11/17] simplify expected output for test_check_assembly_version() --- src/python/tests/genome_metadata/test_dump.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 3406949a3..799fc9ee0 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -20,7 +20,7 @@ """ from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict +from typing import Any, ContextManager, Dict, Optional from unittest.mock import Mock, patch from deepdiff import DeepDiff @@ -34,45 +34,40 @@ @pytest.mark.parametrize( "genome_metadata, output, expectation", [ - param( - {"assembly": {"version": "1"}}, - {"assembly": {"version": 1}}, - does_not_raise(), - id="Version is '1'", - ), + param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), param( {"assembly": {"accession": "GCA_00000001.1", "version": "a"}}, - {"assembly": {"accession": "GCA_00000001.1", "version": 1}}, + 1, does_not_raise(), - id="Version is 'a', accession is '1'", + id="Version is 'a', accession's version is 1", ), param( {"assembly": {"accession": "GCA_00000001.1"}}, - {"assembly": {"accession": "GCA_00000001.1", "version": 1}}, + 1, does_not_raise(), - id="No version, accession with version", + id="No version, accession's version is 1", ), param( {"assembly": {"accession": "GCA_00000001"}}, - {}, + 0, pytest.raises(ValueError), id="No version, accession without version", ), ], ) def test_check_assembly_version( - genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager + genome_metadata: Dict[str, Any], output: int, expectation: ContextManager ) -> None: """Tests the `dump.check_assembly_version()` method. Args: genome_metadata: Nested genome metadata key values. - output: Expected change in the genome metadata dictionary. + output: Expected assembly version. expectation: Context manager for the expected exception (if any). """ with expectation: dump.check_assembly_version(genome_metadata) - assert not DeepDiff(genome_metadata, output) + assert genome_metadata["assembly"]["version"] == output @pytest.mark.dependency(name="test_check_genebuild_version") From c5dd66a2cbc9514584addda330fa59f33d116c50 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 16:39:38 +0000 Subject: [PATCH 12/17] remove unused Optional --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 799fc9ee0..d143894bc 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -20,7 +20,7 @@ """ from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict, Optional +from typing import Any, ContextManager, Dict from unittest.mock import Mock, patch from deepdiff import DeepDiff From b4d82e9a232da8e1e1e95109ae2b240404585036 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 29 Feb 2024 16:43:29 +0000 Subject: [PATCH 13/17] add METADATA_FILTER type hint to make mypy happy --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 6ed2f085c..ee7d4cdfc 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -22,7 +22,7 @@ ] import json -from typing import Any, Dict +from typing import Any, Dict, Type import logging from sqlalchemy import select @@ -34,7 +34,7 @@ from ensembl.utils.logging import init_logging_with_args -METADATA_FILTER = { +METADATA_FILTER: Dict[str, Dict[str, Type]] = { "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, "assembly": { From 97d5e03b659767c4c0b9127283126a02fcd54d8a Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Mon, 4 Mar 2024 11:54:49 +0000 Subject: [PATCH 14/17] update dump.get_genome_metadata method and add unit test --- .../io/genomio/genome_metadata/dump.py | 50 +++++++------- src/python/tests/genome_metadata/test_dump.py | 66 ++++++++++++++++++- 2 files changed, 88 insertions(+), 28 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index ee7d4cdfc..365debf8f 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -61,39 +61,39 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: - """Retrieve a select list of metadata from the core database. + """Returns the meta table content from the core database in a nested dictionary. Args: session: Session for the current core. - Returns: - A nested dict. """ - gmeta: Dict[str, Any] = {} - - gmeta_st = select(Meta) - for row in session.execute(gmeta_st).unique().all(): - dat = row[0] - meta_key = dat.meta_key - meta_value = dat.meta_value - - if "." in meta_key: - (high_key, low_key) = meta_key.split(".") - if high_key in gmeta: - if low_key in gmeta[high_key]: - gmeta[high_key][low_key].append(meta_value) - else: - gmeta[high_key][low_key] = [meta_value] + genome_metadata: Dict[str, Any] = {} + meta_statement = select(Meta) + for row in session.execute(meta_statement).unique().all(): + meta_key = row[0].meta_key + meta_value = row[0].meta_value + (main_key, _, subkey) = meta_key.partition(".") + # Use empty string as subkey when no "." found to simplify dictionary creation + if main_key in genome_metadata: + if subkey in genome_metadata[main_key]: + genome_metadata[main_key][subkey].append(meta_value) else: - gmeta[high_key] = {} - gmeta[high_key][low_key] = [meta_value] + genome_metadata[main_key][subkey] = [meta_value] else: - if meta_key in gmeta: - gmeta[meta_key].append(meta_value) + genome_metadata[main_key] = {subkey: [meta_value]} + # Parse genome metadata to simplify dictionary and check data consistency + for main_key, subkeys_dict in genome_metadata.items(): + # Replace single-value lists by the value itself + for subkey, value in subkeys_dict.items(): + if len(value) == 1: + subkeys_dict[subkey] = value[0] + # Remove nested dictionary if it only has "" as key, passing its value to the main key + if "" in subkeys_dict: + if len(subkeys_dict) == 1: + genome_metadata[main_key] = subkeys_dict.pop("") else: - gmeta[meta_key] = [meta_value] - - return gmeta + raise ValueError(f"Unexpected meta keys for '{main_key}': {', '.join(subkeys_dict.keys())}") + return genome_metadata def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index d143894bc..3ffe39411 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -19,8 +19,9 @@ """ +from collections import namedtuple from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict +from typing import Any, ContextManager, Dict, List from unittest.mock import Mock, patch from deepdiff import DeepDiff @@ -30,7 +31,9 @@ from ensembl.io.genomio.genome_metadata import dump -@pytest.mark.dependency(name="test_check_assembly_version") +MetaRow = namedtuple("MetaRow", "meta_key meta_value") + + @pytest.mark.parametrize( "genome_metadata, output, expectation", [ @@ -70,7 +73,6 @@ def test_check_assembly_version( assert genome_metadata["assembly"]["version"] == output -@pytest.mark.dependency(name="test_check_genebuild_version") @pytest.mark.parametrize( "genome_metadata, output, expectation", [ @@ -142,3 +144,61 @@ def test_filter_genome_meta( mock_check_genebuild_version.return_value = None result = dump.filter_genome_meta(genome_metadata) assert not DeepDiff(result, output) + + +@patch("sqlalchemy.orm.Session") +@patch("sqlalchemy.engine.Result") +@pytest.mark.parametrize( + "meta_data, output, expectation", + [ + param([], {}, does_not_raise(), id="Empty meta table"), + param( + [ + [MetaRow("sample", "gene1")], + [MetaRow("species.name", "dog")], + [MetaRow("species.synonym", "puppy")] + ], + {"sample": "gene1", "species": {"name": "dog", "synonym": "puppy"}}, + does_not_raise(), + id="Meta table with simple values", + ), + param( + [ + [MetaRow("sample", "gene1")], + [MetaRow("sample", "gene2")], + [MetaRow("species.synonym", "dog")], + [MetaRow("species.synonym", "puppy")] + ], + {"sample": ["gene1", "gene2"], "species": {"synonym": ["dog", "puppy"]}}, + does_not_raise(), + id="Meta table with lists", + ), + param( + [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], + {}, + pytest.raises(ValueError), + id="'species' and 'species.synonym' meta keys", + ), + ], +) +def test_get_genome_metadata( + mock_session: Mock, + mock_result: Mock, + meta_data: List[MetaRow], + output: Dict[str, Any], + expectation: ContextManager, +) -> None: + """Tests the `dump.get_genome_metadata()` method. + + Args: + mock_session: A mock of `sqlalchemy.orm.Session()` class. + meta_data: `meta` table content in a list of named tuples. + output: Expected genome metadata dictionary. + expectation: Context manager for the expected exception (if any). + """ + mock_result.unique.return_value = mock_result + mock_result.all.return_value = meta_data + mock_session.execute.return_value = mock_result + with expectation: + result = dump.get_genome_metadata(mock_session) + assert not DeepDiff(result, output) From 700df60de4b52e6187f7cf7a838b66fe5a43505f Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Mon, 4 Mar 2024 12:21:11 +0000 Subject: [PATCH 15/17] patch without passing mocked functions as args --- src/python/tests/genome_metadata/test_dump.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 3ffe39411..27ea9efe3 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -113,8 +113,8 @@ def test_check_genebuild_version( assert not DeepDiff(genome_metadata, output) -@patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version") -@patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version") +@patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) +@patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) @pytest.mark.parametrize( "genome_metadata, output", [ @@ -126,22 +126,13 @@ def test_check_genebuild_version( ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), ], ) -def test_filter_genome_meta( - mock_check_assembly_version: Mock, - mock_check_genebuild_version: Mock, - genome_metadata: Dict[str, Any], - output: Dict[str, Any], -) -> None: +def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: """Tests the `dump.check_genebuild_version()` method. Args: - mock_check_assembly_version: A mock of `dump.check_assembly_version()` method. - mock_check_genebuild_version: A mock of `dump.check_genebuild_version()` method. genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. """ - mock_check_assembly_version.return_value = None - mock_check_genebuild_version.return_value = None result = dump.filter_genome_meta(genome_metadata) assert not DeepDiff(result, output) From c1f7718f8edbe518590fd3f6c663ac13f0d676e9 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Mon, 4 Mar 2024 12:21:46 +0000 Subject: [PATCH 16/17] bugfix: left-rigth args match bottom-up patching --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 27ea9efe3..13de72330 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -137,8 +137,8 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A assert not DeepDiff(result, output) -@patch("sqlalchemy.orm.Session") @patch("sqlalchemy.engine.Result") +@patch("sqlalchemy.orm.Session") @pytest.mark.parametrize( "meta_data, output, expectation", [ From ff052ff206e4d1ba8f38f3cb63f188c495fa4532 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Mon, 4 Mar 2024 12:26:49 +0000 Subject: [PATCH 17/17] reformat --- src/python/tests/genome_metadata/test_dump.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 13de72330..f6f0c8082 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -26,7 +26,6 @@ from deepdiff import DeepDiff import pytest -from pytest import param from ensembl.io.genomio.genome_metadata import dump @@ -37,20 +36,20 @@ @pytest.mark.parametrize( "genome_metadata, output, expectation", [ - param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), - param( + pytest.param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), + pytest.param( {"assembly": {"accession": "GCA_00000001.1", "version": "a"}}, 1, does_not_raise(), id="Version is 'a', accession's version is 1", ), - param( + pytest.param( {"assembly": {"accession": "GCA_00000001.1"}}, 1, does_not_raise(), id="No version, accession's version is 1", ), - param( + pytest.param( {"assembly": {"accession": "GCA_00000001"}}, 0, pytest.raises(ValueError), @@ -76,26 +75,26 @@ def test_check_assembly_version( @pytest.mark.parametrize( "genome_metadata, output, expectation", [ - param({}, {}, does_not_raise(), id="No 'genebuild' entry"), - param( + pytest.param({}, {}, does_not_raise(), id="No 'genebuild' entry"), + pytest.param( {"genebuild": {"version": "v1"}}, {"genebuild": {"version": "v1"}}, does_not_raise(), id="Version is 'v1', no ID", ), - param( + pytest.param( {"genebuild": {"version": "v1", "id": "v1"}}, {"genebuild": {"version": "v1"}}, does_not_raise(), id="Version is 'v1', ID dropped", ), - param( + pytest.param( {"genebuild": {"id": "v1"}}, {"genebuild": {"version": "v1"}}, does_not_raise(), id="No version, ID moved to version", ), - param({"genebuild": {}}, {}, pytest.raises(ValueError), id="No version or ID"), + pytest.param({"genebuild": {}}, {}, pytest.raises(ValueError), id="No version or ID"), ], ) def test_check_genebuild_version( @@ -142,29 +141,29 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A @pytest.mark.parametrize( "meta_data, output, expectation", [ - param([], {}, does_not_raise(), id="Empty meta table"), - param( + pytest.param([], {}, does_not_raise(), id="Empty meta table"), + pytest.param( [ [MetaRow("sample", "gene1")], [MetaRow("species.name", "dog")], - [MetaRow("species.synonym", "puppy")] + [MetaRow("species.synonym", "puppy")], ], {"sample": "gene1", "species": {"name": "dog", "synonym": "puppy"}}, does_not_raise(), id="Meta table with simple values", ), - param( + pytest.param( [ [MetaRow("sample", "gene1")], [MetaRow("sample", "gene2")], [MetaRow("species.synonym", "dog")], - [MetaRow("species.synonym", "puppy")] + [MetaRow("species.synonym", "puppy")], ], {"sample": ["gene1", "gene2"], "species": {"synonym": ["dog", "puppy"]}}, does_not_raise(), id="Meta table with lists", ), - param( + pytest.param( [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], {}, pytest.raises(ValueError),