diff --git a/src/python/ensembl/io/genomio/data/schemas/genome.json b/src/python/ensembl/io/genomio/data/schemas/genome.json index 3b14160c2..54985bb7a 100644 --- a/src/python/ensembl/io/genomio/data/schemas/genome.json +++ b/src/python/ensembl/io/genomio/data/schemas/genome.json @@ -25,7 +25,7 @@ }, "required": [ "taxonomy_id" - ] + ] }, "assembly_info": { "type": "object", @@ -73,7 +73,7 @@ }, "required": [ "version" - ] + ] }, "provider_info": { "description" : "legacy. use (annotation|assembly).provider_(name|url) instead", @@ -85,7 +85,7 @@ }, "required": [ "name" - ] + ] }, "BRC4_info": { "type": "object", @@ -97,7 +97,7 @@ "required": [ "component", "organism_abbrev" - ] + ] }, "added_sequence_info" : { "type": "object", @@ -128,7 +128,7 @@ "required" : [ "species", "assembly" - ] + ] } }, diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index f4dcdf4c8..365debf8f 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -14,10 +14,15 @@ # limitations under the License. 
"""Generates a JSON file representing the genome metadata from a core database.""" -__all__ = ["get_genome_metadata", "filter_genome_meta", "check_assembly_version"] +__all__ = [ + "get_genome_metadata", + "filter_genome_meta", + "check_assembly_version", + "check_genebuild_version", +] import json -from typing import Any, Dict +from typing import Any, Dict, Type import logging from sqlalchemy import select @@ -29,149 +34,146 @@ from ensembl.utils.logging import init_logging_with_args +METADATA_FILTER: Dict[str, Dict[str, Type]] = { + "added_seq": {"region_name": str}, + "annotation": {"provider_name": str, "provider_url": str}, + "assembly": { + "accession": str, + "date": str, + "name": str, + "provider_name": str, + "provider_url": str, + "version": int, + }, + "BRC4": {"organism_abbrev": str, "component": str}, + "genebuild": {"id": str, "method": str, "method_display": str, "start_date": str, "version": str}, + "species": { + "alias": str, + "annotation_source": str, + "display_name": str, + "division": str, + "production_name": str, + "scientific_name": str, + "strain": str, + "taxonomy_id": int, + }, +} + + def get_genome_metadata(session: Session) -> Dict[str, Any]: - """Retrieve a select list of metadata from the core database. + """Returns the meta table content from the core database in a nested dictionary. Args: session: Session for the current core. - Returns: - A nested dict. """ - gmeta: Dict[str, Any] = {} - - gmeta_st = select(Meta) - for row in session.execute(gmeta_st).unique().all(): - dat = row[0] - meta_key = dat.meta_key - meta_value = dat.meta_value - - if "." 
in meta_key: - (high_key, low_key) = meta_key.split(".") - if high_key in gmeta: - if low_key in gmeta[high_key]: - gmeta[high_key][low_key].append(meta_value) - else: - gmeta[high_key][low_key] = [meta_value] + genome_metadata: Dict[str, Any] = {} + meta_statement = select(Meta) + for row in session.execute(meta_statement).unique().all(): + meta_key = row[0].meta_key + meta_value = row[0].meta_value + (main_key, _, subkey) = meta_key.partition(".") + # Use empty string as subkey when no "." found to simplify dictionary creation + if main_key in genome_metadata: + if subkey in genome_metadata[main_key]: + genome_metadata[main_key][subkey].append(meta_value) else: - gmeta[high_key] = {} - gmeta[high_key][low_key] = [meta_value] + genome_metadata[main_key][subkey] = [meta_value] else: - if meta_key in gmeta: - gmeta[meta_key].append(meta_value) + genome_metadata[main_key] = {subkey: [meta_value]} + # Parse genome metadata to simplify dictionary and check data consistency + for main_key, subkeys_dict in genome_metadata.items(): + # Replace single-value lists by the value itself + for subkey, value in subkeys_dict.items(): + if len(value) == 1: + subkeys_dict[subkey] = value[0] + # Remove nested dictionary if it only has "" as key, passing its value to the main key + if "" in subkeys_dict: + if len(subkeys_dict) == 1: + genome_metadata[main_key] = subkeys_dict.pop("") else: - gmeta[meta_key] = [meta_value] + raise ValueError(f"Unexpected meta keys for '{main_key}': {', '.join(subkeys_dict.keys())}") + return genome_metadata - return gmeta +def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: + """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. -def filter_genome_meta(gmeta: Dict[str, Any]) -> Dict[str, Any]: - """Returns a filtered metadata dict with only predefined keys. - Also converts expected numbers to integers (to follow the genome json schema). 
+ Also converts to expected data types (to follow the genome JSON schema). Args: - gmeta (Dict[str, Any]): Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. """ - meta_list = { - "species": { - "taxonomy_id", - "production_name", - "scientific_name", - "strain", - "display_name", - "division", - "alias", - "annotation_source", - }, - "assembly": {"accession", "date", "name", "version", "provider_name", "provider_url"}, - "genebuild": {"version", "method", "start_date", "method_display", "id"}, - "annotation": {"provider_name", "provider_url"}, - "BRC4": {"organism_abbrev", "component"}, - "added_seq": {"region_name"}, - } - is_integer = {"species": {"taxonomy_id"}, "assembly": {"version"}} - - gmeta_out: Dict[str, Any] = {} - for key1, subkeys in meta_list.items(): - if key1 not in gmeta: - continue - if subkeys: - gmeta_out[key1] = {} - for key2 in subkeys: - if key2 not in gmeta[key1]: - continue - value = gmeta[key1][key2] - if len(value) == 1: - value = value[0] - if key2 in is_integer.get(key1, {}): - value = int(value) - gmeta_out[key1][key2] = value - else: - value = gmeta[key1] - if len(value) == 1: - value = value[0] - if is_integer.get(key1): - value = int(value) - gmeta_out[key1] = value - - check_assembly_version(gmeta_out) - check_genebuild_version(gmeta_out) - - return gmeta_out - - -def check_assembly_version(gmeta_out: Dict[str, Any]) -> None: - """Update the assembly version of the genome metadata provided to use an integer. - Get the version from the assembly accession as alternative. 
+ filtered_metadata: Dict[str, Any] = {} + for key, subfilter in METADATA_FILTER.items(): + if key in genome_metadata: + filtered_metadata[key] = {} + for subkey, value_type in subfilter.items(): + if subkey in genome_metadata[key]: + value = genome_metadata[key][subkey] + if isinstance(value, list): + value = [value_type(x) for x in value] + else: + value = value_type(value) + filtered_metadata[key][subkey] = value + # Check assembly and genebuild versions + check_assembly_version(filtered_metadata) + check_genebuild_version(filtered_metadata) + return filtered_metadata + + +def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: + """Updates the assembly version of the genome metadata provided. + + If `version` meta key is not an integer or it is not available, the assembly accession's version + will be used instead. Args: - gmeta (Dict[str, Any]): Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. + + Raises: + ValueError: If both `version` and the assembly accession's version are not integers or are missing. """ - assembly = gmeta_out["assembly"] + assembly = genome_metadata["assembly"] version = assembly.get("version") - # Check the version is an integer try: assembly["version"] = int(version) except (ValueError, TypeError) as exc: # Get the version from the assembly accession accession = assembly["accession"] - parts = accession.split(".") - if len(parts) == 2 and parts[1].isdigit(): - version = parts[1] + version = accession.partition(".")[2] + try: assembly["version"] = int(version) - logging.info( - f'Asm version [v{version}] obtained from: assembly accession ({assembly["accession"]}).' 
- ) - else: + except ValueError: raise ValueError(f"Assembly version is not an integer in {assembly}") from exc + logging.info(f"Assembly version [v{version}] obtained from assembly accession ({accession}).") else: - logging.info(f"Located version [v{int(version)}] info from meta data.") + logging.info(f'Located version [v{assembly["version"]}] info from meta data.') -def check_genebuild_version(metadata: Dict[str, Any]) -> None: +def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: """Updates the genebuild version (if not present) from the genebuild ID, removing the latter. Args: - metadata: Nested metadata key values from the core metadata table. + genome_metadata: Nested metadata key values from the core metadata table. Raises: ValueError: If there is no genebuild version or ID available. """ - genebuild = metadata.get("genebuild") - if genebuild is None: + try: + genebuild = genome_metadata["genebuild"] + except KeyError: return - version = genebuild.get("version") - - # Check there is a version - if version is None: - gb_id = genebuild.get("id") - if gb_id is None: - raise ValueError("No genebuild version or id") - metadata["genebuild"]["version"] = str(gb_id) - - if "id" in genebuild: - del metadata["genebuild"]["id"] + if "version" not in genebuild: + try: + genebuild_id = genebuild["id"] + except KeyError: + # pylint: disable=raise-missing-from + raise ValueError("No genebuild version or ID found") + genome_metadata["genebuild"]["version"] = str(genebuild_id) + # Drop genebuild ID since there is a genebuild version + genome_metadata["genebuild"].pop("id", None) def main() -> None: @@ -190,7 +192,3 @@ def main() -> None: genome_meta = filter_genome_meta(genome_meta) print(json.dumps(genome_meta, indent=2, sort_keys=True)) - - -if __name__ == "__main__": - main() diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py new file mode 100644 index 000000000..f6f0c8082 --- /dev/null +++ 
b/src/python/tests/genome_metadata/test_dump.py @@ -0,0 +1,194 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.genome_metadata.dump` module. + +Typical usage example:: + $ pytest test_dump.py + +""" + +from collections import namedtuple +from contextlib import nullcontext as does_not_raise +from typing import Any, ContextManager, Dict, List +from unittest.mock import Mock, patch + +from deepdiff import DeepDiff +import pytest + +from ensembl.io.genomio.genome_metadata import dump + + +MetaRow = namedtuple("MetaRow", "meta_key meta_value") + + +@pytest.mark.parametrize( + "genome_metadata, output, expectation", + [ + pytest.param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), + pytest.param( + {"assembly": {"accession": "GCA_00000001.1", "version": "a"}}, + 1, + does_not_raise(), + id="Version is 'a', accession's version is 1", + ), + pytest.param( + {"assembly": {"accession": "GCA_00000001.1"}}, + 1, + does_not_raise(), + id="No version, accession's version is 1", + ), + pytest.param( + {"assembly": {"accession": "GCA_00000001"}}, + 0, + pytest.raises(ValueError), + id="No version, accession without version", + ), + ], +) +def test_check_assembly_version( + genome_metadata: Dict[str, Any], output: int, expectation: ContextManager +) -> None: + """Tests the `dump.check_assembly_version()` 
method. + + Args: + genome_metadata: Nested genome metadata key values. + output: Expected assembly version. + expectation: Context manager for the expected exception (if any). + """ + with expectation: + dump.check_assembly_version(genome_metadata) + assert genome_metadata["assembly"]["version"] == output + + +@pytest.mark.parametrize( + "genome_metadata, output, expectation", + [ + pytest.param({}, {}, does_not_raise(), id="No 'genebuild' entry"), + pytest.param( + {"genebuild": {"version": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="Version is 'v1', no ID", + ), + pytest.param( + {"genebuild": {"version": "v1", "id": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="Version is 'v1', ID dropped", + ), + pytest.param( + {"genebuild": {"id": "v1"}}, + {"genebuild": {"version": "v1"}}, + does_not_raise(), + id="No version, ID moved to version", + ), + pytest.param({"genebuild": {}}, {}, pytest.raises(ValueError), id="No version or ID"), + ], +) +def test_check_genebuild_version( + genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager +) -> None: + """Tests the `dump.check_genebuild_version()` method. + + Args: + genome_metadata: Nested genome metadata key values. + output: Expected change in the genome metadata dictionary. + expectation: Context manager for the expected exception (if any). 
+ """ + with expectation: + dump.check_genebuild_version(genome_metadata) + assert not DeepDiff(genome_metadata, output) + + +@patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) +@patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) +@pytest.mark.parametrize( + "genome_metadata, output", + [ + ({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}), + ({"species": {"display_name": "Dog"}}, {"species": {"display_name": "Dog"}}), + ({"genebuild": {"new_key": "_"}}, {"genebuild": {}}), + ({"BRC5": "new_value"}, {}), + ({"meta": "key", "species": {"alias": "woof"}}, {"species": {"alias": "woof"}}), + ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), + ], +) +def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: + """Tests the `dump.filter_genome_meta()` method. + + Args: + genome_metadata: Nested genome metadata key values. + output: Expected filtered genome metadata dictionary. 
+ """ + result = dump.filter_genome_meta(genome_metadata) + assert not DeepDiff(result, output) + + +@patch("sqlalchemy.engine.Result") +@patch("sqlalchemy.orm.Session") +@pytest.mark.parametrize( + "meta_data, output, expectation", + [ + pytest.param([], {}, does_not_raise(), id="Empty meta table"), + pytest.param( + [ + [MetaRow("sample", "gene1")], + [MetaRow("species.name", "dog")], + [MetaRow("species.synonym", "puppy")], + ], + {"sample": "gene1", "species": {"name": "dog", "synonym": "puppy"}}, + does_not_raise(), + id="Meta table with simple values", + ), + pytest.param( + [ + [MetaRow("sample", "gene1")], + [MetaRow("sample", "gene2")], + [MetaRow("species.synonym", "dog")], + [MetaRow("species.synonym", "puppy")], + ], + {"sample": ["gene1", "gene2"], "species": {"synonym": ["dog", "puppy"]}}, + does_not_raise(), + id="Meta table with lists", + ), + pytest.param( + [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], + {}, + pytest.raises(ValueError), + id="'species' and 'species.synonym' meta keys", + ), + ], +) +def test_get_genome_metadata( + mock_session: Mock, + mock_result: Mock, + meta_data: List[MetaRow], + output: Dict[str, Any], + expectation: ContextManager, +) -> None: + """Tests the `dump.get_genome_metadata()` method. + + Args: + mock_session: A mock of `sqlalchemy.orm.Session()` class. + meta_data: `meta` table content in a list of named tuples. + output: Expected genome metadata dictionary. + expectation: Context manager for the expected exception (if any). + """ + mock_result.unique.return_value = mock_result + mock_result.all.return_value = meta_data + mock_session.execute.return_value = mock_result + with expectation: + result = dump.get_genome_metadata(mock_session) + assert not DeepDiff(result, output)