Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update VEuPathDB meta keys #360

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions data/test/pipelines/dumper/dump_files/dumped_genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
"taxonomy_id": 7159,
"production_name": "aedes_aegypti"
},
"BRC4": {
"veupathdb": {
"organism_abbrev": "aaegL5",
"component": "VectorBase"

"component_db": "VectorBase",
"build_version": 65
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"BRC4": {
"component": "OrganismDB",
"veupathdb": {
"build_version": 65,
"component_db": "OrganismDB",
"organism_abbrev": "organAbrev123"
},
"species": {},
Expand Down
4 changes: 2 additions & 2 deletions pipelines/nextflow/subworkflows/genome_prepare/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json

name: "genome_prepare"
description: BRC/Ensembl metazoa pipeline. Retrieve data for genome(s), obtained from INSDC and RefSeq, validate and prepare GFF3, FASTA, JSON files for each genome accession.
description: VEuPathDB/Ensembl Metazoa pipeline. Retrieve data for genome(s), obtained from INSDC and RefSeq, validate and prepare GFF3, FASTA, JSON files for each genome accession.
keywords:
- fasta
- gff3
Expand Down Expand Up @@ -45,7 +45,7 @@ input:
type: directory
description: |
MANDATORY param. User supplied input directory name containing genome json(s) storing meta information including:
genome INSDC accession, BRC 'organism_abbrev', BRC 'component'.
genome INSDC accession, VEuPathDB 'organism_abbrev', VEuPathDB 'component_db', VEuPathDB 'build_version'.
pattern: "input_dir/*.json"
output:
- fasta_dna:
Expand Down
4 changes: 2 additions & 2 deletions pipelines/nextflow/workflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ def meta_from_genome_json(json_path) {
prod_name = data.assembly.accession
publish_dir = data.assembly.accession
if ( params.brc_mode ) {
prod_name = data.BRC4.organism_abbrev
publish_dir = "${data.BRC4.component}/${data.BRC4.organism_abbrev}"
prod_name = data.veupathdb.organism_abbrev
publish_dir = "${data.veupathdb.component_db}/${data.veupathdb.organism_abbrev}"
} else if ( data.species && data.species.production_name ) {
prod_name = data.species.production_name
publish_dir = prod_name
Expand Down
15 changes: 3 additions & 12 deletions scripts/brc4/check_descriptions.pl
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,15 @@ sub main {
my @all_species = ($opt{species}) || @{$registry->get_all_species()};
for my $species (sort @all_species) {
my $ma = $registry->get_adaptor($species, "core", "MetaContainer");
my $component = get_meta_value($ma, 'BRC4.component');
my $component = get_meta_value($ma, 'veupathdb.component_db');
if ($opt{component} and $opt{component} ne $component) {
$ma->dbc->disconnect_if_idle();
next;
}

my $count = check_genes($registry, $species);
my $build = get_build($ma, $species);
my $org = get_meta_value($ma, 'BRC4.organism_abbrev');
my $build = get_meta_value($ma, 'veupathdb.build_version');
my $org = get_meta_value($ma, 'veupathdb.organism_abbrev');

$ma->dbc->disconnect_if_idle();

Expand Down Expand Up @@ -110,15 +110,6 @@ sub main {
}
}

sub get_build {
my ($ma, $key) = @_;

my $dbname = $ma->dbc->dbname;
if ($dbname =~ /_(\d+)_\d+_\d+$/) {
return $1;
}
}

sub get_meta_value {
my ($ma, $key) = @_;

Expand Down
25 changes: 13 additions & 12 deletions scripts/brc4/get_metadata_from_redmine.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
from redminelib import Redmine
import argparse
import os, json, re, time
import requests
import xml.etree.ElementTree as ET


url = "https://redmine.apidb.org"
default_fields = dict(
Expand Down Expand Up @@ -63,7 +62,7 @@ def retrieve_genomes(redmine, output_dir, build=None):
failed_issues.append({"issue": issue, "desc": failure})
continue

abbrev = genome["BRC4"]["organism_abbrev"]
abbrev = genome["veupathdb"]["organism_abbrev"]
group = "other"
if "Reference change" in extra["operations"]:
group = "reference_change"
Expand Down Expand Up @@ -110,7 +109,7 @@ def retrieve_genomes(redmine, output_dir, build=None):
pass

for genome in genomes:
organism = genome["BRC4"]["organism_abbrev"]
organism = genome["veupathdb"]["organism_abbrev"]
organism_file = os.path.join(group_dir, organism + ".json")
with open(organism_file, "w") as f:
json.dump(genome, f, indent=True)
Expand Down Expand Up @@ -147,8 +146,9 @@ def parse_genome(issue):

customs = get_custom_fields(issue)
genome = {
"BRC4": {
"component": "",
"veupathdb": {
"build_version": 0,
"component_db": "",
"organism_abbrev": "",
},
"species": {},
Expand All @@ -164,11 +164,14 @@ def parse_genome(issue):
accession = check_accession(accession)
genome["assembly"]["accession"] = accession

# Get BRC4 component
# Get VEuPathDB build version
genome["veupathdb"]["build_version"] = int(re.search(r"Build (\d+)", str(issue.fixed_version)).group(1))

# Get VEuPathDB component db
if "Component DB" in customs:
components = customs["Component DB"]["value"]
if len(components) == 1:
genome["BRC4"]["component"] = components[0]
genome["veupathdb"]["component_db"] = components[0]
elif len(components) > 1:
raise Exception("More than 1 component for genome " + str(issue.id))

Expand All @@ -181,7 +184,7 @@ def parse_genome(issue):
if not check_organism_abbrev(abbrev):
print(f"Invalid organism_abbrev in {issue.id}: {abbrev}")
else:
genome["BRC4"]["organism_abbrev"] = abbrev
genome["veupathdb"]["organism_abbrev"] = abbrev
except KeyError:
print(f"Can't get organism abbrev for {issue.id} because: missing organism_abbrev")
return
Expand All @@ -191,15 +194,13 @@ def parse_genome(issue):
gff_path = customs["GFF 2 Load"]["value"]
if gff_path:
extra["GFF"] = True
# print("GFF2LOAD: separate gff file for %s: %s (issue %d)" % (genome["BRC4"]["organism_abbrev"], gff_path, issue.id))
except:
pass

# Warn for replacement
try:
if customs["Replacement genome?"]["value"].startswith("Yes"):
extra["Replacement"] = True
# print("REPLACEMENT: the organism %s is a replacement (issue %d)" % (genome["BRC4"]["organism_abbrev"], issue.id))
except:
pass

Expand All @@ -217,7 +218,7 @@ def check_genome(genome, extra):
if not genome:
return "No genome parsed"

if not "organism_abbrev" in genome["BRC4"] or not genome["BRC4"]["organism_abbrev"]:
if not "organism_abbrev" in genome["veupathdb"] or not genome["veupathdb"]["organism_abbrev"]:
return "No organism_abbrev defined"

operations = extra["operations"]
Expand Down
10 changes: 6 additions & 4 deletions src/python/ensembl/io/genomio/data/schemas/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,17 @@
"name"
]
},
"BRC4_info": {
"vpdb_info": {
"type": "object",
"additionalProperties": false,
"properties": {
"component" : { "type" : "string" },
"build_version" : { "type" : "integer" },
"component_db" : { "type" : "string" },
"organism_abbrev" : { "type" : "string" }
},
"required": [
"component",
"build_version",
"component_db",
"organism_abbrev"
]
},
Expand All @@ -122,7 +124,7 @@
"annotation" : { "$ref" : "#/definitions/annotation_info" },
"genebuild" : { "$ref" : "#/definitions/genebuild_info" },
"provider" : { "$ref" : "#/definitions/provider_info" },
"BRC4" : { "$ref" : "#/definitions/BRC4_info" },
"veupathdb" : { "$ref" : "#/definitions/vpdb_info" },
"added_seq" : { "$ref" : "#/definitions/added_sequence_info" }
},
"required" : [
Expand Down
26 changes: 16 additions & 10 deletions src/python/ensembl/io/genomio/database/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ def format_db_data(server_url: URL, dbs: List[str], brc_mode: bool = False) -> L
Args:
server: Server URL where all the databases are hosted.
dbs: List of database names.
brc_mode: If true, assign ``BRC4.organism_abbrev`` as the species, and ``BRC4.component`` as the
division. Otherwise, the species will be ``species.production_name`` and the division will be
``species.division``.
brc_mode: If true, assign `veupathdb.organism_abbrev` as the species, `veupathdb.component_db` as the
division and `veupathdb.build_version` as the project release. Otherwise, the species will be
`species.production_name` and the division will be `species.division`.

Returns:
List of dictionaries with 3 keys: "database", "species" and "division".
Expand All @@ -55,12 +55,15 @@ def format_db_data(server_url: URL, dbs: List[str], brc_mode: bool = False) -> L
project_release = core_db.get_project_release()

if brc_mode:
brc_organism = core_db.get_meta_value("BRC4.organism_abbrev")
brc_component = core_db.get_meta_value("BRC4.component")
if brc_organism is not None:
species = brc_organism
if brc_component is not None:
division = brc_component
vpdb_organism = core_db.get_meta_value("veupathdb.organism_abbrev")
vpdb_component = core_db.get_meta_value("veupathdb.component_db")
vpdb_build = core_db.get_meta_value("veupathdb.build_version")
if vpdb_organism is not None:
species = vpdb_organism
if vpdb_component is not None:
division = vpdb_component
if vpdb_build is not None:
project_release = vpdb_build

if not division:
division = "all"
Expand Down Expand Up @@ -137,7 +140,10 @@ def main() -> None:
parser.add_argument(
"--brc_mode",
action="store_true",
help="Enable BRC mode, i.e. use organism_abbrev for species, component for division",
help=(
"Enable BRC mode, i.e. use organism_abbrev for species, component_db for division, "
"build_version for project release"
),
)
parser.add_log_arguments()
args = parser.parse_args()
Expand Down
12 changes: 6 additions & 6 deletions src/python/ensembl/io/genomio/events/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
__all__ = [
"IdsSet",
"DictToIdsSet",
"BRC4_START_DATE",
"VPDB_START_DATE",
"Pair",
"UnsupportedEvent",
"Event",
Expand All @@ -38,7 +38,7 @@
from ensembl.utils.logging import init_logging_with_args


BRC4_START_DATE = datetime(2020, 5, 1)
VPDB_START_DATE = datetime(2020, 5, 1)
IdsSet = Set[str]
DictToIdsSet = Dict[str, IdsSet]

Expand Down Expand Up @@ -89,7 +89,7 @@ class Event:
name: Name of the event (will be updated automatically).
pairs: All pair of ids for this event.

Any gene set before 2019-09 is dubbed pre-BRC4.
Any gene set dated on or before `VPDB_START_DATE` (2020-05-01), or with no date, is dubbed pre-VPDB.

"""

Expand Down Expand Up @@ -234,14 +234,14 @@ def add_pair(self, pair: Pair) -> None:
self.pairs.append(pair)

def get_full_release(self) -> str:
"""Returns the expanded release name, pre-BRC4 or `BRC4 = build`."""
"""Returns the expanded release name, pre-VPDB or `VPDB = build`."""
release = self.release
date = self.date

if date and date > BRC4_START_DATE:
if date and date > VPDB_START_DATE:
release = f"build {release}"
else:
release = f"pre-BRC4 {release}"
release = f"pre-VPDB {release}"

return release

Expand Down
2 changes: 1 addition & 1 deletion src/python/ensembl/io/genomio/genome_metadata/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"provider_url": str,
"version": int,
},
"BRC4": {"organism_abbrev": str, "component": str},
"veupathdb": {"organism_abbrev": str, "component_db": str, "build_version": int},
"genebuild": {"id": str, "method": str, "method_display": str, "start_date": str, "version": str},
"species": {
"alias": str,
Expand Down
2 changes: 1 addition & 1 deletion src/python/ensembl/io/genomio/gff3/id_allocator.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class StableIDAllocator:
def set_prefix(self, genome: Dict) -> None:
"""Sets the ID prefix using the organism abbrev if it exists in the genome metadata."""
try:
org = genome["BRC4"]["organism_abbrev"]
org = genome["veupathdb"]["organism_abbrev"]
except KeyError:
prefix = "TMP_PREFIX_"
else:
Expand Down
19 changes: 12 additions & 7 deletions src/python/tests/database/test_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""

from pathlib import Path
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
from unittest.mock import call, Mock, patch

from deepdiff import DeepDiff
Expand All @@ -35,8 +35,9 @@
"species.production_name": "dog",
"species.division": "metazoa",
"assembly.accession": "GCA_000111222.3",
"BRC4.organism_abbrev": "brc_dog",
"BRC4.component": "brc_db",
"veupathdb.organism_abbrev": "brc_dog",
"veupathdb.component_db": "brc_db",
"veupathdb.build_version": 12,
}


Expand Down Expand Up @@ -124,15 +125,15 @@ def test_format_db_data(
server_url: Server URL where all the databases are hosted.
dbs: List of database names.
brc_mode: BRC mode?
skip_keys: Return `None` instead of the assigned value for "BRC4.*" meta keys.
skip_keys: Return `None` instead of the assigned value for "veupathdb.*" meta keys.
output: Expected list of dictionaries with metadata per database.
"""

def _get_meta_value(meta_key: str) -> Optional[str]:
def _get_meta_value(meta_key: str) -> Optional[Any]:
"""Return "" for "species.division" in BRC mode, `None` for "veupathdb.*" keys if `skip_keys`, else the `_META` value."""
if (meta_key == "species.division") and brc_mode:
return ""
if meta_key.startswith("BRC4.") and skip_keys:
if meta_key.startswith("veupathdb.") and skip_keys:
return None
return _META[meta_key]

Expand All @@ -146,7 +147,11 @@ def _get_meta_value(meta_key: str) -> Optional[str]:
if dbs:
calls = [call("species.production_name"), call("species.division"), call("assembly.accession")]
if brc_mode:
calls += [call("BRC4.organism_abbrev"), call("BRC4.component")]
calls += [
call("veupathdb.organism_abbrev"),
call("veupathdb.component_db"),
call("veupathdb.build_version"),
]
dbconnection.get_meta_value.assert_has_calls(calls)
dbconnection.get_project_release.assert_called()

Expand Down
9 changes: 5 additions & 4 deletions src/python/tests/genome_metadata/test_extend/genome.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
{
"BRC4": {
"component": "PlasmoDB",
"organism_abbrev": "pfal3D7"
},
"assembly": {
"accession": "GCA_000002765.1",
"provider_name": "RefSeq",
Expand All @@ -16,5 +12,10 @@
"species": {
"scientific_name": "Plasmodium falciparum",
"taxonomy_id": 36329
},
"veupathdb": {
"build_version": 65,
"component_db": "PlasmoDB",
"organism_abbrev": "pfal3D7"
}
}
Loading