From 2cf1db9c1389853697088c3a062915912eca3b3d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:23:16 +0000 Subject: [PATCH 01/45] Add container definition file datasets v16.10 --- containers/ncbi_datasets_v16.10.0.def | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 containers/ncbi_datasets_v16.10.0.def diff --git a/containers/ncbi_datasets_v16.10.0.def b/containers/ncbi_datasets_v16.10.0.def new file mode 100644 index 000000000..c277a3b74 --- /dev/null +++ b/containers/ncbi_datasets_v16.10.0.def @@ -0,0 +1,50 @@ +Bootstrap: docker +From: ubuntu:24.04 + +%environment + export SINGULARITY_SHELL=/bin/bash + export DEBIAN_FRONTEND=noninteractive + export LC_ALL=C + +%labels + Author lcampbell@ebi.ac.uk + Software "NCBI's datasets and dataformat" + Software.version v16.10.0 + Software.website "https://github.com/ncbi/datasets/releases/tag/v16.10.0" + Description "NCBI Datasets is a new resource that lets you easily gather data from across NCBI databases." + +%post + apt-get update && apt-get -y upgrade + apt-get -y install \ + wget \ + unzip \ + procps \ + ca-certificates \ + + rm -rf /var/lib/apt/lists/* + apt-get clean + + #Installing ncbi datasets & dataformat + cd /usr/local/bin/ && \ + wget https://github.com/ncbi/datasets/releases/download/v16.10.0/linux-amd64.cli.package.zip && \ + unzip linux-amd64.cli.package.zip && \ + rm linux-amd64.cli.package.zip && \ + chmod +x datasets dataformat + +%test + #!/usr/bin/bash + echo "Testing OS is Ubuntu...." + source /etc/os-release + grep -q -e "PRETTY_NAME=\"Ubuntu" /etc/os-release + if [ $? -eq 0 ]; then + if [ $VERSION_ID == "24.04" ]; then + echo "Container base is Ubuntu version ${VERSION_ID} as expected." + fi + else + echo "Container base is not Ubuntu." + exit 1 + fi + + echo -e -n "\n** Checking we have datasets installed **\n" + datasets --version + datasets --help From ad03293146869ff99e70218c5d8d3e4ab8476444 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:24:00 +0000 Subject: [PATCH 02/45] New assembly tracking sub module v1.0 --- pyproject.toml | 2 + .../ensembl/io/genomio/assembly/status.py | 458 ++++++++++++++++++ 2 files changed, 460 insertions(+) create mode 100644 src/python/ensembl/io/genomio/assembly/status.py diff --git a/pyproject.toml b/pyproject.toml index cf3af0879..a10b35568 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ "mysql-connector-python >= 8.0.29", "python-redmine >= 2.3.0", "requests >= 2.28.0", + "spython >= 0.3.13", ] [project.optional-dependencies] @@ -91,6 +92,7 @@ documentation = "https://ensembl.github.io/ensembl-genomio" [project.scripts] # Assembly assembly_download = "ensembl.io.genomio.assembly.download:main" +assembly_tracker = "ensembl.io.genomio.assembly.status:main" # Database database_factory = "ensembl.io.genomio.database.factory:main" # Events diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py new file mode 100644 index 000000000..997987b36 --- /dev/null +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -0,0 +1,458 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Record the assembly status for a set of INSDC accessions using ncbi 'datasets' tool""" + +__all__ = [ + "resolve_query_type", + "fetch_asm_accn", + "datasets_asm_reports", + "extract_assembly_metadata", + "generate_report_tsv", +] + +import csv +import json +import os +from os import PathLike, getcwd +from pathlib import Path +import re +import logging +from sys import exit +from typing import Dict, Tuple, Union + +from spython.main import Client + +from ensembl.io.genomio.utils.json_utils import print_json +from ensembl.database import DBConnection as dbc +from ensembl.utils.argparse import ArgumentParser +from ensembl.utils.logging import init_logging_with_args + +DATASETS_SINGULARITY = { + "datasets_version_url": "library://lcampbell/ensembl-genomio/ncbi-datasets-v16.10.0:latest", +} + + +class UnsupportedFormatError(Exception): + """When a string does not have the expected format.""" + + +class ReportStructure(dict): + """Dict setter class of key report meta information""" + + def __init__(self): + dict.__init__(self) + self.update( + { + "Species Name": "", + "Taxon ID": "", + "Strain": "", + "Isolate": "", + "Isolate/Strain": "", + "Asm name": "", + "Assembly type": "", + "Asm accession": "", + "Paired assembly": "", + "Asm last updated": "", + "Asm status": "", + "Asm notes": "", + } + ) + + +def resolve_query_type( + query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str +) -> Union[Tuple[Dict,str]]: + """Function to indentify the kind of querys being passed by user, + then extract the queries (core names or accesisons) and store each with appropriate identifier. + + Args: + query_list: List of user defined queries either core names, or accessions + input_cores: arg parse param '--input_cores' + input_accessions: arg parse param '--input_accns' + + Returns: + User queries stored as indentifier[(core db name | UniqueID#)] : accession + """ + + query_accessions: Dict = {} + query_type: str = "" + + if input_cores and input_accessions is None: + server_details = f"mysql://ensro@{host_server}:{host_port}/" ## Requires some more dev !! + query_accessions = fetch_asm_accn(query_list, server_details) + query_type = "CoreDB" + elif input_cores is None and input_accessions: + query_count = 1 + query_type = "Accession" + for accession in query_list: + match = re.match(r"(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.?([0-9]+)", accession) + if not match: + raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}") + else: + query_name = f"Query_#{query_count}" + query_count += 1 + query_accessions[query_name] = accession + + return query_accessions, query_type + + +def fetch_asm_accn(database_names: list, server_details: str) -> Dict: + """Obtain the associated INSDC accession [meta.assembly.accession] given a set of core(s) names + and a MYSQL server host. + + Args: + cores: Set of names for one or more core databases + server_details: MYSQL host server name and port [mysql-ens-(NAME:Port)] + + Returns: + Dict of core name(s) (key) and its INSDC assembly.accession (value) + """ + + core_accn_meta = {} + core_list_count = len(database_names) + count_accn_found = 0 + + for core in database_names: + db_connection_url = f"{server_details}{core}" + db_connection = dbc(f"{db_connection_url}") + qry_result = db_connection.execute( + 'SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";' + ).fetchall() + + if qry_result is None: + logging.warning(f"We have no accession on core: {core}") + elif len(qry_result) == 1: + count_accn_found += 1 + asm_accession = qry_result.pop()[0] + logging.info(f"{core} -> assembly.accession[{asm_accession}]") + core_accn_meta[core] = asm_accession + else: + logging.warning(f"Core {core} Has {len(qry_result)} assembly.accessions") + + logging.info(f"From initial input cores ({core_list_count}), obtained ({count_accn_found}) accessions") + + return core_accn_meta + + +def datasets_asm_reports( + sif_image: str, assembly_accessions: dict, download_directory: PathLike, batch_size: int +) -> Dict: + """Obtain multiple assembly report JSONs in one or more querys to datasets, + i.e. make individual since accn query to datasets tool. + + Args: + sif_image: Instance of Client.loaded singularity image. + assembly_accessions: Dict of core accessions. + download_directory: Dir path to store assembly report JSON files. + batch_size: Number of assembly accessions to batch submit to 'datasets'. + + Returns: + Dictionary of core name and its assoicated assembly report + """ + + master_accn_list = list(assembly_accessions.values()) + combined_asm_reports = {} + + # Setting the number of combined accessions to query in a single call to datasets + list_split = [i for i in range(0, len(master_accn_list), batch_size)] ## Note best to use >=10 + accn_subsample = [master_accn_list[ind : ind + batch_size] for ind in list_split] + + for accessions in accn_subsample: + datasets_command = ["datasets", "summary", "genome", "accession"] + accessions + + # Make call to singularity datasets providing a multi accn query: + client_return = Client.execute( + image=sif_image, command=datasets_command, return_result=True, quiet=True + ) + + result = client_return["message"] + + ## Test what result we have returned following execution of sif image and accession value + # Returned a str, i.e. no datasets result obtained exited with fatal error + if isinstance(result, str) and re.search("^FATAL", result): + logging.critical(f"Singularity image execution failed! -> '{result.strip()}'") + # Returned a list, i.e. datasets returned a result to client.execute + elif isinstance(result, str): + tmp_asm_dict = json.loads(result) + + if tmp_asm_dict["total_count"] == 0: + logging.warning(f"No assembly report found for accession(s) {accessions}") + elif tmp_asm_dict["reports"]: + logging.info(f"Asm report obtained for accession(s) [{accessions}]") + batch_reports_json = tmp_asm_dict["reports"] + for assembly_report in batch_reports_json: + accession = assembly_report["accession"] + asm_json_outfile = f"{download_directory}/{accession}.asm_report.json" + print_json(Path(asm_json_outfile), assembly_report) + + # Save assembly report into master core<>report dict + for core, accession_core in assembly_accessions.items(): + if accession == accession_core: + combined_asm_reports[core] = assembly_report + else: + logging.critical("Something is not right parsing 'datasets' results !") + else: + logging.critical( + f"Something not right while running datasets with singularity client.execute {result}" + ) + return combined_asm_reports + + +def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, ReportStructure]: + """ "Function to parse assembly reports and extract specific key information on + status and related fields. + + Args: + assembly_reports: Key value pair of core_name : assembly report. + + Returns: + Parsed assembly report meta (core, meta). + """ + parsed_meta = {} + + for core, asm_report in assembly_reports.items(): + asm_meta_info = ReportStructure() + + # Mandatory meta key parsing: + asm_meta_info["Asm accession"] = asm_report["accession"] + asm_meta_info["Asm name"] = asm_report["assembly_info"]["assembly_name"] + asm_meta_info["Assembly type"] = asm_report["assembly_info"]["assembly_type"] + asm_meta_info["Asm status"] = asm_report["assembly_info"]["assembly_status"] + asm_meta_info["Species Name"] = asm_report["organism"]["organism_name"] + asm_meta_info["Taxon ID"] = asm_report["organism"]["tax_id"] + + ## Non-mandatory meta key parsing: + # asm_meta_info["Asm last updated"] = asm_report["assembly_info"]["biosample"]["last_updated"] + assembly_meta_keys = asm_report["assembly_info"].keys() + organism_keys = asm_report["organism"].keys() + + # check for genome_notes: + if "genome_notes" in assembly_meta_keys: + complete_notes = ", ".join(asm_report["assembly_info"]["genome_notes"]) + asm_meta_info["Asm notes"] = complete_notes + else: + asm_meta_info["Asm notes"] = "NA" + + # check for biosample: + if "biosample" in assembly_meta_keys: + asm_meta_info["Asm last updated"] = asm_report["assembly_info"]["biosample"]["last_updated"] + else: + asm_meta_info["Asm last updated"] = "NA" + + # check for paired assembly: + if "paired_assembly" in assembly_meta_keys: + asm_meta_info["Paired assembly"] = asm_report["assembly_info"]["paired_assembly"]["accession"] + else: + asm_meta_info["Paired assembly"] = "NA" + + # check for isolate/strain type: + if "infraspecific_names" in organism_keys: + organism_type_keys = asm_report["organism"]["infraspecific_names"].keys() + if "isolate" in organism_type_keys: + asm_meta_info["Isolate"] = asm_report["organism"]["infraspecific_names"]["isolate"] + asm_meta_info.pop("Strain") + asm_meta_info.pop("Isolate/Strain") + elif "strain" in organism_type_keys: + asm_meta_info["Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] + asm_meta_info.pop("Isolate") + asm_meta_info.pop("Isolate/Strain") + else: + asm_meta_info["Isolate/Strain"] = "NA" + asm_meta_info.pop("Strain") + asm_meta_info.pop("Isolate") + else: + # elif ("strain" not in organism_type_keys) and ("isolate" not in organism_type_keys): + asm_meta_info["Isolate/Strain"] = "NA" + asm_meta_info.pop("Strain") + asm_meta_info.pop("Isolate") + + parsed_meta[core] = asm_meta_info + + return parsed_meta + + +def generate_report_tsv( + parsed_asm_reports: dict, outfile_prefix: str, query_type: str, output_directoy: PathLike = Path(getcwd()) +) -> None: + """Generate and write the assembly report to a TSV file + + Args: + parsed_asm_reports: Parsed assembly report meta + output_directoy: Path to directory where output TSV is stored. + """ + + tsv_outfile = f"{output_directoy}/{outfile_prefix}.tsv" + + header_list = list(ReportStructure().keys()) + header_list.remove("Strain") + header_list.remove("Isolate") + header_list = [query_type] + header_list + + with open(tsv_outfile, "w+") as tsv_out: + + writer = csv.writer(tsv_out, delimiter="\t", lineterminator="\n") + writer.writerow(header_list) + + for core, report_meta in parsed_asm_reports.items(): + final_asm_report = [core] + list(report_meta.values()) + writer.writerow(final_asm_report) + tsv_out.close() + + +# def classify_assembly_status(core_accessions: dict) -> None: +# """Main function to pare set of core list and call ncbi datasets""" + +def main() -> None: + """Module's entry-point.""" + parser = ArgumentParser( + description="Track the assembly status of a set of input core(s) using NCBI 'datasets'" + ) + parser.add_argument_src_path( + "--input_cores", + required=False, + default=None, + help="List of ensembl core db names to retrieve accessions", + ) + parser.add_argument_src_path( + "--input_accns", required=False, default=None, help="List of query assembly accessions" + ) + parser.add_argument_dst_path( + "--download_dir", + default="Assembly_report_jsons", + help="Folder where the assembly report JSON file(s) are stored", + ) + parser.add_argument_dst_path( + "--assembly_report_prefix", + default="AssemblyStatusReport", + help="Prefix used in assembly report TSV output file.", + ) + parser.add_argument( + "--host", + type=str, + required=False, + help="Server hostname (fmt: mysql-ens-XXXXX-YY); required with '--input_cores'", + ) + parser.add_argument( + "--port", type=str, required=False, help="Server port (fmt: 1234); required with '--input_cores'" + ) + parser.add_argument( + "--datasets_version_url", + type=str, + required=False, + metavar="URL", + help="Custom datasets version. E.g. library://lcampbell/ensembl-genomio/ncbi-datasets-v16.10.0:latest", + ) + parser.add_argument( + "--cache_dir", + type=Path, + required=False, + default="$NXF_SINGULARITY_CACHEDIR", + metavar="SINGULARITY_CACHE", + help="Custom path to user generated singularity container housing ncbi tool 'datasets'", + ) + parser.add_argument( + "--datasets_batch_size", + type=int, + required=False, + default=100, + metavar="BATCH_SIZE", + help="Number of accessions requested in one query to datasets", + ) + + parser.add_log_arguments(add_log_file=True) + args = parser.parse_args() + + init_logging_with_args(args) + + # Set and create dir for download files + if not args.download_dir.is_dir(): + args.download_dir.mkdir(parents=True) + + # Check for required input in the form of cores/accessions + if args.input_cores is None and args.input_accns is None: + logging.critical( + f"Did not detect user required input. Please specify option: '--input_cores' (core db names); OR '--input_accns' (INSDC accessions)." + ) + exit() + elif args.input_cores and args.input_accns: + logging.critical( + f"Detected both '--input_cores' AND '--input_accns' user inputs. Please provide just one such option." + ) + exit() + # Input core names centered run + elif args.input_cores and args.input_accns is None: + user_query_file = args.input_cores + logging.info(f"Performing assembly status report using core db list file: {user_query_file}") + if args.host is None or args.port is None: + print( + f"User must specify both arguments '--host' and '--port' when providing core database names. Exiting !" + ) + exit() + # Accession centered run + elif args.input_cores is None and args.input_accns: + user_query_file = args.input_accns + logging.info(f"Performing assembly status report using INSDC accession list file: {user_query_file}") + + ## Parse and store cores/accessions from user input query file + try: + with user_query_file.open(mode="r") as f: + query_list = f.read().splitlines() + except IOError as err: + logging.error(f"Unable to read user queries from inputfile '{user_query_file}' due to {err}.") + exit() + + # Set singularity cache dir from user defined path or use environment + if args.cache_dir and args.cache_dir.is_dir(): + image_dl_path = Path(args.cache_dir) + logging.info(f"Using user-defined cache_dir: '{image_dl_path}'") + elif os.environ.get("NXF_SINGULARITY_CACHEDIR"): + image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"]) + logging.info( + f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}" + ) + elif os.environ.get("SINGULARITY_CACHEDIR"): + image_dl_path = Path(os.environ["SINGULARITY_CACHEDIR"]) + logging.info( + f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}" + ) + else: + image_dl_path = Path() + logging.warning(f"Unable to set singularity cache dir properly, using CWD {image_dl_path}") + + # Set the datasets version URL + if args.datasets_version_url is None: + container_url = DATASETS_SINGULARITY["datasets_version_url"] + logging.info(f"Using default 'ncbi datasets' version '{container_url}'") + else: + container_url = args.datasets_version_url + logging.info(f"Using user defined 'ncbi datasets' version '{container_url}'") + + ## Get accessions on cores list or use user accession list directly + query_accessions, query_type = resolve_query_type( + query_list, args.host, args.port, args.input_cores, args.input_accns + ) + + # Pull or load pre-existing 'datasets' singularity container image. + datasets_image = Client.pull(container_url, stream=False, pull_folder=image_dl_path, quiet=True) + + # Datasets query implementation for one or more bacthed accessions + assembly_reports = datasets_asm_reports( + datasets_image, query_accessions, args.download_dir, args.datasets_batch_size + ) + + # Extract the key assembly report meta information for reporting status + key_asmreport_meta = extract_assembly_metadata(assembly_reports) + + generate_report_tsv(key_asmreport_meta, args.assembly_report_prefix, query_type, args.download_dir) From 33646d572703e8d7d209e1b06cb1ee20f8948fff Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:27:13 +0000 Subject: [PATCH 03/45] Black formatting --- src/python/ensembl/io/genomio/assembly/status.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 997987b36..52f4746d6 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -73,7 +73,7 @@ def __init__(self): def resolve_query_type( query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str -) -> Union[Tuple[Dict,str]]: +) -> Union[Tuple[Dict, str]]: """Function to indentify the kind of querys being passed by user, then extract the queries (core names or accesisons) and store each with appropriate identifier. @@ -314,6 +314,7 @@ def generate_report_tsv( # def classify_assembly_status(core_accessions: dict) -> None: # """Main function to pare set of core list and call ncbi datasets""" + def main() -> None: """Module's entry-point.""" parser = ArgumentParser( From 42b8d2608f42f31774d7ace3bcae7a61cd4b37d1 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:37:54 +0000 Subject: [PATCH 04/45] Fix pylint issues on line lengths --- src/python/ensembl/io/genomio/assembly/status.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 52f4746d6..26f958599 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -353,7 +353,7 @@ def main() -> None: type=str, required=False, metavar="URL", - help="Custom datasets version. E.g. library://lcampbell/ensembl-genomio/ncbi-datasets-v16.10.0:latest", + help="datasets version: E.g. library://lcampbell/ensembl-genomio/ncbi-datasets-v16.10.0:latest", ) parser.add_argument( "--cache_dir", @@ -384,12 +384,12 @@ def main() -> None: # Check for required input in the form of cores/accessions if args.input_cores is None and args.input_accns is None: logging.critical( - f"Did not detect user required input. Please specify option: '--input_cores' (core db names); OR '--input_accns' (INSDC accessions)." + "Missing required input: '--input_cores' (core db names) OR '--input_accns' (INSDC accessions)." ) exit() elif args.input_cores and args.input_accns: logging.critical( - f"Detected both '--input_cores' AND '--input_accns' user inputs. Please provide just one such option." + "Detected '--input_cores' AND '--input_accns'. Please provide just one such option." ) exit() # Input core names centered run @@ -397,8 +397,8 @@ def main() -> None: user_query_file = args.input_cores logging.info(f"Performing assembly status report using core db list file: {user_query_file}") if args.host is None or args.port is None: - print( - f"User must specify both arguments '--host' and '--port' when providing core database names. Exiting !" + logging.critical( + "User must specify both arguments '--host' and '--port' when providing core database names." ) exit() # Accession centered run From 9e2677458b229c1501c2b8db11557afb8c0ebd4d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:43:34 +0000 Subject: [PATCH 05/45] Code improvements, list-compre, unneeded else --- src/python/ensembl/io/genomio/assembly/status.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 26f958599..80994408f 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -100,10 +100,9 @@ def resolve_query_type( match = re.match(r"(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.?([0-9]+)", accession) if not match: raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}") - else: - query_name = f"Query_#{query_count}" - query_count += 1 - query_accessions[query_name] = accession + query_name = f"Query_#{query_count}" + query_count += 1 + query_accessions[query_name] = accession return query_accessions, query_type @@ -166,7 +165,7 @@ def datasets_asm_reports( combined_asm_reports = {} # Setting the number of combined accessions to query in a single call to datasets - list_split = [i for i in range(0, len(master_accn_list), batch_size)] ## Note best to use >=10 + list_split = list(range(0, len(master_accn_list), batch_size)) ## Note best to use >=10 accn_subsample = [master_accn_list[ind : ind + batch_size] for ind in list_split] for accessions in accn_subsample: From f11d4c871c583a2b05395eee8884ae0cbc70ef01 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 15:47:11 +0000 Subject: [PATCH 06/45] Update assembly module __init__.py --- src/python/ensembl/io/genomio/assembly/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/io/genomio/assembly/__init__.py b/src/python/ensembl/io/genomio/assembly/__init__.py index b138d9636..747530741 100644 --- a/src/python/ensembl/io/genomio/assembly/__init__.py +++ b/src/python/ensembl/io/genomio/assembly/__init__.py @@ -15,3 +15,4 @@ """Assembly preparation module.""" from .download import * +from .status import * From 2b53e735af93b1914a634ccfc3c52a186e81a5a3 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 16:27:40 +0000 Subject: [PATCH 07/45] Re-Blacked --- src/python/ensembl/io/genomio/assembly/status.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 80994408f..74299f484 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -165,7 +165,7 @@ def datasets_asm_reports( combined_asm_reports = {} # Setting the number of combined accessions to query in a single call to datasets - list_split = list(range(0, len(master_accn_list), batch_size)) ## Note best to use >=10 + list_split = list(range(0, len(master_accn_list), batch_size)) ## Note best to use >=10 accn_subsample = [master_accn_list[ind : ind + batch_size] for ind in list_split] for accessions in accn_subsample: @@ -300,7 +300,6 @@ def generate_report_tsv( header_list = [query_type] + header_list with open(tsv_outfile, "w+") as tsv_out: - writer = csv.writer(tsv_out, delimiter="\t", lineterminator="\n") writer.writerow(header_list) @@ -387,9 +386,7 @@ def main() -> None: ) exit() elif args.input_cores and args.input_accns: - logging.critical( - "Detected '--input_cores' AND '--input_accns'. Please provide just one such option." - ) + logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") exit() # Input core names centered run elif args.input_cores and args.input_accns is None: From e710518259b3e2c17bd5c45a70e21f1dba66ac10 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 17:01:52 +0000 Subject: [PATCH 08/45] Fixes and new function on input file param parsing --- .../ensembl/io/genomio/assembly/status.py | 66 ++++++++++++------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 74299f484..26f86d9d6 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -15,6 +15,7 @@ """Record the assembly status for a set of INSDC accessions using ncbi 'datasets' tool""" __all__ = [ + "examine_parameterization", "resolve_query_type", "fetch_asm_accn", "datasets_asm_reports", @@ -70,6 +71,47 @@ def __init__(self): } ) +def examine_parameterization(input_cores: PathLike, + input_accessions: PathLike, db_host: str, db_port: int + ) -> Path: + """Detect the kind of user input (cores/accessions) and determine any missing or + incorrect parameterization. + + Args: + input_cores: input core(s) list file name. + input_accessions: input accession (s) list file name. + db_host: host server name + db_port: host serer port + + Returns: + User input file used in assembly status querying + """ + user_query_file: Path + + # Check for required input in the form of cores/accessions + if input_cores is None and input_accessions is None: + logging.critical( + "Missing required input: '--input_cores' (core db names) OR '--input_accns' (INSDC accessions)." + ) + exit() + elif input_cores and input_accessions: + logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") + exit() + # Input core names centered run + elif input_cores and input_accessions is None: + user_query_file = input_cores + logging.info(f"Performing assembly status report using core db list file: {user_query_file}") + if db_host is None or db_port is None: + logging.critical( + "User must specify both arguments '--host' and '--port' when providing core database names." + ) + exit() + # Accession centered run + elif input_cores is None and input_accessions: + user_query_file = input_accessions + logging.info(f"Performing assembly status report using INSDC accession list file: {user_query_file}") + + return user_query_file def resolve_query_type( query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str @@ -379,28 +421,8 @@ def main() -> None: if not args.download_dir.is_dir(): args.download_dir.mkdir(parents=True) - # Check for required input in the form of cores/accessions - if args.input_cores is None and args.input_accns is None: - logging.critical( - "Missing required input: '--input_cores' (core db names) OR '--input_accns' (INSDC accessions)." - ) - exit() - elif args.input_cores and args.input_accns: - logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") - exit() - # Input core names centered run - elif args.input_cores and args.input_accns is None: - user_query_file = args.input_cores - logging.info(f"Performing assembly status report using core db list file: {user_query_file}") - if args.host is None or args.port is None: - logging.critical( - "User must specify both arguments '--host' and '--port' when providing core database names." - ) - exit() - # Accession centered run - elif args.input_cores is None and args.input_accns: - user_query_file = args.input_accns - logging.info(f"Performing assembly status report using INSDC accession list file: {user_query_file}") + # Set input file and determine if proper parameterization options are defined. + user_query_file = examine_parameterization(args.input_cores, args.input_accns, args.host, args.port) ## Parse and store cores/accessions from user input query file try: From a8ccc134e9a50005e2759b5eb909cc463e829dfd Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 17:17:00 +0000 Subject: [PATCH 09/45] Refactor - Too many nested blocks --- .../ensembl/io/genomio/assembly/status.py | 50 ++++++++----------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 26f86d9d6..a2a502e40 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -30,7 +30,7 @@ from pathlib import Path import re import logging -from sys import exit +import sys from typing import Dict, Tuple, Union from spython.main import Client @@ -93,10 +93,10 @@ def examine_parameterization(input_cores: PathLike, logging.critical( "Missing required input: '--input_cores' (core db names) OR '--input_accns' (INSDC accessions)." ) - exit() + sys.exit() elif input_cores and input_accessions: logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") - exit() + sys.exit() # Input core names centered run elif input_cores and input_accessions is None: user_query_file = input_cores @@ -105,7 +105,7 @@ def examine_parameterization(input_cores: PathLike, logging.critical( "User must specify both arguments '--host' and '--port' when providing core database names." ) - exit() + sys.exit() # Accession centered run elif input_cores is None and input_accessions: user_query_file = input_accessions @@ -225,29 +225,23 @@ def datasets_asm_reports( if isinstance(result, str) and re.search("^FATAL", result): logging.critical(f"Singularity image execution failed! -> '{result.strip()}'") # Returned a list, i.e. datasets returned a result to client.execute - elif isinstance(result, str): - tmp_asm_dict = json.loads(result) - - if tmp_asm_dict["total_count"] == 0: - logging.warning(f"No assembly report found for accession(s) {accessions}") - elif tmp_asm_dict["reports"]: - logging.info(f"Asm report obtained for accession(s) [{accessions}]") - batch_reports_json = tmp_asm_dict["reports"] - for assembly_report in batch_reports_json: - accession = assembly_report["accession"] - asm_json_outfile = f"{download_directory}/{accession}.asm_report.json" - print_json(Path(asm_json_outfile), assembly_report) - - # Save assembly report into master core<>report dict - for core, accession_core in assembly_accessions.items(): - if accession == accession_core: - combined_asm_reports[core] = assembly_report - else: - logging.critical("Something is not right parsing 'datasets' results !") - else: - logging.critical( - f"Something not right while running datasets with singularity client.execute {result}" - ) + + tmp_asm_dict = json.loads(result) + if isinstance(result, str) and tmp_asm_dict["total_count"] >= 1: + logging.info(f"Asm report obtained for accession(s) [{accessions}]") + + batch_reports_json = tmp_asm_dict["reports"] + for assembly_report in batch_reports_json: + accession = assembly_report["accession"] + asm_json_outfile = f"{download_directory}/{accession}.asm_report.json" + print_json(Path(asm_json_outfile), assembly_report) + # Save assembly report into master core<>report dict + for core, accession_core in assembly_accessions.items(): + if accession == accession_core: + combined_asm_reports[core] = assembly_report + elif isinstance(result, str) and tmp_asm_dict["total_count"] == 0: + logging.warning(f"No assembly report found for accession(s) {accessions}") + return combined_asm_reports @@ -430,7 +424,7 @@ def main() -> None: query_list = f.read().splitlines() except IOError as err: logging.error(f"Unable to read user queries from inputfile '{user_query_file}' due to {err}.") - exit() + sys.exit() # Set singularity cache dir from user defined path or use environment if args.cache_dir and args.cache_dir.is_dir(): From 27f4c4b9730a85dad76a03b50b8415b53fe29041 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Mon, 25 Mar 2024 17:20:07 +0000 Subject: [PATCH 10/45] Black formatting --- src/python/ensembl/io/genomio/assembly/status.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index a2a502e40..67fc9e942 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -71,10 +71,11 @@ def __init__(self): } ) -def examine_parameterization(input_cores: PathLike, - input_accessions: PathLike, db_host: str, db_port: int - ) -> Path: - """Detect the kind of user input (cores/accessions) and determine any missing or + +def examine_parameterization( + input_cores: PathLike, input_accessions: PathLike, db_host: str, db_port: int +) -> Path: + """Detect the kind of user input (cores/accessions) and determine any missing or incorrect parameterization. Args: @@ -113,6 +114,7 @@ def examine_parameterization(input_cores: PathLike, return user_query_file + def resolve_query_type( query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str ) -> Union[Tuple[Dict, str]]: From 4df0c5f182b564275ccf2df209d1bf20671a00d0 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:51:27 +0000 Subject: [PATCH 11/45] Update src/python/ensembl/io/genomio/assembly/status.py Code refinement, superflous condition Co-authored-by: Matthieu Barba --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 67fc9e942..60e75be77 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -99,7 +99,7 @@ def examine_parameterization( logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") sys.exit() # Input core names centered run - elif input_cores and input_accessions is None: + elif input_cores: user_query_file = input_cores logging.info(f"Performing assembly status report using core db list file: {user_query_file}") if db_host is None or db_port is None: From 557ecc9cc599ec0343b87e0374d89a3d70c47e3d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Tue, 26 Mar 2024 13:53:08 +0000 Subject: [PATCH 12/45] Update src/python/ensembl/io/genomio/assembly/status.py refinement on condition Co-authored-by: Matthieu Barba --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 60e75be77..5e1bb415c 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -108,7 +108,7 @@ def examine_parameterization( ) sys.exit() # Accession centered run - elif input_cores is None and input_accessions: + else: user_query_file = input_accessions logging.info(f"Performing assembly status report using INSDC accession list file: {user_query_file}") From 734db6da4dccff108f6d66759200fd1514433f76 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Tue, 26 Mar 2024 14:00:32 +0000 Subject: [PATCH 13/45] Update src/python/ensembl/io/genomio/assembly/status.py Fix typos Co-authored-by: Matthieu Barba --- src/python/ensembl/io/genomio/assembly/status.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 5e1bb415c..d4599e32a 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -118,8 +118,8 @@ def examine_parameterization( def resolve_query_type( query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str ) -> Union[Tuple[Dict, str]]: - """Function to indentify the kind of querys being passed by user, - then extract the queries (core names or accesisons) and store each with appropriate identifier. + """Function to identify the kind of queries being passed by user, + then extract the queries (core names or accessions) and store each with appropriate identifier. Args: query_list: List of user defined queries either core names, or accessions @@ -127,7 +127,7 @@ def resolve_query_type( input_accessions: arg parse param '--input_accns' Returns: - User queries stored as indentifier[(core db name | UniqueID#)] : accession + User queries stored as identifier[(core db name | UniqueID#)] : accession """ query_accessions: Dict = {} From 2d3e3778f299539bf01a7d8bb5f24dbcd2d4cc85 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Tue, 26 Mar 2024 14:37:52 +0000 Subject: [PATCH 14/45] Update src/python/ensembl/io/genomio/assembly/status.py typo fix Co-authored-by: Matthieu Barba --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index d4599e32a..9cd8784df 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -462,7 +462,7 @@ def main() -> None: # Pull or load pre-existing 'datasets' singularity container image. datasets_image = Client.pull(container_url, stream=False, pull_folder=image_dl_path, quiet=True) - # Datasets query implementation for one or more bacthed accessions + # Datasets query implementation for one or more batched accessions assembly_reports = datasets_asm_reports( datasets_image, query_accessions, args.download_dir, args.datasets_batch_size ) From 7437350dfd719539975e13e4cdd6b3f90bf4a400 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Tue, 26 Mar 2024 16:21:11 +0000 Subject: [PATCH 15/45] Vasily suggestion linebreaks on def file --- containers/ncbi_datasets_v16.10.0.def | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/containers/ncbi_datasets_v16.10.0.def b/containers/ncbi_datasets_v16.10.0.def index c277a3b74..7a7c0dae5 100644 --- a/containers/ncbi_datasets_v16.10.0.def +++ b/containers/ncbi_datasets_v16.10.0.def @@ -16,12 +16,17 @@ From: ubuntu:24.04 %post apt-get update && apt-get -y upgrade apt-get -y install \ + wget \ + unzip \ + procps \ + ca-certificates \ rm -rf /var/lib/apt/lists/* + apt-get clean #Installing ncbi datasets & dataformat From 3e5c85d815b28973469de94cf14c9232f5e271e0 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Tue, 26 Mar 2024 16:45:18 +0000 Subject: [PATCH 16/45] @MatBarba: Various code improvements --- .../ensembl/io/genomio/assembly/status.py | 189 ++++++++++-------- 1 file changed, 103 insertions(+), 86 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 9cd8784df..da3383eba 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -12,12 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Record the assembly status for a set of INSDC accessions using ncbi 'datasets' tool""" +"""Obtain and record the assembly 'status' for a set of INSDC accession(s) using NCBI 'datasets' tool.""" __all__ = [ - "examine_parameterization", + "singularity_image_setter", + "check_parameterization", "resolve_query_type", - "fetch_asm_accn", + "fetch_accessions_from_cores", "datasets_asm_reports", "extract_assembly_metadata", "generate_report_tsv", @@ -34,9 +35,10 @@ from typing import Dict, Tuple, Union from spython.main import Client +from sqlalchemy.engine import URL from ensembl.io.genomio.utils.json_utils import print_json -from ensembl.database import DBConnection as dbc +from ensembl.io.genomio.database.dbconnection_lite import DBConnectionLite as dbc from ensembl.utils.argparse import ArgumentParser from ensembl.utils.logging import init_logging_with_args @@ -72,9 +74,53 @@ def __init__(self): ) -def examine_parameterization( +def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Client: + """Parse ENV and User specified variables related to 'datasets' singularity SIF + container and define version and location of container. + + Args: + sif_cache_dir: Path to attempt locating/download SIF container image. + datasets_version: URL of singularity container (custom 'datasets' version if desired) + + Returns: + spython.main.client instance of singularity container image housing 'datasets' + """ + + # Set singularity cache dir from user defined path or use environment + if sif_cache_dir and sif_cache_dir.is_dir(): + image_dl_path = Path(sif_cache_dir) + logging.info(f"Using user-defined cache_dir: '{image_dl_path}'") + elif os.environ.get("NXF_SINGULARITY_CACHEDIR"): + image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"]) + logging.info( + f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}" + ) + elif os.environ.get("SINGULARITY_CACHEDIR"): + image_dl_path = Path(os.environ["SINGULARITY_CACHEDIR"]) + logging.info( + f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}" + ) + else: + image_dl_path = Path() + logging.warning(f"Unable to set singularity cache dir properly, using CWD {image_dl_path}") + + # Set the datasets version URL + if datasets_version is None: + container_url = DATASETS_SINGULARITY["datasets_version_url"] + logging.info(f"Using default 'ncbi datasets' version '{container_url}'") + else: + container_url = datasets_version + logging.info(f"Using user defined 'ncbi datasets' version '{container_url}'") + + # Pull or load pre-existing 'datasets' singularity container image. + datasets_image = Client.pull(container_url, stream=False, pull_folder=image_dl_path, quiet=True) + + return datasets_image + + +def check_parameterization( input_cores: PathLike, input_accessions: PathLike, db_host: str, db_port: int -) -> Path: +) -> PathLike: """Detect the kind of user input (cores/accessions) and determine any missing or incorrect parameterization. @@ -87,26 +133,17 @@ def examine_parameterization( Returns: User input file used in assembly status querying """ - user_query_file: Path + user_query_file: PathLike - # Check for required input in the form of cores/accessions - if input_cores is None and input_accessions is None: - logging.critical( - "Missing required input: '--input_cores' (core db names) OR '--input_accns' (INSDC accessions)." - ) - sys.exit() - elif input_cores and input_accessions: - logging.critical("Detected '--input_cores' AND '--input_accns'. Please provide just one such option.") - sys.exit() # Input core names centered run - elif input_cores: + if input_cores: user_query_file = input_cores logging.info(f"Performing assembly status report using core db list file: {user_query_file}") if db_host is None or db_port is None: logging.critical( "User must specify both arguments '--host' and '--port' when providing core database names." ) - sys.exit() + sys.exit(1) # Accession centered run else: user_query_file = input_accessions @@ -116,13 +153,14 @@ def examine_parameterization( def resolve_query_type( - query_list: list, host_server: str, host_port: str, input_cores: str, input_accessions: str + query_list: list, partial_url: URL, input_cores: str, input_accessions: str ) -> Union[Tuple[Dict, str]]: """Function to identify the kind of queries being passed by user, then extract the queries (core names or accessions) and store each with appropriate identifier. Args: query_list: List of user defined queries either core names, or accessions + partial_url: A partial MYSQL connection URL (host:port) input_cores: arg parse param '--input_cores' input_accessions: arg parse param '--input_accns' @@ -134,8 +172,7 @@ def resolve_query_type( query_type: str = "" if input_cores and input_accessions is None: - server_details = f"mysql://ensro@{host_server}:{host_port}/" ## Requires some more dev !! - query_accessions = fetch_asm_accn(query_list, server_details) + query_accessions = fetch_accessions_from_cores(query_list, partial_url) query_type = "CoreDB" elif input_cores is None and input_accessions: query_count = 1 @@ -151,14 +188,13 @@ def resolve_query_type( return query_accessions, query_type -def fetch_asm_accn(database_names: list, server_details: str) -> Dict: +def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Dict: """Obtain the associated INSDC accession [meta.assembly.accession] given a set of core(s) names and a MYSQL server host. Args: - cores: Set of names for one or more core databases - server_details: MYSQL host server name and port [mysql-ens-(NAME:Port)] - + database_names: Set of names for one or more core databases + connection_url: Partial MYSQL host name : port Returns: Dict of core name(s) (key) and its INSDC assembly.accession (value) """ @@ -168,7 +204,7 @@ def fetch_asm_accn(database_names: list, server_details: str) -> Dict: count_accn_found = 0 for core in database_names: - db_connection_url = f"{server_details}{core}" + db_connection_url = connection_url.set(database=core) db_connection = dbc(f"{db_connection_url}") qry_result = db_connection.execute( 'SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";' @@ -202,7 +238,7 @@ def datasets_asm_reports( batch_size: Number of assembly accessions to batch submit to 'datasets'. Returns: - Dictionary of core name and its assoicated assembly report + Dictionary of core name and its associated assembly report """ master_accn_list = list(assembly_accessions.values()) @@ -224,12 +260,15 @@ def datasets_asm_reports( ## Test what result we have returned following execution of sif image and accession value # Returned a str, i.e. no datasets result obtained exited with fatal error - if isinstance(result, str) and re.search("^FATAL", result): + if not isinstance(result, str): + raise ValueError("Result obtained from datasets is not the expected format 'string'") + if re.search("^FATAL", result): logging.critical(f"Singularity image execution failed! -> '{result.strip()}'") + sys.exit(1) # Returned a list, i.e. datasets returned a result to client.execute tmp_asm_dict = json.loads(result) - if isinstance(result, str) and tmp_asm_dict["total_count"] >= 1: + if tmp_asm_dict["total_count"] >= 1: logging.info(f"Asm report obtained for accession(s) [{accessions}]") batch_reports_json = tmp_asm_dict["reports"] @@ -241,8 +280,9 @@ def datasets_asm_reports( for core, accession_core in assembly_accessions.items(): if accession == accession_core: combined_asm_reports[core] = assembly_report - elif isinstance(result, str) and tmp_asm_dict["total_count"] == 0: - logging.warning(f"No assembly report found for accession(s) {accessions}") + else: + logging.warning(f"No assembly report found for accession(s) {accessions}. Exiting !") + sys.exit(0) return combined_asm_reports @@ -271,7 +311,6 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re asm_meta_info["Taxon ID"] = asm_report["organism"]["tax_id"] ## Non-mandatory meta key parsing: - # asm_meta_info["Asm last updated"] = asm_report["assembly_info"]["biosample"]["last_updated"] assembly_meta_keys = asm_report["assembly_info"].keys() organism_keys = asm_report["organism"].keys() @@ -300,19 +339,11 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re if "isolate" in organism_type_keys: asm_meta_info["Isolate"] = asm_report["organism"]["infraspecific_names"]["isolate"] asm_meta_info.pop("Strain") - asm_meta_info.pop("Isolate/Strain") elif "strain" in organism_type_keys: asm_meta_info["Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] asm_meta_info.pop("Isolate") - asm_meta_info.pop("Isolate/Strain") - else: - asm_meta_info["Isolate/Strain"] = "NA" - asm_meta_info.pop("Strain") - asm_meta_info.pop("Isolate") else: - # elif ("strain" not in organism_type_keys) and ("isolate" not in organism_type_keys): - asm_meta_info["Isolate/Strain"] = "NA" - asm_meta_info.pop("Strain") + asm_meta_info["Strain"] = "NA" asm_meta_info.pop("Isolate") parsed_meta[core] = asm_meta_info @@ -321,16 +352,19 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re def generate_report_tsv( - parsed_asm_reports: dict, outfile_prefix: str, query_type: str, output_directoy: PathLike = Path(getcwd()) + parsed_asm_reports: dict, + outfile_prefix: str, + query_type: str, + output_directory: PathLike = Path(getcwd()), ) -> None: """Generate and write the assembly report to a TSV file Args: parsed_asm_reports: Parsed assembly report meta - output_directoy: Path to directory where output TSV is stored. + output_directory: Path to directory where output TSV is stored. """ - tsv_outfile = f"{output_directoy}/{outfile_prefix}.tsv" + tsv_outfile = f"{output_directory}/{outfile_prefix}.tsv" header_list = list(ReportStructure().keys()) header_list.remove("Strain") @@ -356,14 +390,20 @@ def main() -> None: parser = ArgumentParser( description="Track the assembly status of a set of input core(s) using NCBI 'datasets'" ) - parser.add_argument_src_path( + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument( "--input_cores", + type=Path, required=False, default=None, - help="List of ensembl core db names to retrieve accessions", + help="List of ensembl core database(s) names to retrieve query accessions", ) - parser.add_argument_src_path( - "--input_accns", required=False, default=None, help="List of query assembly accessions" + input_group.add_argument( + "--input_accessions", + type=Path, + required=False, + default=None, + help="List of assembly INSDC query accessions", ) parser.add_argument_dst_path( "--download_dir", @@ -418,56 +458,33 @@ def main() -> None: args.download_dir.mkdir(parents=True) # Set input file and determine if proper parameterization options are defined. - user_query_file = examine_parameterization(args.input_cores, args.input_accns, args.host, args.port) + user_query_file = check_parameterization(args.input_cores, args.input_accessions, args.host, args.port) ## Parse and store cores/accessions from user input query file - try: - with user_query_file.open(mode="r") as f: - query_list = f.read().splitlines() - except IOError as err: - logging.error(f"Unable to read user queries from inputfile '{user_query_file}' due to {err}.") - sys.exit() + with user_query_file.open(mode="r") as f: + query_list = f.read().splitlines() - # Set singularity cache dir from user defined path or use environment - if args.cache_dir and args.cache_dir.is_dir(): - image_dl_path = Path(args.cache_dir) - logging.info(f"Using user-defined cache_dir: '{image_dl_path}'") - elif os.environ.get("NXF_SINGULARITY_CACHEDIR"): - image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"]) - logging.info( - f"Using preferred nextflow singularity cache dir 'NXF_SINGULARITY_CACHEDIR': {image_dl_path}" - ) - elif os.environ.get("SINGULARITY_CACHEDIR"): - image_dl_path = Path(os.environ["SINGULARITY_CACHEDIR"]) - logging.info( - f"Using the default singularity installation cache dir 'SINGULARITY_CACHEDIR': {image_dl_path}" - ) - else: - image_dl_path = Path() - logging.warning(f"Unable to set singularity cache dir properly, using CWD {image_dl_path}") - - # Set the datasets version URL - if args.datasets_version_url is None: - container_url = DATASETS_SINGULARITY["datasets_version_url"] - logging.info(f"Using default 'ncbi datasets' version '{container_url}'") - else: - container_url = args.datasets_version_url - logging.info(f"Using user defined 'ncbi datasets' version '{container_url}'") + ## Parse singularity setting and define the SIF image for 'datasets' + datasets_image = singularity_image_setter(args.cache_dir, args.datasets_version_url) ## Get accessions on cores list or use user accession list directly + connection_url = URL.create( + "mysql", + host=args.host, + port=args.port, + username="ensro", + ) query_accessions, query_type = resolve_query_type( - query_list, args.host, args.port, args.input_cores, args.input_accns + query_list, connection_url, args.input_cores, args.input_accessions ) - # Pull or load pre-existing 'datasets' singularity container image. - datasets_image = Client.pull(container_url, stream=False, pull_folder=image_dl_path, quiet=True) - # Datasets query implementation for one or more batched accessions assembly_reports = datasets_asm_reports( datasets_image, query_accessions, args.download_dir, args.datasets_batch_size ) # Extract the key assembly report meta information for reporting status - key_asmreport_meta = extract_assembly_metadata(assembly_reports) + key_assembly_report_meta = extract_assembly_metadata(assembly_reports) - generate_report_tsv(key_asmreport_meta, args.assembly_report_prefix, query_type, args.download_dir) + # Produce the finalized assembly status report TSV from set of parsed 'datasets summary report' + generate_report_tsv(key_assembly_report_meta, args.assembly_report_prefix, query_type, args.download_dir) From 331029e948343f7c41daf354a5ae60326360e924 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 27 Mar 2024 10:27:38 +0000 Subject: [PATCH 17/45] Change accession query to accn, revert extract_assembly_metadata --- .../ensembl/io/genomio/assembly/status.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index da3383eba..b379f990a 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -119,8 +119,8 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie def check_parameterization( - input_cores: PathLike, input_accessions: PathLike, db_host: str, db_port: int -) -> PathLike: + input_cores: Path, input_accessions: Path, db_host: str, db_port: int +) -> Path: """Detect the kind of user input (cores/accessions) and determine any missing or incorrect parameterization. @@ -133,7 +133,7 @@ def check_parameterization( Returns: User input file used in assembly status querying """ - user_query_file: PathLike + user_query_file: Path # Input core names centered run if input_cores: @@ -175,15 +175,15 @@ def resolve_query_type( query_accessions = fetch_accessions_from_cores(query_list, partial_url) query_type = "CoreDB" elif input_cores is None and input_accessions: - query_count = 1 + # query_count = 1 query_type = "Accession" for accession in query_list: match = re.match(r"(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.?([0-9]+)", accession) if not match: raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}") - query_name = f"Query_#{query_count}" - query_count += 1 - query_accessions[query_name] = accession + # query_name = f"Query_#{query_count}" + # query_count += 1 + query_accessions[accession] = accession return query_accessions, query_type @@ -339,11 +339,19 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re if "isolate" in organism_type_keys: asm_meta_info["Isolate"] = asm_report["organism"]["infraspecific_names"]["isolate"] asm_meta_info.pop("Strain") + asm_meta_info.pop("Isolate/Strain") elif "strain" in organism_type_keys: asm_meta_info["Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] asm_meta_info.pop("Isolate") + asm_meta_info.pop("Isolate/Strain") + else: + asm_meta_info["Isolate/Strain"] = "NA" + asm_meta_info.pop("Strain") + asm_meta_info.pop("Isolate") else: - asm_meta_info["Strain"] = "NA" + # elif ("strain" not in organism_type_keys) and ("isolate" not in organism_type_keys): + asm_meta_info["Isolate/Strain"] = "NA" + asm_meta_info.pop("Strain") asm_meta_info.pop("Isolate") parsed_meta[core] = asm_meta_info From e81895753e17fb1d7378aada8e47f35bd0f96f6d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 27 Mar 2024 11:17:03 +0000 Subject: [PATCH 18/45] Simplified strain/isolate logic + report dict --- .../ensembl/io/genomio/assembly/status.py | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index b379f990a..8953ad07d 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -60,8 +60,6 @@ def __init__(self): { "Species Name": "", "Taxon ID": "", - "Strain": "", - "Isolate": "", "Isolate/Strain": "", "Asm name": "", "Assembly type": "", @@ -118,9 +116,7 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie return datasets_image -def check_parameterization( - input_cores: Path, input_accessions: Path, db_host: str, db_port: int -) -> Path: +def check_parameterization(input_cores: Path, input_accessions: Path, db_host: str, db_port: int) -> Path: """Detect the kind of user input (cores/accessions) and determine any missing or incorrect parameterization. @@ -175,14 +171,11 @@ def resolve_query_type( query_accessions = fetch_accessions_from_cores(query_list, partial_url) query_type = "CoreDB" elif input_cores is None and input_accessions: - # query_count = 1 query_type = "Accession" for accession in query_list: match = re.match(r"(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.?([0-9]+)", accession) if not match: raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}") - # query_name = f"Query_#{query_count}" - # query_count += 1 query_accessions[accession] = accession return query_accessions, query_type @@ -337,22 +330,13 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re if "infraspecific_names" in organism_keys: organism_type_keys = asm_report["organism"]["infraspecific_names"].keys() if "isolate" in organism_type_keys: - asm_meta_info["Isolate"] = asm_report["organism"]["infraspecific_names"]["isolate"] - asm_meta_info.pop("Strain") - asm_meta_info.pop("Isolate/Strain") + asm_meta_info["Isolate/Strain"] = asm_report["organism"]["infraspecific_names"]["isolate"] elif "strain" in organism_type_keys: - asm_meta_info["Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] - asm_meta_info.pop("Isolate") - asm_meta_info.pop("Isolate/Strain") + asm_meta_info["Isolate/Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] else: asm_meta_info["Isolate/Strain"] = "NA" - asm_meta_info.pop("Strain") - asm_meta_info.pop("Isolate") else: - # elif ("strain" not in organism_type_keys) and ("isolate" not in organism_type_keys): asm_meta_info["Isolate/Strain"] = "NA" - asm_meta_info.pop("Strain") - asm_meta_info.pop("Isolate") parsed_meta[core] = asm_meta_info @@ -375,8 +359,8 @@ def generate_report_tsv( tsv_outfile = f"{output_directory}/{outfile_prefix}.tsv" header_list = list(ReportStructure().keys()) - header_list.remove("Strain") - header_list.remove("Isolate") + # header_list.remove("Strain") + # header_list.remove("Isolate") header_list = [query_type] + header_list with open(tsv_outfile, "w+") as tsv_out: From e4a9416f021c48ba76072870d1521225cd0024b2 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Tue, 2 Apr 2024 10:49:19 +0100 Subject: [PATCH 19/45] Apply suggestions from code review Code changes/fixes from @jalvarez Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 8953ad07d..ba3426701 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -281,7 +281,7 @@ def datasets_asm_reports( def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, ReportStructure]: - """ "Function to parse assembly reports and extract specific key information on + """Function to parse assembly reports and extract specific key information on status and related fields. Args: From c06ad7d3359d68e399019b3e7781e3a27afc3355 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 27 Mar 2024 17:03:51 +0000 Subject: [PATCH 20/45] Clean up commented code, add successful exit --- src/python/ensembl/io/genomio/assembly/status.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index ba3426701..97f8ee07a 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -359,8 +359,6 @@ def generate_report_tsv( tsv_outfile = f"{output_directory}/{outfile_prefix}.tsv" header_list = list(ReportStructure().keys()) - # header_list.remove("Strain") - # header_list.remove("Isolate") header_list = [query_type] + header_list with open(tsv_outfile, "w+") as tsv_out: @@ -480,3 +478,5 @@ def main() -> None: # Produce the finalized assembly status report TSV from set of parsed 'datasets summary report' generate_report_tsv(key_assembly_report_meta, args.assembly_report_prefix, query_type, args.download_dir) + + sys.exit(0) From 7173a548b94b726d68c05cff600c1aadcb534bd5 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Tue, 2 Apr 2024 10:41:11 +0100 Subject: [PATCH 21/45] Add license header to datasets singularity def file --- containers/ncbi_datasets_v16.10.0.def | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/containers/ncbi_datasets_v16.10.0.def b/containers/ncbi_datasets_v16.10.0.def index 7a7c0dae5..ec289236c 100644 --- a/containers/ncbi_datasets_v16.10.0.def +++ b/containers/ncbi_datasets_v16.10.0.def @@ -1,3 +1,18 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + Bootstrap: docker From: ubuntu:24.04 @@ -16,19 +31,14 @@ From: ubuntu:24.04 %post apt-get update && apt-get -y upgrade apt-get -y install \ - wget \ - unzip \ - procps \ - ca-certificates \ - + rm -rf /var/lib/apt/lists/* - apt-get clean - + #Installing ncbi datasets & dataformat cd /usr/local/bin/ && \ wget https://github.com/ncbi/datasets/releases/download/v16.10.0/linux-amd64.cli.package.zip && \ @@ -52,4 +62,4 @@ From: ubuntu:24.04 echo -e -n "\n** Checking we have datasets installed **\n" datasets --version - datasets --help + datasets --help \ No newline at end of file From 2c8027af4076a6f780e3d3c8a02cd31e6925e4a9 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Tue, 2 Apr 2024 10:57:57 +0100 Subject: [PATCH 22/45] Updates requested by Jorge --- src/python/ensembl/io/genomio/assembly/status.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 97f8ee07a..a5bd9e281 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -27,12 +27,12 @@ import csv import json import os -from os import PathLike, getcwd -from pathlib import Path import re import logging import sys from typing import Dict, Tuple, Union +from pathlib import Path +from os import PathLike from spython.main import Client from sqlalchemy.engine import URL @@ -42,6 +42,7 @@ from ensembl.utils.argparse import ArgumentParser from ensembl.utils.logging import init_logging_with_args + DATASETS_SINGULARITY = { "datasets_version_url": "library://lcampbell/ensembl-genomio/ncbi-datasets-v16.10.0:latest", } @@ -81,7 +82,7 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie datasets_version: URL of singularity container (custom 'datasets' version if desired) Returns: - spython.main.client instance of singularity container image housing 'datasets' + `spython.main.client` instance of singularity container image housing 'datasets'. """ # Set singularity cache dir from user defined path or use environment @@ -347,7 +348,7 @@ def generate_report_tsv( parsed_asm_reports: dict, outfile_prefix: str, query_type: str, - output_directory: PathLike = Path(getcwd()), + output_directory: PathLike = Path(), ) -> None: """Generate and write the assembly report to a TSV file @@ -371,10 +372,6 @@ def generate_report_tsv( tsv_out.close() -# def classify_assembly_status(core_accessions: dict) -> None: -# """Main function to pare set of core list and call ncbi datasets""" - - def main() -> None: """Module's entry-point.""" parser = ArgumentParser( @@ -444,8 +441,7 @@ def main() -> None: init_logging_with_args(args) # Set and create dir for download files - if not args.download_dir.is_dir(): - args.download_dir.mkdir(parents=True) + args.download_dir.mkdir(parents=True, exist_ok=True) # Set input file and determine if proper parameterization options are defined. user_query_file = check_parameterization(args.input_cores, args.input_accessions, args.host, args.port) From 1c528e7ed27195a75fe4863ba23fe62afbe27efe Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Tue, 2 Apr 2024 10:58:39 +0100 Subject: [PATCH 23/45] Replace personal email with metazoa email --- containers/ncbi_datasets_v16.10.0.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/containers/ncbi_datasets_v16.10.0.def b/containers/ncbi_datasets_v16.10.0.def index ec289236c..88b5c3d9a 100644 --- a/containers/ncbi_datasets_v16.10.0.def +++ b/containers/ncbi_datasets_v16.10.0.def @@ -22,7 +22,7 @@ From: ubuntu:24.04 export LC_ALL=C %labels - Author lcampbell@ebi.ac.uk + Author ensembl-metazoa@ebi.ac.uk Software "NCBI's datasets and dataformat" Software.version v16.10.0 Software.website "https://github.com/ncbi/datasets/releases/tag/v16.10.0" From 71e49e30917b0bc0265017de67a9771f662486d6 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 10 Apr 2024 09:26:55 +0100 Subject: [PATCH 24/45] Jorge round of suggesstions/tweaks --- .../ensembl/io/genomio/assembly/status.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index a5bd9e281..ec0967163 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -12,27 +12,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Obtain and record the assembly 'status' for a set of INSDC accession(s) using NCBI 'datasets' tool.""" +"""Obtain and record the assembly status for a set of INSDC accession(s) using NCBI's datasets CLI tool.""" __all__ = [ - "singularity_image_setter", "check_parameterization", - "resolve_query_type", - "fetch_accessions_from_cores", "datasets_asm_reports", "extract_assembly_metadata", + "fetch_accessions_from_cores", "generate_report_tsv", + "resolve_query_type", + "singularity_image_setter", ] import csv import json +import logging import os +from os import PathLike +from pathlib import Path import re -import logging import sys from typing import Dict, Tuple, Union -from pathlib import Path -from os import PathLike from spython.main import Client from sqlalchemy.engine import URL @@ -87,7 +87,7 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie # Set singularity cache dir from user defined path or use environment if sif_cache_dir and sif_cache_dir.is_dir(): - image_dl_path = Path(sif_cache_dir) + image_dl_path = sif_cache_dir logging.info(f"Using user-defined cache_dir: '{image_dl_path}'") elif os.environ.get("NXF_SINGULARITY_CACHEDIR"): image_dl_path = Path(os.environ["NXF_SINGULARITY_CACHEDIR"]) @@ -125,7 +125,7 @@ def check_parameterization(input_cores: Path, input_accessions: Path, db_host: s input_cores: input core(s) list file name. input_accessions: input accession (s) list file name. db_host: host server name - db_port: host serer port + db_port: host server port Returns: User input file used in assembly status querying @@ -355,9 +355,14 @@ def generate_report_tsv( Args: parsed_asm_reports: Parsed assembly report meta output_directory: Path to directory where output TSV is stored. + query_type: Type of query core_db or accession + output_directory: Directory to store report TSV """ - tsv_outfile = f"{output_directory}/{outfile_prefix}.tsv" + if not parsed_asm_reports: + return + + tsv_outfile = Path(output_directory, f"{outfile_prefix}.tsv") header_list = list(ReportStructure().keys()) header_list = [query_type] + header_list @@ -375,7 +380,7 @@ def generate_report_tsv( def main() -> None: """Module's entry-point.""" parser = ArgumentParser( - description="Track the assembly status of a set of input core(s) using NCBI 'datasets'" + description=__doc__ ) input_group = parser.add_mutually_exclusive_group(required=True) input_group.add_argument( @@ -475,4 +480,3 @@ def main() -> None: # Produce the finalized assembly status report TSV from set of parsed 'datasets summary report' generate_report_tsv(key_assembly_report_meta, args.assembly_report_prefix, query_type, args.download_dir) - sys.exit(0) From 1ac417a51f5952626f842b63837b822e915a8e80 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 10 Apr 2024 10:20:29 +0100 Subject: [PATCH 25/45] black and pylint --- src/python/ensembl/io/genomio/assembly/status.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index ec0967163..0b6a3c38b 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -379,9 +379,7 @@ def generate_report_tsv( def main() -> None: """Module's entry-point.""" - parser = ArgumentParser( - description=__doc__ - ) + parser = ArgumentParser(description=__doc__) input_group = parser.add_mutually_exclusive_group(required=True) input_group.add_argument( "--input_cores", @@ -479,4 +477,3 @@ def main() -> None: # Produce the finalized assembly status report TSV from set of parsed 'datasets summary report' generate_report_tsv(key_assembly_report_meta, args.assembly_report_prefix, query_type, args.download_dir) - From df3a254a9d3d4f7b18dc11c8813c5aafaa91562c Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:31:58 +0100 Subject: [PATCH 26/45] Simplify parameterisation check Co-authored-by: J. Alvarez-Jarreta --- .../ensembl/io/genomio/assembly/status.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 0b6a3c38b..e37dc6494 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -130,23 +130,15 @@ def check_parameterization(input_cores: Path, input_accessions: Path, db_host: s Returns: User input file used in assembly status querying """ - user_query_file: Path - # Input core names centered run if input_cores: - user_query_file = input_cores - logging.info(f"Performing assembly status report using core db list file: {user_query_file}") + logging.info(f"Performing assembly status report using core db list file: {input_cores}") if db_host is None or db_port is None: - logging.critical( - "User must specify both arguments '--host' and '--port' when providing core database names." - ) - sys.exit(1) + raise RuntimeError("Core database names require both arguments '--host' and '--port'") + return input_cores # Accession centered run - else: - user_query_file = input_accessions - logging.info(f"Performing assembly status report using INSDC accession list file: {user_query_file}") - - return user_query_file + logging.info(f"Performing assembly status report using INSDC accession list file: {input_accessions}") + return input_accessions def resolve_query_type( From d4bb4307a12805a9d418a2d0fcc9fc5c94d37c76 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:33:46 +0100 Subject: [PATCH 27/45] Update src/python/ensembl/io/genomio/assembly/status.py add newline Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index e37dc6494..c84b759ea 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -181,6 +181,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di Args: database_names: Set of names for one or more core databases connection_url: Partial MYSQL host name : port + Returns: Dict of core name(s) (key) and its INSDC assembly.accession (value) """ From c35b5d118a99734e8ce7ed281c105f8dacf3b63e Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:35:47 +0100 Subject: [PATCH 28/45] Update src/python/ensembl/io/genomio/assembly/status.py change shorthand qry -> query Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index c84b759ea..2b1ce9afc 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -193,7 +193,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di for core in database_names: db_connection_url = connection_url.set(database=core) db_connection = dbc(f"{db_connection_url}") - qry_result = db_connection.execute( + query_result = db_connection.execute( 'SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";' ).fetchall() From 2202d60f8bb2d5504a153bd038ba81293046d9f6 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:36:08 +0100 Subject: [PATCH 29/45] Update src/python/ensembl/io/genomio/assembly/status.py qry -> query Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 2b1ce9afc..575afad59 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -197,7 +197,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di 'SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";' ).fetchall() - if qry_result is None: + if query_result is None: logging.warning(f"We have no accession on core: {core}") elif len(qry_result) == 1: count_accn_found += 1 From 9e127c333617432ae95f6f1c0fd511a7b0326254 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:36:31 +0100 Subject: [PATCH 30/45] Update src/python/ensembl/io/genomio/assembly/status.py qry->query Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 575afad59..52f14ad6d 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -199,7 +199,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di if query_result is None: logging.warning(f"We have no accession on core: {core}") - elif len(qry_result) == 1: + elif len(query_result) == 1: count_accn_found += 1 asm_accession = qry_result.pop()[0] logging.info(f"{core} -> assembly.accession[{asm_accession}]") From a58ebe393fe6f7ccca7c32647ab8c927067876c6 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:37:07 +0100 Subject: [PATCH 31/45] Update src/python/ensembl/io/genomio/assembly/status.py capitalisation to formated string Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 52f14ad6d..7873f7094 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -205,7 +205,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di logging.info(f"{core} -> assembly.accession[{asm_accession}]") core_accn_meta[core] = asm_accession else: - logging.warning(f"Core {core} Has {len(qry_result)} assembly.accessions") + logging.warning(f"Core {core} has {len(query_result)} assembly.accessions") logging.info(f"From initial input cores ({core_list_count}), obtained ({count_accn_found}) accessions") From d93dc7155a9acf2bb4ef546d7141273714e94210 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:37:48 +0100 Subject: [PATCH 32/45] Update src/python/ensembl/io/genomio/assembly/status.py Warning message improvement Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 7873f7094..4f473be08 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -198,7 +198,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di ).fetchall() if query_result is None: - logging.warning(f"We have no accession on core: {core}") + logging.warning(f"No accessions found in core: {core}") elif len(query_result) == 1: count_accn_found += 1 asm_accession = qry_result.pop()[0] From 1312ff205e05e67a0474933fed3ab76f52421280 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:41:28 +0100 Subject: [PATCH 33/45] Update src/python/ensembl/io/genomio/assembly/status.py Remove batch size comment, not needed default set Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 4f473be08..a85e214ca 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -232,7 +232,7 @@ def datasets_asm_reports( combined_asm_reports = {} # Setting the number of combined accessions to query in a single call to datasets - list_split = list(range(0, len(master_accn_list), batch_size)) ## Note best to use >=10 + list_split = list(range(0, len(master_accn_list), batch_size)) accn_subsample = [master_accn_list[ind : ind + batch_size] for ind in list_split] for accessions in accn_subsample: From 255f08067b944feb80234a08adf62dfa12970d98 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:41:59 +0100 Subject: [PATCH 34/45] Update src/python/ensembl/io/genomio/assembly/status.py shorthand wording fix Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index a85e214ca..056846005 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -238,7 +238,7 @@ def datasets_asm_reports( for accessions in accn_subsample: datasets_command = ["datasets", "summary", "genome", "accession"] + accessions - # Make call to singularity datasets providing a multi accn query: + # Make call to singularity datasets providing a multi-accession query: client_return = Client.execute( image=sif_image, command=datasets_command, return_result=True, quiet=True ) From 9da00a763bd74f6d0976a8edaca390f7584a2425 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:42:32 +0100 Subject: [PATCH 35/45] Update src/python/ensembl/io/genomio/assembly/status.py wording change Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 056846005..10505278f 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -245,7 +245,7 @@ def datasets_asm_reports( result = client_return["message"] - ## Test what result we have returned following execution of sif image and accession value + ## Test what result we have obtained following execution of sif image and accession value # Returned a str, i.e. no datasets result obtained exited with fatal error if not isinstance(result, str): raise ValueError("Result obtained from datasets is not the expected format 'string'") From 437a6747c1d503f16274550f6edcd03e88a8c24f Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:44:08 +0100 Subject: [PATCH 36/45] Update src/python/ensembl/io/genomio/assembly/status.py fix to formatted string and list parentheses Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 10505278f..806b31d9b 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -256,7 +256,7 @@ def datasets_asm_reports( tmp_asm_dict = json.loads(result) if tmp_asm_dict["total_count"] >= 1: - logging.info(f"Asm report obtained for accession(s) [{accessions}]") + logging.info(f"Assembly report obtained for accession(s) {accessions}") batch_reports_json = tmp_asm_dict["reports"] for assembly_report in batch_reports_json: From 1a7d88a19aef4d51702bface5752d7796297c872 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:44:48 +0100 Subject: [PATCH 37/45] Update src/python/ensembl/io/genomio/assembly/status.py runtime error instead of critical logging Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 806b31d9b..79c5e47b9 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -250,8 +250,7 @@ def datasets_asm_reports( if not isinstance(result, str): raise ValueError("Result obtained from datasets is not the expected format 'string'") if re.search("^FATAL", result): - logging.critical(f"Singularity image execution failed! -> '{result.strip()}'") - sys.exit(1) + raise RuntimeError(f"Singularity image execution failed! -> '{result.strip()}'") # Returned a list, i.e. datasets returned a result to client.execute tmp_asm_dict = json.loads(result) From 711a68498812a90be356ac45cbb0869d6cf5b97b Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:45:44 +0100 Subject: [PATCH 38/45] Update src/python/ensembl/io/genomio/assembly/status.py Improvement to print_json call() Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 79c5e47b9..b34ecd449 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -260,8 +260,8 @@ def datasets_asm_reports( batch_reports_json = tmp_asm_dict["reports"] for assembly_report in batch_reports_json: accession = assembly_report["accession"] - asm_json_outfile = f"{download_directory}/{accession}.asm_report.json" - print_json(Path(asm_json_outfile), assembly_report) + asm_json_outfile = Path(download_directory, f"/{accession}.asm_report.json") + print_json(asm_json_outfile, assembly_report) # Save assembly report into master core<>report dict for core, accession_core in assembly_accessions.items(): if accession == accession_core: From 1a6617ef50b0acb8c2d3929d44aabb6ff3e3fda5 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:46:19 +0100 Subject: [PATCH 39/45] Update src/python/ensembl/io/genomio/assembly/status.py shorthand wording fix Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index b34ecd449..cbb7384cd 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -201,7 +201,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di logging.warning(f"No accessions found in core: {core}") elif len(query_result) == 1: count_accn_found += 1 - asm_accession = qry_result.pop()[0] + asm_accession = query_result.pop()[0] logging.info(f"{core} -> assembly.accession[{asm_accession}]") core_accn_meta[core] = asm_accession else: From 70dfed0edc0124d3d8d75c86df2024243b4c90a7 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:49:12 +0100 Subject: [PATCH 40/45] Update src/python/ensembl/io/genomio/assembly/status.py allow empty dict and check in TSV creation instead Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index cbb7384cd..102218649 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -268,7 +268,6 @@ def datasets_asm_reports( combined_asm_reports[core] = assembly_report else: logging.warning(f"No assembly report found for accession(s) {accessions}. Exiting !") - sys.exit(0) return combined_asm_reports From 3e947f6a81560c5b6a6619bdb51b71512c068a2d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:50:37 +0100 Subject: [PATCH 41/45] Update src/python/ensembl/io/genomio/assembly/status.py py3.8 dictionary dict -> Dict declaration Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 102218649..ad8e0e784 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -336,7 +336,7 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re def generate_report_tsv( - parsed_asm_reports: dict, + parsed_asm_reports: Dict, outfile_prefix: str, query_type: str, output_directory: PathLike = Path(), From dce58eb617acbc4d6d21f2417f85707d45b99e25 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:51:50 +0100 Subject: [PATCH 42/45] Update src/python/ensembl/io/genomio/assembly/status.py punctuation Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/assembly/status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index ad8e0e784..1e2589535 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -341,7 +341,7 @@ def generate_report_tsv( query_type: str, output_directory: PathLike = Path(), ) -> None: - """Generate and write the assembly report to a TSV file + """Generate and write the assembly report to a TSV file. Args: parsed_asm_reports: Parsed assembly report meta From 8f8b8f28169c4d68468e2ebc113832d0b139761c Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 10 Apr 2024 11:53:51 +0100 Subject: [PATCH 43/45] edits to declaration and doc string --- src/python/ensembl/io/genomio/assembly/status.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 1e2589535..a4fd3baa4 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -32,7 +32,7 @@ from pathlib import Path import re import sys -from typing import Dict, Tuple, Union +from typing import Dict, Tuple, Union, List from spython.main import Client from sqlalchemy.engine import URL @@ -174,7 +174,7 @@ def resolve_query_type( return query_accessions, query_type -def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Dict: +def fetch_accessions_from_cores(database_names: List, connection_url: URL) -> Dict: """Obtain the associated INSDC accession [meta.assembly.accession] given a set of core(s) names and a MYSQL server host. @@ -215,8 +215,7 @@ def fetch_accessions_from_cores(database_names: list, connection_url: URL) -> Di def datasets_asm_reports( sif_image: str, assembly_accessions: dict, download_directory: PathLike, batch_size: int ) -> Dict: - """Obtain multiple assembly report JSONs in one or more querys to datasets, - i.e. make individual since accn query to datasets tool. + """Obtain assembly report(s) JSONs for one or more queries made to datasets CLI. Args: sif_image: Instance of Client.loaded singularity image. From be90eed00fe14b9aa6cb841a54537143f9e893f8 Mon Sep 17 00:00:00 2001 From: "J. Alvarez-Jarreta" Date: Wed, 10 Apr 2024 11:59:18 +0100 Subject: [PATCH 44/45] Update src/python/ensembl/io/genomio/assembly/status.py --- src/python/ensembl/io/genomio/assembly/status.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index a4fd3baa4..548159081 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -31,8 +31,7 @@ from os import PathLike from pathlib import Path import re -import sys -from typing import Dict, Tuple, Union, List +from typing import Dict, List, Tuple, Union from spython.main import Client from sqlalchemy.engine import URL From af6d3419409b2d49a1e275f9b4b137c984f2f6a8 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell Date: Wed, 10 Apr 2024 14:40:20 +0100 Subject: [PATCH 45/45] Fix to Path declaration and docs string tweaks --- src/python/ensembl/io/genomio/assembly/status.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 548159081..ee913f5e9 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -77,7 +77,7 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie container and define version and location of container. Args: - sif_cache_dir: Path to attempt locating/download SIF container image. + sif_cache_dir: Path to locate existing, or download new SIF container image. datasets_version: URL of singularity container (custom 'datasets' version if desired) Returns: @@ -121,10 +121,10 @@ def check_parameterization(input_cores: Path, input_accessions: Path, db_host: s incorrect parameterization. Args: - input_cores: input core(s) list file name. - input_accessions: input accession (s) list file name. - db_host: host server name - db_port: host server port + input_cores: Input core(s) list file name. + input_accessions: Input accession (s) list file name. + db_host: Host server name + db_port: Host server port Returns: User input file used in assembly status querying @@ -149,8 +149,8 @@ def resolve_query_type( Args: query_list: List of user defined queries either core names, or accessions partial_url: A partial MYSQL connection URL (host:port) - input_cores: arg parse param '--input_cores' - input_accessions: arg parse param '--input_accns' + input_cores: Arg parse param '--input_cores' + input_accessions: Arg parse param '--input_accessions' Returns: User queries stored as identifier[(core db name | UniqueID#)] : accession @@ -258,7 +258,7 @@ def datasets_asm_reports( batch_reports_json = tmp_asm_dict["reports"] for assembly_report in batch_reports_json: accession = assembly_report["accession"] - asm_json_outfile = Path(download_directory, f"/{accession}.asm_report.json") + asm_json_outfile = Path(download_directory, f"{accession}.asm_report.json") print_json(asm_json_outfile, assembly_report) # Save assembly report into master core<>report dict for core, accession_core in assembly_accessions.items():