From 970dfdb49da86e0c4b931ec337c8c08e45d6b1ab Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Sun, 5 May 2024 05:06:29 -0700 Subject: [PATCH] partially remove loading_bar (unused) --- goatools/associations.py | 147 +++++++++------ goatools/base.py | 54 +++--- goatools/cli/compare_gos.py | 252 ++++++++++++++++---------- goatools/cli/wr_sections.py | 74 ++++---- goatools/test_data/nature3102_goea.py | 14 +- 5 files changed, 324 insertions(+), 217 deletions(-) diff --git a/goatools/associations.py b/goatools/associations.py index fcf1a3db..795b30a8 100755 --- a/goatools/associations.py +++ b/goatools/associations.py @@ -5,21 +5,23 @@ __copyright__ = "Copyright (C) 2010-present, H Tang et al. All rights reserved." __author__ = "various" -from collections import defaultdict +import gzip import os import sys -from goatools.base import dnld_file -from goatools.base import ftp_get -from goatools.anno.factory import get_objanno -from goatools.anno.factory import get_anno_desc -from goatools.anno.factory import get_objanno_g_kws -from goatools.semantic import TermCounts -from goatools.anno.gaf_reader import GafReader -from goatools.anno.genetogo_reader import Gene2GoReader -from goatools.anno.opts import AnnoOptions -from goatools.utils import get_b2aset as utils_get_b2aset - -def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): + +from collections import defaultdict + +from .anno.factory import get_anno_desc, get_objanno, get_objanno_g_kws +from .anno.gaf_reader import GafReader +from .anno.genetogo_reader import Gene2GoReader +from .anno.opts import AnnoOptions +from .semantic import TermCounts +from .utils import get_b2aset as utils_get_b2aset + +from .base import dnld_file, ftp_get + + +def dnld_assc(assc_name, go2obj=None, namespace="BP", prt=sys.stdout): """Download association from http://geneontology.org/gene-associations.""" # Example assc_name: "tair.gaf" # Download the Association @@ -39,113 +41,144 @@ def dnld_assc(assc_name, go2obj=None, namespace='BP', prt=sys.stdout): assc[gene] = goids_cur.intersection(goids_dag) return assc + def dnld_annotation(assc_file, prt=sys.stdout): """Download gaf, gpad, or gpi from http://current.geneontology.org/annotations/""" if not os.path.isfile(assc_file): - # assc_http = "http://geneontology.org/gene-associations/" assc_http = "http://current.geneontology.org/annotations/" _, assc_base = os.path.split(assc_file) src = os.path.join(assc_http, "{ASSC}.gz".format(ASSC=assc_base)) - dnld_file(src, assc_file, prt, loading_bar=None) + dnld_file(src, assc_file, prt) -def read_associations(assoc_fn, anno_type='id2gos', namespace='BP', **kws): + +def read_associations(assoc_fn, anno_type="id2gos", namespace="BP", **kws): """Return associatinos in id2gos format""" # kws get_objanno: taxids hdr_only prt allow_missing_symbol obj = get_objanno(assoc_fn, anno_type, **kws) # kws get_id2gos: ev_include ev_exclude keep_ND keep_NOT b_geneid2gos go2geneids return obj.get_id2gos(namespace, **kws) -def get_assoc_ncbi_taxids(taxids, force_dnld=False, loading_bar=True, **kws): + +def get_assoc_ncbi_taxids(taxids, force_dnld=False, **kws): """Download NCBI's gene2go. Return annotations for user-specified taxid(s).""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) - fin = kws['gene2go'] if 'gene2go' in kws else os.path.join(os.getcwd(), "gene2go") - dnld_ncbi_gene_file(fin, force_dnld, loading_bar=loading_bar) + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) + fin = kws["gene2go"] if "gene2go" in kws else os.path.join(os.getcwd(), "gene2go") + dnld_ncbi_gene_file(fin, force_dnld) return read_ncbi_gene2go(fin, taxids, **kws) + # pylint: disable=unused-argument -def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout, loading_bar=True): +def dnld_ncbi_gene_file(fin, force_dnld=False, log=sys.stdout): """Download a file from NCBI Gene's ftp server.""" if not os.path.exists(fin) or force_dnld: - import gzip fin_dir, fin_base = os.path.split(fin) fin_gz = "{F}.gz".format(F=fin_base) fin_gz = os.path.join(fin_dir, fin_gz) if os.path.exists(fin_gz): os.remove(fin_gz) fin_ftp = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{F}.gz".format(F=fin_base) - ## if log is not None: - ## log.write(" DOWNLOADING GZIP: {GZ}\n".format(GZ=fin_ftp)) - ## if loading_bar: - ## loading_bar = wget.bar_adaptive - ## wget.download(fin_ftp, bar=loading_bar) - ## rsp = wget(fin_ftp) ftp_get(fin_ftp, fin_gz) - with gzip.open(fin_gz, 'rb') as zstrm: + with gzip.open(fin_gz, "rb") as zstrm: if log is not None: log.write("\n READ GZIP: {F}\n".format(F=fin_gz)) - with open(fin, 'wb') as ostrm: + with open(fin, "wb") as ostrm: ostrm.write(zstrm.read()) if log is not None: log.write(" WROTE UNZIPPED: {F}\n".format(F=fin)) + def dnld_annofile(fin_anno, anno_type): """Download annotation file, if needed""" if os.path.exists(fin_anno): return anno_type = get_anno_desc(fin_anno, anno_type) - if anno_type == 'gene2go': + if anno_type == "gene2go": dnld_ncbi_gene_file(fin_anno) - if anno_type in {'gaf', 'gpad'}: + if anno_type in {"gaf", "gpad"}: dnld_annotation(fin_anno) -def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace='BP', **kws): + +def read_ncbi_gene2go(fin_gene2go, taxids=None, namespace="BP", **kws): """Read NCBI's gene2go. Return gene2go data for user-specified taxids.""" - print('DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader') + print( + "DEPRECATED read_ncbi_gene2go: USE Gene2GoReader FROM goatools.anno.genetogo_reader" + ) # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}'.format( - PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED read_ncbi_gene2go CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) obj = Gene2GoReader(fin_gene2go, taxids=taxids) # By default, return id2gos. User can cause go2geneids to be returned by: # >>> read_ncbi_gene2go(..., go2geneids=True - if 'taxid2asscs' not in kws: + if "taxid2asscs" not in kws: if len(obj.taxid2asscs) == 1: taxid = next(iter(obj.taxid2asscs)) - kws_ncbi = {k:v for k, v in kws.items() if k in AnnoOptions.keys_exp} - kws_ncbi['taxid'] = taxid + kws_ncbi = {k: v for k, v in kws.items() if k in AnnoOptions.keys_exp} + kws_ncbi["taxid"] = taxid return obj.get_id2gos(namespace, **kws_ncbi) # Optional detailed associations split by taxid and having both ID2GOs & GO2IDs # e.g., taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)) t2asscs_ret = obj.get_taxid2asscs(taxids, **kws) - t2asscs_usr = kws.get('taxid2asscs', defaultdict(lambda: defaultdict(lambda: defaultdict(set)))) - if 'taxid2asscs' in kws: + t2asscs_usr = kws.get( + "taxid2asscs", defaultdict(lambda: defaultdict(lambda: defaultdict(set))) + ) + if "taxid2asscs" in kws: obj.fill_taxid2asscs(t2asscs_usr, t2asscs_ret) return obj.get_id2gos_all(t2asscs_ret) + def get_gaf_hdr(fin_gaf): """Read Gene Association File (GAF). Return GAF version and data info.""" return GafReader(fin_gaf, hdr_only=True).hdr + # pylint: disable=line-too-long -def read_gaf(fin_gaf, prt=sys.stdout, hdr_only=False, namespace='BP', allow_missing_symbol=False, **kws): +def read_gaf( + fin_gaf, + prt=sys.stdout, + hdr_only=False, + namespace="BP", + allow_missing_symbol=False, + **kws +): """Read Gene Association File (GAF). Return data.""" return GafReader( - fin_gaf, hdr_only=hdr_only, prt=prt, allow_missing_symbol=allow_missing_symbol, godag=kws.get('godag')).get_id2gos( - namespace, **kws) + fin_gaf, + hdr_only=hdr_only, + prt=prt, + allow_missing_symbol=allow_missing_symbol, + godag=kws.get("godag"), + ).get_id2gos(namespace, **kws) + def get_b2aset(a2bset): """Given gene2gos, return go2genes. Given go2genes, return gene2gos.""" - print('DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils') + print("DEPRECATED get_b2aset MOVED: USE get_b2aset IN goatools.utils") # pylint: disable=protected-access frm = sys._getframe().f_back.f_code - print('DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}'.format(PY=frm.co_filename, FNC=frm.co_name)) + print( + "DEPRECATED get_b2aset CALLED FROM: {PY} BY {FNC}".format( + PY=frm.co_filename, FNC=frm.co_name + ) + ) return utils_get_b2aset(a2bset) -def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout): + +def get_assc_pruned( + assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys.stdout +): """Remove GO IDs associated with large numbers of genes. Used in stochastic simulations.""" # DEFN WAS: get_assc_pruned(assc_geneid2gos, max_genecnt=None, prt=sys.stdout): # ADDED min_genecnt argument and functionality @@ -156,22 +189,27 @@ def get_assc_pruned(assc_geneid2gos, min_genecnt=None, max_genecnt=None, prt=sys go2genes_prun = {} for goid, genes in go2genes_orig.items(): num_genes = len(genes) - if (min_genecnt is None or num_genes >= min_genecnt) and \ - (max_genecnt is None or num_genes <= max_genecnt): + if (min_genecnt is None or num_genes >= min_genecnt) and ( + max_genecnt is None or num_genes <= max_genecnt + ): go2genes_prun[goid] = genes num_was = len(go2genes_orig) num_now = len(go2genes_prun) gos_rm = set(go2genes_orig.keys()).difference(set(go2genes_prun.keys())) - assert num_was-num_now == len(gos_rm) + assert num_was - num_now == len(gos_rm) if prt is not None: if min_genecnt is None: min_genecnt = 1 if max_genecnt is None: max_genecnt = "Max" - prt.write("{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( - m=min_genecnt, M=max_genecnt, N=num_was-num_now, NOW=num_now)) + prt.write( + "{N:4} GO IDs pruned. Kept {NOW} GOs assc w/({m} to {M} genes)\n".format( + m=min_genecnt, M=max_genecnt, N=num_was - num_now, NOW=num_now + ) + ) return utils_get_b2aset(go2genes_prun), gos_rm + def read_annotations(**kws): """Read annotations from either a GAF file or NCBI's gene2go file.""" # Read and save annotation lines @@ -179,6 +217,7 @@ def read_annotations(**kws): # Return associations return objanno.get_id2gos(**kws) if objanno is not None else {} + def get_tcntobj(go2obj, **kws): """Return a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gpad gaf gene2go id2gos diff --git a/goatools/base.py b/goatools/base.py index b7749054..4a410607 100644 --- a/goatools/base.py +++ b/goatools/base.py @@ -21,14 +21,16 @@ def get_logger(name: str): - """Return a logger with a default ColoredFormatter.""" - logger = logging.getLogger(name) - if logger.hasHandlers(): - logger.handlers.clear() - logger.addHandler(RichHandler()) - logger.propagate = False - logger.setLevel(logging.INFO) - return logger + """ + Return a logger with a default ColoredFormatter. + """ + log = logging.getLogger(name) + if log.hasHandlers(): + log.handlers.clear() + log.addHandler(RichHandler()) + log.propagate = False + log.setLevel(logging.INFO) + return log logger = get_logger("goatools") @@ -70,7 +72,6 @@ def nopen(f, mode="r"): stderr=sys.stderr if mode == "r" else PIPE, shell=True, bufsize=-1, # use system default for buffering - preexec_fn=prefunc, close_fds=False, executable=os.environ.get("SHELL"), ) @@ -79,8 +80,6 @@ def nopen(f, mode="r"): if mode != "r": p.stderr = io.TextIOWrapper(p.stderr) - if mode and mode[0] == "r": - return process_iter(p, f[1:]) return p if f.startswith(("http://", "https://", "ftp://")): @@ -96,7 +95,11 @@ def nopen(f, mode="r"): fh = bz2.BZ2File(f, mode) return io.TextIOWrapper(fh) - return {"r": sys.stdin, "w": sys.stdout}[mode[0]] if f == "-" else open(f, mode) + return ( + {"r": sys.stdin, "w": sys.stdout}[mode[0]] + if f == "-" + else open(f, mode, encoding="utf-8") + ) def ungzipper(fh, blocksize=16384): @@ -116,27 +119,27 @@ def ungzipper(fh, blocksize=16384): data[0] = save + data[0] -def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout, loading_bar=True): +def download_go_basic_obo(obo="go-basic.obo", prt=sys.stdout): """Download Ontologies, if necessary.""" if not isfile(obo): http = "http://purl.obolibrary.org/obo/go" if "slim" in obo: http = "http://www.geneontology.org/ontology/subsets" obo_remote = f"{http}/{op.basename(obo)}" - dnld_file(obo_remote, obo, prt, loading_bar) + dnld_file(obo_remote, obo, prt) else: if prt: prt.write(" EXISTS: {FILE}\n".format(FILE=obo)) return obo -def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout, loading_bar=True): +def download_ncbi_associations(gene2go="gene2go", prt=sys.stdout): """Download associations from NCBI, if necessary""" # Download: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz gzip_file = "{GENE2GO}.gz".format(GENE2GO=gene2go) if not isfile(gene2go): file_remote = f"ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/{op.basename(gzip_file)}" - dnld_file(file_remote, gene2go, prt, loading_bar) + dnld_file(file_remote, gene2go, prt) else: if prt is not None: prt.write(" EXISTS: {FILE}\n".format(FILE=gene2go)) @@ -151,22 +154,20 @@ def gunzip(gzip_file, file_gunzip=None): return file_gunzip -def get_godag( - fin_obo="go-basic.obo", prt=sys.stdout, loading_bar=True, optional_attrs=None -): +def get_godag(fin_obo="go-basic.obo", prt=sys.stdout, optional_attrs=None): """Return GODag object. Initialize, if necessary.""" from .obo_parser import GODag - download_go_basic_obo(fin_obo, prt, loading_bar) + download_go_basic_obo(fin_obo, prt) return GODag(fin_obo, optional_attrs, load_obsolete=False, prt=prt) -def dnld_gaf(species_txt, prt=sys.stdout, loading_bar=True): +def dnld_gaf(species_txt, prt=sys.stdout): """Download GAF file if necessary.""" - return dnld_gafs([species_txt], prt, loading_bar)[0] + return dnld_gafs([species_txt], prt)[0] -def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): +def dnld_gafs(species_list, prt=sys.stdout): """Download GAF files if necessary.""" # Example GAF files in http://current.geneontology.org/annotations/: # http://current.geneontology.org/annotations/mgi.gaf.gz @@ -180,7 +181,7 @@ def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): gaf_base = "{ABC}.gaf".format(ABC=species_txt) # goa_human.gaf gaf_cwd = os.path.join(cwd, gaf_base) # {CWD}/goa_human.gaf remove_filename = "{HTTP}/{GAF}.gz".format(HTTP=http, GAF=gaf_base) - dnld_file(remove_filename, gaf_cwd, prt, loading_bar) + dnld_file(remove_filename, gaf_cwd, prt) fin_gafs.append(gaf_cwd) return fin_gafs @@ -188,7 +189,7 @@ def dnld_gafs(species_list, prt=sys.stdout, loading_bar=True): def http_get(url, fout=None): """Download a file from http. Save it in a file named by fout""" print("requests.get({URL}, stream=True)".format(URL=url)) - rsp = requests.get(url, stream=True) + rsp = requests.get(url, stream=True, timeout=10) if rsp.status_code == 200 and fout is not None: with open(fout, "wb") as prt: for chunk in rsp: # .iter_content(chunk_size=128): @@ -221,7 +222,7 @@ def ftp_get(fin_src, fout): ftp.quit() -def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): +def dnld_file(src_ftp, dst_file, prt=sys.stdout): """Download specified file if necessary.""" if isfile(dst_file): return @@ -231,7 +232,6 @@ def dnld_file(src_ftp, dst_file, prt=sys.stdout, loading_bar=True): cmd_msg = "get({SRC} out={DST})\n".format(SRC=src_ftp, DST=dst_gz) try: print("$ get {SRC}".format(SRC=src_ftp)) - #### wget.download(src_ftp, out=dst_gz, bar=loading_bar) if src_ftp[:4] == "http": http_get(src_ftp, dst_gz) else: diff --git a/goatools/cli/compare_gos.py b/goatools/cli/compare_gos.py index c8df612c..46b86dc1 100644 --- a/goatools/cli/compare_gos.py +++ b/goatools/cli/compare_gos.py @@ -24,81 +24,109 @@ from __future__ import print_function -__copyright__ = "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +__copyright__ = ( + "Copyright (C) 2016-present, DV Klopfenstein, H Tang. All rights reserved." +) __author__ = "DV Klopfenstein" import os import sys -from collections import namedtuple -# from collections import OrderedDict -from goatools.base import get_godag -from goatools.associations import get_tcntobj -from goatools.godag.relationship_str import RelationshipStr +from collections import namedtuple -from goatools.cli.docopt_parse import DocOptParse -from goatools.cli.gos_get import GetGOs -from goatools.cli.grouped import Grouped +from ..associations import get_tcntobj +from ..base import get_godag, logger +from ..godag.relationship_str import RelationshipStr +from ..gosubdag.gosubdag import GoSubDag +from ..gosubdag.rpt.wr_xlsx import GoDepth1LettersWr +from ..grouper.sorter import Sorter +from ..grouper.wrxlsx import WrXlsxSortedGos -from goatools.gosubdag.gosubdag import GoSubDag -from goatools.gosubdag.rpt.wr_xlsx import GoDepth1LettersWr -from goatools.grouper.sorter import Sorter -from goatools.grouper.wrxlsx import WrXlsxSortedGos +from .docopt_parse import DocOptParse +from .gos_get import GetGOs +from .grouped import Grouped # pylint: disable=too-few-public-methods class CompareGOsCli: """Class for command-line interface for creating GO term diagrams""" - kws_dict = set(['GO_FILE', - 'sections', 'S', - 'obo', 'slims', - 'ofile', 'xlsx', - 'gaf', 'gene2go', 'taxid', - ]) - kws_set = set(['verbose']) + kws_dict = set( + [ + "GO_FILE", + "sections", + "S", + "obo", + "slims", + "ofile", + "xlsx", + "gaf", + "gene2go", + "taxid", + ] + ) + kws_set = set(["verbose"]) # Print fields to exclude, unless verbose is used - excl_flds = {'level', 'reldepth', 'alt', 'D1', 'childcnt', - 'format_txt', 'num_usrgos', 'is_hdrgo', 'is_usrgo', 'hdr_idx', 'hdr1usr01', - 'REL', 'REL_short', 'rel', 'id'} + excl_flds = { + "level", + "reldepth", + "alt", + "D1", + "childcnt", + "format_txt", + "num_usrgos", + "is_hdrgo", + "is_usrgo", + "hdr_idx", + "hdr1usr01", + "REL", + "REL_short", + "rel", + "id", + } def __init__(self, **kws): _objdoc = DocOptParse(__doc__, self.kws_dict, self.kws_set) self.kws = _objdoc.get_docargs(prt=None) if not kws else kws - self.godag = get_godag(self.kws.get('obo'), prt=sys.stdout, - loading_bar=False, optional_attrs=['relationship']) + self.godag = get_godag( + self.kws.get("obo"), prt=sys.stdout, optional_attrs=["relationship"] + ) _ini = _Init(self.godag) - self.go_ntsets = _ini.get_go_ntsets(self.kws.get('GO_FILE')) + self.go_ntsets = _ini.get_go_ntsets(self.kws.get("GO_FILE")) self.go_all = set.union(*[nt.go_set for nt in self.go_ntsets]) _tcntobj = _ini.get_tcntobj(self.go_all, **self.kws) # Gets TermCounts or None - self.gosubdag = GoSubDag(self.go_all, self.godag, True, tcntobj=_tcntobj, prt=sys.stdout) - self.objgrpd = _ini.get_grouped(self.go_ntsets, self.go_all, self.gosubdag, **self.kws) + self.gosubdag = GoSubDag( + self.go_all, self.godag, True, tcntobj=_tcntobj, prt=sys.stdout + ) + self.objgrpd = _ini.get_grouped( + self.go_ntsets, self.go_all, self.gosubdag, **self.kws + ) # KWS: sortby hdrgo_sortby section_sortby def write(self, fout_xlsx=None, fout_txt=None, verbose=False): """Command-line interface for go_draw script.""" - sortby = self._get_fncsortnt(self.objgrpd.grprobj.gosubdag.prt_attr['flds']) - kws_sort = {'sortby' if verbose else 'section_sortby': sortby} + sortby = self._get_fncsortnt(self.objgrpd.grprobj.gosubdag.prt_attr["flds"]) + kws_sort = {"sortby" if verbose else "section_sortby": sortby} sortobj = Sorter(self.objgrpd.grprobj, **kws_sort) # KWS: hdrgo_prt=True section_prt=None top_n=None use_sections=True # RET: {sortobj, sections, hdrgo_prt} or {sortobj flat hdrgo_prt} desc2nts = sortobj.get_desc2nts_fnc( - hdrgo_prt=verbose, - section_prt=True, - top_n=None, - use_sections=True) + hdrgo_prt=verbose, section_prt=True, top_n=None, use_sections=True + ) # print('FFFF', desc2nts['flds']) # Write user GO IDs in sections objgowr = WrXlsxSortedGos("init", sortobj, self.objgrpd.ver_list) if fout_xlsx is not None: - kws_xlsx = {'shade_hdrgos':verbose} + kws_xlsx = {"shade_hdrgos": verbose} if not verbose: - kws_xlsx['prt_flds'] = [f for f in desc2nts['flds'] if f not in self.excl_flds] + kws_xlsx["prt_flds"] = [ + f for f in desc2nts["flds"] if f not in self.excl_flds + ] self._adj_hdrs(kws_xlsx, desc2nts) objgowr.wr_xlsx_nts(fout_xlsx, desc2nts, **kws_xlsx) - fout_desc = '{BASE}_desc.txt'.format(BASE=os.path.splitext(fout_xlsx)[0]) + fout_desc = "{BASE}_desc.txt".format(BASE=os.path.splitext(fout_xlsx)[0]) self._wr_ver_n_key(fout_desc, verbose) if fout_txt is not None: self._wr_txt_nts(fout_txt, desc2nts, objgowr, verbose) @@ -108,98 +136,121 @@ def write(self, fout_xlsx=None, fout_txt=None, verbose=False): summary_dct = objgowr.prt_txt_desc2nts(sys.stdout, desc2nts, prtfmt) self._prt_ver_n_key(sys.stdout, verbose) if summary_dct: - print("\n{N} GO IDs in {S} sections".format( - N=desc2nts['num_items'], S=desc2nts['num_sections'])) + print( + "\n{N} GO IDs in {S} sections".format( + N=desc2nts["num_items"], S=desc2nts["num_sections"] + ) + ) def _adj_hdrs(self, kws_xlsx, desc2nts): """Replace xlsx column header, fileN, with base input filenames""" filehdrs = [nt.hdr for nt in self.go_ntsets] num_files = len(filehdrs) if num_files == len(set(filehdrs)): - kws_xlsx['hdrs'] = filehdrs + list(desc2nts['flds'][num_files:]) + kws_xlsx["hdrs"] = filehdrs + list(desc2nts["flds"][num_files:]) def _get_prtfmt(self, objgowr, verbose): """Get print format containing markers.""" - prtfmt = objgowr.get_prtfmt('fmt') - prtfmt = prtfmt.replace('# ', '') + prtfmt = objgowr.get_prtfmt("fmt") + prtfmt = prtfmt.replace("# ", "") if not verbose: - prtfmt = prtfmt.replace('{hdr1usr01:2}', '') - prtfmt = prtfmt.replace('{childcnt:3} L{level:02} ', '') - prtfmt = prtfmt.replace('{num_usrgos:>4} uGOs ', '') - prtfmt = prtfmt.replace('{D1:5} {REL} {rel}', '') - prtfmt = prtfmt.replace('R{reldepth:02} ', '') - marks = ''.join(['{{{}}}'.format(nt.fileN) for nt in self.go_ntsets]) - return '{MARKS} {PRTFMT}'.format(MARKS=marks, PRTFMT=prtfmt) + prtfmt = prtfmt.replace("{hdr1usr01:2}", "") + prtfmt = prtfmt.replace("{childcnt:3} L{level:02} ", "") + prtfmt = prtfmt.replace("{num_usrgos:>4} uGOs ", "") + prtfmt = prtfmt.replace("{D1:5} {REL} {rel}", "") + prtfmt = prtfmt.replace("R{reldepth:02} ", "") + marks = "".join(["{{{}}}".format(nt.fileN) for nt in self.go_ntsets]) + return "{MARKS} {PRTFMT}".format(MARKS=marks, PRTFMT=prtfmt) @staticmethod def _get_fncsortnt(flds): """Return a sort function for sorting header GO IDs found in sections.""" - if 'tinfo' in flds: - return lambda ntgo: [ntgo.NS, -1*ntgo.tinfo, ntgo.depth, ntgo.alt] - if 'dcnt' in flds: - return lambda ntgo: [ntgo.NS, -1*ntgo.dcnt, ntgo.depth, ntgo.alt] - return lambda ntgo: [ntgo.NS, -1*ntgo.depth, ntgo.alt] + if "tinfo" in flds: + return lambda ntgo: [ntgo.NS, -1 * ntgo.tinfo, ntgo.depth, ntgo.alt] + if "dcnt" in flds: + return lambda ntgo: [ntgo.NS, -1 * ntgo.dcnt, ntgo.depth, ntgo.alt] + return lambda ntgo: [ntgo.NS, -1 * ntgo.depth, ntgo.alt] def _wr_txt_nts(self, fout_txt, desc2nts, objgowr, verbose): """Write grouped and sorted GO IDs to GOs.""" - with open(fout_txt, 'w') as prt: + with open(fout_txt, "w", encoding="utf-8") as prt: self._prt_ver_n_key(prt, verbose) - prt.write('\n\n') - prt.write('# ----------------------------------------------------------------\n') - prt.write('# - Sections and GO IDs\n') - prt.write('# ----------------------------------------------------------------\n') + prt.write("\n\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write("# - Sections and GO IDs\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) prtfmt = self._get_prtfmt(objgowr, verbose) summary_dct = objgowr.prt_txt_desc2nts(prt, desc2nts, prtfmt) if summary_dct: - print(" {N:>5} GO IDs WROTE: {FOUT} ({S} sections)".format( - N=desc2nts['num_items'], FOUT=fout_txt, S=desc2nts['num_sections'])) + print( + " {N:>5} GO IDs WROTE: {FOUT} ({S} sections)".format( + N=desc2nts["num_items"], + FOUT=fout_txt, + S=desc2nts["num_sections"], + ) + ) else: print(" WROTE: {TXT}".format(TXT=fout_txt)) def _wr_ver_n_key(self, fout_txt, verbose): """Write GO DAG version and key indicating presence of GO ID in a list.""" - with open(fout_txt, 'w') as prt: + with open(fout_txt, "w", encoding="utf-8") as prt: self._prt_ver_n_key(prt, verbose) - print(' WROTE: {TXT}'.format(TXT=fout_txt)) - + print(" WROTE: {TXT}".format(TXT=fout_txt)) def _prt_ver_n_key(self, prt, verbose): """Print GO DAG version and key indicating presence of GO ID in a list.""" - pre = '# ' - prt.write('# ----------------------------------------------------------------\n') - prt.write('# - Description of GO ID fields\n') - prt.write('# ----------------------------------------------------------------\n') - prt.write("# Versions:\n# {VER}\n".format(VER="\n# ".join(self.objgrpd.ver_list))) - prt.write('\n# Marker keys:\n') + pre = "# " + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write("# - Description of GO ID fields\n") + prt.write( + "# ----------------------------------------------------------------\n" + ) + prt.write( + "# Versions:\n# {VER}\n".format( + VER="\n# ".join(self.objgrpd.ver_list) + ) + ) + prt.write("\n# Marker keys:\n") for ntgos in self.go_ntsets: - prt.write('# X -> GO is present in {HDR}\n'.format(HDR=ntgos.hdr)) + prt.write("# X -> GO is present in {HDR}\n".format(HDR=ntgos.hdr)) if verbose: - prt.write('\n# Markers for header GO IDs and user GO IDs:\n') + prt.write("\n# Markers for header GO IDs and user GO IDs:\n") prt.write("# '**' -> GO term is both a header and a user GO ID\n") prt.write("# '* ' -> GO term is a header, but not a user GO ID\n") prt.write("# ' ' -> GO term is a user GO ID\n") - prt.write('\n# GO Namspaces:\n') - prt.write('# BP -> Biological Process\n') - prt.write('# MF -> Molecular Function\n') - prt.write('# CC -> Cellular Component\n') + prt.write("\n# GO Namspaces:\n") + prt.write("# BP -> Biological Process\n") + prt.write("# MF -> Molecular Function\n") + prt.write("# CC -> Cellular Component\n") if verbose: - prt.write('\n# Example fields: 5 uGOs 362 47 L04 D04 R04\n') - prt.write('# N uGOs -> number of user GO IDs under this GO header\n') - prt.write('# First integer -> number of GO descendants\n') - prt.write('# Second integer -> number of GO children for the current GO ID\n') - prt.write('\n# Depth information:\n') + prt.write("\n# Example fields: 5 uGOs 362 47 L04 D04 R04\n") + prt.write( + "# N uGOs -> number of user GO IDs under this GO header\n" + ) + prt.write("# First integer -> number of GO descendants\n") + prt.write( + "# Second integer -> number of GO children for the current GO ID\n" + ) + prt.write("\n# Depth information:\n") if not verbose: - prt.write('# int -> number of GO descendants\n') + prt.write("# int -> number of GO descendants\n") if verbose: - prt.write('# Lnn -> level (minimum distance from root to node)\n') - prt.write('# Dnn -> depth (maximum distance from root to node)\n') + prt.write("# Lnn -> level (minimum distance from root to node)\n") + prt.write("# Dnn -> depth (maximum distance from root to node)\n") if verbose: - prt.write('# Rnn -> depth accounting for relationships\n\n') + prt.write("# Rnn -> depth accounting for relationships\n\n") RelationshipStr().prt_keys(prt, pre) if verbose: - prt.write('\n') + prt.write("\n") objd1 = GoDepth1LettersWr(self.gosubdag.rcntobj) - objd1.prt_header(prt, 'DEPTH-01 GO terms and their aliases', pre) + objd1.prt_header(prt, "DEPTH-01 GO terms and their aliases", pre) objd1.prt_txt(prt, pre) @@ -212,7 +263,7 @@ def __init__(self, godag): def get_tcntobj(self, go_all, **kws): """Get a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gaf (gene2go taxid) - if 'gaf' in kws or 'gene2go' in kws: + if "gaf" in kws or "gene2go" in kws: # Get a reduced go2obj set for TermCounts _gosubdag = GoSubDag(go_all, self.godag, rcntobj=False, prt=None) return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts @@ -220,8 +271,8 @@ def get_tcntobj(self, go_all, **kws): def get_grouped(self, go_ntsets, go_all, gosubdag, **kws): """Get Grouped object.""" - kws_grpd = {k:v for k, v in kws.items() if k in Grouped.kws_dict} - kws_grpd['go2nt'] = self._init_go2ntpresent(go_ntsets, go_all, gosubdag) + kws_grpd = {k: v for k, v in kws.items() if k in Grouped.kws_dict} + kws_grpd["go2nt"] = self._init_go2ntpresent(go_ntsets, go_all, gosubdag) return Grouped(gosubdag, self.godag.version, **kws_grpd) @staticmethod @@ -229,16 +280,16 @@ def _init_go2ntpresent(go_ntsets, go_all, gosubdag): """Mark all GO IDs with an X if present in the user GO list.""" go2ntpresent = {} flds = " ".join(nt.fileN for nt in go_ntsets) - ntobj = namedtuple('NtPresent', flds) + ntobj = namedtuple("NtPresent", flds) # Get present marks for GO sources for goid_all in go_all: present_true = [goid_all in nt.go_set for nt in go_ntsets] - present_str = ['X' if tf else '.' for tf in present_true] + present_str = ["X" if tf else "." for tf in present_true] go2ntpresent[goid_all] = ntobj._make(present_str) # Get present marks for all other GO ancestors goids_ancestors = set(gosubdag.go2obj).difference(go2ntpresent) assert not goids_ancestors.intersection(go_all) - strmark = ['.' for _ in range(len(go_ntsets))] + strmark = ["." for _ in range(len(go_ntsets))] for goid in goids_ancestors: go2ntpresent[goid] = ntobj._make(strmark) return go2ntpresent @@ -247,7 +298,7 @@ def get_go_ntsets(self, go_fins): """For each file containing GOs, extract GO IDs, store filename and header.""" nts = [] go_fins = list(go_fins) - ntobj = namedtuple('NtGOFiles', 'fileN hdr go_set go_fin') + ntobj = namedtuple("NtGOFiles", "fileN hdr go_set go_fin") go_sets = self._init_go_sets(go_fins) hdrs = [os.path.splitext(os.path.basename(f))[0] for f in go_fins] assert len(go_fins) == len(go_sets) @@ -256,10 +307,14 @@ def get_go_ntsets(self, go_fins): for idx, (hdr, go_set, go_fin) in enumerate(zip(hdrs, go_sets, go_fins), 1): goids.update(go_set) if not go_set: - print('**WARNING: NO GO IDs FOUND IN {FIN}'.format(FIN=go_fin)) - nts.append(ntobj(fileN='file{I}'.format(I=idx), hdr=hdr, go_set=go_set, go_fin=go_fin)) + logger.warning("NO GO IDs FOUND IN %s", go_fin) + nts.append( + ntobj( + fileN="file{I}".format(I=idx), hdr=hdr, go_set=go_set, go_fin=go_fin + ) + ) if not goids: - print('**WARNING: NO GO IDs FOUND') + logger.warning("NO GO IDs FOUND") sys.exit(1) return nts @@ -268,7 +323,8 @@ def _init_go_sets(self, go_fins): go_sets = [] assert go_fins, "EXPECTED FILES CONTAINING GO IDs" assert len(go_fins) >= 2, "EXPECTED 2+ GO LISTS. FOUND: {L}".format( - L=' '.join(go_fins)) + L=" ".join(go_fins) + ) obj = GetGOs(self.godag) for fin in go_fins: assert os.path.exists(fin), "GO FILE({F}) DOES NOT EXIST".format(F=fin) diff --git a/goatools/cli/wr_sections.py b/goatools/cli/wr_sections.py index ab65dd03..6dc9f50c 100644 --- a/goatools/cli/wr_sections.py +++ b/goatools/cli/wr_sections.py @@ -31,32 +31,40 @@ import os import sys -from goatools.base import get_godag -from goatools.associations import get_tcntobj +from ..associations import get_tcntobj +from ..base import get_godag +from ..gosubdag.gosubdag import GoSubDag +from ..grouper.grprdflts import GrouperDflts +from ..grouper.grprobj import Grouper +from ..grouper.hdrgos import HdrgosSections +from ..grouper.read_goids import read_sections +from ..grouper.sorter import Sorter +from ..grouper.wr_sections import WrSectionsPy, WrSectionsTxt +from ..grouper.wrxlsx import WrXlsxSortedGos -from goatools.cli.docopt_parse import DocOptParse -from goatools.cli.gos_get import GetGOs - -from goatools.gosubdag.gosubdag import GoSubDag - -from goatools.grouper.read_goids import read_sections -from goatools.grouper.grprdflts import GrouperDflts -from goatools.grouper.hdrgos import HdrgosSections -from goatools.grouper.grprobj import Grouper -from goatools.grouper.wr_sections import WrSectionsTxt -from goatools.grouper.wr_sections import WrSectionsPy -from goatools.grouper.sorter import Sorter -from goatools.grouper.wrxlsx import WrXlsxSortedGos +from .docopt_parse import DocOptParse +from .gos_get import GetGOs # pylint: disable=too-few-public-methods class WrSectionsCli(object): """Class for command-line interface for creating GO term diagrams""" - kws_dict = set(['GO_FILE', 'obo', 'slims', - 'ifile', 'ofile', 'txt', - 'py', 'xlsx', - 'gaf', 'gene2go', 'taxid']) + kws_dict = set( + [ + "GO_FILE", + "obo", + "slims", + "ifile", + "ofile", + "txt", + "py", + "xlsx", + "gaf", + "gene2go", + "taxid", + ] + ) kws_set = set() def __init__(self, gosubdag=None): @@ -66,30 +74,32 @@ def __init__(self, gosubdag=None): def cli(self, prt=sys.stdout): """Command-line interface for go_draw script.""" kws = self.objdoc.get_docargs(prt=None) - godag = get_godag(kws['obo'], prt=None, loading_bar=False, optional_attrs=['relationship']) - usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get('GO_FILE'), prt) + godag = get_godag(kws["obo"], prt=None, optional_attrs=["relationship"]) + usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get("GO_FILE"), prt) tcntobj = self._get_tcntobj(usrgos, godag, **kws) # Gets TermCounts or None - self.gosubdag = GoSubDag(usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None) - grprdflt = GrouperDflts(self.gosubdag, kws['slims']) + self.gosubdag = GoSubDag( + usrgos, godag, relationships=True, tcntobj=tcntobj, prt=None + ) + grprdflt = GrouperDflts(self.gosubdag, kws["slims"]) ver_list = [godag.version, grprdflt.ver_goslims] prt.write("{VER}\n".format(VER="\n".join(ver_list))) - sections = self._read_sections(kws['ifile']) + sections = self._read_sections(kws["ifile"]) # print("SECSECSEC", sections) hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections) grprobj = Grouper("init", usrgos, hdrobj, self.gosubdag) # Write sections objsecwr = WrSectionsTxt(grprobj, ver_list) - if not os.path.exists(kws['ifile']): - objsecwr.wr_txt_section_hdrgos(kws['ifile']) - objsecwr.wr_txt_section_hdrgos(kws['ofile']) + if not os.path.exists(kws["ifile"]): + objsecwr.wr_txt_section_hdrgos(kws["ifile"]) + objsecwr.wr_txt_section_hdrgos(kws["ofile"]) objsecpy = WrSectionsPy(grprobj, ver_list) - if 'py' in kws: - objsecpy.wr_py_sections(kws['py'], sections, doc=godag.version) + if "py" in kws: + objsecpy.wr_py_sections(kws["py"], sections, doc=godag.version) # Write user GO IDs in sections sortobj = Sorter(grprobj) objgowr = WrXlsxSortedGos("init", sortobj, ver_list) - objgowr.wr_txt_gos(kws['txt'], sortby=objsecpy.fncsortnt) - #objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt) + objgowr.wr_txt_gos(kws["txt"], sortby=objsecpy.fncsortnt) + # objwr.wr_txt_section_hdrgos(kws['ofile'], sortby=objwr.fncsortnt) self._prt_cnt_usrgos(usrgos, sys.stdout) @staticmethod @@ -109,7 +119,7 @@ def _prt_cnt_usrgos(self, usrgos_read, prt): def _get_tcntobj(goids, go2obj, **kws): """Get a TermCounts object if the user provides an annotation file, otherwise None.""" # kws: gaf (gene2go taxid) - if 'gaf' in kws or 'gene2go' in kws: + if "gaf" in kws or "gene2go" in kws: # Get a reduced go2obj set for TermCounts _gosubdag = GoSubDag(goids, go2obj, rcntobj=False, prt=None) return get_tcntobj(_gosubdag.go2obj, **kws) # TermCounts diff --git a/goatools/test_data/nature3102_goea.py b/goatools/test_data/nature3102_goea.py index 4624fbdf..03a43d78 100644 --- a/goatools/test_data/nature3102_goea.py +++ b/goatools/test_data/nature3102_goea.py @@ -7,11 +7,13 @@ import pandas as pd from tests.utils import repofn -from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus -from goatools.base import get_godag -from goatools.associations import dnld_ncbi_gene_file -from goatools.go_enrichment import GOEnrichmentStudy -from goatools.anno.genetogo_reader import Gene2GoReader + +from ..anno.genetogo_reader import Gene2GoReader +from ..associations import dnld_ncbi_gene_file +from ..base import get_godag +from ..go_enrichment import GOEnrichmentStudy + +from .genes_NCBI_10090_ProteinCoding import GENEID2NT as GeneID2nt_mus def get_goea_results(keep_if=None): @@ -55,7 +57,7 @@ def get_geneid2symbol(fin_xlsx): def get_goeaobj(method, geneids_pop, taxid, nspc="BP"): """Load: ontologies, associations, and population geneids.""" fin_obo = os.path.join(os.getcwd(), "go-basic.obo") - godag = get_godag(fin_obo, loading_bar=None) + godag = get_godag(fin_obo) assoc_geneid2gos = get_annotations(taxid, nspc) goeaobj = GOEnrichmentStudy( geneids_pop,