raw-lab
diff --git a/‎CHANGELOG.md
Lines changed: 22 additions & 5 deletions b/‎CHANGELOG.md
Lines changed: 22 additions & 5 deletions
diff --git a/‎bin/metacerberus.py
Lines changed: 203 additions & 190 deletions b/‎bin/metacerberus.py
Lines changed: 203 additions & 190 deletions
diff --git a/‎dist/MetaCerberus-1.3.1.post1-py3-none-any.whl
-7.9 MB b/‎dist/MetaCerberus-1.3.1.post1-py3-none-any.whl
-7.9 MB
diff --git a/‎dist/metacerberus-1.3.1.post1.tar.gz
-7.87 MB b/‎dist/metacerberus-1.3.1.post1.tar.gz
-7.87 MB
diff --git a/‎install_metacerberus-lite.sh
Lines changed: 1 addition & 1 deletion b/‎install_metacerberus-lite.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎install_metacerberus.sh
Lines changed: 4 additions & 4 deletions b/‎install_metacerberus.sh
Lines changed: 4 additions & 4 deletions
diff --git a/‎lib/metacerberus_formatFasta.py
Lines changed: 3 additions & 0 deletions b/‎lib/metacerberus_formatFasta.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎lib/metacerberus_genecall.py
Lines changed: 4 additions & 0 deletions b/‎lib/metacerberus_genecall.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎lib/metacerberus_hmm.py
Lines changed: 7 additions & 1 deletion b/‎lib/metacerberus_hmm.py
Lines changed: 7 additions & 1 deletion
diff --git a/‎lib/metacerberus_mpp.py
Lines changed: 0 additions & 58 deletions b/‎lib/metacerberus_mpp.py
Lines changed: 0 additions & 58 deletions
diff --git a/‎lib/metacerberus_parser.py
Lines changed: 19 additions & 57 deletions b/‎lib/metacerberus_parser.py
Lines changed: 19 additions & 57 deletions
@@ -1,14 +1,31 @@
 # Change log
 
+## Version 1.4.0
+
+### v1.4.0 New Features
+
+- Replaced Ray with HydraMPP
+  - Reduced the number of dependencies, making installation easier
+
+### v1.4.0 Improvements
+
+- Removed a redundant HMM search when using KOFam
+- Organized output files
+
+### v1.4.0 Bug Fixes
+
+- Fixed the resume feature for the HMM step
+- Fixed counting conflict between parser and filter steps
+
 ## Version 1.3.2
 
-### Bug Fixes
+### v1.3.2 Bug Fixes
 
 - Fixed Ray dependency issue for MetaCerberus-lite, or when Ray is not available.
 
 ## Version 1.3.1
 
-### New Features
+### v1.3.1 New Features
 
 - created "lite" version.
   - removed hard dependency requirements, failing more gracefully to make some dependencies optional
@@ -22,7 +39,7 @@
 - Made N removal optional
 - Improved Genbank output
 
-### Bug Fixes
+### v1.3.1 Bug Fixes
 
 - Fixed prodigal-gv GFF
 - Fixed some Phanotate bugs
@@ -32,7 +49,7 @@
 
 ## Version 1.3.0
 
-### New Features
+### v1.3.0 New Features
 
 - Custom download location for databases
 - Ability to download individual databases
@@ -49,7 +66,7 @@
   -Performance improvements
   -Improved some error handling and reporting
 
-### Bug Fixes
+### v1.3.0 Bug Fixes
 
 - Multi-domain in summary files
   -Individual database summary files contain a line per match found
 
@@ -10,7 +10,7 @@ eval "$(conda shell.bash hook)"
 
 # create the metacerberus environment in conda
 mamba create -y -n $ENV_NAME -c conda-forge -c bioconda \
-	python'>=3.8' setuptools"<70.0.0" grpcio=1.43 pyhmmer flash2 \
+	python'>=3.8' setuptools"<70.0.0" hydrampp pyhmmer flash2 \
 	pyrodigal pyrodigal-gv \
 	metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas
 
 
@@ -10,11 +10,11 @@ eval "$(conda shell.bash hook)"
 
 # create the metacerberus environment in conda
 mamba create -y -n $ENV_NAME -c conda-forge -c bioconda \
-	python'>=3.8' setuptools"<70.0.0" grpcio=1.43 \
+	python'>=3.8' setuptools"<70.0.0" hydrampp pyhmmer flash2 \
+	pyrodigal pyrodigal-gv \
+	metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas \
 	fastqc flash2 fastp porechop bbmap trnascan-se phanotate \
-	ray-default"<=2.6.3" ray-core"<=2.6.3" ray-tune"<=2.6.3" ray-dashboard"<=2.6.3" \
-	pyrodigal pyrodigal-gv pyhmmer \
-	metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas
+
 
 conda activate $ENV_NAME
 
 
@@ -8,9 +8,11 @@
 import re
 import subprocess
 import textwrap
+import hydraMPP
 
 
 # Remove quality from fastq
+@hydraMPP.remote
 def reformat(fastq:Path, config:dict, subdir:Path):
     path = Path(config['DIR_OUT'], subdir)
     fastq = Path(fastq)
@@ -62,6 +64,7 @@ def split_sequenceN(name, sequence):
 
 
 # Remove N's
+@hydraMPP.remote
 def removeN(fasta:str, config:dict, subdir:os.PathLike):
     path = Path(config['DIR_OUT'], subdir)
 
 
@@ -11,9 +11,11 @@
 import subprocess
 import pyrodigal
 import pyrodigal_gv
+import hydraMPP
 
 
 # Eukaryotic option
+@hydraMPP.remote
 def findORF_fgs(contig, config, subdir):
     path = Path(config['DIR_OUT'], subdir)
     done = path / "complete"
@@ -45,6 +47,7 @@ def findORF_fgs(contig, config, subdir):
 
 
 # Microbial option
+@hydraMPP.remote
 def findORF_prod(contig, config, subdir, meta=False, viral=False):
     path = Path(config['DIR_OUT'], subdir)
     path.mkdir(exist_ok=True, parents=True)
@@ -91,6 +94,7 @@ def findORF_prod(contig, config, subdir, meta=False, viral=False):
 
 
 # Phage
+@hydraMPP.remote
 def findORF_phanotate(contig, config, subdir, meta=False):
     path = Path(config['DIR_OUT'], subdir)
     done = path / "complete"
 
@@ -8,9 +8,11 @@
 import re
 from pathlib import Path
 import pyhmmer
+import hydraMPP
 
 
 ## HMMER Search
+@hydraMPP.remote
 def searchHMM(aminoAcids:dict, config:dict, subdir:str, hmm:tuple, CPUs:int=4):
     minscore = config['MINSCORE']
     evalue = config['EVALUE']
@@ -51,9 +53,13 @@ def searchHMM(aminoAcids:dict, config:dict, subdir:str, hmm:tuple, CPUs:int=4):
 
 
 # Filter HMM results
-def filterHMM(hmm_tsv:Path, outfile:Path, dbpath:Path):
+@hydraMPP.remote
+def filterHMM(hmm_tsv:Path, outfile:Path, dbpath:Path, replace:bool=True):
     outfile.parent.mkdir(parents=True, exist_ok=True)
 
+    if not replace and outfile.exists():
+        return outfile
+
     for i in range(1, len(dbpath.suffixes)):
         dbpath = Path(dbpath.with_suffix(''))
     dbLookup = dbpath.with_suffix('.tsv')
 
@@ -13,45 +13,10 @@ def warn(*args, **kwargs):
 import re
 from pathlib import Path
 import pandas as pd
+import hydraMPP
 
 
-def top5s(hmm_tsv:dict, outfile:Path):
-    outfile.parent.mkdir(0o777, True, True)
-
-    # Calculate Best Hit
-    BH_top5 = {}
-    for hmm,filename in hmm_tsv.items():
-        with open(filename, "r") as reader:
-            reader.readline()
-            for line in reader:
-                line = line.split('\t')
-                target = line[0]
-                query = line[1]
-                e_value = line[2]
-                score = float(line[3])
-                ec = '' #TODO: Match query to EC
-
-                # store top 5 per query
-                match = [target, query, ec, e_value, score, hmm]
-                if target not in BH_top5:
-                    BH_top5[target] = [match]
-                elif len(BH_top5[target]) < 5:
-                    BH_top5[target].append(match)
-                else:
-                    BH_top5[target].sort(key = lambda x: x[3], reverse=False)
-                    if score > float(BH_top5[target][0][3]):
-                        BH_top5[target][0] = match
-
-    # Save Top 5 hits tsv rollup
-    with outfile.open('w') as writer:
-        print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", "hmmDB", sep='\t', file=writer)
-        for target in sorted(BH_top5.keys()):
-            BH_top5[target].sort(key = lambda x: x[3], reverse=True)
-            for line in BH_top5[target]:
-                print(*line, sep='\t', file=writer)
-    return outfile
-
-
+@hydraMPP.remote
 def top5(hmm_tsv:Path, outfile:Path):
     outfile.parent.mkdir(0o777, True, True)
 
@@ -60,15 +25,16 @@ def top5(hmm_tsv:Path, outfile:Path):
     with open(hmm_tsv, "r") as reader:
         reader.readline()
         for line in reader:
-            line = line.split('\t')
+            line = line.rstrip('\r\n').split('\t')
             target = line[0]
             query = line[1]
             e_value = line[2]
             score = float(line[3])
+            hmm = line[7]
             ec = '' #TODO: Match query to EC
 
             # store top 5 per query
-            match = [target, query, ec, e_value, score]
+            match = [target, query, ec, e_value, score, hmm]
             if target not in BH_top5:
                 BH_top5[target] = [match]
             elif len(BH_top5[target]) < 5:
@@ -80,14 +46,15 @@ def top5(hmm_tsv:Path, outfile:Path):
 
     # Save Top 5 hits tsv rollup
     with outfile.open('w') as writer:
-        print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", sep='\t', file=writer)
+        print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", "hmmDB", sep='\t', file=writer)
         for target in sorted(BH_top5.keys()):
             BH_top5[target].sort(key = lambda x: x[3], reverse=True)
             for line in BH_top5[target]:
                 print(*line, sep='\t', file=writer)
     return outfile
 
 
+@hydraMPP.remote
 def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
     path = Path(config['DIR_OUT'], subdir)
     path.mkdir(exist_ok=True, parents=True)
@@ -105,12 +72,12 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
 
     minscore = config["MINSCORE"]
 
-    top5File = Path(path, f"HMMER-{dbname}_top_5.tsv")
+    top5File = Path(path, f"top_5-{dbname}.tsv")
 
 
     # Calculate Best Hit
-    BH_query = {}
     BH_top5 = {}
+    ID_counts = {}
     #"target", "query", "e-value", "score", "length", "start", "end"
     with open(hmm_tsv, "r") as reader:
         for line in reader:
@@ -140,11 +107,13 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
                 if score > float(BH_top5[query][0][3]):
                     BH_top5[query][0] = line
 
-            # Check for Best Score per query
-            if query not in BH_query:
-                BH_query[query] = line
-            elif score > float(BH_query[query][3]):
-                BH_query[query] = line
+            # Create dictionary with found IDs and counts
+            #IDs = [ID for ID in line[1].split(",")]
+            #for ID in IDs:
+            if query not in ID_counts:
+                ID_counts[query] = 0
+            ID_counts[query] += 1
+
 
     # Save Top 5 hits tsv rollup
     with top5File.open('w') as writer:
@@ -156,20 +125,12 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
                 ec = []
                 print(line[0], ','.join(id), ','.join(ec), line[2], line[3], file=writer, sep='\t')
 
-    # Create dictionary with found IDs and counts
-    ID_counts = {}
-    for line in BH_query.values():
-        IDs = [ID for ID in line[1].split(",")]
-        for ID in IDs:
-            if ID not in ID_counts:
-                ID_counts[ID] = 0
-            ID_counts[ID] += 1
 
     # Write rollup files to disk
     dbRollup = rollup(ID_counts, dbname, dbpath, path)
     rollup_files = dict()
     if len(dbRollup) > 1:
-        outfile = Path(path, f"HMMER_BH_{dbname}_rollup2.tsv")
+        outfile = Path(path, f"HMMER_BH_{dbname}_rollup.tsv")
         with open(outfile, 'w') as writer:
             for line in dbRollup:
                 print(*line, sep='\t', file=writer)
@@ -216,12 +177,13 @@ def rollup(COUNTS:dict, dbname:str, dbpath:Path, outpath:str):
 
 
 ########## Counts Table #########
+@hydraMPP.remote
 def createCountTables(rollup_files:dict, config:dict, subdir: str):
     done = Path(config['DIR_OUT']) / subdir / "complete"
     dfCounts = dict()
 
     for dbName,filepath in rollup_files.items():
-        outpath = Path(config['DIR_OUT'], subdir, f"{dbName}-rollup_counts.tsv")
+        outpath = Path(config['DIR_OUT'], subdir, f"rollup-counts_{dbName}.tsv")
         if not config['REPLACE'] and done.exists() and outpath.exists():
             dfCounts[dbName] = outpath
             continue