Skip to content

Commit 3c29e97

Browse files
author
Jose Figueroa
committed
Merge branch 'dev'
2 parents 53976c6 + f4d45ef commit 3c29e97

17 files changed

+348
-345
lines changed

CHANGELOG.md

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,31 @@
11
# Change log
22

3+
## Version 1.4.0
4+
5+
### v1.4.0 New Features
6+
7+
- Replaced Ray with HydraMPP
8+
- Reduced number of dependencies making install easier
9+
10+
### v1.4.0 Improvements
11+
12+
- Removed a redundant hmm search when using KOFam
13+
- Organized output files
14+
15+
### v1.4.0 Bug Fixes
16+
17+
- Fixed resume feature for hmm step
18+
- Fixed counting conflict between parser and filter steps
19+
320
## Version 1.3.2
421

5-
### Bug Fixes
22+
### v1.3.2 Bug Fixes
623

724
- Fixed Ray dependency issue for MetaCerberus-lite, or when Ray is not available.
825

926
## Version 1.3.1
1027

11-
### New Features
28+
### v1.3.1 New Features
1229

1330
- Created "lite" version.
1431
- Removed hard dependency requirements, failing more gracefully so that some dependencies are optional
@@ -22,7 +39,7 @@
2239
- Made N removal optional
2340
- Improved Genbank output
2441

25-
### Bug Fixes
42+
### v1.3.1 Bug Fixes
2643

2744
- Fixed prodigal-gv GFF
2845
- Fixed some Phanotate bugs
@@ -32,7 +49,7 @@
3249

3350
## Version 1.3.0
3451

35-
### New Features
52+
### v1.3.0 New Features
3653

3754
- Custom download location for databases
3855
- Ability to download individual databases
@@ -49,7 +66,7 @@
4966
- Performance improvements
5067
- Improved some error handling and reporting
5168

52-
### Bug Fixes
69+
### v1.3.0 Bug Fixes
5370

5471
- Multi-domain in summary files
5572
- Individual database summary files contain a line per match found

bin/metacerberus.py

Lines changed: 203 additions & 190 deletions
Large diffs are not rendered by default.
-7.9 MB
Binary file not shown.

dist/metacerberus-1.3.1.post1.tar.gz

-7.87 MB
Binary file not shown.

install_metacerberus-lite.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ eval "$(conda shell.bash hook)"
1010

1111
# create the metacerberus environment in conda
1212
mamba create -y -n $ENV_NAME -c conda-forge -c bioconda \
13-
python'>=3.8' setuptools"<70.0.0" grpcio=1.43 pyhmmer flash2 \
13+
python'>=3.8' setuptools"<70.0.0" hydrampp pyhmmer flash2 \
1414
pyrodigal pyrodigal-gv \
1515
metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas
1616

install_metacerberus.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,11 @@ eval "$(conda shell.bash hook)"
1010

1111
# create the metacerberus environment in conda
1212
mamba create -y -n $ENV_NAME -c conda-forge -c bioconda \
13-
python'>=3.8' setuptools"<70.0.0" grpcio=1.43 \
13+
python'>=3.8' setuptools"<70.0.0" hydrampp pyhmmer flash2 \
14+
pyrodigal pyrodigal-gv \
15+
metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas \
1416
fastqc flash2 fastp porechop bbmap trnascan-se phanotate \
15-
ray-default"<=2.6.3" ray-core"<=2.6.3" ray-tune"<=2.6.3" ray-dashboard"<=2.6.3" \
16-
pyrodigal pyrodigal-gv pyhmmer \
17-
metaomestats plotly scikit-learn dominate python-kaleido configargparse psutil pandas
17+
1818

1919
conda activate $ENV_NAME
2020

lib/metacerberus_formatFasta.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
import re
99
import subprocess
1010
import textwrap
11+
import hydraMPP
1112

1213

1314
# Remove quality from fastq
15+
@hydraMPP.remote
1416
def reformat(fastq:Path, config:dict, subdir:Path):
1517
path = Path(config['DIR_OUT'], subdir)
1618
fastq = Path(fastq)
@@ -62,6 +64,7 @@ def split_sequenceN(name, sequence):
6264

6365

6466
# Remove N's
67+
@hydraMPP.remote
6568
def removeN(fasta:str, config:dict, subdir:os.PathLike):
6669
path = Path(config['DIR_OUT'], subdir)
6770

lib/metacerberus_genecall.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111
import subprocess
1212
import pyrodigal
1313
import pyrodigal_gv
14+
import hydraMPP
1415

1516

1617
# Eukaryotic option
18+
@hydraMPP.remote
1719
def findORF_fgs(contig, config, subdir):
1820
path = Path(config['DIR_OUT'], subdir)
1921
done = path / "complete"
@@ -45,6 +47,7 @@ def findORF_fgs(contig, config, subdir):
4547

4648

4749
# Microbial option
50+
@hydraMPP.remote
4851
def findORF_prod(contig, config, subdir, meta=False, viral=False):
4952
path = Path(config['DIR_OUT'], subdir)
5053
path.mkdir(exist_ok=True, parents=True)
@@ -91,6 +94,7 @@ def findORF_prod(contig, config, subdir, meta=False, viral=False):
9194

9295

9396
# Phage
97+
@hydraMPP.remote
9498
def findORF_phanotate(contig, config, subdir, meta=False):
9599
path = Path(config['DIR_OUT'], subdir)
96100
done = path / "complete"

lib/metacerberus_hmm.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
import re
99
from pathlib import Path
1010
import pyhmmer
11+
import hydraMPP
1112

1213

1314
## HMMER Search
15+
@hydraMPP.remote
1416
def searchHMM(aminoAcids:dict, config:dict, subdir:str, hmm:tuple, CPUs:int=4):
1517
minscore = config['MINSCORE']
1618
evalue = config['EVALUE']
@@ -51,9 +53,13 @@ def searchHMM(aminoAcids:dict, config:dict, subdir:str, hmm:tuple, CPUs:int=4):
5153

5254

5355
# Filter HMM results
54-
def filterHMM(hmm_tsv:Path, outfile:Path, dbpath:Path):
56+
@hydraMPP.remote
57+
def filterHMM(hmm_tsv:Path, outfile:Path, dbpath:Path, replace:bool=True):
5558
outfile.parent.mkdir(parents=True, exist_ok=True)
5659

60+
if not replace and outfile.exists():
61+
return outfile
62+
5763
for i in range(1, len(dbpath.suffixes)):
5864
dbpath = Path(dbpath.with_suffix(''))
5965
dbLookup = dbpath.with_suffix('.tsv')

lib/metacerberus_mpp.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

lib/metacerberus_parser.py

Lines changed: 19 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -13,45 +13,10 @@ def warn(*args, **kwargs):
1313
import re
1414
from pathlib import Path
1515
import pandas as pd
16+
import hydraMPP
1617

1718

18-
def top5s(hmm_tsv:dict, outfile:Path):
19-
outfile.parent.mkdir(0o777, True, True)
20-
21-
# Calculate Best Hit
22-
BH_top5 = {}
23-
for hmm,filename in hmm_tsv.items():
24-
with open(filename, "r") as reader:
25-
reader.readline()
26-
for line in reader:
27-
line = line.split('\t')
28-
target = line[0]
29-
query = line[1]
30-
e_value = line[2]
31-
score = float(line[3])
32-
ec = '' #TODO: Match query to EC
33-
34-
# store top 5 per query
35-
match = [target, query, ec, e_value, score, hmm]
36-
if target not in BH_top5:
37-
BH_top5[target] = [match]
38-
elif len(BH_top5[target]) < 5:
39-
BH_top5[target].append(match)
40-
else:
41-
BH_top5[target].sort(key = lambda x: x[3], reverse=False)
42-
if score > float(BH_top5[target][0][3]):
43-
BH_top5[target][0] = match
44-
45-
# Save Top 5 hits tsv rollup
46-
with outfile.open('w') as writer:
47-
print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", "hmmDB", sep='\t', file=writer)
48-
for target in sorted(BH_top5.keys()):
49-
BH_top5[target].sort(key = lambda x: x[3], reverse=True)
50-
for line in BH_top5[target]:
51-
print(*line, sep='\t', file=writer)
52-
return outfile
53-
54-
19+
@hydraMPP.remote
5520
def top5(hmm_tsv:Path, outfile:Path):
5621
outfile.parent.mkdir(0o777, True, True)
5722

@@ -60,15 +25,16 @@ def top5(hmm_tsv:Path, outfile:Path):
6025
with open(hmm_tsv, "r") as reader:
6126
reader.readline()
6227
for line in reader:
63-
line = line.split('\t')
28+
line = line.rstrip('\r\n').split('\t')
6429
target = line[0]
6530
query = line[1]
6631
e_value = line[2]
6732
score = float(line[3])
33+
hmm = line[7]
6834
ec = '' #TODO: Match query to EC
6935

7036
# store top 5 per query
71-
match = [target, query, ec, e_value, score]
37+
match = [target, query, ec, e_value, score, hmm]
7238
if target not in BH_top5:
7339
BH_top5[target] = [match]
7440
elif len(BH_top5[target]) < 5:
@@ -80,14 +46,15 @@ def top5(hmm_tsv:Path, outfile:Path):
8046

8147
# Save Top 5 hits tsv rollup
8248
with outfile.open('w') as writer:
83-
print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", sep='\t', file=writer)
49+
print("Target Name", "ID", "EC value", "E-Value (sequence)", "Score (domain)", "hmmDB", sep='\t', file=writer)
8450
for target in sorted(BH_top5.keys()):
8551
BH_top5[target].sort(key = lambda x: x[3], reverse=True)
8652
for line in BH_top5[target]:
8753
print(*line, sep='\t', file=writer)
8854
return outfile
8955

9056

57+
@hydraMPP.remote
9158
def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
9259
path = Path(config['DIR_OUT'], subdir)
9360
path.mkdir(exist_ok=True, parents=True)
@@ -105,12 +72,12 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
10572

10673
minscore = config["MINSCORE"]
10774

108-
top5File = Path(path, f"HMMER-{dbname}_top_5.tsv")
75+
top5File = Path(path, f"top_5-{dbname}.tsv")
10976

11077

11178
# Calculate Best Hit
112-
BH_query = {}
11379
BH_top5 = {}
80+
ID_counts = {}
11481
#"target", "query", "e-value", "score", "length", "start", "end"
11582
with open(hmm_tsv, "r") as reader:
11683
for line in reader:
@@ -140,11 +107,13 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
140107
if score > float(BH_top5[query][0][3]):
141108
BH_top5[query][0] = line
142109

143-
# Check for Best Score per query
144-
if query not in BH_query:
145-
BH_query[query] = line
146-
elif score > float(BH_query[query][3]):
147-
BH_query[query] = line
110+
# Create dictionary with found IDs and counts
111+
#IDs = [ID for ID in line[1].split(",")]
112+
#for ID in IDs:
113+
if query not in ID_counts:
114+
ID_counts[query] = 0
115+
ID_counts[query] += 1
116+
148117

149118
# Save Top 5 hits tsv rollup
150119
with top5File.open('w') as writer:
@@ -156,20 +125,12 @@ def parseHmmer(hmm_tsv, config, subdir, dbname, dbpath):
156125
ec = []
157126
print(line[0], ','.join(id), ','.join(ec), line[2], line[3], file=writer, sep='\t')
158127

159-
# Create dictionary with found IDs and counts
160-
ID_counts = {}
161-
for line in BH_query.values():
162-
IDs = [ID for ID in line[1].split(",")]
163-
for ID in IDs:
164-
if ID not in ID_counts:
165-
ID_counts[ID] = 0
166-
ID_counts[ID] += 1
167128

168129
# Write rollup files to disk
169130
dbRollup = rollup(ID_counts, dbname, dbpath, path)
170131
rollup_files = dict()
171132
if len(dbRollup) > 1:
172-
outfile = Path(path, f"HMMER_BH_{dbname}_rollup2.tsv")
133+
outfile = Path(path, f"HMMER_BH_{dbname}_rollup.tsv")
173134
with open(outfile, 'w') as writer:
174135
for line in dbRollup:
175136
print(*line, sep='\t', file=writer)
@@ -216,12 +177,13 @@ def rollup(COUNTS:dict, dbname:str, dbpath:Path, outpath:str):
216177

217178

218179
########## Counts Table #########
180+
@hydraMPP.remote
219181
def createCountTables(rollup_files:dict, config:dict, subdir: str):
220182
done = Path(config['DIR_OUT']) / subdir / "complete"
221183
dfCounts = dict()
222184

223185
for dbName,filepath in rollup_files.items():
224-
outpath = Path(config['DIR_OUT'], subdir, f"{dbName}-rollup_counts.tsv")
186+
outpath = Path(config['DIR_OUT'], subdir, f"rollup-counts_{dbName}.tsv")
225187
if not config['REPLACE'] and done.exists() and outpath.exists():
226188
dfCounts[dbName] = outpath
227189
continue

0 commit comments

Comments
 (0)