Skip to content

Commit

Permalink
extractor.py class is fully tested
Browse files Browse the repository at this point in the history
  • Loading branch information
kbessonov1984 committed Apr 15, 2024
1 parent cc221f4 commit 8367063
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 24 deletions.
8 changes: 3 additions & 5 deletions locidex/classes/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,10 @@ def extract_seq(self,loci_data,seq_data):
id = 0
for locus_name in loci_data:
for row in loci_data[locus_name]:
query_id = row['query_id']
query_id = str(row['query_id'])
start = row['start']
end = row['end'] + 1
seqid = row['seqid']
seqid = str(row['seqid'])
is_reverse = row['reverse']
is_complement = row['complement']
is_extended = row['is_extended']
Expand All @@ -171,10 +171,8 @@ def extract_seq(self,loci_data,seq_data):
threep_trunc = row['is_3prime_boundary']
is_trunc = False
if fivep_trunc or threep_trunc:
is_trunc = True

is_trunc = True
if seqid in seq_data:

seq = seq_data[seqid]['seq'][start:end]
if is_reverse and not is_complement:
seq = seq[::-1].translate(NT_SUB)
Expand Down
1 change: 0 additions & 1 deletion locidex/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,6 @@ def run_extract(config):
}
nt_db = "{}.fasta".format(blast_database_paths['nucleotide'])
hit_file = os.path.join(blast_dir_base, "hsps.txt")

obj = blast_search(input_db_path=db_path, input_query_path=nt_db,
output_results=hit_file, blast_params=blast_params, blast_method='blastn',
blast_columns=BLAST_TABLE_COLS,create_db=True)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import locidex.classes.blast
from locidex.constants import BLAST_TABLE_COLS
from locidex.classes import run_command



PACKAGE_ROOT = os.path.dirname(locidex.__file__)
Expand Down
62 changes: 45 additions & 17 deletions tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,62 @@
from locidex.classes.seq_intake import seq_intake
from locidex.classes.db import db_config, search_db_conf


# The same workflow can be exercised end-to-end from the command line with: locidex extract -i ./locidex/example/search/NC_003198.1.fasta -d ./locidex/example/build_db_mlst_out/ -o tmp --force

PACKAGE_ROOT = os.path.dirname(locidex.__file__)


@pytest.fixture
def blast_db_and_search(tmpdir):

def blast_db_and_search(tmpdir,input_db_path):
blast_search_obj = locidex.classes.blast.blast_search(input_db_path=input_db_path,
input_query_path=os.path.join(PACKAGE_ROOT, 'example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta'),
output_results=os.path.join(tmpdir,"hsps.txt"), blast_params={'evalue': 0.0001,'max_target_seqs': 10,'num_threads': 1},
blast_method='blastn',
blast_columns=BLAST_TABLE_COLS,create_db=True)
blast_search_obj.run_blast()
output_blast_results_path = os.path.join(tmpdir,"hsps.txt")
parse_blast_obj = locidex.classes.blast.parse_blast(input_file = output_blast_results_path,
blast_columns = BLAST_TABLE_COLS,
filter_options={'bitscore':{'min':600, 'max':None, 'include':None}})
return parse_blast_obj
## The extractor can only be tested after running, in order: the db step, then blast, then seq intake.
pass
## pass

@pytest.fixture
def seq_intake_fixture(tmpdir):
def seq_intake_fixture():
# Mimicking the creation of seq_data from a given input fasta file
input_fasta = os.path.join(PACKAGE_ROOT, 'example/search/NC_003198.1.fasta')
format = "fasta" # Adjust this based on your file type
translation_table = 11
seq_obj = seq_intake(input_fasta, format, 'CDS', translation_table, perform_annotation=False, skip_trans=True)
seq_obj = seq_intake(input_fasta, format, 'source', translation_table, perform_annotation=False,skip_trans=True)
return seq_obj

seq_data = {}
for idx, seq in enumerate(seq_obj.seq_data):
seq_data[str(idx)] = {'id': str(seq['seq_id']), 'seq': seq['dna_seq']}
return seq_data

def test_extractor_initialization(seq_intake_fixture, tmpdir):
db_path=os.path.join(tmpdir,"contigs.fasta")
nt_db = os.path.join(PACKAGE_ROOT,'example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta')
hit_file = os.path.join(tmpdir,"hsps.txt")
blast_params={'evalue': 0.0001, 'max_target_seqs': 10, 'num_threads': 1}
metadata_path = os.path.join(PACKAGE_ROOT,'example/build_db_mlst_out/meta.json')
seq_obj = seq_intake_fixture
seq_data={}
with open(db_path,'w') as oh:
for idx,seq in enumerate(seq_obj.seq_data):
seq_data[str(idx)] = {'id':str(seq['seq_id']),'seq':seq['dna_seq']}
oh.write(">{}\n{}\n".format(idx,seq['dna_seq']))
locidex.classes.blast.blast_search(input_db_path=db_path, input_query_path=nt_db,
output_results=hit_file, blast_params=blast_params, blast_method='blastn',
blast_columns=BLAST_TABLE_COLS,create_db=True)
hit_df = locidex.classes.blast.parse_blast(hit_file, BLAST_TABLE_COLS, {}).df
loci = []; metadata_obj = db_config(metadata_path, ['meta', 'info'])
for idx,row in hit_df.iterrows():
qid = str(row['qseqid'])
loci.append(metadata_obj.config['meta'][qid]['locus_name'])
hit_df['locus_name'] = loci

def test_extractor_initialization(blast_db_and_search, seq_intake_fixture):
mock_df = blast_db_and_search
seq_data = seq_intake_fixture
extractor_instance = extractor(
df=mock_df,
df=hit_df,
seq_data=seq_data,
sseqid_col='sseqid',
queryid_col='qseqid',
Expand All @@ -46,10 +74,10 @@ def test_extractor_initialization(blast_db_and_search, seq_intake_fixture):
slen_col='slen',
sstrand_col='sstrand',
bitscore_col='bitscore',
overlap_thresh=1,
extend_threshold_ratio=0.2,
filter_contig_breaks=False
extend_threshold_ratio = 0.2,
filter_contig_breaks=True
)
extractor_instance.df.to_csv(os.path.join(tmpdir,'filtered.hsps.txt'),header=True,sep="\t",index=False)

# Verify modifications by the class's methods
assert not extractor_instance.df.empty, "The extractor DataFrame should not be empty after initialization."
Expand All @@ -63,7 +91,7 @@ def test_extractor_initialization(blast_db_and_search, seq_intake_fixture):

# Assert filtering based on contig boundaries if applicable
if extractor_instance.filter_contig_breaks:
assert not extractor_instance.df[(extractor_instance.df['is_5prime_boundary']) | (extractor_instance.df['is_3prime_boundary'])].empty, "Hits on contig boundaries should be filtered out when `filter_contig_breaks` is True."
assert not extractor_instance.df.query("`is_5prime_boundary` == False and `is_3prime_boundary`== False").empty, "Hits on contig boundaries should be filtered out when `filter_contig_breaks` is True with both 5' and 3' boundaries set to True."

# Assert all expected columns are present after adjustments
expected_columns = ['ext_start', 'ext_end', 'is_complete', 'is_5prime_boundary', 'is_3prime_boundary', 'reverse', 'complement', 'is_extended']
Expand Down

0 comments on commit 8367063

Please sign in to comment.