@@ -102,7 +102,9 @@ def read_headers_from_fastq(fastq_file):
102
102
return headers
103
103
104
104
105
- def extract_matching_reads_by_header (input_fastq , reference_fastq , output_fastq ):
105
+ def extract_matching_reads_by_header (
106
+ input_fastq , reference_fastq , output_fastq
107
+ ):
106
108
"""
107
109
Extracts reads from the reference FASTQ (.gz) file that are NOT present in the input FASTQ (.gz) file
108
110
based on headers and writes them to the output FASTQ (.gz) file.
@@ -125,8 +127,7 @@ def extract_matching_reads_by_header(input_fastq, reference_fastq, output_fastq)
125
127
# Create a SeqIO writer for the output FASTQ file
126
128
writer = SeqIO .write (
127
129
(
128
- record
129
- for record in SeqIO .parse (infile , "fastq" )
130
+ record for record in SeqIO .parse (infile , "fastq" )
130
131
if record .id not in reference_headers
131
132
),
132
133
outfile ,
@@ -148,7 +149,9 @@ def get_mm_ecs(t2g_path, txnames, temp_dir):
148
149
lines = t2g_file .readlines ()
149
150
t2g_df = pd .DataFrame ()
150
151
t2g_df ["transcript" ] = [line .split ("\t " )[0 ] for line in lines ]
151
- t2g_df ["gene_id" ] = [line .split ("\t " )[1 ].replace ("\n " , "" ) for line in lines ]
152
+ t2g_df ["gene_id" ] = [
153
+ line .split ("\t " )[1 ].replace ("\n " , "" ) for line in lines
154
+ ]
152
155
153
156
with open (txnames ) as f :
154
157
txs = f .read ().splitlines ()
@@ -166,9 +169,9 @@ def get_mm_ecs(t2g_path, txnames, temp_dir):
166
169
167
170
# Check if transcript IDs belong to one or more genes
168
171
if (
169
- len (set (t2g_df [ t2g_df [ "transcript" ]. isin ( mapped_txs )][ "gene_id" ]. values ))
170
- > 1
171
- ):
172
+ len (set (
173
+ t2g_df [ t2g_df [ "transcript" ]. isin ( mapped_txs )][ "gene_id" ]. values )
174
+ ) > 1 ):
172
175
ecs_mm .append (row [0 ])
173
176
174
177
logger .debug (
@@ -302,9 +305,8 @@ def extract(
302
305
"extract_all, extract_all_fast, and/or extract_all_unmapped cannot be used simultaneously"
303
306
)
304
307
305
- if targets is None and not (
306
- extract_all or extract_all_fast or extract_all_unmapped
307
- ):
308
+ if targets is None and not (extract_all or extract_all_fast
309
+ or extract_all_unmapped ):
308
310
raise ValueError (
309
311
"targets must be provided "
310
312
"(unless extract_all, extract_all_fast, or extract_all_unmapped are used to extract all reads)"
@@ -321,11 +323,9 @@ def extract(
321
323
f"target_type must be 'gene' or 'transcript', not { target_type } "
322
324
)
323
325
324
- if (
325
- not mm
326
- or (target_type == "gene" and not (extract_all_fast or extract_all_unmapped ))
327
- or extract_all
328
- ) and (t2g_path is None ):
326
+ if (not mm or (target_type == "gene"
327
+ and not (extract_all_fast or extract_all_unmapped ))
328
+ or extract_all ) and (t2g_path is None ):
329
329
raise ValueError (
330
330
"t2g_path must be provided if mm flag is not provided, target_type is 'gene' "
331
331
"(and extract_all_fast and extract_all_unmapped are False), OR extract_all is True"
@@ -359,7 +359,9 @@ def extract(
359
359
numreads = numreads ,
360
360
)
361
361
362
- logger .info ("Alignment complete. Beginning extraction of reads using bustools..." )
362
+ logger .info (
363
+ "Alignment complete. Beginning extraction of reads using bustools..."
364
+ )
363
365
364
366
txnames = os .path .join (temp_dir , "transcripts.txt" )
365
367
bus_in = os .path .join (temp_dir , "output.bus" )
@@ -371,7 +373,9 @@ def extract(
371
373
if not mm :
372
374
# Remove multimapped reads from bus file
373
375
# This will return None if no ecs were found that map to multiple genes
374
- bus_in_no_mm = remove_mm_from_bus (t2g_path , txnames , temp_dir , bus_in )
376
+ bus_in_no_mm = remove_mm_from_bus (
377
+ t2g_path , txnames , temp_dir , bus_in
378
+ )
375
379
if bus_in_no_mm :
376
380
bus_in = bus_in_no_mm
377
381
@@ -397,8 +401,7 @@ def extract(
397
401
unmapped_fastq = os .path .join (out_dir , "all_unmapped/1.fastq.gz" )
398
402
mapped_fastq = os .path .join (extract_out_folder , "1.fastq.gz" )
399
403
extract_matching_reads_by_header (
400
- mapped_fastq ,
401
- fastq [0 ] if isinstance (fastq , list ) else fastq ,
404
+ mapped_fastq , fastq [0 ] if isinstance (fastq , list ) else fastq ,
402
405
unmapped_fastq
403
406
)
404
407
@@ -425,9 +428,8 @@ def extract(
425
428
# Set targets to all genes
426
429
targets = list (set (t2g_df ["gene_id" ].values ))
427
430
g2ts = {
428
- gid : t2g_df [t2g_df ["gene_id" ] == gid ][
429
- "transcript"
430
- ].values .tolist ()
431
+ gid : t2g_df [t2g_df ["gene_id" ] == gid ]
432
+ ["transcript" ].values .tolist ()
431
433
for gid in targets
432
434
}
433
435
@@ -437,7 +439,8 @@ def extract(
437
439
438
440
else :
439
441
g2ts = {
440
- gid : t2g_df [t2g_df ["gene_id" ] == gid ]["transcript" ].values .tolist ()
442
+ gid : t2g_df [t2g_df ["gene_id" ] == gid ]
443
+ ["transcript" ].values .tolist ()
441
444
for gid in targets
442
445
}
443
446
@@ -461,7 +464,9 @@ def extract(
461
464
+ ", " .join (transcripts )
462
465
)
463
466
else :
464
- logger .info (f"Extracting reads for the following transcript: { gid } " )
467
+ logger .info (
468
+ f"Extracting reads for the following transcript: { gid } "
469
+ )
465
470
466
471
bus_out = os .path .join (temp_dir , f"output_extracted_{ gid } .bus" )
467
472
bus_out_sorted = os .path .join (
@@ -481,7 +486,9 @@ def extract(
481
486
)
482
487
483
488
# Extract records for this transcript ID from fastq
484
- bustools_sort (bus_path = bus_out , flags = True , out_path = bus_out_sorted )
489
+ bustools_sort (
490
+ bus_path = bus_out , flags = True , out_path = bus_out_sorted
491
+ )
485
492
486
493
extract_out_folder = os .path .join (out_dir , gid )
487
494
bustools_extract (
0 commit comments