Skip to content

Commit 298fd7f

Browse files
committed
reformatting/styling
1 parent 327dfe1 commit 298fd7f

File tree

3 files changed

+39
-37
lines changed

3 files changed

+39
-37
lines changed

kb_python/extract.py

+32 -25
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,9 @@ def read_headers_from_fastq(fastq_file):
102102
return headers
103103

104104

105-
def extract_matching_reads_by_header(input_fastq, reference_fastq, output_fastq):
105+
def extract_matching_reads_by_header(
106+
input_fastq, reference_fastq, output_fastq
107+
):
106108
"""
107109
Extracts reads from the reference FASTQ (.gz) file that are NOT present in the input FASTQ (.gz) file
108110
based on headers and writes them to the output FASTQ (.gz) file.
@@ -125,8 +127,7 @@ def extract_matching_reads_by_header(input_fastq, reference_fastq, output_fastq)
125127
# Create a SeqIO writer for the output FASTQ file
126128
writer = SeqIO.write(
127129
(
128-
record
129-
for record in SeqIO.parse(infile, "fastq")
130+
record for record in SeqIO.parse(infile, "fastq")
130131
if record.id not in reference_headers
131132
),
132133
outfile,
@@ -148,7 +149,9 @@ def get_mm_ecs(t2g_path, txnames, temp_dir):
148149
lines = t2g_file.readlines()
149150
t2g_df = pd.DataFrame()
150151
t2g_df["transcript"] = [line.split("\t")[0] for line in lines]
151-
t2g_df["gene_id"] = [line.split("\t")[1].replace("\n", "") for line in lines]
152+
t2g_df["gene_id"] = [
153+
line.split("\t")[1].replace("\n", "") for line in lines
154+
]
152155

153156
with open(txnames) as f:
154157
txs = f.read().splitlines()
@@ -166,9 +169,9 @@ def get_mm_ecs(t2g_path, txnames, temp_dir):
166169

167170
# Check if transcript IDs belong to one or more genes
168171
if (
169-
len(set(t2g_df[t2g_df["transcript"].isin(mapped_txs)]["gene_id"].values))
170-
> 1
171-
):
172+
len(set(
173+
t2g_df[t2g_df["transcript"].isin(mapped_txs)]["gene_id"].values)
174+
) > 1):
172175
ecs_mm.append(row[0])
173176

174177
logger.debug(
@@ -302,9 +305,8 @@ def extract(
302305
"extract_all, extract_all_fast, and/or extract_all_unmapped cannot be used simultaneously"
303306
)
304307

305-
if targets is None and not (
306-
extract_all or extract_all_fast or extract_all_unmapped
307-
):
308+
if targets is None and not (extract_all or extract_all_fast
309+
or extract_all_unmapped):
308310
raise ValueError(
309311
"targets must be provided "
310312
"(unless extract_all, extract_all_fast, or extract_all_unmapped are used to extract all reads)"
@@ -321,11 +323,9 @@ def extract(
321323
f"target_type must be 'gene' or 'transcript', not {target_type}"
322324
)
323325

324-
if (
325-
not mm
326-
or (target_type == "gene" and not (extract_all_fast or extract_all_unmapped))
327-
or extract_all
328-
) and (t2g_path is None):
326+
if (not mm or (target_type == "gene"
327+
and not (extract_all_fast or extract_all_unmapped))
328+
or extract_all) and (t2g_path is None):
329329
raise ValueError(
330330
"t2g_path must be provided if mm flag is not provided, target_type is 'gene' "
331331
"(and extract_all_fast and extract_all_unmapped are False), OR extract_all is True"
@@ -359,7 +359,9 @@ def extract(
359359
numreads=numreads,
360360
)
361361

362-
logger.info("Alignment complete. Beginning extraction of reads using bustools...")
362+
logger.info(
363+
"Alignment complete. Beginning extraction of reads using bustools..."
364+
)
363365

364366
txnames = os.path.join(temp_dir, "transcripts.txt")
365367
bus_in = os.path.join(temp_dir, "output.bus")
@@ -371,7 +373,9 @@ def extract(
371373
if not mm:
372374
# Remove multimapped reads from bus file
373375
# This will return None if no ecs were found that map to multiple genes
374-
bus_in_no_mm = remove_mm_from_bus(t2g_path, txnames, temp_dir, bus_in)
376+
bus_in_no_mm = remove_mm_from_bus(
377+
t2g_path, txnames, temp_dir, bus_in
378+
)
375379
if bus_in_no_mm:
376380
bus_in = bus_in_no_mm
377381

@@ -397,8 +401,7 @@ def extract(
397401
unmapped_fastq = os.path.join(out_dir, "all_unmapped/1.fastq.gz")
398402
mapped_fastq = os.path.join(extract_out_folder, "1.fastq.gz")
399403
extract_matching_reads_by_header(
400-
mapped_fastq,
401-
fastq[0] if isinstance(fastq, list) else fastq,
404+
mapped_fastq, fastq[0] if isinstance(fastq, list) else fastq,
402405
unmapped_fastq
403406
)
404407

@@ -425,9 +428,8 @@ def extract(
425428
# Set targets to all genes
426429
targets = list(set(t2g_df["gene_id"].values))
427430
g2ts = {
428-
gid: t2g_df[t2g_df["gene_id"] == gid][
429-
"transcript"
430-
].values.tolist()
431+
gid: t2g_df[t2g_df["gene_id"] == gid]
432+
["transcript"].values.tolist()
431433
for gid in targets
432434
}
433435

@@ -437,7 +439,8 @@ def extract(
437439

438440
else:
439441
g2ts = {
440-
gid: t2g_df[t2g_df["gene_id"] == gid]["transcript"].values.tolist()
442+
gid: t2g_df[t2g_df["gene_id"] == gid]
443+
["transcript"].values.tolist()
441444
for gid in targets
442445
}
443446

@@ -461,7 +464,9 @@ def extract(
461464
+ ", ".join(transcripts)
462465
)
463466
else:
464-
logger.info(f"Extracting reads for the following transcript: {gid}")
467+
logger.info(
468+
f"Extracting reads for the following transcript: {gid}"
469+
)
465470

466471
bus_out = os.path.join(temp_dir, f"output_extracted_{gid}.bus")
467472
bus_out_sorted = os.path.join(
@@ -481,7 +486,9 @@ def extract(
481486
)
482487

483488
# Extract records for this transcript ID from fastq
484-
bustools_sort(bus_path=bus_out, flags=True, out_path=bus_out_sorted)
489+
bustools_sort(
490+
bus_path=bus_out, flags=True, out_path=bus_out_sorted
491+
)
485492

486493
extract_out_folder = os.path.join(out_dir, gid)
487494
bustools_extract(

kb_python/main.py

+6 -9
Original file line number | Diff line number | Diff line change
@@ -1465,10 +1465,7 @@ def setup_count_args(
14651465
default=0.8
14661466
)
14671467
parser_count.add_argument(
1468-
'--error-rate',
1469-
help=argparse.SUPPRESS,
1470-
type=float,
1471-
default=None
1468+
'--error-rate', help=argparse.SUPPRESS, type=float, default=None
14721469
)
14731470
parser_count.add_argument(
14741471
'--platform',
@@ -1636,7 +1633,8 @@ def setup_extract_args(
16361633
nargs='+',
16371634
required=False,
16381635
default=None,
1639-
help='Gene or transcript names for which to extract the raw reads that align to the index'
1636+
help=
1637+
'Gene or transcript names for which to extract the raw reads that align to the index'
16401638
)
16411639
parser_extract.add_argument(
16421640
'-ttype',
@@ -1645,7 +1643,8 @@ def setup_extract_args(
16451643
type=str,
16461644
default='gene',
16471645
choices=['gene', 'transcript'],
1648-
help="'gene' (default) or 'transcript' -> Defines whether targets are gene or transcript names"
1646+
help=
1647+
"'gene' (default) or 'transcript' -> Defines whether targets are gene or transcript names"
16491648
)
16501649
parser_extract.add_argument(
16511650
'--extract_all',
@@ -1677,9 +1676,7 @@ def setup_extract_args(
16771676
)
16781677
parser_extract.add_argument(
16791678
'--mm',
1680-
help=(
1681-
'Also extract reads that multi-mapped to more than one gene.'
1682-
),
1679+
help=('Also extract reads that multi-mapped to more than one gene.'),
16831680
action='store_true',
16841681
default=False
16851682
)

kb_python/utils.py

+1 -3
Original file line number | Diff line number | Diff line change
@@ -724,9 +724,7 @@ def overlay_anndatas(
724724

725725
df_obs = unspliced_intersection.obs
726726
df_var = unspliced_intersection.var
727-
return anndata.AnnData(
728-
X=sum_X, layers=a_layers, obs=df_obs, var=df_var
729-
)
727+
return anndata.AnnData(X=sum_X, layers=a_layers, obs=df_obs, var=df_var)
730728

731729

732730
def sum_anndatas(

0 commit comments

Comments
 (0)