Skip to content

Commit

Permalink
Merge pull request #6 from peterk87/feature/ct-table
Browse files Browse the repository at this point in the history
Add option to add Ct values to QC stats table from a Ct values table
  • Loading branch information
peterk87 authored Apr 23, 2021
2 parents 58bbbf1 + 9b6c1df commit 7302c17
Show file tree
Hide file tree
Showing 11 changed files with 77 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.4
current_version = 0.3.0
commit = True
tag = True

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
'biopython',
'openpyxl',
'imageio',
'odfpy',
]

setup_requirements = ['pytest-runner', ]
Expand Down Expand Up @@ -58,6 +59,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/peterk87/xlavir',
version='0.2.4',
version='0.3.0',
zip_safe=False,
)
Binary file added tests/data/io/ct.ods
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/data/io/ct.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
sample_id ct
Sample1 23
Sample2 34.8
Sample3 0
Binary file added tests/data/io/ct.xlsx
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_io_ct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from xlavir.io import ct
from pathlib import Path

dirpath = Path(__file__).parent


def test_parse_ct_tables():
    """Each Ct table fixture (.ods, .tsv, .xlsx) must parse to the same mapping."""
    fixtures_dir = dirpath / 'data/io'
    want = {'Sample1': 23.0, 'Sample2': 34.8, 'Sample3': 0.0}
    for suffix in ('.ods', '.tsv', '.xlsx'):
        table_path = fixtures_dir / f'ct{suffix}'
        parsed = ct.read_ct_table(table_path)
        # The failure message names the format and the exact fixture file.
        assert parsed == want, (
            f'Should be able to parse Ct values from table with extension="{suffix}". '
            f'Filename: {table_path.absolute()}'
        )
2 changes: 1 addition & 1 deletion xlavir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Peter Kruczkiewicz"""
__email__ = 'peter.kruczkiewicz@gmail.com'
__version__ = '0.2.4'
__version__ = '0.3.0'
2 changes: 2 additions & 0 deletions xlavir/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class QCPresets(str, Enum):
def main(
input_dir: Path,
output: Path = typer.Argument('report.xlsx'),
ct_table: Path = typer.Option(None, help='Table of sample IDs and rtPCR Ct values'),
pangolin_lineage_csv: Path = typer.Option(None, help='Pangolin lineage report CSV'),
qc_preset: Optional[QCPresets] = typer.Option(None, help='Quality check preset'),
low_coverage_threshold: Optional[int] = typer.Option(None, help='Low coverage threshold. '
Expand Down Expand Up @@ -87,6 +88,7 @@ def main(

dfs = run(input_dir=input_dir,
pangolin_lineage_csv=pangolin_lineage_csv,
ct_values_table=ct_table,
quality_reqs=quality_reqs)

write_xlsx_report(dfs=dfs,
Expand Down
45 changes: 45 additions & 0 deletions xlavir/io/ct.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from typing import Dict

import pandas as pd
from pathlib import Path
import logging

logger = logging.getLogger(__name__)


def validate_ct_table(df: pd.DataFrame) -> bool:
    """Check that a Ct values table has the expected shape.

    The table is accepted when it is non-empty and has exactly two columns.
    Each failed check is reported through the module logger at ERROR level.

    Args:
        df: Table read from the user-supplied Ct values file.

    Returns:
        True if the table is non-empty and has exactly 2 columns, else False.
    """
    if df.empty:
        logger.error('Ct values table is empty! No Ct values present!')
        return False
    n_cols = df.shape[1]
    if n_cols != 2:
        # Column labels are not guaranteed to be strings (e.g. numeric headers),
        # so stringify them before joining; otherwise reporting the problem
        # would itself raise a TypeError.
        column_names = ", ".join(str(name) for name in df.columns)
        logger.error(
            f'Ct values table expected to only have 2 columns, but {n_cols} were found with names: {column_names}')
        return False
    return True


def _sample_id_as_str(value) -> str:
    """Render a sample ID cell as the string used to look up samples."""
    # A numeric ID column is parsed as float when the column has blank cells;
    # render integer-valued floats without the trailing ".0".
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def read_ct_table(ct_path: Path) -> Dict[str, float]:
    """Read a two-column table of sample IDs and Ct values into a dict.

    The parser is chosen from the file extension: ".tsv"/".txt" are read as
    tab-delimited, ".csv" as comma-separated, ".xlsx" as Excel and ".ods" as
    OpenDocument Spreadsheet. The first row is consumed as the header by the
    pandas readers; the first column is then treated as the sample ID and the
    second as the Ct value, regardless of the header text.

    Args:
        ct_path: Path to the Ct values table. A plain string is also accepted.

    Returns:
        Mapping of sample ID (always a string) to Ct value. An empty dict is
        returned when the extension is not supported or the table fails
        ``validate_ct_table``; the reason is logged at ERROR level.
    """
    ct_path = Path(ct_path)
    suffix = ct_path.suffix.lower()
    if suffix in ('.txt', '.tsv'):
        if suffix == '.txt':
            logger.warning(f'Trying to read "{ct_path.name}" as tab-delimited file with header.')
        df = pd.read_table(ct_path)
    elif suffix == '.ods':
        df = pd.read_excel(ct_path, engine='odf')
    elif suffix == '.csv':
        df = pd.read_csv(ct_path)
    elif suffix == '.xlsx':
        df = pd.read_excel(ct_path)
    else:
        logger.error(f'Not sure how to parse Ct values table with extension "{suffix}". '
                     f'Please provide a tab-delimited file (".tsv"), CSV (".csv"), '
                     f'Excel file (".xlsx") or OpenDocument Spreadsheet (".ods").')
        return {}
    if not validate_ct_table(df):
        return {}
    df.columns = ['sample', 'ct']
    logger.info(f'Read table with shape {df.shape} from "{ct_path}"')
    # Rows without a sample ID (e.g. trailing blank spreadsheet rows) cannot be
    # matched to any sample, so drop them instead of emitting a NaN key.
    df = df.dropna(subset=['sample'])
    # Keys are coerced to str so purely numeric sample IDs still match the
    # string sample names used when the Ct values are looked up.
    return {_sample_id_as_str(row.sample): row.ct for row in df.itertuples()}
3 changes: 3 additions & 0 deletions xlavir/qc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
def columns(low_coverage_threshold: int = 5) -> List[Tuple[str, str]]:
return [
('sample', 'Sample', 'Sample name'),
('ct_value', 'Ct Value', 'Real-time PCR Ct value'),
(
'qc_status',
'QC Status',
Expand Down Expand Up @@ -84,6 +85,7 @@ def report_format(df: pd.DataFrame, low_coverage_threshold: int = 5) -> pd.DataF

def create_qc_stats_dataframe(sample_depth_info: Dict[str, mosdepth.MosdepthDepthInfo],
sample_mapping_info: Dict[str, samtools.SamtoolsFlagstat],
sample_cts: Dict[str, float],
quality_reqs: QualityRequirements):
sample_names = set(sample_depth_info.keys()) | set(sample_mapping_info.keys())
logger.info(f'N samples: {len(sample_names)}')
Expand All @@ -92,6 +94,7 @@ def create_qc_stats_dataframe(sample_depth_info: Dict[str, mosdepth.MosdepthDept
depth_info = sample_depth_info[sample].dict() if sample in sample_depth_info else {}
mapping_info = sample_mapping_info[sample].dict() if sample in sample_mapping_info else {}
merged_stats_info[sample] = {**depth_info, **mapping_info}
merged_stats_info[sample]['ct_value'] = sample_cts.get(sample, None)
df_stats = pd.DataFrame(merged_stats_info.values())
mask_pass_depth = (df_stats.median_coverage >= quality_reqs.min_median_depth)
mask_pass_breadth = (df_stats.genome_coverage >= quality_reqs.min_genome_coverage)
Expand Down
7 changes: 5 additions & 2 deletions xlavir/xlavir.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Optional, List

from xlavir import qc
from xlavir.io import ct
from xlavir.io.excel_sheet_dataframe import ExcelSheetDataFrame, SheetName
from xlavir.tools import mosdepth, samtools, consensus, pangolin, variants
from xlavir.tools.nextflow import exec_report
Expand All @@ -14,7 +15,8 @@

def run(input_dir: Path,
quality_reqs: Optional[qc.QualityRequirements],
pangolin_lineage_csv: Optional[Path] = None) -> List[ExcelSheetDataFrame]:
pangolin_lineage_csv: Optional[Path] = None,
ct_values_table: Optional[Path] = None) -> List[ExcelSheetDataFrame]:
if quality_reqs:
quality_reqs = qc.QualityRequirements()
nf_exec_info = exec_report.get_info(input_dir)
Expand All @@ -26,12 +28,13 @@ def run(input_dir: Path,
if logger.level == logging.DEBUG:
for sample, info in sample_mapping_info.items():
logger.debug(info.dict())

sample_cts = ct.read_ct_table(ct_values_table) if ct_values_table else {}
sample_variants = variants.get_info(input_dir)

dfs: List[ExcelSheetDataFrame] = []
df_stats = qc.create_qc_stats_dataframe(sample_depth_info,
sample_mapping_info,
sample_cts=sample_cts,
quality_reqs=quality_reqs)
dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.qc_stats.value,
df=qc.report_format(df_stats,
Expand Down

0 comments on commit 7302c17

Please sign in to comment.