Merge pull request #6 from peterk87/feature/ct-table

Add option to add Ct values to QC stats table from a Ct values table
peterk87 · Apr 23, 2021 · 7302c17 · 7302c17
2 parents 58bbbf1 + 9b6c1df
commit 7302c17
Show file tree

Hide file tree

Showing 11 changed files with 77 additions and 5 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.4
+current_version = 0.3.0
 commit = True
 tag = True
 

diff --git a/setup.py b/setup.py
@@ -20,6 +20,7 @@
     'biopython',
     'openpyxl',
     'imageio',
+    'odfpy',
 ]
 
 setup_requirements = ['pytest-runner', ]
@@ -58,6 +59,6 @@
     test_suite='tests',
     tests_require=test_requirements,
     url='https://github.com/peterk87/xlavir',
-    version='0.2.4',
+    version='0.3.0',
     zip_safe=False,
 )
diff --git a/tests/data/io/ct.ods b/tests/data/io/ct.ods
diff --git a/tests/data/io/ct.tsv b/tests/data/io/ct.tsv
@@ -0,0 +1,4 @@
+sample_id	ct
+Sample1	23
+Sample2	34.8
+Sample3	0
diff --git a/tests/data/io/ct.xlsx b/tests/data/io/ct.xlsx
diff --git a/tests/test_io_ct.py b/tests/test_io_ct.py
@@ -0,0 +1,14 @@
+from xlavir.io import ct
+from pathlib import Path
+
+# Directory containing this test module; test data paths are built relative to it.
+dirpath = Path(__file__).parent
+
+
+def test_parse_ct_tables():
+    """Ct tables in each tested file format should parse to the same sample-to-Ct dict."""
+    io_dir = dirpath / 'data/io'
+    expected = {'Sample1': 23.0, 'Sample2': 34.8, 'Sample3': 0.0}
+    for ext in ('.ods', '.tsv', '.xlsx'):
+        p = io_dir / f'ct{ext}'
+        parsed = ct.read_ct_table(p)
+        assert parsed == expected, (
+            f'Should be able to parse Ct values from table with extension="{ext}". '
+            f'Filename: {p.absolute()}'
+        )
diff --git a/xlavir/__init__.py b/xlavir/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = """Peter Kruczkiewicz"""
 __email__ = 'peter.kruczkiewicz@gmail.com'
-__version__ = '0.2.4'
+__version__ = '0.3.0'
diff --git a/xlavir/cli.py b/xlavir/cli.py
@@ -40,6 +40,7 @@ class QCPresets(str, Enum):
 def main(
     input_dir: Path,
     output: Path = typer.Argument('report.xlsx'),
+    ct_table: Path = typer.Option(None, help='Table of sample IDs and rtPCR Ct values'),
     pangolin_lineage_csv: Path = typer.Option(None, help='Pangolin lineage report CSV'),
     qc_preset: Optional[QCPresets] = typer.Option(None, help='Quality check preset'),
     low_coverage_threshold: Optional[int] = typer.Option(None, help='Low coverage threshold. '
@@ -87,6 +88,7 @@ def main(
 
     dfs = run(input_dir=input_dir,
               pangolin_lineage_csv=pangolin_lineage_csv,
+              ct_values_table=ct_table,
               quality_reqs=quality_reqs)
 
     write_xlsx_report(dfs=dfs,

diff --git a/xlavir/io/ct.py b/xlavir/io/ct.py
@@ -0,0 +1,45 @@
+from typing import Dict
+
+import pandas as pd
+from pathlib import Path
+import logging
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+def validate_ct_table(df: pd.DataFrame) -> bool:
+    """Check that a parsed Ct values table is non-empty and has exactly 2 columns.
+
+    Problems are reported through the module logger as errors; nothing is raised.
+
+    Args:
+        df: Table parsed from the Ct values file.
+
+    Returns:
+        True if the table is non-empty and has exactly two columns,
+        otherwise False.
+    """
+    if df.empty:
+        logger.error('Ct values table is empty! No Ct values present!')
+        return False
+    n_cols = df.shape[1]
+    if n_cols != 2:
+        # Column labels are not guaranteed to be strings (e.g. numeric headers),
+        # so stringify them before joining to avoid a TypeError while logging.
+        col_names = ", ".join(map(str, df.columns))
+        logger.error(
+            f'Ct values table expected to only have 2 columns, but {n_cols} were found with names: {col_names}')
+        return False
+    return True
+
+
+def read_ct_table(ct_path: Path) -> Dict[str, float]:
+    """Read a table of sample IDs and Ct values into a dict.
+
+    The parser is chosen from the file extension (case-insensitive):
+    ".tsv" and ".txt" are read as tab-delimited, ".csv" as CSV, ".xlsx" as
+    Excel and ".ods" as Excel with the "odf" engine. The two columns are
+    taken positionally as sample ID and Ct value, whatever the header names.
+
+    Args:
+        ct_path: Path to the Ct values table.
+
+    Returns:
+        Dict of sample ID (as str) to Ct value. An empty dict is returned,
+        after logging an error, when the extension is not supported or the
+        table fails validation. If a sample ID is repeated, the last row wins.
+    """
+    suffix = ct_path.suffix.lower()
+    if suffix in ('.txt', '.tsv'):
+        if suffix == '.txt':
+            # ".txt" says nothing about the delimiter; tab-delimited is a guess.
+            logger.warning(f'Trying to read "{ct_path.name}" as tab-delimited file with header.')
+        df = pd.read_table(ct_path)
+    elif suffix == '.csv':
+        df = pd.read_csv(ct_path)
+    elif suffix == '.xlsx':
+        df = pd.read_excel(ct_path)
+    elif suffix == '.ods':
+        df = pd.read_excel(ct_path, engine='odf')
+    else:
+        logger.error(f'Not sure how to parse Ct values table with extension "{suffix}". '
+                     f'Please provide a tab-delimited file (".tsv"), CSV (".csv"), '
+                     f'Excel file (".xlsx") or OpenDocument Spreadsheet (".ods").')
+        return {}
+    if not validate_ct_table(df):
+        return {}
+    # Normalise the header so the columns can be accessed by fixed names.
+    df.columns = ['sample', 'ct']
+    logger.info(f'Read table with shape {df.shape} from "{ct_path}"')
+    # pandas parses numeric-looking sample IDs as numbers; cast them to str so
+    # the keys honour the declared Dict[str, float] type and can be looked up
+    # by sample name.
+    return {str(row.sample): row.ct for row in df.itertuples(index=False)}
diff --git a/xlavir/qc/__init__.py b/xlavir/qc/__init__.py
@@ -12,6 +12,7 @@
 def columns(low_coverage_threshold: int = 5) -> List[Tuple[str, str]]:
     return [
         ('sample', 'Sample', 'Sample name'),
+        ('ct_value', 'Ct Value', 'Real-time PCR Ct value'),
         (
             'qc_status',
             'QC Status',
@@ -84,6 +85,7 @@ def report_format(df: pd.DataFrame, low_coverage_threshold: int = 5) -> pd.DataF
 
 def create_qc_stats_dataframe(sample_depth_info: Dict[str, mosdepth.MosdepthDepthInfo],
                               sample_mapping_info: Dict[str, samtools.SamtoolsFlagstat],
+                              sample_cts: Dict[str, float],
                               quality_reqs: QualityRequirements):
     sample_names = set(sample_depth_info.keys()) | set(sample_mapping_info.keys())
     logger.info(f'N samples: {len(sample_names)}')
@@ -92,6 +94,7 @@ def create_qc_stats_dataframe(sample_depth_info: Dict[str, mosdepth.MosdepthDept
         depth_info = sample_depth_info[sample].dict() if sample in sample_depth_info else {}
         mapping_info = sample_mapping_info[sample].dict() if sample in sample_mapping_info else {}
         merged_stats_info[sample] = {**depth_info, **mapping_info}
+        merged_stats_info[sample]['ct_value'] = sample_cts.get(sample, None)
     df_stats = pd.DataFrame(merged_stats_info.values())
     mask_pass_depth = (df_stats.median_coverage >= quality_reqs.min_median_depth)
     mask_pass_breadth = (df_stats.genome_coverage >= quality_reqs.min_genome_coverage)

diff --git a/xlavir/xlavir.py b/xlavir/xlavir.py
@@ -4,6 +4,7 @@
 from typing import Optional, List
 
 from xlavir import qc
+from xlavir.io import ct
 from xlavir.io.excel_sheet_dataframe import ExcelSheetDataFrame, SheetName
 from xlavir.tools import mosdepth, samtools, consensus, pangolin, variants
 from xlavir.tools.nextflow import exec_report
@@ -14,7 +15,8 @@
 
 def run(input_dir: Path,
         quality_reqs: Optional[qc.QualityRequirements],
-        pangolin_lineage_csv: Optional[Path] = None) -> List[ExcelSheetDataFrame]:
+        pangolin_lineage_csv: Optional[Path] = None,
+        ct_values_table: Optional[Path] = None) -> List[ExcelSheetDataFrame]:
     if quality_reqs:
         quality_reqs = qc.QualityRequirements()
     nf_exec_info = exec_report.get_info(input_dir)
@@ -26,12 +28,13 @@ def run(input_dir: Path,
     if logger.level == logging.DEBUG:
         for sample, info in sample_mapping_info.items():
             logger.debug(info.dict())
-
+    sample_cts = ct.read_ct_table(ct_values_table) if ct_values_table else {}
     sample_variants = variants.get_info(input_dir)
 
     dfs: List[ExcelSheetDataFrame] = []
     df_stats = qc.create_qc_stats_dataframe(sample_depth_info,
                                             sample_mapping_info,
+                                            sample_cts=sample_cts,
                                             quality_reqs=quality_reqs)
     dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.qc_stats.value,
                                    df=qc.report_format(df_stats,