Merge pull request #7 from peterk87/feature/var-summary

Add Variant Summary sheet
peterk87 · Apr 23, 2021 · 912ada1 · 912ada1
2 parents 7302c17 + 9f1167c
commit 912ada1
Show file tree

Hide file tree

Showing 8 changed files with 163 additions and 9 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,38 @@
 History
 =======
 
+0.4.0 (2021-04-23)
+------------------
+
+* Adds "Variants Summary" sheet summarizing variant information across all samples
+* Adds comments to AF values in "Variant Matrix" sheet
+* Fixes width/height of cell comments to be based on length of comment text
+
+0.3.0 (2021-04-23)
+------------------
+
+* Adds support for adding Ct values from a Ct values table (tab-delimited, CSV, ODS, XLSX format) into an xlavir report.
+
+0.2.4 (2021-04-19)
+------------------
+
+* Fixes issue with SnpSift table file parsing and variable naming in variants.py (#4, #5)
+
+0.2.3 (2021-04-19)
+------------------
+
+* Fixes issue with SnpSift table file parsing. Adds check to see if SnpSift column is dtype object/str before using .str Series methods (#4)
+
+0.2.2 (2021-03-30)
+------------------
+
+* Fixes issue with SnpEff/SnpSift AA change parsing.
+
+0.2.1 (2021-03-29)
+------------------
+
+* Fixes division by zero error due to variants with DP values of 0
+
 0.2.0 (2021-03-04)
 ------------------
 

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0
+current_version = 0.4.0
 commit = True
 tag = True
 

diff --git a/setup.py b/setup.py
@@ -14,6 +14,7 @@
     'typer',
     'rich',
     'pandas',
+    'numpy>=1.20.0',
     'xlsxwriter',
     'pydantic',
     'beautifulsoup4',
@@ -59,6 +60,6 @@
     test_suite='tests',
     tests_require=test_requirements,
     url='https://github.com/peterk87/xlavir',
-    version='0.3.0',
+    version='0.4.0',
     zip_safe=False,
 )
diff --git a/xlavir/__init__.py b/xlavir/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = """Peter Kruczkiewicz"""
 __email__ = 'peter.kruczkiewicz@gmail.com'
-__version__ = '0.3.0'
+__version__ = '0.4.0'
diff --git a/xlavir/io/excel_sheet_dataframe.py b/xlavir/io/excel_sheet_dataframe.py
@@ -10,6 +10,7 @@ class SheetName(str, Enum):
     consensus = 'Consensus'
     pangolin = 'Pangolin Lineage'
     variants = 'Variants'
+    varsum = 'Variants Summary'
     varmat = 'Variant Matrix'
 
 

diff --git a/xlavir/io/xl.py b/xlavir/io/xl.py
@@ -2,7 +2,7 @@
 import logging
 from copy import copy
 from pathlib import Path
-from typing import List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple, Dict, Union
 
 import openpyxl
 import pandas as pd
@@ -13,6 +13,7 @@
 from xlavir.io.excel_sheet_dataframe import ExcelSheetDataFrame, SheetName
 from xlavir.qc import QualityRequirements
 from xlavir.util import get_col_widths, get_row_heights
+from xlavir import __version__
 
 logger = logging.getLogger(__name__)
 
@@ -98,9 +99,9 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
         pass_qc_fmt = book.add_format(dict(bg_color='c4edce',
                                            font_name='Courier New',
                                            bold=False))
-        float_cols = {'Mean Coverage Depth'}
+        float_cols = {'Mean Coverage Depth', 'Mean Depth'}
         perc_cols = {'% Genome Coverage'}
-        perc_2dec_cols = {'Alternate Allele Frequency'}
+        perc_2dec_cols = {'Alternate Allele Frequency', 'Min AF', 'Max AF', 'Mean AF'}
 
         images_added = False
 
@@ -190,7 +191,7 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
 
     df_qc = get_qc_df(dfs)
     failed_samples = set(df_qc[df_qc['QC Status'] == 'FAIL'].index)
-    highlight_qc_failed_samples(xlsx_path=output_xlsx, failed_samples=failed_samples)
+    add_comments(xlsx_path=output_xlsx, failed_samples=failed_samples, esdfs=dfs)
 
 
 def get_qc_df(dfs: List[ExcelSheetDataFrame]) -> Optional[pd.DataFrame]:
@@ -219,7 +220,16 @@ def add_images(images_for_sheets: List[SheetImage],
         sheet.hide_row_col_headers()
 
 
-def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> None:
+def get_excel_sheet_df(esds: List[ExcelSheetDataFrame],
+                       sheet_name: str) -> Optional[ExcelSheetDataFrame]:
+    """Look up the sheet dataframe wrapper that has the given sheet name.
+
+    Args:
+        esds: Sheet dataframe wrappers to search, in order.
+        sheet_name: Name compared against each wrapper's ``sheet_name``.
+
+    Returns:
+        The first wrapper whose ``sheet_name`` equals ``sheet_name``, or
+        ``None`` when no wrapper matches.
+    """
+    matches = (candidate for candidate in esds if candidate.sheet_name == sheet_name)
+    return next(matches, None)
+
+
+def add_comments(xlsx_path: Path,
+                 failed_samples: Set[str],
+                 esdfs: List[ExcelSheetDataFrame]) -> None:
     from openpyxl.comments import Comment
     from openpyxl.styles import PatternFill, Font
     from openpyxl.worksheet.worksheet import Worksheet
@@ -228,10 +238,20 @@ def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> No
                 f'to highlight {len(failed_samples)} samples that have failed QC')
     book = openpyxl.load_workbook(xlsx_path)
     logger.info(f'Loaded "{xlsx_path.name}" using openpyxl. Sheets: {book.get_sheet_names()}')
+    logger.info(f'Adjusting comment textbox sizes to fit text')
+    for sheetname in book.sheetnames:
+        for row in book[sheetname]:
+            for cell in row:
+                if cell.comment:
+                    comment: Comment = cell.comment
+                    comment.width = 300
+                    comment.height = max((100, len(comment.text) / 3 * 2))
+                    comment.author = f'xlavir version {__version__}'
+
     sheet_names = [
         SheetName.pangolin.value,
         SheetName.variants.value,
-        SheetName.varmat,
+        SheetName.varmat.value,
     ]
     light_red = 'FC9295'
     for sheet_name in sheet_names:
@@ -249,6 +269,40 @@ def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> No
                                             fgColor=light_red)
         except KeyError:
             pass
+
+    esd_varmat = get_excel_sheet_df(esdfs, SheetName.varmat.value)
+    esd_variants = get_excel_sheet_df(esdfs, SheetName.variants.value)
+    if esd_varmat and esd_variants:
+        try:
+            sheet: Worksheet = book[SheetName.varmat.value]
+            logger.info(f'Adding additional comments to variant matrix values')
+            df_varmat = esd_varmat.df
+            variants: Dict[Tuple[str, str], Dict[str, Union[str, float, int]]] = esd_variants \
+                .df.reset_index() \
+                .set_index(['Sample', 'Mutation']) \
+                .to_dict(orient='index')
+
+            for i, row in enumerate(sheet.rows):
+                if i == 0:
+                    continue
+                sample = df_varmat.index.values[i - 1]
+                for j, cell in enumerate(row):
+                    if j == 0:
+                        continue
+                    mutation = df_varmat.columns.values[j - 1]
+                    variant = variants.get((sample, mutation), None)
+                    if variant:
+                        variant_str = '\n'.join(f'{k}: {v}' for k, v in variant.items())
+                        comment_text = f'Sample: {sample}\nMutation: {mutation}\n{variant_str}'
+                    else:
+                        comment_text = f'Mutation "{mutation}" not found in sample "{sample}"'
+                    cell.comment = Comment(comment_text,
+                                           author=f'xlavir version {__version__}',
+                                           width=300,
+                                           height=len(comment_text))
+        except KeyError:
+            pass
+
     try:
         sheet: Worksheet = book[SheetName.consensus.value]
 

diff --git a/xlavir/tools/variants.py b/xlavir/tools/variants.py
@@ -7,6 +7,7 @@
 from typing import Dict, Tuple, List, Optional, Iterable
 
 import pandas as pd
+import numpy as np
 from pydantic import BaseModel
 
 from xlavir.util import try_parse_number, find_file_for_each_sample
@@ -89,6 +90,49 @@
     ('aa_len', 'Gene Amino Acid Length', 'Amino acid length of the reference sequence gene'),
 ]
 
+# Column definitions for the variant summary table. Each entry is a tuple of
+# (aggregation key, display column name, description): to_summary() renames
+# its aggregated columns from the first element to the second, and the third
+# element is the human-readable description of the column.
+variant_summary_cols = [
+    (
+        'Mutation',
+        'Mutation',
+        'Mutation found in sample with format '
+        '"{reference allele}{reference position}{allele in sample}"'
+        ' with predicted amino acid change information in brackets with format '
+        '"{gene name}:{reference AA}{gene AA position}{AA change}"'
+    ),
+    ('n_samples', '# of Samples', 'Number of samples with the mutation.'),
+    ('samples', 'Samples', 'List of samples with mutation delimited by semicolon (";")'),
+    (
+        'min_depth',
+        'Min Depth',
+        'Minimum depth in all samples that the mutation is observed at.'
+    ),
+    (
+        'max_depth',
+        'Max Depth',
+        'Maximum depth in all samples that the mutation is observed at.'
+    ),
+    (
+        'mean_depth',
+        'Mean Depth',
+        'Mean/average depth that the mutation is observed at.'
+    ),
+    (
+        'min_af',
+        'Min AF',
+        'Minimum alternate allele frequency of mutation in all samples.'
+    ),
+    (
+        'max_af',
+        'Max AF',
+        'Maximum alternate allele frequency of mutation in all samples.'
+    ),
+    (
+        'mean_af',
+        'Mean AF',
+        'Mean/average alternate allele frequency of mutation in all samples.'
+    ),
+]
+
 BCFTOOLS_STATS_GLOB_PATTERNS = [
     '**/ivar/**/*AF0.*.bcftools_stats.txt',
     '**/ivar/**/*.bcftools_stats.txt',
@@ -411,3 +455,20 @@ def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame:
                           df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int)))
     pivot_cols.sort(key=itemgetter(1))
     return df_pivot[[x for x, y in pivot_cols]]
+
+
+def to_summary(df: pd.DataFrame) -> pd.DataFrame:
+    """Summarize each mutation across all of the samples it was found in.
+
+    Args:
+        df: Variants table. After its index is reset it must provide the
+            'Sample', 'Mutation', 'Alternate Allele Depth' and
+            'Alternate Allele Frequency' columns.
+
+    Returns:
+        A dataframe indexed by 'Mutation' (groups are not sorted) holding the
+        number of samples, the "; "-joined sample names, and the min/max/mean
+        of the alternate allele depth and frequency. Columns are renamed to
+        the display names listed in ``variant_summary_cols``.
+    """
+
+    def _join_samples(names):
+        return '; '.join(names)
+
+    def _average(values):
+        # Plain arithmetic mean: total divided by the number of entries.
+        return sum(values) / len(values)
+
+    depth_col = 'Alternate Allele Depth'
+    af_col = 'Alternate Allele Frequency'
+    aggregations = dict(
+        n_samples=pd.NamedAgg(column='Sample', aggfunc='size'),
+        samples=pd.NamedAgg(column='Sample', aggfunc=_join_samples),
+        min_depth=pd.NamedAgg(column=depth_col, aggfunc='min'),
+        max_depth=pd.NamedAgg(column=depth_col, aggfunc='max'),
+        mean_depth=pd.NamedAgg(column=depth_col, aggfunc=_average),
+        min_af=pd.NamedAgg(column=af_col, aggfunc='min'),
+        max_af=pd.NamedAgg(column=af_col, aggfunc='max'),
+        mean_af=pd.NamedAgg(column=af_col, aggfunc=_average),
+    )
+
+    per_mutation = df.reset_index().groupby('Mutation', sort=False)
+    summary = per_mutation.agg(**aggregations)
+    display_names = {key: label for key, label, _ in variant_summary_cols}
+    return summary.rename(columns=display_names)
diff --git a/xlavir/xlavir.py b/xlavir/xlavir.py
@@ -56,6 +56,11 @@ def run(input_dir: Path,
                                        pd_to_excel_kwargs=dict(freeze_panes=(1, 1)),
                                        include_header_width=False,
                                        header_comments={name: desc for _, name, desc in variants.variants_cols}))
+        df_varsum = variants.to_summary(df_variants)
+        dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varsum.value,
+                                       df=df_varsum,
+                                       pd_to_excel_kwargs=dict(freeze_panes=(1, 1)),
+                                       header_comments={name: desc for _, name, desc in variants.variant_summary_cols}))
         df_varmap = variants.to_variant_pivot_table(df_variants)
         max_index_length = df_varmap.index.str.len().max()
         dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varmat.value,