diff --git a/HISTORY.rst b/HISTORY.rst index a5a79f2..fdca50d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,38 @@ History ======= +0.4.0 (2021-04-23) +------------------ + +* Adds "Variants Summary" sheet summarizing variant information across all samples +* Adds comments to AF values in "Variant Matrix" sheet +* Fixes width/height of cell comments to be based on length of comment text + +0.3.0 (2021-04-23) +------------------ + +* Adds support for adding Ct values from a Ct values table (tab-delimited, CSV, ODS, XLSX format) into an xlavir report. + +0.2.4 (2021-04-19) +------------------ + +* Fixes issue with SnpSift table file parsing and variable naming in variants.py (#4, #5) + +0.2.3 (2021-04-19) +------------------ + +* Fixes issue with SnpSift table file parsing. Adds check to see if SnpSift column is dtype object/str before using .str Series methods (#4) + +0.2.2 (2021-03-30) +------------------ + +* Fixes issue with SnpEff/SnpSift AA change parsing. + +0.2.1 (2021-03-29) +------------------ + +* Fix division by zero error due to variants with DP values of 0 + 0.2.0 (2021-03-04) ------------------ diff --git a/setup.cfg b/setup.cfg index b77590b..84cc125 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.0 +current_version = 0.4.0 commit = True tag = True diff --git a/setup.py b/setup.py index 3dd5b41..b4694b8 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ 'typer', 'rich', 'pandas', + 'numpy>=1.20.0', 'xlsxwriter', 'pydantic', 'beautifulsoup4', @@ -59,6 +60,6 @@ test_suite='tests', tests_require=test_requirements, url='https://github.com/peterk87/xlavir', - version='0.3.0', + version='0.4.0', zip_safe=False, ) diff --git a/xlavir/__init__.py b/xlavir/__init__.py index aa05088..292392d 100644 --- a/xlavir/__init__.py +++ b/xlavir/__init__.py @@ -2,4 +2,4 @@ __author__ = """Peter Kruczkiewicz""" __email__ = 'peter.kruczkiewicz@gmail.com' -__version__ = '0.3.0' +__version__ = '0.4.0' diff --git 
def get_excel_sheet_df(esds: List[ExcelSheetDataFrame],
                       sheet_name: str) -> Optional[ExcelSheetDataFrame]:
    """Return the first sheet dataframe wrapper whose sheet name matches.

    Args:
        esds: Sheet dataframe wrappers to search, scanned in order.
        sheet_name: Name compared for equality against each item's
            ``sheet_name`` attribute.

    Returns:
        The first item with ``item.sheet_name == sheet_name``, or ``None``
        when nothing matches (including when ``esds`` is empty).
    """
    # Explicit ``None`` default instead of silently falling off the end of a
    # loop, so the Optional return is visible in the code itself.
    return next((esd for esd in esds if esd.sheet_name == sheet_name), None)
# (internal aggregation name, report column header, header comment text) for
# each column of the "Variants Summary" sheet. The first two items drive the
# column renaming in `to_summary`.
variant_summary_cols = [
    (
        'Mutation',
        'Mutation',
        'Mutation found in sample with format '
        '"{reference allele}{reference position}{allele in sample}"'
        ' with predicted amino acid change information in brackets with format '
        '"{gene name}:{reference AA}{gene AA position}{AA change}"'
    ),
    ('n_samples', '# of Samples', 'Number of samples with the mutation.'),
    ('samples', 'Samples', 'List of samples with mutation delimited by semicolon (";")'),
    (
        'min_depth',
        'Min Depth',
        'Minimum depth in all samples that the mutation is observed at.'
    ),
    (
        'max_depth',
        'Max Depth',
        'Maximum depth in all samples that the mutation is observed at.'
    ),
    (
        'mean_depth',
        'Mean Depth',
        'Mean/average depth that the mutation is observed at.'
    ),
    (
        'min_af',
        'Min AF',
        'Minimum alternate allele frequency of mutation in all samples.'
    ),
    (
        'max_af',
        'Max AF',
        'Maximum alternate allele frequency of mutation in all samples.'
    ),
    (
        'mean_af',
        'Mean AF',
        'Mean/average alternate allele frequency of mutation in all samples.'
    ),
]


def to_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize variants per mutation across all samples.

    Args:
        df: Variants table. After ``reset_index()`` it must provide the
            columns ``'Sample'``, ``'Mutation'``, ``'Alternate Allele Depth'``
            and ``'Alternate Allele Frequency'`` (``'Sample'`` may be the
            index). The input is not modified.

    Returns:
        A dataframe indexed by ``'Mutation'`` (first-seen order) with one row
        per mutation and the columns named by `variant_summary_cols`: sample
        count, "; "-joined sample names, and min/max/mean of alternate allele
        depth and frequency.
    """
    # reset_index() already returns a new frame, so no copy()/inplace needed.
    df_vars = df.reset_index()

    depth = 'Alternate Allele Depth'
    af = 'Alternate Allele Frequency'
    df_summary = df_vars.groupby('Mutation', sort=False).agg(
        n_samples=pd.NamedAgg(column='Sample', aggfunc='size'),
        # map(str, ...) keeps the join from raising on non-string sample IDs.
        samples=pd.NamedAgg(column='Sample',
                            aggfunc=lambda names: '; '.join(map(str, names))),
        min_depth=pd.NamedAgg(column=depth, aggfunc='min'),
        max_depth=pd.NamedAgg(column=depth, aggfunc='max'),
        # Built-in 'mean' is vectorized and skips missing values, matching
        # how 'min'/'max' treat them.
        mean_depth=pd.NamedAgg(column=depth, aggfunc='mean'),
        min_af=pd.NamedAgg(column=af, aggfunc='min'),
        max_af=pd.NamedAgg(column=af, aggfunc='max'),
        mean_af=pd.NamedAgg(column=af, aggfunc='mean'),
    )
    header_by_key = {key: header for key, header, _ in variant_summary_cols}
    return df_summary.rename(columns=header_by_key)
variants.variant_summary_cols})) df_varmap = variants.to_variant_pivot_table(df_variants) max_index_length = df_varmap.index.str.len().max() dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varmat.value,