Skip to content

Commit

Permalink
Merge pull request #7 from peterk87/feature/var-summary
Browse files Browse the repository at this point in the history
Add Variant Summary sheet
  • Loading branch information
peterk87 authored Apr 23, 2021
2 parents 7302c17 + 9f1167c commit 912ada1
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 9 deletions.
32 changes: 32 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,38 @@
History
=======

0.4.0 (2021-04-23)
------------------

* Adds "Variants Summary" sheet summarizing variant information across all samples
* Adds comments to AF values in "Variant Matrix" sheet
* Fixes width/height of cell comments to be based on length of comment text

0.3.0 (2021-04-23)
------------------

* Adds support for adding Ct values from a Ct values table (tab-delimited, CSV, ODS, XLSX format) into an xlavir report.

0.2.4 (2021-04-19)
------------------

* Fixes issue with SnpSift table file parsing and variable naming in variants.py (#4, #5)

0.2.3 (2021-04-19)
------------------

* Fixes issue with SnpSift table file parsing. Adds check to see if SnpSift column is dtype object/str before using .str Series methods (#4)

0.2.2 (2021-03-30)
------------------

* Fixes issue with SnpEff/SnpSift AA change parsing.

0.2.1 (2021-03-29)
------------------

* Fixes division-by-zero error caused by variants with DP values of 0

0.2.0 (2021-03-04)
------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.0
current_version = 0.4.0
commit = True
tag = True

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
'typer',
'rich',
'pandas',
'numpy>=1.20.0',
'xlsxwriter',
'pydantic',
'beautifulsoup4',
Expand Down Expand Up @@ -59,6 +60,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/peterk87/xlavir',
version='0.3.0',
version='0.4.0',
zip_safe=False,
)
2 changes: 1 addition & 1 deletion xlavir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Peter Kruczkiewicz"""
__email__ = 'peter.kruczkiewicz@gmail.com'
__version__ = '0.3.0'
__version__ = '0.4.0'
1 change: 1 addition & 0 deletions xlavir/io/excel_sheet_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class SheetName(str, Enum):
consensus = 'Consensus'
pangolin = 'Pangolin Lineage'
variants = 'Variants'
varsum = 'Variants Summary'
varmat = 'Variant Matrix'


Expand Down
66 changes: 60 additions & 6 deletions xlavir/io/xl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
from copy import copy
from pathlib import Path
from typing import List, Optional, Set, Tuple
from typing import List, Optional, Set, Tuple, Dict, Union

import openpyxl
import pandas as pd
Expand All @@ -13,6 +13,7 @@
from xlavir.io.excel_sheet_dataframe import ExcelSheetDataFrame, SheetName
from xlavir.qc import QualityRequirements
from xlavir.util import get_col_widths, get_row_heights
from xlavir import __version__

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -98,9 +99,9 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
pass_qc_fmt = book.add_format(dict(bg_color='c4edce',
font_name='Courier New',
bold=False))
float_cols = {'Mean Coverage Depth'}
float_cols = {'Mean Coverage Depth', 'Mean Depth'}
perc_cols = {'% Genome Coverage'}
perc_2dec_cols = {'Alternate Allele Frequency'}
perc_2dec_cols = {'Alternate Allele Frequency', 'Min AF', 'Max AF', 'Mean AF'}

images_added = False

Expand Down Expand Up @@ -190,7 +191,7 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],

df_qc = get_qc_df(dfs)
failed_samples = set(df_qc[df_qc['QC Status'] == 'FAIL'].index)
highlight_qc_failed_samples(xlsx_path=output_xlsx, failed_samples=failed_samples)
add_comments(xlsx_path=output_xlsx, failed_samples=failed_samples, esdfs=dfs)


def get_qc_df(dfs: List[ExcelSheetDataFrame]) -> Optional[pd.DataFrame]:
Expand Down Expand Up @@ -219,7 +220,16 @@ def add_images(images_for_sheets: List[SheetImage],
sheet.hide_row_col_headers()


def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> None:
def get_excel_sheet_df(esds: List[ExcelSheetDataFrame],
                       sheet_name: str) -> Optional[ExcelSheetDataFrame]:
    """Look up a sheet by name.

    Args:
        esds: Sheets to search, in order.
        sheet_name: Name to compare against each entry's ``sheet_name``.

    Returns:
        The first entry whose ``sheet_name`` equals ``sheet_name``, or
        ``None`` when no entry matches (including when ``esds`` is empty).
    """
    matching = (candidate for candidate in esds
                if candidate.sheet_name == sheet_name)
    # next() with a default yields the first match or None without raising.
    return next(matching, None)


def add_comments(xlsx_path: Path,
failed_samples: Set[str],
esdfs: List[ExcelSheetDataFrame]) -> None:
from openpyxl.comments import Comment
from openpyxl.styles import PatternFill, Font
from openpyxl.worksheet.worksheet import Worksheet
Expand All @@ -228,10 +238,20 @@ def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> No
f'to highlight {len(failed_samples)} samples that have failed QC')
book = openpyxl.load_workbook(xlsx_path)
logger.info(f'Loaded "{xlsx_path.name}" using openpyxl. Sheets: {book.get_sheet_names()}')
logger.info(f'Adjusting comment textbox sizes to fit text')
for sheetname in book.sheetnames:
for row in book[sheetname]:
for cell in row:
if cell.comment:
comment: Comment = cell.comment
comment.width = 300
comment.height = max((100, len(comment.text) / 3 * 2))
comment.author = f'xlavir version {__version__}'

sheet_names = [
SheetName.pangolin.value,
SheetName.variants.value,
SheetName.varmat,
SheetName.varmat.value,
]
light_red = 'FC9295'
for sheet_name in sheet_names:
Expand All @@ -249,6 +269,40 @@ def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> No
fgColor=light_red)
except KeyError:
pass

esd_varmat = get_excel_sheet_df(esdfs, SheetName.varmat.value)
esd_variants = get_excel_sheet_df(esdfs, SheetName.variants.value)
if esd_varmat and esd_variants:
try:
sheet: Worksheet = book[SheetName.varmat.value]
logger.info(f'Adding additional comments to variant matrix values')
df_varmat = esd_varmat.df
variants: Dict[Tuple[str, str], Dict[str, Union[str, float, int]]] = esd_variants \
.df.reset_index() \
.set_index(['Sample', 'Mutation']) \
.to_dict(orient='index')

for i, row in enumerate(sheet.rows):
if i == 0:
continue
sample = df_varmat.index.values[i - 1]
for j, cell in enumerate(row):
if j == 0:
continue
mutation = df_varmat.columns.values[j - 1]
variant = variants.get((sample, mutation), None)
if variant:
variant_str = '\n'.join(f'{k}: {v}' for k, v in variant.items())
comment_text = f'Sample: {sample}\nMutation: {mutation}\n{variant_str}'
else:
comment_text = f'Mutation "{mutation}" not found in sample "{sample}"'
cell.comment = Comment(comment_text,
author=f'xlavir version {__version__}',
width=300,
height=len(comment_text))
except KeyError:
pass

try:
sheet: Worksheet = book[SheetName.consensus.value]

Expand Down
61 changes: 61 additions & 0 deletions xlavir/tools/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Dict, Tuple, List, Optional, Iterable

import pandas as pd
import numpy as np
from pydantic import BaseModel

from xlavir.util import try_parse_number, find_file_for_each_sample
Expand Down Expand Up @@ -89,6 +90,49 @@
('aa_len', 'Gene Amino Acid Length', 'Amino acid length of the reference sequence gene'),
]

# Column specification for the per-mutation summary table built by `to_summary`.
# Each entry is a 3-tuple of (key, column header, description):
#   - key: the name given to the aggregated column inside `to_summary`
#   - column header: the name `to_summary` renames that column to
#   - description: human-readable explanation of the column
# The first entry ('Mutation') has identical key and header, so the rename
# leaves it unchanged.
variant_summary_cols = [
(
'Mutation',
'Mutation',
'Mutation found in sample with format '
'"{reference allele}{reference position}{allele in sample}"'
' with predicted amino acid change information in brackets with format '
'"{gene name}:{reference AA}{gene AA position}{AA change}"'
),
('n_samples', '# of Samples', 'Number of samples with the mutation.'),
('samples', 'Samples', 'List of samples with mutation delimited by semicolon (";")'),
(
'min_depth',
'Min Depth',
'Minimum depth in all samples that the mutation is observed at.'
),
(
'max_depth',
'Max Depth',
'Maximum depth in all samples that the mutation is observed at.'
),
(
'mean_depth',
'Mean Depth',
'Mean/average depth that the mutation is observed at.'
),
(
'min_af',
'Min AF',
'Minimum alternate allele frequency of mutation in all samples.'
),
(
'max_af',
'Max AF',
'Maximum alternate allele frequency of mutation in all samples.'
),
(
'mean_af',
'Mean AF',
'Mean/average alternate allele frequency of mutation in all samples.'
),
]

BCFTOOLS_STATS_GLOB_PATTERNS = [
'**/ivar/**/*AF0.*.bcftools_stats.txt',
'**/ivar/**/*.bcftools_stats.txt',
Expand Down Expand Up @@ -411,3 +455,20 @@ def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame:
df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int)))
pivot_cols.sort(key=itemgetter(1))
return df_pivot[[x for x, y in pivot_cols]]


def to_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a per-sample variants table into one row per mutation.

    After resetting the index, the frame must expose the columns 'Sample',
    'Mutation', 'Alternate Allele Depth' and 'Alternate Allele Frequency'.
    Rows are grouped by 'Mutation' in first-seen order (``sort=False``).

    Args:
        df: Variants table; it is not modified.

    Returns:
        A DataFrame indexed by 'Mutation' with the sample count, the
        "; "-joined sample names, and min/max/mean of the alternate allele
        depth and frequency. Columns are renamed from their internal keys to
        the display headers listed in ``variant_summary_cols``.
    """
    depth_col = 'Alternate Allele Depth'
    af_col = 'Alternate Allele Frequency'

    def _mean(values):
        # Plain arithmetic mean over the group's values.
        return sum(values) / len(values)

    # reset_index() returns a new frame, so the caller's `df` stays untouched.
    grouped = df.reset_index().groupby('Mutation', sort=False)
    aggregations = {
        'n_samples': pd.NamedAgg(column='Sample', aggfunc='size'),
        'samples': pd.NamedAgg(column='Sample', aggfunc=lambda names: '; '.join(names)),
        'min_depth': pd.NamedAgg(column=depth_col, aggfunc='min'),
        'max_depth': pd.NamedAgg(column=depth_col, aggfunc='max'),
        'mean_depth': pd.NamedAgg(column=depth_col, aggfunc=_mean),
        'min_af': pd.NamedAgg(column=af_col, aggfunc='min'),
        'max_af': pd.NamedAgg(column=af_col, aggfunc='max'),
        'mean_af': pd.NamedAgg(column=af_col, aggfunc=_mean),
    }
    summary = grouped.agg(**aggregations)

    header_by_key = {key: header for key, header, _ in variant_summary_cols}
    return summary.rename(columns=header_by_key)
5 changes: 5 additions & 0 deletions xlavir/xlavir.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ def run(input_dir: Path,
pd_to_excel_kwargs=dict(freeze_panes=(1, 1)),
include_header_width=False,
header_comments={name: desc for _, name, desc in variants.variants_cols}))
df_varsum = variants.to_summary(df_variants)
dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varsum.value,
df=df_varsum,
pd_to_excel_kwargs=dict(freeze_panes=(1, 1)),
header_comments={name: desc for _, name, desc in variants.variant_summary_cols}))
df_varmap = variants.to_variant_pivot_table(df_variants)
max_index_length = df_varmap.index.str.len().max()
dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varmat.value,
Expand Down

0 comments on commit 912ada1

Please sign in to comment.