Skip to content

Commit

Permalink
Merge pull request #1 from peterk87/release/v0.2.0
Browse files Browse the repository at this point in the history
Release v0.2.0
  • Loading branch information
peterk87 authored Mar 5, 2021
2 parents 273968b + 995b4f6 commit 5988ecc
Show file tree
Hide file tree
Showing 14 changed files with 431 additions and 79 deletions.
34 changes: 32 additions & 2 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,37 @@
History
=======

0.1.0 (2021-02-16)
0.2.0 (2021-03-04)
------------------

* First release on PyPI.
* Added header comments with descriptions of field content
* Added comment to Variant Matrix sheet A1 cell describing what is shown in the matrix
* Added highlighting of samples failing QC in other sheets
* Fixed image scaling by determining image size with imageio
* Added Medaka_ / Longshot_ VCF parsing

0.1.1 (2021-02-16)
------------------

* Collect sample results from a `nf-core/viralrecon`_ or `peterk87/nf-virontus`_ into an Excel report
* Samtools_ read mapping stats (``flagstat``)
* Mosdepth_ read mapping coverage information
* Variant calling information (SnpEff_ and SnpSift_ results, VCF file information)
* Consensus sequences
* iVar VCF parsing
* QA/QC of sample analysis results (basic PASS/FAIL based on minimum genome coverage and depth)
* Nextflow workflow execution information
* Prepend worksheets from other Excel documents into the report (e.g. cover page/sheet, sample sheet, lab results)
* Add custom images into worksheets with custom names and descriptions (e.g. phylogenetic tree figure PNG)

.. _Cookiecutter: https://github.com/audreyr/cookiecutter
.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
.. _nf-core/viralrecon: https://github.com/nf-core/viralrecon
.. _peterk87/nf-virontus: https://github.com/peterk87/nf-virontus/
.. _Bcftools: https://www.htslib.org/doc/bcftools.html
.. _Samtools: https://samtools.github.io/
.. _SnpEff: https://pcingola.github.io/SnpEff/se_introduction/
.. _SnpSift: https://pcingola.github.io/SnpEff/ss_introduction/
.. _Mosdepth: https://github.com/brentp/mosdepth
.. _Longshot: https://github.com/pjedge/longshot
.. _Medaka: https://github.com/nanoporetech/medaka
9 changes: 8 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,20 @@ Features
* Collect sample results from a `nf-core/viralrecon`_ or `peterk87/nf-virontus`_ into an Excel report
* Samtools_ read mapping stats (``flagstat``)
* Mosdepth_ read mapping coverage information
* **TODO:** Variant calling information (Bcftools_ stats, SnpEff_ and SnpSift_ results, VCF file information)
* Variant calling information (SnpEff_ and SnpSift_ results, VCF file information)
* Consensus sequences
* QA/QC of sample analysis results (basic PASS/FAIL based on minimum genome coverage and depth)
* Nextflow workflow execution information
* Prepend worksheets from other Excel documents into the report (e.g. cover page/sheet, sample sheet, lab results)
* Add custom images into worksheets with custom names and descriptions (e.g. phylogenetic tree figure PNG)

Roadmap
-------

* Bcftools_ variant calling stats sheet
* Sample metadata table to merge with certain stats?
* YAML config to info sheet?
* coverage chart with controls?

Credits
-------
Expand Down
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
filterwarnings =
ignore:.*PytestConfigWarning.*Unknown config option.*
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.1.1
current_version = 0.2.0
commit = True
tag = True

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
'beautifulsoup4',
'biopython',
'openpyxl',
'imageio',
]

setup_requirements = ['pytest-runner', ]
Expand Down Expand Up @@ -57,6 +58,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/peterk87/xlavir',
version='0.1.1',
version='0.2.0',
zip_safe=False,
)
2 changes: 1 addition & 1 deletion xlavir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Peter Kruczkiewicz"""
__email__ = 'peter.kruczkiewicz@gmail.com'
__version__ = '0.1.1'
__version__ = '0.2.0'
2 changes: 1 addition & 1 deletion xlavir/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def main(
typer.echo(f'xlavir version {__version__}')
typer.Exit()
from rich.traceback import install
install(show_locals=True)
install(show_locals=True, width=120, word_wrap=True)

logging.basicConfig(format='%(message)s',
datefmt='[%Y-%m-%d %X]',
Expand Down
8 changes: 5 additions & 3 deletions xlavir/io/excel_sheet_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Iterable, Union
from typing import Optional, Iterable, Union, Mapping
import pandas as pd

from enum import Enum
Expand All @@ -10,7 +10,7 @@ class SheetName(str, Enum):
consensus = 'Consensus'
pangolin = 'Pangolin Lineage'
variants = 'Variants'
varmap = 'Variant Map'
varmat = 'Variant Matrix'


class ExcelSheetDataFrame:
Expand All @@ -20,10 +20,12 @@ def __init__(self,
pd_to_excel_kwargs: dict = None,
autofit: bool = True,
column_widths: Optional[Iterable[Union[int, float]]] = None,
include_header_width: bool = True):
include_header_width: bool = True,
header_comments: Optional[Mapping[str, str]] = None):
self.include_header_width = include_header_width
self.sheet_name = sheet_name
self.df = df
self.pd_to_excel_kwargs = pd_to_excel_kwargs or {}
self.autofit = autofit
self.column_widths = column_widths
self.header_comments = header_comments
160 changes: 150 additions & 10 deletions xlavir/io/xl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
from copy import copy
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Set, Tuple

import openpyxl
import pandas as pd
Expand Down Expand Up @@ -31,6 +31,8 @@ def copy_spreadsheet(src_path: Path,
dest_path (Path): Destination Excel spreadsheet path
source_sheet_index (int): Source spreadsheet worksheet index to copy to destination spreadsheet
"""
from openpyxl.cell.cell import Cell

src_book = openpyxl.load_workbook(src_path)
dest_book = openpyxl.load_workbook(dest_path)
sheet = src_book.worksheets[source_sheet_index]
Expand All @@ -41,9 +43,10 @@ def copy_spreadsheet(src_path: Path,
new_sheet.merged_cells = copy(sheet.merged_cells)
for k, v in sheet.row_dimensions.items():
new_sheet.row_dimensions[k] = copy(v)
row: Tuple[Cell]
for row in sheet.rows:
for cell in row:
new_cell = new_sheet[cell.coordinate]
new_cell: Cell = new_sheet[cell.coordinate]
new_cell.value = cell.value
if cell.has_style:
new_cell.font = copy(cell.font)
Expand All @@ -52,13 +55,18 @@ def copy_spreadsheet(src_path: Path,
new_cell.number_format = copy(cell.number_format)
new_cell.protection = copy(cell.protection)
new_cell.alignment = copy(cell.alignment)
new_cell.comment = copy(cell.comment)
dest_book.save(filename=dest_path)


def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
output_xlsx: Path,
quality_reqs: QualityRequirements,
images_for_sheets: Optional[List[SheetImage]] = None):
"""Write the output Excel XLSX file
"""
with pd.ExcelWriter(output_xlsx, engine='xlsxwriter') as writer:
monospace = dict(font_name='Courier New')
text_wrap = dict(text_wrap=True)
Expand All @@ -84,17 +92,33 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
valign='bottom',
rotation=45,
font_name='Courier New'))
red_bg_fmt = book.add_format(dict(bg_color='red'))
fail_qc_fmt = book.add_format(dict(bg_color='FC9295',
font_name='Courier New',
bold=True))
pass_qc_fmt = book.add_format(dict(bg_color='c4edce',
font_name='Courier New',
bold=False))
float_cols = {'Mean Coverage Depth'}
perc_cols = {'% Genome Coverage'}
perc_2dec_cols = {'Alternate Allele Frequency'}

images_added = False

for esdf in dfs:
if images_for_sheets and esdf.sheet_name == SheetName.workflow_info.value:
add_images(images_for_sheets, book)
images_added = True
esdf.df.to_excel(writer, sheet_name=esdf.sheet_name, **esdf.pd_to_excel_kwargs)

sheet: Worksheet = book.get_worksheet_by_name(esdf.sheet_name)

idx_and_cols = [esdf.df.index.name] + list(esdf.df.columns)

if esdf.header_comments:
for i, col_name in enumerate(idx_and_cols):
if col_name in esdf.header_comments:
sheet.write_comment(0, i, esdf.header_comments[col_name])

if esdf.autofit:
for i, (width, col_name) in enumerate(zip(get_col_widths(esdf.df,
index=True,
Expand All @@ -120,8 +144,17 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
for i, idx in enumerate(esdf.df.index, 1):
sheet.set_row(i, get_row_heights(esdf.df, idx), monospace_wrap_fmt)

if esdf.sheet_name == SheetName.varmap.value:
sheet.set_row(0, max(len(x) for x in idx_and_cols)*5)
if esdf.sheet_name == SheetName.varmat.value:
sheet.write_comment(row=0,
col=0,
comment=f'This sheet contains a matrix of alternate allele variant observation'
f' frequency values for samples and variants. '
f'3-colour conditional formatting is applied to the variant '
f'frequency values where a major variant '
f'(e.g. alternate allele frequency >={quality_reqs.major_allele_freq}) '
f'is highlighted in green. Red indicates where the allele variant is not '
f'observed in the sample (e.g. alternate allele frequency equals 0.0).')
sheet.set_row(0, max(len(x) for x in idx_and_cols) * 5)
for i, col_name in enumerate(idx_and_cols):
if i == 0:
continue
Expand All @@ -147,19 +180,126 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame],
sheet.conditional_format(1, 1, esdf.df.shape[0], 1, options=dict(type='cell',
value='"FAIL"',
criteria='equal to',
format=red_bg_fmt))
if images_for_sheets:
format=fail_qc_fmt))
sheet.conditional_format(1, 1, esdf.df.shape[0], 1, options=dict(type='cell',
value='"PASS"',
criteria='equal to',
format=pass_qc_fmt))
if images_for_sheets and not images_added:
add_images(images_for_sheets, book)

df_qc = get_qc_df(dfs)
failed_samples = set(df_qc[df_qc['QC Status'] == 'FAIL'].index)
highlight_qc_failed_samples(xlsx_path=output_xlsx, failed_samples=failed_samples)


def get_qc_df(dfs: List[ExcelSheetDataFrame]) -> Optional[pd.DataFrame]:
    """Return the dataframe of the QC stats sheet, if there is one.

    Args:
        dfs: Sheet/dataframe wrappers that make up the report.

    Returns:
        The ``df`` of the first entry whose ``sheet_name`` equals
        ``SheetName.qc_stats.value``, or ``None`` when no entry matches.
    """
    qc_sheet_name = SheetName.qc_stats.value
    matching_dfs = (entry.df for entry in dfs if entry.sheet_name == qc_sheet_name)
    return next(matching_dfs, None)


# Adds one new worksheet per SheetImage to the workbook: the image description
# is written to the first cell (row 0, col 0) and the image is inserted below
# it at row 1, col 0.
# NOTE(review): this listing appears to interleave pre-change and post-change
# diff lines (two `text_wrap_fmt` assignments, two `insert_image` calls, and a
# docstring that follows statements) — confirm which lines are live before
# editing.
def add_images(images_for_sheets: List[SheetImage],
book: Workbook):
text_wrap_fmt = book.add_format(dict(text_wrap=True))
text_wrap_fmt.set_align('vjustify')
"""Add images and their descriptions to new sheets in a workbook"""
import imageio
# Wrapped-text format applied to column 0 and to the description cell.
text_wrap_fmt = book.add_format(dict(text_wrap=True, valign='justify'))
for sheet_image in images_for_sheets:
# Each image gets its own worksheet named after `sheet_image.sheet_name`.
sheet = book.add_worksheet(sheet_image.sheet_name)
# Column 0 is set to width 100 so the description has room to wrap.
sheet.set_column(0, 0, 100, text_wrap_fmt)
sheet.write(0, 0, sheet_image.image_description, text_wrap_fmt)
sheet.insert_image(1, 0, sheet_image.image_path)
# Read the image file to obtain its array shape for scaling.
img = imageio.imread(sheet_image.image_path)
# NOTE(review): unpacking three values requires `img.shape` to have exactly
# three entries; a 2-D (e.g. greyscale) array would raise ValueError — confirm
# the expected inputs.
# NOTE(review): the first two shape entries are named x then y here; image
# arrays commonly store rows (height) first — confirm the intended ratio.
x_size, y_size, _ = img.shape
yx_ratio = y_size / x_size
logger.debug(f'Image "{sheet_image.image_path.name}", x={x_size}, y={y_size}, y/x={yx_ratio}')
# The image is inserted with x_scale fixed at 1.0 and y_scale set to the ratio
# computed above.
sheet.insert_image(1, 0, sheet_image.image_path, options=dict(x_scale=1.0,
y_scale=yx_ratio,
object_position=3))
# Hide gridlines and row/column headers on the image sheet.
sheet.hide_gridlines(2)
sheet.hide_row_col_headers()


def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> None:
    """Highlight samples that failed QC in an already written XLSX report.

    The workbook at ``xlsx_path`` is loaded with openpyxl, modified and saved
    back to the same path.

    * In the Pangolin, Variants and Variant Matrix sheets, every first-column
      cell (header row excluded) whose value is in ``failed_samples`` gets a
      warning comment and a light red fill.
    * In the Consensus sheet, column A is set to width 100 and every
      first-column cell is switched to Courier New. A header cell (value
      starting with ``>``) of a failed sample, and the first non-empty cell
      that follows it, are additionally commented, filled light red and
      written in dark red.

    Sheets that are not present in the workbook are skipped.

    Args:
        xlsx_path: Path of the XLSX report to modify in place.
        failed_samples: Names of the samples that failed QC.
    """
    from openpyxl.comments import Comment
    from openpyxl.styles import PatternFill, Font
    from openpyxl.worksheet.worksheet import Worksheet
    from openpyxl.worksheet.dimensions import ColumnDimension

    light_red = 'FC9295'
    dark_red = '260000'
    black = '000000'

    def flag_failed(cell, sample_name) -> None:
        """Attach the QC warning comment and the light red fill to a cell."""
        # A new Comment is created per cell rather than sharing one instance.
        cell.comment = Comment(f'Warning: Sample "{sample_name}" has failed general NGS QC',
                               author='xlavir')
        cell.fill = PatternFill(fill_type='solid', fgColor=light_red)

    def set_monospace(cell, color: str) -> None:
        """Switch a cell to Courier New in ``color``, keeping its size and family."""
        font: Font = cell.font
        cell.font = Font(name='Courier New',
                         color=color,
                         size=font.size,
                         family=font.family)

    logger.info(f'Loading workbook "{xlsx_path.name}" with openpyxl '
                f'to highlight {len(failed_samples)} samples that have failed QC')
    book = openpyxl.load_workbook(xlsx_path)
    # `sheetnames` replaces the deprecated `get_sheet_names()` accessor.
    present_sheets = book.sheetnames
    logger.info(f'Loaded "{xlsx_path.name}" using openpyxl. Sheets: {present_sheets}')

    # Use `.value` for every member so sheet names are plain strings in both
    # the workbook lookup and the log message.
    sheet_names = [
        SheetName.pangolin.value,
        SheetName.variants.value,
        SheetName.varmat.value,
    ]
    for sheet_name in sheet_names:
        # Check presence explicitly instead of catching KeyError around the
        # whole body, which would also hide unrelated KeyErrors.
        if sheet_name not in present_sheets:
            continue
        sheet: Worksheet = book[sheet_name]
        logger.info(f'Highlighting failed samples in sheet "{sheet_name}".')
        # Row 1 is the header row, so start at row 2.
        for row in sheet.iter_rows(min_row=2):
            cell = row[0]
            if cell.value in failed_samples:
                flag_failed(cell, cell.value)

    consensus_name = SheetName.consensus.value
    if consensus_name in present_sheets:
        sheet = book[consensus_name]
        sheet.column_dimensions['A'] = ColumnDimension(worksheet=sheet,
                                                       index='A',
                                                       width=100)
        logger.info(f'Highlighting consensus sequences of failed '
                    f'samples in sheet "{consensus_name}".')
        highlight_seq = False
        sample_name = ''
        for row in sheet.rows:
            cell = row[0]
            value = cell.value
            # Guard against empty or non-string cells before testing for '>'.
            if isinstance(value, str) and value.startswith('>'):
                sample_name = value[1:]
                highlight_seq = sample_name in failed_samples
                is_failed = highlight_seq
            elif value and highlight_seq:
                # Only the first non-empty cell after a failed header is
                # treated as that sample's sequence.
                is_failed = True
                highlight_seq = False
            else:
                is_failed = False

            if is_failed:
                flag_failed(cell, sample_name)
                set_monospace(cell, dark_red)
            else:
                set_monospace(cell, black)

    book.save(xlsx_path)
Loading

0 comments on commit 5988ecc

Please sign in to comment.