diff --git a/HISTORY.rst b/HISTORY.rst index 21a5a66..a5a79f2 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,7 +2,37 @@ History ======= -0.1.0 (2021-02-16) +0.2.0 (2021-03-04) ------------------ -* First release on PyPI. +* Added header comments with descriptions of field content +* Added comment to Variant Matrix sheet A1 cell describing what is shown in the matrix +* Added highlighting of samples failing QC in other sheets +* Fixed image scaling by determining image size with imageio +* Added Medaka_ / Longshot_ VCF parsing + +0.1.1 (2021-02-16) +------------------ + +* Collect sample results from a `nf-core/viralrecon`_ or `peterk87/nf-virontus`_ into a Excel report + * Samtools_ read mapping stats (``flagstat``) + * Mosdepth_ read mapping coverage information + * Variant calling information (SnpEff_ and SnpSift_ results, VCF file information) + * Consensus sequences +* iVar VCF parsing +* QA/QC of sample analysis results (basic PASS/FAIL based on minimum genome coverage and depth) +* Nextflow workflow execution information +* Prepend worksheets from other Excel documents into the report (e.g. cover page/sheet, sample sheet, lab results) +* Add custom images into worksheets with custom names and descriptions (e.g. phylogenetic tree figure PNG) + +.. _Cookiecutter: https://github.com/audreyr/cookiecutter +.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage +.. _nf-core/viralrecon: https://github.com/nf-core/viralrecon +.. _peterk87/nf-virontus: https://github.com/peterk87/nf-virontus/ +.. _Bcftools: https://www.htslib.org/doc/bcftools.html +.. _Samtools: https://samtools.github.io/ +.. _SnpEff: https://pcingola.github.io/SnpEff/se_introduction/ +.. _SnpSift: https://pcingola.github.io/SnpEff/ss_introduction/ +.. _Mosdepth: https://github.com/brentp/mosdepth +.. _Longshot: https://github.com/pjedge/longshot +.. _Medaka: https://github.com/nanoporetech/medaka diff --git a/README.rst b/README.rst index ad38b02..40e9744 100644 --- a/README.rst +++ b/README.rst @@ -27,13 +27,20 @@ Features * Collect sample results from a `nf-core/viralrecon`_ or `peterk87/nf-virontus`_ into a Excel report * Samtools_ read mapping stats (``flagstat``) * Mosdepth_ read mapping coverage information - * **TODO:** Variant calling information (Bcftools_ stats, SnpEff_ and SnpSift_ results, VCF file information) + * Variant calling information (SnpEff_ and SnpSift_ results, VCF file information) * Consensus sequences * QA/QC of sample analysis results (basic PASS/FAIL based on minimum genome coverage and depth) * Nextflow workflow execution information * Prepend worksheets from other Excel documents into the report (e.g. cover page/sheet, sample sheet, lab results) * Add custom images into worksheets with custom names and descriptions (e.g. phylogenetic tree figure PNG) +Roadmap +------- + +* Bcftools_ variant calling stats sheet +* Sample metadata table to merge with certain stats? +* YAML config to info sheet? +* coverage chart with controls? Credits ------- diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..877e467 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore:.*PytestConfigWarning.*Unknown config option.* diff --git a/setup.cfg b/setup.cfg index 0783869..8cf7f71 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1 +current_version = 0.2.0 commit = True tag = True diff --git a/setup.py b/setup.py index 53d29fc..028b4dc 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'beautifulsoup4', 'biopython', 'openpyxl', + 'imageio', ] setup_requirements = ['pytest-runner', ] @@ -57,6 +58,6 @@ test_suite='tests', tests_require=test_requirements, url='https://github.com/peterk87/xlavir', - version='0.1.1', + version='0.2.0', zip_safe=False, ) diff --git a/xlavir/__init__.py b/xlavir/__init__.py index 34ad602..a7f209c 100644 --- a/xlavir/__init__.py +++ b/xlavir/__init__.py @@ -2,4 +2,4 @@ __author__ = """Peter Kruczkiewicz""" __email__ = 'peter.kruczkiewicz@gmail.com' -__version__ = '0.1.1' +__version__ = '0.2.0' diff --git a/xlavir/cli.py b/xlavir/cli.py index 7b5c6a4..384269d 100644 --- a/xlavir/cli.py +++ b/xlavir/cli.py @@ -61,7 +61,7 @@ def main( typer.echo(f'xlavir version {__version__}') typer.Exit() from rich.traceback import install - install(show_locals=True) + install(show_locals=True, width=120, word_wrap=True) logging.basicConfig(format='%(message)s', datefmt='[%Y-%m-%d %X]', diff --git a/xlavir/io/excel_sheet_dataframe.py b/xlavir/io/excel_sheet_dataframe.py index ff87913..be84dcc 100644 --- a/xlavir/io/excel_sheet_dataframe.py +++ b/xlavir/io/excel_sheet_dataframe.py @@ -1,4 +1,4 @@ -from typing import Optional, Iterable, Union +from typing import Optional, Iterable, Union, Mapping import pandas as pd from enum import Enum @@ -10,7 +10,7 @@ class SheetName(str, Enum): consensus = 'Consensus' pangolin = 'Pangolin Lineage' variants = 'Variants' - varmap = 'Variant Map' + varmat = 'Variant Matrix' class ExcelSheetDataFrame: @@ -20,10 +20,12 @@ def __init__(self, pd_to_excel_kwargs: dict = None, autofit: bool = True, column_widths: Optional[Iterable[Union[int, float]]] = None, - include_header_width: bool = True): + include_header_width: bool = True, + header_comments: Optional[Mapping[str, str]] = None): self.include_header_width = include_header_width self.sheet_name = sheet_name self.df = df self.pd_to_excel_kwargs = pd_to_excel_kwargs or {} self.autofit = autofit self.column_widths = column_widths + self.header_comments = header_comments diff --git a/xlavir/io/xl.py b/xlavir/io/xl.py index c763af7..77a1921 100644 --- a/xlavir/io/xl.py +++ b/xlavir/io/xl.py @@ -2,7 +2,7 @@ import logging from copy import copy from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Set, Tuple import openpyxl import pandas as pd @@ -31,6 +31,8 @@ def copy_spreadsheet(src_path: Path, dest_path (Path): Destination Excel spreadsheet path source_sheet_index (int): Source spreadsheet worksheet index to copy to destination spreadsheet """ + from openpyxl.cell.cell import Cell + src_book = openpyxl.load_workbook(src_path) dest_book = openpyxl.load_workbook(dest_path) sheet = src_book.worksheets[source_sheet_index] @@ -41,9 +43,10 @@ def copy_spreadsheet(src_path: Path, new_sheet.merged_cells = copy(sheet.merged_cells) for k, v in sheet.row_dimensions.items(): new_sheet.row_dimensions[k] = copy(v) + row: Tuple[Cell] for row in sheet.rows: for cell in row: - new_cell = new_sheet[cell.coordinate] + new_cell: Cell = new_sheet[cell.coordinate] new_cell.value = cell.value if cell.has_style: new_cell.font = copy(cell.font) @@ -52,6 +55,7 @@ def copy_spreadsheet(src_path: Path, new_cell.number_format = copy(cell.number_format) new_cell.protection = copy(cell.protection) new_cell.alignment = copy(cell.alignment) + new_cell.comment = copy(cell.comment) dest_book.save(filename=dest_path) @@ -59,6 +63,10 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame], output_xlsx: Path, quality_reqs: QualityRequirements, images_for_sheets: Optional[List[SheetImage]] = None): + """Write the output Excel XLSX file + + + """ with pd.ExcelWriter(output_xlsx, engine='xlsxwriter') as writer: monospace = dict(font_name='Courier New') text_wrap = dict(text_wrap=True) @@ -84,17 +92,33 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame], valign='bottom', rotation=45, font_name='Courier New')) - red_bg_fmt = book.add_format(dict(bg_color='red')) + fail_qc_fmt = book.add_format(dict(bg_color='FC9295', + font_name='Courier New', + bold=True)) + pass_qc_fmt = book.add_format(dict(bg_color='c4edce', + font_name='Courier New', + bold=False)) float_cols = {'Mean Coverage Depth'} perc_cols = {'% Genome Coverage'} perc_2dec_cols = {'Alternate Allele Frequency'} + images_added = False + for esdf in dfs: + if images_for_sheets and esdf.sheet_name == SheetName.workflow_info.value: + add_images(images_for_sheets, book) + images_added = True esdf.df.to_excel(writer, sheet_name=esdf.sheet_name, **esdf.pd_to_excel_kwargs) sheet: Worksheet = book.get_worksheet_by_name(esdf.sheet_name) idx_and_cols = [esdf.df.index.name] + list(esdf.df.columns) + + if esdf.header_comments: + for i, col_name in enumerate(idx_and_cols): + if col_name in esdf.header_comments: + sheet.write_comment(0, i, esdf.header_comments[col_name]) + if esdf.autofit: for i, (width, col_name) in enumerate(zip(get_col_widths(esdf.df, index=True, @@ -120,8 +144,17 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame], for i, idx in enumerate(esdf.df.index, 1): sheet.set_row(i, get_row_heights(esdf.df, idx), monospace_wrap_fmt) - if esdf.sheet_name == SheetName.varmap.value: - sheet.set_row(0, max(len(x) for x in idx_and_cols)*5) + if esdf.sheet_name == SheetName.varmat.value: + sheet.write_comment(row=0, + col=0, + comment=f'This sheet contains a matrix of alternate allele variant observation' + f' frequency values for samples and variants. ' + f'3-colour conditional formatting is applied to the variant ' + f'frequency values where a major variant ' + f'(e.g. alternate allele frequency >={quality_reqs.major_allele_freq}) ' + f'is highlighted in green. Red indicates where the allele variant is not ' + f'observed in the sample (e.g. alternate allele frequency equals 0.0).') + sheet.set_row(0, max(len(x) for x in idx_and_cols) * 5) for i, col_name in enumerate(idx_and_cols): if i == 0: continue @@ -147,19 +180,126 @@ def write_xlsx_report(dfs: List[ExcelSheetDataFrame], sheet.conditional_format(1, 1, esdf.df.shape[0], 1, options=dict(type='cell', value='"FAIL"', criteria='equal to', - format=red_bg_fmt)) - if images_for_sheets: + format=fail_qc_fmt)) + sheet.conditional_format(1, 1, esdf.df.shape[0], 1, options=dict(type='cell', + value='"PASS"', + criteria='equal to', + format=pass_qc_fmt)) + if images_for_sheets and not images_added: add_images(images_for_sheets, book) + df_qc = get_qc_df(dfs) + failed_samples = set(df_qc[df_qc['QC Status'] == 'FAIL'].index) + highlight_qc_failed_samples(xlsx_path=output_xlsx, failed_samples=failed_samples) + + +def get_qc_df(dfs: List[ExcelSheetDataFrame]) -> Optional[pd.DataFrame]: + for esdf in dfs: + if esdf.sheet_name == SheetName.qc_stats.value: + return esdf.df + def add_images(images_for_sheets: List[SheetImage], book: Workbook): - text_wrap_fmt = book.add_format(dict(text_wrap=True)) - text_wrap_fmt.set_align('vjustify') + """Add images and their descriptions to new sheets in a workbook""" + import imageio + text_wrap_fmt = book.add_format(dict(text_wrap=True, valign='justify')) for sheet_image in images_for_sheets: sheet = book.add_worksheet(sheet_image.sheet_name) sheet.set_column(0, 0, 100, text_wrap_fmt) sheet.write(0, 0, sheet_image.image_description, text_wrap_fmt) - sheet.insert_image(1, 0, sheet_image.image_path) + img = imageio.imread(sheet_image.image_path) + x_size, y_size, _ = img.shape + yx_ratio = y_size / x_size + logger.debug(f'Image "{sheet_image.image_path.name}", x={x_size}, y={y_size}, y/x={yx_ratio}') + sheet.insert_image(1, 0, sheet_image.image_path, options=dict(x_scale=1.0, + y_scale=yx_ratio, + object_position=3)) sheet.hide_gridlines(2) sheet.hide_row_col_headers() + + +def highlight_qc_failed_samples(xlsx_path: Path, failed_samples: Set[str]) -> None: + from openpyxl.comments import Comment + from openpyxl.styles import PatternFill, Font + from openpyxl.worksheet.worksheet import Worksheet + from openpyxl.worksheet.dimensions import ColumnDimension + logger.info(f'Loading workbook "{xlsx_path.name}" with openpyxl ' + f'to highlight {len(failed_samples)} samples that have failed QC') + book = openpyxl.load_workbook(xlsx_path) + logger.info(f'Loaded "{xlsx_path.name}" using openpyxl. Sheets: {book.get_sheet_names()}') + sheet_names = [ + SheetName.pangolin.value, + SheetName.variants.value, + SheetName.varmat, + ] + light_red = 'FC9295' + for sheet_name in sheet_names: + try: + sheet: Worksheet = book[sheet_name] + logger.info(f'Highlighting failed samples in sheet "{sheet_name}".') + for i, row in enumerate(sheet.rows): + if i == 0: + continue + cell = row[0] + if cell.value in failed_samples: + cell.comment = Comment(f'Warning: Sample "{cell.value}" has failed general NGS QC', + author='xlavir') + cell.fill = PatternFill(fill_type='solid', + fgColor=light_red) + except KeyError: + pass + try: + sheet: Worksheet = book[SheetName.consensus.value] + + sheet.column_dimensions['A'] = ColumnDimension(worksheet=sheet, + index='A', + width=100) + + logger.info(f'Highlighting consensus sequences of failed ' + f'samples in sheet "{SheetName.consensus.value}".') + highlight_seq = False + sample_name = '' + + dark_red = '260000' + for i, row in enumerate(sheet.rows): + cell = row[0] + if cell.value[0] == '>': + sample_name = cell.value[1:] + if sample_name in failed_samples: + highlight_seq = True + cell.comment = Comment(f'Warning: Sample "{sample_name}" has failed general NGS QC', + author='xlavir') + cell.fill = PatternFill(fill_type='solid', fgColor=light_red) + font: Font = cell.font + cell.font = Font(name='Courier New', + color=dark_red, + size=font.size, + family=font.family) + else: + font: Font = cell.font + cell.font = Font(name='Courier New', + color='000000', + size=font.size, + family=font.family) + highlight_seq = False + elif cell.value and highlight_seq: + cell.comment = Comment(f'Warning: Sample "{sample_name}" has failed general NGS QC', + author='xlavir') + + cell.fill = PatternFill(fill_type='solid', fgColor=light_red) + font: Font = cell.font + cell.font = Font(name='Courier New', + color=dark_red, + size=font.size, + family=font.family) + highlight_seq = False + else: + font: Font = cell.font + cell.font = Font(name='Courier New', + color='000000', + size=font.size, + family=font.family) + except KeyError: + pass + book.save(xlsx_path) diff --git a/xlavir/qc/__init__.py b/xlavir/qc/__init__.py index 3c3b91a..f364591 100644 --- a/xlavir/qc/__init__.py +++ b/xlavir/qc/__init__.py @@ -9,26 +9,75 @@ logger = logging.getLogger(__name__) -def column_rename_tuple(low_coverage_threshold: int = 5) -> List[Tuple[str, str]]: +def columns(low_coverage_threshold: int = 5) -> List[Tuple[str, str]]: return [ - ('sample', 'Sample'), - ('qc_status', 'QC Status'), - ('qc_comment', 'QC Comment'), - ('genome_coverage', '% Genome Coverage'), - ('mean_coverage', 'Mean Coverage Depth'), - ('median_coverage', 'Median Coverage Depth'), - ('n_total_reads', '# Total Reads'), - ('n_mapped_reads', '# Mapped Reads'), - ('n_zero_coverage', '# 0X positions'), - ('n_low_coverage', f'# <{low_coverage_threshold}X positions'), - ('zero_coverage_coords', '0X Coverage Regions'), - ('low_coverage_coords', f'<{low_coverage_threshold}X Coverage Regions'), + ('sample', 'Sample', 'Sample name'), + ( + 'qc_status', + 'QC Status', + 'Quality control status, i.e. PASS or FAIL based on QC criteria ' + 'such as minimum mean read depth ' + 'or percent of reference genome covered by sequencing' + ), + ( + 'qc_comment', + 'QC Comment', + 'Comments on any potential quality issues, i.e. why a sample did not pass QC.' + ), + ( + 'genome_coverage', + '% Genome Coverage', + 'Percent of reference genome sequence covered by sequencing' + ), + ( + 'mean_coverage', + 'Mean Coverage Depth', + 'Mean sequencing coverage depth across entire reference genome sequence.' + ), + ( + 'median_coverage', + 'Median Coverage Depth', + 'Median sequencing coverage depth across entire reference genome sequence.', + ), + ( + 'n_total_reads', + '# Total Reads', + 'Total number of raw reads obtained from sequencing for this sample.' + ), + ( + 'n_mapped_reads', + '# Mapped Reads', + 'Number of sequencing reads that mapped to the reference genome sequence.' + ), + ( + 'n_zero_coverage', + '# 0X positions', + 'Number of reference sequence positions with no coverage depth (0X), i.e. ' + 'reference positions that did not have any reads spanning those positions.' + ), + ( + 'n_low_coverage', + f'# <{low_coverage_threshold}X positions', + f'Number of reference sequence positions with fewer than {low_coverage_threshold} reads' + f' spanning those positions.' + ), + ( + 'zero_coverage_coords', + '0X Coverage Regions', + 'A list of reference sequence 1-based regions with no coverage (0X).' + ), + ( + 'low_coverage_coords', + f'<{low_coverage_threshold}X Coverage Regions', + f'A list of reference sequence 1-based regions with less than {low_coverage_threshold}' + f' coverage depth.' + ), ] def report_format(df: pd.DataFrame, low_coverage_threshold: int = 5) -> pd.DataFrame: - output_cols = column_rename_tuple(low_coverage_threshold) - df.rename(columns={x: y for x, y in output_cols}, inplace=True) + output_cols = columns(low_coverage_threshold) + df.rename(columns={x: y for x, y, _ in output_cols}, inplace=True) df.set_index('Sample', inplace=True) return df @@ -62,8 +111,8 @@ def create_qc_stats_dataframe(sample_depth_info: Dict[str, mosdepth.MosdepthDept df_stats.sort_values('sample', inplace=True) present_cols = set(df_stats.columns) - output_cols = column_rename_tuple(quality_reqs.low_coverage_threshold) - df_stats = df_stats.loc[:, [x for x, y in output_cols if x in present_cols]] + output_cols = columns(quality_reqs.low_coverage_threshold) + df_stats = df_stats.loc[:, [x for x, y, _ in output_cols if x in present_cols]] logger.debug(df_stats) return df_stats diff --git a/xlavir/tools/pangolin.py b/xlavir/tools/pangolin.py index 981e9c4..b58d71b 100644 --- a/xlavir/tools/pangolin.py +++ b/xlavir/tools/pangolin.py @@ -3,6 +3,39 @@ import pandas as pd +pangolin_cols = [ + ('taxon', 'Sample', 'Sample name'), + ( + 'lineage', + 'Pangolin Lineage', + 'Pangolin global SARS-CoV-2 lineage assignment based on pangoLEARN model. ' + 'For more info, see https://github.com/cov-lineages/pangolin/#pangolearn-description', + ), + ( + 'probability', + 'Lineage Assignment Probability', + 'Pangolin lineage assignment probability from multinomial logistic regression. For more info, ' + 'see https://github.com/cov-lineages/pangolin/#pangolearn-description' + ), + ( + 'pangoLEARN_version', + 'pangoLEARN Lineages Version', + 'Release version of pangoLEARN SARS-CoV-2 ' + 'lineages information used for assignment.'), + ( + 'status', + 'Pangolin QC Status', + 'QC status of Pangolin lineage assignment, i.e. QC pass or fail' + ), + ( + 'note', + 'Pangolin QC Note', + 'Issues reported with Pangolin lineage assignment such as too many Ns in the input sequence' + ' (Pangolin will not call a lineage for sequences with over 50% (0.5) N-content) or ' + 'if the sequence is too short (e.g. "seq_len:0")' + ), +] + def find_pangolin_lineage_csv(basedir: Path) -> Optional[Path]: for p in basedir.rglob('pangolin.lineage_report.csv'): @@ -12,15 +45,8 @@ def find_pangolin_lineage_csv(basedir: Path) -> Optional[Path]: def read_pangolin_csv(path: Path) -> pd.DataFrame: df = pd.read_csv(path) df.sort_values('taxon', inplace=True) - pangolin_cols = [ - ('taxon', 'Sample'), - ('lineage', 'Pangolin Lineage'), - ('probability', 'Lineage Assignment Probability'), - ('pangoLEARN_version', 'pangoLEARN Lineages Version'), - ('status', 'Pangolin QC Status'), - ('note', 'Pangolin QC Note'), - ] - df.rename(columns={x: y for x, y in pangolin_cols}, inplace=True) + + df.rename(columns={x: y for x, y, _ in pangolin_cols}, inplace=True) df.set_index('Sample', inplace=True) return df diff --git a/xlavir/tools/variants.py b/xlavir/tools/variants.py index 3941b4a..6b74c0b 100644 --- a/xlavir/tools/variants.py +++ b/xlavir/tools/variants.py @@ -1,10 +1,10 @@ +import logging import os import re from enum import Enum from operator import itemgetter from pathlib import Path from typing import Dict, Tuple, List, Optional, Iterable -import logging import pandas as pd from pydantic import BaseModel @@ -37,23 +37,56 @@ TER='*', ) +# list of tuples: [0]: column id; [1]: report column id/name; [2]: column description for report cell comment variants_cols = [ - ('sample', 'Sample'), - ('mutation', 'Mutation'), - ('POS', 'Position'), - ('REF', 'Reference Allele'), - ('ALT', 'Alternate Allele'), - ('REF_DP', 'Reference Allele Depth'), - ('ALT_DP', 'Alternate Alternate Depth'), - ('DP', 'Total Depth'), - ('ALT_FREQ', 'Alternate Allele Frequency'), - ('gene', 'Gene'), - ('impact', 'Variant Impact'), - ('effect', 'Variant Effect'), - ('aa', 'Amino Acid Change'), - ('aa_pos', 'Amino Acid Position'), - ('aa_len', 'Gene Amino Acid Length'), - ('CHROM', 'Reference Genome'), + ('sample', 'Sample', 'Sample name',), + ('CHROM', 'Reference Genome', 'Reference genome sequence ID/name'), + ( + 'mutation', + 'Mutation', + 'Mutation found in sample with format ' + '"{reference allele}{reference position}{allele in sample}"' + ' with predicted amino acid change information in brackets with format ' + '"{gene name}:{reference AA}{gene AA position}{AA change}"' + ), + ('POS', 'Position', '1-based nucleotide position in reference sequence'), + ('REF', 'Reference Allele', 'Nucleotide allele sequence found in reference sequence'), + ('ALT', 'Alternate Allele', 'Nucleotide allele sequence found in sample'), + ( + 'REF_DP', + 'Reference Allele Depth', + 'Read depth of coverage supporting reference allele at reference position' + ), + ( + 'ALT_DP', + 'Alternate Allele Depth', + 'Read depth of coverage supporting alternate allele at reference position', + ), + ('DP', 'Total Depth', 'Total depth of coverage at reference position',), + ( + 'ALT_FREQ', + 'Alternate Allele Frequency', + 'Observed frequency of alternate allele variant', + ), + ('gene', 'Gene', 'Gene name',), + ( + 'impact', + 'Variant Impact', + 'SnpEff estimation of putative impact or deleteriousness of variant ' + '(see https://pcingola.github.io/SnpEff/se_inputoutput/#ann-field-vcf-output-files)' + ), + ( + 'effect', + 'Variant Effect', + 'Effect of variant annotated using Sequence Ontology terms, e.g.' + 'for "missense_variant", see http://www.sequenceontology.org/browser/current_release/term/SO:0001583' + ' where the definition is "A sequence variant, that changes one or more bases, resulting in a ' + 'different amino acid sequence but where the length is preserved."' + ), + ('aa', 'Amino Acid Change', 'The change in the sample\'s gene amino acid sequence' + ' relative to the reference sequence'), + ('aa_pos', 'Amino Acid Position', 'Position of amino acid change in the reference sequence gene'), + ('aa_len', 'Gene Amino Acid Length', 'Amino acid length of the reference sequence gene'), ] BCFTOOLS_STATS_GLOB_PATTERNS = [ @@ -87,7 +120,8 @@ class VariantCaller(Enum): VCF_SAMPLE_NAME_CLEANUP = [ re.compile(r'\.vcf(\.gz)?$'), - re.compile(r'\.AF0\.\d+'), + re.compile(r'\.AF0\.\d+(\.filt)?'), + re.compile(r'\.0\.\d+AF(\.filt)?'), ] SNPSIFT_GLOB_PATTERNS = [ @@ -97,7 +131,8 @@ class VariantCaller(Enum): SNPSIFT_SAMPLE_NAME_CLEANUP = [ re.compile(r'\.snpSift\.table\.txt$'), - re.compile(r'\.AF0\.\d+'), + re.compile(r'\.AF0\.\d+(\.filt)?'), + re.compile(r'\.0\.\d+AF(\.filt)?'), ] @@ -226,7 +261,9 @@ def parse_ivar_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd.Dat if df.empty: return None if not sample_name: - sample_name = df.columns[-1] + sample_name = df.columns[-1] if df.columns[-1] != 'SAMPLE' else None + if sample_name is None: + raise ValueError(f'Sample name is not defined for VCF: shape={df.shape}; columns={df.columns}') pos_fmt_val = {} for row in df.itertuples(): ks = row.FORMAT.split(':') @@ -247,14 +284,48 @@ def merge_vcf_snpsift(df_vcf: Optional[pd.DataFrame], return None if df_vcf is None: snpsift_cols = set(df_snpsift.columns) - return df_snpsift.loc[:, [x for x, y in variants_cols if x in snpsift_cols]] + return df_snpsift.loc[:, [x for x, _, _ in variants_cols if x in snpsift_cols]] if df_snpsift is None: vcf_cols = set(df_vcf.columns) - return df_vcf.loc[:, [x for x, y in variants_cols if x in vcf_cols]] + return df_vcf.loc[:, [x for x, _, _ in variants_cols if x in vcf_cols]] df_merge = pd.merge(df_vcf, df_snpsift) merged_cols = set(df_merge.columns) - return df_merge.loc[:, [x for x, y in variants_cols if x in merged_cols]] + return df_merge.loc[:, [x for x, _, _ in variants_cols if x in merged_cols]] + + +def parse_longshot_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd.DataFrame]: + if df.empty: + return None + if not sample_name: + sample_name = df.columns[-1] if df.columns[-1] != 'SAMPLE' else None + if sample_name is None: + raise ValueError(f'Sample name is not defined for VCF: shape={df.shape}; columns={df.columns}') + pos_info_val = {} + for row in df.itertuples(): + infos = parse_vcf_info(row.INFO) + ac_ref, ac_alt = infos['AC'] + infos['REF_DP'] = ac_ref + infos['ALT_DP'] = ac_alt + pos_info_val[row.POS] = infos + df_longshot_info = pd.DataFrame(pos_info_val).transpose() + df_longshot_info.index.name = 'POS' + df_longshot_info.reset_index(inplace=True) + df_merge = pd.merge(df, df_longshot_info, on='POS') + df_merge['sample'] = sample_name + df_merge['ALT_FREQ'] = df_merge.ALT_DP / df_merge.DP + cols_to_keep = list({col for col, _, _ in variants_cols} & set(df_merge.columns)) + return df_merge.loc[:, cols_to_keep] + + +def parse_vcf_info(s: str) -> dict: + out = {} + for x in s.split(';'): + if not x: + continue + key, val_str = x.split('=', maxsplit=1) + out[key] = try_parse_number(val_str) + return out def get_info(basedir: Path) -> Dict[str, pd.DataFrame]: @@ -265,12 +336,18 @@ def get_info(basedir: Path) -> Dict[str, pd.DataFrame]: sample_dfvcf = {} for sample, vcf_path in sample_vcf.items(): variant_caller, df_vcf = read_vcf(vcf_path) - if variant_caller == 'iVar': + if variant_caller.startswith(VariantCaller.iVar.value): df_parsed_ivar_vcf = parse_ivar_vcf(df_vcf, sample) if df_parsed_ivar_vcf is not None: sample_dfvcf[sample] = df_parsed_ivar_vcf else: logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"') + elif variant_caller.startswith(VariantCaller.Longshot.value): + df_parsed_ivar_vcf = parse_longshot_vcf(df_vcf, sample) + if df_parsed_ivar_vcf is not None: + sample_dfvcf[sample] = df_parsed_ivar_vcf + else: + logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"') else: raise NotImplementedError() @@ -306,13 +383,21 @@ def get_info(basedir: Path) -> Dict[str, pd.DataFrame]: def to_dataframe(dfs: Iterable[pd.DataFrame]) -> pd.DataFrame: df = pd.concat(list(dfs)) df.sort_values(['sample', 'POS'], inplace=True) - return df.set_index('sample').rename(columns={x: y for x, y in variants_cols}) + df.set_index('sample', inplace=True) + df.index.name = 'Sample' + return df.rename(columns={x: y for x, y, _ in variants_cols}) def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame: df_vars = df.copy() df_vars.reset_index(inplace=True) - df_pivot = pd.pivot_table(df_vars, index='sample', columns='Mutation', values='Alternate Allele Frequency') - pivot_cols = list(zip(df_pivot.columns, df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int))) + df_pivot = pd.pivot_table(df_vars, + index='Sample', + columns='Mutation', + values='Alternate Allele Frequency', + aggfunc='first', + fill_value=0.0) + pivot_cols = list(zip(df_pivot.columns, + df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int))) pivot_cols.sort(key=itemgetter(1)) return df_pivot[[x for x, y in pivot_cols]] diff --git a/xlavir/util.py b/xlavir/util.py index cd6865b..32157c1 100644 --- a/xlavir/util.py +++ b/xlavir/util.py @@ -106,7 +106,10 @@ def list_get(xs: Optional[List], idx: int, default=None): return default -def try_parse_number(s: str) -> Union[int, float, str]: +def try_parse_number(s: str) -> Union[int, float, List[float], List[int], str]: + if ',' in s: + xs = s.split(',') + return [try_parse_number(x) for x in xs] try: return int(s) except ValueError: diff --git a/xlavir/xlavir.py b/xlavir/xlavir.py index 2c7fb32..e217d18 100644 --- a/xlavir/xlavir.py +++ b/xlavir/xlavir.py @@ -36,28 +36,34 @@ def run(input_dir: Path, dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.qc_stats.value, df=qc.report_format(df_stats, low_coverage_threshold=quality_reqs.low_coverage_threshold), - pd_to_excel_kwargs=dict(freeze_panes=(1, 1), na_rep='NA'))) + pd_to_excel_kwargs=dict(freeze_panes=(1, 1), na_rep='NA'), + header_comments={x: y for _, x, y in + qc.columns(quality_reqs.low_coverage_threshold)})) df_pangolin = pangolin.get_info(basedir=input_dir, pangolin_lineage_csv=pangolin_lineage_csv) if df_pangolin is not None: dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.pangolin.value, df=df_pangolin, - pd_to_excel_kwargs=dict(freeze_panes=(1, 1)))) + pd_to_excel_kwargs=dict(freeze_panes=(1, 1)), + header_comments={x: y for _, x, y in pangolin.pangolin_cols})) if sample_variants: df_variants = variants.to_dataframe(sample_variants.values()) dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.variants.value, df=df_variants, pd_to_excel_kwargs=dict(freeze_panes=(1, 1)), - include_header_width=False)) + include_header_width=False, + header_comments={name: desc for _, name, desc in variants.variants_cols})) df_varmap = variants.to_variant_pivot_table(df_variants) max_index_length = df_varmap.index.str.len().max() - dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varmap.value, + dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.varmat.value, df=df_varmap, pd_to_excel_kwargs=dict(freeze_panes=(1, 1), na_rep=0.0), autofit=False, - column_widths=[max_index_length + 2] + [3 for _ in range(df_varmap.columns.size)])) + column_widths=[max_index_length + 2] + [3 for _ in + range(df_varmap.columns.size)])) dfs.append(ExcelSheetDataFrame(sheet_name=SheetName.consensus.value, df=consensus.get_info(basedir=input_dir), + autofit=False, pd_to_excel_kwargs=dict(index=None, header=None))) if nf_exec_info: df_exec_info = to_dataframe(nf_exec_info)