From 5623ab87bd9d2840567f3a6fd584a5a888b4238c Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 5 Jan 2022 12:38:40 -0600 Subject: [PATCH 1/2] * Add support for reading annotated Medaka VCF files (``medaka_variant`` VCF annotated with ``medaka tools annotate``) * Changed mutation string format to ``{gene}:{AA change} ({NT change}{extra})`` if there is a AA change * Added low coverage filtering of variants for Medaka VCF * "Variants Summary" table now sorted by nucleotide position --- HISTORY.rst | 8 +++ xlavir/tools/variants.py | 103 +++++++++++++++++++++++++++++++++++---- xlavir/xlavir.py | 2 +- 3 files changed, 103 insertions(+), 10 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 6648e42..4525b80 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,14 @@ History ======= +0.6.0 (2022-01-05) +------------------ + +* Add support for reading annotated Medaka VCF files (``medaka_variant`` VCF annotated with ``medaka tools annotate``) +* Changed mutation string format to ``{gene}:{AA change} ({NT change}{extra})`` if there is a AA change +* Added low coverage filtering of variants for Medaka VCF +* "Variants Summary" table now sorted by nucleotide position + 0.5.3 (2021-11-09) ------------------ diff --git a/xlavir/tools/variants.py b/xlavir/tools/variants.py index cef111d..cf26ec2 100644 --- a/xlavir/tools/variants.py +++ b/xlavir/tools/variants.py @@ -10,6 +10,7 @@ import pandas as pd from pydantic import BaseModel +from xlavir.qc import QualityRequirements from xlavir.util import try_parse_number, find_file_for_each_sample logger = logging.getLogger(__name__) @@ -131,6 +132,16 @@ 'Mean AF', 'Mean/average alternate allele frequency of mutation in all samples.' ), + ( + 'nt_pos', + 'Nucleotide Position', + 'Nucleotide position of mutation with respect to reference genome.' + ), + ( + 'aa_pos', + 'Amino Acid Position', + 'Amino acid position of mutation in reference genome gene.' + ) ] BCFTOOLS_STATS_GLOB_PATTERNS = [ @@ -156,11 +167,13 @@ class VariantCaller(Enum): iVar = 'iVar' Longshot = 'Longshot' Nanopolish = 'nanopolish' + Medaka = 'medaka' VCF_GLOB_PATTERNS = [ '**/nanopolish/*.pass.vcf.gz', '**/ivar/*.vcf.gz', + '**/*.filt.no_fs.vcf', '**/*.longshot.vcf', '**/*.vcf', ] @@ -169,6 +182,7 @@ class VariantCaller(Enum): re.compile(r'(\.pass)?\.vcf(\.gz)?$'), re.compile(r'\.AF0\.\d+(\.filt)?'), re.compile(r'\.0\.\d+AF(\.filt)?'), + re.compile(r'\.medaka'), re.compile(r'\.longshot'), re.compile(r'\.snpeff'), re.compile(r'\.no_fs'), @@ -226,6 +240,8 @@ def read_vcf(vcf_file: Path) -> Tuple[str, pd.DataFrame]: variant_caller = line.strip().replace('##source=', '') if line.startswith('##nanopolish'): variant_caller = 'nanopolish' + if line.startswith('##medaka_version'): + variant_caller = 'medaka' if line.startswith('#CHROM'): vcf_cols = line[1:].strip().split('\t') break @@ -253,22 +269,24 @@ def parse_aa(gene: str, aa_str = aa_str.replace(aa3, aa1) if aa_str.endswith('DEL'): aa_str = aa_str.replace('DEL', 'del') - return f'{ref}{nt_pos}{alt}({gene}:{aa_str})' + return f'{gene}:{aa_str} ({ref}{nt_pos}{alt})' ref_aa, aa_pos_str, alt_aa = m.groups() ref_aa = get_aa(ref_aa) if effect == 'stop_lost': alt_aa = get_aa(alt_aa.replace('ext', '')) - return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa}[stop_lost])' + return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt} [stop_lost])' if effect == 'frameshift_variant': - return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}[FRAMESHIFT])' + return f'{gene}:{ref_aa}{aa_pos_str} ({ref}{nt_pos}{alt} [FRAMESHIFT])' if effect == 'conservative_inframe_deletion': - return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa})' + return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt})' if effect == 'disruptive_inframe_deletion': - return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa}[disruptive_inframe_deletion])' + return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt} [disruptive_inframe_deletion])' alt_aa = get_aa(alt_aa) - return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa})' + if alt_aa == ref_aa: + return f'{ref}{nt_pos}{alt}' + return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt})' def get_aa(s: str) -> str: @@ -300,6 +318,16 @@ def simplify_snpsift(df: pd.DataFrame, sample_name: str) -> Optional[pd.DataFram AF.name = 'AF' series += [REF_AC, ALT_AC, AF] continue + if c == 'SR': + df_ac = df[c].str.split(',', expand=True) + REF_AC = df_ac[0].astype(int) + df_ac[1].astype(int) + REF_AC.name = 'REF_AC' + ALT_AC = df_ac[2].astype(int) + df_ac[3].astype(int) + ALT_AC.name = 'ALT_AC' + AF = ALT_AC / (REF_AC + ALT_AC) + AF.name = 'AF' + series += [REF_AC, ALT_AC, AF] + continue idx = c.find('[*].') if idx > 0: new_series_name = c[idx + 4:].lower() @@ -413,6 +441,39 @@ def parse_longshot_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd return df_merge.loc[:, cols_to_keep] +def parse_medaka_vcf(df: pd.DataFrame, sample_name: str = None, qc_reqs: QualityRequirements = None) -> Optional[pd.DataFrame]: + if df.empty: + return None + if not sample_name: + sample_name = df.columns[-1] if df.columns[-1] != 'SAMPLE' else None + if sample_name is None: + raise ValueError(f'Sample name is not defined for VCF: shape={df.shape}; columns={df.columns}') + pos_info_val = {} + for row in df.itertuples(): + # if there is no variant INFO available then this isn't the right file + if row.INFO == '.': + return None + infos = parse_vcf_info(row.INFO) + # no DP INFO? skip this file + if 'DP' not in infos: + return None + if infos['DP'] < qc_reqs.low_coverage_threshold: + continue + ac_ref_fwd, ac_ref_rev, ac_alt_fwd, ac_alt_rev = infos['SR'] + infos['REF_DP'] = ac_ref_fwd + ac_ref_rev + infos['ALT_DP'] = ac_alt_fwd + ac_alt_rev + pos_info_val[row.POS] = infos + df_medaka_info = pd.DataFrame(pos_info_val).transpose() + df_medaka_info.index.name = 'POS' + df_medaka_info.reset_index(inplace=True) + df_merge = pd.merge(df, df_medaka_info, on='POS') + df_merge['sample'] = sample_name + df_merge = df_merge[df_merge.DP > 0] + df_merge['ALT_FREQ'] = df_merge.ALT_DP / (df_merge.ALT_DP + df_merge.REF_DP) + cols_to_keep = list({col for col, _, _ in variants_cols} & set(df_merge.columns)) + return df_merge.loc[:, cols_to_keep] + + def parse_nanopolish_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd.DataFrame]: if df.empty: return None @@ -453,7 +514,7 @@ def parse_vcf_info(s: str) -> dict: return out -def get_info(basedir: Path) -> Dict[str, pd.DataFrame]: +def get_info(basedir: Path, qc_reqs: QualityRequirements) -> Dict[str, pd.DataFrame]: sample_vcf = find_file_for_each_sample(basedir=basedir, glob_patterns=VCF_GLOB_PATTERNS, sample_name_cleanup=VCF_SAMPLE_NAME_CLEANUP, @@ -467,6 +528,12 @@ def get_info(basedir: Path) -> Dict[str, pd.DataFrame]: sample_dfvcf[sample] = df_parsed_ivar_vcf else: logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"') + elif variant_caller.startswith(VariantCaller.Medaka.value): + df_medaka_vcf = parse_medaka_vcf(df_vcf, sample, qc_reqs) + if df_medaka_vcf is not None: + sample_dfvcf[sample] = df_medaka_vcf + else: + logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"') elif variant_caller.startswith(VariantCaller.Longshot.value): df_longshot_vcf = parse_longshot_vcf(df_vcf, sample) if df_longshot_vcf is not None: @@ -521,6 +588,20 @@ def to_dataframe(dfs: Iterable[pd.DataFrame]) -> pd.DataFrame: return df.rename(columns={x: y for x, y, _ in variants_cols}) +def get_nt_position_int(s: str) -> int: + """Get the nucleotide position from the mutation string + + >>> get_nt_position_int('A1879G') + 1879 + >>> get_nt_position_int('S:Y145_H146del (TATTACC21992T)') + 21992 + """ + if ':' in s: + return int(re.sub(r'.*\([AGTC]+(\d+).*', r'\1', s)) + else: + return int(re.sub(r'[AGTC]+(\d+).*', r'\1', s)) + + def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame: df_vars = df.copy() df_vars.reset_index(inplace=True) @@ -530,8 +611,9 @@ def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame: values='Alternate Allele Frequency', aggfunc='first', fill_value=0.0) + nt_positions = [get_nt_position_int(x) for x in df_pivot.columns] pivot_cols = list(zip(df_pivot.columns, - df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int))) + nt_positions)) pivot_cols.sort(key=itemgetter(1)) return df_pivot[[x for x, y in pivot_cols]] @@ -539,7 +621,7 @@ def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame: def to_summary(df: pd.DataFrame) -> pd.DataFrame: df_vars = df.copy() df_vars.reset_index(inplace=True) - + logger.debug(f'df_vars columns: {df_vars.columns}') df_summary = df_vars.groupby('Mutation', sort=False).agg( n_samples=('Sample', 'size'), samples=('Sample', lambda x: '; '.join(x)), @@ -553,5 +635,8 @@ def to_summary(df: pd.DataFrame) -> pd.DataFrame: min_af=('Alternate Allele Frequency', 'min'), max_af=('Alternate Allele Frequency', 'max'), mean_af=('Alternate Allele Frequency', lambda x: sum(x) / len(x)), + nt_pos=('Position', 'first'), + aa_pos=('Amino Acid Position', 'first') ) + df_summary.sort_values('nt_pos', inplace=True) return df_summary.rename(columns={x: y for x, y, _ in (variant_summary_cols + variants_cols)}) diff --git a/xlavir/xlavir.py b/xlavir/xlavir.py index 187c9b7..2d61c4b 100644 --- a/xlavir/xlavir.py +++ b/xlavir/xlavir.py @@ -48,7 +48,7 @@ def run(input_dir: Path, ) mapping_info.n_total_reads = total_reads sample_cts = ct.read_ct_table(ct_values_table) if ct_values_table else {} - sample_variants = variants.get_info(input_dir) + sample_variants = variants.get_info(input_dir, qc_reqs=quality_reqs) dfs: List[ExcelSheetDataFrame] = [] df_stats = qc.create_qc_stats_dataframe(sample_depth_info, From 3520f756618cfb6b8fdb933067dfa74eb6660bf3 Mon Sep 17 00:00:00 2001 From: Peter Kruczkiewicz Date: Wed, 5 Jan 2022 12:38:53 -0600 Subject: [PATCH 2/2] =?UTF-8?q?Bump=20version:=200.5.3=20=E2=86=92=200.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- setup.py | 2 +- xlavir/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 9d0fd46..a8cefb9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.3 +current_version = 0.6.0 commit = True tag = True diff --git a/setup.py b/setup.py index 947ca9f..2aa38fb 100644 --- a/setup.py +++ b/setup.py @@ -60,6 +60,6 @@ test_suite='tests', tests_require=test_requirements, url='https://github.com/peterk87/xlavir', - version='0.5.3', + version='0.6.0', zip_safe=False, ) diff --git a/xlavir/__init__.py b/xlavir/__init__.py index c5b69b0..4f89048 100644 --- a/xlavir/__init__.py +++ b/xlavir/__init__.py @@ -2,4 +2,4 @@ __author__ = """Peter Kruczkiewicz""" __email__ = 'peter.kruczkiewicz@gmail.com' -__version__ = '0.5.3' +__version__ = '0.6.0'