Skip to content

Commit

Permalink
Merge pull request #17 from peterk87/medaka-vcf
Browse files Browse the repository at this point in the history
Add support for Medaka annotated VCFs
  • Loading branch information
peterk87 authored Jan 5, 2022
2 parents 55d0233 + 3520f75 commit 3f74811
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 13 deletions.
8 changes: 8 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
History
=======

0.6.0 (2022-01-05)
------------------

* Add support for reading annotated Medaka VCF files (``medaka_variant`` VCF annotated with ``medaka tools annotate``)
* Changed mutation string format to ``{gene}:{AA change} ({NT change}{extra})`` if there is a AA change
* Added low coverage filtering of variants for Medaka VCF
* "Variants Summary" table now sorted by nucleotide position

0.5.3 (2021-11-09)
------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.3
current_version = 0.6.0
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/peterk87/xlavir',
version='0.5.3',
version='0.6.0',
zip_safe=False,
)
2 changes: 1 addition & 1 deletion xlavir/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = """Peter Kruczkiewicz"""
__email__ = 'peter.kruczkiewicz@gmail.com'
__version__ = '0.5.3'
__version__ = '0.6.0'
103 changes: 94 additions & 9 deletions xlavir/tools/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
from pydantic import BaseModel

from xlavir.qc import QualityRequirements
from xlavir.util import try_parse_number, find_file_for_each_sample

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -131,6 +132,16 @@
'Mean AF',
'Mean/average alternate allele frequency of mutation in all samples.'
),
(
'nt_pos',
'Nucleotide Position',
'Nucleotide position of mutation with respect to reference genome.'
),
(
'aa_pos',
'Amino Acid Position',
'Amino acid position of mutation in reference genome gene.'
)
]

BCFTOOLS_STATS_GLOB_PATTERNS = [
Expand All @@ -156,11 +167,13 @@ class VariantCaller(Enum):
iVar = 'iVar'
Longshot = 'Longshot'
Nanopolish = 'nanopolish'
Medaka = 'medaka'


VCF_GLOB_PATTERNS = [
'**/nanopolish/*.pass.vcf.gz',
'**/ivar/*.vcf.gz',
'**/*.filt.no_fs.vcf',
'**/*.longshot.vcf',
'**/*.vcf',
]
Expand All @@ -169,6 +182,7 @@ class VariantCaller(Enum):
re.compile(r'(\.pass)?\.vcf(\.gz)?$'),
re.compile(r'\.AF0\.\d+(\.filt)?'),
re.compile(r'\.0\.\d+AF(\.filt)?'),
re.compile(r'\.medaka'),
re.compile(r'\.longshot'),
re.compile(r'\.snpeff'),
re.compile(r'\.no_fs'),
Expand Down Expand Up @@ -226,6 +240,8 @@ def read_vcf(vcf_file: Path) -> Tuple[str, pd.DataFrame]:
variant_caller = line.strip().replace('##source=', '')
if line.startswith('##nanopolish'):
variant_caller = 'nanopolish'
if line.startswith('##medaka_version'):
variant_caller = 'medaka'
if line.startswith('#CHROM'):
vcf_cols = line[1:].strip().split('\t')
break
Expand Down Expand Up @@ -253,22 +269,24 @@ def parse_aa(gene: str,
aa_str = aa_str.replace(aa3, aa1)
if aa_str.endswith('DEL'):
aa_str = aa_str.replace('DEL', 'del')
return f'{ref}{nt_pos}{alt}({gene}:{aa_str})'
return f'{gene}:{aa_str} ({ref}{nt_pos}{alt})'
ref_aa, aa_pos_str, alt_aa = m.groups()
ref_aa = get_aa(ref_aa)

if effect == 'stop_lost':
alt_aa = get_aa(alt_aa.replace('ext', ''))
return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa}[stop_lost])'
return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt} [stop_lost])'
if effect == 'frameshift_variant':
return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}[FRAMESHIFT])'
return f'{gene}:{ref_aa}{aa_pos_str} ({ref}{nt_pos}{alt} [FRAMESHIFT])'
if effect == 'conservative_inframe_deletion':
return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa})'
return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt})'
if effect == 'disruptive_inframe_deletion':
return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa}[disruptive_inframe_deletion])'
return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt} [disruptive_inframe_deletion])'

alt_aa = get_aa(alt_aa)
return f'{ref}{nt_pos}{alt}({gene}:{ref_aa}{aa_pos_str}{alt_aa})'
if alt_aa == ref_aa:
return f'{ref}{nt_pos}{alt}'
return f'{gene}:{ref_aa}{aa_pos_str}{alt_aa} ({ref}{nt_pos}{alt})'


def get_aa(s: str) -> str:
Expand Down Expand Up @@ -300,6 +318,16 @@ def simplify_snpsift(df: pd.DataFrame, sample_name: str) -> Optional[pd.DataFram
AF.name = 'AF'
series += [REF_AC, ALT_AC, AF]
continue
if c == 'SR':
df_ac = df[c].str.split(',', expand=True)
REF_AC = df_ac[0].astype(int) + df_ac[1].astype(int)
REF_AC.name = 'REF_AC'
ALT_AC = df_ac[2].astype(int) + df_ac[3].astype(int)
ALT_AC.name = 'ALT_AC'
AF = ALT_AC / (REF_AC + ALT_AC)
AF.name = 'AF'
series += [REF_AC, ALT_AC, AF]
continue
idx = c.find('[*].')
if idx > 0:
new_series_name = c[idx + 4:].lower()
Expand Down Expand Up @@ -413,6 +441,39 @@ def parse_longshot_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd
return df_merge.loc[:, cols_to_keep]


def parse_medaka_vcf(df: pd.DataFrame, sample_name: str = None, qc_reqs: QualityRequirements = None) -> Optional[pd.DataFrame]:
if df.empty:
return None
if not sample_name:
sample_name = df.columns[-1] if df.columns[-1] != 'SAMPLE' else None
if sample_name is None:
raise ValueError(f'Sample name is not defined for VCF: shape={df.shape}; columns={df.columns}')
pos_info_val = {}
for row in df.itertuples():
# if there is no variant INFO available then this isn't the right file
if row.INFO == '.':
return None
infos = parse_vcf_info(row.INFO)
# no DP INFO? skip this file
if 'DP' not in infos:
return None
if infos['DP'] < qc_reqs.low_coverage_threshold:
continue
ac_ref_fwd, ac_ref_rev, ac_alt_fwd, ac_alt_rev = infos['SR']
infos['REF_DP'] = ac_ref_fwd + ac_ref_rev
infos['ALT_DP'] = ac_alt_fwd + ac_alt_rev
pos_info_val[row.POS] = infos
df_medaka_info = pd.DataFrame(pos_info_val).transpose()
df_medaka_info.index.name = 'POS'
df_medaka_info.reset_index(inplace=True)
df_merge = pd.merge(df, df_medaka_info, on='POS')
df_merge['sample'] = sample_name
df_merge = df_merge[df_merge.DP > 0]
df_merge['ALT_FREQ'] = df_merge.ALT_DP / (df_merge.ALT_DP + df_merge.REF_DP)
cols_to_keep = list({col for col, _, _ in variants_cols} & set(df_merge.columns))
return df_merge.loc[:, cols_to_keep]


def parse_nanopolish_vcf(df: pd.DataFrame, sample_name: str = None) -> Optional[pd.DataFrame]:
if df.empty:
return None
Expand Down Expand Up @@ -453,7 +514,7 @@ def parse_vcf_info(s: str) -> dict:
return out


def get_info(basedir: Path) -> Dict[str, pd.DataFrame]:
def get_info(basedir: Path, qc_reqs: QualityRequirements) -> Dict[str, pd.DataFrame]:
sample_vcf = find_file_for_each_sample(basedir=basedir,
glob_patterns=VCF_GLOB_PATTERNS,
sample_name_cleanup=VCF_SAMPLE_NAME_CLEANUP,
Expand All @@ -467,6 +528,12 @@ def get_info(basedir: Path) -> Dict[str, pd.DataFrame]:
sample_dfvcf[sample] = df_parsed_ivar_vcf
else:
logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"')
elif variant_caller.startswith(VariantCaller.Medaka.value):
df_medaka_vcf = parse_medaka_vcf(df_vcf, sample, qc_reqs)
if df_medaka_vcf is not None:
sample_dfvcf[sample] = df_medaka_vcf
else:
logger.warning(f'Sample "{sample}" has no entries in VCF "{vcf_path}"')
elif variant_caller.startswith(VariantCaller.Longshot.value):
df_longshot_vcf = parse_longshot_vcf(df_vcf, sample)
if df_longshot_vcf is not None:
Expand Down Expand Up @@ -521,6 +588,20 @@ def to_dataframe(dfs: Iterable[pd.DataFrame]) -> pd.DataFrame:
return df.rename(columns={x: y for x, y, _ in variants_cols})


def get_nt_position_int(s: str) -> int:
"""Get the nucleotide position from the mutation string
>>> get_nt_position_int('A1879G')
1879
>>> get_nt_position_int('S:Y145_H146del (TATTACC21992T)')
21992
"""
if ':' in s:
return int(re.sub(r'.*\([AGTC]+(\d+).*', r'\1', s))
else:
return int(re.sub(r'[AGTC]+(\d+).*', r'\1', s))


def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame:
df_vars = df.copy()
df_vars.reset_index(inplace=True)
Expand All @@ -530,16 +611,17 @@ def to_variant_pivot_table(df: pd.DataFrame) -> pd.DataFrame:
values='Alternate Allele Frequency',
aggfunc='first',
fill_value=0.0)
nt_positions = [get_nt_position_int(x) for x in df_pivot.columns]
pivot_cols = list(zip(df_pivot.columns,
df_pivot.columns.str.replace(r'[A-Z]+(\d+).*', r'\1').astype(int)))
nt_positions))
pivot_cols.sort(key=itemgetter(1))
return df_pivot[[x for x, y in pivot_cols]]


def to_summary(df: pd.DataFrame) -> pd.DataFrame:
df_vars = df.copy()
df_vars.reset_index(inplace=True)

logger.debug(f'df_vars columns: {df_vars.columns}')
df_summary = df_vars.groupby('Mutation', sort=False).agg(
n_samples=('Sample', 'size'),
samples=('Sample', lambda x: '; '.join(x)),
Expand All @@ -553,5 +635,8 @@ def to_summary(df: pd.DataFrame) -> pd.DataFrame:
min_af=('Alternate Allele Frequency', 'min'),
max_af=('Alternate Allele Frequency', 'max'),
mean_af=('Alternate Allele Frequency', lambda x: sum(x) / len(x)),
nt_pos=('Position', 'first'),
aa_pos=('Amino Acid Position', 'first')
)
df_summary.sort_values('nt_pos', inplace=True)
return df_summary.rename(columns={x: y for x, y, _ in (variant_summary_cols + variants_cols)})
2 changes: 1 addition & 1 deletion xlavir/xlavir.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def run(input_dir: Path,
)
mapping_info.n_total_reads = total_reads
sample_cts = ct.read_ct_table(ct_values_table) if ct_values_table else {}
sample_variants = variants.get_info(input_dir)
sample_variants = variants.get_info(input_dir, qc_reqs=quality_reqs)

dfs: List[ExcelSheetDataFrame] = []
df_stats = qc.create_qc_stats_dataframe(sample_depth_info,
Expand Down

0 comments on commit 3f74811

Please sign in to comment.