Skip to content

Commit 2f5928f

Browse files
authored
Merge pull request #13 from TRON-Bioinformatics/whatshap
Integrate whatshap for phasing and fix bcftools csq
2 parents 6a71584 + 6f59133 commit 2f5928f

File tree

9 files changed

+84
-3
lines changed

9 files changed

+84
-3
lines changed

.github/workflows/automated_tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,4 +26,5 @@ jobs:
2626
run: conda config --show-sources
2727
- name: Run tests
2828
run: |
29+
export NXF_VER=22.04.5
2930
make

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ test:
2929
bash test/scripts/run_test_11.sh
3030
bash test/scripts/run_test_12.sh
3131
bash test/scripts/run_test_13.sh
32+
bash test/scripts/run_test_14.sh

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,25 @@ These annotations are provided by VAFator (https://github.com/TRON-Bioinformatic
268268

269269
No technical annotations are performed if the parameter `--input_bams` is not passed.
270270

271+
## Phasing with WhatsHap
272+
273+
The phase of the mutations can be inferred from the reads pileup.
274+
This information is relevant to determine the impact of nearby mutations.
275+
WhatsHap adds the phasing information when possible to the VCF following the VCF specification for that purpose.
276+
Genotypes with phase information use the `|` instead of the `/` and `0|1` and `1|0` represent the two different phases.
277+
A homozygous mutations in phase is represented as `1|1`.
278+
Furthermore, because phasing may be incomplete, to define the different phased blocks the INFO/PS annotation used.
279+
All mutations belonging to the same phase block share the same PS number.
280+
281+
BAM files need to be provided to perform phasing with `--input_bams` and the phasing sample has to be determined with
282+
`--phasing_sample`.
283+
284+
This approach has some limitations:
285+
- We only support diploid genotyped VCF files. WhatsHap does support polyploid samples. But the particular case of
286+
somatic mutations is not supported.
287+
- The sample name chosen to perform the phasing has to be the same across all VCFs.
288+
Not to be mistaken with the patient name. Eg: only `normal` samples across all patients can be used for phasing
289+
271290
## Functional annotations
272291

273292
The functional annotations provide a biological context for every variant. Such as the overlapping genes or the effect

main.nf

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@ include { FILTER_VCF } from './modules/01_filter'
66
include { BCFTOOLS_NORM; VT_DECOMPOSE_COMPLEX; REMOVE_DUPLICATES } from './modules/02_normalization'
77
include { SUMMARY_VCF; SUMMARY_VCF as SUMMARY_VCF_2 } from './modules/03_summary'
88
include { VAFATOR; MULTIALLELIC_FILTER } from './modules/04_vafator'
9-
include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/05_variant_annotation'
9+
include { WHATSHAP } from './modules/05_phasing'
10+
include { VARIANT_ANNOTATION_SNPEFF; VARIANT_ANNOTATION_BCFTOOLS } from './modules/06_variant_annotation'
11+
1012

1113
params.help= false
1214
params.input_vcfs = false
15+
params.input_bams = false
1316
params.input_vcf = false
1417

1518
// optional VAFator inputs
@@ -32,6 +35,7 @@ params.skip_multiallelic_filter = false
3235
// SnpEff input
3336
params.snpeff_organism = false
3437
params.snpeff_datadir = false
38+
params.phasing_sample = false
3539

3640

3741
if (params.help) {
@@ -136,6 +140,10 @@ workflow {
136140
final_vcfs = MULTIALLELIC_FILTER(final_vcfs)
137141
final_vcfs = MULTIALLELIC_FILTER.out.filtered_vcf
138142
}
143+
if (params.phasing_sample) {
144+
WHATSHAP(final_vcfs.join(input_bams.groupTuple()))
145+
final_vcfs = WHATSHAP.out.phased_vcf
146+
}
139147
}
140148

141149
if (params.snpeff_organism) {

modules/05_phasing.nf

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
params.phasing_sample = false
2+
params.cpus = 1
3+
params.memory = '3g'
4+
5+
6+
process WHATSHAP {
7+
cpus params.cpus
8+
memory params.memory
9+
tag "${patient_name}"
10+
publishDir "${params.output}/${patient_name}", mode: "copy"
11+
12+
conda (params.enable_conda ? "bioconda::whatshap=1.4" : null)
13+
14+
input:
15+
tuple val(patient_name), file(vcf), val(bams)
16+
17+
output:
18+
tuple val(patient_name), file("${patient_name}.phased.vcf"), emit: phased_vcf
19+
20+
script:
21+
phasing_bams = bams.toList().stream()
22+
.filter{ b -> b.split(':')[0] == "${params.phasing_sample}" }
23+
.collect{ b -> b.split(":")[1] }
24+
.join(" ") ;
25+
"""
26+
whatshap phase \
27+
--indel \
28+
-o ${patient_name}.phased.vcf \
29+
--reference=${params.reference} \
30+
${vcf} \
31+
${phasing_bams}
32+
"""
33+
}

modules/05_variant_annotation.nf renamed to modules/06_variant_annotation.nf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ process VARIANT_ANNOTATION_BCFTOOLS {
4949
--fasta-ref ${params.reference} \\
5050
--gff-annot ${params.gff} ${vcf} \\
5151
--output-type v \\
52-
--output ${name}.annotated.vcf
52+
--output ${name}.annotated.vcf \\
53+
--phase a
5354
"""
5455
}

nextflow.config

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ env {
2727
// Capture exit codes from upstream processes when piping
2828
process.shell = ['/bin/bash', '-euo', 'pipefail']
2929

30-
VERSION = '2.5.0'
30+
VERSION = '3.0.0'
3131

3232
manifest {
3333
name = 'TRON-Bioinformatics/tronflow-vcf-postprocessing'
@@ -87,6 +87,7 @@ Optional input:
8787
* --snpeff_memory: for some samples SnpEff may require to use more memory (default: 3g)
8888
* --mapping_quality: VAFator minimum mapping quality (default: 0)
8989
* --base_call_quality: VAFator minimum base call quality (default: 0)
90+
* --phasing-sample: enables phasing with whatshap and defines the sample BAM to phase the VCF (BAMs need to be provided)
9091
9192
Output:
9293
* Normalized VCF file
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
single_sample test/data/test_single_sample.vcf

test/scripts/run_test_14.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
3+
4+
source test/scripts/assert.sh
5+
output_folder=test/output/test14
6+
echo -e "tumor_normal\tprimary:"`pwd`"/test/data/TESTX_S1_L001.bam" > test/data/test_bams.txt
7+
echo -e "tumor_normal\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt
8+
echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt
9+
echo -e "single_sample\ttumor:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt
10+
echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L001.bam" >> test/data/test_bams.txt
11+
echo -e "single_sample\tnormal:"`pwd`"/test/data/TESTX_S1_L002.bam" >> test/data/test_bams.txt
12+
13+
nextflow main.nf -profile test,conda --output $output_folder --input_bams test/data/test_bams.txt \
14+
--phasing_sample normal --input_vcfs test/data/test_input_single_sample.txt
15+
16+
test -s $output_folder/single_sample/single_sample.phased.vcf || { echo "Missing test 10 output file!"; exit 1; }

0 commit comments

Comments
 (0)