diff --git a/CHANGELOG.md b/CHANGELOG.md index 6eacc042..dac884c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v2.0.1 - [2024-07-15] +Version 2.0.1 fixes a minor bug in which all-numeric sample names (e.g. 231301) were not being compared to the nextclade output for frameshift reporting and adjustments. It also updates the default nextclade dataset tag to 'latest' + +### `Changed`: +- Bugfix in `qc.py` to allow all-numeric sample names to be properly compared to the nextclade output for frameshift reporting +- Default nextclade dataset tag (`nextclade_tag`) set to 'latest' + ## v2.0.0 - [2024-04-25] Overall version 2.0.0 has the all the same outputs as version 1.1.0 but with some adjustments to the output locations and the input parameter names. This makes this release incompatible with previous automation unfortunately but it is ultimately for a more robust and easier to run/develop pipeline diff --git a/README.md b/README.md index f517d4e5..e91aeb59 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,13 @@ This Nextflow pipeline automates the ARTIC network [nCoV-2019 novel coronavirus ### Release Notes For full changes visit the [CHANGELOG](CHANGELOG.md) +#### *v2.0.1* +Version 2.0.1 fixes a minor bug in which all-numeric sample names (e.g. 231301) were not being compared to the nextclade output for frameshift reporting and adjustments. It also updates the default nextclade dataset tag to 'latest' + +`Changed`: +- Bugfix in `qc.py` to allow all-numeric sample names to be properly compared to the nextclade output for frameshift reporting +- Default nextclade dataset tag (`nextclade_tag`) set to 'latest' + #### *v2.0.0* Overall version 2.0.0 has the all the same outputs as version 1.1.0 but with some adjustments to the output locations and the input parameter names. 
This makes this release incompatible with previous automation unfortunately but it is ultimately for a more robust and easier to run/develop pipeline diff --git a/bin/qc.py b/bin/qc.py index a91fe3d0..877c4de8 100755 --- a/bin/qc.py +++ b/bin/qc.py @@ -343,14 +343,14 @@ def parse_ncov_tsv(file_in, sample, negative=False): return negative_df -def compare_nextclade_fs_to_ncovtools_fs(sample: str, next_df: pd.DataFrame, ncov_df: pd.DataFrame) -> None: +def compare_nextclade_fs_to_ncovtools_fs(sample: str, nextclade_df: pd.DataFrame, ncov_df: pd.DataFrame) -> None: ''' Parse the nextclade dataframe for the presence of frameshift indels and update the qc_pass flag in the ncov summary df if they do not match INPUTS: - sample --> `str` sample name from input - next_df --> `df` from nextclade - ncov_df --> `df` Parsed ncov-tools summary df + sample --> `str` sample name from input + nextclade_df --> `df` from nextclade + ncov_df --> `df` Parsed ncov-tools summary df ''' # Adding in a column for tracking if correction occured ncov_df.reset_index(inplace=True, drop=True) @@ -358,14 +358,14 @@ def compare_nextclade_fs_to_ncovtools_fs(sample: str, next_df: pd.DataFrame, nco # Filter down nextclade df to just the wanted sample # It should only be 1 sample but just in case - next_df = next_df.loc[next_df['seqName'] == sample] - if next_df.empty: + nextclade_df = nextclade_df.loc[nextclade_df['seqName'] == sample] + if nextclade_df.empty: return # Determine if there are any non-ignored frameshifts # Both df are 1 line now so can just pull the first value - total_fs = next_df['qc.frameShifts.totalFrameShifts'].values[0] - ignored_fs = next_df['qc.frameShifts.totalFrameShiftsIgnored'].values[0] + total_fs = nextclade_df['qc.frameShifts.totalFrameShifts'].values[0] + ignored_fs = nextclade_df['qc.frameShifts.totalFrameShiftsIgnored'].values[0] ncov_qc_value_list = ncov_df['qc_pass'].values[0].split(';') # If its not in the list we don't worry @@ -495,8 +495,10 @@ def go(args): 
negative_df = parse_ncov_tsv(args.ncov_negative, args.sample, negative=True) # Nextclade double check of fs mutations - next_df = pd.read_csv(args.nextclade_tsv, sep='\t') - compare_nextclade_fs_to_ncovtools_fs(args.sample, next_df, summary_df) + nextclade_df = pd.read_csv(args.nextclade_tsv, sep='\t') + # Convert the seqName column type to string in case of all integer sample names + nextclade_df = nextclade_df.astype({'seqName': 'str'}) + compare_nextclade_fs_to_ncovtools_fs(args.sample, nextclade_df, summary_df) # If we have a samplesheet, use its values to create final output if args.sample_sheet: diff --git a/nextflow.config b/nextflow.config index 63c262fb..a6e1fe7a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ params { //- Nextclade nextclade_dataset = "sars-cov-2" - nextclade_tag = "2024-04-15--15-08-22Z" + nextclade_tag = "latest" //- Metadata and IRIDA Uploads //-- Metadata is supplied with `--irida metadata.tsv` and requires specific columns. See README