From c1661a02454b378d151f4d8ee92ab411d71b3b7a Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Mon, 2 Apr 2018 16:16:25 -0400 Subject: [PATCH] fixes for py2/3 compatibility, move amptk to bin folder hopefully for conda functionality --- bin/amptk | 2 + bin/amptk.py | 1091 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1093 insertions(+) create mode 100755 bin/amptk create mode 100755 bin/amptk.py diff --git a/bin/amptk b/bin/amptk new file mode 100755 index 0000000..c2666f5 --- /dev/null +++ b/bin/amptk @@ -0,0 +1,2 @@ +#!/usr/bin/env python +import amptk \ No newline at end of file diff --git a/bin/amptk.py b/bin/amptk.py new file mode 100755 index 0000000..87bcf1b --- /dev/null +++ b/bin/amptk.py @@ -0,0 +1,1091 @@ +#!/usr/bin/env python + +from __future__ import (absolute_import, division, + print_function, unicode_literals) +import sys +import os +import subprocess +import inspect +import tarfile +import shutil +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) +sys.path.insert(0,parentdir) +import lib.amptklib as amptklib +from natsort import natsorted +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen + +URL = { 'ITS': 'https://osf.io/pbtyh/download?version=4', + '16S': 'https://osf.io/m7v5q/download?version=1', + 'LSU': 'https://osf.io/sqn5r/download?version=1', + 'COI': 'https://osf.io/pax79/download?version=2' } + +def flatten(l): + flatList = [] + for elem in l: + # if an element of a list is a list + # iterate over this list and add elements to flatList + if type(elem) == list: + for e in elem: + flatList.append(e) + else: + flatList.append(elem) + return flatList + +def fmtcols(mylist, cols): + justify = [] + for i in range(0,cols): + length = max(map(lambda x: len(x), mylist[i::cols])) + length += 2 + ljust = map(lambda x: x.ljust(length), mylist[i::cols]) + justify.append(ljust) + justify = flatten(justify) + num_lines = len(mylist) // cols + lines = (' '.join(justify[i::num_lines]) + for i in range(0,num_lines)) + return "\n".join('{}'.format(lines)) + +def download(url, name): + file_name = name + u = urlopen(url) + f = open(file_name, 'wb') + meta = u.info() + file_size = 0 + for x in meta.items(): + if x[0].lower() == 'content-length': + file_size = int(x[1]) + print("Downloading: {0} Bytes: {1}".format(url, file_size)) + file_size_dl = 0 + block_sz = 8192 + while True: + buffer = u.read(block_sz) + if not buffer: + break + file_size_dl += len(buffer) + f.write(buffer) + p = float(file_size_dl) / file_size + status = r"{0} [{1:.2%}]".format(file_size_dl, p) + status = status + chr(8)*(len(status)+1) + sys.stdout.write(status) + f.close() + +git_version = amptklib.git_version() +base_version = '1.1.2' +if git_version: + version = base_version+'-'+git_version +else: + version = base_version + +default_help = """ +Usage: amptk +version: %s + +Description: AMPtk is a package of scripts to process NGS amplicon data. + Dependencies: USEARCH v9.1.13 and VSEARCH v2.2.0 + +Process: ion pre-process Ion Torrent data + illumina pre-process folder of de-multiplexed Illumina data + illumina2 pre-process PE Illumina data from a single file + illumina3 pre-process PE Illumina + index reads (i.e. R1, R2, I) + 454 pre-process Roche 454 (pyrosequencing) data + SRA pre-process single FASTQ per sample data (i.e. SRA data) + +Clustering: cluster cluster OTUs (using UPARSE algorithm) + dada2 dada2 denoising algorithm (requires R, dada2, ShortRead) + unoise2 UNOISE2 denoising algorithm + unoise3 UNOISE3 denoising algorithm + cluster_ref closed/open reference based clustering (EXPERIMENTAL) + +Utilities: filter OTU table filtering + lulu LULU amplicon curation of OTU table + taxonomy Assign taxonomy to OTUs + show show number or reads per barcode from de-multiplexed data + select select reads (samples) from de-multiplexed data + remove remove reads (samples) from de-multiplexed data + sample sub-sample (rarify) de-multiplexed reads per sample + drop Drop OTUs from dataset + stats Hypothesis test and NMDS graphs (EXPERIMENTAL) + summarize Summarize Taxonomy (create OTU-like tables and/or stacked bar graphs) + funguild Run FUNGuild (annotate OTUs with ecological information) + meta pivot OTU table and append to meta data + heatmap Create heatmap from OTU table + SRA-submit De-multiplex data and create meta data for NCBI SRA submission + +Setup: install Download/install pre-formatted taxonomy DB. Only need to run once. + database Format Reference Databases for Taxonomy + primers List primers hard-coded in AMPtk. Can use in pre-processing steps. + +Written by Jon Palmer (2015-2017) nextgenusfs@gmail.com + """ % version + +if len(sys.argv) > 1: + if sys.argv[1] == 'ion': + help = """ +Usage: amptk %s +version: %s + +Description: Script processes Ion Torrent PGM data for AMPtk clustering. The input to this script + should be a FASTQ file obtained from the Torrent Server analyzed with the + `--disable-all-filters` flag to the BaseCaller. This script does the following: + 1) finds Ion barcode sequences, 2) relabels headers with appropriate barcode name, + 3) removes primer sequences, 4) trim/pad reads to a set length. + +Arguments: -i, --fastq,--bam Input BAM or FASTQ file (Required) + -o, --out Output base name. Default: out + -m, --mapping_file QIIME-like mapping file + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence Default: ITS4 + -b, --barcodes Barcodes used (list, e.g: 1,3,4,5,20). Default: all + -n, --name_prefix Prefix for re-naming reads. Default: R_ + -l, --trim_len Length to trim/pad reads. Default: 300 + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --min_len Minimum length read to keep. Default: 100 + --full_length Keep only full length sequences. + --barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa + --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --cpus Number of CPUs to use. Default: all + --mult_samples Combine multiple chip runs, name prefix for chip + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_ion.py') + arguments.insert(0, cmd) + arguments.append('--ion') + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'illumina2': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes Illumina MiSeq data that is not de-multiplexed and has read structure + similar to Ion/454 such that the reads are Read for + clustering using AMPtk. The default behavior is to: 1) merge the PE reads using USEARCH, + 2) find barcodes, 3)find and trim primers, 3) rename reads according to sample name, + 4) trim/pad reads to a set length. This script can also handle dual barcodes + (3' barcodes using the --reverse_barcode option). + +Arguments: -i, --fastq Input FASTQ file (Required) + --reverse Illumina PE reverse reads. + -o, --out Output base name. Default: out + -m, --mapping_file QIIME-like mapping file + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence Default: ITS4 + -n, --name_prefix Prefix for re-naming reads. Default: R_ + -l, --trim_len Length to trim/pad reads. Default: 300 + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --min_len Minimum length read to keep. Default: 100 + --barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa + --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 + --reverse_barcode FASTA file containing 3' barcodes. Default: none + --full_length Keep only full length sequences. + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --merge_method Software to use for PE merging. Default: usearch [usearch,vsearch] + --cpus Number of CPUs to use. Default: all + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_ion.py') + arguments.insert(0, cmd) + arguments.append('--illumina') + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'illumina': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes a folder of Illumina MiSeq data that is already de-multiplexed + and processes it for clustering using AMPtk. The default behavior is to: + 1) merge the PE reads using USEARCH, 2) find and trim primers, 3) rename reads + according to sample name, 4) trim/pad reads to a set length. + +Arguments: -i, --fastq Input folder of FASTQ files (Required) + -o, --out Output folder name. Default: amptk-data + -m, --mapping_file QIIME-like mapping file + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence Default: ITS4 + -l, --trim_len Length to trim/pad reads. Default: 300 + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --min_len Minimum length read to keep. Default: 100 + --full_length Keep only full length sequences. + --reads Paired-end or forward reads. Default: paired [paired, forward] + --read_length Illumina Read length (250 if 2 x 250 bp run). Default: auto detect + --rescue_forward Rescue Forward Reads if PE do not merge, e.g. long amplicons. Default: on [on,off] + --require_primer Require the Forward primer to be present. Default: on [on,off] + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --barcode_mismatch Number of mismatches in barcode to allow. Default: 1 + --cpus Number of CPUs to use. Default: all + --cleanup Remove intermediate files. + --merge_method Software to use for PE merging. Default: usearch [usearch,vsearch] + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_illumina_folder.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'illumina3': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes PE Illumina reads, Index reads, mapping file and processes for + clustering/denoising in AMPtk. The default behavior is to: + 1) merge the PE reads using VSEARCH, 2) filter for Phix, 3) find and trim primers, + 4) rename reads according to sample name, 4) trim/pad reads. + +Arguments: -f, --forward FASTQ R1 (forward) file (Required) + -r, --reverse FASTQ R2 (reverse) file (Required) + -i, --index FASTQ I3 (index) file (Required) + -m, --mapping_file QIIME-like mapping file. + --barcode_fasta Multi-fasta file of barocdes. + -o, --out Output folder name. Default: amptk-data + -l, --trim_len Length to trim/pad reads. Default: 300 + --fwd_primer Forward primer sequence + --rev_primer Reverse primer sequence + --min_len Minimum length read to keep. Default: 100 + --full_length Keep only full length sequences. + --read_length Illumina Read length (250 if 2 x 250 bp run). Default: auto detect + --rescue_forward Rescue Forward Reads if PE do not merge, e.g. long amplicons. Default: on [on,off] + --require_primer Require the Forward primer to be present. Default: off [on,off] + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --barcode_mismatch Number of mismatches in index (barcodes) to allow. Default: 2 + --barcode_rev_comp Reverse complement barcode sequences in mapping file. + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --cpus Number of CPUs to use. Default: all + --cleanup Remove intermediate files. + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_illumina_raw.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == '454': + help = """ +Usage: amptk %s +version: %s + +Description: Script processes Roche 454 data for AMPtk clustering. The input to this script + should be either a SFF file, FASTA+QUAL files, or FASTQ file. This script does + the following: 1) finds barcode sequences, 2) relabels headers with appropriate + barcode name, 3) removes primer sequences, 4) trim/pad reads to a set length. + +Arguments: -i, --sff, --fasta Input file (SFF, FASTA, or FASTQ) (Required) + -q, --qual QUAL file (Required if -i is FASTA). + -o, --out Output base name. Default: out + -m, --mapping_file QIIME-like mapping file + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence Default: ITS4 + -n, --name_prefix Prefix for re-naming reads. Default: R_ + -l, --trim_len Length to trim/pad reads. Default: 250 + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --min_len Minimum length read to keep. Default: 50 + --barcode_fasta FASTA file containing barcodes. (Required) + --reverse_barcode FASTA file containing 3' barcodes. Default: none + --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --cpus Number of CPUs to use. Default: all + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_ion.py') + arguments.insert(0, cmd) + arguments.append('--454') + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'SRA': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes a folder of FASTQ files in a format you would get from NCBI SRA, i.e. + there is one FASTQ file for each sample. Reads will be named according to sample name + and workflow is 1) find and trim primers, 2) rename reads according to filename, + and 3) trim/pad reads to a set length (optional). + +Arguments: -i, --fastq Input folder of FASTQ files (Required) + -o, --out Output folder name. Default: amptk-data + -m, --mapping_file QIIME-like mapping file + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence Default: ITS4 + -l, --trim_len Length to trim/pad reads. Default: 250 + -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] + --min_len Minimum length read to keep. Default: 50 + --full_length Keep only full length sequences. + --require_primer Require the Forward primer to be present. Default: on [on,off] + --primer_mismatch Number of mismatches in primers to allow. Default: 2 + --cpus Number of CPUs to use. Default: all + --cleanup Remove intermediate files. + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-process_illumina_folder.py') + arguments.insert(0, cmd) + arguments.append('--sra') + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'cluster' or sys.argv[1] == 'uparse': + help = """ +Usage: amptk %s +version: %s + +Description: Script is a "wrapper" for the UPARSE algorithm. FASTQ quality trimming via expected + errors and Dereplication are run in vsearch if installed otherwise defaults to Python + which allows for the use of datasets larger than 4GB. + Chimera filtering and UNOISE are also options. + +Arguments: -i, --fastq Input FASTQ file (Required) + -o, --out Output base name. Default: out + -e, --maxee Expected error quality trimming. Default: 1.0 + -p, --pct_otu OTU Clustering Radius (percent). Default: 97 + -m, --minsize Minimum size to keep (singleton filter). Default: 2 + --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] + --map_filtered Map quality filtered reads back to OTUs. Default: off + --unoise Run De-noising pre-clustering (UNOISE). Default: off + --debug Keep intermediate files. + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-OTU_cluster.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'cluster_ref': + help = """ +Usage: amptk %s +version: %s + +Description: Script first quality filters reads, dereplicates, and then runs chimera + filtering. OTUs are then picked via reference based clustering (closed) + those that are > --id. The rest of the data can then be clustered via + de novo UPARSE and then reference clustered using UTAX. EXPERIMENTAL + +Arguments: -i, --fastq Input FASTQ file (Required) + -d, --db Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]. (Required) + -o, --out Output base name. Default: out + -e, --maxee Expected error quality trimming. Default: 1.0 + -p, --pct_otu OTU Clustering Radius (percent). Default: 97 + -m, --minsize Minimum size to keep (singleton filter). Default: 2 + --id Percent ID for closed reference clustering. Default: 97 + --utax_db UTAX formatted DB. + --utax_level UTAX Taxonomy level to keep. Default: k [k,p,c,o,f,g,s] + --utax_cutoff UTAX confidence value threshold. Default: 0.8 [0 to 0.9] + --mock Mock community fasta file + --closed_ref_only Run only closed reference clustering. + --map_filtered Map quality filtered reads back to OTUs. Default: off + --debug Keep intermediate files. + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-OTU_cluster_ref.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'dada2': + help = """ +Usage: amptk %s +version: %s + +Description: Script is a "wrapper" for the DADA2 pipeline. It will "pick OTUs" based on denoising + the data for each read predicting the original sequence. This pipeline is sensitive to + 1 bp differences between sequences. Since most reference databases classify "species" + at 97%% threshold, the inferred sequences (iSeqs) from DADA2 are then clusterd at --pct_otu + to create OTUs. Both results are saved. Requires R packages: dada2, ShortRead + +Arguments: -i, --fastq Input FASTQ file (Required) + -o, --out Output base name. Default: dada2 + -m, --min_reads Minimum number of reads per sample. Default: 10 + -l, --length Length to trim reads. + -e, --maxee Expected error quality trimming. Default: 1.0 + -p, --pct_otu OTU Clustering Radius (percent). Default: 97 + --platform Sequencing platform. [ion, illumina, 454]. Default: ion + --pool Pool all samples together for DADA2. Default: off + --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] + --debug Keep intermediate files. + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-dada2.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'unoise2': + help = """ +Usage: amptk %s +version: %s + +Description: Script will run the UNOISE2 denoising algorithm followed by clustering with + UCLUST to generate OTUs. OTU table is then constructed by mapping reads to + the OTUs. Requires USEARCH v9.0.232 or greater. + +Arguments: -i, --fastq Input FASTQ file (Required) + -o, --out Output base name. Default: out + -e, --maxee Expected error quality trimming. Default: 1.0 + -m, --minsize Minimum size to keep for denoising. Default: 8 + -p, --pct_otu OTU Clustering Radius (percent). Default: 97 + -u, --usearch Path to USEARCH9. Default: usearch9 + --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] + --debug Keep intermediate files. + """ % (sys.argv[1], version) + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-unoise2.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'unoise3': + help = """ +Usage: amptk %s +version: %s + +Description: Script will run the UNOISE3 denoising algorithm followed by clustering with + UCLUST to generate OTUs. OTU table is then constructed by mapping reads to + the OTUs. Requires USEARCH v10.0.240 or greater. + +Arguments: -i, --fastq Input FASTQ file (Required) + -o, --out Output base name. Default: out + -e, --maxee Expected error quality trimming. Default: 1.0 + -m, --minsize Minimum size to keep for denoising. Default: 8 + -p, --pct_otu OTU Clustering Radius (percent). Default: 97 + -u, --usearch Path to USEARCH9. Default: usearch9 + --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] + --debug Keep intermediate files. + """ % (sys.argv[1], version) + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-unoise3.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'lulu' or sys.argv[1] == 'LULU': + help = """ +Usage: amptk %s +version: %s + +Description: Script is a wrapper for the LULU OTU table post-clustering curation of amplicon + data. The script calculates pairwise identity between the OTUs and then filters + the OTU table based on whether closely related OTUs that share the same/similar + distributions in the data are "daughters" of the "parent" OTU. Requires R and the + LULU R package. doi:10.1038/s41467-017-01312-x + +Arguments: -i, --otu_table Input OTU table (Required) + -f, --fasta Input OTUs in FASTA format (Required) + -o, --out Output base name. Default: input basename + --min_ratio_type Minimum ratio threshold. Default: min [min,avg] + --min_ratio Minimum ratio. Default: 1 + --min_match Minimum match pident (%%). Default: 84 + --min_relative_cooccurence Minimum relative co-occurance (%%): Default: 95 + --debug Keep intermediate files. + """ % (sys.argv[1], version) + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-lulu.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'filter': + help = """ +Usage: amptk %s +version: %s + +Description: Script filters OTU table generated from the `amptk cluster` command and should + be run on all datasets to combat barcode-switching or index-bleed (as high as + 2%% in MiSeq datasets, ~ 0.3%% in Ion PGM datasets). This script works best when + a spike-in control sequence is used, e.g. Synthetic Mock, although a mock is not required. + +Required: -i, --otu_table OTU table + -f, --fasta OTU fasta + +Optional: -o, --out Base name for output files. Default: use input basename + -b, --mock_barcode Name of barcode of mock community (Recommended) + -m, --mc Mock community FASTA file. Required if -b passed. [synmock,mock1,mock2,mock3,other] + -c, --calculate Calculate index-bleed options. Default: all [in,all] + -d, --drop Sample(s) to drop from OTU table. (list, separate by space) + --negatives Negative sample names. (list, separate by space) + --ignore Ignore sample(s) during index-bleed calc (list, separate by space) + +Filtering -n, --normalize Normalize reads to number of reads per sample [y,n]. Default: y + -p, --index_bleed Filter index bleed between samples (percent). Default: 0.005 + -t, --threshold Number to use for establishing read count threshold. Default: max [max,sum,top5,top10,top25] + -s, --subtract Threshold to subtract from all OTUs (any number or auto). Default: 0 + --delimiter Delimiter of OTU tables. Default: tsv [csv, tsv] + --min_reads_otu Minimum number of reads for valid OTU from whole experiment. Default: 2 + --min_samples_otu Minimum number of samples for valid OTU from whole experiment. Default: 1 + --col_order Column order (separate by space). Default: sort naturally + --keep_mock Keep Spike-in mock community. Default: False + --show_stats Show OTU stats on STDOUT + --debug Keep intermediate files. + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-filter.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'select': + help = """ +Usage: amptk %s +version: %s + +Description: Script filters de-multiplexed data (.demux.fq) to select only reads from samples + provided in a text file, one name per line or pass a list to keep to --list. + +Required: -i, --input Input FASTQ file (.demux.fq) + -t, --threshold Keep samples with read count greater than -t + -l, --list List of sample (barcode) names to keep, separate by space + -f, --file List of sample (barcode) names to keep in a file, one per line + -o, --out Output file name + --format File format for output file. Default: fastq [fastq, fasta] + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-keep_samples.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'remove': + help = """ +Usage: amptk %s +version: %s + +Description: Script filters de-multiplexed data (.demux.fq) to remove only reads from samples provided + in a text file, one name per line. + +Required: -i, --input Input FASTQ file (.demux.fq) + -t, --threshold Keep samples with read count greater than -t + -l, --list List of sample (barcode) names to remove, separate by space + -f, --file List of sample (barcode) names to remove in a file, one per line + -o, --out Output file name + --format File format for output file. Default: fastq [fastq, fasta] + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-remove_samples.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'sample': + help = """ +Usage: amptk %s +version: %s + +Description: Script sub-samples (rarifies) de-multiplexed data to equal number of reads per + sample. For community analysis, this might not be appropriate as you are ignoring + a portion of your data, however, there might be some applications where it is useful. + +Required: -i, --input Input FASTQ file + -n, --num_reads Number of reads to sub-sample to + -o, --out Output FASTQ file name + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-barcode_rarify.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'meta': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes meta data file in CSV format (e.g. from excel) and an OTU table as input. + The first column of the meta data file must match the OTU table sample headers exactly. + It then pivots the OTU table and appends it to the meta data file. + +Required: -i, --input Input OTU table + -m, --meta Meta data table (csv format) + -o, --out Output (meta data + pivotted OTU table) + --split_taxonomy Make separate tables for groups of taxonomy [k,p,c,o,f,g] + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-merge_metadata.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'show': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes de-multiplexed data (.demux.fq) as input and counts reads per barcode. + +Required: -i, --input Input FASTQ file (.demux.fq) + --quality_trim Quality trim reads + -e, --maxee maxEE threshold for quality. Default: 1.0 + -l, --length truncation length for trimming: Default: 250 + -o, --out Output FASTQ file name (--quality_trim only) + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-get_barcode_counts.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'funguild': + help = """ +Usage: amptk %s +version: %s + +Description: Script takes OTU table as input and runs FUNGuild to assing functional annotation to an OTU + based on the Guilds database. Guilds script written by Zewei Song (2015). + +Options: -i, --input Input OTU table + -d, --db Database to use [fungi, nematode]. Default: fungi + -o, --out Output file basename. + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'Guilds_v1.0.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + + elif sys.argv[1] == 'heatmap': + help = """ +Usage: amptk %s +version: %s + +Description: Script creates a heatmap from an OTU table. Several settings are customizable. + Requires Seaborn, matplotlib, numpy, and pandas. + +Arguments: -i, --input Input OTU table (Required) + -o, --output Output file (Required) + -m, --method Type of heatmap. Default: clustermap [clustermap,heatmap] + -d, --delimiter Delimiter of OTU table. Default: tsv [tsv,csv] + -f, --format Figure format. Default: pdf [pdf,jpg,svg,png] + --font Font set. Default: arial + --color Color Palette. Default: gist_gray_r + --figsize Figure size. Default: 2x8 + --annotate Annotate heatmap with values. + --distance_metric Distance metric to use for clustermap. Default: braycurtis + --cluster_columns Cluster the columns (samples). Default: False [True,False] + --cluster_method Clustering method for clustermap. Default: single [single,complete,average,weighted] + --scaling Scale the data by row. Default: None [None, z_score, standard] + --yaxis_fontsize Y-Axis Font Size. Default: 6 + --xaxis_fontsize X-Axis Font Size. Default: 6 + --normalize Normalize data based total, tsv file IDcount + --normalize_counts Value to normalize counts to, i.e. 100000 + --vmax Maximum value for heatmap coloration. + --debug Print pandas table on import to terminal + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'csv2heatmap.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'drop': + help = """ +Usage: amptk %s +version: %s + +Description: Script drops OTUs from dataset and outputs new OTU table + +Required: -i, --input Input OTU file (.cluster.otus.fa) (FASTA) + -r, --reads Demultiplexed reads (.demux.fq) (FASTQ) + -l, --list List of OTU names to remove, separate by space + -f, --file List of OTU names to remove in a file, one per line + -o, --out Output file name. Default: amptk-drop + """ % (sys.argv[1], version) + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-drop.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'taxonomy': + db_list = ['DB_name', 'DB_type', 'FASTA originated from', 'Fwd Primer', 'Rev Primer', 'Records'] + okay_list = [] + search_path = os.path.join(parentdir, 'DB') + for file in os.listdir(search_path): + if file.endswith(".udb"): + okay_list.append(file) + info_file = file + '.txt' + with open(os.path.join(search_path, info_file), 'rU') as info: + line = info.readlines() + line = [words for segments in line for words in segments.split()] + line.insert(0, file) + db_list.append(line) + if len(db_list) < 7: + db_print = "No DB configured, run 'amptk database' or 'amptk install' command." + else: + d = flatten(db_list) + db_print = fmtcols(d, 6) + + help = """ +Usage: amptk %s +version: %s + +Description: Script maps OTUs to taxonomy information and can append to an OTU table (optional). + By default the script uses a hybrid approach, e.g. gets taxonomy information from + SINTAX, UTAX, and global alignment hits from the larger UNITE-INSD database, and + then parses results to extract the most taxonomy information that it can at 'trustable' + levels. SINTAX/UTAX results are used if BLAST-like search pct identity is less than 97%%. + If %% identity is greater than 97%%, the result with most taxonomy levels is retained. + +Arguments: -f, --fasta Input FASTA file (i.e. OTUs from amptk cluster) (Required) + -i, --otu_table Input OTU table file (i.e. otu_table from amptk cluster) + -o, --out Base name for output file. Default: amptk-taxonomy..txt + -d, --db Select Pre-installed database [ITS1, ITS2, ITS, 16S, LSU, COI]. Default: ITS2 + -m, --mapping_file QIIME-like mapping file + -t, --taxonomy Taxonomy calculated elsewhere. 2 Column file. + --method Taxonomy method. Default: hybrid [utax, sintax, usearch, hybrid, rdp, blast] + --add2db Add FASTA files to DB on the fly. + --fasta_db Alternative database of fasta sequenes to use for global alignment. + --utax_db UTAX formatted database. Default: ITS2.udb [See configured DB's below] + --utax_cutoff UTAX confidence value threshold. Default: 0.8 [0 to 0.9] + --usearch_db USEARCH formatted database. Default: USEARCH.udb + --usearch_cutoff USEARCH threshold percent identity. Default 0.7 + --sintax_cutoff SINTAX confidence value threshold. Default: 0.8 [0 to 0.9] + -r, --rdp Path to RDP Classifier. Required if --method rdp + --rdp_db RDP Classifer DB set. [fungalits_unite, fungalits_warcup. fungallsu, 16srrna] + --rdp_cutoff RDP Classifer confidence value threshold. Default: 0.8 [0 to 1.0] + --local_blast Local Blast database (full path) Default: NCBI remote nt database + --tax_filter Remove OTUs from OTU table that do not match filter, i.e. Fungi to keep only fungi. + -u, --usearch USEARCH executable. Default: usearch9 + --debug Keep intermediate files + +Databases Configured: +%s + """ % (sys.argv[1], version, db_print) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-assign_taxonomy.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'database': + help = """ +Usage: amptk %s +version: %s + +Description: Setup/Format reference database for amptk taxonomy command. + +Arguments: -i, --fasta Input FASTA file + -o, --out Base Name for Output Files. Default: DB of amptk folder + -f, --fwd_primer Forward primer. Default: fITS7 + -r, --rev_primer Reverse primer. Default: ITS4 + --format Reformat FASTA headers to UTAX format. Default: unite2utax [unite2utax, rdp2utax, off] + --drop_ns Removal sequences that have > x N's. Default: 8 + --create_db Create a DB. Default: usearch [utax, usearch] + --skip_trimming Keep full length sequences. Default: off + --derep_fulllength Remove identical sequences. + --lca Run LCA (last common ancestor) on taxonomy if dereplicating sequences. + --min_len Minimum length to keep. Default: 100 + --max_len Maximum length to keep. Default: 1200 + --trunclen Truncate records to length. + --subsample Random subsample reads. + --primer_mismatch Max Primer Mismatch. Default: 2 + --keep_all Keep Sequence if forward primer not found. + --utax_trainlevels UTAX custom parameters. Default: kpcofgs + --utax_splitlevels UTAX custom parameters. Default: NVkpcofgs + --cpus Number of CPUs to use. Default: all + -u, --usearch USEARCH executable. Default: usearch9 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-extract_region.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + try: + outLocation = arguments.index('-o') + except ValueError: + outLocation = arguments.index('--out') + outLocation = outLocation + 1 + arguments[outLocation] = os.path.join(parentdir, 'DB', arguments[outLocation]) + subprocess.call(arguments) + + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'summarize': + help = """ +Usage: amptk %s +version: %s + +Description: Script traverses the taxonomy information and creates an OTU table for each + level of taxonomy, i.e. Kingdom, Phylum, Class, etc. Optionally, it will + create a Stacked Bar Graph for each taxonomy levels for each sample. Requires + Matplotlib, numpy, and pandas. + +Arguments: -i, --table OTU Table containing Taxonomy information (Required) + -o, --out Base name for output files. Default: amptk-summary + --graphs Create stacked Bar Graphs. + --format Image output format. Default: eps [eps, svg, png, pdf] + --percent Convert numbers to Percent for Graphs. Default: off + --font_size Adjust font size for X-axis sample lables. Default: 8 + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-summarize_taxonomy.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'install': + help = """ +Usage: amptk %s +version: %s + +Description: Script downloads pre-formated databases for use with the `amptk taxonomy` + command. You can download databases for fungal ITS, bacterial 16S, fungal + LSU, or arthropod/chordate COI amplicons. + +Arguments: -i Install Databases. Choices: ITS, 16S, LSU, COI + --force Over-write existing databases + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) < 1: + print(help) + sys.exit(1) + else: + if '-i' in arguments: + arguments.remove('-i') + if len(arguments) < 1: + print(help) + sys.exit(1) + for x in arguments: + if os.path.isfile(os.path.join(parentdir, 'DB', x+'.udb')): + if not '--force' in arguments: + print("A formated database was found, to overwrite use '--force'. You can add more custom databases by using the `amptk database` command.") + sys.exit(1) + #download + if not x in URL: + if x == '--force': + continue + print("%s not valid, choices are ITS, 16S, LSU, COI" % x) + sys.exit(1) + print("Downloading %s pre-formatted database" % x) + address = URL.get(x) + download(address, x+'.amptk.tar.gz') + tfile = tarfile.open(x+'.amptk.tar.gz', 'r:gz') + tfile.extractall(x) + for file in os.listdir(x): + shutil.move(os.path.join(x,file), os.path.join(parentdir, 'DB', file)) + shutil.rmtree(x) + os.remove(x+'.amptk.tar.gz') + print("%s taxonomy database installed" % x) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'SRA-submit': + help = """ +Usage: amptk %s +version: %s + +Description: Script aids in submitted your data to NCBI Sequence Read Archive (SRA) by splitting + FASTQ file from Ion, 454, or Illumina by barcode sequence into separate files for + submission to SRA. This ensures your data is minimally processed as only barcodes + are removed. However, you can assert that primers must be found in order for + sequences to be processed. Additionally, you can pass the --biosample argument + with an NCBI biosample tab-delimited file and the script will auto-populate an + SRA submission file. + +Arguments: -i, --input Input FASTQ file or folder (Required) + -o, --out Output base name. Default: sra + -m, --mapping_file QIIME-like mapping file. + -b, --barcode_fasta Mulit-fasta file containing barcodes used. + -s, --biosample BioSample worksheet from NCBI (from confirmation email) + -p, --platform Sequencing platform. Defalt: ion (ion, illumina, 454) + -n, --names CSV name mapping file, e.g. BC_1,NewName + -d, --description Paragraph description for SRA experimental design. Use quotes to wrap paragraph. + -f, --fwd_primer Forward primer sequence. Default: fITS7 + -r, --rev_primer Reverse primer sequence. Default: ITS4 + -a, --append Append a name to the output of all files in run, i.e. run1 -> Sample_run1 + --primer_mismatch Number of mismatches allowed in primer search. Default: 2 + --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 + --require_primer Require primer(s) to be present for output. Default: off [off,forward,both] + --min_len Minimum length read to keep after trimming barcodes. Default 50 + ---force Overwrite directory with same name + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'bin', 'amptk-fastq2sra.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'stats': + help = """ +Usage: amptk %s +version: %s + +Description: A wrapper script for Phyloseq and Vegan R packages that draws NMDS of all + treatments in a BIOM file (output from amptk taxonomy). The script also runs + hypothesis tests (Adonis and Betadispersion) for each treatment. + +Arguments: -i, --biom Input BIOM file with taxonomy and metadata (Required) + -t, --tree Phylogeny of OTUs (from amptk taxonomy) (Required) + -d, --distance Distance metric. Default: raupcrick [raupcrick,jaccard,bray,unifrac,wunifrac] + -o, --out Output base name. Default: amptk_stats + --indicator_species Run indicator species analysis + --ignore_otus Drop OTUs from table before running stats + """ % (sys.argv[1], version) + + arguments = sys.argv[2:] + if len(arguments) > 1: + cmd = os.path.join(parentdir, 'util', 'amptk-stats.py') + arguments.insert(0, cmd) + exe = sys.executable + arguments.insert(0, exe) + subprocess.call(arguments) + else: + print(help) + sys.exit(1) + elif sys.argv[1] == 'primers': + print("----------------------------------") + print("Primers hard-coded into AMPtk:") + print("----------------------------------") + for k,v in natsorted(amptklib.primer_db.items()): + print(k.ljust(13) + v) + print("----------------------------------") + sys.exit(1) + elif sys.argv[1] == 'version' or sys.argv[1] == '--version' or sys.argv[1] == '-version' or sys.argv[1] == '-v': + print("AMPtk v%s" % version) + elif sys.argv[1] == 'citation' or sys.argv[1] == '-citation' or sys.argv[1] == '--citation': + print("\nPalmer JM, Jusino MA, Banik MT, Lindner DL. 2017. Non-biological synthetic spike-in controls and the\n\tAMPtk software pipeline improve mycobiome data. bioRxiv 213470; doi: 10.1101/213470\n") + else: + print("%s option not recognized" % sys.argv[1]) + print(default_help) + sys.exit(1) + +else: + print(default_help) + \ No newline at end of file