From cc0c139100f4ebc7adf2cc85fde67cabb1f659ad Mon Sep 17 00:00:00 2001 From: Jon Palmer Date: Mon, 2 Apr 2018 16:16:09 -0400 Subject: [PATCH] fixes for py2/3 compatibility, move amptk to bin folder hopefully for conda functionality --- amptk | 2 - amptk.py | 1090 ---------------------------------- bin/amptk-assign_taxonomy.py | 2 +- bin/amptk-unoise3.py | 2 +- util/Guilds_v1.0.py | 230 ++++--- 5 files changed, 114 insertions(+), 1212 deletions(-) delete mode 100755 amptk delete mode 100755 amptk.py diff --git a/amptk b/amptk deleted file mode 100755 index c2666f5..0000000 --- a/amptk +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env python -import amptk \ No newline at end of file diff --git a/amptk.py b/amptk.py deleted file mode 100755 index 1bc90da..0000000 --- a/amptk.py +++ /dev/null @@ -1,1090 +0,0 @@ -#!/usr/bin/env python - -from __future__ import (absolute_import, division, - print_function, unicode_literals) -import sys -import os -import subprocess -import inspect -import tarfile -import shutil -script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) -sys.path.insert(0,script_path) -import lib.amptklib as amptklib -from natsort import natsorted -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - -URL = { 'ITS': 'https://osf.io/pbtyh/download?version=4', - '16S': 'https://osf.io/m7v5q/download?version=1', - 'LSU': 'https://osf.io/sqn5r/download?version=1', - 'COI': 'https://osf.io/pax79/download?version=2' } - -def flatten(l): - flatList = [] - for elem in l: - # if an element of a list is a list - # iterate over this list and add elements to flatList - if type(elem) == list: - for e in elem: - flatList.append(e) - else: - flatList.append(elem) - return flatList - -def fmtcols(mylist, cols): - justify = [] - for i in range(0,cols): - length = max(map(lambda x: len(x), mylist[i::cols])) - length += 2 - ljust = map(lambda x: x.ljust(length), mylist[i::cols]) - justify.append(ljust) - justify = flatten(justify) - num_lines = len(mylist) // cols - lines = (' '.join(justify[i::num_lines]) - for i in range(0,num_lines)) - return "\n".join('{}'.format(lines)) - -def download(url, name): - file_name = name - u = urlopen(url) - f = open(file_name, 'wb') - meta = u.info() - file_size = 0 - for x in meta.items(): - if x[0].lower() == 'content-length': - file_size = int(x[1]) - print("Downloading: {0} Bytes: {1}".format(url, file_size)) - file_size_dl = 0 - block_sz = 8192 - while True: - buffer = u.read(block_sz) - if not buffer: - break - file_size_dl += len(buffer) - f.write(buffer) - p = float(file_size_dl) / file_size - status = r"{0} [{1:.2%}]".format(file_size_dl, p) - status = status + chr(8)*(len(status)+1) - sys.stdout.write(status) - f.close() - -git_version = amptklib.git_version() -base_version = '1.1.2' -if git_version: - version = base_version+'-'+git_version -else: - version = base_version - -default_help = """ -Usage: amptk -version: %s - -Description: AMPtk is a package of scripts to process NGS amplicon data. - Dependencies: USEARCH v9.1.13 and VSEARCH v2.2.0 - -Process: ion pre-process Ion Torrent data - illumina pre-process folder of de-multiplexed Illumina data - illumina2 pre-process PE Illumina data from a single file - illumina3 pre-process PE Illumina + index reads (i.e. R1, R2, I) - 454 pre-process Roche 454 (pyrosequencing) data - SRA pre-process single FASTQ per sample data (i.e. 
SRA data) - -Clustering: cluster cluster OTUs (using UPARSE algorithm) - dada2 dada2 denoising algorithm (requires R, dada2, ShortRead) - unoise2 UNOISE2 denoising algorithm - unoise3 UNOISE3 denoising algorithm - cluster_ref closed/open reference based clustering (EXPERIMENTAL) - -Utilities: filter OTU table filtering - lulu LULU amplicon curation of OTU table - taxonomy Assign taxonomy to OTUs - show show number or reads per barcode from de-multiplexed data - select select reads (samples) from de-multiplexed data - remove remove reads (samples) from de-multiplexed data - sample sub-sample (rarify) de-multiplexed reads per sample - drop Drop OTUs from dataset - stats Hypothesis test and NMDS graphs (EXPERIMENTAL) - summarize Summarize Taxonomy (create OTU-like tables and/or stacked bar graphs) - funguild Run FUNGuild (annotate OTUs with ecological information) - meta pivot OTU table and append to meta data - heatmap Create heatmap from OTU table - SRA-submit De-multiplex data and create meta data for NCBI SRA submission - -Setup: install Download/install pre-formatted taxonomy DB. Only need to run once. - database Format Reference Databases for Taxonomy - primers List primers hard-coded in AMPtk. Can use in pre-processing steps. - -Written by Jon Palmer (2015-2017) nextgenusfs@gmail.com - """ % version - -if len(sys.argv) > 1: - if sys.argv[1] == 'ion': - help = """ -Usage: amptk %s -version: %s - -Description: Script processes Ion Torrent PGM data for AMPtk clustering. The input to this script - should be a FASTQ file obtained from the Torrent Server analyzed with the - `--disable-all-filters` flag to the BaseCaller. This script does the following: - 1) finds Ion barcode sequences, 2) relabels headers with appropriate barcode name, - 3) removes primer sequences, 4) trim/pad reads to a set length. - -Arguments: -i, --fastq,--bam Input BAM or FASTQ file (Required) - -o, --out Output base name. Default: out - -m, --mapping_file QIIME-like mapping file - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence Default: ITS4 - -b, --barcodes Barcodes used (list, e.g: 1,3,4,5,20). Default: all - -n, --name_prefix Prefix for re-naming reads. Default: R_ - -l, --trim_len Length to trim/pad reads. Default: 300 - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --min_len Minimum length read to keep. Default: 100 - --full_length Keep only full length sequences. - --barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa - --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 - --primer_mismatch Number of mismatches in primers to allow. Default: 2 - --cpus Number of CPUs to use. Default: all - --mult_samples Combine multiple chip runs, name prefix for chip - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_ion.py') - arguments.insert(0, cmd) - arguments.append('--ion') - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'illumina2': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes Illumina MiSeq data that is not de-multiplexed and has read structure - similar to Ion/454 such that the reads are Read for - clustering using AMPtk. 
The default behavior is to: 1) merge the PE reads using USEARCH, - 2) find barcodes, 3)find and trim primers, 3) rename reads according to sample name, - 4) trim/pad reads to a set length. This script can also handle dual barcodes - (3' barcodes using the --reverse_barcode option). - -Arguments: -i, --fastq Input FASTQ file (Required) - --reverse Illumina PE reverse reads. - -o, --out Output base name. Default: out - -m, --mapping_file QIIME-like mapping file - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence Default: ITS4 - -n, --name_prefix Prefix for re-naming reads. Default: R_ - -l, --trim_len Length to trim/pad reads. Default: 300 - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --min_len Minimum length read to keep. Default: 100 - --barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa - --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 - --reverse_barcode FASTA file containing 3' barcodes. Default: none - --full_length Keep only full length sequences. - --primer_mismatch Number of mismatches in primers to allow. Default: 2 - --merge_method Software to use for PE merging. Default: usearch [usearch,vsearch] - --cpus Number of CPUs to use. Default: all - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_ion.py') - arguments.insert(0, cmd) - arguments.append('--illumina') - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'illumina': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes a folder of Illumina MiSeq data that is already de-multiplexed - and processes it for clustering using AMPtk. The default behavior is to: - 1) merge the PE reads using USEARCH, 2) find and trim primers, 3) rename reads - according to sample name, 4) trim/pad reads to a set length. - -Arguments: -i, --fastq Input folder of FASTQ files (Required) - -o, --out Output folder name. Default: amptk-data - -m, --mapping_file QIIME-like mapping file - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence Default: ITS4 - -l, --trim_len Length to trim/pad reads. Default: 300 - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --min_len Minimum length read to keep. Default: 100 - --full_length Keep only full length sequences. - --reads Paired-end or forward reads. Default: paired [paired, forward] - --read_length Illumina Read length (250 if 2 x 250 bp run). Default: auto detect - --rescue_forward Rescue Forward Reads if PE do not merge, e.g. long amplicons. Default: on [on,off] - --require_primer Require the Forward primer to be present. Default: on [on,off] - --primer_mismatch Number of mismatches in primers to allow. Default: 2 - --barcode_mismatch Number of mismatches in barcode to allow. Default: 1 - --cpus Number of CPUs to use. Default: all - --cleanup Remove intermediate files. - --merge_method Software to use for PE merging. Default: usearch [usearch,vsearch] - -u, --usearch USEARCH executable. 
Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_illumina_folder.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'illumina3': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes PE Illumina reads, Index reads, mapping file and processes for - clustering/denoising in AMPtk. The default behavior is to: - 1) merge the PE reads using VSEARCH, 2) filter for Phix, 3) find and trim primers, - 4) rename reads according to sample name, 4) trim/pad reads. - -Arguments: -f, --forward FASTQ R1 (forward) file (Required) - -r, --reverse FASTQ R2 (reverse) file (Required) - -i, --index FASTQ I3 (index) file (Required) - -m, --mapping_file QIIME-like mapping file. - --barcode_fasta Multi-fasta file of barocdes. - -o, --out Output folder name. Default: amptk-data - -l, --trim_len Length to trim/pad reads. Default: 300 - --fwd_primer Forward primer sequence - --rev_primer Reverse primer sequence - --min_len Minimum length read to keep. Default: 100 - --full_length Keep only full length sequences. - --read_length Illumina Read length (250 if 2 x 250 bp run). Default: auto detect - --rescue_forward Rescue Forward Reads if PE do not merge, e.g. long amplicons. Default: on [on,off] - --require_primer Require the Forward primer to be present. Default: off [on,off] - --primer_mismatch Number of mismatches in primers to allow. Default: 2 - --barcode_mismatch Number of mismatches in index (barcodes) to allow. Default: 2 - --barcode_rev_comp Reverse complement barcode sequences in mapping file. - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --cpus Number of CPUs to use. Default: all - --cleanup Remove intermediate files. - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_illumina_raw.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == '454': - help = """ -Usage: amptk %s -version: %s - -Description: Script processes Roche 454 data for AMPtk clustering. The input to this script - should be either a SFF file, FASTA+QUAL files, or FASTQ file. This script does - the following: 1) finds barcode sequences, 2) relabels headers with appropriate - barcode name, 3) removes primer sequences, 4) trim/pad reads to a set length. - -Arguments: -i, --sff, --fasta Input file (SFF, FASTA, or FASTQ) (Required) - -q, --qual QUAL file (Required if -i is FASTA). - -o, --out Output base name. Default: out - -m, --mapping_file QIIME-like mapping file - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence Default: ITS4 - -n, --name_prefix Prefix for re-naming reads. Default: R_ - -l, --trim_len Length to trim/pad reads. Default: 250 - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --min_len Minimum length read to keep. Default: 50 - --barcode_fasta FASTA file containing barcodes. (Required) - --reverse_barcode FASTA file containing 3' barcodes. Default: none - --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 - --primer_mismatch Number of mismatches in primers to allow. 
Default: 2 - --cpus Number of CPUs to use. Default: all - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_ion.py') - arguments.insert(0, cmd) - arguments.append('--454') - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'SRA': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes a folder of FASTQ files in a format you would get from NCBI SRA, i.e. - there is one FASTQ file for each sample. Reads will be named according to sample name - and workflow is 1) find and trim primers, 2) rename reads according to filename, - and 3) trim/pad reads to a set length (optional). - -Arguments: -i, --fastq Input folder of FASTQ files (Required) - -o, --out Output folder name. Default: amptk-data - -m, --mapping_file QIIME-like mapping file - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence Default: ITS4 - -l, --trim_len Length to trim/pad reads. Default: 250 - -p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off] - --min_len Minimum length read to keep. Default: 50 - --full_length Keep only full length sequences. - --require_primer Require the Forward primer to be present. Default: on [on,off] - --primer_mismatch Number of mismatches in primers to allow. Default: 2 - --cpus Number of CPUs to use. Default: all - --cleanup Remove intermediate files. - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-process_illumina_folder.py') - arguments.insert(0, cmd) - arguments.append('--sra') - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'cluster' or sys.argv[1] == 'uparse': - help = """ -Usage: amptk %s -version: %s - -Description: Script is a "wrapper" for the UPARSE algorithm. FASTQ quality trimming via expected - errors and Dereplication are run in vsearch if installed otherwise defaults to Python - which allows for the use of datasets larger than 4GB. - Chimera filtering and UNOISE are also options. - -Arguments: -i, --fastq Input FASTQ file (Required) - -o, --out Output base name. Default: out - -e, --maxee Expected error quality trimming. Default: 1.0 - -p, --pct_otu OTU Clustering Radius (percent). Default: 97 - -m, --minsize Minimum size to keep (singleton filter). Default: 2 - --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] - --map_filtered Map quality filtered reads back to OTUs. Default: off - --unoise Run De-noising pre-clustering (UNOISE). Default: off - --debug Keep intermediate files. - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-OTU_cluster.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'cluster_ref': - help = """ -Usage: amptk %s -version: %s - -Description: Script first quality filters reads, dereplicates, and then runs chimera - filtering. OTUs are then picked via reference based clustering (closed) - those that are > --id. 
The rest of the data can then be clustered via - de novo UPARSE and then reference clustered using UTAX. EXPERIMENTAL - -Arguments: -i, --fastq Input FASTQ file (Required) - -d, --db Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]. (Required) - -o, --out Output base name. Default: out - -e, --maxee Expected error quality trimming. Default: 1.0 - -p, --pct_otu OTU Clustering Radius (percent). Default: 97 - -m, --minsize Minimum size to keep (singleton filter). Default: 2 - --id Percent ID for closed reference clustering. Default: 97 - --utax_db UTAX formatted DB. - --utax_level UTAX Taxonomy level to keep. Default: k [k,p,c,o,f,g,s] - --utax_cutoff UTAX confidence value threshold. Default: 0.8 [0 to 0.9] - --mock Mock community fasta file - --closed_ref_only Run only closed reference clustering. - --map_filtered Map quality filtered reads back to OTUs. Default: off - --debug Keep intermediate files. - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-OTU_cluster_ref.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'dada2': - help = """ -Usage: amptk %s -version: %s - -Description: Script is a "wrapper" for the DADA2 pipeline. It will "pick OTUs" based on denoising - the data for each read predicting the original sequence. This pipeline is sensitive to - 1 bp differences between sequences. Since most reference databases classify "species" - at 97%% threshold, the inferred sequences (iSeqs) from DADA2 are then clusterd at --pct_otu - to create OTUs. Both results are saved. Requires R packages: dada2, ShortRead - -Arguments: -i, --fastq Input FASTQ file (Required) - -o, --out Output base name. Default: dada2 - -m, --min_reads Minimum number of reads per sample. Default: 10 - -l, --length Length to trim reads. - -e, --maxee Expected error quality trimming. Default: 1.0 - -p, --pct_otu OTU Clustering Radius (percent). Default: 97 - --platform Sequencing platform. [ion, illumina, 454]. Default: ion - --pool Pool all samples together for DADA2. Default: off - --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] - --debug Keep intermediate files. - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-dada2.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'unoise2': - help = """ -Usage: amptk %s -version: %s - -Description: Script will run the UNOISE2 denoising algorithm followed by clustering with - UCLUST to generate OTUs. OTU table is then constructed by mapping reads to - the OTUs. Requires USEARCH v9.0.232 or greater. - -Arguments: -i, --fastq Input FASTQ file (Required) - -o, --out Output base name. Default: out - -e, --maxee Expected error quality trimming. Default: 1.0 - -m, --minsize Minimum size to keep for denoising. Default: 8 - -p, --pct_otu OTU Clustering Radius (percent). Default: 97 - -u, --usearch Path to USEARCH9. Default: usearch9 - --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] - --debug Keep intermediate files. 
- """ % (sys.argv[1], version) - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-unoise2.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'unoise3': - help = """ -Usage: amptk %s -version: %s - -Description: Script will run the UNOISE3 denoising algorithm followed by clustering with - UCLUST to generate OTUs. OTU table is then constructed by mapping reads to - the OTUs. Requires USEARCH v10.0.240 or greater. - -Arguments: -i, --fastq Input FASTQ file (Required) - -o, --out Output base name. Default: out - -e, --maxee Expected error quality trimming. Default: 1.0 - -m, --minsize Minimum size to keep for denoising. Default: 8 - -p, --pct_otu OTU Clustering Radius (percent). Default: 97 - -u, --usearch Path to USEARCH9. Default: usearch9 - --uchime_ref Run Ref Chimera filtering. Default: off [ITS, LSU, COI, 16S, custom path] - --debug Keep intermediate files. - """ % (sys.argv[1], version) - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-unoise3.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'lulu' or sys.argv[1] == 'LULU': - help = """ -Usage: amptk %s -version: %s - -Description: Script is a wrapper for the LULU OTU table post-clustering curation of amplicon - data. The script calculates pairwise identity between the OTUs and then filters - the OTU table based on whether closely related OTUs that share the same/similar - distributions in the data are "daughters" of the "parent" OTU. Requires R and the - LULU R package. doi:10.1038/s41467-017-01312-x - -Arguments: -i, --otu_table Input OTU table (Required) - -f, --fasta Input OTUs in FASTA format (Required) - -o, --out Output base name. Default: input basename - --min_ratio_type Minimum ratio threshold. Default: min [min,avg] - --min_ratio Minimum ratio. Default: 1 - --min_match Minimum match pident (%%). Default: 84 - --min_relative_cooccurence Minimum relative co-occurance (%%): Default: 95 - --debug Keep intermediate files. - """ % (sys.argv[1], version) - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-lulu.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'filter': - help = """ -Usage: amptk %s -version: %s - -Description: Script filters OTU table generated from the `amptk cluster` command and should - be run on all datasets to combat barcode-switching or index-bleed (as high as - 2%% in MiSeq datasets, ~ 0.3%% in Ion PGM datasets). This script works best when - a spike-in control sequence is used, e.g. Synthetic Mock, although a mock is not required. - -Required: -i, --otu_table OTU table - -f, --fasta OTU fasta - -Optional: -o, --out Base name for output files. Default: use input basename - -b, --mock_barcode Name of barcode of mock community (Recommended) - -m, --mc Mock community FASTA file. Required if -b passed. [synmock,mock1,mock2,mock3,other] - -c, --calculate Calculate index-bleed options. Default: all [in,all] - -d, --drop Sample(s) to drop from OTU table. (list, separate by space) - --negatives Negative sample names. 
(list, separate by space) - --ignore Ignore sample(s) during index-bleed calc (list, separate by space) - -Filtering -n, --normalize Normalize reads to number of reads per sample [y,n]. Default: y - -p, --index_bleed Filter index bleed between samples (percent). Default: 0.005 - -t, --threshold Number to use for establishing read count threshold. Default: max [max,sum,top5,top10,top25] - -s, --subtract Threshold to subtract from all OTUs (any number or auto). Default: 0 - --delimiter Delimiter of OTU tables. Default: tsv [csv, tsv] - --min_reads_otu Minimum number of reads for valid OTU from whole experiment. Default: 2 - --min_samples_otu Minimum number of samples for valid OTU from whole experiment. Default: 1 - --col_order Column order (separate by space). Default: sort naturally - --keep_mock Keep Spike-in mock community. Default: False - --show_stats Show OTU stats on STDOUT - --debug Keep intermediate files. - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-filter.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'select': - help = """ -Usage: amptk %s -version: %s - -Description: Script filters de-multiplexed data (.demux.fq) to select only reads from samples - provided in a text file, one name per line or pass a list to keep to --list. - -Required: -i, --input Input FASTQ file (.demux.fq) - -t, --threshold Keep samples with read count greater than -t - -l, --list List of sample (barcode) names to keep, separate by space - -f, --file List of sample (barcode) names to keep in a file, one per line - -o, --out Output file name - --format File format for output file. Default: fastq [fastq, fasta] - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-keep_samples.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'remove': - help = """ -Usage: amptk %s -version: %s - -Description: Script filters de-multiplexed data (.demux.fq) to remove only reads from samples provided - in a text file, one name per line. - -Required: -i, --input Input FASTQ file (.demux.fq) - -t, --threshold Keep samples with read count greater than -t - -l, --list List of sample (barcode) names to remove, separate by space - -f, --file List of sample (barcode) names to remove in a file, one per line - -o, --out Output file name - --format File format for output file. Default: fastq [fastq, fasta] - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-remove_samples.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'sample': - help = """ -Usage: amptk %s -version: %s - -Description: Script sub-samples (rarifies) de-multiplexed data to equal number of reads per - sample. For community analysis, this might not be appropriate as you are ignoring - a portion of your data, however, there might be some applications where it is useful. 
- -Required: -i, --input Input FASTQ file - -n, --num_reads Number of reads to sub-sample to - -o, --out Output FASTQ file name - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-barcode_rarify.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'meta': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes meta data file in CSV format (e.g. from excel) and an OTU table as input. - The first column of the meta data file must match the OTU table sample headers exactly. - It then pivots the OTU table and appends it to the meta data file. - -Required: -i, --input Input OTU table - -m, --meta Meta data table (csv format) - -o, --out Output (meta data + pivotted OTU table) - --split_taxonomy Make separate tables for groups of taxonomy [k,p,c,o,f,g] - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-merge_metadata.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'show': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes de-multiplexed data (.demux.fq) as input and counts reads per barcode. - -Required: -i, --input Input FASTQ file (.demux.fq) - --quality_trim Quality trim reads - -e, --maxee maxEE threshold for quality. Default: 1.0 - -l, --length truncation length for trimming: Default: 250 - -o, --out Output FASTQ file name (--quality_trim only) - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-get_barcode_counts.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'funguild': - help = """ -Usage: amptk %s -version: %s - -Description: Script takes OTU table as input and runs FUNGuild to assing functional annotation to an OTU - based on the Guilds database. Guilds script written by Zewei Song (2015). - -Options: -i, --input Input OTU table - -d, --db Database to use [fungi, nematode]. Default: fungi - -o, --out Output file basename. - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'Guilds_v1.0.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - - elif sys.argv[1] == 'heatmap': - help = """ -Usage: amptk %s -version: %s - -Description: Script creates a heatmap from an OTU table. Several settings are customizable. - Requires Seaborn, matplotlib, numpy, and pandas. - -Arguments: -i, --input Input OTU table (Required) - -o, --output Output file (Required) - -m, --method Type of heatmap. Default: clustermap [clustermap,heatmap] - -d, --delimiter Delimiter of OTU table. Default: tsv [tsv,csv] - -f, --format Figure format. Default: pdf [pdf,jpg,svg,png] - --font Font set. Default: arial - --color Color Palette. Default: gist_gray_r - --figsize Figure size. Default: 2x8 - --annotate Annotate heatmap with values. - --distance_metric Distance metric to use for clustermap. Default: braycurtis - --cluster_columns Cluster the columns (samples). 
Default: False [True,False] - --cluster_method Clustering method for clustermap. Default: single [single,complete,average,weighted] - --scaling Scale the data by row. Default: None [None, z_score, standard] - --yaxis_fontsize Y-Axis Font Size. Default: 6 - --xaxis_fontsize X-Axis Font Size. Default: 6 - --normalize Normalize data based total, tsv file IDcount - --normalize_counts Value to normalize counts to, i.e. 100000 - --vmax Maximum value for heatmap coloration. - --debug Print pandas table on import to terminal - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'csv2heatmap.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'drop': - help = """ -Usage: amptk %s -version: %s - -Description: Script drops OTUs from dataset and outputs new OTU table - -Required: -i, --input Input OTU file (.cluster.otus.fa) (FASTA) - -r, --reads Demultiplexed reads (.demux.fq) (FASTQ) - -l, --list List of OTU names to remove, separate by space - -f, --file List of OTU names to remove in a file, one per line - -o, --out Output file name. Default: amptk-drop - """ % (sys.argv[1], version) - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-drop.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'taxonomy': - db_list = ['DB_name', 'DB_type', 'FASTA originated from', 'Fwd Primer', 'Rev Primer', 'Records'] - okay_list = [] - search_path = os.path.join(script_path, 'DB') - for file in os.listdir(search_path): - if file.endswith(".udb"): - okay_list.append(file) - info_file = file + '.txt' - with open(os.path.join(search_path, info_file), 'rU') as info: - line = info.readlines() - line = [words for segments in line for words in segments.split()] - line.insert(0, file) - db_list.append(line) - if len(db_list) < 7: - db_print = "No DB configured, run 'amptk database' or 'amptk install' command." - else: - d = flatten(db_list) - db_print = fmtcols(d, 6) - - help = """ -Usage: amptk %s -version: %s - -Description: Script maps OTUs to taxonomy information and can append to an OTU table (optional). - By default the script uses a hybrid approach, e.g. gets taxonomy information from - SINTAX, UTAX, and global alignment hits from the larger UNITE-INSD database, and - then parses results to extract the most taxonomy information that it can at 'trustable' - levels. SINTAX/UTAX results are used if BLAST-like search pct identity is less than 97%%. - If %% identity is greater than 97%%, the result with most taxonomy levels is retained. - -Arguments: -f, --fasta Input FASTA file (i.e. OTUs from amptk cluster) (Required) - -i, --otu_table Input OTU table file (i.e. otu_table from amptk cluster) - -o, --out Base name for output file. Default: amptk-taxonomy..txt - -d, --db Select Pre-installed database [ITS1, ITS2, ITS, 16S, LSU, COI]. Default: ITS2 - -m, --mapping_file QIIME-like mapping file - -t, --taxonomy Taxonomy calculated elsewhere. 2 Column file. - --method Taxonomy method. Default: hybrid [utax, sintax, usearch, hybrid, rdp, blast] - --add2db Add FASTA files to DB on the fly. - --fasta_db Alternative database of fasta sequenes to use for global alignment. - --utax_db UTAX formatted database. 
Default: ITS2.udb [See configured DB's below] - --utax_cutoff UTAX confidence value threshold. Default: 0.8 [0 to 0.9] - --usearch_db USEARCH formatted database. Default: USEARCH.udb - --usearch_cutoff USEARCH threshold percent identity. Default 0.7 - --sintax_cutoff SINTAX confidence value threshold. Default: 0.8 [0 to 0.9] - -r, --rdp Path to RDP Classifier. Required if --method rdp - --rdp_db RDP Classifer DB set. [fungalits_unite, fungalits_warcup. fungallsu, 16srrna] - --rdp_cutoff RDP Classifer confidence value threshold. Default: 0.8 [0 to 1.0] - --local_blast Local Blast database (full path) Default: NCBI remote nt database - --tax_filter Remove OTUs from OTU table that do not match filter, i.e. Fungi to keep only fungi. - -u, --usearch USEARCH executable. Default: usearch9 - --debug Keep intermediate files - -Databases Configured: -%s - """ % (sys.argv[1], version, db_print) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-assign_taxonomy.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'database': - help = """ -Usage: amptk %s -version: %s - -Description: Setup/Format reference database for amptk taxonomy command. - -Arguments: -i, --fasta Input FASTA file - -o, --out Base Name for Output Files. Default: DB of amptk folder - -f, --fwd_primer Forward primer. Default: fITS7 - -r, --rev_primer Reverse primer. Default: ITS4 - --format Reformat FASTA headers to UTAX format. Default: unite2utax [unite2utax, rdp2utax, off] - --drop_ns Removal sequences that have > x N's. Default: 8 - --create_db Create a DB. Default: usearch [utax, usearch] - --skip_trimming Keep full length sequences. Default: off - --derep_fulllength Remove identical sequences. - --lca Run LCA (last common ancestor) on taxonomy if dereplicating sequences. - --min_len Minimum length to keep. Default: 100 - --max_len Maximum length to keep. Default: 1200 - --trunclen Truncate records to length. - --subsample Random subsample reads. - --primer_mismatch Max Primer Mismatch. Default: 2 - --keep_all Keep Sequence if forward primer not found. - --utax_trainlevels UTAX custom parameters. Default: kpcofgs - --utax_splitlevels UTAX custom parameters. Default: NVkpcofgs - --cpus Number of CPUs to use. Default: all - -u, --usearch USEARCH executable. Default: usearch9 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-extract_region.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - try: - outLocation = arguments.index('-o') - except ValueError: - outLocation = arguments.index('--out') - outLocation = outLocation + 1 - arguments[outLocation] = os.path.join(script_path, 'DB', arguments[outLocation]) - subprocess.call(arguments) - - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'summarize': - help = """ -Usage: amptk %s -version: %s - -Description: Script traverses the taxonomy information and creates an OTU table for each - level of taxonomy, i.e. Kingdom, Phylum, Class, etc. Optionally, it will - create a Stacked Bar Graph for each taxonomy levels for each sample. Requires - Matplotlib, numpy, and pandas. - -Arguments: -i, --table OTU Table containing Taxonomy information (Required) - -o, --out Base name for output files. Default: amptk-summary - --graphs Create stacked Bar Graphs. - --format Image output format. 
Default: eps [eps, svg, png, pdf] - --percent Convert numbers to Percent for Graphs. Default: off - --font_size Adjust font size for X-axis sample lables. Default: 8 - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-summarize_taxonomy.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'install': - help = """ -Usage: amptk %s -version: %s - -Description: Script downloads pre-formated databases for use with the `amptk taxonomy` - command. You can download databases for fungal ITS, bacterial 16S, fungal - LSU, or arthropod/chordate COI amplicons. - -Arguments: -i Install Databases. Choices: ITS, 16S, LSU, COI - --force Over-write existing databases - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) < 1: - print(help) - sys.exit(1) - else: - if '-i' in arguments: - arguments.remove('-i') - if len(arguments) < 1: - print(help) - sys.exit(1) - for x in arguments: - if os.path.isfile(os.path.join(script_path, 'DB', x+'.udb')): - if not '--force' in arguments: - print("A formated database was found, to overwrite use '--force'. You can add more custom databases by using the `amptk database` command.") - sys.exit(1) - #download - if not x in URL: - if x == '--force': - continue - print("%s not valid, choices are ITS, 16S, LSU, COI" % x) - sys.exit(1) - print("Downloading %s pre-formatted database" % x) - address = URL.get(x) - download(address, x+'.amptk.tar.gz') - tfile = tarfile.open(x+'.amptk.tar.gz', 'r:gz') - tfile.extractall(x) - for file in os.listdir(x): - shutil.move(os.path.join(x,file), os.path.join(script_path, 'DB', file)) - shutil.rmtree(x) - os.remove(x+'.amptk.tar.gz') - print("%s taxonomy database installed" % x) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'SRA-submit': - help = """ -Usage: amptk %s -version: %s - -Description: Script aids in submitted your data to NCBI Sequence Read Archive (SRA) by splitting - FASTQ file from Ion, 454, or Illumina by barcode sequence into separate files for - submission to SRA. This ensures your data is minimally processed as only barcodes - are removed. However, you can assert that primers must be found in order for - sequences to be processed. Additionally, you can pass the --biosample argument - with an NCBI biosample tab-delimited file and the script will auto-populate an - SRA submission file. - -Arguments: -i, --input Input FASTQ file or folder (Required) - -o, --out Output base name. Default: sra - -m, --mapping_file QIIME-like mapping file. - -b, --barcode_fasta Mulit-fasta file containing barcodes used. - -s, --biosample BioSample worksheet from NCBI (from confirmation email) - -p, --platform Sequencing platform. Defalt: ion (ion, illumina, 454) - -n, --names CSV name mapping file, e.g. BC_1,NewName - -d, --description Paragraph description for SRA experimental design. Use quotes to wrap paragraph. - -f, --fwd_primer Forward primer sequence. Default: fITS7 - -r, --rev_primer Reverse primer sequence. Default: ITS4 - -a, --append Append a name to the output of all files in run, i.e. run1 -> Sample_run1 - --primer_mismatch Number of mismatches allowed in primer search. Default: 2 - --barcode_mismatch Number of mismatches in barcode to allow. Default: 0 - --require_primer Require primer(s) to be present for output. 
Default: off [off,forward,both] - --min_len Minimum length read to keep after trimming barcodes. Default 50 - ---force Overwrite directory with same name - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'bin', 'amptk-fastq2sra.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'stats': - help = """ -Usage: amptk %s -version: %s - -Description: A wrapper script for Phyloseq and Vegan R packages that draws NMDS of all - treatments in a BIOM file (output from amptk taxonomy). The script also runs - hypothesis tests (Adonis and Betadispersion) for each treatment. - -Arguments: -i, --biom Input BIOM file with taxonomy and metadata (Required) - -t, --tree Phylogeny of OTUs (from amptk taxonomy) (Required) - -d, --distance Distance metric. Default: raupcrick [raupcrick,jaccard,bray,unifrac,wunifrac] - -o, --out Output base name. Default: amptk_stats - --indicator_species Run indicator species analysis - --ignore_otus Drop OTUs from table before running stats - """ % (sys.argv[1], version) - - arguments = sys.argv[2:] - if len(arguments) > 1: - cmd = os.path.join(script_path, 'util', 'amptk-stats.py') - arguments.insert(0, cmd) - exe = sys.executable - arguments.insert(0, exe) - subprocess.call(arguments) - else: - print(help) - sys.exit(1) - elif sys.argv[1] == 'primers': - print("----------------------------------") - print("Primers hard-coded into AMPtk:") - print("----------------------------------") - for k,v in natsorted(amptklib.primer_db.items()): - print(k.ljust(13) + v) - print("----------------------------------") - sys.exit(1) - elif sys.argv[1] == 'version' or sys.argv[1] == '--version' or sys.argv[1] == '-version' or sys.argv[1] == '-v': - print("AMPtk v%s" % version) - elif sys.argv[1] == 'citation' or sys.argv[1] == '-citation' or sys.argv[1] == '--citation': - print("\nPalmer JM, Jusino MA, Banik MT, Lindner DL. 2017. Non-biological synthetic spike-in controls and the\n\tAMPtk software pipeline improve mycobiome data. 
bioRxiv 213470; doi: 10.1101/213470\n") - else: - print("%s option not recognized" % sys.argv[1]) - print(default_help) - sys.exit(1) - -else: - print(default_help) - \ No newline at end of file diff --git a/bin/amptk-assign_taxonomy.py b/bin/amptk-assign_taxonomy.py index 0abe0f3..37fe744 100755 --- a/bin/amptk-assign_taxonomy.py +++ b/bin/amptk-assign_taxonomy.py @@ -291,7 +291,7 @@ def restricted_float(x): #load results into dictionary for appending to OTU table amptklib.log.debug("Loading SINTAX results into dictionary") with open(sintax_out, 'rU') as infile: - reader = csv.reader(infile, delimiter=("\t")) + reader = csv.reader(infile, delimiter=(str("\t"))) otuDict = {rows[0]:'SINTAX;'+rows[3] for rows in reader} else: #you have supplied a two column taxonomy file, parse and build otuDict diff --git a/bin/amptk-unoise3.py b/bin/amptk-unoise3.py index ea7765d..20d96e2 100755 --- a/bin/amptk-unoise3.py +++ b/bin/amptk-unoise3.py @@ -219,7 +219,7 @@ def checkfastqsize(input): print("-------------------------------------------------------") if not not args.debug: print("Tmp Folder of files: %s" % tmp) -print("amplicon sequnce variants: %s" % iSeqs)) +print("amplicon sequnce variants: %s" % iSeqs) print("ASV OTU Table: %s" % iSeq_otu_table) print("Clustered OTUs: %s" % os.path.basename(final_otu)) print("OTU Table: %s" % os.path.basename(final_otu_table)) diff --git a/util/Guilds_v1.0.py b/util/Guilds_v1.0.py index 029183f..7ed7661 100644 --- a/util/Guilds_v1.0.py +++ b/util/Guilds_v1.0.py @@ -15,7 +15,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . - + This script assigns functional information to the OTUs in the user's OTU table. The OTU table needs to have a column named 'taxonomy', which contains information from a reference database (such as UNITE - https://unite.ut.ee/). It is required that the first line of the OTU table to be the header, without any additional comments. Some programs, such as QIIME will add an additional row of comments before the header, and this has to be removed before using the FunGuild script. The script will try to recognized the delimiter in the user's OTU table, but comma (.csv) or tab (.txt) delimiter formats are recommended. The functional databases are fetched from http://www.stbates.org/funguild_db.php or http://www.stbates.org/nemaguild_db.php @@ -29,9 +29,9 @@ preferred formats. -db Database to use ('fungi' or 'nematode') [default:fungi] -m, --matched Ask the script to output an otu table containing only OTUs - for which functional assignments have been made + for which functional assignments have been made -u, --unmatched Ask the script to output an otu table containing only OTUs - for which functional assignments could not be made + for which functional assignments could not be made This is an example command to run this script: python Guilds_v1.0.py -otu user_otu_table.txt @@ -75,14 +75,14 @@ import os import timeit import sys -import urllib.request, urllib.parse, urllib.error +#import urllib.request, urllib.parse, urllib.error from operator import itemgetter import csv try: from urllib.request import urlopen except ImportError: from urllib2 import urlopen - + start = timeit.default_timer() ################################ @@ -90,7 +90,7 @@ parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", help="Path and file name of the OTU table. 
The script will try to detect the delimiter" - "in the file, but tab or csv are preferred formats.") + "in the file, but tab or csv are preferred formats.") parser.add_argument("-m", "--matched", action="store_true", help="Ask the script to output a otu table with function assigned OTUs") parser.add_argument("-u", "--unmatched", action="store_true", help="Ask the script to output a otu table with function assigned OTUs") parser.add_argument("-d","--db", choices=['fungi','nematode'], default='fungi', help="Assign a specified database to the script") @@ -103,9 +103,9 @@ #Detect delimiter in the input file with open(otu_file, 'rU') as f1: dialect = csv.Sniffer().sniff(f1.read()) - otu_delimiter = dialect.delimiter - if otu_delimiter == ';': - out_delimiter = '\t' + otu_delimiter = str(dialect.delimiter) + if otu_delimiter == str(';'): + out_delimiter = str('\t') #setup the output files and naming if args.out: @@ -133,46 +133,39 @@ temp = 'temp.txt' urlFile = urlopen(url) data = urlFile.read().decode('utf-8') - new_data = data.split("} , {") #Fix the first and last record new_data[0] = new_data[0][3:] new_data[-1]=new_data[-1][:-3] -#Parse the record -parse_data = [] -for line in new_data: - record = line - rec = record.split(" , ") - del rec[0] - - current_rec = [] - for item in rec: - p = item.find(":") - current_rec.append(item[p+2:].replace('"','')) - parse_data.append(current_rec) - -header = "Taxon\tTaxon Level\tTrophic Mode\tGuild\tConfidence Ranking\tGrowth Morphology\tTrait\tNotes\tCitation/Source" - -f = open(function_file,'a') -f.write("%s\n" %(header)) -for item in parse_data: - f.write("%s\n" %("\t".join(item))) -f.close() - +#Parse the record and write temp data file +with open(function_file, 'w') as f: + f.write("Taxon\tTaxon Level\tTrophic Mode\tGuild\tConfidence Ranking\tGrowth Morphology\tTrait\tNotes\tCitation/Source\n") + parse_data = [] + for line in new_data: + record = line + rec = record.split(" , ") + del rec[0] + + current_rec = [] + for item in rec: + p = item.find(":") + cleaned_item = item[p+2:].replace('"','') + current_rec.append(cleaned_item.encode('ascii', 'ignore').decode('ascii')) + f.write('{:}\n'.format('\t'.join(current_rec))) + #Detect the position of header -f_database = open(function_file, 'r') # Open the database file. -for line in f_database: - if line.find('Taxon') != -1: #Search for the line that contains the header (if it is not the first line) - header_database = line.split('\t') - break -f_database.close() +with open(function_file, 'rU') as f_database: + for line in f_database: + if line.find('Taxon') != -1: #Search for the line that contains the header (if it is not the first line) + header_database = line.split('\t') + break #Check the database header. 
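# If splitting on tabs left a single field, the downloaded database header
# must be space-delimited instead; re-split it on spaces before it is used
# to extend the OTU table header below.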
if len(header_database) == 1: - header_database = header_database[0].split(" ") + header_database = header_database[0].split(" ") # Set the parameters for progress report with open(function_file) as f1: @@ -183,7 +176,7 @@ p = list(range(1,11)) way_points = [int(total_length*(old_div(x,10.0))) for x in p] -############################################################################################ +############################################################################################ # Open the OTU table and read in the header ################################################ print("") @@ -192,11 +185,11 @@ #load the header with open(otu_file, 'rU') as otu: - header = otu.readline().rstrip().split(otu_delimiter) + header = otu.readline().rstrip().split(otu_delimiter) #Attach all columns of database file to the header of the new OTU table for item in header_database: - header.append(item) + header.append(item) #look for Taxonomy or taxonomy if 'taxonomy' in header: @@ -210,24 +203,24 @@ #Abort if the column 'taxonomy' is not found if index_tax == -1: - print("Column 'taxonomy' not found. Please check you OTU table %s." %(otu_file)) - sys.exit(0) + print("Column 'taxonomy' not found. Please check you OTU table %s." %(otu_file)) + sys.exit(0) ############################################################################################ #Search in function database################################################################ # Read the OTU table into memory, and separate taxonomic levels with '@'. with open(otu_file, 'rU') as otu: - otu_tab = [] - for record in otu: - otu_current = record.split(otu_delimiter) - otu_taxonomy = otu_current[index_tax].rstrip('\n') - replace_list = ['_', ' ', ';', ',', ':'] - for symbol in replace_list: - otu_taxonomy = otu_taxonomy.replace(symbol, '@') - otu_taxonomy = otu_taxonomy + '@' - otu_current[index_tax] = otu_taxonomy - otu_tab.append(otu_current) - otu_tab = otu_tab[1:] # remove the header line + otu_tab = [] + for record in otu: + otu_current = record.split(otu_delimiter) + otu_taxonomy = otu_current[index_tax].rstrip('\n') + replace_list = ['_', ' ', ';', ',', ':'] + for symbol in replace_list: + otu_taxonomy = otu_taxonomy.replace(symbol, '@') + otu_taxonomy = otu_taxonomy + '@' + otu_current[index_tax] = otu_taxonomy + otu_tab.append(otu_current) + otu_tab = otu_tab[1:] # remove the header line # Start searching the database ## Each record in the Fungal Guild Database is searched in the user's OTU table. @@ -239,36 +232,37 @@ print("Searching the FUNGuild database...") -f_database = open(function_file, 'rU') -for record in f_database: - # report the progress - percent += 1 +with open(function_file, 'rb') as f_database: + for record in f_database: + record = record.decode('utf-8') + # report the progress + percent += 1 - if percent in way_points: - progress = (int(round(percent/total_length*100.0))) - print('{}%'.format(progress)) - else: t = 0 + if percent in way_points: + progress = (int(round(percent/total_length*100.0))) + print('{}%'.format(progress)) + else: t = 0 - # Compare database with the OTU table - function_tax = record.split('\t') - search_term = function_tax[0].replace(' ', '@') #first column of database, contains the name of the species - search_term = '@' + search_term + '@' #Add @ to the search term - - for otu in otu_tab: - otu_tax = otu[index_tax] # Get the taxonomy string of current OTU record. 
- if otu_tax.find(search_term) >= 0: #found the keyword in this OTU's taxonomy - count += 1 # Count the matching record - otu_new = otu[:] + # Compare database with the OTU table + function_tax = record.split('\t') + search_term = function_tax[0].replace(' ', '@') #first column of database, contains the name of the species + search_term = '@' + search_term + '@' #Add @ to the search term + #print(search_term) + for otu in otu_tab: + otu_tax = otu[index_tax] # Get the taxonomy string of current OTU record. + #print(otu_tax) + if otu_tax.find(search_term) >= 0: #found the keyword in this OTU's taxonomy + count += 1 # Count the matching record + otu_new = otu[:] - # Assign the matching functional information to current OTU record. - for item in function_tax: - otu_new.append(item) - otu_redundant.append(otu_new) -f_database.close() - + # Assign the matching functional information to current OTU record. + for item in function_tax: + otu_new.append(item) + otu_redundant.append(otu_new) +#sys.exit(1) # Finish searching, delete the temp function database file if os.path.isfile('temp_db.txt') == True: - os.remove('temp_db.txt') + os.remove('temp_db.txt') print("") print("Found %i matching taxonomy records in the database."%(count)) @@ -310,52 +304,52 @@ #Write to output files############################################################################## #Output matched OTUs to a new file if args.matched: - if os.path.isfile(matched_file) == True: - os.remove(matched_file) - output = open(matched_file,'a') - #Write the matched list header - output.write('%s' % ('\t'.join(header))) #Header - - #Write the matched OTU table - for item in unique_list: - rec = '\t'.join(item) - output.write('%s' % rec) - output.close() + if os.path.isfile(matched_file) == True: + os.remove(matched_file) + output = open(matched_file,'a') + #Write the matched list header + output.write('%s' % ('\t'.join(header))) #Header + + #Write the matched OTU table + for item in unique_list: + rec = '\t'.join(item) + output.write('%s' % rec) + output.close() #Output unmatched OTUs to a new file unmatched_list = [] for rec in otu_tax: - count2 = 0 - for new_rec in unique_list: - if rec[0] == new_rec[0]: #Check if the current record is in the unique_list (has been assigned a function) - count2 += 1 - if count2 == 0: - unmatched_list.append(rec) + count2 = 0 + for new_rec in unique_list: + if rec[0] == new_rec[0]: #Check if the current record is in the unique_list (has been assigned a function) + count2 += 1 + if count2 == 0: + unmatched_list.append(rec) count_unmatched = 0 #Add 'Unassigned' to the 'Notes' column for item in unmatched_list: - l = len(header) - len(item) - for i in range(l): - item.extend('-') - item[index_notes] = 'Unassigned' + l = len(header) - len(item) + for i in range(l): + item.extend('-') + item[index_notes] = 'Unassigned' if args.unmatched: - if os.path.isfile(unmatched_file) == True: - os.remove(unmatched_file) - output_unmatched = open(unmatched_file, 'a') - output_unmatched.write('%s' % ('\t'.join(header))) - for item in unmatched_list: - rec = '\t'.join(item) - output_unmatched.write('%s\n' % rec) - count_unmatched += 1 - output_unmatched.close() + if os.path.isfile(unmatched_file) == True: + os.remove(unmatched_file) + output_unmatched = open(unmatched_file, 'a') + output_unmatched.write('%s' % ('\t'.join(header))) + for item in unmatched_list: + rec = '\t'.join(item) + output_unmatched.write('%s\n' % rec) + count_unmatched += 1 + output_unmatched.close() #Output the combined matched and unmatched OTUs to a 
new file
 if os.path.isfile(total_file) == True:
-	os.remove(total_file)
+    os.remove(total_file)

 total_list = unique_list + unmatched_list #Combine the two OTU tables
 total_list.sort(key=lambda x: float(sum(map(float,x[1:index_tax]))), reverse=True) #Sorted the combined OTU table

@@ -365,9 +359,9 @@
 count_total = 0
 for item in total_list:
-	rec = ('\t'.join(item)).strip('\n')
-	output_total.write('%s\n' % rec)
-	count_total += 1
+    rec = ('\t'.join(item)).strip('\n')
+    output_total.write('%s\n' % rec)
+    count_total += 1
 output_total.close()

 ####################################################################################################################

 print("Result saved to '%s'" %(total_file))

 if args.matched or args.unmatched:
-	print('\nAdditional output:')
-	if args.matched:
-		print("FUNGuild made assignments on %i OTUs, these have been saved to %s." %(count, matched_file))
-	if args.unmatched:
-		print("%i OTUs were unassigned, these are saved to %s." %(count_unmatched, unmatched_file))
+    print('\nAdditional output:')
+    if args.matched:
+        print("FUNGuild made assignments on %i OTUs, these have been saved to %s." %(count, matched_file))
+    if args.unmatched:
+        print("%i OTUs were unassigned, these are saved to %s." %(count_unmatched, unmatched_file))

 # Finish the program
 stop = timeit.default_timer()
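
A note on the fmtcols/flatten helpers deleted along with amptk.py: they were not py3-clean, which is worth recording since this commit is about py2/3 compatibility. On py3, map() returns a lazy iterator, so flatten()'s type(elem) == list test never fires, and "\n".join('{}'.format(lines)) formats the generator object rather than joining its rows. A minimal py3-safe sketch, keeping the names and column layout of the removed version:

    def fmtcols(mylist, cols):
        # left-justify items from mylist into fixed-width columns
        justify = []
        for i in range(cols):
            width = max(len(x) for x in mylist[i::cols]) + 2
            # materialize as a list: on py3, map() is lazy and would not flatten
            justify.append([x.ljust(width) for x in mylist[i::cols]])
        flat = [item for col in justify for item in col]
        num_lines = len(mylist) // cols
        # join each row, then join the rows; the removed version formatted the
        # generator object itself, so it printed the generator's repr
        return "\n".join(' '.join(flat[i::num_lines]) for i in range(num_lines))

    print(fmtcols(['a', 'bb', 'ccc', 'dddd', 'ee', 'f'], 3))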
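The amptk filter help above describes index-bleed removal: reads bleeding between samples at up to ~2% on MiSeq and ~0.3% on Ion PGM, filtered at a default rate of 0.005 against a per-OTU reference count. The arithmetic in isolation; this is a sketch of the idea only, not AMPtk's implementation, which also supports sum/topN thresholds, normalization, and subtraction:

    def apply_index_bleed(counts, index_bleed=0.005, threshold='max'):
        # counts: read counts for one OTU across samples
        # reads at or below index_bleed * reference are treated as bleed-over
        reference = max(counts) if threshold == 'max' else sum(counts)
        cutoff = reference * index_bleed
        return [c if c > cutoff else 0 for c in counts]

    # an OTU with 10,000 reads in its true sample: at 0.005, up to 50 reads
    # leaking into any other sample are zeroed out
    print(apply_index_bleed([10000, 40, 600]))   # -> [10000, 0, 600]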
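The delimiter=(str("\t")) change in bin/amptk-assign_taxonomy.py is the standard py2/py3 csv dance: with unicode_literals in effect, "\t" is a unicode literal on py2, and py2's csv module insists on a native byte-str delimiter, while py3's expects unicode. str() yields the native string type on both interpreters. The pattern in isolation, with a hypothetical file name:

    from __future__ import unicode_literals
    import csv

    # str() returns the native string type (byte-str on py2, unicode on py3),
    # which is what each interpreter's csv module expects for delimiter
    with open('sintax.taxonomy.txt', 'r') as infile:  # hypothetical name
        reader = csv.reader(infile, delimiter=str('\t'))
        otuDict = {row[0]: 'SINTAX;' + row[3] for row in reader}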
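Similarly, the Guilds_v1.0.py search loop now opens the downloaded database with open(..., 'rb') and decodes each record explicitly, which gives identical behavior on py2 and py3 regardless of locale defaults (py2's built-in open() takes no encoding argument). The pattern, with a stand-in file name:

    # reading bytes and decoding explicitly behaves the same on py2 and py3,
    # independent of the user's locale; 'funguild_db.txt' is a stand-in name
    with open('funguild_db.txt', 'rb') as f_database:
        for raw in f_database:
            record = raw.decode('utf-8')
            function_tax = record.rstrip('\n').split('\t')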
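The database download is still parsed by splitting on "} , {" and trimming braces from the first and last records. If the funguild_db.php endpoint really serves a JSON array, as that trimming suggests, json.loads would be sturdier than string surgery. A sketch under that assumption; the key names here are hypothetical and would need to be confirmed against the service:

    import json
    try:
        from urllib.request import urlopen   # py3
    except ImportError:
        from urllib2 import urlopen          # py2

    url = 'http://www.stbates.org/funguild_db.php'
    payload = urlopen(url).read().decode('utf-8')
    records = json.loads(payload)            # assumes the payload is pure JSON

    # hypothetical field names, mirroring the tab-separated header the script
    # writes to its temp database file
    keys = ['taxon', 'taxonomicLevel', 'trophicMode', 'guild',
            'confidenceRanking', 'growthForm', 'trait', 'notes', 'citationSource']
    with open('temp_db.txt', 'w') as f:
        f.write('\t'.join(keys) + '\n')
        for rec in records:
            f.write('\t'.join(str(rec.get(k, '')) for k in keys) + '\n')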
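Finally, the search loop's '@' padding (unchanged here apart from indentation) is what keeps substring matching honest: every separator in an OTU's taxonomy string is replaced with '@' plus a trailing '@', and each database taxon is wrapped as '@name@', so a genus can never match inside a longer family name. A small worked example:

    def normalize_taxonomy(tax):
        # mirror the script: every separator becomes '@', plus a trailing '@'
        for symbol in ['_', ' ', ';', ',', ':']:
            tax = tax.replace(symbol, '@')
        return tax + '@'

    otu_tax = normalize_taxonomy('k__Fungi;f__Russulaceae;g__Russula')
    print(otu_tax.find('@Russula@') >= 0)       # True: the genus matches
    print(otu_tax.find('@Russulaceae@') >= 0)   # True: the family matches
    print(otu_tax.find('@Russulace@') >= 0)     # False: partial names cannot hit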