Skip to content

Commit

Permalink
updates to v0.3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Apr 11, 2016
1 parent d467c48 commit 1226264
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 32 deletions.
25 changes: 25 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
BSD 2-Clause License

Copyright (c) 2016, Jonathan M. Palmer
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
73 changes: 47 additions & 26 deletions bin/ufits-OTU_cluster.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

#This script runs USEARCH OTU clustering
#written by Jon Palmer palmer.jona at gmail dot com
#written by Jon Palmer nextgenusfs@gmail.com

import sys, os, argparse, subprocess, inspect, csv, re, logging, shutil, multiprocessing
from Bio import SeqIO
Expand Down Expand Up @@ -40,7 +40,6 @@ class colr:
parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs')
parser.add_argument('--unoise', action='store_true', help='Run De-noising (UNOISE)')
parser.add_argument('--size_annotations', action='store_true', help='Append size annotations')
parser.add_argument('--skip_quality', action='store_true', help='Skip Quality trimming (data is already Q-trimmed)')
parser.add_argument('--cleanup', action='store_true', help='Remove Intermediate Files')
args=parser.parse_args()

Expand All @@ -60,7 +59,7 @@ def checkfastqsize(input):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))
usearch = args.usearch
try:
usearch_test = subprocess.Popen([usearch, '-version'], stdout=subprocess.PIPE).communicate()[0].rstrip()
Expand All @@ -69,39 +68,48 @@ def checkfastqsize(input):
os._exit(1)
ufitslib.log.info("USEARCH version: %s" % usearch_test)

#check if vsearch is installed
vsearch = ufitslib.which('vsearch')
if vsearch:
ufitslib.log.info("vsearch detected, will use for filtering")
else:
ufitslib.log.info("vsearch not found, using Python for filtering")

#make tmp folder
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
os.makedirs(tmp)

#Count FASTQ records
ufitslib.log.info("Loading FASTQ Records")
total = ufitslib.countfastq(args.FASTQ)
orig_total = ufitslib.countfastq(args.FASTQ)
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#Expected Errors filtering step
#Expected Errors filtering step and convert to fasta
filter_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fq')
if not args.skip_quality:
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
with open(filter_out, 'w') as output:
SeqIO.write(ufitslib.MaxEEFilter(args.FASTQ, args.length, args.maxee), output, 'fastq')
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
else:
filter_out = args.FASTQ

#convert to FASTA to save space for large files
filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa')
SeqIO.convert(filter_out, 'fastq', filter_fasta, 'fasta')
orig_fasta = os.path.join(tmp, args.out+'.orig.fa')
SeqIO.convert(args.FASTQ, 'fastq', orig_fasta, 'fasta')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
if vsearch:
subprocess.call(['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastq_trunclen', str(args.length), '--fastqout', filter_out, '--fastaout', filter_fasta], stdout = FNULL, stderr = FNULL)
subprocess.call(['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta], stdout = FNULL, stderr = FNULL)
else:
with open(filter_out, 'w') as output:
SeqIO.write(ufitslib.MaxEEFilter(args.FASTQ, args.length, args.maxee), output, 'fastq')
SeqIO.convert(args.FASTQ, 'fastq', orig_fasta, 'fasta')
SeqIO.convert(filter_out, 'fastq', filter_fasta, 'fasta')
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run full length dereplication (biopython)
#now run full length dereplication
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
ufitslib.dereplicate(filter_out, derep_out)
if vsearch:
subprocess.call(['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out], stdout = FNULL, stderr = FNULL)
else:
ufitslib.dereplicate(filter_out, derep_out)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

Expand All @@ -118,10 +126,11 @@ def checkfastqsize(input):

#now run usearch 8 sort by size
sort_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.sort.fa')
ufitslib.log.info("Sorting reads by size (USEARCH8)")
ufitslib.log.info("Sorting reads by size")
ufitslib.log.debug("%s -sortbysize %s -minsize %s -fastaout %s" % (usearch, unoise_out, args.minsize, sort_out))
subprocess.call([usearch, '-sortbysize', unoise_out, '-minsize', args.minsize, '-fastaout', sort_out], stdout = FNULL, stderr = FNULL)


#now run clustering algorithm
radius = str(100 - int(args.pct_otu))
otu_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otus.fa')
Expand Down Expand Up @@ -188,9 +197,19 @@ def checkfastqsize(input):
else:
reads = orig_fasta

ufitslib.log.info("Mapping Reads to OTUs (USEARCH8)")
ufitslib.log.debug("%s -usearch_global %s -strand plus -id 0.97 -db %s -uc %s" % (usearch, reads, uchime_out, uc_out))
subprocess.call([usearch, '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)
ufitslib.log.info("Mapping Reads to OTUs")
if vsearch:
subprocess.call(['vsearch', '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)
else:
ufitslib.log.debug("%s -usearch_global %s -strand plus -id 0.97 -db %s -uc %s" % (usearch, reads, uchime_out, uc_out))
subprocess.call([usearch, '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)

#count reads mapped
if vsearch:
total = ufitslib.line_count(uc_out)
else:
total = ufitslib.line_count2(uc_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs')

#Build OTU table
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
Expand Down Expand Up @@ -218,7 +237,9 @@ def checkfastqsize(input):
print "OTU Table: %s" % final_otu_table
print "-------------------------------------------------------"

otu_print = final_otu.split('/')[-1]
tab_print = final_otu_table.split('/')[-1]
if 'win32' in sys.platform:
print "\nExample of next cmd: ufits filter -i %s -f %s -b <mock barcode>\n" % (final_otu_table, final_otu)
print "\nExample of next cmd: ufits filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)
else:
print colr.WARN + "\nExample of next cmd:" + colr.END + " ufits filter -i %s -f %s -b <mock barcode>\n" % (final_otu_table, final_otu)
print colr.WARN + "\nExample of next cmd:" + colr.END + " ufits filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)
9 changes: 7 additions & 2 deletions bin/ufits-filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __init__(self,prog):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))

#get usearch location/name
usearch = args.usearch
Expand Down Expand Up @@ -167,6 +167,7 @@ def __init__(self,prog):
filtered = pd.DataFrame(df, columns=fs.index)
filt2 = filtered.loc[(filtered != 0).any(1)]
os = filt2.sum(axis=1)
ufitslib.log.info("Removing singleton OTUs (OTUs with only 1 read from all samples)")
fotus = os[os > 2] #valid allele must be found more than 2 times, i.e. no singletons
filt3 = pd.DataFrame(filt2, index=fotus.index)

Expand Down Expand Up @@ -221,7 +222,11 @@ def __init__(self,prog):
ufitslib.log.info("Overwriting auto detect index-bleed, setting to %f%%" % (args.index_bleed*100))
bleedfilter = args.index_bleed
else:
ufitslib.log.info("Will use value of %f%% for index-bleed OTU filtering." % (bleedfilter*100))
if bleedfilter:
ufitslib.log.info("Will use value of %f%% for index-bleed OTU filtering." % (bleedfilter*100))
else:
bleedfilter = 0 #no filtering if you don't pass -p or -b
ufitslib.log.info("No spike-in mock (-b) or index-bleed (-p) specified, thus not running index-bleed filtering")

#to combat barcode switching, loop through each OTU filtering out if less than bleedfilter threshold
cleaned = []
Expand Down
2 changes: 1 addition & 1 deletion bin/ufits-process_illumina_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def worker(file):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))
usearch = args.usearch
try:
usearch_test = subprocess.Popen([usearch, '-version'], stdout=subprocess.PIPE).communicate()[0].rstrip()
Expand Down
2 changes: 1 addition & 1 deletion bin/ufits-process_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def worker(input):
ufitslib.log.debug(cmd_args)
print "-------------------------------------------------------"
#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))

#if SFF file passed, convert to FASTQ with biopython
if args.fastq.endswith(".sff"):
Expand Down
19 changes: 19 additions & 0 deletions lib/ufitslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,25 @@ def countfastq(input):
count = int(lines) / 4
return count

def line_count(fname):
    """Return the total number of lines in the file ``fname``.

    An empty file yields 0.
    """
    with open(fname) as handle:
        # Stream the file; each iterated line contributes 1 to the tally.
        return sum(1 for _ in handle)

def line_count2(fname):
    """Return the number of lines in ``fname`` that do not contain '*'.

    NOTE(review): in the caller this is run on a usearch ``.uc`` mapping
    file to count reads mapped to OTUs, where '*' appears in unmapped
    records -- confirm against caller.
    """
    count = 0
    # Mode 'r' instead of the legacy 'rU': the 'U' flag was deprecated in
    # Python 3.4 and removed in 3.11 (opening with 'rU' raises ValueError
    # there); plain text mode already gives universal-newline handling.
    with open(fname, 'r') as f:
        for line in f:
            if '*' not in line:
                count += 1
    return count

def get_version():
    """Return the installed ufits version string.

    Invokes ``ufits version`` as a subprocess and returns its captured
    stdout with trailing whitespace stripped.
    """
    proc = subprocess.Popen(['ufits', 'version'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout.rstrip()

def batch_iterator(iterator, batch_size):
entry = True #Make sure we loop once
while entry :
Expand Down
3 changes: 1 addition & 2 deletions ufits.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def download(url):
f.close()


version = '0.3.3'
version = '0.3.4'

default_help = """
Usage: ufits <command> <arguments>
Expand Down Expand Up @@ -241,7 +241,6 @@ def download(url):
-l, --length Length to trim reads. Default 250
--uchime_ref Run Chimera filtering. Default: off [ITS1, ITS2, Full]
--map_filtered Map quality filtered reads back to OTUs. Default: off
--skip_quality Skip quality trimming (e.g. reads are already quality trimmed)
--unoise Run De-noising pre-clustering (UNOISE). Default: off
--size_annotations Append size annotations to OTU names. Default: off
-u, --usearch USEARCH executable. Default: usearch8
Expand Down

0 comments on commit 1226264

Please sign in to comment.