Skip to content

Commit

Permalink
updates to v0.3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Apr 11, 2016
1 parent d467c48 commit 1226264
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 32 deletions.
25 changes: 25 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
BSD 2-Clause License

Copyright (c) 2016, Jonathan M. Palmer
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
73 changes: 47 additions & 26 deletions bin/ufits-OTU_cluster.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

#This script runs USEARCH OTU clustering
#written by Jon Palmer palmer.jona at gmail dot com
#written by Jon Palmer nextgenusfs@gmail.com

import sys, os, argparse, subprocess, inspect, csv, re, logging, shutil, multiprocessing
from Bio import SeqIO
Expand Down Expand Up @@ -40,7 +40,6 @@ class colr:
parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs')
parser.add_argument('--unoise', action='store_true', help='Run De-noising (UNOISE)')
parser.add_argument('--size_annotations', action='store_true', help='Append size annotations')
parser.add_argument('--skip_quality', action='store_true', help='Skip Quality trimming (data is already Q-trimmed)')
parser.add_argument('--cleanup', action='store_true', help='Remove Intermediate Files')
args=parser.parse_args()

Expand All @@ -60,7 +59,7 @@ def checkfastqsize(input):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))
usearch = args.usearch
try:
usearch_test = subprocess.Popen([usearch, '-version'], stdout=subprocess.PIPE).communicate()[0].rstrip()
Expand All @@ -69,39 +68,48 @@ def checkfastqsize(input):
os._exit(1)
ufitslib.log.info("USEARCH version: %s" % usearch_test)

#check if vsearch is installed
vsearch = ufitslib.which('vsearch')
if vsearch:
ufitslib.log.info("vsearch detected, will use for filtering")
else:
ufitslib.log.info("vsearch not found, using Python for filtering")

#make tmp folder
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
os.makedirs(tmp)

#Count FASTQ records
ufitslib.log.info("Loading FASTQ Records")
total = ufitslib.countfastq(args.FASTQ)
orig_total = ufitslib.countfastq(args.FASTQ)
size = checkfastqsize(args.FASTQ)
readablesize = ufitslib.convertSize(size)
ufitslib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')
ufitslib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#Expected Errors filtering step
#Expected Errors filtering step and convert to fasta
filter_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fq')
if not args.skip_quality:
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
with open(filter_out, 'w') as output:
SeqIO.write(ufitslib.MaxEEFilter(args.FASTQ, args.length, args.maxee), output, 'fastq')
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')
else:
filter_out = args.FASTQ

#convert to FASTA to save space for large files
filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa')
SeqIO.convert(filter_out, 'fastq', filter_fasta, 'fasta')
orig_fasta = os.path.join(tmp, args.out+'.orig.fa')
SeqIO.convert(args.FASTQ, 'fastq', orig_fasta, 'fasta')
ufitslib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
if vsearch:
subprocess.call(['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastq_trunclen', str(args.length), '--fastqout', filter_out, '--fastaout', filter_fasta], stdout = FNULL, stderr = FNULL)
subprocess.call(['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta], stdout = FNULL, stderr = FNULL)
else:
with open(filter_out, 'w') as output:
SeqIO.write(ufitslib.MaxEEFilter(args.FASTQ, args.length, args.maxee), output, 'fastq')
SeqIO.convert(args.FASTQ, 'fastq', orig_fasta, 'fasta')
SeqIO.convert(filter_out, 'fastq', filter_fasta, 'fasta')
total = ufitslib.countfastq(filter_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

#now run full length dereplication (biopython)
#now run full length dereplication
derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa')
ufitslib.log.info("De-replication (remove duplicate reads)")
ufitslib.dereplicate(filter_out, derep_out)
if vsearch:
subprocess.call(['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out], stdout = FNULL, stderr = FNULL)
else:
ufitslib.dereplicate(filter_out, derep_out)
total = ufitslib.countfasta(derep_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads passed')

Expand All @@ -118,10 +126,11 @@ def checkfastqsize(input):

#now run usearch 8 sort by size
sort_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.sort.fa')
ufitslib.log.info("Sorting reads by size (USEARCH8)")
ufitslib.log.info("Sorting reads by size")
ufitslib.log.debug("%s -sortbysize %s -minsize %s -fastaout %s" % (usearch, unoise_out, args.minsize, sort_out))
subprocess.call([usearch, '-sortbysize', unoise_out, '-minsize', args.minsize, '-fastaout', sort_out], stdout = FNULL, stderr = FNULL)


#now run clustering algorithm
radius = str(100 - int(args.pct_otu))
otu_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otus.fa')
Expand Down Expand Up @@ -188,9 +197,19 @@ def checkfastqsize(input):
else:
reads = orig_fasta

ufitslib.log.info("Mapping Reads to OTUs (USEARCH8)")
ufitslib.log.debug("%s -usearch_global %s -strand plus -id 0.97 -db %s -uc %s" % (usearch, reads, uchime_out, uc_out))
subprocess.call([usearch, '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)
ufitslib.log.info("Mapping Reads to OTUs")
if vsearch:
subprocess.call(['vsearch', '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)
else:
ufitslib.log.debug("%s -usearch_global %s -strand plus -id 0.97 -db %s -uc %s" % (usearch, reads, uchime_out, uc_out))
subprocess.call([usearch, '-usearch_global', reads, '-strand', 'plus', '-id', '0.97', '-db', uchime_out, '-uc', uc_out], stdout = FNULL, stderr = FNULL)

#count reads mapped
if vsearch:
total = ufitslib.line_count(uc_out)
else:
total = ufitslib.line_count2(uc_out)
ufitslib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs')

#Build OTU table
otu_table = os.path.join(tmp, args.out + '.EE' + args.maxee + '.otu_table.txt')
Expand Down Expand Up @@ -218,7 +237,9 @@ def checkfastqsize(input):
print "OTU Table: %s" % final_otu_table
print "-------------------------------------------------------"

otu_print = final_otu.split('/')[-1]
tab_print = final_otu_table.split('/')[-1]
if 'win32' in sys.platform:
print "\nExample of next cmd: ufits filter -i %s -f %s -b <mock barcode>\n" % (final_otu_table, final_otu)
print "\nExample of next cmd: ufits filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)
else:
print colr.WARN + "\nExample of next cmd:" + colr.END + " ufits filter -i %s -f %s -b <mock barcode>\n" % (final_otu_table, final_otu)
print colr.WARN + "\nExample of next cmd:" + colr.END + " ufits filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print)
9 changes: 7 additions & 2 deletions bin/ufits-filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __init__(self,prog):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))

#get usearch location/name
usearch = args.usearch
Expand Down Expand Up @@ -167,6 +167,7 @@ def __init__(self,prog):
filtered = pd.DataFrame(df, columns=fs.index)
filt2 = filtered.loc[(filtered != 0).any(1)]
os = filt2.sum(axis=1)
ufitslib.log.info("Removing singleton OTUs (OTUs with only 1 read from all samples)")
fotus = os[os > 2] #valid allele must be found more than 2 times, i.e. no singletons
filt3 = pd.DataFrame(filt2, index=fotus.index)

Expand Down Expand Up @@ -221,7 +222,11 @@ def __init__(self,prog):
ufitslib.log.info("Overwriting auto detect index-bleed, setting to %f%%" % (args.index_bleed*100))
bleedfilter = args.index_bleed
else:
ufitslib.log.info("Will use value of %f%% for index-bleed OTU filtering." % (bleedfilter*100))
if bleedfilter:
ufitslib.log.info("Will use value of %f%% for index-bleed OTU filtering." % (bleedfilter*100))
else:
bleedfilter = 0 #no filtering if you don't pass -p or -b
ufitslib.log.info("No spike-in mock (-b) or index-bleed (-p) specified, thus not running index-bleed filtering")

#to combat barcode switching, loop through each OTU filtering out if less than bleedfilter threshold
cleaned = []
Expand Down
2 changes: 1 addition & 1 deletion bin/ufits-process_illumina_folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def worker(file):
print "-------------------------------------------------------"

#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))
usearch = args.usearch
try:
usearch_test = subprocess.Popen([usearch, '-version'], stdout=subprocess.PIPE).communicate()[0].rstrip()
Expand Down
2 changes: 1 addition & 1 deletion bin/ufits-process_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def worker(input):
ufitslib.log.debug(cmd_args)
print "-------------------------------------------------------"
#initialize script, log system info and usearch version
ufitslib.log.info("Operating system: %s" % sys.platform)
ufitslib.log.info("Operating system: %s, %s" % (sys.platform, ufitslib.get_version()))

#if SFF file passed, convert to FASTQ with biopython
if args.fastq.endswith(".sff"):
Expand Down
19 changes: 19 additions & 0 deletions lib/ufitslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,25 @@ def countfastq(input):
count = int(lines) / 4
return count

def line_count(fname):
    """Return the total number of lines in the file ``fname``.

    An empty file yields 0.
    """
    with open(fname) as handle:
        # Stream the file; each iterated line contributes 1 to the tally.
        return sum(1 for _ in handle)

def line_count2(fname):
    """Return the number of lines in ``fname`` that do not contain '*'.

    NOTE(review): in the caller this is run on a usearch ``.uc`` mapping
    file to count reads mapped to OTUs, where '*' appears in unmapped
    records -- confirm against caller.
    """
    count = 0
    # Mode 'r' instead of the legacy 'rU': the 'U' flag was deprecated in
    # Python 3.4 and removed in 3.11 (opening with 'rU' raises ValueError
    # there); plain text mode already gives universal-newline handling.
    with open(fname, 'r') as f:
        for line in f:
            if '*' not in line:
                count += 1
    return count

def get_version():
    """Return the installed ufits version string.

    Invokes ``ufits version`` as a subprocess and returns its captured
    stdout with trailing whitespace stripped.
    """
    proc = subprocess.Popen(['ufits', 'version'], stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    return stdout.rstrip()

def batch_iterator(iterator, batch_size):
entry = True #Make sure we loop once
while entry :
Expand Down
3 changes: 1 addition & 2 deletions ufits.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def download(url):
f.close()


version = '0.3.3'
version = '0.3.4'

default_help = """
Usage: ufits <command> <arguments>
Expand Down Expand Up @@ -241,7 +241,6 @@ def download(url):
-l, --length Length to trim reads. Default 250
--uchime_ref Run Chimera filtering. Default: off [ITS1, ITS2, Full]
--map_filtered Map quality filtered reads back to OTUs. Default: off
--skip_quality Skip quality trimming (e.g. reads are already quality trimmed)
--unoise Run De-noising pre-clustering (UNOISE). Default: off
--size_annotations Append size annotations to OTU names. Default: off
-u, --usearch USEARCH executable. Default: usearch8
Expand Down

0 comments on commit 1226264

Please sign in to comment.