Skip to content

Commit

Permalink
fix uchime_ref bug in amptk dada2
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer committed May 3, 2018
1 parent e98798b commit a86aa25
Showing 1 changed file with 19 additions and 29 deletions.
48 changes: 19 additions & 29 deletions bin/amptk-dada2.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,12 @@ def getAvgLength(input):

#get basename if not args.out passed
if args.out:
base = args.out
base = args.out
else:
if 'demux' in args.fastq:
base = os.path.basename(args.fastq).split('.demux')[0]
else:
base = os.path.basename(args.fastq).split('.f')[0]
if 'demux' in args.fastq:
base = os.path.basename(args.fastq).split('.demux')[0]
else:
base = os.path.basename(args.fastq).split('.f')[0]

#remove logfile if exists
log_name = base + '.amptk-dada2.log'
Expand All @@ -111,9 +111,9 @@ def getAvgLength(input):

#get number of cores
if args.cpus:
CORES = str(args.cpus)
CORES = str(args.cpus)
else:
CORES = str(amptklib.getCPUS())
CORES = str(amptklib.getCPUS())

#check dependencies
programs = ['Rscript']
Expand All @@ -133,10 +133,10 @@ def getAvgLength(input):
amptklib.log.info("Loading FASTQ Records")
no_ns = base+'.cleaned_input.fq'
if args.fastq.endswith('.gz'):
fastqInput = args.fastq.replace('.gz', '')
amptklib.Funzip(os.path.abspath(args.fastq), os.path.basename(fastqInput), CORES)
fastqInput = args.fastq.replace('.gz', '')
amptklib.Funzip(os.path.abspath(args.fastq), os.path.basename(fastqInput), CORES)
else:
fastqInput = os.path.abspath(args.fastq)
fastqInput = os.path.abspath(args.fastq)
amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
demuxtmp = base+'.original.fa'
cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns),'--fastq_qmax', '55', '--fastaout', demuxtmp, '--threads', CORES]
Expand Down Expand Up @@ -206,7 +206,7 @@ def getAvgLength(input):
counts = sum(countList)
ID = 'ASV'+str(counter)
if not ID in OTUCounts:
OTUCounts[ID] = counts
OTUCounts[ID] = counts
writefasta.write(">%s\n%s\n" % (ID, Seq))
counter += 1

Expand All @@ -230,8 +230,8 @@ def getAvgLength(input):
os.rename(fastaout, iSeqs)
else:
#check if file is present, remove from previous run if it is.
if os.path.isfile(uchime_out):
amptklib.removefile(uchime_out)
if os.path.isfile(iSeqs):
amptklib.removefile(iSeqs)
#R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']: #test if it is one that is setup, otherwise default to full path
uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref+'.extracted.fa')
Expand All @@ -243,25 +243,15 @@ def getAvgLength(input):
uchime_db = os.path.abspath(args.uchime_ref)
else:
amptklib.log.error("%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref)
uchime_out = fastaout
iSeqs = fastaout
#now run chimera filtering if all checks out
if not os.path.isfile(uchime_out):
if not os.path.isfile(iSeqs):
amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', uchime_out, '--threads', CORES]
cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', iSeqs, '--threads', CORES]
amptklib.runSubprocess(cmd, amptklib.log)
total = amptklib.countfasta(uchime_out)
total = amptklib.countfasta(iSeqs)
uchime_chimeras = validSeqs - total
amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed')

#now reformat OTUs and OTU table, dropping chimeric OTUs from table, sorting the output as well
nonchimeras = amptklib.fasta2list(uchime_out)
inferredSeqs = SeqIO.index(uchime_out, 'fasta')
with open(iSeqs, 'w') as iSeqout:
for x in natsorted(nonchimeras):
SeqIO.write(inferredSeqs[x], iSeqout, 'fasta')
if not args.debug:
#clean up chimeras fasta
amptklib.removefile(uchime_out)
if os.path.isfile(fastaout):
amptklib.removefile(fastaout)

Expand Down Expand Up @@ -346,8 +336,8 @@ def getAvgLength(input):
otu_print = bioSeqs.split('/')[-1]
tab_print = bioTable.split('/')[-1]
if 'darwin' in sys.platform:
print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
else:
print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))


0 comments on commit a86aa25

Please sign in to comment.