Skip to content

Commit

Permalink
update SRA-submit to use edlib, update menus, move BC function to sha…
Browse files Browse the repository at this point in the history
…red library
  • Loading branch information
Jon Palmer authored and Jon Palmer committed Jul 26, 2017
1 parent 7d478ac commit b8d2581
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 74 deletions.
11 changes: 10 additions & 1 deletion amptk.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def download(url, name):
--min_len Minimum length read to keep. Default: 100
--full_length Keep only full length sequences.
--barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa
--barcode_mismatch Number of mismatches in barcode to allow. Default: 0
--primer_mismatch Number of mismatches in primers to allow. Default: 2
--cpus Number of CPUs to use. Default: all
--mult_samples Combine multiple chip runs, name prefix for chip
Expand Down Expand Up @@ -163,6 +164,7 @@ def download(url, name):
-p, --pad Pad reads with Ns if shorter than --trim_len. Default: off [on,off]
--min_len Minimum length read to keep. Default: 100
--barcode_fasta FASTA file containing barcodes. Default: pgm_barcodes.fa
--barcode_mismatch Number of mismatches in barcode to allow. Default: 0
--reverse_barcode FASTA file containing 3' barcodes. Default: none
--full_length Keep only full length sequences.
--primer_mismatch Number of mismatches in primers to allow. Default: 2
Expand Down Expand Up @@ -206,6 +208,7 @@ def download(url, name):
--rescue_forward Rescue Forward Reads if PE do not merge, e.g. long amplicons. Default: on [on,off]
--require_primer Require the Forward primer to be present. Default: on [on,off]
--primer_mismatch Number of mismatches in primers to allow. Default: 2
--barcode_mismatch Number of mismatches in barcode to allow. Default: 1
--cpus Number of CPUs to use. Default: all
--cleanup Remove intermediate files.
--merge_method Software to use for PE merging. Default: usearch [usearch,vsearch]
Expand Down Expand Up @@ -282,6 +285,7 @@ def download(url, name):
--min_len Minimum length read to keep. Default: 50
--barcode_fasta FASTA file containing barcodes. (Required)
--reverse_barcode FASTA file containing 3' barcodes. Default: none
--barcode_mismatch Number of mismatches in barcode to allow. Default: 0
--primer_mismatch Number of mismatches in primers to allow. Default: 2
--cpus Number of CPUs to use. Default: all
""" % (sys.argv[1], version)
Expand Down Expand Up @@ -671,6 +675,7 @@ def download(url, name):
-o, --output Output file (Required)
-m, --method Type of heatmap. Default: clustermap [clustermap,heatmap]
-d, --delimiter Delimiter of OTU table. Default: tsv [tsv,csv]
-f, --format Figure format. Default: pdf [pdf,jpg,svg,png]
--font Font set. Default: arial
--color Color Palette. Default: gist_gray_r
--figsize Figure size. Default: 2x8
Expand All @@ -680,7 +685,10 @@ def download(url, name):
--cluster_method Clustering method for clustermap. Default: single [single,complete,average,weighted]
--scaling Scale the data by row. Default: None [None, z_score, standard]
--yaxis_fontsize Y-Axis Font Size. Default: 6
--xaxis_fontsize X-Axis Font Size. Default: 6
--xaxis_fontsize X-Axis Font Size. Default: 6
--normalize Normalize data based total, tsv file ID<tab>count
--normalize_counts Value to normalize counts to, i.e. 100000
--vmax Maximum value for heatmap coloration.
--debug Print pandas table on import to terminal
""" % (sys.argv[1], version)

Expand Down Expand Up @@ -926,6 +934,7 @@ def download(url, name):
-r, --rev_primer Reverse primer sequence. Default: ITS4
-a, --append Append a name to the output of all files in run, i.e. run1 -> Sample_run1
--primer_mismatch Number of mismatches allowed in primer search. Default: 2
--barcode_mismatch Number of mismatches in barcode to allow. Default: 0
--require_primer Require primer(s) to be present for output. Default: off [off,forward,both]
--min_len Minimum length read to keep after trimming barcodes. Default 50
---force Overwrite directory with same name
Expand Down
62 changes: 30 additions & 32 deletions bin/amptk-fastq2sra.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

import sys, os, re, gzip, argparse, inspect, csv, shutil, edlib
import sys, os, re, gzip, argparse, inspect, csv, shutil, edlib, multiprocessing
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
Expand Down Expand Up @@ -35,6 +35,7 @@ class col:
parser.add_argument('-t','--title', default='Fungal ITS', help='Start of title for SRA submission, name it according to amplicon')
parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
parser.add_argument('--require_primer', default='off', choices=['forward', 'both', 'off'], help='Require Primers to be present')
parser.add_argument('--force', action='store_true', help='Overwrite existing directory')
parser.add_argument('-a','--append', help='Append a name to all sample names for a run, i.e. --append run1 would yield Sample_run1')
Expand Down Expand Up @@ -168,7 +169,6 @@ def FindBarcode(Seq, BarcodeDict):
filelist.append(file)

else:
#count FASTQ records in input
#start here to process the reads, first reverse complement the reverse primer
ReverseCompRev = revcomp_lib.RevComp(RevPrimer)

Expand Down Expand Up @@ -200,10 +200,18 @@ def FindBarcode(Seq, BarcodeDict):
name = ID + ".fastq"
continue
Barcodes[name]=line.strip()


#check for compressed input file
if args.FASTQ.endswith('.gz'):
amptklib.log.info("Gzipped input files detected, uncompressing")
FASTQ_IN = args.FASTQ.replace('.gz', '')
amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count())
else:
FASTQ_IN = args.FASTQ

#count FASTQ records in input
amptklib.log.info("Loading FASTQ Records")
total = amptklib.countfastq(args.FASTQ)
total = amptklib.countfastq(FASTQ_IN)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')
Expand All @@ -216,41 +224,36 @@ def FindBarcode(Seq, BarcodeDict):
elif args.require_primer == 'both':
amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s" % (len(Barcodes), FwdPrimer, RevPrimer))


#this will loop through FASTQ file once, splitting those where barcodes are found, and primers trimmed
runningTotal = 0
trim = len(FwdPrimer)
with open(args.FASTQ, 'rU') as input:
with open(FASTQ_IN, 'rU') as input:
for title, seq, qual in FastqGeneralIterator(input):
Barcode, BarcodeLabel = FindBarcode(seq, Barcodes)
if Barcode == "": #if not found, move onto next record
Barcode, BarcodeLabel = amptklib.AlignBarcode(seq, Barcodes, args.barcode_mismatch)
if Barcode == "":
continue
#trim barcode from sequence
BarcodeLength = len(Barcode)
seq = seq[BarcodeLength:]
qual = qual[BarcodeLength:]
#look for forward primer
if args.require_primer != 'off': #means we only want ones with forward primer and or reverse
Diffs = primer.MatchPrefix(seq, FwdPrimer)
if Diffs > args.primer_mismatch:
if args.require_primer != 'off': #means we only want ones with forward primer and or reverse, but don't remove them
#now search for forward primer
foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch)
if foralign["editDistance"] < 0:
continue
#if found, trim away primer
seq = seq[trim:]
qual = qual[trim:]
if args.require_primer == 'both':
#look for reverse primer, strip if found
BestPosRev, BestDiffsRev = primer.BestMatch2(seq, ReverseCompRev, args.primer_mismatch)
if BestPosRev > 0:
seq = seq[:BestPosRev]
qual = qual[:BestPosRev]
else:
continue
if args.require_primer == 'both':
#now search for reverse primer
revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch)
if revalign["editDistance"] < 0: #reverse primer was not found
continue
#check size
if len(seq) < args.min_len: #filter out sequences less than minimum length.
continue
runningTotal += 1
fileout = os.path.join(args.out, BarcodeLabel)
with open(fileout, 'ab') as output:
output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))

if args.require_primer == 'off':
amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode')
elif args.require_primer == 'forward':
Expand All @@ -259,18 +262,10 @@ def FindBarcode(Seq, BarcodeDict):
amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and both primers')

amptklib.log.info("Now Gzipping files")
gzip_list = []
for file in os.listdir(args.out):
if file.endswith(".fastq"):
gzip_list.append(file)

for file in gzip_list:
file_path = os.path.join(args.out, file)
new_path = file_path + '.gz'
with open(file_path, 'rU') as orig_file:
with gzip.open(new_path, 'w') as zipped_file:
zipped_file.writelines(orig_file)
os.remove(file_path)
amptklib.Fzip_inplace(file_path)

#after all files demuxed into output folder, loop through and create SRA metadata file
filelist = []
Expand All @@ -279,6 +274,9 @@ def FindBarcode(Seq, BarcodeDict):
filelist.append(file)

amptklib.log.info("Finished: output in %s" % args.out)
#clean up if gzipped
if args.FASTQ.endswith('.gz'):
amptklib.removefile(FASTQ_IN)

#check for BioSample meta file
if args.biosample:
Expand Down
41 changes: 2 additions & 39 deletions bin/amptk-process_ion.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,43 +50,6 @@ class col:
parser.add_argument('-u','--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
args=parser.parse_args()

def FindBarcode(Seq, BarcodeDict):
for BarcodeLabel in BarcodeDict.keys():
Barcode = BarcodeDict[BarcodeLabel]
if Seq.startswith(Barcode):
return Barcode, BarcodeLabel
return "", ""

def AlignBarcode(Seq, BarcodeDict):
besthit = ('', '', '')
for BL in BarcodeDict.keys():
B = BarcodeDict[BL]
align = edlib.align(B, Seq, mode="SHW", k=args.barcode_mismatch+1)
if align["editDistance"] == 0:
return B, BL
elif args.barcode_mismatch == 0:
continue
elif align["editDistance"] > 0:
if align["editDistance"] < besthit[2]:
besthit = (B,BL,align["editDistance"])
if besthit[2] <= args.barcode_mismatch:
return besthit[0], besthit[1]
return "", ""

def AlignRevBarcode(Seq, BarcodeDict):
besthit = ('', '', '')
for BL in BarcodeDict.keys():
B = BarcodeDict[BL]
align = edlib.align(B, Seq, mode="HW", k=args.barcode_mismatch+1)
if align["editDistance"] == 0:
return B, BL
elif align["editDistance"] > 0:
if align["editDistance"] < besthit[2]:
besthit = (B,BL,align["editDistance"])
if besthit[2] <= args.barcode_mismatch:
return besthit[0], besthit[1]
return "", ""

def processRead(input):
base = os.path.basename(input).split('.')[0]
PL = len(FwdPrimer)
Expand All @@ -105,7 +68,7 @@ def processRead(input):
for title, seq, qual in FastqGeneralIterator(open(input)):
Total += 1
#look for barcode, trim it off
Barcode, BarcodeLabel = AlignBarcode(seq, Barcodes)
Barcode, BarcodeLabel = amptklib.AlignBarcode(seq, Barcodes, args.barcode_mismatch)
if Barcode == "":
NoBarcode += 1
continue
Expand All @@ -129,7 +92,7 @@ def processRead(input):
RevBCdiffs = 0
BCcut = revalign["locations"][0][1]
CutSeq = Seq[BCcut:]
RevBarcode, RevBarcodeLabel = AlignRevBarcode(CutSeq, RevBarcodes)
RevBarcode, RevBarcodeLabel = amptklib.AlignRevBarcode(CutSeq, RevBarcodes, args.barcode_mismatch)
if RevBarcode == "":
NoRevBarcode += 1
continue
Expand Down
35 changes: 33 additions & 2 deletions lib/amptklib.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import sys, logging, csv, os, subprocess, multiprocessing, platform, time, shutil, inspect, gzip
import sys, logging, csv, os, subprocess, multiprocessing, platform, time, shutil, inspect, gzip, edlib
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
from Bio import SeqIO
Expand Down Expand Up @@ -299,7 +299,38 @@ def illuminaBCmismatch(R1, R2, index):
if BC != index:
remove.append(ID)
remove = set(remove)
return remove
return remove

def AlignBarcode(Seq, BarcodeDict, mismatch):
besthit = ('', '', '')
for BL in BarcodeDict.keys():
B = BarcodeDict[BL]
align = edlib.align(B, Seq, mode="SHW", k=int(mismatch))
if align["editDistance"] == 0:
return B, BL
elif int(mismatch) == 0:
continue
elif align["editDistance"] > 0:
if align["editDistance"] < besthit[2]:
besthit = (B,BL,align["editDistance"])
if besthit[2] <= int(mismatch):
return besthit[0], besthit[1]
return "", ""

def AlignRevBarcode(Seq, BarcodeDict, mismatch):
besthit = ('', '', '')
for BL in BarcodeDict.keys():
B = BarcodeDict[BL]
align = edlib.align(B, Seq, mode="HW", k=int(mismatch))
if align["editDistance"] == 0:
return B, BL
elif align["editDistance"] > 0:
if align["editDistance"] < besthit[2]:
besthit = (B,BL,align["editDistance"])
if besthit[2] <= int(mismatch):
return besthit[0], besthit[1]
return "", ""


def MergeReads(R1, R2, tmpdir, outname, read_length, minlen, usearch, rescue, method, index, mismatch):
removelist = []
Expand Down

0 comments on commit b8d2581

Please sign in to comment.