more minor bug fixes, moving towards v0.10.3

nextgenusfs · Sep 6, 2017 · 59b7222
1 parent cc871a0
commit 59b7222
Show file tree

Hide file tree

Showing 4 changed files with 21 additions and 23 deletions.
diff --git a/bin/amptk-extract_region.py b/bin/amptk-extract_region.py
@@ -83,8 +83,12 @@ def dereplicate(input, output):
                     seqs[sequence] = rec.description
                 else:
                     #check length of taxonomy string, keep one with more tax info
-                    newTax = rec.description.split(',')
-                    oldTax = seqs.get(sequence).split(',')
+                    newHeader = rec.description.split(';tax=')
+                    oldHeader = seqs.get(sequence).split(';tax=')
+                    newTax = newHeader[-1].split(',')
+                    oldTax = oldHeader[-1].split(',')
+                    newID = newHeader[0]
+                    oldID = oldHeader[0]
                     newTaxLen = len(newTax)
                     oldTaxLen = len(oldTax)
                     if newTaxLen > oldTaxLen:
@@ -102,7 +106,7 @@ def dereplicate(input, output):
                                     if newTax[-num] == oldTax[-num]:
                                         lca = num-1
                                         break
-                                consensusTax = ','.join(oldTax[:-lca])
+                                consensusTax = oldID+';tax='+','.join(oldTax[:-lca])
                                 amptklib.log.debug("setting taxonomy to %s" % (consensusTax))
                                 seqs[sequence] = consensusTax
         #now write to file     
@@ -314,22 +318,6 @@ def stripPrimer(records):
                 yield rec
 
 def makeDB(input):
-    #need usearch for this, test to make sure version is ok with utax
-    usearch = args.usearch
-    try:
-        usearch_test = subprocess.Popen([usearch, '-version'], stdout=subprocess.PIPE).communicate()[0].rstrip()
-    except OSError:
-        amptklib.log.error("%s not found in your PATH, exiting." % usearch)
-        os._exit(1)
-    version = usearch_test.split(" v")[1]
-    majorV = version.split(".")[0]
-    minorV = version.split(".")[1]
-    if int(majorV) < 8 or (int(majorV) >= 8 and int(minorV) < 1):
-        amptklib.log.warning("USEARCH version: %s detected you need v8.1.1756 or above" % usearch_test)
-        os._exit(1)
-    else:
-        amptklib.log.info("USEARCH version: %s" % usearch_test)
-
     db_details = args.out + '.udb.txt'
     usearch_db = args.out + '.udb'
     if args.trimming:
@@ -340,7 +328,6 @@ def makeDB(input):
         details.write(db_string)
     report = args.out + '.report.txt'
 
-
     if args.create_db == 'utax':
         #create log file for this to troubleshoot
         utax_log = args.out + '.utax.log'
@@ -465,7 +452,7 @@ def worker(input):
 if args.derep_fulllength:
     Passed = amptklib.countfasta(OutName)
     amptklib.log.info('{0:,}'.format(Passed) + ' records passed (%.2f%%)' % (Passed*100.0/SeqCount))
-    amptklib.log.info("Now dereplicating sequences (remove if sequence and header identical)")
+    amptklib.log.info("Now dereplicating sequences (collapsing identical sequences)")
     derep_tmp = args.out + '.derep.extracted.fa'
     os.rename(OutName, derep_tmp)
     dereplicate(derep_tmp, OutName)

diff --git a/bin/amptk-filter.py b/bin/amptk-filter.py
@@ -318,6 +318,9 @@ def __init__(self,prog):
     #first calculate bleed out of mock community
     #slice normalized dataframe to get only mock OTUs from table
     mock_df = pd.DataFrame(norm_round, index=mock)
+    #if there are samples to drop, make sure they aren't being used in this calculation
+    if args.drop:
+        mock_df.drop(args.drop, axis=1, inplace=True)
     #get total number of reads from mock OTUs from entire table
     total = np.sum(np.sum(mock_df,axis=None))
     #now drop the mock barcode sample

diff --git a/bin/amptk-process_ion.py b/bin/amptk-process_ion.py
@@ -205,7 +205,15 @@ def processRead(input):
             outputSeqFile.close()
             inputSeqFile.close()
     else:
-        shutil.copyfile(args.barcode_fasta, barcode_file)
+        #check for multi_samples and add if necessary
+        if args.multi == 'False':
+            shutil.copyfile(args.barcode_fasta, barcode_file)
+        else:
+            with open(barcode_file, 'w') as barcodeout:
+                with open(args.barcode_fasta, 'rU') as input:
+                    for rec in SeqIO.parse(input, 'fasta'):
+                        outname = args.multi+'.'+rec.id
+                        barcodeout.write(">%s\n%s\n" % (outname, rec.seq))         
 
     #parse primers here so doesn't conflict with mapping primers
     #look up primer db otherwise default to entry

diff --git a/lib/amptklib.py b/lib/amptklib.py
@@ -8,7 +8,7 @@
 
 ASCII = {'!':'0','"':'1','#':'2','$':'3','%':'4','&':'5',"'":'6','(':'7',')':'8','*':'9','+':'10',',':'11','-':'12','.':'13','/':'14','0':'15','1':'16','2':'17','3':'18','4':'19','5':'20','6':'21','7':'22','8':'23','9':'24',':':'25',';':'26','<':'27','=':'28','>':'29','?':'30','@':'31','A':'32','B':'33','C':'34','D':'35','E':'36','F':'37','G':'38','H':'39','I':'40','J':'41','K':'42','L':'43','M':'44','N':'45','O':'46','P':'47','Q':'48','R':'49','S':'50'}
 
-primer_db = {'fITS7': 'GTGARTCATCGAATCTTTG', 'ITS4': 'TCCTCCGCTTATTGATATGC', 'ITS1-F': 'CTTGGTCATTTAGAGGAAGTAA', 'ITS2': 'GCTGCGTTCTTCATCGATGC', 'ITS3': 'GCATCGATGAAGAACGCAGC', 'ITS4-B': 'CAGGAGACTTGTACACGGTCCAG', 'ITS1': 'TCCGTAGGTGAACCTGCGG', 'LR0R': 'ACCCGCTGAACTTAAGC', 'LR2R': 'AAGAACTTTGAAAAGAG', 'JH-LS-369rc': 'CTTCCCTTTCAACAATTTCAC', '16S_V3': 'CCTACGGGNGGCWGCAG', '16S_V4': 'GACTACHVGGGTATCTAATCC', 'ITS3_KYO2': 'GATGAAGAACGYAGYRAA', 'COI-F': 'GGTCAACAAATCATAAAGATATTGG', 'COI-R': 'GGWACTAATCAATTTCCAAATCC', '515FB': 'GTGYCAGCMGCCGCGGTAA', '806RB': 'GGACTACNVGGGTWTCTAAT'}
+primer_db = {'fITS7': 'GTGARTCATCGAATCTTTG', 'ITS4': 'TCCTCCGCTTATTGATATGC', 'ITS1-F': 'CTTGGTCATTTAGAGGAAGTAA', 'ITS2': 'GCTGCGTTCTTCATCGATGC', 'ITS3': 'GCATCGATGAAGAACGCAGC', 'ITS4-B': 'CAGGAGACTTGTACACGGTCCAG', 'ITS1': 'TCCGTAGGTGAACCTGCGG', 'LR0R': 'ACCCGCTGAACTTAAGC', 'LR2R': 'AAGAACTTTGAAAAGAG', 'JH-LS-369rc': 'CTTCCCTTTCAACAATTTCAC', '16S_V3': 'CCTACGGGNGGCWGCAG', '16S_V4': 'GACTACHVGGGTATCTAATCC', 'ITS3_KYO2': 'GATGAAGAACGYAGYRAA', 'COI-F': 'GGTCAACAAATCATAAAGATATTGG', 'COI-R': 'GGWACTAATCAATTTCCAAATCC', '515FB': 'GTGYCAGCMGCCGCGGTAA', '806RB': 'GGACTACNVGGGTWTCTAAT', 'ITS4-B21': 'CAGGAGACTTGTACACGGTCC'}
 
 
 degenNuc = [("R", "A"), ("R", "G"),