
Mabs 2.18
shelkmike committed Apr 7, 2023
1 parent 3771b51 commit cb3864d
Showing 95 changed files with 79,203 additions and 8,140 deletions.
5 changes: 5 additions & 0 deletions Additional_src/Flye/Makefile
Original file line number Diff line number Diff line change
@@ -11,6 +11,11 @@ export SAMTOOLS_DIR = ${ROOT_DIR}/lib/samtools-1.9
export CXXFLAGS += ${LIBCUCKOO} ${INTERVAL_TREE} ${LEMON} -I${MINIMAP2_DIR}
export LDFLAGS += -lz -L${MINIMAP2_DIR} -lminimap2

ifeq ($(shell uname -m),arm64)
export arm_neon=1
export aarch64=1
endif

.PHONY: clean all profile debug minimap2 samtools

.DEFAULT_GOAL := all
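The architecture check added to the Makefile above can be sketched in Python for clarity. This is an illustrative equivalent, not part of Flye; the function name `is_arm64` is made up here:

```python
import platform

def is_arm64():
    """Roughly equivalent to the Makefile's `uname -m` test: true on
    Apple Silicon ("arm64") and on ARM64 Linux hosts ("aarch64")."""
    return platform.machine().lower() in ("arm64", "aarch64")
```

The Makefile exports both `arm_neon` and `aarch64` so that sub-builds (e.g. minimap2) pick the NEON code paths.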
7 changes: 6 additions & 1 deletion Additional_src/Flye/README.md
@@ -3,7 +3,7 @@ Flye assembler

[![BioConda Install](https://img.shields.io/conda/dn/bioconda/flye.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/flye)

### Version: 2.9.1
### Version: 2.9.2

Flye is a de novo assembler for single-molecule sequencing reads,
such as those produced by PacBio and Oxford Nanopore Technologies.
@@ -26,6 +26,11 @@ Manuals
Latest updates
--------------

### Flye 2.9.2 release (18 March 2023)
* Update to minimap 2.24 + using HiFi and Kit14 parameters for faster alignment
* Fixed a few small bugs and corner cases
* Polisher now outputs read depth for each base of consensus (can be used as confidence measure)

### Flye 2.9.1 release (07 Aug 2022)
* New option --no-alt-contigs to remove all non-primary contigs from the assembly
* Fixed crash on MacOS with Python 3.8+
13 changes: 7 additions & 6 deletions Additional_src/Flye/docs/FAQ.md
@@ -205,15 +205,16 @@ Do I need to preprocess my reads in any way?
--------------------------------------------

No, usually it is not necessary. Flye automatically filters out
chimeric reads or reads with bad ends. If the read coverage is very high,
you can use the built-in `--asm-coverage` option for subsampling the longest ones.
chimeric reads or reads with bad ends. Adapter trimming and quality
filtering is not needed either.

Note that in PacBio mode, Flye assumes that the input files represent PacBio subreads,
If the read coverage is very high, you can use the built-in `--asm-coverage` option for subsampling the longest ones.

Note that in PacBio CLR mode, Flye assumes that the input files represent PacBio subreads,
e.g. adaptors and scraps are removed and multiple passes of the same insertion
sequence are separated. This is typically handled by PacBio instruments/toolchains,
however we saw examples of problematic raw -> fastq conversions,
which resulted in incorrect subreads. In this case,
consider using [pbclip](https://github.com/fenderglass/pbclip) to fix your Fasta/q reads.
however we saw examples of problematic raw -> fastq conversions with old CLR data.
In this case, consider using [pbclip](https://github.com/fenderglass/pbclip) to fix your Fasta/q reads.

Are cluster environments (SGE / Slurm etc.) supported?
------------------------------------------------------
6 changes: 6 additions & 0 deletions Additional_src/Flye/docs/NEWS.md
@@ -1,3 +1,9 @@
Flye 2.9.2 release (18 March 2023)
=================================
* Update to minimap 2.24 + using HiFi and Kit14 parameters for faster alignment
* Fixed a few small bugs and corner cases
* Polisher now outputs read depth for each base of consensus (can be used as confidence measure)

Flye 2.9.1 release (07 Aug 2022)
===============================
* New option --no-alt-contigs to remove all non-primary contigs from the assembly
Expand Down
2 changes: 1 addition & 1 deletion Additional_src/Flye/flye/__build__.py
@@ -1 +1 @@
__build__ = 1780
__build__ = 1786
2 changes: 1 addition & 1 deletion Additional_src/Flye/flye/__version__.py
@@ -1 +1 @@
__version__ = "2.9.1"
__version__ = "2.9.2"
2 changes: 1 addition & 1 deletion Additional_src/Flye/flye/assembly/assemble.py
@@ -41,7 +41,7 @@ def assemble(args, run_params, out_file, log_file, config_path):
logger.info("Assembling disjointigs")
logger.debug("-----Begin assembly log------")
cmdline = [ASSEMBLE_BIN, "assemble", "--reads", ",".join(args.reads), "--out-asm", out_file,
"--config", config_path, "--log", log_file, "--threads", str(args.threads)]
"--config", config_path, "--log", log_file, "--threads", str(1 if args.deterministic else args.threads)]
if args.debug:
cmdline.append("--debug")
if args.meta:
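The thread-count change above reduces to a one-line rule. A minimal sketch, assuming the new `--deterministic` flag; the helper name `effective_threads` is hypothetical:

```python
def effective_threads(requested, deterministic):
    # The --deterministic flag forces single-threaded disjointig assembly
    # so that results are reproducible from run to run; otherwise the
    # user-requested thread count is passed through unchanged.
    return 1 if deterministic else requested
```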
119 changes: 15 additions & 104 deletions Additional_src/Flye/flye/main.py
@@ -30,8 +30,8 @@
from flye.utils.sam_parser import AlignmentException
import flye.utils.fasta_parser as fp
#import flye.short_plasmids.plasmids as plas
import flye.trestle.trestle as tres
import flye.trestle.graph_resolver as tres_graph
#import flye.trestle.trestle as tres
#import flye.trestle.graph_resolver as tres_graph
from flye.repeat_graph.repeat_graph import RepeatGraph
from flye.six.moves import range

@@ -124,43 +124,6 @@ def run(self):
logger.debug("Disjointigs length: %d, N50: %d", asm_len, asm_n50)


#class JobShortPlasmidsAssembly(Job):
# def __init__(self, args, work_dir, contigs_file, repeat_graph,
# graph_edges):
# super(JobShortPlasmidsAssembly, self).__init__()
#
# self.args = args
# self.work_dir = work_dir
# self.work_dir = os.path.join(work_dir, "22-plasmids")
# self.contigs_path = contigs_file
# self.repeat_graph = repeat_graph
# self.graph_edges = graph_edges
#
# self.name = "plasmids"
# self.out_files["repeat_graph"] = os.path.join(self.work_dir,
# "repeat_graph_dump")
# self.out_files["repeat_graph_edges"] = \
# os.path.join(self.work_dir, "repeat_graph_edges.fasta")
#
#
# def run(self):
# super(JobShortPlasmidsAssembly, self).run()
# logger.info("Recovering short unassembled sequences")
# if not os.path.isdir(self.work_dir):
# os.mkdir(self.work_dir)
# plasmids = plas.assemble_short_plasmids(self.args, self.work_dir,
# self.contigs_path)
#
# #updating repeat graph
# repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
# repeat_graph.load_from_file(self.repeat_graph)
# plas.update_graph(repeat_graph, plasmids)
# repeat_graph.dump_to_file(self.out_files["repeat_graph"])
# fp.write_fasta_dict(repeat_graph.edges_fasta,
# self.out_files["repeat_graph_edges"])



class JobRepeat(Job):
def __init__(self, args, work_dir, log_file, disjointigs):
super(JobRepeat, self).__init__()
@@ -314,8 +277,7 @@ def run(self):
logger.info("Running Minimap2")
out_alignment = os.path.join(self.consensus_dir, "minimap.bam")
aln.make_alignment(self.in_contigs, self.args.reads, self.args.threads,
self.consensus_dir, self.args.platform, out_alignment,
reference_mode=True, sam_output=True)
self.args.platform, self.args.read_type, out_alignment)

contigs_info = aln.get_contigs_info(self.in_contigs)
logger.info("Computing consensus")
@@ -363,68 +325,14 @@ def run(self):
self.out_files["stats"], self.out_files["contigs"])
pol.generate_polished_edges(self.in_graph_edges, self.in_graph_gfa,
self.out_files["contigs"],
self.polishing_dir, self.args.platform,
self.polishing_dir, self.args.platform, self.args.read_type,
stats, self.args.threads)
os.remove(contigs)
if os.path.getsize(self.out_files["contigs"]) == 0:
raise asm.AssembleException("No contigs were assembled - "
"pipeline stopped")


class JobTrestle(Job):
def __init__(self, args, work_dir, log_file, repeat_graph,
graph_edges, reads_alignment_file):
super(JobTrestle, self).__init__()

self.args = args
self.work_dir = os.path.join(work_dir, "21-trestle")
self.log_file = log_file
#self.repeats_dump = repeats_dump
self.graph_edges = graph_edges
self.repeat_graph = repeat_graph
self.reads_alignment_file = reads_alignment_file

self.name = "trestle"
self.out_files["repeat_graph"] = os.path.join(self.work_dir,
"repeat_graph_dump")
self.out_files["repeat_graph_edges"] = \
os.path.join(self.work_dir, "repeat_graph_edges.fasta")

def run(self):
super(JobTrestle, self).run()

if not os.path.isdir(self.work_dir):
os.mkdir(self.work_dir)

summary_file = os.path.join(self.work_dir, "trestle_summary.txt")
resolved_repeats_seqs = os.path.join(self.work_dir,
"resolved_copies.fasta")
repeat_graph = RepeatGraph(fp.read_sequence_dict(self.graph_edges))
repeat_graph.load_from_file(self.repeat_graph)

try:
repeats_info = tres_graph \
.get_simple_repeats(repeat_graph, self.reads_alignment_file,
fp.read_sequence_dict(self.graph_edges))
tres_graph.dump_repeats(repeats_info,
os.path.join(self.work_dir, "repeats_dump"))

tres.resolve_repeats(self.args, self.work_dir, repeats_info,
summary_file, resolved_repeats_seqs)
tres_graph.apply_changes(repeat_graph, summary_file,
fp.read_sequence_dict(resolved_repeats_seqs))
except KeyboardInterrupt as e:
raise
#except Exception as e:
# logger.warning("Caught unhandled exception: " + str(e))
# logger.warning("Continuing to the next pipeline stage. "
# "Please submit a bug report along with the full log file")

repeat_graph.dump_to_file(self.out_files["repeat_graph"])
fp.write_fasta_dict(repeat_graph.edges_fasta,
self.out_files["repeat_graph_edges"])


def _create_job_list(args, work_dir, log_file):
"""
Build pipeline as a list of consecutive jobs
@@ -452,13 +360,12 @@ def _create_job_list(args, work_dir, log_file):

#Trestle: Resolve Unbridged Repeats
#if not args.no_trestle and not args.meta and args.read_type == "raw":
if args.trestle:
jobs.append(JobTrestle(args, work_dir, log_file,
repeat_graph, repeat_graph_edges,
reads_alignment))
repeat_graph_edges = jobs[-1].out_files["repeat_graph_edges"]
repeat_graph = jobs[-1].out_files["repeat_graph"]

#if args.trestle:
# jobs.append(JobTrestle(args, work_dir, log_file,
# repeat_graph, repeat_graph_edges,
# reads_alignment))
# repeat_graph_edges = jobs[-1].out_files["repeat_graph_edges"]
# repeat_graph = jobs[-1].out_files["repeat_graph"]

#Contigger
jobs.append(JobContigger(args, work_dir, log_file, repeat_graph_edges,
@@ -619,7 +526,8 @@ def _usage():
"\t [--meta] [--polish-target] [--min-overlap SIZE]\n"
"\t [--keep-haplotypes] [--debug] [--version] [--help] \n"
"\t [--scaffold] [--resume] [--resume-from] [--stop-after] \n"
"\t [--read-error float] [--extra-params]")
"\t [--read-error float] [--extra-params] \n"
"\t [--deterministic]")


def _epilog():
@@ -752,6 +660,9 @@ def check_int_range(value, min_val, max_val, require_odd=False):
dest="debug", default=False,
help="enable debug output")
parser.add_argument("-v", "--version", action="version", version=_version())
parser.add_argument("--deterministic", action="store_true",
dest="deterministic", default=False,
help="perform disjointig assembly single-threaded")
args = parser.parse_args()

if args.asm_coverage and (args.genome_size is None):
64 changes: 29 additions & 35 deletions Additional_src/Flye/flye/polishing/alignment.py
@@ -39,20 +39,9 @@ def check_binaries():


def make_alignment(reference_file, reads_file, num_proc,
work_dir, platform, out_alignment, reference_mode,
sam_output):
"""
Runs minimap2 and sorts its output
"""
minimap_ref_mode = {False: "ava", True: "map"}
minimap_reads_mode = {"nano": "ont", "pacbio": "pb"}
mode = minimap_ref_mode[reference_mode] + "-" + minimap_reads_mode[platform]

_run_minimap(reference_file, reads_file, num_proc, mode,
out_alignment, sam_output)

#if sam_output:
# preprocess_sam(out_alignment, work_dir)
platform, read_type, out_alignment):
mode = platform + "-" + read_type
_run_minimap(reference_file, reads_file, num_proc, mode, out_alignment)


def get_contigs_info(contigs_file):
@@ -107,7 +96,8 @@ def get_uniform_alignments(alignments):
MIN_QV = 20

def is_reliable(aln):
return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV
return not aln.is_secondary and aln.map_qv >= MIN_QV
#return not aln.is_secondary and not aln.is_supplementary and aln.map_qv >= MIN_QV

seq_len = alignments[0].trg_len
ctg_id = alignments[0].trg_id
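The relaxed filter above now drops only secondary alignments, keeping supplementary (split-read) ones. A self-contained sketch of that behavior, using a made-up `Aln` record for illustration:

```python
from collections import namedtuple

# Illustrative stand-in for Flye's alignment record.
Aln = namedtuple("Aln", "is_secondary is_supplementary map_qv")

def is_reliable(aln, min_qv=20):
    # Supplementary alignments are kept (only secondary ones are rejected),
    # alongside the existing mapping-quality threshold.
    return not aln.is_secondary and aln.map_qv >= min_qv
```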
@@ -234,18 +224,30 @@ def name_split(h):
return out_dict


def _run_minimap(reference_file, reads_files, num_proc, mode, out_file,
sam_output):
def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file):
#SAM_HEADER = "\'@PG|@HD|@SQ|@RG|@CO\'"
work_dir = os.path.dirname(out_file)
stderr_file = os.path.join(work_dir, "minimap.stderr")
SORT_THREADS = "4"
SORT_MEM = "4G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"
BATCH = "5G" if os.path.getsize(reference_file) > 100 * 1024 * 1024 else "1G"

mode = None
extra_args = []
if reads_type in ["pacbio-raw", "pacbio-corrected", "pacbio-subasm"]:
mode = "map-pb"
if reads_type == "pacbio-hifi":
mode = "map-hifi"
elif reads_type in ["nano-raw", "nano-corrected"]:
mode = "map-ont"
elif reads_type == "nano-nano_hq":
mode = "map-ont"
extra_args = ["-k", "17"]

cmdline = [MINIMAP_BIN, "'" + reference_file + "'"]
cmdline.extend(["'" + read_file + "'" for read_file in reads_files])
cmdline.extend(["-x", mode, "-t", str(num_proc)])
cmdline.extend(extra_args)

#Produces gzipped SAM sorted by reference name. Since it's not sorted by
#read name anymore, it's important that all reads have SEQ.
@@ -256,22 +258,14 @@ def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file):
#--secondary-seq = custom option to output SEQ for secondary alignment with hard clipping
#-L: move CIGAR strings for ultra-long reads to the separate tag
#-Q don't output fastq quality
if sam_output:
tmp_prefix = os.path.join(os.path.dirname(out_file),
"sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
"-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
"-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
cmdline.extend(["-o", "'" + out_file + "'"])
else:
pass #paf output enabled by default

#cmdline.extend(["|", "grep", "-Ev", SAM_HEADER]) #removes headers
#cmdline.extend(["|", "sort", "-k", "3,3", "-T", work_dir,
# "--parallel=8", "-S", "4G"])
#cmdline.extend(["|", "gzip", "-1"])
tmp_prefix = os.path.join(os.path.dirname(out_file),
"sort_" + datetime.datetime.now().strftime("%y%m%d_%H%M%S"))
cmdline.extend(["-a", "-p", "0.5", "-N", "10", "--sam-hit-only", "-L", "-K", BATCH,
"-z", "1000", "-Q", "--secondary-seq", "-I", "64G"])
cmdline.extend(["|", SAMTOOLS_BIN, "view", "-T", "'" + reference_file + "'", "-u", "-"])
cmdline.extend(["|", SAMTOOLS_BIN, "sort", "-T", "'" + tmp_prefix + "'", "-O", "bam",
"-@", SORT_THREADS, "-l", "1", "-m", SORT_MEM])
cmdline.extend(["-o", "'" + out_file + "'"])

#logger.debug("Running: " + " ".join(cmdline))
try:
@@ -280,11 +274,11 @@ def _run_minimap(reference_file, reads_files, num_proc, reads_type, out_file):
"set -eo pipefail; " + " ".join(cmdline)],
stderr=open(stderr_file, "w"),
stdout=open(os.devnull, "w"))
if sam_output:
subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
subprocess.check_call(SAMTOOLS_BIN + " index -@ 4 " + "'" + out_file + "'", shell=True)
#os.remove(stderr_file)

except (subprocess.CalledProcessError, OSError) as e:
logger.error("Error running minimap2, terminating. See the alignment error log "
" for details: " + stderr_file)
logger.error("Cmd: " + " ".join(cmdline))
raise AlignmentException(str(e))
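The new preset dispatch in `_run_minimap` boils down to a pure mapping from Flye read type to a minimap2 `-x` preset plus extra flags. A behavior-equivalent sketch; `minimap2_preset` is not a real Flye helper:

```python
def minimap2_preset(read_type):
    """Map a Flye read type to a (minimap2 preset, extra args) pair,
    mirroring the dispatch added to _run_minimap above."""
    if read_type in ("pacbio-raw", "pacbio-corrected", "pacbio-subasm"):
        return "map-pb", []
    if read_type == "pacbio-hifi":
        return "map-hifi", []
    if read_type in ("nano-raw", "nano-corrected"):
        return "map-ont", []
    if read_type == "nano-nano_hq":
        # High-quality ONT reads use the ONT preset with a longer k-mer.
        return "map-ont", ["-k", "17"]
    raise ValueError("unknown read type: " + read_type)
```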
2 changes: 1 addition & 1 deletion Additional_src/Flye/flye/polishing/bubbles.py
@@ -328,7 +328,7 @@ def _compute_profile(alignment, ref_sequence):
genome_len = alignment[0].trg_len

#max_aln_err = cfg.vals["err_modes"][platform]["max_aln_error"]
min_aln_len = cfg.vals["min_polish_aln_len"]
min_aln_len = min(cfg.vals["min_polish_aln_len"], genome_len // 2)
aln_errors = []
#filtered = 0
profile = [ProfileInfo() for _ in range(genome_len)]
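The `min()` guard added above matters for short references. A minimal sketch of the rule (the function name `min_polish_aln_len` is illustrative):

```python
def min_polish_aln_len(configured_min, target_len):
    # Cap the minimum alignment length at half the target length: without
    # this, a reference shorter than twice the configured minimum (e.g. a
    # small plasmid) would have every alignment filtered out and never
    # get polished.
    return min(configured_min, target_len // 2)
```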
