diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java index e8ca43357..34887620a 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/VariantProcessingStep.java @@ -45,10 +45,22 @@ default void complete(PipelineJob job, List inputs, List> establishIntervals() { LinkedHashMap> ret; @@ -137,7 +147,7 @@ else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.chunk getLogger().info("Creating jobs with target bp size: " + basesPerJob + " mbp. allow splitting configs: " + allowSplitChromosomes + ", max contigs per job: " + maxContigsPerJob); basesPerJob = basesPerJob * 1000000; - ret = ScatterGatherUtils.divideGenome(dict, basesPerJob, allowSplitChromosomes, maxContigsPerJob); + ret = ScatterGatherUtils.divideGenome(dict, basesPerJob, allowSplitChromosomes, maxContigsPerJob, scatterMethodRequiresSort()); } else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.fixedJobs) @@ -146,7 +156,7 @@ else if (_scatterGatherMethod == VariantProcessingStep.ScatterGatherMethod.fixed int numJobs = getParameterJson().getInt("scatterGather.totalJobs"); int jobSize = (int)Math.ceil(totalSize / (double)numJobs); getLogger().info("Creating " + numJobs + " jobs with approximate size: " + jobSize + " bp."); - ret = ScatterGatherUtils.divideGenome(dict, jobSize, true, -1); + ret = ScatterGatherUtils.divideGenome(dict, jobSize, true, -1, false); } else { diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java index f962a73cc..5eeb8e50e 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/pipeline/VariantProcessingRemoteMergeTask.java @@ -222,7 +222,7 @@ else if (!vcf.exists()) throw new PipelineJobException("Missing one of more VCFs: " + missing.stream().map(File::getPath).collect(Collectors.joining(","))); } - boolean sortAfterMerge = handler instanceof VariantProcessingStep.SupportsScatterGather && ((VariantProcessingStep.SupportsScatterGather) handler).doSortAfterMerge(); + boolean sortAfterMerge = getPipelineJob().scatterMethodRequiresSort() || handler instanceof VariantProcessingStep.SupportsScatterGather && ((VariantProcessingStep.SupportsScatterGather) handler).doSortAfterMerge(); combined = SequenceAnalysisService.get().combineVcfs(toConcat, combined, genome, getJob().getLogger(), true, null, sortAfterMerge); } manager.addOutput(action, "Merged VCF", combined); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java index 4affbe7e2..8e69a3553 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/util/ScatterGatherUtils.java @@ -7,7 +7,6 @@ import org.junit.Test; import java.util.ArrayList; -import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.LinkedHashMap; @@ -113,13 +112,17 @@ private void addInterval(String refName, int start, int end) } } - public static LinkedHashMap> divideGenome(SAMSequenceDictionary dict, int optimalBasesPerJob, boolean allowSplitChromosomes, int maxContigsPerJob) + public static LinkedHashMap> divideGenome(SAMSequenceDictionary dict, int optimalBasesPerJob, boolean allowSplitChromosomes, int maxContigsPerJob, boolean sortOnContigSize) { ActiveIntervalSet ais = new ActiveIntervalSet(optimalBasesPerJob, allowSplitChromosomes, maxContigsPerJob); // Sort the sequences in descending length, rather than alphabetic on name: List sortedSeqs = new ArrayList<>(dict.getSequences()); - sortedSeqs.sort(Comparator.comparingInt(SAMSequenceRecord::getSequenceLength).reversed()); + if (sortOnContigSize) + { + sortedSeqs.sort(Comparator.comparingInt(SAMSequenceRecord::getSequenceLength).reversed()); + } + for (SAMSequenceRecord rec : sortedSeqs) { ais.add(rec); @@ -152,13 +155,13 @@ private SAMSequenceDictionary getDict() public void testScatter() { SAMSequenceDictionary dict = getDict(); - Map> ret = divideGenome(dict, 1000, true, -1); + Map> ret = divideGenome(dict, 1000, true, -1, true); assertEquals("Incorrect number of jobs", 8, ret.size()); assertEquals("Incorrect interval end", 1000, ret.get("Job3").get(0).getEnd()); assertEquals("Incorrect start", 1, ret.get("Job3").get(0).getStart()); assertEquals("Incorrect interval end", 4, ret.get("Job8").size()); - Map> ret2 = divideGenome(dict, 3000, false, -1); + Map> ret2 = divideGenome(dict, 3000, false, -1, true); assertEquals("Incorrect number of jobs", 3, ret2.size()); for (String jobName : ret2.keySet()) { @@ -168,7 +171,7 @@ public void testScatter() } } - Map> ret3 = divideGenome(dict, 3002, false, -1); + Map> ret3 = divideGenome(dict, 3002, false, -1, true); assertEquals("Incorrect number of jobs", 3, ret3.size()); for (String jobName : ret3.keySet()) { @@ -178,7 +181,7 @@ public void testScatter() } } - Map> ret4 = divideGenome(dict, 2999, false, -1); + Map> ret4 = divideGenome(dict, 2999, false, -1, true); assertEquals("Incorrect number of jobs", 3, ret4.size()); for (String jobName : ret4.keySet()) { @@ -188,7 +191,7 @@ public void testScatter() } } - Map> ret5 = divideGenome(dict, 750, true, -1); + Map> ret5 = divideGenome(dict, 750, true, -1, true); assertEquals("Incorrect number of jobs", 9, ret5.size()); assertEquals("Incorrect interval end", 750, ret5.get("Job1").get(0).getEnd()); assertEquals("Incorrect interval end", 4, ret5.get("Job9").size()); @@ -196,7 +199,7 @@ public void testScatter() assertEquals("Incorrect interval start", 1501, ret5.get("Job3").get(0).getStart()); assertEquals("Incorrect interval start", 1, ret5.get("Job8").get(0).getStart()); - Map> ret6 = divideGenome(dict, 5000, false, 2); + Map> ret6 = divideGenome(dict, 5000, false, 2, true); assertEquals("Incorrect number of jobs", 5, ret6.size()); } }