diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GLNexusHandler.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GLNexusHandler.java index 96a600b83..0864c7cfd 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GLNexusHandler.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/analysis/GLNexusHandler.java @@ -1,5 +1,9 @@ package org.labkey.sequenceanalysis.analysis; +import com.google.common.io.Files; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.Logger; @@ -12,6 +16,7 @@ import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler; import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.PipelineOutputTracker; +import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport; import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService; @@ -159,9 +164,46 @@ else if (genomeIds.isEmpty()) throw new PipelineJobException("Missing configType"); } - File outputVcf = new File(ctx.getOutputDir(), basename + ".vcf.gz"); + // NOTE: due to strange bad_alloc errors, iterate the genome contig-by-contig for now: + List vcfs = new ArrayList<>(); + ReferenceGenome rg = ctx.getSequenceSupport().getCachedGenome(genomeId); + SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(rg.getSequenceDictionary().toPath()); + for (SAMSequenceRecord r : dict.getSequences()) + { + File contigVcf = new File(ctx.getOutputDir(), basename + "." + r.getSequenceName() + ".vcf.gz"); + File contigVcfIdx = new File(contigVcf.getPath() + ".tbi"); + File doneFile = new File(contigVcf.getPath() + ".done"); + ctx.getFileManager().addIntermediateFile(contigVcf); + ctx.getFileManager().addIntermediateFile(contigVcfIdx); + ctx.getFileManager().addIntermediateFile(doneFile); + + if (doneFile.exists()) + { + if (!contigVcfIdx.exists()) + { + throw new PipelineJobException("Missing index: " + contigVcf.getPath()); + } + + vcfs.add(contigVcf); + } + else + { + ctx.getLogger().debug("Running GLNexus for contig: " + r.getSequenceName()); + new GLNexusWrapper(ctx.getLogger()).execute(inputVcfs, contigVcf, ctx.getFileManager(), binVersion, configType, r); + vcfs.add(contigVcf); + try + { + Files.touch(doneFile); + } + catch (IOException e) + { + throw new PipelineJobException(e); + } + } + } - new GLNexusWrapper(ctx.getLogger()).execute(inputVcfs, outputVcf, ctx.getFileManager(), binVersion, configType); + File outputVcf = new File(ctx.getOutputDir(), basename + ".vcf.gz"); + SequenceUtil.combineVcfs(vcfs, rg, outputVcf, ctx.getLogger(), true, null, true); ctx.getLogger().debug("adding sequence output: " + outputVcf.getPath()); SequenceOutputFile so1 = new SequenceOutputFile(); @@ -217,7 +259,7 @@ private File ensureLocalCopy(File input, File workingDirectory, PipelineOutputTr } } - public void execute(List inputGvcfs, File outputVcf, PipelineOutputTracker tracker, String binVersion, String configType) throws PipelineJobException + public void execute(List inputGvcfs, File outputVcf, PipelineOutputTracker tracker, String binVersion, String configType, SAMSequenceRecord rec) throws PipelineJobException { File workDir = outputVcf.getParentFile(); tracker.addIntermediateFile(outputVcf); @@ -233,7 +275,10 @@ public void execute(List inputGvcfs, File outputVcf, PipelineOutputTracker File localBashScript = new File(workDir, "docker.sh"); tracker.addIntermediateFile(localBashScript); - try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript)) + File bed = new File(workDir, "contig.bed"); + tracker.addIntermediateFile(bed); + + try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript);PrintWriter bedWriter = PrintWriters.getPrintWriter(bed)) { writer.println("#!/bin/bash"); writer.println("set -x"); @@ -260,7 +305,7 @@ public void execute(List inputGvcfs, File outputVcf, PipelineOutputTracker writer.println("\tghcr.io/dnanexus-rnd/glnexus:" + binVersion + " \\"); writer.println("\tglnexus_cli \\"); writer.println("\t--config " + configType + " \\"); - + writer.println("\t--bed /work/" + bed.getName() + " \\"); writer.println("\t--trim-uncalled-alleles \\"); if (maxRam != null) @@ -290,6 +335,9 @@ public void execute(List inputGvcfs, File outputVcf, PipelineOutputTracker getLogger().debug("Deleting pre-existing GLnexus.DB dir"); FileUtils.deleteDirectory(dbDir); } + + // Create a single-contig BED file: + bedWriter.println(rec.getSequenceName() + "\t0\t" + rec.getSequenceLength()); } catch (IOException e) {