Skip to content

Commit

Permalink
Switch GLNexus to execute per-contig
Browse files Browse the repository at this point in the history
  • Loading branch information
bbimber committed Feb 6, 2024
1 parent 6e2601d commit 80aae9e
Showing 1 changed file with 53 additions and 5 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package org.labkey.sequenceanalysis.analysis;

import com.google.common.io.Files;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Logger;
Expand All @@ -12,6 +16,7 @@
import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner;
import org.labkey.api.sequenceanalysis.pipeline.PipelineOutputTracker;
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
Expand Down Expand Up @@ -159,9 +164,46 @@ else if (genomeIds.isEmpty())
throw new PipelineJobException("Missing configType");
}

File outputVcf = new File(ctx.getOutputDir(), basename + ".vcf.gz");
// NOTE: due to strange bad_alloc errors, iterate the genome contig-by-contig for now:
List<File> vcfs = new ArrayList<>();
ReferenceGenome rg = ctx.getSequenceSupport().getCachedGenome(genomeId);
SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(rg.getSequenceDictionary().toPath());
for (SAMSequenceRecord r : dict.getSequences())
{
File contigVcf = new File(ctx.getOutputDir(), basename + "." + r.getSequenceName() + ".vcf.gz");
File contigVcfIdx = new File(contigVcf.getPath() + ".tbi");
File doneFile = new File(contigVcf.getPath() + ".done");
ctx.getFileManager().addIntermediateFile(contigVcf);
ctx.getFileManager().addIntermediateFile(contigVcfIdx);
ctx.getFileManager().addIntermediateFile(doneFile);

if (doneFile.exists())
{
if (!contigVcfIdx.exists())
{
throw new PipelineJobException("Missing index: " + contigVcf.getPath());
}

vcfs.add(contigVcf);
}
else
{
ctx.getLogger().debug("Running GLNexus for contig: " + r.getSequenceName());
new GLNexusWrapper(ctx.getLogger()).execute(inputVcfs, contigVcf, ctx.getFileManager(), binVersion, configType, r);
vcfs.add(contigVcf);
try
{
Files.touch(doneFile);
}
catch (IOException e)
{
throw new PipelineJobException(e);
}
}
}

new GLNexusWrapper(ctx.getLogger()).execute(inputVcfs, outputVcf, ctx.getFileManager(), binVersion, configType);
File outputVcf = new File(ctx.getOutputDir(), basename + ".vcf.gz");
SequenceUtil.combineVcfs(vcfs, rg, outputVcf, ctx.getLogger(), true, null, true);

ctx.getLogger().debug("adding sequence output: " + outputVcf.getPath());
SequenceOutputFile so1 = new SequenceOutputFile();
Expand Down Expand Up @@ -217,7 +259,7 @@ private File ensureLocalCopy(File input, File workingDirectory, PipelineOutputTr
}
}

public void execute(List<File> inputGvcfs, File outputVcf, PipelineOutputTracker tracker, String binVersion, String configType) throws PipelineJobException
public void execute(List<File> inputGvcfs, File outputVcf, PipelineOutputTracker tracker, String binVersion, String configType, SAMSequenceRecord rec) throws PipelineJobException
{
File workDir = outputVcf.getParentFile();
tracker.addIntermediateFile(outputVcf);
Expand All @@ -233,7 +275,10 @@ public void execute(List<File> inputGvcfs, File outputVcf, PipelineOutputTracker
File localBashScript = new File(workDir, "docker.sh");
tracker.addIntermediateFile(localBashScript);

try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript))
File bed = new File(workDir, "contig.bed");
tracker.addIntermediateFile(bed);

try (PrintWriter writer = PrintWriters.getPrintWriter(localBashScript);PrintWriter bedWriter = PrintWriters.getPrintWriter(bed))
{
writer.println("#!/bin/bash");
writer.println("set -x");
Expand All @@ -260,7 +305,7 @@ public void execute(List<File> inputGvcfs, File outputVcf, PipelineOutputTracker
writer.println("\tghcr.io/dnanexus-rnd/glnexus:" + binVersion + " \\");
writer.println("\tglnexus_cli \\");
writer.println("\t--config " + configType + " \\");

writer.println("\t--bed /work/" + bed.getName() + " \\");
writer.println("\t--trim-uncalled-alleles \\");

if (maxRam != null)
Expand Down Expand Up @@ -290,6 +335,9 @@ public void execute(List<File> inputGvcfs, File outputVcf, PipelineOutputTracker
getLogger().debug("Deleting pre-existing GLnexus.DB dir");
FileUtils.deleteDirectory(dbDir);
}

// Create a single-contig BED file:
bedWriter.println(rec.getSequenceName() + "\t0\t" + rec.getSequenceLength());
}
catch (IOException e)
{
Expand Down

0 comments on commit 80aae9e

Please sign in to comment.