Skip to content

Commit

Permalink
Merge pull request #256 from LabKey/fb_merge_23.11_to_develop
Browse files Browse the repository at this point in the history
Merge discvr-23.11 to develop
  • Loading branch information
bbimber authored Dec 21, 2023
2 parents da147a9 + 3db76df commit c95ba94
Show file tree
Hide file tree
Showing 15 changed files with 466 additions and 220 deletions.
1 change: 1 addition & 0 deletions QueryExtensions/module.properties
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ModuleClass: org.labkey.queryextensions.QueryExtensionsModule
ManageVersion: false
Label: Query Extensions
Description: This module contains low-level extensions to the LabKey Query layer
License: Apache 2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ Ext4.define('SequenceAnalysis.panel.VariantProcessingPanel', {
description: 'A BED or similar file with intervals to skip',
defaultValue: null
},{
fieldXtype: 'ldk-expdatafield',
fieldXtype: 'checkbox',
name: 'skipExcessHetAndInbreeding',
label: 'Skip Excess Het And Inbreeding',
description: 'If checked, the ExcessHet and InbreedingCoeff annotations will be skipped, which can be important when using force-output-intervals',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,7 @@ else if (!d.getFile().exists())
{
if (d != null && d.getFile() != null && d.getFile().exists())
{
// NOTE: ultimately remove this:
log.info("ReadData marked as archived, but file exists: " + rd.getRowid() + ", " + rd.getFileId1() + ", " + d.getFile().getPath() + " for container: " + (c == null ? rd.getContainer() : c.getPath()));
log.error("ReadData marked as archived, but file exists: " + rd.getRowid() + ", " + rd.getFileId1() + ", " + d.getFile().getPath() + " for container: " + (c == null ? rd.getContainer() : c.getPath()));
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
import org.labkey.sequenceanalysis.run.alignment.StarWrapper;
import org.labkey.sequenceanalysis.run.alignment.VulcanWrapper;
import org.labkey.sequenceanalysis.run.analysis.BamIterator;
import org.labkey.sequenceanalysis.run.analysis.BcftoolsFillTagsStep;
import org.labkey.sequenceanalysis.run.analysis.ExportOverlappingReadsAnalysis;
import org.labkey.sequenceanalysis.run.analysis.GenrichStep;
import org.labkey.sequenceanalysis.run.analysis.HaplotypeCallerAnalysis;
Expand Down Expand Up @@ -312,6 +313,7 @@ public static void registerPipelineSteps()
SequencePipelineService.get().registerPipelineStep(new KingInferenceStep.Provider());
SequencePipelineService.get().registerPipelineStep(new MendelianViolationReportStep.Provider());
SequencePipelineService.get().registerPipelineStep(new SummarizeGenotypeQualityStep.Provider());
SequencePipelineService.get().registerPipelineStep(new BcftoolsFillTagsStep.Provider());

//handlers
SequenceAnalysisService.get().registerFileHandler(new LiftoverHandler());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package org.labkey.sequenceanalysis.run.analysis;

import htsjdk.samtools.util.Interval;
import org.apache.commons.lang3.StringUtils;
import org.json.JSONObject;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner;
import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
import org.labkey.sequenceanalysis.pipeline.SequenceTaskHelper;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Variant processing step that runs the bcftools "fill-tags" plugin to (re)compute
 * per-site INFO annotations on a VCF. The UI exposes two tags: HWE and ExcHet.
 * Output is a bgzipped VCF (compression level 9) plus a tabix index.
 */
public class BcftoolsFillTagsStep extends AbstractCommandPipelineStep<BcftoolsRunner> implements VariantProcessingStep
{
    public BcftoolsFillTagsStep(PipelineStepProvider<?> provider, PipelineContext ctx)
    {
        super(provider, ctx, new BcftoolsRunner(ctx.getLogger()));
    }

    public static class Provider extends AbstractVariantProcessingStepProvider<BcftoolsFillTagsStep> implements VariantProcessingStep.RequiresPedigree, VariantProcessingStep.SupportsScatterGather
    {
        public Provider()
        {
            super("BcftoolsFillTagsStep", "Bcftools Fill-tags", "bcftools", "Annotate variants using bcftools fill-tags", Arrays.asList(
                    ToolParameterDescriptor.create("hwe", "HWE", "If selected, HWE will be annotated", "checkbox", new JSONObject(){{
                        put("checked", true);
                    }}, true),
                    ToolParameterDescriptor.create("exchet", "Excess Het", "If selected, ExcHet will be annotated.", "checkbox", new JSONObject(){{
                        put("checked", true);
                    }}, true)
            ), null, "");
        }

        @Override
        public BcftoolsFillTagsStep create(PipelineContext ctx)
        {
            return new BcftoolsFillTagsStep(this, ctx);
        }
    }

    /**
     * Runs "bcftools +fill-tags" over the input VCF, restricted to the supplied intervals
     * when scatter/gather is in use, and returns the annotated, bgzipped output.
     *
     * @param inputVCF the VCF to annotate
     * @param outputDirectory directory in which the .ft.vcf.gz output is created
     * @param genome the reference genome (unused by fill-tags itself, but part of the step contract)
     * @param intervals optional genomic intervals; translated to a --regions argument when present
     * @throws PipelineJobException if no annotations were selected, the tool fails, or indexing fails
     */
    @Override
    public VariantProcessingStep.Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List<Interval> intervals) throws PipelineJobException
    {
        VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();

        List<String> options = new ArrayList<>();
        options.add(getWrapper().getBcfToolsPath().getPath());
        options.add("+fill-tags");

        options.add(inputVCF.getPath());

        if (intervals != null)
        {
            options.add("--regions");
            options.add(intervals.stream().map(interval -> interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd()).collect(Collectors.joining(",")));
        }

        // bgzipped output at maximum compression
        options.add("-O");
        options.add("z9");

        Integer threads = SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger());
        if (threads != null)
        {
            options.add("--threads");
            options.add(threads.toString());
        }

        File outputVcf = new File(outputDirectory, SequenceTaskHelper.getUnzippedBaseName(inputVCF) + ".ft.vcf.gz");
        options.add("-o");
        options.add(outputVcf.getPath());

        List<String> annotations = new ArrayList<>();
        if (getProvider().getParameterByName("hwe").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false))
        {
            annotations.add("hwe");
        }

        if (getProvider().getParameterByName("exchet").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false))
        {
            annotations.add("exchet");
        }

        if (annotations.isEmpty())
        {
            throw new PipelineJobException("No annotations were selected");
        }

        // bcftools requires plugin-specific options to appear after a "--" separator.
        // Without it, "-t" is consumed by the top-level parser as --targets and the
        // command fails with a region-parse error instead of selecting fill-tags tags.
        options.add("--");
        options.add("-t");
        // NOTE(review): fill-tags documents these tags as HWE/ExcHet — confirm the
        // lowercase names are accepted by the installed bcftools version.
        options.add(StringUtils.join(annotations, ","));

        BcftoolsRunner wrapper = getWrapper();

        // Propagate BCFTOOLS_PLUGINS so bcftools can locate the fill-tags plugin on remote nodes
        String bcfPluginDir = StringUtils.trimToNull(System.getenv("BCFTOOLS_PLUGINS"));
        if (bcfPluginDir != null)
        {
            getPipelineCtx().getLogger().debug("Setting BCFTOOLS_PLUGINS environment variable: " + bcfPluginDir);
            wrapper.addToEnvironment("BCFTOOLS_PLUGINS", bcfPluginDir);
        }

        wrapper.execute(options);
        if (!outputVcf.exists())
        {
            throw new PipelineJobException("output not found: " + outputVcf);
        }

        try
        {
            SequenceAnalysisService.get().ensureVcfIndex(outputVcf, getWrapper().getLogger());
        }
        catch (IOException e)
        {
            throw new PipelineJobException(e);
        }

        output.setVcf(outputVcf);

        return output;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,22 @@ public Provider()
put("width", 400);
put("allowBlank", true);
}}, null),
ToolParameterDescriptor.create("generateConsensus", "Generate Consensus", "If selected, a FASTA with the simple majority consensus will be generated.", "checkbox", new JSONObject()
{{

}}, false),
ToolParameterDescriptor.create("minCoverage", "Min Coverage For Consensus", "If provided, a consensus will only be called over regions with at least this depth", "ldk-integerfield", new JSONObject(){{
put("minValue", 0);
}}, 25),
ToolParameterDescriptor.create("generateTable", "Generate Variant Table", "If selected, a TSV listing variants above the given threshold will be generated.", "checkbox", new JSONObject()
{{

}}, false),
ToolParameterDescriptor.create("minFractionForTable", "Min Fraction for Table", "If the option to generate a table output is used, only variants with frequency of this threshold will be included", "ldk-numberfield", new JSONObject(){{
put("minValue", 0);
put("maxValue", 1);
put("decimalPrecision", 3);
}}, 0.01),
ToolParameterDescriptor.createExpDataParam("primerBedFile", "Primer Sites (BED File)", "This is a BED file specifying the primer binding sites, which will be used to flag variants. Strandedness is ignored.", "ldk-expdatafield", new JSONObject()
{{
put("allowBlank", true);
Expand Down Expand Up @@ -470,6 +483,10 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
}

double minFractionForConsensus = getProvider().getParameterByName("minFractionForConsensus").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class, 0.0);
boolean generateConsensus = getProvider().getParameterByName("generateConsensus").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);

boolean generateTable = getProvider().getParameterByName("generateTable").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);
double minFractionForTable = getProvider().getParameterByName("minFractionForTable").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class, 0.0);

Integer primerDataId = getProvider().getParameterByName("primerBedFile").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class);
List<Interval> primerIntervals = new ArrayList<>();
Expand Down Expand Up @@ -517,11 +534,12 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc

File loFreqConsensusVcf = getConsensusVcf(outputDir, inputBam);
File loFreqAllVcf = getAllVcf(outputDir, inputBam);
File tableFile = getTableFile(outputDir, inputBam);
Double strandBiasRecoveryAF = getProvider().getParameterByName("strandBiasRecoveryAF").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Double.class, 1.0);
SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(referenceGenome.getSequenceDictionary().toPath());
VariantContextWriterBuilder writerBuilderConsensus = new VariantContextWriterBuilder().setOutputFile(loFreqConsensusVcf).setReferenceDictionary(dict);
VariantContextWriterBuilder writerBuilderAll = new VariantContextWriterBuilder().setOutputFile(loFreqAllVcf).setReferenceDictionary(dict);
try (VCFFileReader reader = new VCFFileReader(activeVCF);CloseableIterator<VariantContext> it = reader.iterator();VariantContextWriter writerConsensus = writerBuilderConsensus.build();VariantContextWriter writerAll = writerBuilderAll.build())
try (VCFFileReader reader = new VCFFileReader(activeVCF);CloseableIterator<VariantContext> it = reader.iterator();VariantContextWriter writerConsensus = writerBuilderConsensus.build();VariantContextWriter writerAll = writerBuilderAll.build();CSVWriter variantTableWriter = generateTable ? new CSVWriter(PrintWriters.getPrintWriter(tableFile), '\t', CSVWriter.NO_QUOTE_CHARACTER) : null)
{
VCFHeader header = reader.getFileHeader();

Expand All @@ -532,6 +550,11 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
writerConsensus.writeHeader(header);
writerAll.writeHeader(header);

if (generateTable)
{
variantTableWriter.writeNext(new String[]{"Contig", "Start", "End", "Reference", "Alt", "Frequency", "AlleleDepth", "TotalDepth"});
}

SortingCollection<VariantContext> allVariants = getVariantSorter(header);
SortingCollection<VariantContext> consensusVariants = getVariantSorter(header);
while (it.hasNext())
Expand Down Expand Up @@ -571,6 +594,16 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
}
}

if (generateTable)
{
if (vc.hasAttribute("AF") && vc.getAttributeAsDouble("AF", 0.0) > minFractionForTable)
{
List<Integer> depths = vc.getAttributeAsIntList("DP4", 0);
int alleleDepth = depths.get(2) + depths.get(3);
variantTableWriter.writeNext(new String[]{vc.getContig(), String.valueOf(vc.getStart()), String.valueOf(vc.getEnd()), vc.getReference().getBaseString(), vc.getAlternateAllele(0).getBaseString(), String.valueOf(vc.getAttributeAsDouble("AF", 0.0)), String.valueOf(alleleDepth), String.valueOf(vc.getAttributeAsInt("DP", 0))});
}
}

totalVariants++;
if (vc.hasAttribute("AF") && vc.getAttributeAsDouble("AF", 0.0) > 0.01)
{
Expand Down Expand Up @@ -672,6 +705,10 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
}
consensusVariants.cleanup();
}
catch (IOException e)
{
throw new PipelineJobException(e);
}

NumberFormat fmt = NumberFormat.getPercentInstance();
fmt.setMaximumFractionDigits(2);
Expand Down Expand Up @@ -720,12 +757,24 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
}

output.addSequenceOutput(coverageOut, "Depth of Coverage: " + rs.getName(), "Depth of Coverage", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
output.addSequenceOutput(consensusFastaLoFreq, "Consensus: " + rs.getName(), "Viral Consensus Sequence", rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
if (generateConsensus)
{
output.addSequenceOutput(consensusFastaLoFreq, "Consensus: " + rs.getName(), "Viral Consensus Sequence", rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
}

boolean runPangolinAndNextClade = getProvider().getParameterByName("runPangolin").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Boolean.class, false);

output.addSequenceOutput(loFreqAllVcf, "LoFreq: " + rs.getName(), CATEGORY, rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);

if (generateTable)
{
if (!tableFile.exists())
{

}
output.addSequenceOutput(tableFile, "LoFreq: " + rs.getName(), "LoFreq Variant Table", rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
}

Map<String, String> pangolinData = null;
if (runPangolinAndNextClade)
{
Expand Down Expand Up @@ -781,6 +830,11 @@ private File getAllVcf(File outputDir, File inputBam)
return new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.all.vcf.gz");
}

private File getTableFile(File outputDir, File inputBam)
{
return new File(outputDir, FileUtil.getBaseName(inputBam) + ".lofreq.txt");
}

private Set<String> runBcftools(File inputBam, ReferenceGenome referenceGenome, File mask, int minCoverage) throws PipelineJobException
{
Set<String> variantsBcftools = new HashSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
*/
public class VariantAnnotatorStep extends AbstractCommandPipelineStep<VariantAnnotatorWrapper> implements VariantProcessingStep
{
public VariantAnnotatorStep(PipelineStepProvider provider, PipelineContext ctx)
public VariantAnnotatorStep(PipelineStepProvider<?> provider, PipelineContext ctx)
{
super(provider, ctx, new VariantAnnotatorWrapper(ctx.getLogger()));
}
Expand Down
1 change: 1 addition & 0 deletions Studies/module.properties
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ Label: Studies
Description: Extensions to the study framework, designed to more flexibly manage multiple study designs from one source of data
License: Apache 2.0
LicenseURL: http://www.apache.org/licenses/LICENSE-2.0
ManageVersion: false
Loading

0 comments on commit c95ba94

Please sign in to comment.