From cc2737526c8c5e45d24b9d68ac9880ac16b94880 Mon Sep 17 00:00:00 2001 From: bbimber Date: Mon, 24 Feb 2025 06:09:05 -0800 Subject: [PATCH] Improve KING/plink2 implementation --- .../pipeline/BcftoolsRunner.java | 33 ++++ .../SequenceAnalysisServiceImpl.java | 13 +- .../run/variant/KingInferenceStep.java | 172 ++++-------------- 3 files changed, 82 insertions(+), 136 deletions(-) diff --git a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/BcftoolsRunner.java b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/BcftoolsRunner.java index 0767a7471..f608e17de 100644 --- a/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/BcftoolsRunner.java +++ b/SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/BcftoolsRunner.java @@ -2,9 +2,13 @@ import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.Nullable; +import org.labkey.api.pipeline.PipelineJobException; +import org.labkey.api.pipeline.PipelineJobService; import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; import java.io.File; +import java.util.ArrayList; +import java.util.List; /** * User: bimber @@ -22,4 +26,33 @@ public static File getBcfToolsPath() { return SequencePipelineService.get().getExeForPackage("BCFTOOLSPATH", "bcftools"); } + + public static boolean isBcftoolsFound() + { + return BcftoolsRunner.resolveFileInPath("bcftools", null, false) != null; + } + + public void doIndex(File vcf) throws PipelineJobException + { + List args = new ArrayList<>(); + args.add(getBcfToolsPath().getAbsolutePath()); + args.add("index"); + args.add("-t"); + args.add("-f"); + args.add("-n"); + + if (!PipelineJobService.get().isWebServer()) + { + Integer threads = SequencePipelineService.get().getMaxThreads(getLogger()); + if (threads != null) + { + args.add("--threads"); + args.add(String.valueOf(threads)); + } + } + + args.add(vcf.getAbsolutePath()); + + execute(args); + } } diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisServiceImpl.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisServiceImpl.java index fb577f4ec..9a3305cbd 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisServiceImpl.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisServiceImpl.java @@ -36,6 +36,7 @@ import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import org.labkey.api.sequenceanalysis.SequenceDataProvider; import org.labkey.api.sequenceanalysis.model.Readset; +import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner; import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome; import org.labkey.api.sequenceanalysis.pipeline.SamtoolsCramConverter; import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler; @@ -267,8 +268,16 @@ public File ensureVcfIndex(File vcf, Logger log, boolean forceRecreate) throws I //note: there is a bug in htsjdk's index creation with gz inputs if (gz.isType(vcf) && !SystemUtils.IS_OS_WINDOWS) { - TabixRunner r = new TabixRunner(log); - r.execute(vcf); + // preferentially use bcftools since it supports multithreading: + if (BcftoolsRunner.isBcftoolsFound()) + { + new BcftoolsRunner(log).doIndex(vcf); + } + else + { + new TabixRunner(log).execute(vcf); + } + if (!expectedIdx.exists()) { throw new PipelineJobException("Expected index was not created: " + expectedIdx.getPath()); diff --git a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/KingInferenceStep.java b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/KingInferenceStep.java index 4a6c124ae..a1b3b9af5 100644 --- a/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/KingInferenceStep.java +++ b/SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/KingInferenceStep.java @@ -1,5 +1,6 @@ package org.labkey.sequenceanalysis.run.variant; +import com.google.common.io.Files; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.Interval; @@ -10,10 +11,7 @@ import org.apache.logging.log4j.Logger; import org.jetbrains.annotations.Nullable; import org.json.JSONObject; -import org.labkey.api.collections.CaseInsensitiveHashMap; import org.labkey.api.pipeline.PipelineJobException; -import org.labkey.api.reader.Readers; -import org.labkey.api.sequenceanalysis.SequenceAnalysisService; import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider; import org.labkey.api.sequenceanalysis.pipeline.PedigreeToolParameterDescriptor; import org.labkey.api.sequenceanalysis.pipeline.PipelineContext; @@ -26,18 +24,11 @@ import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep; import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; import org.labkey.api.util.Compress; -import org.labkey.api.writer.PrintWriters; -import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler; -import java.io.BufferedReader; import java.io.File; import java.io.IOException; -import java.io.PrintWriter; import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; import java.util.List; -import java.util.Map; public class KingInferenceStep extends AbstractCommandPipelineStep implements VariantProcessingStep { @@ -50,7 +41,7 @@ public static class Provider extends AbstractVariantProcessingStepProvider kingArgs = new ArrayList<>(); - kingArgs.add(wrapper.getExe().getPath()); - - kingArgs.add("-b"); - kingArgs.add(plinkOutBed.getPath()); - - kingArgs.add("--prefix"); - kingArgs.add(SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName())); - - // Update the pedigree / fam file: - String demographicsProviderName = getProvider().getParameterByName(PedigreeToolParameterDescriptor.NAME).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx()); - if (demographicsProviderName != null) - { - File pedFile = ProcessVariantsHandler.getPedigreeFile(getPipelineCtx().getSourceDirectory(true), demographicsProviderName); - if (!pedFile.exists()) - { - throw new PipelineJobException("Unable to find pedigree file: " + pedFile.getPath()); - } - - File kingFam = createFamFile(pedFile, new File(plinkOutBed.getParentFile(), "plink.fam")); - kingArgs.add("--fam"); - kingArgs.add(kingFam.getPath()); - - output.addIntermediateFile(kingFam); - } - - if (threads != null) - { - kingArgs.add("--cpus"); - kingArgs.add(threads.toString()); - } - - kingArgs.add("--kinship"); - kingArgs.add("--rplot"); - - File kinshipOutput = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".kin"); - wrapper.execute(kingArgs); - if (!kinshipOutput.exists()) - { - throw new PipelineJobException("Unable to find file: " + kinshipOutput.getPath()); - } - - File kinshipOutputTxt = new File(kinshipOutput.getPath() + ".txt.gz"); - if (kinshipOutputTxt.exists()) - { - kinshipOutputTxt.delete(); - } - - lineCount = SequencePipelineService.get().getLineCount(kinshipOutput)-1; - try - { - Compress.compressGzip(kinshipOutput, kinshipOutputTxt); - FileUtils.delete(kinshipOutput); - } - catch (IOException e) - { - throw new PipelineJobException(e); - } - - output.addSequenceOutput(kinshipOutputTxt, "King Relatedness: " + inputVCF.getName(), "KING Relatedness", null, null, genome.getGenomeId(), "Total lines: " + lineCount); + output.addSequenceOutput(plinkOutKingFileTxt, "PLINK2/KING Relatedness: " + inputVCF.getName(), "PLINK2/KING Kinship", null, null, genome.getGenomeId(), "Total lines: " + lineCount); return output; } - private File createFamFile(File pedFile, File famFile) throws PipelineJobException - { - File newFamFile = new File(famFile.getParentFile(), "king.fam"); - - Map pedMap = new CaseInsensitiveHashMap<>(); - try (BufferedReader reader = Readers.getReader(pedFile)) - { - String line; - while ((line = reader.readLine()) != null) - { - String[] tokens = line.split(" "); - if (tokens.length != 6) - { - throw new PipelineJobException("Improper ped line length: " + tokens.length); - } - - pedMap.put(tokens[1], StringUtils.join(Arrays.asList("0", tokens[1], tokens[2], tokens[3], tokens[4], "-9"), "\t")); - } - } - catch (IOException e) - { - throw new PipelineJobException(e); - } - - try (BufferedReader reader = Readers.getReader(famFile);PrintWriter writer = PrintWriters.getPrintWriter(newFamFile)) - { - String line; - while ((line = reader.readLine()) != null) - { - String[] tokens = line.split("\t"); - if (tokens.length != 6) - { - throw new PipelineJobException("Improper ped line length: " + tokens.length); - } - - String newRow = pedMap.get(tokens[1]); - if (newRow == null) - { - getPipelineCtx().getLogger().warn("Unable to find pedigree entry for: " + tokens[1] + ", reusing original"); - writer.println(line); - } - else - { - writer.println(newRow); - } - } - } - catch (IOException e) - { - throw new PipelineJobException(e); - } - - return newFamFile; - } - public static class KingWrapper extends AbstractCommandWrapper { public KingWrapper(@Nullable Logger logger)