From e5d6c4985e10f0b62b4bd813818a8da94ce09d6c Mon Sep 17 00:00:00 2001 From: Stephan Schiffels Date: Tue, 4 Feb 2025 21:22:46 +0100 Subject: [PATCH] added VCF output golden tests --- src/Poseidon/CLI/Forge.hs | 2 +- src/Poseidon/CLI/Genoconvert.hs | 2 +- src/Poseidon/GenotypeData.hs | 16 ++++++----- .../GoldenTestCheckSumFile.txt | 5 ++++ .../GoldenTestData/chronicle/chronicle2.yml | 12 ++++----- .../forge/ForgePac21/ForgePac21.bib | 16 +++++++++++ .../forge/ForgePac21/ForgePac21.janno | 7 +++++ .../forge/ForgePac21/ForgePac21.ssf | 3 +++ .../forge/ForgePac21/ForgePac21.vcf | 25 ++++++++++++++++++ .../forge/ForgePac21/POSEIDON.yml | 11 ++++++++ .../genoconvert/out_vcf/Schiffels_2016.vcf.gz | Bin 0 -> 512 bytes .../GoldenTestsRunCommands.hs | 25 ++++++++++++++++++ 12 files changed, 109 insertions(+), 15 deletions(-) create mode 100644 test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.bib create mode 100644 test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.janno create mode 100644 test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.ssf create mode 100644 test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.vcf create mode 100644 test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/POSEIDON.yml create mode 100644 test/PoseidonGoldenTests/GoldenTestData/genoconvert/out_vcf/Schiffels_2016.vcf.gz diff --git a/src/Poseidon/CLI/Forge.hs b/src/Poseidon/CLI/Forge.hs index 3588e3cc2..c23bc4fea 100644 --- a/src/Poseidon/CLI/Forge.hs +++ b/src/Poseidon/CLI/Forge.hs @@ -294,7 +294,7 @@ runForge ( GenotypeVCF outG _ -> let allJannoRows = getJannoRows $ getJointJanno relevantPackages selJannoRows = map (allJannoRows !!) relevantIndices - in writeVCF logA selJannoRows outG newEigenstratIndEntries + in writeVCF logA selJannoRows (outPath outG) let extractPipe = if packageWise then cat else P.map (selectIndices relevantIndices) -- define main forge pipe including file output. -- The final tee forwards the results to be used in the snpCounting-fold diff --git a/src/Poseidon/CLI/Genoconvert.hs b/src/Poseidon/CLI/Genoconvert.hs index acab8dc96..717081aaa 100644 --- a/src/Poseidon/CLI/Genoconvert.hs +++ b/src/Poseidon/CLI/Genoconvert.hs @@ -141,7 +141,7 @@ convertGenoTo outFormat onlyGeno outPath removeOld outPlinkPopMode outZip pac = (outFilesAbsTemp !! 1) (outFilesAbsTemp !! 2) (map (eigenstratInd2PlinkFam outPlinkPopMode) eigenstratIndEntries) - "VCF" -> writeVCF logA jannoRows (outFilesAbsTemp !! 0) eigenstratIndEntries + "VCF" -> writeVCF logA jannoRows (outFilesAbsTemp !! 0) _ -> liftIO . throwIO $ illegalFormatException outFormat runEffect $ eigenstratProd >-> printSNPCopyProgress logA currentTime >-> outConsumer ) (throwIO . PoseidonGenotypeExceptionForward errLength) diff --git a/src/Poseidon/GenotypeData.hs b/src/Poseidon/GenotypeData.hs index 6495c5e02..a2e8edf50 100644 --- a/src/Poseidon/GenotypeData.hs +++ b/src/Poseidon/GenotypeData.hs @@ -2,12 +2,14 @@ module Poseidon.GenotypeData where import Paths_poseidon_hs (version) -import Poseidon.Janno (JannoGenotypePloidy (..), JannoRow (jGenotypePloidy, jPoseidonID)) +import Poseidon.Janno (JannoGenotypePloidy (..), + JannoRow (jGenotypePloidy, jGroupName, jPoseidonID), + ListColumn (..), GroupName(..)) import Poseidon.Utils (LogA, PoseidonException (..), PoseidonIO, checkFile, envInputPlinkMode, logDebug, - logInfo, logWithEnv, padLeft, - logWarning) + logInfo, logWarning, logWithEnv, + padLeft) import Control.Exception (throwIO) import Control.Monad (forM, unless) @@ -367,10 +369,10 @@ printSNPCopyProgress logA startTime = do selectIndices :: [Int] -> (EigenstratSnpEntry, GenoLine) -> (EigenstratSnpEntry, GenoLine) selectIndices indices (snpEntry, genoLine) = (snpEntry, V.fromList [genoLine V.! i | i <- indices]) -writeVCF :: (MonadSafe m) => LogA -> [JannoRow] -> FilePath -> [EigenstratIndEntry] -> Consumer (EigenstratSnpEntry, GenoLine) m () -writeVCF logA jannoRows vcfFile eigenstratIndEntries = do - let sampleNames = [n | EigenstratIndEntry n _ _ <- eigenstratIndEntries] - groupNames = [B.unpack g | EigenstratIndEntry _ _ g <- eigenstratIndEntries] +writeVCF :: (MonadSafe m) => LogA -> [JannoRow] -> FilePath -> Consumer (EigenstratSnpEntry, GenoLine) m () +writeVCF logA jannoRows vcfFile = do + let sampleNames = map (B.pack . jPoseidonID) jannoRows + groupNames = map ((\(GroupName n) -> T.unpack n) . head . getListColumn . jGroupName) jannoRows prog_name <- liftIO getProgName prog_args <- liftIO getArgs let command_line = prog_name ++ " " ++ unwords prog_args diff --git a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt index c3d30b538..102aac1b1 100644 --- a/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt +++ b/test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt @@ -126,6 +126,11 @@ ad7e56177aad0a720f0bde13d47f2ac1 forge forge/ForgePac19/CHANGELOG.md b7b649620cd37bd4a6d6f0f31c1c56da forge forge/ForgePac19/ForgePac19.janno b36b3ca509c235d0f15571c96195e801 forge forge/ForgePac20/POSEIDON.yml e375863bca9e4a91c9855396abde31c7 forge forge/ForgePac20/ForgePac20.janno +1f24e4ad0943c830a58e9ae168f9ffa6 forge forge/ForgePac21/POSEIDON.yml +33b1ac865fb6ae948a01f2b397f4edde forge forge/ForgePac21/ForgePac21.vcf +8846333d9a1de6510f25a3816cc70fef forge forge/ForgePac21/ForgePac21.janno +9089f5d5602937bb7713e1dc8d7a8f2d forge forge/ForgePac21/ForgePac21.ssf +b4f71aff4fbc11594008c3811781cc43 forge forge/ForgePac21/ForgePac21.bib d4a05cfef045648238a94a9d621cf667 chronicle chronicle/chronicle1.yml b43da4d5734371c0648553120f812466 timetravel timetravel/Lamnidis_2018-1.0.0/POSEIDON.yml 8d57ce1a1ab28c0d8a5f391dd790a59c timetravel timetravel/Lamnidis_2018-1.0.1/POSEIDON.yml diff --git a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml index 45575e660..6295c941c 100644 --- a/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml +++ b/test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml @@ -5,25 +5,25 @@ lastModified: 2025-02-04 packages: - title: Lamnidis_2018 version: 1.0.0 - commit: d3e6237bbabacb39bf5289fdf0a0da764b5f499a + commit: e80d54ce4c0dbdd86ed03d700ae3cfe280966700 path: Lamnidis_2018 - title: Lamnidis_2018 version: 1.0.1 - commit: d3e6237bbabacb39bf5289fdf0a0da764b5f499a + commit: e80d54ce4c0dbdd86ed03d700ae3cfe280966700 path: Lamnidis_2018_newVersion - title: Schiffels version: 1.1.1 - commit: 6107cf921031db0c6bbce600be9f69889933f373 + commit: 4fb75ac7968c267ffe745a30511afead4ea263f1 path: Schiffels - title: Schiffels_2016 version: 1.0.1 - commit: d3e6237bbabacb39bf5289fdf0a0da764b5f499a + commit: e80d54ce4c0dbdd86ed03d700ae3cfe280966700 path: Schiffels_2016 - title: Schmid_2028 version: 1.0.0 - commit: d3e6237bbabacb39bf5289fdf0a0da764b5f499a + commit: e80d54ce4c0dbdd86ed03d700ae3cfe280966700 path: Schmid_2028 - title: Wang_2020 version: 0.1.0 - commit: d3e6237bbabacb39bf5289fdf0a0da764b5f499a + commit: e80d54ce4c0dbdd86ed03d700ae3cfe280966700 path: Wang_2020 diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.bib b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.bib new file mode 100644 index 000000000..c3cd3ae0b --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.bib @@ -0,0 +1,16 @@ +@article{Schiffels2016, + title = {Test}, +} + +@book{TestBook2, + title = {TestBook}, +} + +@article{TestPaper1, + title = {TestPaper}, +} + +@article{Wang2020, + title = {Test}, +} + diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.janno b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.janno new file mode 100644 index 000000000..c80e79c45 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.janno @@ -0,0 +1,7 @@ +Poseidon_ID Genetic_Sex Group_Name Alternative_IDs Relation_To Relation_Degree Relation_Type Relation_Note Collection_ID Country Country_ISO Location Site Latitude Longitude Date_Type Date_C14_Labnr Date_C14_Uncal_BP Date_C14_Uncal_BP_Err Date_BC_AD_Start Date_BC_AD_Median Date_BC_AD_Stop Date_Note MT_Haplogroup Y_Haplogroup Source_Tissue Nr_Libraries Library_Names Capture_Type UDG Library_Built Genotype_Ploidy Data_Preparation_Pipeline_URL Endogenous Nr_SNPs Coverage_on_Target_SNPs Damage Contamination Contamination_Err Contamination_Meas Contamination_Note Genetic_Source_Accession_IDs Primary_Contact Publication Note Keywords AddCol1 AddCol2 +XXX002 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016 n/a n/a v1 v2 +XXX004 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016 n/a n/a v1 v2 +XXX005 M POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016;TestPaper1 n/a n/a v1 v2 +XXX006 F POP2 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 9 n/a n/a n/a n/a n/a n/a n/a n/a Schiffels2016;TestPaper1 n/a n/a v1 v2 +SAMPLE2 F 3 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Wang2020;TestPaper1 n/a n/a n/a n/a +SAMPLE4 F 5 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a 7 n/a n/a n/a n/a n/a n/a n/a n/a Wang2020;TestPaper1;TestBook2 n/a n/a n/a n/a diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.ssf b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.ssf new file mode 100644 index 000000000..c51f729c2 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.ssf @@ -0,0 +1,3 @@ +poseidon_IDs udg library_built sample_accession study_accession run_accession sample_alias secondary_sample_accession first_public last_updated instrument_model library_layout library_source instrument_platform library_name library_strategy fastq_ftp fastq_aspera fastq_bytes fastq_md5 read_count submitted_ftp other_info_1 other_info_2 +XXX001;XXX002 n/a n/a n/a n/a ERR3518150 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a A B +XXX002;XXX004;XXX005 n/a n/a n/a n/a ERR3518151 n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a n/a C D diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.vcf b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.vcf new file mode 100644 index 000000000..f6f8b8223 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/ForgePac21.vcf @@ -0,0 +1,25 @@ +##fileformat=VCFv4.2 +##source=trident_v1.6.2.1 +##command_line=poseidon-devtools golden -d . +##group_names=POP2,POP2,POP2,POP2,3,5 +##INFO= +##FILTER= +##FILTER= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT XXX002 XXX004 XXX005 XXX006 SAMPLE2 SAMPLE4 +1 752566 1_752566 G A . PASS NS=4 GT 1/1 1/1 1/1 1/1 ./. ./. +1 842013 1_842013 T G . PASS NS=4 GT 1/1 0/0 0/0 0/0 ./. ./. +1 891021 1_891021 G A . PASS NS=3 GT 1/1 1/1 ./. 0/1 ./. ./. +1 949654 1_949654 A G . PASS NS=4 GT 0/1 1/1 0/1 1/1 ./. ./. +2 1018704 2_1018704 A G . PASS NS=3 GT 0/1 0/1 ./. 1/1 ./. ./. +2 1045331 2_1045331 G A . PASS NS=4 GT 0/0 0/0 0/0 0/0 ./. ./. +2 1048955 2_1048955 A G . PASS NS=4 GT 0/0 0/0 0/0 0/0 ./. ./. +2 1061166 2_1061166 T C . PASS NS=4 GT 0/0 0/1 0/0 0/0 ./. ./. +2 1108637 2_1108637 G A . PASS NS=4 GT 0/0 0/0 0/1 0/0 ./. ./. +11 0 rs0000 A C . s50 NS=2 GT ./. ./. ./. ./. 0/1 1/1 +11 100000 rs1111 A G . s50 NS=2 GT ./. ./. ./. ./. 0/0 0/0 +11 200000 rs2222 A T . s50 NS=2 GT ./. ./. ./. ./. 0/1 0/1 +11 300000 rs3333 C A . s50 NS=2 GT ./. ./. ./. ./. 0/1 0/0 +11 400000 rs4444 G A . s50 NS=2 GT ./. ./. ./. ./. 0/1 1/1 +11 500000 rs5555 T A . s50 NS=2 GT ./. ./. ./. ./. 0/1 0/1 +11 600000 rs6666 G T . s50 NS=2 GT ./. ./. ./. ./. 0/1 1/1 diff --git a/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/POSEIDON.yml b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/POSEIDON.yml new file mode 100644 index 000000000..8905488f5 --- /dev/null +++ b/test/PoseidonGoldenTests/GoldenTestData/forge/ForgePac21/POSEIDON.yml @@ -0,0 +1,11 @@ +poseidonVersion: 2.7.1 +title: ForgePac21 +description: Empty package template. Please add a description +packageVersion: 0.1.0 +lastModified: 1970-01-01 +genotypeData: + format: VCF + genoFile: ForgePac21.vcf +jannoFile: ForgePac21.janno +sequencingSourceFile: ForgePac21.ssf +bibFile: ForgePac21.bib diff --git a/test/PoseidonGoldenTests/GoldenTestData/genoconvert/out_vcf/Schiffels_2016.vcf.gz b/test/PoseidonGoldenTests/GoldenTestData/genoconvert/out_vcf/Schiffels_2016.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..2b8813a9423b8bc4f6d061d323fdb648f9d66520 GIT binary patch literal 512 zcmV+b0{{IViwFP!000006OEEzkD5Rf#h=GdLDuw*#ku?wlG23eYDrwNLR;Sql~GBA zF@Prf?K^PUtm8`4hv6Is?vHctFdXi)6WjALPipViWOg+*eJ~tW{8H|iSC?7Jih6qm z&DL}c#FHJ*^Q1_(r>tP!nO7`JdGVaGYt8woQV#qiDwO9`(ZtJ#l3&i-BFS0ht-=+k z;-agP)8#$m!-@J}ISak7!PHwu>hhBRVWkIZ{B~wuP}Jq`Q_)1n`M1m<LT~zUb!cd4 "zip_roundtrip" "Schiffels_2016.bim" , "genoconvert" "zip_roundtrip" "Schiffels_2016.fam" ] + + let genoconvertOpts7 = GenoconvertOptions { + _genoconvertGenoSources = [PacBaseDir $ testPacsDir "Schiffels_2016"] + , _genoConvertOutFormat = "VCF" + , _genoMaybeOutPackagePath = Just $ testDir "genoconvert" "out_vcf" + , _genoconvertRemoveOld = False + , _genoconvertOutPlinkPopMode = PlinkPopNameAsFamily + , _genoconvertOnlyLatest = False + , _genoconvertOutZip = True + } + testLog $ runGenoconvert genoconvertOpts7 testPipelineRectify :: FilePath -> FilePath -> IO () testPipelineRectify testDir checkFilePath = do @@ -1132,6 +1143,20 @@ testPipelineForge testDir checkFilePath = do "forge" "ForgePac20" "ForgePac20.janno" ] + let forgeOpts21 = forgeOpts1 { + _forgeOutFormat = "VCF" + , _forgeOutPacPath = testDir "forge" "ForgePac21" + , _forgeOutPacName = Just "ForgePac21" + } + let action21 = testLog (runForge forgeOpts21) >> patchLastModified testDir ("forge" "ForgePac21" "POSEIDON.yml") + runAndChecksumFiles checkFilePath testDir action21 "forge" [ + "forge" "ForgePac21" "POSEIDON.yml" + , "forge" "ForgePac21" "ForgePac21.vcf" + , "forge" "ForgePac21" "ForgePac21.janno" + , "forge" "ForgePac21" "ForgePac21.ssf" + , "forge" "ForgePac21" "ForgePac21.bib" + ] + testPipelineChronicleAndTimetravel :: FilePath -> FilePath -> IO () testPipelineChronicleAndTimetravel testDir checkFilePath = do