diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..fd675f6 Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md index e5bf616..6a304cd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Ensemblex: an accuracy-weighted ensemble genetic demultiplexing framework for single-cell RNA sequencing +[![](https://img.shields.io/badge/Documentation-ensemblex-blue)](https://neurobioinfo.github.io/ensemblex/site/) + + ------------- ## Contents - [Introduction](#introduction) diff --git a/ensemblex.pip/.DS_Store b/ensemblex.pip/.DS_Store new file mode 100644 index 0000000..7d8fcee Binary files /dev/null and b/ensemblex.pip/.DS_Store differ diff --git a/ensemblex.pip/README b/ensemblex.pip/README new file mode 100644 index 0000000..ef1b449 --- /dev/null +++ b/ensemblex.pip/README @@ -0,0 +1,12 @@ + +# Ensemblex pipeline +Ensemblex is an accuracy-weighted ensemble framework for genetic demultiplexing of pooled single-cell RNA sequencing (scRNAseq) data. Ensemblex can be used to demultiplex pools **with** or **without** prior genotype information. It is compatible with Linux workstations. Details on installing and running this pipeline can be found in the [ensemblex](https://neurobioinfo.github.io/ensembleux/site/) tutorial. + +## Contributing +Any contributions or suggestions for improving the Ensemblex pipeline are welcome and appreciated. If you encounter any issues, please open an issue in the [GitHub repository](https://github.com/neurobioinfo/ensemblex). Alternatively, you are welcome to email the developers directly; for any questions, please contact Michael Fiorini: michael.fiorini@mail.mcgill.ca + +## License +This project is licensed under the MIT License - see the [LICENSE.md](https://github.com/neurobioinfo/ensembleux/blob/main/LICENSE) file for details + +## Acknowledgement +The Ensemblex pipeline was produced for projects funded by the Canadian Institutes of Health Research and the Michael J. Fox Foundation Parkinson's Progression Markers Initiative (MJFF PPMI) in collaboration with The Neuro's Early Drug Discovery Unit (EDDU), McGill University. It is written by [Michael Fiorini](https://github.com/fiorini9) and [Saeid Amiri](https://github.com/saeidamiri1) with supervision from [Rhalena Thomas](https://github.com/RhalenaThomas) and Sali Farhan at the Montreal Neurological Institute-Hospital. Copyright belongs to the MNI BIOINFO CORE. 
\ No newline at end of file diff --git a/ensemblex.pip/gt/.DS_Store b/ensemblex.pip/gt/.DS_Store new file mode 100644 index 0000000..21536fb Binary files /dev/null and b/ensemblex.pip/gt/.DS_Store differ diff --git a/ensemblex.pip/gt/configs/ensemblex_config.ini b/ensemblex.pip/gt/configs/ensemblex_config.ini new file mode 100644 index 0000000..cc80396 --- /dev/null +++ b/ensemblex.pip/gt/configs/ensemblex_config.ini @@ -0,0 +1,121 @@ +############# Config Trace +############# [DEFAULT] +############# Depending on your system and requirements, +############# change the defaults + +######################## +## General parameters ## +######################## +METHOD=GT +CONTAINER_CMD=singularity + +############### +## Demuxalot ## +############### +## List of Sample IDs in the sample VCF file (e.g., 'Sample_1,Sample_2,Sample_3') +PAR_demuxalot_genotype_names='' +## Read prior strength +PAR_demuxalot_prior_strength=100 +## Minimum read coverage +PAR_demuxalot_minimum_coverage=200 +## Minimum alternative read coverage +PAR_demuxalot_minimum_alternative_coverage=10 +## Number of best SNPs for each donor to use for demultiplexing +PAR_demuxalot_n_best_snps_per_donor=100 +## Genotype prior strength +PAR_demuxalot_genotypes_prior_strength=1 +## Doublet prior strength +PAR_demuxalot_doublet_prior=0.25 + +############## +## Demuxlet ## +############## +## Field to extract the genotypes (GT), genotype likelihood (PL), or posterior probability (GP) from the sample .vcf file +PAR_demuxlet_field=GT + +########### +## Vireo ## +########### +## Number of pooled samples +PAR_vireo_N= +## Field to extract the genotypes (GT), genotype likelihood (PL), or posterior probability (GP) from the sample .vcf file +PAR_vireo_type=GT +## Number of subprocesses for computing +PAR_vireo_processes=20 +## Minimum minor allele frequency +PAR_vireo_minMAF=0.1 +## Minimum aggregated count +PAR_vireo_minCOUNT=20 +## Whether or not to treat donor GT as prior only +PAR_vireo_forcelearnGT=T + +################ +## Souporcell ## +################ +## Minimap2 parameters. For information regarding the minimap2 parameters, please see the documentation: +PAR_minimap2='-ax splice -t 8 -G50k -k 21 -w 11 --sr -A2 -B8 -O12,32 -E2,1 -r200 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g2000 -2K50m --secondary=no' +## Freebayes parameters. For information regarding the freebayes parameters, please see the documentation: +PAR_freebayes=' -iXu -C 2 -q 20 -n 3 -E 1 -m 30 --min-coverage 6' +## Whether or not to consider UMI information when populating coverage matrices +PAR_vartrix_umi=TRUE +## Minimum read mapping quality +PAR_vartrix_mapq=30 +## Number of threads for computing +PAR_vartrix_threads=8 +## Number of pooled samples +PAR_souporcell_k= +## Number of threads for computing +PAR_souporcell_t=8 + +######################### +## ensemblex algorithm ## +######################### +#== Pool parameters ==# +## Number of pooled samples +PAR_ensemblex_sample_size= +## Expected doublet rate for the pool. If using 10X Genomics, the expected doublet rate can be estimated based on the number of recovered cells. +PAR_ensemblex_expected_doublet_rate=0.12 + +#== Set up parameters ==# +## Whether or not to merge the output files of the constituent demultiplexing tools. If running ensemblex on a pool for the first time, this parameter should be set to "Yes". +PAR_ensemblex_merge_constituents=yes + +#== Step 1 parameters: Probabilistic-weighted ensemble ==# +## Whether or not to perform Step 1: Probabilistic-weighted ensemble. 
If running ensemblex on a pool for the first time, this parameter should be set to "Yes". +PAR_ensemblex_probabilistic_weighted_ensemble=yes + +#== Step 2 parameters: Graph-based doublet detection ==# +## Whether or not to perform a preliminary parameter sweep for Step 2: Graph-based doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define the number of confident doublets in the pool (nCD) and the percentile threshold of the nearest neighbour frequency (pT), which can be defined in the following two parameters, respectively. +PAR_ensemblex_preliminary_parameter_sweep=no +## Manually defined number of confident doublets in the pool (nCD). To manually define nCD, uncomment the parameter and enter the value (e.g., PAR_ensemblex_nCD=200) +#PAR_ensemblex_nCD= +## Manually defined percentile threshold of the nearest neighbour frequency (pT). To manually define pT, uncomment the parameter and enter the value (e.g., PAR_ensemblex_pT=0.9) +#PAR_ensemblex_pT= +## Whether or not to perform Step 2: Graph-based doublet detection. If PAR_ensemblex_nCD and PAR_ensemblex_pT are not defined by the user (NULL), ensemblex will automatically determine the optimal parameter values using an unsupervised parameter sweep. If PAR_ensemblex_nCD and PAR_ensemblex_pT are defined by the user, graph-based doublet detection will be performed with the user-defined values. +PAR_ensemblex_graph_based_doublet_detection=yes + +#== Step 3 parameters: Ensemble-independent doublet detection ==# +## Whether or not to perform a preliminary parameter sweep for Step 3: Ensemble-independent doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define which constituent tools to utilize for ensemble-independent doublet detection. Users can define which tools to utilize for ensemble-independent doublet detection in the following parameters. +PAR_ensemblex_preliminary_ensemble_independent_doublet=no +## Whether or not to perform Step 3: Ensemble-independent doublet detection. +PAR_ensemblex_ensemble_independent_doublet=yes +## Whether or not to label doublets identified by Demuxalot as doublets. Only doublets with assignment probabilities exceeding Demuxalot's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Demuxalot_threshold=yes +## Whether or not to label doublets identified by Demuxalot as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Demuxalot_no_threshold=no +## Whether or not to label doublets identified by Demuxlet as doublets. Only doublets with assignment probabilities exceeding Demuxlet's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Demuxlet_threshold=no +## Whether or not to label doublets identified by Demuxlet as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Demuxlet_no_threshold=no +## Whether or not to label doublets identified by Souporcell as doublets. Only doublets with assignment probabilities exceeding Souporcell's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Souporcell_threshold=no +## Whether or not to label doublets identified by Souporcell as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Souporcell_no_threshold=no +## Whether or not to label doublets identified by Vireo as doublets. 
Only doublets with assignment probabilities exceeding Vireo's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Vireo_threshold=yes +## Whether or not to label doublets identified by Vireo as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Vireo_no_threshold=no + +#== Confidence score parameters ==# +## Whether or not to compute ensemblex's singlet confidence score. This will define low-confidence assignments, which should be removed from downstream analyses. +PAR_ensemblex_compute_singlet_confidence=yes diff --git a/ensemblex.pip/gt/scripts/.DS_Store b/ensemblex.pip/gt/scripts/.DS_Store new file mode 100644 index 0000000..f5970d4 Binary files /dev/null and b/ensemblex.pip/gt/scripts/.DS_Store differ diff --git a/ensemblex.pip/gt/scripts/demuxalot/pipeline_demuxalot.sh b/ensemblex.pip/gt/scripts/demuxalot/pipeline_demuxalot.sh new file mode 100644 index 0000000..4e27d94 --- /dev/null +++ b/ensemblex.pip/gt/scripts/demuxalot/pipeline_demuxalot.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step Demuxalot GT submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* PAR_demuxalot_genotype_names $PAR_demuxalot_genotype_names" +echo "* PAR_demuxalot_prior_strength $PAR_demuxalot_prior_strength" +echo "* PAR_demuxalot_minimum_coverage $PAR_demuxalot_minimum_coverage" +echo "* PAR_demuxalot_minimum_alternative_coverage $PAR_demuxalot_minimum_alternative_coverage" +echo "* PAR_demuxalot_n_best_snps_per_donor $PAR_demuxalot_n_best_snps_per_donor" +echo "* PAR_demuxalot_genotypes_prior_strength $PAR_demuxalot_genotypes_prior_strength" +echo "* PAR_demuxalot_doublet_prior $PAR_demuxalot_doublet_prior" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "Start of demuxalot" +$CONTAINER_CMD exec --bind $OUTPUT_DIR,$PIPELINE_HOME ${CONTAINER1} python3 $PIPELINE_HOME/gt/scripts/demuxalot/pipline_demuxalot.py -fl $OUTPUT_DIR -p1 $PAR_demuxalot_genotype_names -p2 $PAR_demuxalot_prior_strength -p3 $PAR_demuxalot_minimum_coverage -p4 $PAR_demuxalot_minimum_alternative_coverage -p5 $PAR_demuxalot_n_best_snps_per_donor -p6 $PAR_demuxalot_genotypes_prior_strength -p7 $PAR_demuxalot_doublet_prior +echo "End of demuxalot" +exit 0 diff --git a/ensemblex.pip/gt/scripts/demuxalot/pipline_demuxalot.py b/ensemblex.pip/gt/scripts/demuxalot/pipline_demuxalot.py new file mode 100644 index 0000000..4882c3a --- /dev/null +++ b/ensemblex.pip/gt/scripts/demuxalot/pipline_demuxalot.py @@ -0,0 +1,101 @@ +import argparse +import sys +import os + + +parser = argparse.ArgumentParser() +parser.add_argument('-fl', '--folder',action='store', type=str,required=False, help='output folder') +parser.add_argument('-p1', 
'--pdgn',action='store',required=False, help='PAR_demuxalot_genotype_names') +parser.add_argument('-p2', '--pdps',action='store', type=int,required=False, help='PAR_demuxalot_prior_strength') +parser.add_argument('-p3', '--pdmc',action='store', type=int,required=False, help='PAR_demuxalot_minimum_coverage') +parser.add_argument('-p4', '--pdmac',action='store', type=int,required=False, help='PAR_demuxalot_minimum_alternative_coverage') +parser.add_argument('-p5', '--pdnbspd',action='store', type=int, required=False, help='PAR_demuxalot_n_best_snps_per_donor') +parser.add_argument('-p6', '--pdgps',action='store',type=int,required=False, help='PAR_demuxalot_genotypes_prior_strength') +parser.add_argument('-p7', '--pddp',action='store', type=float,required=False, help='PAR_demuxalot_doublet_prior') +args = parser.parse_args() +folder=args.folder +PAR_demuxalot_genotype_names=args.pdgn.split(",") + +PAR_demuxalot_prior_strength=args.pdps +PAR_demuxalot_minimum_coverage=args.pdmc +PAR_demuxalot_minimum_alternative_coverage=args.pdmac +PAR_demuxalot_n_best_snps_per_donor=args.pdnbspd +PAR_demuxalot_genotypes_prior_strength=args.pdgps +PAR_demuxalot_doublet_prior=args.pddp + + +print(PAR_demuxalot_doublet_prior) + +from pathlib import Path +import pandas as pd +import pysam +# usr/local/lib/python3.10/dist-packages +from demuxalot.utils import download_file +from demuxalot import BarcodeHandler, ProbabilisticGenotypes, Demultiplexer, count_snps, detect_snps_positions +from pysam import VariantFile +import io +from demuxalot import utils + +print('Part I') +handler = BarcodeHandler.from_file(os.path.join(folder,'input_files/pooled_barcodes.tsv')) ### modify + + + +genotype_names = PAR_demuxalot_genotype_names + +genotype_names.sort() +genotypes = ProbabilisticGenotypes(genotype_names=genotype_names) + +genotypes.add_vcf(os.path.join(folder,'input_files/pooled_samples.vcf'), prior_strength=PAR_demuxalot_prior_strength) ### modify + +print('Part II') + +pysam.index(os.path.join(folder,'input_files/pooled_bam.bam')) ### modify +print('Part III') +counts = count_snps( + bamfile_location=os.path.join(folder,'input_files/pooled_bam.bam'), ### modify + chromosome2positions=genotypes.get_chromosome2positions(), + barcode_handler=handler, +) + +utils.summarize_counted_SNPs(counts) +print('Part IV') + +new_snps_filename = 'new_snps_single_file.betas' +_ = detect_snps_positions( + bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify + genotypes=genotypes, + barcode_handler=handler, + minimum_coverage=PAR_demuxalot_minimum_coverage, + minimum_alternative_coverage=PAR_demuxalot_minimum_alternative_coverage, + result_beta_prior_filename=str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')), + n_best_snps_per_donor=PAR_demuxalot_n_best_snps_per_donor, +) + +print('Part V') + +genotypes_with_new_snps = genotypes.clone() +genotypes_with_new_snps.add_prior_betas(str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')), prior_strength=PAR_demuxalot_genotypes_prior_strength) + +counts_enriched = count_snps( + bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify + chromosome2positions=genotypes_with_new_snps.get_chromosome2positions(), + barcode_handler=handler, +) + +print('Part VI') + +learnt_enriched_genotypes, probs_learning_new_snps = Demultiplexer.learn_genotypes( + counts_enriched, + genotypes_with_new_snps, + barcode_handler=handler, + doublet_prior=PAR_demuxalot_doublet_prior, +) + +print('Part VII') + +probs_learning_new_snps.head() 
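+# Descriptive note (added for clarity; not part of the original script): probs_learning_new_snps holds the per-barcode posterior assignment probabilities returned by Demultiplexer.learn_genotypes, with one column per donor and, because doublet_prior > 0, additional columns for donor pairs. +# The downstream ensemblex R step (demuxalot_prep) assigns each barcode to its highest-probability column, so the CSV written below is the only output the ensemble step needs from Demuxalot.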
+ + +probs_learning_new_snps.to_csv(os.path.join(folder,'demuxalot/Demuxalot_result.csv')) ### modify diff --git a/ensemblex.pip/gt/scripts/demuxlet/pipeline_demuxlet.sh b/ensemblex.pip/gt/scripts/demuxlet/pipeline_demuxlet.sh new file mode 100644 index 0000000..fd47a9e --- /dev/null +++ b/ensemblex.pip/gt/scripts/demuxlet/pipeline_demuxlet.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step Demuxlet submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* PAR_demuxlet_field: $PAR_demuxlet_field" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif +# SOFT_SOUP=/opt/souporcell +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "Start of pileup step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} /opt/popscle/bin/popscle dsc-pileup \ +--sam $OUTPUT_DIR/input_files/pooled_bam.bam \ +--vcf $OUTPUT_DIR/input_files/pooled_samples.vcf \ +--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \ +--out $OUTPUT_DIR/demuxlet/pileup +echo "End of pileup step" + +echo "Start of demuxlet step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} /opt/popscle/bin/popscle demuxlet \ +--plp $OUTPUT_DIR/demuxlet/pileup \ +--vcf $OUTPUT_DIR/input_files/pooled_samples.vcf \ +--field $PAR_demuxlet_field \ +--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \ +--out $OUTPUT_DIR/demuxlet/outs +echo "End of demuxlet step" + +exit 0 diff --git a/ensemblex.pip/gt/scripts/ensemblexing/ensemblexing.R b/ensemblex.pip/gt/scripts/ensemblexing/ensemblexing.R new file mode 100644 index 0000000..ba64fc1 --- /dev/null +++ b/ensemblex.pip/gt/scripts/ensemblexing/ensemblexing.R @@ -0,0 +1,357 @@ +## load parameters +args = commandArgs(trailingOnly=TRUE) +home_dir=args[1] +output_dir=args[2] + +source(paste0(home_dir,'/gt/scripts/ensemblexing/functions.R')) +## load library +# .libPaths(r_lib_path) +packages<-c('dplyr','tidyr','pdfCluster','data.table','readr','lubridate','tidyverse','moments','mousetrap','usethis','devtools','desc','kneedle','ROCit','ggplot2','factoextra','ggpubr','splines','stats','pathviewr') +invisible(lapply(packages, library, character.only = TRUE)) + +par_sample_size= as.numeric(args[3]) +par_expected_doublet_rate= as.numeric(args[4]) +par_merge_constituents= args[5] +par_probabilistic_weighted_ensemble= args[6] +par_preliminary_parameter_sweep= args[7] +par_graph_based_doublet_detection= args[8] +par_preliminary_ensemble_independent_doublet= args[9] +par_ensemble_independent_doublet= args[10] +par_doublet_Demuxalot_threshold= args[11] +par_doublet_Demuxalot_no_threshold= args[12] +par_doublet_Demuxlet_threshold= args[13] +par_doublet_Demuxlet_no_threshold= args[14] +par_doublet_Souporcell_threshold= args[15] +par_doublet_Souporcell_no_threshold= args[16] +par_doublet_Vireo_threshold= args[17] +par_doublet_Vireo_no_threshold= 
args[18] +par_compute_singlet_confidence= args[19] +par_ensemblex_nCD= as.numeric(args[20]) +par_ensemblex_pT= as.numeric(args[21]) + +## Print the parameters defined by the user +print("Loaded parameters from config.ini") +print(paste0("par_sample_size=",par_sample_size)) +print(paste0("par_expected_doublet_rate=",par_expected_doublet_rate)) +print(paste0("par_merge_constituents=",par_merge_constituents)) +print(paste0("par_probabilistic_weighted_ensemble=",par_probabilistic_weighted_ensemble)) +print(paste0("par_preliminary_parameter_sweep=",par_preliminary_parameter_sweep)) +print(paste0("par_graph_based_doublet_detection=",par_graph_based_doublet_detection)) +print(paste0("par_ensemblex_nCD=",par_ensemblex_nCD)) +print(paste0("par_ensemblex_pT=",par_ensemblex_pT)) +print(paste0("par_preliminary_ensemble_independent_doublet=",par_preliminary_ensemble_independent_doublet)) +print(paste0("par_ensemble_independent_doublet=",par_ensemble_independent_doublet)) +print(paste0("par_doublet_Demuxalot_threshold=",par_doublet_Demuxalot_threshold)) +print(paste0("par_doublet_Demuxalot_no_threshold=",par_doublet_Demuxalot_no_threshold)) +print(paste0("par_doublet_Demuxlet_threshold=",par_doublet_Demuxlet_threshold)) +print(paste0("par_doublet_Demuxlet_no_threshold=",par_doublet_Demuxlet_no_threshold)) +print(paste0("par_doublet_Souporcell_threshold=",par_doublet_Souporcell_threshold)) +print(paste0("par_doublet_Souporcell_no_threshold=",par_doublet_Souporcell_no_threshold)) +print(paste0("par_doublet_Vireo_threshold=",par_doublet_Vireo_threshold)) +print(paste0("par_doublet_Vireo_no_threshold=",par_doublet_Vireo_no_threshold)) +print(paste0("par_compute_singlet_confidence=",par_compute_singlet_confidence)) + + +par_output_dir<- paste0(output_dir,"/ensemblex_gt") +## Demuxalot +par_demuxalot<- paste0(output_dir,"/demuxalot/Demuxalot_result.csv") +## Demuxlet +par_demuxlet<- paste0(output_dir,"/demuxlet/outs.best") +## Souporcell +par_souporcell<- paste0(output_dir,"/souporcell/clusters.tsv") +## Vireo +par_vireo<- paste0(output_dir,"/vireo_gt/donor_ids.tsv") + +########################################################################################################################### +# CONSTITUENT DATA PREPARATION AND MERGING +########################################################################################################################### +if (tolower(par_merge_constituents)=="yes"){ + message("Performing constituent data preparation and merge.") + ## Import constituent tool output files + vireo<- read.delim(par_vireo, header = T, sep = "\t") + dim(vireo) ## eventually remove this + souporcell <- read.delim(par_souporcell, header = T, sep = "\t") + dim(souporcell) ## eventually remove this + freemuxlet <-read.delim(par_demuxlet, header = T, sep = "\t") + dim(freemuxlet) ## eventually remove this + demuxalot <- read.delim(par_demuxalot, header = T, sep = ",", check.names=FALSE) + colnames(demuxalot)[1] <- "X" + dim(demuxalot) ## eventually remove this + + ## check output files for duplicated barcodes + message("Checking output files for duplicated barcodes.") + ## Vireo + if (length(unique(vireo$cell)) != nrow(vireo)){ + vireo <- vireo[!duplicated(vireo$cell),] + temp <- vireo[duplicated(vireo$cell),] + if (nrow(temp) == 0){ + print("Duplicated 
barcodes in Vireo resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(vireo), " unique barcodes in Vireo output.")) + } + } else { + print("No duplicated barcodes observed in Vireo output.") + print(paste0("Observed ", nrow(vireo), " unique barcodes in Vireo output.")) + } + + ## Souporcell + if (length(unique(souporcell$barcode)) != nrow(souporcell)){ + souporcell <- souporcell[!duplicated(souporcell$barcode),] + temp <- souporcell[duplicated(souporcell$barcode),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Souporcell resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(souporcell), " unique barcodes in Souporcell output.")) + } + } else { + print("No duplicated barcodes observed in Souporcell output.") + print(paste0("Observed ", nrow(souporcell), " unique barcodes in Souporcell output.")) + } + + ## Demuxlet + if (length(unique(freemuxlet$BARCODE)) != nrow(freemuxlet)){ + freemuxlet <- freemuxlet[!duplicated(freemuxlet$BARCODE),] + temp <- freemuxlet[duplicated(freemuxlet$BARCODE),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Demuxlet resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(freemuxlet), " unique barcodes in Demuxlet output.")) + } + } else { + print("No duplicated barcodes observed in Demuxlet output.") + print(paste0("Observed ", nrow(freemuxlet), " unique barcodes in Demuxlet output.")) + } + + ## Demuxalot + if (length(unique(demuxalot$X)) != nrow(demuxalot)){ + demuxalot <- demuxalot[!duplicated(demuxalot$X),] + temp <- demuxalot[duplicated(demuxalot$X),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Demuxalot resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(demuxalot), " unique barcodes in Demuxalot output.")) + } + } else { + print("No duplicated barcodes observed in Demuxalot output.") + print(paste0("Observed ", nrow(demuxalot), " unique barcodes in Demuxalot output.")) + } + + ## Make sure that all of the output files have the same number of droplets + if (nrow(vireo) == nrow(freemuxlet) & + nrow(vireo) == nrow(demuxalot) & + nrow(vireo) == nrow(souporcell)) { + message(paste0("All demultiplexing tools have reported the same number of cells: ", nrow(souporcell), "." )) + } else { + message(paste0("WARNING: Demultiplexing tools have reported different number of cells; Vireo: ", nrow(vireo), "; Demuxlet: ", nrow(freemuxlet), "; Demuxalot: ", nrow(demuxalot), "; Souporcell: ", nrow(souporcell), ".")) + } + + #### Make sure that the Sample IDs from each output file have the same structure. Here, we will replace "-" or "." with "_" + ### Vireo + ## fix potential "-" + vireo$donor_id <- gsub(pattern = "-", replacement = "_",x = vireo$donor_id, fixed = TRUE) + vireo$best_singlet <- gsub(pattern = "-", replacement = "_",x = vireo$best_singlet, fixed = TRUE) + vireo$best_doublet <- gsub(pattern = "-", replacement = "_",x = vireo$best_doublet, fixed = TRUE) + ## fix potential "." 
+ vireo$donor_id <- gsub(pattern = ".", replacement = "_",x = vireo$donor_id, fixed = TRUE) + vireo$best_singlet <- gsub(pattern = ".", replacement = "_",x = vireo$best_singlet, fixed = TRUE) + vireo$best_doublet <- gsub(pattern = ".", replacement = "_",x = vireo$best_doublet, fixed = TRUE) + + ### Demuxalot + ## fix potential "-" + colnames(demuxalot) <- gsub(pattern = "-", replacement = "_",x = colnames(demuxalot), fixed = TRUE) + ## fix potential "." + colnames(demuxalot) <- gsub(pattern = ".", replacement = "_",x = colnames(demuxalot), fixed = TRUE) + + ## Demuxlet + ## fix potential "-" + freemuxlet$BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$BEST.GUESS, fixed = TRUE) + freemuxlet$NEXT.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$NEXT.GUESS, fixed = TRUE) + freemuxlet$SNG.BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$SNG.BEST.GUESS, fixed = TRUE) + freemuxlet$SNG.NEXT.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$SNG.NEXT.GUESS, fixed = TRUE) + freemuxlet$DBL.BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$DBL.BEST.GUESS, fixed = TRUE) + ## fix potential "." + freemuxlet$BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$BEST.GUESS, fixed = TRUE) + freemuxlet$NEXT.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$NEXT.GUESS, fixed = TRUE) + freemuxlet$SNG.BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$SNG.BEST.GUESS, fixed = TRUE) + freemuxlet$SNG.NEXT.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$SNG.NEXT.GUESS, fixed = TRUE) + freemuxlet$DBL.BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$DBL.BEST.GUESS, fixed = TRUE) + + ## Check the intersect of unique Sample IDs to make sure they are matching + vireo_samples <- vireo$best_singlet + demuxalot_samples <- colnames(demuxalot)[c(2:(par_sample_size + 1))] + freemuxlet_samples <- freemuxlet$SNG.BEST.GUESS + Sample_intersect <- Reduce(intersect, list(vireo_samples,demuxalot_samples,freemuxlet_samples)) + + ## print message + if (length(Sample_intersect) == 0){ + message("WARNING: Sample IDs may differ between constituent demultiplexing tool outputs.") + } else { + message("Identified matching Sample IDs between constituent demultiplexing tool outputs.") + } + + ## Demuxalot prep + demuxalot <- demuxalot_prep(demuxalot) + + ## Vireo prep + vireo <- vireo_prep(vireo) + + ## Demuxlet prep + freemuxlet <- freemuxlet_prep(freemuxlet) + + ## Souporcell prep + souporcell <- souporcell_prep(souporcell, demuxalot, vireo, freemuxlet) + + ## Merge results and calculate consensus + merge_df <- merge_concensus(vireo,souporcell, freemuxlet, demuxalot) + + ## Ensure that if outputs do not have matching droplet counts, the missing droplets are unassigned. 
This occurs with Demuxlet/freemuxlet + merge_df$vireo_best_assignment[is.na(merge_df$vireo_best_assignment)] <- "unassigned" + merge_df$souporcell_best_assignment[is.na(merge_df$souporcell_best_assignment)] <- "unassigned" + merge_df$demuxlet_best_assignment[is.na(merge_df$demuxlet_best_assignment)] <- "unassigned" + merge_df$demuxalot_best_assignment[is.na(merge_df$demuxalot_best_assignment)] <- "unassigned" + + ## Check if the merged csv file has duplicated barcodes + if (length(unique(merge_df$barcode)) != nrow(merge_df)){ + merge_df <- merge_df[!duplicated(merge_df$barcode),] + temp <- merge_df[duplicated(merge_df$barcode),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Merged file resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(merge_df), " unique barcodes in Merged file.")) + } + } else { + print("No duplicated barcodes observed in Merged file.") + print(paste0("Observed ", nrow(merge_df), " unique barcodes in Merged file.")) + } + + ## Write merge CSV file + write.csv(merge_df, paste(par_output_dir,'/constituent_tool_merge.csv',sep =""), row.names=T) + } else { + message("Skipping constituent data preparation and merge, and loading existing csv file.") +} + + +########################################################################################################################### +## Probabilistic-weighted Ensemble +########################################################################################################################### +## Read in merge CSV file +if (tolower(par_probabilistic_weighted_ensemble)=="yes" & file.exists(paste0(par_output_dir,'/constituent_tool_merge.csv'))){ + message("Loading merged output file.") + merge_df <- read.delim(paste(par_output_dir,'/constituent_tool_merge.csv',sep =""), header = T, sep = ",") +} else if (tolower(par_probabilistic_weighted_ensemble)=="yes" & !file.exists(paste0(par_output_dir,'/constituent_tool_merge.csv'))) { + stop('Exiting ensemblex; constituent_tool_merge.csv file cannot be found.') +} else { + message("Skipping the probabilistic-weighted ensemble component of the ensemblex framework and loading existing csv file.") +} + +## Probabilistic-weighted Ensemble (PWE) function +if (tolower(par_probabilistic_weighted_ensemble)=="yes"){ + message("Performing the probabilistic-weighted ensemble component of the ensemblex framework.") +## Run PWE + result_test <- BA_weight_consensus(merge_df, par_sample_size, par_output_dir) +} else { + message("Skipping the probabilistic-weighted ensemble component of the ensemblex framework and loading existing csv file.") +} + +########################################################################################################################### +# GRAPH-BASED DOUBLET DETECTION +########################################################################################################################### +## Read in inputs +if (tolower(par_preliminary_parameter_sweep)=="yes" | tolower(par_graph_based_doublet_detection)=="yes" ){ + if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){ + message("Loading probabilistic-weighted ensemble output.") + result_test <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",") + result_test <- result_test[,-1] + } else { + stop('Exiting ensemblex; step1_cell_assignment.csv cannot be found.') + } +} else { + message("Skipping the graph-based doublet detection component of the ensemblex framework and loading 
existing csv file.") +} + + +if (tolower(par_preliminary_parameter_sweep)=="yes" & tolower(par_graph_based_doublet_detection)=="no"){ + message("Performing a preliminary parameter sweep for graph-based doublet detection.") + ## apply graph-based doublet detection parameter sweep + graph_based_doublet_detection_par_sweep(result_test, par_expected_doublet_rate, par_output_dir) + +} else if (tolower(par_preliminary_parameter_sweep)=="yes" & tolower(par_graph_based_doublet_detection)=="yes" & exists("par_ensemblex_nCD") & exists("par_ensemblex_pT") ){ + message("Performing graph-based doublet detection with manually defined parameters.") + ## apply graph-based doublet detection with manually defined parameters + graph_based_doublet_detection_manual_par(result_test, par_expected_doublet_rate, par_output_dir, par_ensemblex_pT, par_ensemblex_nCD) + +} else if (tolower(par_preliminary_parameter_sweep)=="no" & tolower(par_graph_based_doublet_detection)=="yes"){ + message("Performing graph-based doublet detection with ensemblex-estimated parameters.") + ## apply graph-based doublet detection with estimated optimal parameters + graph_based_doublet_detection_estimated_par(result_test, par_expected_doublet_rate, par_output_dir) + +} else { + message("Skipping graph-based doublet detection.") + +} + +########################################################################################################################### +# ENSEMBLE-INDEPENDENT DOUBLET DETECTION +########################################################################################################################### +## Read in inputs +if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" | tolower(par_ensemble_independent_doublet)=="yes"){ + if (file.exists(paste0(par_output_dir,"/step2",'/Step2_cell_assignment.csv'))){ + message("Loading graph-based doublet detection output.") + result_2 <- read.delim(paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){ + message("Loading probabilistic-weighted ensemble output.") + result_2 <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else { + stop('Exiting ensemblex; Step2_cell_assignment.csv and step1_cell_assignment.csv files cannot be found.') + } +} else { + message("Skipping the ensemble-independent doublet detection component of the ensemblex framework and loading existing csv file.") +} + + +if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" & tolower(par_ensemble_independent_doublet)=="yes"){ +## Run EID + result_2 <- ensemble_independent_doublet_detections(result_2, par_output_dir) +} else if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" & tolower(par_ensemble_independent_doublet)=="no"){ +## Run EID + ensemble_independent_doublet_detections_prelim(result_2, par_output_dir) +} else if (tolower(par_preliminary_ensemble_independent_doublet)=="no" & tolower(par_ensemble_independent_doublet)=="yes") { +## Run EID + result_2 <- ensemble_independent_doublet_detections(result_2, par_output_dir) +} else { + message("Skipping ensemble independent doublet detection.") +} + + +########################################################################################################################### +# CONFIDENCE-SCORE 
+########################################################################################################################### +## Read in the inputs +if (tolower(par_compute_singlet_confidence)=="yes"){ + if (file.exists(paste0(par_output_dir,"/step3",'/Step3_cell_assignment.csv'))){ + message("Loading ensemble-independent doublet detection output.") + result_2 <- read.delim(paste(par_output_dir,"/step3",'/Step3_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else if (file.exists(paste0(par_output_dir,"/step2",'/Step2_cell_assignment.csv'))){ + message("Loading graph-based doublet detection output.") + result_2 <- read.delim(paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){ + message("Loading probabilistic-weighted ensemble output.") + result_2 <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else { + stop('Exiting ensemblex; Step3_cell_assignment.csv, Step2_cell_assignment.csv and step1_cell_assignment.csv files cannot be found.') + } +} else { + message("Skipping the singlet confidence score component of the ensemblex framework.") +} + + +if (tolower(par_compute_singlet_confidence)=="yes"){ +## Compute ensemblex singlet confidence + eval_df <- confidence_score(result_2, par_output_dir,par_sample_size) +} + + diff --git a/ensemblex.pip/gt/scripts/ensemblexing/functions.R b/ensemblex.pip/gt/scripts/ensemblexing/functions.R new file mode 100644 index 0000000..0efc07a --- /dev/null +++ b/ensemblex.pip/gt/scripts/ensemblexing/functions.R @@ -0,0 +1,3077 @@ +########################################################################################################################### +# CONSTITUENT OUTPUT PREP +########################################################################################################################### +demuxalot_prep <- function(demuxalot){ + ## Compute number of columns in the dataframe + xx <- ncol(demuxalot) + + ## Change doublet column names for downstream operations + list_col <- c((par_sample_size + 2):xx) + for (i in list_col){ + colnames(demuxalot)[i] <- paste0("doublet_",i) + } + + ## Place Demuxalot output in long format + data_long <- gather(demuxalot, assignment, stat, 2:xx, factor_key=F) + + ## Identify the top sample prediction for each barcode + result <- data_long %>% + dplyr::group_by(X) %>% + dplyr::filter(stat == max(stat)) + nrow(result) + + ## Identify unidentifiable cells; these are barcodes that have two or more samples assigned with the same probability + duplicates <- subset(result,duplicated(X)) + duplicated_id <- duplicates$X + result$assignment[result$X %in% duplicated_id] <- "unassigned" + + ## Remove duplicated barcodes to only keep one sample assignment; these barcodes are labelled as unassigned. 
+ result <- subset(result,!duplicated(X)) + + ## Set up demuxalot assignment + result$demuxalot_sample <- result$assignment + result$demuxalot_sample[str_detect(result$demuxalot_sample, "doublet")] <- "doublet" + + ## Identify Demuxalot doublet probability + # Keep only doublet assignments + data_long_2 <- data_long[str_detect(data_long$assignment, "doublet"),] + # Keep only the top prediction for each barcode + result_doublet <- data_long_2 %>% + dplyr::group_by(X) %>% + dplyr::filter(stat == max(stat)) + # Keep only one of each barcode --> some barcodes will have multiple doublet combinations with the same doublet probability + result_doublet <- result_doublet[!(duplicated(result_doublet$X)),] + result_doublet <- result_doublet[,c(1,3)] + colnames(result_doublet) <- c("X", "demuxalot_doublet_probability") + + ## Merge doublet probability back to main dataframe + nrow(result) == nrow(result_doublet) + demuxalot_2 <- merge(result, result_doublet, by = "X") + nrow(demuxalot_2) == nrow(demuxalot) + demuxalot <- demuxalot_2 + + ## Print number of retained droplets + message(paste0("Retained ", nrow(demuxalot), " droplets during Demuxalot preparation.")) + demuxalot +} + +vireo_prep <- function(vireo){ + ## Vireo's sample assignment + vireo$vireo_sample <- vireo$donor_id + + ## Prepare Vireo's best guess assignment + vireo$vireo_best <- vireo$donor_id + vireo$vireo_best[vireo$vireo_best == "unassigned" & vireo$prob_max > vireo$prob_doublet] <- vireo$best_singlet[vireo$vireo_best == "unassigned" & vireo$prob_max > vireo$prob_doublet] + vireo$vireo_best[vireo$vireo_best == "unassigned" & vireo$prob_max < vireo$prob_doublet] <- "doublet" + + ## Print number of retained droplets + message(paste0("Retained ", nrow(vireo), " droplets during Vireo preparation.")) + vireo +} + +freemuxlet_prep <- function(freemuxlet){ + ## Prepare Demuxlet best guess + freemuxlet$guess_1 <- sub(".*,(.*),.*", "\\1", freemuxlet$BEST.GUESS) + freemuxlet$guess_2 <-sub("(.*),.*,.*", "\\1", freemuxlet$BEST.GUESS) + freemuxlet$freemuxlet_best[freemuxlet$guess_1 != freemuxlet$guess_2] <- "doublet" + freemuxlet$freemuxlet_best[freemuxlet$guess_1 == freemuxlet$guess_2] <- freemuxlet$guess_1[freemuxlet$guess_1 == freemuxlet$guess_2] + + ## Prepare Demuxlet sample assignment + freemuxlet$freemuxlet_sample <- freemuxlet$freemuxlet_best + freemuxlet$freemuxlet_sample[freemuxlet$DROPLET.TYPE == "AMB"] <- "unassigned" + freemuxlet <- freemuxlet %>% dplyr::select(-c(guess_1, guess_2)) + + ## Print number of retained droplets + message(paste0("Retained ", nrow(freemuxlet), " droplets during Demuxlet preparation.")) + freemuxlet +} + +souporcell_prep <- function(souporcell, demuxalot, vireo, freemuxlet){ + + ## Number of singlet cluster identified by Souporcell + soup_singlet_cluster <- unique(souporcell$assignment) + soup_singlet_cluster <- soup_singlet_cluster[!grepl("/", soup_singlet_cluster)] + + ## Demuxalot + demuxalot_for_soup <- demuxalot[,c(1,4)] + nrow(demuxalot_for_soup) == nrow(souporcell) + ## Merge Souporcell and Demuxalot + merge_soup_demuxalot <- merge(demuxalot_for_soup, souporcell, by.x = "X", by.y = "barcode") + nrow(merge_soup_demuxalot) == nrow(demuxalot_for_soup) + merge_soup_demuxalot <- merge_soup_demuxalot[,c(2:4)] + colnames(merge_soup_demuxalot) <- c("Sample_ID", "status", "assignment") + + ## Vireo + vireo_for_soup <- vireo[,c(1,10)] + nrow(vireo_for_soup) == nrow(souporcell) + ## Merge Souporcell and Vireo + merge_soup_vireo <- merge(vireo_for_soup, souporcell, by.x = "cell", by.y = "barcode") + 
nrow(merge_soup_vireo) == nrow(vireo_for_soup) + merge_soup_vireo <- merge_soup_vireo[,c(2:4)] + colnames(merge_soup_vireo) <- c("Sample_ID", "status", "assignment") + + ## Demuxlet + demuxlet_for_soup <- freemuxlet[,c(2,21)] + nrow(demuxlet_for_soup) == nrow(souporcell) + ## Merge Souporcell and Demuxlet + merge_soup_demuxlet <- merge(demuxlet_for_soup, souporcell, by.x = "BARCODE", by.y = "barcode") + nrow(merge_soup_demuxlet) == nrow(demuxlet_for_soup) + merge_soup_demuxlet <- merge_soup_demuxlet[,c(2:4)] + colnames(merge_soup_demuxlet) <- c("Sample_ID", "status", "assignment") + + ### Calculate cluster-Sample ID probabilities based on assignments of remaining constituent tools + bind_df <- rbind(merge_soup_demuxalot, merge_soup_vireo, merge_soup_demuxlet) + bind_df <- subset(bind_df, status != "doublet") + bind_df <- subset(bind_df, status != "unassigned") + bind_df <- subset(bind_df, Sample_ID != "doublet") + souporcell_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame_final <- data.frame(souporcell_assignment, Sample_ID, Probability) + + for(i in unique(bind_df$assignment)){ + df_lim <- subset(bind_df, assignment == i) + n_droplets <- nrow(df_lim) + + souporcell_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame <- data.frame(souporcell_assignment, Sample_ID, Probability) + + for (j in unique(bind_df$Sample_ID)){ + df_lim2 <- subset(df_lim, Sample_ID == j ) + prob <- nrow(df_lim2)/n_droplets + + souporcell_assignment = i + Sample_ID = j + Probability = prob + data_frame_temp <- data.frame(souporcell_assignment, Sample_ID, Probability) + data_frame <- rbind(data_frame, data_frame_temp) + } + data_frame_final <- rbind(data_frame_final, data_frame) + } + + data_frame_final <- subset(data_frame_final, Probability != "none") + data_frame_final$Probability <- as.numeric(data_frame_final$Probability) + + ## take maximum cluster-sample probability + result <- data_frame_final %>% + dplyr::group_by(souporcell_assignment) %>% + dplyr::filter(Probability == max(Probability)) + + ## set duplicated clusters to unassigned + n_occur <- data.frame(table(result$souporcell_assignment)) + n_occur <- subset(n_occur, Freq > 1) + duplicated <- n_occur$Var1 + result$Sample_ID[result$souporcell_assignment %in% duplicated] <- "unassigned" + + #temp_df <- data.frame(result) + result <- result[!duplicated(result$souporcell_assignment),] + + + if(length(result$souporcell_assignment) == length(unique(bind_df$assignment)) ){ + result <- data.frame(result) + result <- result[,c(1:2)] + colnames(result) <- c("assignment","souporcell_sample") + souporcell <- merge(souporcell, result, by = "assignment", all = T) + } + + + souporcell$souporcell_sample[souporcell$status == 'doublet'] <- 'doublet' + souporcell$souporcell_sample[str_detect(souporcell$assignment, "/")] <- "doublet" + + souporcell$souporcell_sample[is.na(souporcell$souporcell_sample)] <- "unassigned" + unique(souporcell$souporcell_sample) + + souporcell$souporcell_best <- souporcell$souporcell_sample + souporcell$souporcell_sample[souporcell$status == 'unassigned'] <- 'unassigned' + + if(length(unique(result$souporcell_sample)) == par_sample_size){ + message("Successfully matched all Souporcell clusters to Sample ID.") + } else { + message(paste0("WARNING: ensemblex failed to match all Souporcell clusters to all Sample ID. 
ensemblex will proceed; however, it is recommended to look into the data manually.")) + } + + ## Print number of retained droplets + message(paste0("Retained ", nrow(souporcell), " droplets during Souporcell preparation.")) + souporcell +} + +merge_concensus <- function(vireo,souporcell, freemuxlet, demuxalot){ + ## Select important columns from Vireo + vireo <- dplyr::select(vireo, c("cell", "prob_max", "prob_doublet", "n_vars", "best_doublet", "doublet_logLikRatio", "vireo_sample", "vireo_best" )) + colnames(vireo) <- c("barcode", "vireo_singlet_probability", "vireo_doublet_probability", "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "vireo_assignment", "vireo_best_assignment") + + ## Select important columns from Souporcell + souporcell <- dplyr::select(souporcell, c("barcode", "log_prob_singleton", "log_prob_doublet", "souporcell_sample", "souporcell_best")) + colnames(souporcell) <- c("barcode", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "souporcell_assignment", "souporcell_best_assignment") + + ## Select important columns from Demuxlet + freemuxlet <- dplyr::select(freemuxlet, c("BARCODE", "NUM.SNPS", "NUM.READS", "SNG.POSTERIOR", "freemuxlet_sample", "freemuxlet_best", "DIFF.LLK.SNG.DBL")) + colnames(freemuxlet) <- c("barcode", "demuxlet_n_snps", "demuxlet_n_reads", "demuxlet_max_probability", "demuxlet_assignment", "demuxlet_best_assignment", "demuxlet_DIFF_LLK_SNG_DBL") + + ## Select important columns from Demuxalot + demuxalot$demuxalot_second <- demuxalot$demuxalot_sample + demuxalot$demuxalot_sample[demuxalot$stat < 0.9] <- "unassigned" + demuxalot <- dplyr::select(demuxalot, c("X", "stat", "demuxalot_sample", "demuxalot_second", "demuxalot_doublet_probability")) + colnames(demuxalot) <- c("barcode", "demuxalot_max_probability", "demuxalot_assignment", "demuxalot_best_assignment", "demuxalot_doublet_probability" ) + + ## Merge dataframes + merge_df <- merge(vireo, souporcell, by = c("barcode"), all = T) + merge_df <- merge(merge_df, freemuxlet, by = c("barcode"), all = T) + merge_df <- merge(merge_df, demuxalot, by = c("barcode"), all = T) + + ## Generate a general consensus column + merge_lim <- dplyr::select(merge_df, c("barcode", "vireo_assignment", "souporcell_assignment", "demuxlet_assignment", "demuxalot_assignment")) + merge_lim$general_consensus <- apply(merge_lim,1,function(x) names(which.max(table(x)))) + merge_lim$general_consensus <- sub(".*(-).*", "\\1", merge_lim$general_consensus) + merge_lim$general_consensus[merge_lim$general_consensus == "-" ] <- "unassigned" + merge_lim[is.na(merge_lim)] <- "unassigned" + merge_bind <- merge_lim + + ## Merge back to initial dataframe + merge_df <- merge(merge_bind, merge_df, by = c("barcode"), all = T) + + ## Print number of droplets in merged dataframe + message(paste0("Retained ", nrow(merge_df), " droplets after merging output files from each constituent demultiplexing tool.")) + + ## Clean up output dataframe + merge_df <- dplyr::select(merge_df, -c("vireo_assignment.y", "souporcell_assignment.y","demuxlet_assignment.y", "demuxalot_assignment.y")) + colnames(merge_df) <- c("barcode", "vireo_assignment", "souporcell_assignment", "demuxlet_assignment", "demuxalot_assignment", "general_consensus","vireo_singlet_probability", "vireo_doublet_probability", + "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "vireo_best_assignment", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "souporcell_best_assignment", + "demuxlet_n_snps", 
"demuxlet_n_reads", "demuxlet_max_probability", "demuxlet_best_assignment", "demuxlet_DIFF_LLK_SNG_DBL", "demuxalot_max_probability", "demuxalot_best_assignment", + "demuxalot_doublet_probability") + merge_df <- dplyr::select(merge_df, c("barcode", "vireo_assignment", "souporcell_assignment", "demuxlet_assignment", "demuxalot_assignment", "general_consensus", "vireo_best_assignment", "souporcell_best_assignment", + "demuxlet_best_assignment", "demuxalot_best_assignment", "vireo_singlet_probability", "vireo_doublet_probability", "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", + "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "demuxlet_n_snps", "demuxlet_n_reads", "demuxlet_max_probability", "demuxlet_DIFF_LLK_SNG_DBL", + "demuxalot_max_probability", "demuxalot_doublet_probability")) + merge_df +} + + +########################################################################################################################### +# PROBABILISTIC-WEIGHTED ENSEMBLE +########################################################################################################################### +## FUNCTIONS +BA_weight_consensus <- function(merge_df,par_sample_size,par_output_dir){ + + ## Set seed + set.seed(1234) + + ## Create an output directory of probabilistic-weighted ensmeble outputs + dir.create(paste(par_output_dir,"/step1",sep='')) + + ## Rename the dataset + eval_df <- merge_df + + #### Adjusted Rand Index between sample assignments -- here we are using the best guess from each tool + ## Vireo + ARI_vireo_vireo <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$vireo_best_assignment) + ARI_vireo_demuxlet <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$demuxlet_best_assignment) + ARI_vireo_demuxalot <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$demuxalot_best_assignment) + ARI_vireo_souporcell <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$souporcell_best_assignment) + ## Demuxlet + ARI_demuxlet_demuxlet <- adj.rand.index(eval_df$demuxlet_best_assignment, eval_df$demuxlet_best_assignment) + ARI_demuxlet_vireo <- adj.rand.index(eval_df$demuxlet_best_assignment, eval_df$vireo_best_assignment) + ARI_demuxlet_demuxalot <- adj.rand.index(eval_df$demuxlet_best_assignment, eval_df$demuxalot_best_assignment) + ARI_demuxlet_souporcell <- adj.rand.index(eval_df$demuxlet_best_assignment, eval_df$souporcell_best_assignment) + ## Demuxalot + ARI_demuxalot_demuxalot <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$demuxalot_best_assignment) + ARI_demuxalot_souporcell <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$souporcell_best_assignment) + ARI_demuxalot_demuxlet <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$demuxlet_best_assignment) + ARI_demuxalot_vireo <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$vireo_best_assignment) + ## Souporcell + ARI_souporcell_souporcell <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$souporcell_best_assignment) + ARI_souporcell_vireo <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$vireo_best_assignment) + ARI_souporcell_demuxlet <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$demuxlet_best_assignment) + ARI_souporcell_demuxalot <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$demuxalot_best_assignment) + + ## Produce data frame + tool_1 <- c("Vireo","Vireo","Vireo","Vireo", + "Demuxlet", "Demuxlet", "Demuxlet", "Demuxlet", + "Demuxalot","Demuxalot","Demuxalot","Demuxalot", + 
"Souporcell","Souporcell","Souporcell", "Souporcell") + tool_2 <- c("Vireo", "Demuxlet", "Demuxalot", "Souporcell", + "Demuxlet", "Vireo","Demuxalot","Souporcell", + "Demuxalot", "Souporcell","Demuxlet","Vireo", + "Souporcell","Vireo","Demuxlet","Demuxalot") + ARI <- c(ARI_vireo_vireo, ARI_vireo_demuxlet, ARI_vireo_demuxalot, ARI_vireo_souporcell, + ARI_demuxlet_demuxlet, ARI_demuxlet_vireo, ARI_demuxlet_demuxalot, ARI_demuxlet_souporcell, + ARI_demuxalot_demuxalot, ARI_demuxalot_souporcell, ARI_demuxalot_demuxlet, ARI_demuxalot_vireo, + ARI_souporcell_souporcell, ARI_souporcell_vireo, ARI_souporcell_demuxlet, ARI_souporcell_demuxalot) + ARI_df <- data.frame(tool_1,tool_2,ARI ) + + ## Plot ARI heatmap + ggplot(ARI_df, aes(x = tool_1, y = tool_2, fill = ARI, label = round(ARI, digits = 3) )) +geom_tile() + theme_bw() + + scale_fill_gradient(low="white", high="darkblue") + + xlab("Demultiplexing tool") + + ylab("Demultiplexing tool") + + scale_x_discrete(expand = c(0,0)) + + scale_y_discrete(expand = c(0,0)) + geom_text() + ggsave(paste(par_output_dir,"/step1","/ARI_demultiplexing_tools.pdf", sep="")) + + #### Compute proxy balanced accuracies for each tool using consensus cells for the remaining tools + ### Vireo + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$demuxlet_best_assignment & + eval_df$souporcell_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + vireo_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$vireo_best_assignment] <- "TN" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$vireo_best_assignment] <- "TP" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$vireo_best_assignment] <- "FP" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$vireo_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$vireo_eval)) + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + vireo_BA <- BA + vireo_df <- data.frame(BA) + vireo_df$tool <- "Vireo" + + ### Demuxlet + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$vireo_best_assignment & + eval_df$souporcell_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + demuxlet_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxlet_best_assignment] <- "TN" + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxlet_best_assignment] <- "TP" + 
eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxlet_best_assignment] <- "FP" + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxlet_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$freemuxlet_eval)) + df_summary + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + demuxlet_BA <- BA + freemuxlet_df <- data.frame(BA) + freemuxlet_df$tool <- "Demuxlet" + + ### Demuxalot + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$vireo_best_assignment & + eval_df$souporcell_best_assignment == eval_df$demuxlet_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + demuxalot_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxalot_best_assignment] <- "TN" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxalot_best_assignment] <- "TP" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxalot_best_assignment] <- "FP" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxalot_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$demuxalot_eval)) + df_summary + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + demuxalot_BA <- BA + demuxalot_df <- data.frame(BA) + demuxalot_df$tool <- "Demuxalot" + + ### Souporcell + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$demuxalot_best_assignment == eval_df$vireo_best_assignment & + eval_df$demuxalot_best_assignment == eval_df$demuxlet_best_assignment & + eval_df$demuxalot_best_assignment != "unassigned" ,] + souporcell_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment == "doublet" & eval_df_ba$demuxalot_best_assignment == eval_df_ba$souporcell_best_assignment] <- "TN" + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment != "doublet" & eval_df_ba$demuxalot_best_assignment == eval_df_ba$souporcell_best_assignment] <- "TP" + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment != "doublet" & 
eval_df_ba$demuxalot_best_assignment != eval_df_ba$souporcell_best_assignment] <- "FP"
+ eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment == "doublet" & eval_df_ba$demuxalot_best_assignment != eval_df_ba$souporcell_best_assignment] <- "FN"
+ df <- eval_df_ba
+ df_summary <- data.frame(table(df$souporcell_eval))
+ df_summary
+ Var1 <- c("FN", "TN", "TP", "FP")
+ Freq<- c(0,0,0,0)
+ filler_frame <- data.frame(Var1, Freq)
+ df_summary <- rbind(df_summary, filler_frame)
+ df_summary <- df_summary %>% dplyr::group_by(Var1) %>%
+ dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame()
+
+ BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) +
+ (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2
+
+ souporcell_BA <- BA
+ souporcell_df <- data.frame(BA)
+ souporcell_df$tool <- "Souporcell"
+
+ #### Output summary information
+ ## Produce table with PWE information
+ Tool <- c("Vireo", "Demuxlet", "Demuxalot", "Souporcell")
+ Balanced_accuracy <- c(vireo_BA, demuxlet_BA, demuxalot_BA, souporcell_BA)
+ n_consensus_droplets <- c(vireo_n, demuxlet_n, demuxalot_n, souporcell_n)
+ PWE_summary_df <- data.frame(Tool,Balanced_accuracy,n_consensus_droplets )
+ write.csv(PWE_summary_df, paste(par_output_dir,"/step1",'/Balanced_accuracy_summary.csv', sep=""))
+
+ ## Plot estimated balanced accuracy
+ ggplot(PWE_summary_df, aes(x = Tool, y = Balanced_accuracy, label = round(Balanced_accuracy, digits = 4), fill = Tool)) +
+ geom_bar(stat = "identity") +theme_classic() +
+ geom_text() +
+ xlab("Demultiplexing tool") +
+ ylab("Estimated Balanced Accuracy") +
+ scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) +
+ theme(legend.position = "right")
+ ggsave(paste(par_output_dir,"/step1","/BA_demultiplexing_tools.pdf", sep=""))
+
+ ### Combine balanced accuracies into one data frame ####
+ BA_df <- rbind(vireo_df,
+ freemuxlet_df,
+ demuxalot_df,
+ souporcell_df)
+
+ ### Place Vireo and Souporcell probabilities into the proper format for downstream analyses
+ ## Vireo
+ # Merge Vireo singlet and doublet probabilities -- take max
+ # Columns 12 and 13 of eval_df hold the Vireo singlet and doublet probabilities, respectively
+ eval_df[, "vireo_max_probability"] <- apply(eval_df[, 12:13], 1, max)
+
+ ## Souporcell -- convert log(prob) to prob
+ # Singlets
+ eval_df$souporcell_singlet_probability <- 1-(10^(eval_df$souporcell_log_probability_singlet))
+ # Doublets
+ eval_df$souporcell_doublet_probability <- 1-(10^(eval_df$souporcell_log_probability_doublet))
+ # Take souporcell max probability between singlets and doublets
+ # Columns 26 and 27 hold souporcell_singlet_probability and souporcell_doublet_probability, respectively
+ eval_df[, "souporcell_max_probability"] <- apply(eval_df[, 26:27], 1, max)
+
+ ### Multiply the assignment probabilities from each of the constituent demultiplexing tools by their respective estimated balanced accuracy for the dataset
+ ## Vireo
+ eval_df$vireo_weighted_probability <- eval_df$vireo_max_probability*BA_df$BA[BA_df$tool == "Vireo"]
+ ## Demuxlet
+ eval_df$demuxlet_weighted_probability <- eval_df$demuxlet_max_probability*BA_df$BA[BA_df$tool == "Demuxlet"]
+ ## Demuxalot
+ eval_df$demuxalot_weighted_probability <- eval_df$demuxalot_max_probability*BA_df$BA[BA_df$tool == "Demuxalot"]
+ ## Souporcell
+ eval_df$souporcell_weighted_probability <- eval_df$souporcell_max_probability*BA_df$BA[BA_df$tool == "Souporcell"]
+
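+ ## Illustrative note (added for clarity; the numbers are hypothetical and not used by
+ ## the pipeline): if a tool's estimated balanced accuracy were 0.95 and a droplet's
+ ## maximum assignment probability from that tool were 0.90, the weighted probability
+ ## carried into the consensus step would be 0.90 * 0.95 = 0.855.
+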
############################################################################################################################################################ + ## Get weighted consensus assignment ## + ############################################################################################################################################################ + ## Rename dataframe and remove first column + practice_df <- eval_df + practice_df <- practice_df[,-1] + + ### Create a sample list, not including unassigned and doublet + ## Vireo + Vireo_sample_list <- unique(practice_df$vireo_best_assignment) + ## Demuxalot + Demuxalot_sample_list <- unique(practice_df$demuxalot_best_assignment) + ## Demuxlet + Demuxlet_sample_list <- unique(practice_df$demuxlet_best_assignment) + ## Souporcell + Souporcell_sample_list <- unique(practice_df$souporcell_best_assignment) + + ## Identify all unique samples identified by each demultiplexing tool + sample_list <- unlist(append(Vireo_sample_list,Demuxalot_sample_list)) + sample_list <- unlist(append(sample_list,Demuxlet_sample_list)) + sample_list <- unlist(append(sample_list,Souporcell_sample_list)) + remove_sample <- c("doublet", "unassigned") + sample_list_2 <- sample_list[!sample_list %in% remove_sample] + sample_list_2 <- unique(sample_list_2) + if (length(sample_list_2) == par_sample_size){ + message(paste0("Generating weighted-probabilistic ensemble assignments from ", length(sample_list_2), " Sample IDs.")) + } else { + message(paste0("WARNING: Generating weighted-probabilistic ensemble assignments from ", length(sample_list_2), " Sample IDs. This is not the number of pooled samples defined by the user.")) + } + + ## Compute weighted probability for each sample + for (i in sample_list_2) { + # Doublets + practice_df$doublet <- ifelse(practice_df$vireo_best_assignment == "doublet", practice_df$vireo_weighted_probability, 0) + practice_df$doublet <- ifelse(practice_df$souporcell_best_assignment == "doublet", practice_df$doublet + practice_df$souporcell_weighted_probability, practice_df$doublet+ 0) + practice_df$doublet <- ifelse(practice_df$demuxalot_best_assignment == "doublet", practice_df$doublet + practice_df$demuxalot_weighted_probability, practice_df$doublet+ 0) + practice_df$doublet <- ifelse(practice_df$demuxlet_best_assignment == "doublet", practice_df$doublet + practice_df$demuxlet_weighted_probability, practice_df$doublet+ 0) + + # Singlets + practice_df$sample <- ifelse(practice_df$vireo_best_assignment == i, practice_df$vireo_weighted_probability, 0) + practice_df$sample <- ifelse(practice_df$souporcell_best_assignment == i, practice_df$sample + practice_df$souporcell_weighted_probability, practice_df$sample+ 0) + practice_df$sample <- ifelse(practice_df$demuxalot_best_assignment == i, practice_df$sample + practice_df$demuxalot_weighted_probability, practice_df$sample+ 0) + practice_df$sample <- ifelse(practice_df$demuxlet_best_assignment == i, practice_df$sample + practice_df$demuxlet_weighted_probability, practice_df$sample+ 0) + colnames(practice_df)[ncol(practice_df)] <- i + } + + ## Select sample assignment with maximum weighted probability for each droplet + data_long <- gather(practice_df, key="ensemblex_assignment", value="stat", 32:ncol(practice_df)) + result <- data_long %>% + dplyr::group_by(barcode) %>% + dplyr::filter(stat == max(stat)) + + ## Get remaining probabilities for non-assigned samples + data_long <- gather(practice_df, key="ensemblex_assignment", value="stat", 32:ncol(practice_df)) + result_sum <- data_long %>% + 
dplyr::group_by(barcode) %>% + dplyr::summarise(total = sum(stat)) + result_test <- merge(result, result_sum, by = "barcode") + + ## Calculate ensemblex probability + result_test$ensemblex_probability <- result_test$stat/result_test$total + + ## Clean up dataframe + result_test <- dplyr::select(result_test, c("barcode", "ensemblex_assignment", "ensemblex_probability", "vireo_assignment", "souporcell_assignment", "demuxlet_assignment", "demuxalot_assignment", "general_consensus", + "vireo_best_assignment", "souporcell_best_assignment", "demuxlet_best_assignment", "demuxalot_best_assignment", "vireo_singlet_probability", "vireo_doublet_probability", + "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "demuxlet_n_snps", + "demuxlet_n_reads", "demuxlet_max_probability", "demuxlet_DIFF_LLK_SNG_DBL", "demuxalot_max_probability", "demuxalot_doublet_probability", "vireo_max_probability", + "vireo_weighted_probability", "demuxlet_weighted_probability", + "demuxalot_weighted_probability", "souporcell_weighted_probability")) + + ## save PWE assignment dataframe + write.csv(result_test, paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep="")) + result_test +} + + +########################################################################################################################### +# GRAPH-BASED DOUBLET DETECTION +########################################################################################################################### +graph_based_doublet_detection_par_sweep <- function(result_test, par_expected_doublet_rate, par_output_dir){ + + ## Set seed + set.seed(1234) + + ## create an output directory + dir.create(paste(par_output_dir,"/step2",sep='')) + + ## load Balanced-accuracy dataset + result_2 <- result_test + + ### Perform principal component analysis with select variables + result_2_lim <- result_2 + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, demuxlet_n_snps, demuxlet_n_reads, vireo_doublet_logLikRatio, demuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we dont use this for PCA) + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice = "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + 
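## Each droplet is now represented by its PC1/PC2 coordinates; the barcode-by-barcode
+ ## Euclidean distance matrix built below is what the graph-based doublet detection
+ ## uses to find the nearest neighbours of high-confidence doublets.
+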
colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper = TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Demuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "demuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(demuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Demuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "demuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(demuxlet_n_snps)) + + ## Organize Demuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "demuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(demuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Demuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(demuxlet_DIFF_LLK_SNG_DBL)) + freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Demuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(demuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Demuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(demuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + ################################################################################################################################ + #### parameter sweep #### + ################################################################################################################################ + ### Compute varying number of confident doublets (nCD) + ## 50 nCD + 
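## This block (and each nCD block that follows) relaxes the percentile threshold j from
+ ## 1 downward in steps of 0.01 and flags a barcode as a confident doublet once it falls
+ ## in the top (1 - j) fraction of all seven ranked metrics above simultaneously; the
+ ## sweep stops once the target number of confident doublets (nCD = 50, 100, ..., 300)
+ ## has been collected.
+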
suspected_doublets_50 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_50){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_50 <- c(suspected_doublets_50, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_50 + } + if(length(suspected_doublets_50) >= 50 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_50 + } + + ## 100 nCD + suspected_doublets_100 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_100){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_100 <- c(suspected_doublets_100, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_100 + } + if(length(suspected_doublets_100) >= 100 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_100 + } + + ## 150 nCD + suspected_doublets_150 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile 
<- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_150){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_150 <- c(suspected_doublets_150, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_150 + } + if(length(suspected_doublets_150) >= 150 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_150 + } + + ## 200 nCD + suspected_doublets_200 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_200){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_200 <- c(suspected_doublets_200, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_200 + } + if(length(suspected_doublets_200) >= 200 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_200 + } + + ## 250 nCD + suspected_doublets_250 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_250){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, 
"percentile")) + suspected_doublets_250 <- c(suspected_doublets_250, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_250 + } + if(length(suspected_doublets_250) >= 250 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_250 + } + + ## 300 nCD + suspected_doublets_300 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_300){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_300 <- c(suspected_doublets_300, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_300 + } + if(length(suspected_doublets_300) >= 300 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_300 + } + + + ## Make a list of percentile threshold (pT) values based on expected doublet rate for the pool + message(paste0("Expected doublet rate is ", par_expected_doublet_rate, "; ", round(par_expected_doublet_rate*nrow(result_2), digits = 0), " droplets" )) + interval <- par_expected_doublet_rate/6 + pT_list <- rev(seq(1-par_expected_doublet_rate,1-interval, by = interval)) + + ## 50 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_50 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_50){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_50 <- rbind(filler_frame_50,distances_test_15) + filler_frame_50 + } + filler_frame_50 + } + + ## 100 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_100 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_100){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + 
distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_100 <- rbind(filler_frame_100,distances_test_15) + filler_frame_100 + } + filler_frame_100 + } + + ## 150 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_150 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_150){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_150 <- rbind(filler_frame_150,distances_test_15) + filler_frame_150 + } + filler_frame_150 + } + + ## 200 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_200 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_200){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_200 <- rbind(filler_frame_200,distances_test_15) + filler_frame_200 + } + filler_frame_200 + } + + ## 250 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_250 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_250){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_250 <- rbind(filler_frame_250,distances_test_15) + filler_frame_250 + } + filler_frame_250 + } + + ## 300 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_300 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_300){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_300 <- rbind(filler_frame_300,distances_test_15) + filler_frame_300 + } + 
filler_frame_300 + } + + ### Compute nearest neighbour frequency and Kutosis of frequency distributions + ## 50 nCD + counts_50 <- filler_frame_50 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_50 <- counts_50 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 50 + pT <- 0 + kurtosis <- 0 + fill_frame_50 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_50$n[counts_50$pT == t]) + nCD <- 50 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_50 <- rbind(fill_frame_50,temp_frame) + fill_frame_50 <- subset(fill_frame_50, pT != 0) + fill_frame_50 + } + + ## 100 nCD + counts_100 <- filler_frame_100 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_100 <- counts_100 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 100 + pT <- 0 + kurtosis <- 0 + fill_frame_100 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_100$n[counts_100$pT == t]) + nCD <- 100 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_100 <- rbind(fill_frame_100,temp_frame) + fill_frame_100 <- subset(fill_frame_100, pT != 0) + fill_frame_100 + } + + ## 150 nCD + counts_150 <- filler_frame_150 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_150 <- counts_150 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 150 + pT <- 0 + kurtosis <- 0 + fill_frame_150 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_150$n[counts_150$pT == t]) + nCD <- 150 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_150 <- rbind(fill_frame_150,temp_frame) + fill_frame_150 <- subset(fill_frame_150, pT != 0) + fill_frame_150 + } + + ## 200 nCD + counts_200 <- filler_frame_200 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_200 <- counts_200 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 200 + pT <- 0 + kurtosis <- 0 + fill_frame_200 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_200$n[counts_200$pT == t]) + nCD <- 200 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_200 <- rbind(fill_frame_200,temp_frame) + fill_frame_200 <- subset(fill_frame_200, pT != 0) + fill_frame_200 + } + + ## 250 nCD + counts_250 <- filler_frame_250 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_250 <- counts_250 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 250 + pT <- 0 + kurtosis <- 0 + fill_frame_250 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_250$n[counts_250$pT == t]) + nCD <- 250 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_250 <- rbind(fill_frame_250,temp_frame) + fill_frame_250 <- subset(fill_frame_250, pT != 0) + fill_frame_250 + } + + ## 300 nCD + counts_300 <- filler_frame_300 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_300 <- counts_300 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 300 + pT <- 0 + kurtosis <- 0 + fill_frame_300 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_300$n[counts_300$pT == t]) + nCD <- 300 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_300 <- rbind(fill_frame_300,temp_frame) + fill_frame_300 <- subset(fill_frame_300, pT != 0) + fill_frame_300 + } + + ### Compute optimal parameters + ## Bind all of 
the Kurtosis frames + kurtosis_bind <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + + ## Compute average Kurtosis for each pT + kurtosis_bind <- kurtosis_bind %>% + dplyr::group_by(pT) %>% + dplyr::summarize(Mean_k = mean(kurtosis, na.rm=TRUE)) + + optimal_pT <- kurtosis_bind$pT[kurtosis_bind$Mean_k == max(kurtosis_bind$Mean_k)] + + ## Plot kurtosis values + ggplot(kurtosis_bind, aes(x = pT, y = Mean_k)) + + geom_vline(xintercept = optimal_pT, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal pT: ", optimal_pT)) + ggsave(paste(par_output_dir,"/step2","/optimal_pT.pdf", sep="")) + + ## find optimal nCD + bind_nCD <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + bind_nCD <- subset(bind_nCD, pT == optimal_pT) + ncD <- data.frame(nCD = c(50, 100, 150, 200, 250, 300)) + ncD_list <- c(50, 100, 150, 200, 250, 300) + + ## Smooth line + fm1 <- smooth.spline(bind_nCD[,"ncD"], bind_nCD[,"kurtosis"], df = 3) + y2 <- predict(fm1, x = ncD) + y <- data.frame(y2$y) + y <- y$nCD + + ## Find elbow of the smoothed curve + df <- data.frame(ncd = ncD_list, kurtosis = y) + optimal_nCD <- find_curve_elbow(df, export_type = "row_num", plot_curve = FALSE) + + ## parse value + optimal_nCD <- df[optimal_nCD, 1] + + ## If cannot find slope, take point preceedinging the largest slope + if (is.na(optimal_nCD)) { + message("Could not determine optimal nCD based on the elbow, taking maximum kurtosis value.") + max <- max(bind_nCD$kurtosis) + optimal_nCD <- bind_nCD$ncD[bind_nCD$kurtosis == max] + optimal_nCD <- optimal_nCD[1] + #slope= diff(bind_nCD$ncD)/diff(bind_nCD$kurtosis) + #xx <- bind_nCD[which.max(slope),] + #optimal_nCD <- max(xx$ncD) + }else{ + optimal_nCD <- optimal_nCD + message("Determine optimal nCD based on the elbow.") + } + + ## plot + ggplot(bind_nCD, aes(x = ncD, y = kurtosis)) + + geom_vline(xintercept = optimal_nCD, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal nCD: ", optimal_nCD)) + ggsave(paste(par_output_dir,"/step2","/optimal_nCD.pdf", sep="")) + +} +graph_based_doublet_detection_manual_par <- function(result_test, par_expected_doublet_rate, par_output_dir,par_ensemblex_pT,par_ensemblex_nCD){ + + ## Set seed + set.seed(1234) + + ## create an output directory + dir.create(paste(par_output_dir,"/step2",sep='')) + + ## load manually defined optimal parameters + optimal_pT <- par_ensemblex_pT + optimal_nCD <- par_ensemblex_nCD + + ## load Balanced-accuracy dataset + result_2 <- result_test + + ### Perform principal component analysis with select variables + result_2_lim <- result_2 + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, demuxlet_n_snps, demuxlet_n_reads, vireo_doublet_logLikRatio, demuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we dont use this for PCA) + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice 
= "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper = TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Demuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "demuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(demuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Demuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "demuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(demuxlet_n_snps)) + + ## Organize Demuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "demuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(demuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Demuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(demuxlet_DIFF_LLK_SNG_DBL)) + freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Demuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(demuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Demuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(demuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + 
souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + ################################################################################################################################ + #### Compute graph-based doublet detection with optimal pT and nCD values + ################################################################################################################################ + ## Report optimal parameters + message(paste0("Using ", round(optimal_pT, digits = 4), " as optimal pT value")) + message(paste0("Using ", optimal_nCD, " as optimal nCD value")) + + ### Identify high confidence doublets + suspected_doublets <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets <- c(suspected_doublets, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets + } + if(length(suspected_doublets) >= optimal_nCD ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets + } + + ### Identify graph-based suspected doublets + doublet <- "none" + percentile <- 0 + barcode <- "none" + filler_frame <- data.frame(doublet,percentile, barcode ) + + for (j in suspected_doublets){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= optimal_pT,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + + filler_frame <- rbind(filler_frame,distances_test_15) + filler_frame + } + + ## Nearest neighbour frequency of GBD-identified doublets + total_testerquester_count <- filler_frame %>% + dplyr::group_by(barcode) %>% + dplyr::mutate(n = n()) %>% + ungroup() + total_testerquester_count <- total_testerquester_count[!(duplicated(total_testerquester_count$barcode)),] + total_testerquester_count <- total_testerquester_count[order(total_testerquester_count$n, decreasing = TRUE),] + total_testerquester_count_lim <- 
total_testerquester_count[c(1:nrow(total_testerquester_count)),]
+ remove_doublets <- subset(total_testerquester_count_lim, barcode != "none")
+
+ ## Compute the density and kurtosis of the nearest-neighbour frequency distribution
+ den <- density(total_testerquester_count_lim$n)
+ k <- kurtosis(total_testerquester_count_lim$n)
+ k <- round(k, digits = 3)
+
+ ### Plot PCA summary
+ ## PCA prior to doublet detection
+ result_2_lim <- result_2
+ result_2_lim$is_doublet <- "singlet"
+ result_2_lim$is_doublet[result_2_lim$ensemblex_assignment == "doublet"] <- "doublet"
+ result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)]
+ result_2_lim <- result_2_lim[complete.cases(result_2_lim), ]
+ res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T)
+
+ fviz_pca_ind(res.pca,
+ col.ind = result_2_lim$is_doublet,
+ geom="point",
+ pointsize = 0.5
+ ) + ggtitle("Assignment prior to graph-based doublet detection") +
+ scale_colour_manual(values = c( "indianred2", "grey"))
+ ggsave(paste(par_output_dir,"/step2","/PCA1_graph_based_doublet_detection.pdf", sep=""))
+
+ ## PCA highlighting confident doublets
+ result_2_lim <- result_2
+ result_2_lim$is_confident_doublet <- "no"
+ result_2_lim$is_confident_doublet[result_2_lim$barcode %in% suspected_doublets ] <- "yes"
+ colnames(result_2_lim)
+ result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)]
+ result_2_lim <- result_2_lim[complete.cases(result_2_lim), ]
+ res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T)
+
+ fviz_pca_ind(res.pca,
+ col.ind = result_2_lim$is_confident_doublet,
+ geom="point",
+ pointsize = 0.5
+ ) + ggtitle(paste0("High confidence doublets (nCD = ", optimal_nCD,")")) +
+ scale_colour_manual(values = c("grey", "indianred2"))
+ ggsave(paste(par_output_dir,"/step2","/PCA2_graph_based_doublet_detection.pdf", sep=""))
+
+ ## PCA after graph-based doublet detection
+ result_2_lim <- result_2
+ result_2_lim$is_confident_doublet <- "no"
+ result_2_lim$is_confident_doublet[result_2_lim$barcode %in% total_testerquester_count$barcode | result_2_lim$ensemblex_assignment == "doublet"] <- "yes"
+ result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)]
+ result_2_lim <- result_2_lim[complete.cases(result_2_lim), ]
+ res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T)
+
+ fviz_pca_ind(res.pca,
+ col.ind = result_2_lim$is_confident_doublet,
+ geom="point",
+ pointsize = 0.5
+ ) + ggtitle("Assignment after graph-based doublet detection") +
+ scale_colour_manual(values = c("grey", "indianred2"))
+ ggsave(paste(par_output_dir,"/step2","/PCA3_graph_based_doublet_detection.pdf", sep=""))
+
+ ## Label droplets flagged by graph-based doublet detection as doublets
+ result_2 <- result_test
+ result_2$ensemblex_assignment[result_2$barcode %in% remove_doublets$barcode] <- "doublet"
+ write.csv(result_2, paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep=""))
+
+ result_2
+}
+graph_based_doublet_detection_estimated_par <- function(result_test, par_expected_doublet_rate, par_output_dir){
+
+ ## Set seed
+ set.seed(1234)
+
+ ## create an output directory
+ dir.create(paste(par_output_dir,"/step2",sep=''))
+
+ ## load Balanced-accuracy dataset
+ result_2 <- result_test
+
+ ### Perform principal component analysis with select variables
+ result_2_lim <- result_2
+ result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, demuxlet_n_snps, demuxlet_n_reads, vireo_doublet_logLikRatio, demuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we don't use this for PCA)
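+ ## Droplets with a missing value in any of the selected metrics are removed before
+ ## PCA (complete.cases below), so they do not appear in the distance matrix used for
+ ## graph-based doublet detection in this function.
+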
result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice = "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper = TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Demuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "demuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(demuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Demuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "demuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(demuxlet_n_snps)) + + ## Organize Demuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "demuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(demuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Demuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(demuxlet_DIFF_LLK_SNG_DBL)) + 
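## Here, lower DIFF_LLK_SNG_DBL values are treated as more doublet-like, so the
+ ## percentile rank is inverted below (the Souporcell doublet log-probability is
+ ## inverted in the same way further down).
+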
freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Demuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(demuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Demuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(demuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + ################################################################################################################################ + #### parameter sweep #### + ################################################################################################################################ + ### Compute varying number of confident doublets (nCD) + ## 50 nCD + suspected_doublets_50 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_50){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_50 <- c(suspected_doublets_50, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_50 + } + if(length(suspected_doublets_50) >= 50 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_50 + } + + ## 100 nCD + suspected_doublets_100 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_100){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% 
vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_100 <- c(suspected_doublets_100, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_100 + } + if(length(suspected_doublets_100) >= 100 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_100 + } + + ## 150 nCD + suspected_doublets_150 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_150){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_150 <- c(suspected_doublets_150, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_150 + } + if(length(suspected_doublets_150) >= 150 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_150 + } + + ## 200 nCD + suspected_doublets_200 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_200){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_200 <- c(suspected_doublets_200, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_200 + } + if(length(suspected_doublets_200) >= 200 
) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_200 + } + + ## 250 nCD + suspected_doublets_250 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_250){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_250 <- c(suspected_doublets_250, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_250 + } + if(length(suspected_doublets_250) >= 250 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_250 + } + + ## 300 nCD + suspected_doublets_300 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_300){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_300 <- c(suspected_doublets_300, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_300 + } + if(length(suspected_doublets_300) >= 300 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_300 + } + + + ## Make a list of percentile threshold (pT) values based on expected doublet rate for the pool + message(paste0("Expected doublet rate is ", par_expected_doublet_rate, "; ", round(par_expected_doublet_rate*nrow(result_2), digits = 0), " droplets" )) + interval <- par_expected_doublet_rate/6 + pT_list <- rev(seq(1-par_expected_doublet_rate,1-interval, by = interval)) + + ## 50 nCD pT sweep + doublet <- "none" + 
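## Descriptive note (added for clarity): each of the "nCD pT sweep" blocks below follows the same pattern.
+ ## For every candidate percentile threshold t in pT_list and every suspected doublet j, the column of
+ ## transcriptional distances to j is ranked with percent_rank() and inverted, so values near 1 mark the
+ ## droplets closest to that doublet. Droplets with an inverted rank >= t (roughly the nearest (1 - t)
+ ## fraction; e.g., t = 0.95 keeps about the nearest 5%) are appended to the filler frame together with t.
+ ## How often each barcode recurs across the confident doublets is tallied afterwards, and the kurtosis of
+ ## those frequency distributions is used later to select the optimal pT and nCD values. +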
percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_50 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_50){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_50 <- rbind(filler_frame_50,distances_test_15) + filler_frame_50 + } + filler_frame_50 + } + + ## 100 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_100 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_100){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_100 <- rbind(filler_frame_100,distances_test_15) + filler_frame_100 + } + filler_frame_100 + } + + ## 150 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_150 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_150){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_150 <- rbind(filler_frame_150,distances_test_15) + filler_frame_150 + } + filler_frame_150 + } + + ## 200 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_200 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_200){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_200 <- rbind(filler_frame_200,distances_test_15) + filler_frame_200 + } + filler_frame_200 + } + + ## 250 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_250 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_250){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- 
"doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_250 <- rbind(filler_frame_250,distances_test_15) + filler_frame_250 + } + filler_frame_250 + } + + ## 300 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_300 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_300){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_300 <- rbind(filler_frame_300,distances_test_15) + filler_frame_300 + } + filler_frame_300 + } + + ### Compute nearest neighbour frequency and Kutosis of frequency distributions + ## 50 nCD + counts_50 <- filler_frame_50 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_50 <- counts_50 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 50 + pT <- 0 + kurtosis <- 0 + fill_frame_50 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_50$n[counts_50$pT == t]) + nCD <- 50 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_50 <- rbind(fill_frame_50,temp_frame) + fill_frame_50 <- subset(fill_frame_50, pT != 0) + fill_frame_50 + } + + ## 100 nCD + counts_100 <- filler_frame_100 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_100 <- counts_100 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 100 + pT <- 0 + kurtosis <- 0 + fill_frame_100 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_100$n[counts_100$pT == t]) + nCD <- 100 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_100 <- rbind(fill_frame_100,temp_frame) + fill_frame_100 <- subset(fill_frame_100, pT != 0) + fill_frame_100 + } + + ## 150 nCD + counts_150 <- filler_frame_150 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_150 <- counts_150 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 150 + pT <- 0 + kurtosis <- 0 + fill_frame_150 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_150$n[counts_150$pT == t]) + nCD <- 150 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_150 <- rbind(fill_frame_150,temp_frame) + fill_frame_150 <- subset(fill_frame_150, pT != 0) + fill_frame_150 + } + + ## 200 nCD + counts_200 <- filler_frame_200 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_200 <- counts_200 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 200 + pT <- 0 + kurtosis <- 0 + fill_frame_200 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_200$n[counts_200$pT == t]) + nCD <- 200 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_200 <- 
rbind(fill_frame_200,temp_frame) + fill_frame_200 <- subset(fill_frame_200, pT != 0) + fill_frame_200 + } + + ## 250 nCD + counts_250 <- filler_frame_250 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_250 <- counts_250 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 250 + pT <- 0 + kurtosis <- 0 + fill_frame_250 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_250$n[counts_250$pT == t]) + nCD <- 250 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_250 <- rbind(fill_frame_250,temp_frame) + fill_frame_250 <- subset(fill_frame_250, pT != 0) + fill_frame_250 + } + + ## 300 nCD + counts_300 <- filler_frame_300 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_300 <- counts_300 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 300 + pT <- 0 + kurtosis <- 0 + fill_frame_300 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_300$n[counts_300$pT == t]) + nCD <- 300 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_300 <- rbind(fill_frame_300,temp_frame) + fill_frame_300 <- subset(fill_frame_300, pT != 0) + fill_frame_300 + } + + ################################################################################################################################ + ### Compute optimal parameters + ################################################################################################################################ + ## Bind all of the Kurtosis frames + kurtosis_bind <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + + ## Compute average Kurtosis for each pT + kurtosis_bind <- kurtosis_bind %>% + dplyr::group_by(pT) %>% + dplyr::summarize(Mean_k = mean(kurtosis, na.rm=TRUE)) + + optimal_pT <- kurtosis_bind$pT[kurtosis_bind$Mean_k == max(kurtosis_bind$Mean_k)] + + ## Plot kurtosis values + ggplot(kurtosis_bind, aes(x = pT, y = Mean_k)) + + geom_vline(xintercept = optimal_pT, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal pT: ", optimal_pT)) + ggsave(paste(par_output_dir,"/step2","/optimal_pT.pdf", sep="")) + + ## find optimal nCD + bind_nCD <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + bind_nCD <- subset(bind_nCD, pT == optimal_pT) + ncD <- data.frame(nCD = c(50, 100, 150, 200, 250, 300)) + ncD_list <- c(50, 100, 150, 200, 250, 300) + + ## Smooth line + fm1 <- smooth.spline(bind_nCD[,"ncD"], bind_nCD[,"kurtosis"], df = 3) + y2 <- predict(fm1, x = ncD) + y <- data.frame(y2$y) + y <- y$nCD + + ## Find elbow of the smoothed curve + df <- data.frame(ncd = ncD_list, kurtosis = y) + optimal_nCD <- find_curve_elbow(df, export_type = "row_num", plot_curve = FALSE) + + ## parse value + optimal_nCD <- df[optimal_nCD, 1] + + ## If cannot find slope, take point preceedinging the largest slope + if (is.na(optimal_nCD)) { + message("Could not determine optimal nCD based on the elbow, taking maximum kurtosis value.") + max <- max(bind_nCD$kurtosis) + optimal_nCD <- bind_nCD$ncD[bind_nCD$kurtosis == max] + optimal_nCD <- optimal_nCD[1] + #slope= diff(bind_nCD$ncD)/diff(bind_nCD$kurtosis) + #xx <- bind_nCD[which.max(slope),] + #optimal_nCD <- max(xx$ncD) + }else{ + optimal_nCD <- optimal_nCD + message("Determine optimal nCD based on the elbow.") + } + + ## plot + ggplot(bind_nCD, aes(x = ncD, y = kurtosis)) + + geom_vline(xintercept = 
optimal_nCD, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal nCD: ", optimal_nCD)) + ggsave(paste(par_output_dir,"/step2","/optimal_nCD.pdf", sep="")) + + ################################################################################################################################ + #### Compute graph-based doublet detection with optimal pT and nCD values + ################################################################################################################################ + ## Report optimal parameters + message(paste0("Using ", round(optimal_pT, digits = 4), " as optimal pT value")) + message(paste0("Using ", optimal_nCD, " as optimal nCD value")) + + ### Identify high confidence doublets + suspected_doublets <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets <- c(suspected_doublets, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets + } + if(length(suspected_doublets) >= optimal_nCD ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets + } + + ### Identify graph-based suspected doublets + doublet <- "none" + percentile <- 0 + barcode <- "none" + filler_frame <- data.frame(doublet,percentile, barcode ) + + for (j in suspected_doublets){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= optimal_pT,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + + filler_frame <- rbind(filler_frame,distances_test_15) + filler_frame + } + + ## Nearest neighbour frequency of GBD-identified doublets + total_testerquester_count <- filler_frame %>% + dplyr::group_by(barcode) %>% + dplyr::mutate(n = n()) %>% + ungroup() + total_testerquester_count <- total_testerquester_count[!(duplicated(total_testerquester_count$barcode)),] + total_testerquester_count <- total_testerquester_count[order(total_testerquester_count$n, decreasing = TRUE),] + total_testerquester_count_lim <- total_testerquester_count[c(1:nrow(total_testerquester_count)),] + remove_dublets <- subset(total_testerquester_count_lim, barcode != "none") + + ## Plot kurtosis and save plot + den <- density(total_testerquester_count_lim$n) + k <- 
kurtosis(total_testerquester_count_lim$n) + k <- round(k, digits = 3) + + ### Plot PCA summary + ## PCA prior to doublet detection + result_2_lim <- result_2 + result_2_lim$is_doublet <- "singlet" + result_2_lim$is_doublet[result_2_lim$ensemblex_assignment == "doublet"] <- "doublet" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment prior to graph-based doublet detection") + + scale_colour_manual(values = c( "indianred2", "grey")) + ggsave(paste(par_output_dir,"/step2","/PCA1_graph_based_doublet_detection.pdf", sep="")) + + ## PCA highlighting confident doublets + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% suspected_doublets ] <- "yes" + colnames(result_2_lim) + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle(paste0("High confidence doublets (nCD = ", optimal_nCD,")")) + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA2_graph_based_doublet_detection.pdf", sep="")) + + ## PCA after graph baed doublet detection + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% total_testerquester_count$barcode | result_2_lim$ensemblex_assignment == "doublet"] <- "yes" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment after graph-based doublet detection") + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA3_graph_based_doublet_detection.pdf", sep="")) + + ## Label graph-based expected doublets as doublets + result_2 <- result_test + result_2$ensemblex_assignment[result_2$barcode %in% remove_dublets$barcode] <- "doublet" + write.csv(result_2, paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep="")) + + result_2 +} + +########################################################################################################################### +# Ensemble-independent doublet detection +########################################################################################################################### +ensemble_independent_doublet_detections <- function(result_2, par_output_dir){ + ## Set seed + set.seed(1234) + + ## Create an output directory + dir.create(paste(par_output_dir,"/step3",sep='')) + + expected_doublets <- nrow(result_2)*par_expected_doublet_rate + + ### Proportion agreement bar plot with threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 
1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$demuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$demuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$demuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Demuxlet + result_2_freemuxlet <- result_2[result_2$demuxlet_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Demuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_threshold.pdf", sep="")) + + ### Proportion agreement bar plot without threshold + ## Demuxalot + result_2_demuxalot <- 
result_2[result_2$demuxalot_best_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$demuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_best_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$demuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_best_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$demuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Demuxlet + result_2_freemuxlet <- result_2[result_2$demuxlet_best_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Demuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + 
ylab("Number of doublets (no threshold)") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_no_threshold.pdf", sep="")) + + ### Number of ensemblex droplets with EID of each tool with threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxlet + result_2_temp_demuxlet <- result_2 + result_2_temp_demuxlet$ensemblex_assignment[result_2_temp_demuxlet$demuxlet_assignment == "doublet"] <- "doublet" + n_demuxlet <- result_2_temp_demuxlet[result_2_temp_demuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Demuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_demuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_threshold.pdf", sep="")) + + ### number of ensemblex droplets with EID of each tool with out threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_best_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_best_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxlet + result_2_temp_demuxlet <- result_2 + result_2_temp_demuxlet$ensemblex_assignment[result_2_temp_demuxlet$demuxlet_best_assignment == "doublet"] <- "doublet" + n_demuxlet <- result_2_temp_demuxlet[result_2_temp_demuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_best_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Demuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_demuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + 
geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (no threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_no_threshold.pdf", sep="")) + + #################################################################################################################################################################################### + ## Ensemble-independent doublet detection ## + #################################################################################################################################################################################### + ## Threshold + ## If Souporcell identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Souporcell_threshold))=="yes"){ + message("Labelling all doublets identified by Souporcell (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$souporcell_assignment == "doublet"] <- "doublet" + } + ## If Demuxlet identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Demuxlet_threshold))=="yes"){ + message("Labelling all doublets identified by Demuxlet (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxlet_assignment == "doublet"] <- "doublet" + } + ## If Vireo identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Vireo_threshold))=="yes"){ + message("Labelling all doublets identified by Vireo (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$vireo_assignment == "doublet"] <- "doublet" + } + ## If Demuxalot identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Demuxalot_threshold))=="yes") { + message("Labelling all doublets identified by Demuxalot (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxalot_assignment == "doublet"] <- "doublet" + } + + ### No threshold + ## If Souporcell identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Souporcell_no_threshold))=="yes") { + message("Labelling all doublets identified by Souporcell (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$souporcell_best_assignment == "doublet"] <- "doublet" + } + ## If Demuxlet identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Demuxlet_no_threshold))=="yes") { + message("Labelling all doublets identified by Demuxlet (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxlet_best_assignment == "doublet"] <- "doublet" + } + ## If Vireo identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Vireo_no_threshold))=="yes") { + message("Labelling all doublets identified by Vireo (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$vireo_best_assignment == "doublet"] <- "doublet" + } + ## If Demuxalot identifies a doublet, label it as a doublet + if ((tolower(par_doublet_Demuxalot_no_threshold))=="yes") { + message("Labelling all doublets identified by Demuxalot (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxalot_best_assignment == "doublet"] <- "doublet" + } + + ## Write csv file + write.csv(result_2, paste(par_output_dir,"/step3",'/Step3_cell_assignment.csv', sep="")) + + result_2 +} +ensemble_independent_doublet_detections_prelim <- function(result_2, par_output_dir){ + ## Set seed + set.seed(1234) + + ## Create an output directory + dir.create(paste(par_output_dir,"/step3",sep='')) + + expected_doublets <- nrow(result_2)*par_expected_doublet_rate + + ### Proportion agreement bar plot with threshold + ## Demuxalot + result_2_demuxalot <-
result_2[result_2$demuxalot_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$demuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$demuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$demuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Demuxlet + result_2_freemuxlet <- result_2[result_2$demuxlet_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Demuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of 
doublets") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_threshold.pdf", sep="")) + + ### Proportion agreement bar plot without threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_best_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$demuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_best_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$demuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_best_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, demuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$demuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Demuxlet + result_2_freemuxlet <- result_2[result_2$demuxlet_best_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Demuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + 
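## Descriptive note: label_y = cumsum(n) - 0.5 * n is intended to place each count label near the
+ ## vertical midpoint of its segment in the stacked bar for that tool (e.g., segments of 40 and 10
+ ## doublets give label heights of 20 and 45), while geom_hline marks the pool's expected number of
+ ## doublets for reference. +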
ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets (no threshold)") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_no_threshold.pdf", sep="")) + + ### Number of ensemblex droplets with EID of each tool with threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxlet + result_2_temp_demuxlet <- result_2 + result_2_temp_demuxlet$ensemblex_assignment[result_2_temp_demuxlet$demuxlet_assignment == "doublet"] <- "doublet" + n_demuxlet <- result_2_temp_demuxlet[result_2_temp_demuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Demuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_demuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_threshold.pdf", sep="")) + + ### number of ensemblex droplets with EID of each tool with out threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_best_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_best_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxlet + result_2_temp_demuxlet <- result_2 + result_2_temp_demuxlet$ensemblex_assignment[result_2_temp_demuxlet$demuxlet_best_assignment == "doublet"] <- "doublet" + n_demuxlet <- result_2_temp_demuxlet[result_2_temp_demuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_best_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == 
"doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Demuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_demuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_no_threshold.pdf", sep="")) +} + +########################################################################################################################### +# CONFIDENCE-SCORE +########################################################################################################################### +## FUNCTION +confidence_score <- function(result_2, par_output_dir, par_sample_size){ + + ## Set seed + set.seed(1234) + + ## Create an output directory + dir.create(paste(par_output_dir,"/confidence",sep='')) + + #### Calculate AUC singlet detection using consensus cells as proxy for ground truth + ### Vireo + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == demuxlet_best_assignment & + souporcell_best_assignment == demuxalot_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$vireo_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. + temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Vireo AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Vireo AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$vireo_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + vireo_AUC <- roc_empirical$AUC + vireo_AUC_singlet <- vireo_AUC + + } else { + print(paste0("Insufficient droplets to compute Vireo AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Vireo AUC as 0.5 for confidence score computation.")) + vireo_AUC_singlet <- 0.5 + } + + ### Demuxlet + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == vireo_best_assignment & + souporcell_best_assignment == demuxalot_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$demuxlet_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. 
+ temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Demuxlet AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Demuxlet AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$demuxlet_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + freemuxlet_AUC <- roc_empirical$AUC + freemuxlet_AUC_singlet <- freemuxlet_AUC + + } else { + print(paste0("Insufficient droplets to compute Demuxlet AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Demuxlet AUC as 0.5 for confidence score computation.")) + freemuxlet_AUC_singlet <- 0.5 + } + + ### Demuxalot + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == vireo_best_assignment & + souporcell_best_assignment == demuxlet_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$demuxalot_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. + temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Demuxalot AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Demuxalot AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$demuxalot_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + demuxalot_AUC <- roc_empirical$AUC + demuxalot_AUC_singlet <- demuxalot_AUC + + } else { + print(paste0("Insufficient droplets to compute Demuxalot AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Demuxalot AUC as 0.5 for confidence score computation.")) + demuxalot_AUC_singlet <- 0.5 + } + + ### Souporcell + eval_df <- result_2 + eval_df_lim <- subset(eval_df, demuxalot_best_assignment == vireo_best_assignment & + demuxalot_best_assignment == demuxlet_best_assignment & + demuxalot_best_assignment != "unassigned" & + demuxalot_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$demuxalot_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. 
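+ ## Descriptive note on these AUC blocks: droplets for which the other three tools agree on the same
+ ## singlet label (neither "doublet" nor "unassigned") act as a proxy ground truth, and the tool being
+ ## evaluated is scored "good" when it matches that consensus and "bad" otherwise. rocit() (presumably
+ ## from the ROCit package) computes an empirical AUC from the log of the tool's assignment probability
+ ## with "bad" as the negative class; if either class is empty the AUC falls back to 0.5, and if one
+ ## class is below ~1% of the other the estimate is only flagged as potentially unreliable. Souporcell
+ ## is scored slightly differently: its souporcell_log_probability_singlet value is treated as a log10
+ ## probability, so the score is log(1 - 10^x), the complementary (non-singlet) probability on the log
+ ## scale, rather than a maximum assignment probability.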
+ temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Souporcell AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Souporcell AUC.") + } + + roc_empirical <- rocit(score = log(1-(10^(eval_df_lim$souporcell_log_probability_singlet))), class = eval_df_lim$consensus_eval_ROC, ##change_here + negref = "bad") + print(summary(roc_empirical)) + souporcell_AUC <- roc_empirical$AUC + souporcell_AUC_singlet <- souporcell_AUC + + } else { + print(paste0("Insufficient droplets to compute Demuxalot AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Demuxalot AUC as 0.5 for confidence score computation.")) + souporcell_AUC_singlet <- 0.5 + } + + ### Compute ensemblex singlet confidence + eval_df$ensemblex_singlet_confidence <- eval_df$ensemblex_probability + + ## Vireo + eval_df$ensemblex_singlet_confidence[eval_df$vireo_singlet_probability >= 0.9 & eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_singlet_probability >= 0.9 & eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment == eval_df$ensemblex_assignment] + vireo_AUC_singlet + + ## Demuxalot + eval_df$ensemblex_singlet_confidence[eval_df$demuxalot_max_probability >= 0.9 & eval_df$demuxalot_best_assignment != "doublet" & + eval_df$demuxalot_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$demuxalot_max_probability >= 0.9 & eval_df$demuxalot_best_assignment != "doublet" & + eval_df$demuxalot_best_assignment == eval_df$ensemblex_assignment] + demuxalot_AUC_singlet + + ## Freemuxlet + eval_df$ensemblex_singlet_confidence[eval_df$demuxlet_assignment != "unassigned" & eval_df$demuxlet_best_assignment != "doublet" & + eval_df$demuxlet_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$demuxlet_assignment != "unassigned" & eval_df$demuxlet_best_assignment != "doublet" & + eval_df$demuxlet_best_assignment == eval_df$ensemblex_assignment] + freemuxlet_AUC_singlet + + ## Souporcell + eval_df$ensemblex_singlet_confidence[eval_df$souporcell_assignment != "unassigned" & eval_df$souporcell_best_assignment != "doublet" & + eval_df$souporcell_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$souporcell_assignment != "unassigned" & eval_df$souporcell_best_assignment != "doublet" & + eval_df$souporcell_best_assignment == eval_df$ensemblex_assignment] + souporcell_AUC_singlet + + ## Unassignable cells + eval_df$ensemblex_singlet_confidence[eval_df$vireo_n_vars == 0] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_n_vars == 0]/par_sample_size + + ## Consensus + eval_df$ensemblex_singlet_confidence[eval_df$vireo_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$vireo_best_assignment == eval_df$demuxlet_best_assignment & + eval_df$vireo_best_assignment == eval_df$souporcell_best_assignment & + eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment != "unassigned"] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$vireo_best_assignment 
== eval_df$demuxlet_best_assignment & + eval_df$vireo_best_assignment == eval_df$souporcell_best_assignment & + eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment != "unassigned"] +1 + + ## Set ensemblex best assignment + eval_df$ensemblex_best_assignment <- eval_df$ensemblex_assignment + ## Set ensemblex assignment + eval_df$ensemblex_assignment[eval_df$ensemblex_singlet_confidence < 1 & eval_df$ensemblex_assignment != "doublet" ] <- "unassigned" + + + eval_df <- dplyr::select(eval_df, c("barcode", "ensemblex_assignment","ensemblex_best_assignment", "ensemblex_probability", "ensemblex_singlet_confidence" , "vireo_assignment","souporcell_assignment","demuxlet_assignment","demuxalot_assignment","general_consensus" , + "vireo_best_assignment","souporcell_best_assignment","demuxlet_best_assignment","demuxalot_best_assignment","vireo_singlet_probability","vireo_doublet_probability","vireo_n_vars","vireo_best_doublet", + "vireo_doublet_logLikRatio","souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "demuxlet_n_snps", "demuxlet_n_reads","demuxlet_max_probability", "demuxlet_DIFF_LLK_SNG_DBL","demuxalot_max_probability" , + "demuxalot_doublet_probability", "vireo_max_probability","vireo_weighted_probability","demuxlet_weighted_probability","demuxalot_weighted_probability" , + "souporcell_weighted_probability" )) + + write.csv(eval_df, paste(par_output_dir,"/confidence",'/ensemblex_final_cell_assignment.csv', sep="")) + eval_df +} diff --git a/ensemblex.pip/gt/scripts/ensemblexing/pipeline_ensemblexing.sh b/ensemblex.pip/gt/scripts/ensemblexing/pipeline_ensemblexing.sh new file mode 100644 index 0000000..8655815 --- /dev/null +++ b/ensemblex.pip/gt/scripts/ensemblexing/pipeline_ensemblexing.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step Souporcell submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "------Parameters used in this step---------" +echo "* PAR_ensemblex_sample_size $PAR_ensemblex_sample_size" +echo "* PAR_ensemblex_expected_doublet_rate $PAR_ensemblex_expected_doublet_rate" +echo "* PAR_ensemblex_merge_constituents $PAR_ensemblex_merge_constituents" +echo "* PAR_ensemblex_probabilistic_weighted_ensemble $PAR_ensemblex_probabilistic_weighted_ensemble" +echo "* PAR_ensemblex_preliminary_parameter_sweep $PAR_ensemblex_preliminary_parameter_sweep" +echo "* PAR_ensemblex_graph_based_doublet_detection $PAR_ensemblex_graph_based_doublet_detection" +echo "* PAR_ensemblex_preliminary_ensemble_independent_doublet $PAR_ensemblex_preliminary_ensemble_independent_doublet" +echo "* PAR_ensemblex_ensemble_independent_doublet $PAR_ensemblex_ensemble_independent_doublet" +echo "* PAR_ensemblex_doublet_Demuxalot_threshold $PAR_ensemblex_doublet_Demuxalot_threshold" +echo "* PAR_ensemblex_doublet_Demuxalot_no_threshold $PAR_ensemblex_doublet_Demuxalot_no_threshold" +echo "* PAR_ensemblex_doublet_Demuxlet_threshold $PAR_ensemblex_doublet_Demuxlet_threshold" +echo "* 
PAR_ensemblex_doublet_Demuxlet_no_threshold $PAR_ensemblex_doublet_Demuxlet_no_threshold" +echo "* PAR_ensemblex_doublet_Souporcell_threshold $PAR_ensemblex_doublet_Souporcell_threshold" +echo "* PAR_ensemblex_doublet_Souporcell_no_threshold $PAR_ensemblex_doublet_Souporcell_no_threshold" +echo "* PAR_ensemblex_doublet_Vireo_threshold $PAR_ensemblex_doublet_Vireo_threshold" +echo "* PAR_ensemblex_doublet_Vireo_no_threshold $PAR_ensemblex_doublet_Vireo_no_threshold" +echo "* PAR_ensemblex_compute_singlet_confidence $PAR_ensemblex_compute_singlet_confidence" +echo "* PAR_ensemblex_nCD $PAR_ensemblex_nCD" +echo "* PAR_ensemblex_pT $PAR_ensemblex_pT" + + +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "Start of ensemblexing" +$CONTAINER_CMD exec --bind $OUTPUT_DIR,$PIPELINE_HOME ${CONTAINER1} Rscript $PIPELINE_HOME/gt/scripts/ensemblexing/ensemblexing.R $PIPELINE_HOME $OUTPUT_DIR $PAR_ensemblex_sample_size $PAR_ensemblex_expected_doublet_rate $PAR_ensemblex_merge_constituents $PAR_ensemblex_probabilistic_weighted_ensemble $PAR_ensemblex_preliminary_parameter_sweep $PAR_ensemblex_graph_based_doublet_detection $PAR_ensemblex_preliminary_ensemble_independent_doublet $PAR_ensemblex_ensemble_independent_doublet $PAR_ensemblex_doublet_Demuxalot_threshold $PAR_ensemblex_doublet_Demuxalot_no_threshold $PAR_ensemblex_doublet_Demuxlet_threshold $PAR_ensemblex_doublet_Demuxlet_no_threshold $PAR_ensemblex_doublet_Souporcell_threshold $PAR_ensemblex_doublet_Souporcell_no_threshold $PAR_ensemblex_doublet_Vireo_threshold $PAR_ensemblex_doublet_Vireo_no_threshold $PAR_ensemblex_compute_singlet_confidence $PAR_ensemblex_nCD $PAR_ensemblex_pT +echo "End of ensemblexing" +exit 0 diff --git a/ensemblex.pip/gt/scripts/souporcell/pipeline_souporcell_generate.sh b/ensemblex.pip/gt/scripts/souporcell/pipeline_souporcell_generate.sh new file mode 100644 index 0000000..0dad7de --- /dev/null +++ b/ensemblex.pip/gt/scripts/souporcell/pipeline_souporcell_generate.sh @@ -0,0 +1,129 @@ +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +TEMPFILERUN=$JOB_OUTPUT_DIR/.tmp/pipeline_souporcell.sh +cat < $TEMPFILERUN +#!/bin/bash + +umask 002 +source \$OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source \$OUTPUT_DIR/job_info/.tmp/temp_config.ini + +echo "-------------------------------------------" +echo "* step souporcell submitted at \`date +%FT%H.%M.%S\`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: \$PIPELINE_HOME" +echo "* OUTPUT_DIR: \$OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* souporcell_N: \$PAR_souporcell_N" +echo "* souporcell_h: \$PAR_souporcell_h" +echo "* souporcell_threads: \$PAR_souporcell_threads" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" + +cd $OUTPUT_DIR/souporcell + +CONTAINER1=\$PIPELINE_HOME/soft/ensemblex.sif +SOFT_SOUP=/opt/souporcell +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "#----------------------------------------------------------------#" +echo "Start of Renamer 
step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/renamer.py \\ + --bam $OUTPUT_DIR/input_files/pooled_bam.bam \\ + --barcodes $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + --out $OUTPUT_DIR/souporcell/fq.fq + +echo "End of Renamer step" + +echo "#----------------------------------------------------------------#" +echo "Start of Re-align step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} minimap2 $PAR_minimap2 \\ +$OUTPUT_DIR/input_files/reference.fa $OUTPUT_DIR/souporcell/fq.fq > $OUTPUT_DIR/souporcell/minimap.sam +echo "End of Re-align step" + +echo "#----------------------------------------------------------------#" +echo "Start of Retag step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/retag.py --sam $OUTPUT_DIR/souporcell/minimap.sam --out $OUTPUT_DIR/souporcell/minitagged.bam +echo "End of Retag step" + + +echo "#----------------------------------------------------------------#" +echo "Start of Sorting step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} samtools sort $OUTPUT_DIR/souporcell/minitagged.bam -o $OUTPUT_DIR/souporcell/minitagged_sorted.bam +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} samtools index $OUTPUT_DIR/souporcell/minitagged_sorted.bam +echo "End of Sorting step" + +echo "#----------------------------------------------------------------#" +echo "Start of Call variants step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} freebayes -f $OUTPUT_DIR/input_files/reference.fa \\ +$PAR_freebayes $OUTPUT_DIR/souporcell/minitagged_sorted.bam > $OUTPUT_DIR/souporcell/Pool.vcf +echo "End of Call variants step" + +echo "#----------------------------------------------------------------#" +echo "Start of Vartrix step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} vartrix \\ + -v $OUTPUT_DIR/souporcell/Pool.vcf \\ + -b $OUTPUT_DIR/input_files/pooled_bam.bam \\ + -f $OUTPUT_DIR/input_files/reference.fa \\ + -c $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + --ref-matrix $OUTPUT_DIR/souporcell/ref.mtx \\ + --out-matrix $OUTPUT_DIR/souporcell/alt.mtx \\ + --scoring-method coverage \\ +EOF + +if [[ $PAR_vartrix_umi ]]; then + echo " --umi \\" >> $TEMPFILERUN +fi + +if [[ $PAR_vartrix_mapq ]] ; then + echo " --mapq $PAR_vartrix_mapq \\" >> $TEMPFILERUN +fi +if [[ $PAR_vartrix_threads ]] ; then + echo " --threads $PAR_vartrix_threads \\" >> $TEMPFILERUN +fi +echo " " >> $TEMPFILERUN +echo " echo \" End of Vartrix step \" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \"Start of Clustering cells by genotype \" " >> $TEMPFILERUN +echo " $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/souporcell/target/release/souporcell \\" >> $TEMPFILERUN +echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " -b $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\" >> $TEMPFILERUN +echo " -k $PAR_souporcell_k \\" >> $TEMPFILERUN +echo " -t $PAR_souporcell_t > $OUTPUT_DIR/souporcell/clusters_tmp.tsv " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo " echo \"End of Clustering cells by genotype\" " >> $TEMPFILERUN + +echo " " >> $TEMPFILERUN +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \"Start of Calling doublets step\" " >> $TEMPFILERUN +echo " $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/troublet/target/release/troublet \\" >> $TEMPFILERUN 
+echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " --clusters $OUTPUT_DIR/souporcell/clusters_tmp.tsv > $OUTPUT_DIR/souporcell/clusters.tsv " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo " echo \"End of Calling doublets step\" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \" Start Genotype and ambient RNA coinference step\" " >> $TEMPFILERUN +echo "$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/consensus.py \\" >> $TEMPFILERUN +echo " -c $OUTPUT_DIR/souporcell/clusters.tsv \\" >> $TEMPFILERUN +echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " --soup_out $OUTPUT_DIR/souporcell/soup.txt \\" >> $TEMPFILERUN +echo " -v $OUTPUT_DIR/souporcell/Pool.vcf \\" >> $TEMPFILERUN +echo " --vcf_out $OUTPUT_DIR/souporcell/cluster_genotypes.vcf \\" >> $TEMPFILERUN +echo " --output_dir $OUTPUT_DIR/souporcell " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "echo \"End of Genotype and ambient RNA coinference step\" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo "exit 0 " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN diff --git a/ensemblex.pip/gt/scripts/vireo/pipeline_vireo.sh b/ensemblex.pip/gt/scripts/vireo/pipeline_vireo.sh new file mode 100644 index 0000000..dc12f96 --- /dev/null +++ b/ensemblex.pip/gt/scripts/vireo/pipeline_vireo.sh @@ -0,0 +1,56 @@ +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + + +TEMPFILERUN=$OUTPUT_DIR/job_info/.tmp/pipeline_vireo.sh + +cat < $TEMPFILERUN + #!/bin/bash + + umask 002 + source \$OUTPUT_DIR/job_info/configs/ensemblex_config.ini + source \$OUTPUT_DIR/job_info/.tmp/temp_config.ini + + echo "-------------------------------------------" + echo "* step vireo submitted at \`date +%FT%H.%M.%S\`" + echo "-------------------------------------------" + echo "* PIPELINE_HOME: $PIPELINE_HOME" + echo "* OUTPUT_DIR: $OUTPUT_DIR" + echo "-------------------------------------------" + echo "------Parameters used in this step---------" + echo "* N: $PAR_vireo_N" + echo "* type: $PAR_vireo_type" + echo "* h: $PAR_vireo_h" + echo "* processes: $PAR_vireo_processes" + echo "* minMAF: $PAR_vireo_minMAF" + echo "* minCOUNT: $PAR_vireo_minCOUNT" + echo "* forcelearnGT: $PAR_vireo_forcelearnGT" + echo "-------------------------------------------" + echo -e "------Output of Run------------------------\n\n" + CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} cellSNP -s $OUTPUT_DIR/input_files/pooled_bam.bam \\ + -b $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + -O $OUTPUT_DIR/vireo_gt \\ + -R $OUTPUT_DIR/input_files/reference.vcf \\ + -p $PAR_vireo_processes \\ + --minMAF $PAR_vireo_minMAF \\ + --minCOUNT $PAR_vireo_minCOUNT +EOF + + echo "" >> $TEMPFILERUN + echo "" >> $TEMPFILERUN + echo "$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} vireo -c $OUTPUT_DIR/vireo_gt \\" >> $TEMPFILERUN + echo " -d $OUTPUT_DIR/input_files/pooled_samples.vcf \\" >> $TEMPFILERUN + echo " --forceLearnGT \\" >> $TEMPFILERUN + echo " -t GT \\" >> $TEMPFILERUN + echo " -o $OUTPUT_DIR/vireo_gt \\" >> $TEMPFILERUN + if [[ $PAR_vireo_N ]] ;then + echo " -N $PAR_vireo_N \\" >> $TEMPFILERUN + fi + if [[ $PAR_vireo_h ]] ;then + echo " -h $PAR_vireo_h \\" >> $TEMPFILERUN + fi + echo "" >> $TEMPFILERUN + echo 
"" >> $TEMPFILERUN + \ No newline at end of file diff --git a/ensemblex.pip/launch/launch_gt.sh b/ensemblex.pip/launch/launch_gt.sh new file mode 100644 index 0000000..1e10835 --- /dev/null +++ b/ensemblex.pip/launch/launch_gt.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini +export QUEUE=bash + +MODE0=$MODE + +# =============================================== +# STEP Vireo: +# =============================================== +# +STEP=vireo +if [[ ${MODE0[@]} =~ vireo ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP submitted $STEP GT " >> $EXPECTED_DONE_FILES + $QUEUE $PIPELINE_HOME/gt/scripts/vireo/pipeline_vireo.sh + sleep 10 + $QUEUE $JOB_OUTPUT_DIR/.tmp/pipeline_vireo.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/vireo" >> $EXPECTED_DONE_FILES +fi + +# =============================================== +# STEP souporcell: +# =============================================== +# +STEP=souporcell +if [[ ${MODE0[@]} =~ souporcell ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP GT submitted" >> $EXPECTED_DONE_FILES + $QUEUE $PIPELINE_HOME/gt/scripts/souporcell/pipeline_souporcell_generate.sh + sleep 10 + $QUEUE $JOB_OUTPUT_DIR/.tmp/pipeline_souporcell.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/souporcell" >> $EXPECTED_DONE_FILES +fi + + +# =============================================== +# STEP Demuxlet: +# =============================================== +# +STEP=demuxlet +if [[ ${MODE0[@]} =~ demuxlet ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP GT submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/gt/scripts/demuxlet/pipeline_demuxlet.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/demuxlet" >> $EXPECTED_DONE_FILES +fi + +# =============================================== +# STEP Demuxalot: +# =============================================== +# +STEP=demuxalot +if [[ ${MODE0[@]} =~ demuxalot ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> 
$EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP GT submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/gt/scripts/demuxalot/pipeline_demuxalot.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/demuxalot" >> $EXPECTED_DONE_FILES +fi + + +# =============================================== +# STEP ensemblexing: +# =============================================== +# + +STEP=ensemblexing +if [[ ${MODE0[@]} =~ ensemblexing ]] || [[ ${MODE0[@]} =~ all ]] ; then + + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP GT submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/gt/scripts/ensemblexing/pipeline_ensemblexing.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/ensemblexing" >> $EXPECTED_DONE_FILES +fi + +exit 0 + + + diff --git a/ensemblex.pip/launch/launch_nogt.sh b/ensemblex.pip/launch/launch_nogt.sh new file mode 100644 index 0000000..1c7323c --- /dev/null +++ b/ensemblex.pip/launch/launch_nogt.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +export QUEUE=bash + +MODE0=$MODE + + + +# =============================================== +# STEP Demuxlet: +# =============================================== +# +STEP=freemuxlet +if [[ ${MODE0[@]} =~ freemuxlet ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP freemuxlet submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/nogt/scripts/freemuxlet/pipeline_freemuxlet.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/freemuxlet" >> $EXPECTED_DONE_FILES +fi + +# =============================================== +# STEP Vireo: +# =============================================== +# +STEP=vireo +if [[ ${MODE0[@]} =~ vireo ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP submitted $STEP No-GT " >> $EXPECTED_DONE_FILES + $QUEUE $PIPELINE_HOME/nogt/scripts/vireo/pipeline_vireo.sh + 
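+  # (Editorial note) The vireo and souporcell steps run in two phases: the first $QUEUE call executes the generator
+  # script under $PIPELINE_HOME/nogt/scripts/, which writes a ready-to-run script into $JOB_OUTPUT_DIR/.tmp/; the
+  # second $QUEUE call below runs that generated script and captures its output in a time-stamped log, e.g.
+  # (illustrative name only) $OUTPUT_DIR/job_info/logs/vireo.2024-06-13T12.00.00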
sleep 10 + $QUEUE $JOB_OUTPUT_DIR/.tmp/pipeline_vireo.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/vireo" >> $EXPECTED_DONE_FILES +fi + +# =============================================== +# STEP souporcell: +# =============================================== +# +STEP=souporcell +if [[ ${MODE0[@]} =~ souporcell ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP No-GT submitted" >> $EXPECTED_DONE_FILES + $QUEUE $PIPELINE_HOME/nogt/scripts/souporcell/pipeline_souporcell_generate.sh + sleep 10 + $QUEUE $JOB_OUTPUT_DIR/.tmp/pipeline_souporcell.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/souporcell" >> $EXPECTED_DONE_FILES +fi + + +# =============================================== +# STEP Demuxalot: +# =============================================== +# +STEP=demuxalot +if [[ ${MODE0[@]} =~ demuxalot ]] || [[ ${MODE0[@]} =~ all ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/nogt/scripts/demuxalot/pipeline_demuxalot.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/demuxalot" >> $EXPECTED_DONE_FILES +fi + +# =============================================== +# STEP ensemblexing: +# =============================================== +# + +STEP=ensemblexing +if [[ ${MODE0[@]} =~ ensemblexing ]] || [[ ${MODE0[@]} =~ all ]] ; then + + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "* It submitted at `date +%FT%H.%M.%S`" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP submitted" >> $EXPECTED_DONE_FILES + + $QUEUE $PIPELINE_HOME/nogt/scripts/ensemblexing/pipeline_ensemblexing.sh &> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/ensemblexing" >> $EXPECTED_DONE_FILES +fi + +exit 0 diff --git a/ensemblex.pip/launch_ensemblex.sh b/ensemblex.pip/launch_ensemblex.sh new file mode 100644 index 0000000..7f07207 --- /dev/null +++ b/ensemblex.pip/launch_ensemblex.sh @@ -0,0 +1,277 @@ +#!/bin/bash + +# The Ensemblex pipeline was produced for projects funded by the Canadian Institute of Health Research and Michael J. 
Fox Foundation Parkinson's Progression Markers Initiative (MJFF PPMI) in collaboration with The Neuro's Early Drug Discovery Unit (EDDU), McGill University. +# Copyright belongs to MNI BIOINFO CORE (https://github.com/neurobioinfo) +# The pipeline is scripted by Saeid Amiri (saeid.amiri@mcgill.ca) and Michael Fiorini (michael.fiorini@mail.mcgill.ca) + +VERSION=0.0.02 +DATE0=2024-06-13 +echo -e "ensemblex pipeline version $VERSION" + +# =============================================== +# default variable values +# =============================================== +unset OUTPUT_DIR PIPELINE_HOME + +PIPELINE_HOME0=`realpath ${BASH_SOURCE[0]}` +export PIPELINE_HOME=$(cd $(dirname $PIPELINE_HOME0) && pwd -P) + +TIMESTAMP=`date +%FT%H.%M.%S` + + +# create function to handle error messages +# =============================================== +Usage() { + echo + echo "------------------- " + echo -e "Usage:\t$0 [arguments]" + echo -e "\tmandatory arguments:\n" \ + "\t\t-d (--dir) = Working directory (where all the outputs will be printed) (give full path)\n" \ + "\t\t--steps = Specify the steps to execute. Begin by selecting either init-GT or init-noGT to establish the working directory. \n" \ + "\t\t For GT: vireo, demuxalot, demuxlet, souporcell, ensemblexing \n" \ + "\t\t For noGT: vireo, demuxalot, freemuxlet, souporcell, ensemblexing \n" + echo -e "\toptional arguments:\n " \ + "\t\t-h (--help) = See help regarding the pipeline arguments \n" \ + "\t\t--vcf = The path of the vcf file \n" \ + "\t\t--bam = The path of the bam file \n" \ + "\t\t--sortout = The path and name of the vcf generated using sort \n" \ + "------------------- \n" \ + "For comprehensive help, visit https://neurobioinfo.github.io/ensemblex/site/ for documentation. " + +echo +} + + +# =============================================== +# PARSING ARGUMENTS +# =============================================== +if ! options=$(getopt --name pipeline --alternative --unquoted --options hs:d:t:m:vw:f:S:c:a:x: --longoptions dir:,steps:,method:,container:,vcf:,bam:,sortout: -- "$@") +then + # something went wrong, getopt will put out an error message for us + echo "Error processing options." + exit 42 +fi + +# =============================================== +# LOAD & OVERRIDE EXTRA CONFIG FILE FOR PROJECT +# =============================================== +set -- $options + +while [ $# -gt 0 ] + +do + case $1 in + -x| --extra) + EXTRA_CONF="$2" ; + if [ -f $EXTRA_CONF ]; then + echo "* LOADING EXTRA CONFIG FILE $EXTRA_CONF"; + .
$EXTRA_CONF + else + echo "ERROR: invalid EXTRA CONFIG file: $EXTRA_CONF"; + echo "Please check options and try again"; exit 42; + fi + esac + shift +done + +# =============================================== +# LOAD ALL OTHER OPTIONS +# =============================================== +set -- $options + +while [ $# -gt 0 ] +do + case $1 in + -h| --help) Usage; exit 0;; + -d| --dir) OUTPUT_DIR="$2" ; shift ;; + -v| --verbose) VERBOSE=1 ;; + --steps) MODE="$2"; shift ;; + --vcf) VCF="$2"; shift ;; + --bam) BAM="$2"; shift ;; + --sortout) SORTOUT="$2"; shift ;; + --method) METHOD0="$2"; shift ;; + --container) CONTAINER0="$2"; shift ;; + (--) shift; break;; + (-*) echo "$0: error - unrecognized option $1" 1>&2; exit 42;; + (*) break;; + esac + shift +done + + +MODE0=$MODE +FOUND_ERROR=0 + +# =============================================== +# CHECKING VARIABLES +# =============================================== +#check to ensure all mandatory arguments have been entered + +if [ -z $OUTPUT_DIR ]; then echo "ERROR: missing mandatory option: -d (--dir) must be specified"; FOUND_ERROR=1; fi +if (( $FOUND_ERROR )); then echo "Please check options and try again"; exit 42; fi + + +# STEP 0: RUN setting +# =============================================== + JOB_OUTPUT_DIR=$OUTPUT_DIR/job_info + + +if [[ ${MODE0[@]} =~ init-GT ]]; then + if [[ -s $OUTPUT_DIR/job_info ]]; then + read -p "folder\files already exist in $OUTPUT_DIR. Overwrite? y|n: "$'\n' answer + if [[ $answer =~ y ]]; then + echo "NOTE: the folder\files are Overwritten." + rm -rf $OUTPUT_DIR/job_info; mkdir -p $OUTPUT_DIR/job_info + rm -rf $OUTPUT_DIR/job_info/configs; mkdir -p $OUTPUT_DIR/job_info/configs + cp -r $PIPELINE_HOME/gt/configs $OUTPUT_DIR/job_info/ + rm -rf $OUTPUT_DIR/job_info/logs; mkdir -p $OUTPUT_DIR/job_info/logs + rm -rf $OUTPUT_DIR/input_files; mkdir -p $OUTPUT_DIR/input_files + rm -rf $OUTPUT_DIR/demuxalot; mkdir -p $OUTPUT_DIR/demuxalot + rm -rf $OUTPUT_DIR/demuxlet; mkdir -p $OUTPUT_DIR/demuxlet + rm -rf $OUTPUT_DIR/ensemblex_gt; mkdir -p $OUTPUT_DIR/ensemblex_gt + rm -rf $OUTPUT_DIR/souporcell; mkdir -p $OUTPUT_DIR/souporcell + rm -rf $OUTPUT_DIR/vireo_gt ; mkdir -p $OUTPUT_DIR/vireo_gt + rm -rf $OUTPUT_DIR/job_info/.tmp ; mkdir -p $OUTPUT_DIR/job_info/.tmp + else + echo "NOTE: the pipeline is using the existing config file and parameters." + fi + else + echo "The configuration files do not exist, the pipeline will create them during execution." + mkdir -p $OUTPUT_DIR/job_info + mkdir -p $OUTPUT_DIR/input_files + mkdir -p $OUTPUT_DIR/job_info/configs + cp -r $PIPELINE_HOME/gt/configs $OUTPUT_DIR/job_info/ + mkdir -p $OUTPUT_DIR/job_info/logs + mkdir -p $OUTPUT_DIR/demuxalot + mkdir -p $OUTPUT_DIR/demuxlet + mkdir -p $OUTPUT_DIR/ensemblex_gt + mkdir -p $OUTPUT_DIR/souporcell + mkdir -p $OUTPUT_DIR/vireo_gt + rm -rf $OUTPUT_DIR/job_info/.tmp ; mkdir -p $OUTPUT_DIR/job_info/.tmp + fi + touch $JOB_OUTPUT_DIR/summary_report.txt +fi + +if [[ ${MODE0[@]} =~ init-noGT ]]; then + if [[ -s $OUTPUT_DIR/job_info ]]; then + read -p "folder\files already exist in $OUTPUT_DIR. Overwrite? y|n: "$'\n' answer + if [[ $answer =~ y ]]; then + echo "NOTE: the folder\files are Overwritten." 
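+      # (Editorial note) init-GT and init-noGT only build the working-directory scaffold that the later steps
+      # expect: job_info/{configs,logs,.tmp}, input_files/, and one folder per constituent tool (plus ensemblex_gt
+      # in GT mode). A typical first invocation, based on the Usage() text above and with a placeholder path:
+      #   bash launch_ensemblex.sh -d /path/to/working_dir --steps init-noGT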
+ rm -rf $OUTPUT_DIR/job_info; mkdir -p $OUTPUT_DIR/job_info + rm -rf $OUTPUT_DIR/job_info/configs; mkdir -p $OUTPUT_DIR/job_info/configs + cp -r $PIPELINE_HOME/nogt/configs $OUTPUT_DIR/job_info/ + rm -rf $OUTPUT_DIR/job_info/logs; mkdir -p $OUTPUT_DIR/job_info/logs + rm -rf $OUTPUT_DIR/input_files; mkdir -p $OUTPUT_DIR/input_files + rm -rf $OUTPUT_DIR/demuxalot; mkdir -p $OUTPUT_DIR/demuxalot + rm -rf $OUTPUT_DIR/freemuxlet; mkdir -p $OUTPUT_DIR/freemuxlet + rm -rf $OUTPUT_DIR/souporcell; mkdir -p $OUTPUT_DIR/souporcell + rm -rf $OUTPUT_DIR/vireo; mkdir -p $OUTPUT_DIR/vireo + rm -rf $OUTPUT_DIR/job_info/.tmp ; mkdir -p $OUTPUT_DIR/job_info/.tmp + else + echo "NOTE: the pipeline is using the existing config file and parameters." + fi + else + echo "The configuration files do not exist, the pipeline will create them during execution." + mkdir -p $OUTPUT_DIR/job_info + mkdir -p $OUTPUT_DIR/input_files + mkdir -p $OUTPUT_DIR/job_info/configs + cp -r $PIPELINE_HOME/nogt/configs $OUTPUT_DIR/job_info/ + mkdir -p $OUTPUT_DIR/job_info/logs + mkdir -p $OUTPUT_DIR/demuxalot + mkdir -p $OUTPUT_DIR/freemuxlet + mkdir -p $OUTPUT_DIR/souporcell + mkdir -p $OUTPUT_DIR/vireo + rm -rf $OUTPUT_DIR/job_info/.tmp ; mkdir -p $OUTPUT_DIR/job_info/.tmp + fi + touch $JOB_OUTPUT_DIR/summary_report.txt +fi + +export JOB_OUTPUT_DIR=$OUTPUT_DIR/job_info +export EXPECTED_DONE_FILES=$JOB_OUTPUT_DIR/summary_report.txt +chmod 775 $EXPECTED_DONE_FILES +export OUTPUT_DIR=$OUTPUT_DIR + +echo -e "NOTE: the pipeline is running << $MODE >>" + +TEMPCONFIG=$OUTPUT_DIR/job_info/.tmp/temp_config.ini +touch $TEMPCONFIG +if [[ -f "TEMPCONFIG" ]]; then + rm $TEMPCONFIG +fi + +echo " # IT IS A temp FILE. DO NOT EDIT THIS FILE DIRECTLY." > $TEMPCONFIG + +if [[ ${MODE0[@]} =~ init-GT ]]; then + echo METHODGT=GT >> $TEMPCONFIG + echo -e "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Pipeline is set up for GT-----------------" $VERSION >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The Output is under ${OUTPUT_DIR}/job_info/" >> $EXPECTED_DONE_FILES +fi +if [[ ${MODE0[@]} =~ init-noGT ]]; then + echo METHODGT=noGT >> $TEMPCONFIG + echo -e "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Pipeline is set up for noGT-----------------" $VERSION >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The Output is under ${OUTPUT_DIR}/job_info/" >> $EXPECTED_DONE_FILES +fi + +echo OUTPUT_DIR=$OUTPUT_DIR >> $TEMPCONFIG +echo JOB_OUTPUT_DIR=$JOB_OUTPUT_DIR >> $TEMPCONFIG +echo EXPECTED_DONE_FILES=$EXPECTED_DONE_FILES >> $TEMPCONFIG +echo MODE=$MODE >> $TEMPCONFIG +echo VERSION=$VERSION >> $TEMPCONFIG + +if [[ $CONTAINER_CMD ]]; then + echo CONTAINER_CMD=$CONTAINER_CMD >> $TEMPCONFIG +fi +if [[ $CONTAINER_MODULE ]]; then + echo CONTAINER_MODULE=`echo \'$CONTAINER_MODULE\'` >> $TEMPCONFIG +fi +if [[ $CONTAINER ]]; then + echo CONTAINER=$CONTAINER >> $TEMPCONFIG +fi + +if [[ $R_MODULE ]]; then + echo R_MODULE=$R_MODULE >> $TEMPCONFIG +fi + +if [[ $R_CMD ]]; then + echo R_CMD=$R_CMD >> $TEMPCONFIG +fi + +R_LIB_PATH_CONT=/opt/R/4.3.1/lib/R/library +echo R_LIB_PATH_CONT=$R_LIB_PATH_CONT >> $TEMPCONFIG +PIPELINE_HOME_CONT=/opt/ensemblex.pip +echo PIPELINE_HOME_CONT=$PIPELINE_HOME_CONT >> $TEMPCONFIG + +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + + + +if [[ 
${METHOD} == GT ]] ; then + bash ${PIPELINE_HOME}/launch/launch_gt.sh +elif [[ ${METHOD} == noGT ]] ; then + bash ${PIPELINE_HOME}/launch/launch_nogt.sh +fi + +STEP=sort +if [[ ${MODE0[@]} =~ sort ]] ; then + echo -e "\n\n-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo -e "--------Job submitted using pipeline version $VERSION--------" >> $EXPECTED_DONE_FILES + echo "-----------------------------------------------------------" >> $EXPECTED_DONE_FILES + echo "STEP $STEP: sort vcf same as bam" >> $EXPECTED_DONE_FILES + CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + $CONTAINER_CMD exec --bind ${PIPELINE_HOME},$OUTPUT_DIR,$VCF,$BAM ${CONTAINER1} ${PIPELINE_HOME}/tools/sort_vcf_same_as_bam.sh $BAM $VCF > $SORTOUT 2> $JOB_OUTPUT_DIR/logs/${STEP}.$(date +%FT%H.%M.%S) + echo "This function prepared by Gert Hulselmans" >> $EXPECTED_DONE_FILES + echo "-------------------------------------------" >> $EXPECTED_DONE_FILES + echo "The output is under ${OUTPUT_DIR}/souporcell" >> $EXPECTED_DONE_FILES + echo -e " \n" + exit 0 +fi + + +echo -e " \n" +exit 0 + diff --git a/ensemblex.pip/nogt/.DS_Store b/ensemblex.pip/nogt/.DS_Store new file mode 100644 index 0000000..21536fb Binary files /dev/null and b/ensemblex.pip/nogt/.DS_Store differ diff --git a/ensemblex.pip/nogt/configs/ensemblex_config.ini b/ensemblex.pip/nogt/configs/ensemblex_config.ini new file mode 100644 index 0000000..abfc031 --- /dev/null +++ b/ensemblex.pip/nogt/configs/ensemblex_config.ini @@ -0,0 +1,119 @@ +############# Config Trace +############# [DEFAULT] +############# Depending on system and your requirments, +############# change the defaults + +######################## +## General parameters ## +######################## +METHOD=noGT +CONTAINER_CMD=singularity + +############### +## Demuxalot ## +############### +## List of Sample ID's in the sample VCF file generated by Freemuxlet: outs.clust1.vcf (e.g., 'CLUST0,CLUST1,CLUST2'). +PAR_demuxalot_genotype_names='' +## Read prior strength +PAR_demuxalot_prior_strength=100 +## Minimum read coverage +PAR_demuxalot_minimum_coverage=200 +## Minimum alternative read coverage +PAR_demuxalot_minimum_alternative_coverage=10 +## Number of best snps for each donor to use for demultiplexing +PAR_demuxalot_n_best_snps_per_donor=100 +## Genotype prior strength +PAR_demuxalot_genotypes_prior_strength=1 +## Doublet prior strength +PAR_demuxalot_doublet_prior=0.25 + +################ +## Freemuxlet ## +################ +## Number of pooled samples +PAR_freemuxlet_nsample= + +################ +## Souporcell ## +################ +## Minimap2 parameters. For information regarding the minimap2 parameters, please see the documentation: +PAR_minimap2='-ax splice -t 8 -G50k -k 21 -w 11 --sr -A2 -B8 -O12,32 -E2,1 -r200 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g2000 -2K50m --secondary=no' +## Freebayes parameters. 
For information regarding the freebayes parameters, please see the documentation: +PAR_freebayes=' -iXu -C 2 -q 20 -n 3 -E 1 -m 30 --min-coverage 6' +## Whether or no to consider UMI information when populating coverage matrices +PAR_vartrix_umi=TRUE +## Minimum read mapping quality +PAR_vartrix_mapq=30 +## Number of threads for computing +PAR_vartrix_threads=8 +## Number of pooled samples +PAR_souporcell_k= +## Number of threads for computing +PAR_souporcell_t=8 + +########### +## Vireo ## +########### +## Number of pooled samples +PAR_vireo_N=4 +## Number of subprocesses for computing +PAR_vireo_processes=20 +## Minimum minor allele frequency +PAR_vireo_minMAF=0.1 +## Minimum aggregated count +PAR_vireo_minCOUNT=20 +## Whether or not to treat donor GT as prior only +PAR_vireo_forcelearnGT=T + +######################### +## ensemblex algorithm ## +######################### +#== Pool parameters ==# +## Number of pooled samples +PAR_ensemblex_sample_size= +## Expected doublet rate for the pool. If using 10X Genomics, the expected doublet rate can be estimated based on the number of recovered cells. +PAR_ensemblex_expected_doublet_rate=0.12 + +#== Set up parameters ==# +## Whether or not to merge the output files of the constituent demultiplexing tools. If running ensemblex on a pool for the first time, this parameter should be set to "Yes". +PAR_ensemblex_merge_constituents=yes + +#== Step 1 parameters: Probabilistic-weighted ensemble ==# +## Whether or not to perform Step 1: Probabilistic-weighted ensemble. If running ensemblex on a pool for the first time, this parameter should be set to "Yes". +PAR_ensemblex_probabilistic_weighted_ensemble=yes + +#== Step 2 parameters: Graph-based doublet detection ==# +## Whether or not to perform a preliminary parameter sweep for Step 2: Graph-based doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define the number of confident doublets in the pool (nCD) and the percentile threshold of the nearest neighour frequency (pT), which can be defined in the following two parameters, respectively. +PAR_ensemblex_preliminary_parameter_sweep=no +## Manually defined number of confident doublets in the pool (nCD). To manually define nCD, uncomment the parament and enter the value (e.g., PAR_ensemblex_nCD=200) +#PAR_ensemblex_nCD= +## Manually defined percentile threshold of the nearest neighour frequency (pT. To manually define pT, uncomment the parament and enter the value (e.g., PAR_ensemblex_pT=0.9) +#PAR_ensemblex_pT= +## Whether or not to perform Step 2: Graph-based doublet detection. If PAR_ensemblex_nCD and PAR_ensemblex_pT are not defined by the user (NULL), ensemblex will automatically determine the optimal parameter values using an unsupervised parameter sweep. If PAR_ensemblex_nCD and PAR_ensemblex_pT are defined by the user, graph-based doublet detection will be performed with the user-defined values. +PAR_ensemblex_graph_based_doublet_detection=yes + +#== Step 3 parameters: Ensemble-independent doublet detection ==# +## Whether or not to perform a preliminary parameter sweep for Step 3: Ensemble-independent doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define which constituent tools to utilize for ensemble-independent doublet detection. Users can define which tools to utilize for ensemble-independent doublet detection in the following parameters. 
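+## (Editorial note) Illustrative example only: after reviewing the Step 2 preliminary parameter sweep, a user could
+## uncomment and set the two parameters above, e.g. PAR_ensemblex_nCD=200 and PAR_ensemblex_pT=0.9 (values taken
+## from the examples in the comments, not recommendations). For PAR_ensemblex_expected_doublet_rate, a commonly
+## cited 10X Genomics rule of thumb (external to this repository) is roughly 0.8% doublets per 1,000 recovered
+## cells, so about 15,000 recovered cells corresponds to approximately the 0.12 default used here.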
+PAR_ensemblex_preliminary_ensemble_independent_doublet=no +## Whether or not to perform Step 3: Ensemble-independent doublet detection. +PAR_ensemblex_independent_doublet=yes +## Whether or not to label doublets identified by Demuxalot as doublets. Only doublets with assignment probabilities exceeding Demuxalot's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Demuxalot_threshold=yes +## Whether or not to label doublets identified by Demuxalot as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Demuxalot_no_threshold=no +## Whether or not to label doublets identified by Freemuxlet as doublets. Only doublets with assignment probabilities exceeding Freemuxlet's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Freemuxlet_threshold=no +## Whether or not to label doublets identified by Freemuxlet as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Freemuxlet_no_threshold=no +## Whether or not to label doublets identified by Souporcell as doublets. Only doublets with assignment probabilities exceeding Souporcell's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Souporcell_threshold=no +## Whether or not to label doublets identified by Souporcell as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Souporcell_no_threshold=no +## Whether or not to label doublets identified by Vireo as doublets. Only doublets with assignment probabilities exceeding Vireo's recommended probability threshold will be labeled as doublets by ensemblex. +PAR_ensemblex_doublet_Vireo_threshold=yes +## Whether or not to label doublets identified by Vireo as doublets, regardless of the corresponding assignment probability. +PAR_ensemblex_doublet_Vireo_no_threshold=no + +#== Confidence score parameters ==# +## Whether or not to compute ensemblex's singlet confidence score. This will define low confidence assignments which should be removed from downstream analyses.
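+## (Editorial note) Based on the confidence_score() function in the GT scripts earlier in this diff (the noGT
+## version is expected to behave analogously), the singlet confidence starts from the ensemblex assignment
+## probability, is incremented by a tool's estimated AUC whenever that tool confidently agrees with the ensemblex
+## call, gains +1 when all four tools agree on a singlet, and is divided by the sample size for droplets with no
+## informative variants; in ensemblex_final_cell_assignment.csv, singlets with ensemblex_singlet_confidence < 1
+## are relabeled "unassigned" while ensemblex_best_assignment retains the original call.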
+PAR_ensemblex_compute_singlet_confidence=yes diff --git a/ensemblex.pip/nogt/scripts/.DS_Store b/ensemblex.pip/nogt/scripts/.DS_Store new file mode 100644 index 0000000..f5970d4 Binary files /dev/null and b/ensemblex.pip/nogt/scripts/.DS_Store differ diff --git a/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.py b/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.py new file mode 100644 index 0000000..11c57a9 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.py @@ -0,0 +1,101 @@ +import argparse +import sys +import os + + +parser = argparse.ArgumentParser() + +parser.add_argument('-fl', '--folder',action='store', type=str,required=False, help='outputfolder') +parser.add_argument('-p1', '--pdgn',action='store',required=False, help='PAR_demuxalot_genotype_names') +parser.add_argument('-p2', '--pdps',action='store', type=int,required=False, help='PAR_demuxalot_prior_strength') +parser.add_argument('-p3', '--pdmc',action='store', type=int,required=False, help='PAR_demuxalot_minimum_coverage') +parser.add_argument('-p4', '--pdmac',action='store', type=int,required=False, help='PAR_demuxalot_minimum_alternative_coverage') +parser.add_argument('-p5', '--pdnbspd',action='store', type=int, required=False, help='PAR_demuxalot_n_best_snps_per_donor') +parser.add_argument('-p6', '--pdgps',action='store',type=int,required=False, help='PAR_demuxalot_genotypes_prior_strength') +parser.add_argument('-p7', '--pddp',action='store', type=float,required=False, help='PAR_demuxalot_doublet_prior') +args = parser.parse_args() +folder=args.folder +PAR_demuxalot_genotype_names=args.pdgn.split(",") + +PAR_demuxalot_prior_strength=args.pdps +PAR_demuxalot_minimum_coverage=args.pdmc +PAR_demuxalot_minimum_alternative_coverage=args.pdmac +PAR_demuxalot_n_best_snps_per_donor=args.pdnbspd +PAR_demuxalot_genotypes_prior_strength=args.pdgps +PAR_demuxalot_doublet_prior=args.pddp + + +from pathlib import Path +import pandas as pd +import pysam + +from demuxalot.utils import download_file +from demuxalot import BarcodeHandler, ProbabilisticGenotypes, Demultiplexer, count_snps, detect_snps_positions +from pysam import VariantFile +import pandas as pd +import io +from demuxalot import utils + +print('Part I') +handler = BarcodeHandler.from_file(os.path.join(folder,'input_files/pooled_barcodes.tsv')) ### modify +print(handler) +genotype_names = PAR_demuxalot_genotype_names +genotype_names.sort() +genotypes = ProbabilisticGenotypes(genotype_names=genotype_names) + +print('Part II') + +outfolder=os.path.join(folder,'freemuxlet') +cmd_unzip=f'gunzip {outfolder}/outs.clust1.vcf.gz' +os.system(cmd_unzip) +genotypes.add_vcf(os.path.join(folder,'freemuxlet/outs.clust1.vcf'), prior_strength=PAR_demuxalot_prior_strength) ### modify +pysam.index(os.path.join(folder,'input_files/pooled_bam.bam')) ### modify +print('Part III') +counts = count_snps( + bamfile_location=os.path.join(folder,'input_files/pooled_bam.bam'), ### modify + chromosome2positions=genotypes.get_chromosome2positions(), + barcode_handler=handler, +) + +utils.summarize_counted_SNPs(counts) +print('Part IV') + +new_snps_filename = 'new_snps_single_file.betas' +_ = detect_snps_positions( + bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify + genotypes=genotypes, + barcode_handler=handler, + minimum_coverage=PAR_demuxalot_minimum_coverage, + minimum_alternative_coverage=PAR_demuxalot_minimum_alternative_coverage, + 
result_beta_prior_filename=str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')), + n_best_snps_per_donor=PAR_demuxalot_n_best_snps_per_donor, +) + +print('Part V') + +genotypes_with_new_snps = genotypes.clone() +genotypes_with_new_snps.add_prior_betas(str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')), prior_strength=PAR_demuxalot_genotypes_prior_strength) +counts_enriched = count_snps( + bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify + chromosome2positions=genotypes_with_new_snps.get_chromosome2positions(), + barcode_handler=handler, +) +print('Part VI') +learnt_enriched_genotypes, probs_learning_new_snps = Demultiplexer.learn_genotypes( + counts_enriched, + genotypes_with_new_snps, + barcode_handler=handler, + doublet_prior=PAR_demuxalot_doublet_prior, +) + +print('Part V') + +probs_learning_new_snps.head() +# print('save to the following') +# print(os.path.join(folder,'demuxalot/Demuxalot_result.csv')) +probs_learning_new_snps.to_csv(os.path.join(folder,'demuxalot/Demuxalot_result.csv')) ### modify + + +cmd_zip=f'gunzip {outfolder}/outs.clust1.vcf' +os.system(cmd_zip) + diff --git a/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.sh b/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.sh new file mode 100644 index 0000000..4f19f22 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/demuxalot/pipeline_demuxalot.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +if [ -d $OUTPUT_DIR/ensemblex ]; then + rm -rf $OUTPUT_DIR/demuxalot + mkdir -p $OUTPUT_DIR/demuxalot +else + mkdir -p $OUTPUT_DIR/demuxalot +fi + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step Demuxalot No-GT submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* PAR_demuxalot_genotype_names $PAR_demuxalot_genotype_names" +echo "* PAR_demuxalot_prior_strength $PAR_demuxalot_prior_strength" +echo "* PAR_demuxalot_minimum_coverage $PAR_demuxalot_minimum_coverage" +echo "* PAR_demuxalot_minimum_alternative_coverage $PAR_demuxalot_minimum_alternative_coverage" +echo "* PAR_demuxalot_n_best_snps_per_donor $PAR_demuxalot_n_best_snps_per_donor" +echo "* PAR_demuxalot_genotypes_prior_strength $PAR_demuxalot_genotypes_prior_strength" +echo "* PAR_demuxalot_doublet_prior $PAR_demuxalot_doublet_prior" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# + +echo "Start of demuxalot" +$CONTAINER_CMD exec --bind $OUTPUT_DIR,$PIPELINE_HOME ${CONTAINER1} python3 $PIPELINE_HOME/nogt/scripts/demuxalot/pipeline_demuxalot.py -fl $OUTPUT_DIR -p1 $PAR_demuxalot_genotype_names -p2 $PAR_demuxalot_prior_strength -p3 $PAR_demuxalot_minimum_coverage -p4 $PAR_demuxalot_minimum_alternative_coverage -p5 $PAR_demuxalot_n_best_snps_per_donor -p6 $PAR_demuxalot_genotypes_prior_strength -p7 $PAR_demuxalot_doublet_prior + +echo "End 
of demuxalot" + +exit 0 diff --git a/ensemblex.pip/nogt/scripts/ensemblexing/ensemblexing_nogt.R b/ensemblex.pip/nogt/scripts/ensemblexing/ensemblexing_nogt.R new file mode 100644 index 0000000..b0402c9 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/ensemblexing/ensemblexing_nogt.R @@ -0,0 +1,328 @@ +## load parameters +args = commandArgs(trailingOnly=TRUE) +home_dir=args[1] +output_dir=args[2] + +source(paste0(home_dir,'/nogt/scripts/ensemblexing/functions_nogt.R')) + +packages<-c('dplyr','tidyr','pdfCluster','data.table','readr','lubridate','tidyverse','moments','mousetrap','usethis','devtools','desc','kneedle','ROCit','ggplot2','factoextra','ggpubr','splines','stats','pathviewr') +invisible(lapply(packages, library, character.only = TRUE)) + +par_sample_size= as.numeric(args[3]) +par_expected_doublet_rate= as.numeric(args[4]) +par_merge_constituents= args[5] +par_probabilistic_weighted_ensemble= args[6] +par_preliminary_parameter_sweep= args[7] +par_graph_based_doublet_detection= args[8] +par_preliminary_ensemble_independent_doublet= args[9] +par_ensemble_independent_doublet= args[10] +par_doublet_Demuxalot_threshold= args[11] +par_doublet_Demuxalot_no_threshold= args[12] +par_doublet_Freemuxlet_threshold= args[13] +par_doublet_Freemuxlet_no_threshold= args[14] +par_doublet_Souporcell_threshold= args[15] +par_doublet_Souporcell_no_threshold= args[16] +par_doublet_Vireo_threshold= args[17] +par_doublet_Vireo_no_threshold= args[18] +par_compute_singlet_confidence= args[19] +par_ensemblex_nCD= as.numeric(args[20]) +par_ensemblex_pT= as.numeric(args[21]) + +## Print the defined parameters by the user +print("Loaded parameter from config.ini") +print(paste0("par_sample_size=",par_sample_size)) +print(paste0("par_expected_doublet_rate=",par_expected_doublet_rate)) +print(paste0("par_merge_constituents=",par_merge_constituents)) +print(paste0("par_probabilistic_weighted_ensemble=",par_probabilistic_weighted_ensemble)) +print(paste0("par_preliminary_parameter_sweep=",par_preliminary_parameter_sweep)) +print(paste0("par_graph_based_doublet_detection=",par_graph_based_doublet_detection)) +print(paste0("par_ensemblex_nCD=",par_ensemblex_nCD)) +print(paste0("par_ensemblex_pT=",par_ensemblex_pT)) +print(paste0("par_preliminary_ensemble_independent_doublet=",par_preliminary_ensemble_independent_doublet)) +print(paste0("par_ensemble_independent_doublet=",par_ensemble_independent_doublet)) +print(paste0("par_doublet_Demuxalot_threshold=",par_doublet_Demuxalot_threshold)) +print(paste0("par_doublet_Demuxalot_no_threshold=",par_doublet_Demuxalot_no_threshold)) +print(paste0("par_doublet_Freemuxlet_threshold=",par_doublet_Freemuxlet_threshold)) +print(paste0("par_doublet_Freemuxlet_no_threshold=",par_doublet_Freemuxlet_no_threshold)) +print(paste0("par_doublet_Souporcell_threshold=",par_doublet_Souporcell_threshold)) +print(paste0("par_doublet_Souporcell_no_threshold=",par_doublet_Souporcell_no_threshold)) +print(paste0("par_doublet_Vireo_threshold=",par_doublet_Vireo_threshold)) +print(paste0("par_doublet_Vireo_no_threshold=",par_doublet_Vireo_no_threshold)) +print(paste0("par_compute_singlet_confidence=",par_compute_singlet_confidence)) +print(paste0("par_doublet_Vireo_no_threshold=",par_doublet_Vireo_no_threshold)) +print(paste0("par_compute_singlet_confidence=",par_compute_singlet_confidence)) + + +par_output_dir<- paste0(output_dir,"/ensemblex") +## Demuxalot +par_demuxalot<- paste0(output_dir,"/demuxalot/Demuxalot_result.csv") +## Freemuxlet +par_freemuxlet<- 
paste0(output_dir,"/freemuxlet/outs.clust1.samples") +## Souporcell +par_souporcell<- paste0(output_dir,"/souporcell/clusters.tsv") +## Vireo +par_vireo<- paste0(output_dir,"/vireo/donor_ids.tsv") + + +########################################################################################################################### +# Print Warning message +########################################################################################################################### +message("You are running Ensemblex without prior genotype information. Please note that it is not recommended to run Ensemblex without prior genotype information for pools exceeding 16 multiplexed samples. Furthermore, we discourage using Ensemblex without prior genotype information if there is a strong imbalance of sample representation in the pool.") + +########################################################################################################################### +# CONSTITUENT DATA PREPARATION AND MERGING +########################################################################################################################### +if (tolower(par_merge_constituents)=="yes"){ + message("Performing constituent data preparation and merge.") + ## Import constituent tool output files + vireo<- read.delim(par_vireo, header = T, sep = "\t") + dim(vireo) + souporcell <- read.delim(par_souporcell, header = T, sep = "\t") + dim(souporcell) + freemuxlet <-read.delim(par_freemuxlet, header = T, sep = "\t") + dim(freemuxlet) + demuxalot <- read.delim(par_demuxalot, header = T, sep = ",", check.names=FALSE) + colnames(demuxalot)[1] <- "X" + dim(demuxalot) + ## check output files for duplicated barcodes + message("Checking output files for duplicated barcodes.") + ## Vireo + if (length(unique(vireo$cell)) != nrow(vireo)){ + vireo <- vireo[!duplicated(vireo$cell),] + temp <- vireo[duplicated(vireo$cell),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Vireo resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(vireo), " unique barcodes in Vireo output.")) + } + } else { + print("No duplicated barcodes observed in Vireo output.") + print(paste0("Observed ", nrow(vireo), " unique barcodes in Vireo output.")) + } + ## Souporcell + if (length(unique(souporcell$barcode)) != nrow(souporcell)){ + souporcell <- souporcell[!duplicated(souporcell$barcode),] + temp <- souporcell[duplicated(souporcell$barcode),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Souporcell resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(souporcell), " unique barcodes in Souporcell output.")) + } + } else { + print("No duplicated barcodes observed in Souporcell output.") + print(paste0("Observed ", nrow(souporcell), " unique barcodes in Souporcell output.")) + } + ## Freemuxlet + if (length(unique(freemuxlet$BARCODE)) != nrow(freemuxlet)){ + freemuxlet <- freemuxlet[!duplicated(freemuxlet$BARCODE),] + temp <- freemuxlet[duplicated(freemuxlet$BARCODE),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Freemuxlet resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(freemuxlet), " unique barcodes in Freemuxlet output.")) + } + } else { + print("No duplicated barcodes observed in Freemuxlet output.") + print(paste0("Observed ", nrow(freemuxlet), " unique barcodes in Freemuxlet output.")) + } + ## Demuxalot + if (length(unique(demuxalot$X)) != nrow(demuxalot)){ 
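+  ## (Editorial note) The same deduplication pattern is applied to all four tools in this section; afterwards the
+  ## script only warns (it does not stop) when the tools report different numbers of cells, and any droplet missing
+  ## from a tool's output is filled in as "unassigned" after the merge further below.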
+ demuxalot <- demuxalot[!duplicated(demuxalot$X),] + temp <- demuxalot[duplicated(demuxalot$X),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Demuxalot resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(demuxalot), " unique barcodes in Demuxalot output.")) + } + } else { + print("No duplicated barcodes observed in Demuxalot output.") + print(paste0("Observed ", nrow(demuxalot), " unique barcodes in Demuxalot output.")) + } + ## Make sure that all of the output files have the same number of droplets + if (nrow(vireo) == nrow(freemuxlet) & + nrow(vireo) == nrow(demuxalot) & + nrow(vireo) == nrow(souporcell)) { + message(paste0("All demultiplexing tools have reported the same number of cells: ", nrow(souporcell), "." )) + } else { + message(paste0("WARNING: Demultiplexing tools have reported different number of cells; Vireo: ", nrow(vireo), "; Freemuxlet: ", nrow(freemuxlet), "; Demuxalot: ", nrow(demuxalot), "; Souporcell: ", nrow(souporcell), ".")) + } + #### Make sure that the Sample IDs from each output file have the same structure. We will replace "-" or "." with "_" + ### Vireo + ## fix potential "-" + vireo$donor_id <- gsub(pattern = "-", replacement = "_",x = vireo$donor_id, fixed = TRUE) + vireo$best_singlet <- gsub(pattern = "-", replacement = "_",x = vireo$best_singlet, fixed = TRUE) + vireo$best_doublet <- gsub(pattern = "-", replacement = "_",x = vireo$best_doublet, fixed = TRUE) + ## fix potential "." + vireo$donor_id <- gsub(pattern = ".", replacement = "_",x = vireo$donor_id, fixed = TRUE) + vireo$best_singlet <- gsub(pattern = ".", replacement = "_",x = vireo$best_singlet, fixed = TRUE) + vireo$best_doublet <- gsub(pattern = ".", replacement = "_",x = vireo$best_doublet, fixed = TRUE) + ### Demuxalot + ## fix potential "-" + colnames(demuxalot) <- gsub(pattern = "-", replacement = "_",x = colnames(demuxalot), fixed = TRUE) + ## fix potential "." + colnames(demuxalot) <- gsub(pattern = ".", replacement = "_",x = colnames(demuxalot), fixed = TRUE) + ## Freemuxlet + ## fix potential "-" + freemuxlet$BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$BEST.GUESS, fixed = TRUE) + freemuxlet$NEXT.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$NEXT.GUESS, fixed = TRUE) + freemuxlet$SNG.BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$SNG.BEST.GUESS, fixed = TRUE) + freemuxlet$SNG.NEXT.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$SNG.NEXT.GUESS, fixed = TRUE) + freemuxlet$DBL.BEST.GUESS <- gsub(pattern = "-", replacement = "_",x = freemuxlet$DBL.BEST.GUESS, fixed = TRUE) + ## fix potential "." 
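+  ## (Editorial note) Donor labels from every tool are normalized to use "_" instead of "-" or "." presumably so
+  ## that the same sample is spelled identically across Vireo, Souporcell, Freemuxlet, and Demuxalot before the
+  ## consensus/merge below, and so that assignments can be compared as plain strings downstream. Minimal
+  ## illustration of the pattern used throughout this block:
+  ##   gsub(pattern = "-", replacement = "_", x = c("CLUST-0", "CLUST.1"), fixed = TRUE)  # -> "CLUST_0" "CLUST.1"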
+ freemuxlet$BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$BEST.GUESS, fixed = TRUE) + freemuxlet$NEXT.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$NEXT.GUESS, fixed = TRUE) + freemuxlet$SNG.BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$SNG.BEST.GUESS, fixed = TRUE) + freemuxlet$SNG.NEXT.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$SNG.NEXT.GUESS, fixed = TRUE) + freemuxlet$DBL.BEST.GUESS <- gsub(pattern = ".", replacement = "_",x = freemuxlet$DBL.BEST.GUESS, fixed = TRUE) + ### Prepare outputs from each of the constituent demultiplexing tools for downstream analyses + ## Demuxalot prep + demuxalot <- demuxalot_prep(demuxalot) + ## Freemuxlet prep + freemuxlet <- freemuxlet_prep(freemuxlet) + ## Souporcell prep + souporcell <- souporcell_prep(souporcell, demuxalot, freemuxlet) + ## Vireo prep + vireo <- vireo_prep(vireo,souporcell, demuxalot, freemuxlet) + ## Merge results and calculate consensus + merge_df <- merge_concensus(vireo,souporcell, freemuxlet, demuxalot) + ## Ensure that if outputs do not have matching droplet counts, the missing droplets are unassigned. + merge_df$vireo_best_assignment[is.na(merge_df$vireo_best_assignment)] <- "unassigned" + merge_df$souporcell_best_assignment[is.na(merge_df$souporcell_best_assignment)] <- "unassigned" + merge_df$freemuxlet_best_assignment[is.na(merge_df$freemuxlet_best_assignment)] <- "unassigned" + merge_df$demuxalot_best_assignment[is.na(merge_df$demuxalot_best_assignment)] <- "unassigned" + ## Check if the merged csv file has duplicated barcodes + if (length(unique(merge_df$barcode)) != nrow(merge_df)){ + merge_df <- merge_df[!duplicated(merge_df$barcode),] + temp <- merge_df[duplicated(merge_df$barcode),] + if (nrow(temp) == 0){ + print("Duplicated barcodes in Merged file resolved; recommended to investigate initial reason for barcode duplication.") + print(paste0("Observed ", nrow(merge_df), " unique barcodes in Merged file.")) + } + } else { + print("No duplicated barcodes observed in Merged file.") + print(paste0("Observed ", nrow(merge_df), " unique barcodes in Merged file.")) + } + ## Write merge CSV file + write.csv(merge_df, paste(par_output_dir,'/constituent_tool_merge.csv',sep =""), row.names=T) +} else { + message("Skipping constituent data preparation and merge, and loading exisiting csv file.") +} + + +########################################################################################################################### +# PROBABILISTIC-WEIGHTED ENSEMBLE +########################################################################################################################### +## Read in merge CSV file +if (tolower(par_probabilistic_weighted_ensemble)=="yes" & file.exists(paste0(par_output_dir,'/constituent_tool_merge.csv'))){ + message("Loading merged output file.") + merge_df <- read.delim(paste(par_output_dir,'/constituent_tool_merge.csv',sep =""), header = T, sep = ",") +} else if (tolower(par_probabilistic_weighted_ensemble)=="yes" & !file.exists(paste0(par_output_dir,'/constituent_tool_merge.csv'))) { + stop('Exiting ensemblex; constituent_tool_merge.csv file cannot be found.') +} else { + message("Skipping the probabilistic-weighted ensemble component of the ensemblex framework and loading existing csv file.") +} + +## Probabilistic-weighted Ensemble (PWE) function +if (tolower(par_probabilistic_weighted_ensemble)=="yes"){ + message("Performing the probabilistic-weighted ensemble component of the ensemblex framework.") +## Run PWE 
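+## (Editorial note) BA_weight_consensus() is defined in functions_nogt.R further down in this diff; judging from
+## its name and the files it writes, it weights each constituent tool's droplet assignments by an estimate of that
+## tool's accuracy on this pool and combines them into a single ensemblex probability per droplet, writing
+## step1/step1_cell_assignment.csv under the ensemblex output directory, which the later steps read back in.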
+result_test <- BA_weight_consensus(merge_df, par_sample_size, par_output_dir) +} else { + message("Skipping the probabilistic-weighted ensemble component of the ensemblex framework and loading existing csv file.") +} + + +########################################################################################################################### +# GRAPH-BASED DOUBLET DETECTION +########################################################################################################################### +## Read in inputs +if (tolower(par_preliminary_parameter_sweep)=="yes" | tolower(par_graph_based_doublet_detection)=="yes" ){ + if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){ + message("Loading probabilistic-weighted ensemble output.") + result_test <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",") + result_test <- result_test[,-1] + } else { + stop('Exiting ensemblex; step1_cell_assignment.csv cannot be found.') + } +} else { + message("Skipping the graph-based doublet detection component of the ensemblex framework and loading existing csv file.") +} + +## perform graph-based doublet detection +if (tolower(par_preliminary_parameter_sweep)=="yes" & tolower(par_graph_based_doublet_detection)=="no"){ + message("Performing a preliminary parameter sweep for graph-based doublet detection.") + ## apply graph-based doublet detection parameter sweep + graph_based_doublet_detection_par_sweep(result_test, par_expected_doublet_rate, par_output_dir) + +} else if (tolower(par_preliminary_parameter_sweep)=="yes" & tolower(par_graph_based_doublet_detection)=="yes" & exists("par_ensemblex_nCD") & exists("par_ensemblex_pT") ){ + message("Performing graph-based doublet detection with manually defined parameters.") + ## apply graph-based doublet detection with manually defined parameters + graph_based_doublet_detection_manual_par(result_test, par_expected_doublet_rate, par_output_dir, par_ensemblex_pT, par_ensemblex_nCD) + +} else if (tolower(par_preliminary_parameter_sweep)=="no" & tolower(par_graph_based_doublet_detection)=="yes"){ + message("Performing graph-based doublet detection with ensemblex-estimated parameters.") + ## apply graph-based doublet detection with estimated optimal parameters + graph_based_doublet_detection_estimated_par(result_test, par_expected_doublet_rate, par_output_dir) +} else { + message("Skipping graph-based doublet detection.") +} + +########################################################################################################################### +# ENSEMBLE-INDEPENDENT DOUBLET DETECTION +########################################################################################################################### +## Read in inputs +if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" | tolower(par_ensemble_independent_doublet)=="yes"){ + if (file.exists(paste0(par_output_dir,"/step2",'/Step2_cell_assignment.csv'))){ + message("Loading graph-based doublet detection output.") + result_2 <- read.delim(paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){ + message("Loading probabilistic-weighted ensemble output.") + result_2 <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",") + result_2 <- result_2[,-1] + } else { + stop('Exiting ensemblex; Step2_cell_assignment.csv and 
step1_cell_assignment.csv files cannot be found.')
+  }
+} else {
+  message("Skipping the ensemble-independent doublet detection component of the ensemblex framework and loading existing csv file.")
+}
+
+## Run ensemble-independent doublet detection
+if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" & tolower(par_ensemble_independent_doublet)=="yes"){
+  ## Run EID
+  result_2 <- ensemble_independent_doublet_detections(result_2, par_output_dir)
+} else if (tolower(par_preliminary_ensemble_independent_doublet)=="yes" & tolower(par_ensemble_independent_doublet)=="no"){
+  ## Run EID
+  ensemble_independent_doublet_detections_prelim(result_2, par_output_dir)
+} else if (tolower(par_preliminary_ensemble_independent_doublet)=="no" & tolower(par_ensemble_independent_doublet)=="yes") {
+  ## Run EID
+  result_2 <- ensemble_independent_doublet_detections(result_2, par_output_dir)
+} else {
+  message("Skipping ensemble-independent doublet detection.")
+}
+
+
+###########################################################################################################################
+# CONFIDENCE SCORE
+###########################################################################################################################
+if (tolower(par_compute_singlet_confidence)=="yes"){
+  if (file.exists(paste0(par_output_dir,"/step3",'/Step3_cell_assignment.csv'))){
+    message("Loading ensemble-independent doublet detection output.")
+    result_2 <- read.delim(paste(par_output_dir,"/step3",'/Step3_cell_assignment.csv', sep=""), header = T, sep = ",")
+    result_2 <- result_2[,-1]
+  } else if (file.exists(paste0(par_output_dir,"/step2",'/Step2_cell_assignment.csv'))){
+    message("Loading graph-based doublet detection output.")
+    result_2 <- read.delim(paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep=""), header = T, sep = ",")
+    result_2 <- result_2[,-1]
+  } else if (file.exists(paste0(par_output_dir,"/step1",'/step1_cell_assignment.csv'))){
+    message("Loading probabilistic-weighted ensemble output.")
+    result_2 <- read.delim(paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep=""), header = T, sep = ",")
+    result_2 <- result_2[,-1]
+  } else {
+    stop('Exiting ensemblex; Step3_cell_assignment.csv, Step2_cell_assignment.csv and step1_cell_assignment.csv files cannot be found.')
+  }
+} else {
+  message("Skipping the singlet confidence score component of the ensemblex framework.")
+}
+
+## Compute confidence score
+if (tolower(par_compute_singlet_confidence)=="yes"){
+## Compute ensemblex singlet confidence
+eval_df <- confidence_score(result_2, par_output_dir,par_sample_size)
+}
diff --git a/ensemblex.pip/nogt/scripts/ensemblexing/functions_nogt.R b/ensemblex.pip/nogt/scripts/ensemblexing/functions_nogt.R
new file mode 100644
index 0000000..411f215
--- /dev/null
+++ b/ensemblex.pip/nogt/scripts/ensemblexing/functions_nogt.R
@@ -0,0 +1,3205 @@
+###########################################################################################################################
+# CONSTITUENT DATA PREPARATION AND MERGING
+###########################################################################################################################
+## FUNCTIONS
+demuxalot_prep <- function(demuxalot){
+  ## Compute number of columns in the dataframe
+  xx <- ncol(demuxalot)
+
+  ## Change doublet column names for downstream operations
+  list_col <- c((par_sample_size + 2):xx)
+  for (i in list_col){
+    colnames(demuxalot)[i] <- paste0("doublet_",i)
+  }
+
+  ##
Place Demuxalot output in long format + data_long <- gather(demuxalot, assignment, stat, 2:xx, factor_key=F) + + ## Identify the top sample prediction for each barcode + result <- data_long %>% + dplyr::group_by(X) %>% + dplyr::filter(stat == max(stat)) + nrow(result) + + ## Identify uniadentifiable cells; these are barcodes that have two or more samples assigned with the same probability + duplicates <- subset(result,duplicated(X)) + duplicated_id <- duplicates$X + result$assignment[result$X %in% duplicated_id] <- "unassigned" + + ## Remove duplicated barcodes to only keep one sample assignment; these barcodes are labelled as unassigned. + result <- subset(result,!duplicated(X)) + + ## Set up demuxalot assignment + result$demuxalot_sample <- result$assignment + result$demuxalot_sample[str_detect(result$demuxalot_sample, "doublet")] <- "doublet" + + ## Identify Demuxalot doublet probability + # Keep only doublet assignments + data_long_2 <- data_long[str_detect(data_long$assignment, "doublet"),] + # Keep only the top prediction for each barcode + result_doublet <- data_long_2 %>% + dplyr::group_by(X) %>% + dplyr::filter(stat == max(stat)) + # Keep only one of each barcode --> some barcodes will have multiple doublet combinations with the same doublet probability + result_doublet <- result_doublet[!(duplicated(result_doublet$X)),] + result_doublet <- result_doublet[,c(1,3)] + colnames(result_doublet) <- c("X", "demuxalot_doublet_probability") + + ## Merge doublet probability back to main dataframe + nrow(result) == nrow(result_doublet) + demuxalot_2 <- merge(result, result_doublet, by = "X") + nrow(demuxalot_2) == nrow(demuxalot) + demuxalot <- demuxalot_2 + + ## Print number of retained droplets + message(paste0("Retained ", nrow(demuxalot), " droplets during Demuxalot preparation.")) + demuxalot +} + +freemuxlet_prep <- function(freemuxlet){ + ## Prepare Freemuxlet best guess + freemuxlet$guess_1 <- sub(".*,(.*)", "\\1", freemuxlet$BEST.GUESS) + freemuxlet$guess_2 <-sub("(.*),.*", "\\1", freemuxlet$BEST.GUESS) + freemuxlet$freemuxlet_best[freemuxlet$guess_1 != freemuxlet$guess_2] <- "doublet" + freemuxlet$freemuxlet_best[freemuxlet$guess_1 == freemuxlet$guess_2] <- freemuxlet$guess_1[freemuxlet$guess_1 == freemuxlet$guess_2] + + ## Prepare Freemuxlet sample assignment + freemuxlet$freemuxlet_sample <- freemuxlet$freemuxlet_best + freemuxlet$freemuxlet_sample[freemuxlet$DROPLET.TYPE == "AMB"] <- "unassigned" + freemuxlet <- freemuxlet %>% dplyr::select(-c(guess_1, guess_2)) + + ## Make the sample labels match with Demuxalot + # Fix best guess + freemuxlet$freemuxlet_best <- paste0("CLUST", freemuxlet$freemuxlet_best) + freemuxlet$freemuxlet_best[freemuxlet$freemuxlet_best == "CLUSTdoublet"] <- "doublet" + + # Fix sample + freemuxlet$freemuxlet_sample <- paste0("CLUST", freemuxlet$freemuxlet_sample) + freemuxlet$freemuxlet_sample[freemuxlet$freemuxlet_sample == "CLUSTdoublet"] <- "doublet" + freemuxlet$freemuxlet_sample[freemuxlet$freemuxlet_sample == "CLUSTunassigned"] <- "unassigned" + + ## Print number of retained droplets + message(paste0("Retained ", nrow(freemuxlet), " droplets during Freemuxlet preparation.")) + freemuxlet +} + +souporcell_prep <- function(souporcell, demuxalot, freemuxlet){ + + message("Matching Souporcell Sample IDs to Demuxalot and Freemuxlet.") + + ## Number of singlet cluster identified by Souporcell + soup_singlet_cluster <- unique(souporcell$assignment) + soup_singlet_cluster <- soup_singlet_cluster[!grepl("/", soup_singlet_cluster)] + + ## Demuxalot + 
demuxalot_for_soup <- demuxalot[,c(1,4)] + nrow(demuxalot_for_soup) == nrow(souporcell) + ## Merge Souporcell and Demuxalot + merge_soup_demuxalot <- merge(demuxalot_for_soup, souporcell, by.x = "X", by.y = "barcode") + nrow(merge_soup_demuxalot) == nrow(demuxalot_for_soup) + merge_soup_demuxalot <- merge_soup_demuxalot[,c(2:4)] + colnames(merge_soup_demuxalot) <- c("Sample_ID", "status", "assignment") + + ## Freemuxlet + freemuxlet_for_soup <- freemuxlet[,c(2,21)] + nrow(freemuxlet_for_soup) == nrow(souporcell) + ## Merge Souporcell and freemuxlet + merge_soup_freemuxlet <- merge(freemuxlet_for_soup, souporcell, by.x = "BARCODE", by.y = "barcode") + nrow(merge_soup_freemuxlet) == nrow(freemuxlet_for_soup) + merge_soup_freemuxlet <- merge_soup_freemuxlet[,c(2:4)] + colnames(merge_soup_freemuxlet) <- c("Sample_ID", "status", "assignment") + + ### Calculate cluster-Sample ID probabilities based on assignments of remaining constituent tools + bind_df <- rbind(merge_soup_demuxalot, merge_soup_freemuxlet) + bind_df <- subset(bind_df, status != "doublet") + bind_df <- subset(bind_df, status != "unassigned") + bind_df <- subset(bind_df, Sample_ID != "doublet") + bind_df <- subset(bind_df, Sample_ID != "unassigned") + + souporcell_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame_final <- data.frame(souporcell_assignment, Sample_ID, Probability) + + for(i in unique(bind_df$assignment)){ + df_lim <- subset(bind_df, assignment == i) + n_droplets <- nrow(df_lim) + + souporcell_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame <- data.frame(souporcell_assignment, Sample_ID, Probability) + + for (j in unique(bind_df$Sample_ID)){ + df_lim2 <- subset(df_lim, Sample_ID == j ) + prob <- nrow(df_lim2)/n_droplets + + souporcell_assignment = i + Sample_ID = j + Probability = prob + data_frame_temp <- data.frame(souporcell_assignment, Sample_ID, Probability) + data_frame <- rbind(data_frame, data_frame_temp) + } + data_frame_final <- rbind(data_frame_final, data_frame) + } + + data_frame_final <- subset(data_frame_final, Probability != "none") + data_frame_final$Probability <- as.numeric(data_frame_final$Probability) + + ## take maximum cluster-sample probability + result <- data_frame_final %>% + dplyr::group_by(souporcell_assignment) %>% + dplyr::filter(Probability == max(Probability)) + + ## set duplicated clusters to unassigned + n_occur <- data.frame(table(result$souporcell_assignment)) + n_occur <- subset(n_occur, Freq > 1) + duplicated <- n_occur$Var1 + result$Sample_ID[result$souporcell_assignment %in% duplicated] <- "unassigned" + + #temp_df <- data.frame(result) + result <- result[!duplicated(result$souporcell_assignment),] + + + if(length(result$souporcell_assignment) == length(unique(bind_df$assignment)) ){ + result <- data.frame(result) + result <- result[,c(1:2)] + colnames(result) <- c("assignment","souporcell_sample") + souporcell <- merge(souporcell, result, by = "assignment", all = T) + } + + + souporcell$souporcell_sample[souporcell$status == 'doublet'] <- 'doublet' + souporcell$souporcell_sample[str_detect(souporcell$assignment, "/")] <- "doublet" + + souporcell$souporcell_sample[is.na(souporcell$souporcell_sample)] <- "unassigned" + unique(souporcell$souporcell_sample) + + souporcell$souporcell_best <- souporcell$souporcell_sample + souporcell$souporcell_sample[souporcell$status == 'unassigned'] <- 'unassigned' + unique(souporcell$souporcell_best) + unique(souporcell$souporcell_sample) + + if(length(unique(result$souporcell_sample)) == 
par_sample_size){ + message("Successfully matched all Souporcell clusters to Sample ID.") + } else { + message(paste0("WARNING: ensemblex failed to match all Souporcell clusters to all Sample ID. ensemblex will proceed; however, it is recommended to look into the data manually.")) + } + + ## Print number of retained droplets + message(paste0("Retained ", nrow(souporcell), " droplets during Souporcell preparation.")) + souporcell +} + +vireo_prep <- function(vireo,souporcell, demuxalot, freemuxlet){ + ## Vireo's sample assignment + #vireo$vireo_sample <- vireo$donor_id + + message("Matching Vireo Sample IDs to Demuxalot, Freemuxlet, and Souporcell.") + + ## Number of singlet clusters identified by Vireo + vireo_singlet_cluster <- unique(vireo$donor_id) + vireo_singlet_cluster <- vireo_singlet_cluster[vireo_singlet_cluster != "doublet"] + vireo_singlet_cluster <- vireo_singlet_cluster[vireo_singlet_cluster != "unassigned"] + + # Temp column + vireo$donor_id_temp <- vireo$donor_id + vireo$donor_id[vireo$donor_id == "unassigned" & vireo$prob_max > vireo$prob_doublet] <- vireo$best_singlet[vireo$donor_id == "unassigned" & vireo$prob_max > vireo$prob_doublet] + vireo$donor_id[vireo$donor_id == "unassigned" & vireo$prob_max < vireo$prob_doublet] <- "doublet" + + ## Demuxalot + demuxalot_for_vireo <- demuxalot[,c(1,4)] + nrow(demuxalot_for_vireo) == nrow(vireo) + merge_vireo_demuxalot <- merge(demuxalot_for_vireo, vireo, by.x = "X", by.y = "cell") + nrow(merge_vireo_demuxalot) == nrow(demuxalot_for_vireo) + merge_vireo_demuxalot <- merge_vireo_demuxalot[,c(2,3)] + merge_vireo_demuxalot$assignment <- merge_vireo_demuxalot$donor_id + colnames(merge_vireo_demuxalot) <- c("Sample_ID", "status", "assignment") + merge_vireo_demuxalot$status <- "singlet" + merge_vireo_demuxalot$status[merge_vireo_demuxalot$assignment == "unassigned"] <- "unassigned" + merge_vireo_demuxalot$status[merge_vireo_demuxalot$assignment == "doublet"] <- "doublet" + + ## Freemuxlet + freemuxlet_for_vireo<- freemuxlet[,c(2,21)] + nrow(freemuxlet_for_vireo) == nrow(vireo) + merge_vireo_freemuxlet <- merge(freemuxlet_for_vireo, vireo, by.x = "BARCODE", by.y = "cell") + nrow(merge_vireo_freemuxlet) == nrow(freemuxlet_for_vireo) + merge_vireo_freemuxlet <- merge_vireo_freemuxlet[,c(2,3)] + merge_vireo_freemuxlet$assignment <- merge_vireo_freemuxlet$donor_id + colnames(merge_vireo_freemuxlet) <- c("Sample_ID", "status", "assignment") + merge_vireo_freemuxlet$status <- "singlet" + merge_vireo_freemuxlet$status[merge_vireo_freemuxlet$assignment == "unassigned"] <- "unassigned" + merge_vireo_freemuxlet$status[merge_vireo_freemuxlet$assignment == "doublet"] <- "doublet" + + ## Souporcell + souporcell_for_vireo<- souporcell[,c(2,11)] + nrow(souporcell_for_vireo) == nrow(vireo) + merge_vireo_souporcell <- merge(souporcell_for_vireo, vireo, by.x = "barcode", by.y = "cell") + nrow(merge_vireo_souporcell) == nrow(souporcell_for_vireo) + merge_vireo_souporcell <- merge_vireo_souporcell[,c(2,3)] + merge_vireo_souporcell$assignment <- merge_vireo_souporcell$donor_id + colnames(merge_vireo_souporcell) <- c("Sample_ID", "status", "assignment") + merge_vireo_souporcell$status <- "singlet" + merge_vireo_souporcell$status[merge_vireo_souporcell$assignment == "unassigned"] <- "unassigned" + merge_vireo_souporcell$status[merge_vireo_souporcell$assignment == "doublet"] <- "doublet" + + ### Calculate cluster-Sample ID probabilities based on assignments of remaining constituent tools + bind_df <- rbind(merge_vireo_demuxalot, merge_vireo_freemuxlet, 
merge_vireo_souporcell) + bind_df <- subset(bind_df, status != "doublet") + bind_df <- subset(bind_df, status != "unassigned") + bind_df <- subset(bind_df, Sample_ID != "doublet") + bind_df <- subset(bind_df, Sample_ID != "unassigned") + + vireo_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame_final <- data.frame(vireo_assignment, Sample_ID, Probability) + + for(i in unique(bind_df$assignment)){ + df_lim <- subset(bind_df, assignment == i) + n_droplets <- nrow(df_lim) + + vireo_assignment = "none" + Sample_ID = "none" + Probability = "none" + data_frame <- data.frame(vireo_assignment, Sample_ID, Probability) + + for (j in unique(bind_df$Sample_ID)){ + df_lim2 <- subset(df_lim, Sample_ID == j ) + prob <- nrow(df_lim2)/n_droplets + + vireo_assignment = i + Sample_ID = j + Probability = prob + data_frame_temp <- data.frame(vireo_assignment, Sample_ID, Probability) + data_frame <- rbind(data_frame, data_frame_temp) + } + data_frame_final <- rbind(data_frame_final, data_frame) + } + + data_frame_final <- subset(data_frame_final, Probability != "none") + data_frame_final$Probability <- as.numeric(data_frame_final$Probability) + + ## take maximum cluster-sample probability + result <- data_frame_final %>% + dplyr::group_by(vireo_assignment) %>% + dplyr::filter(Probability == max(Probability)) + + ## set duplicated clusters to unassigned + n_occur <- data.frame(table(result$vireo_assignment)) + n_occur <- subset(n_occur, Freq > 1) + duplicated <- n_occur$Var1 + result$Sample_ID[result$vireo_assignment %in% duplicated] <- "unassigned" + + #temp_df <- data.frame(result) + result <- result[!duplicated(result$vireo_assignment),] + + if(length(result$vireo_assignment) == length(unique(bind_df$assignment)) ){ + result <- data.frame(result) + result <- result[,c(1:2)] + colnames(result) <- c("donor_id","vireo_sample") + vireo <- merge(vireo, result, by = "donor_id", all = T) + } + + + vireo$vireo_sample[vireo$donor_id == 'doublet'] <- 'doublet' + vireo$vireo_sample[is.na(vireo$vireo_sample)] <- "unassigned" + unique(vireo$vireo_sample) + + vireo$vireo_best <- vireo$vireo_sample + vireo$vireo_sample[vireo$donor_id_temp == "unassigned"] <- "unassigned" + + vireo <- vireo %>% dplyr::select(-donor_id_temp) + + if(length(unique(result$vireo_sample)) == par_sample_size){ + message("Successfully matched all vireo clusters to Sample ID.") + } else { + message(paste0("WARNING: ensemblex failed to match all Vireo clusters to Sample ID. 
ensemblex will proceed; however, it is recommended to look into the data manually.")) + } + + ## Print number of retained droplets + message(paste0("Retained ", nrow(vireo), " droplets during Vireo preparation.")) + vireo +} + +merge_concensus <- function(vireo,souporcell, freemuxlet, demuxalot){ + ## Select important columns from Vireo + vireo <- dplyr::select(vireo, c("cell", "prob_max", "prob_doublet", "n_vars", "best_doublet", "doublet_logLikRatio", "vireo_sample", "vireo_best" )) + colnames(vireo) <- c("barcode", "vireo_singlet_probability", "vireo_doublet_probability", "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "vireo_assignment", "vireo_best_assignment") + + ## Select important columns from Souporcell + souporcell <- dplyr::select(souporcell, c("barcode", "log_prob_singleton", "log_prob_doublet", "souporcell_sample", "souporcell_best")) + colnames(souporcell) <- c("barcode", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "souporcell_assignment", "souporcell_best_assignment") + + ## Select important columns from Freemuxlet + freemuxlet <- dplyr::select(freemuxlet, c("BARCODE", "NUM.SNPS", "NUM.READS", "SNG.POSTERIOR", "freemuxlet_sample", "freemuxlet_best", "DIFF.LLK.SNG.DBL")) + colnames(freemuxlet) <- c("barcode", "freemuxlet_n_snps", "freemuxlet_n_reads", "freemuxlet_max_probability", "freemuxlet_assignment", "freemuxlet_best_assignment", "freemuxlet_DIFF_LLK_SNG_DBL") + + ## Select important columns from Demuxalot + demuxalot$demuxalot_second <- demuxalot$demuxalot_sample + demuxalot$demuxalot_sample[demuxalot$stat < 0.9] <- "unassigned" + demuxalot <- dplyr::select(demuxalot, c("X", "stat", "demuxalot_sample", "demuxalot_second", "demuxalot_doublet_probability")) + colnames(demuxalot) <- c("barcode", "demuxalot_max_probability", "demuxalot_assignment", "demuxalot_best_assignment", "demuxalot_doublet_probability" ) + + ## Merge dataframes + merge_df <- merge(vireo, souporcell, by = c("barcode"), all = T) + merge_df <- merge(merge_df, freemuxlet, by = c("barcode"), all = T) + merge_df <- merge(merge_df, demuxalot, by = c("barcode"), all = T) + + ## Generate a general consensus column + merge_lim <- dplyr::select(merge_df, c("barcode", "vireo_assignment", "souporcell_assignment", "freemuxlet_assignment", "demuxalot_assignment")) + merge_lim$general_consensus <- apply(merge_lim,1,function(x) names(which.max(table(x)))) + merge_lim$general_consensus <- sub(".*(-).*", "\\1", merge_lim$general_consensus) + merge_lim$general_consensus[merge_lim$general_consensus == "-" ] <- "unassigned" + merge_lim[is.na(merge_lim)] <- "unassigned" + merge_bind <- merge_lim + + ## Merge back to initial dataframe + merge_df <- merge(merge_bind, merge_df, by = c("barcode"), all = T) + + ## Print number of droplets in merged dataframe + message(paste0("Retained ", nrow(merge_df), " after merging output files from each constituent demultiplexing tool.")) + + ## Clean up output dataframe + merge_df <- dplyr::select(merge_df, -c("vireo_assignment.y", "souporcell_assignment.y","freemuxlet_assignment.y", "demuxalot_assignment.y")) + colnames(merge_df) <- c("barcode", "vireo_assignment", "souporcell_assignment", "freemuxlet_assignment", "demuxalot_assignment", "general_consensus","vireo_singlet_probability", "vireo_doublet_probability", + "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "vireo_best_assignment", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "souporcell_best_assignment", + "freemuxlet_n_snps", 
"freemuxlet_n_reads", "freemuxlet_max_probability", "freemuxlet_best_assignment", "freemuxlet_DIFF_LLK_SNG_DBL", "demuxalot_max_probability", "demuxalot_best_assignment", + "demuxalot_doublet_probability") + merge_df <- dplyr::select(merge_df, c("barcode", "vireo_assignment", "souporcell_assignment", "freemuxlet_assignment", "demuxalot_assignment", "general_consensus", "vireo_best_assignment", "souporcell_best_assignment", + "freemuxlet_best_assignment", "demuxalot_best_assignment", "vireo_singlet_probability", "vireo_doublet_probability", "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", + "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "freemuxlet_n_snps", "freemuxlet_n_reads", "freemuxlet_max_probability", "freemuxlet_DIFF_LLK_SNG_DBL", + "demuxalot_max_probability", "demuxalot_doublet_probability")) + merge_df +} + +########################################################################################################################### +# PROBABILISTIC-WEIGHTED ENSEMBLE +########################################################################################################################### +## FUNCTIONS +BA_weight_consensus <- function(merge_df,par_sample_size,par_output_dir){ + + ## Set seed + set.seed(1234) + + ## Create an output directory of probabilistic-weighted ensmeble outputs + dir.create(paste(par_output_dir,"/step1",sep='')) + + ## Rename the dataset + eval_df <- merge_df + + #### Adjusted Rand Index between sample assignments -- here we are using the best guess from each tool + ## Vireo + ARI_vireo_vireo <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$vireo_best_assignment) + ARI_vireo_freemuxlet <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$freemuxlet_best_assignment) + ARI_vireo_demuxalot <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$demuxalot_best_assignment) + ARI_vireo_souporcell <- adj.rand.index(eval_df$vireo_best_assignment, eval_df$souporcell_best_assignment) + ## Freemuxlet + ARI_freemuxlet_freemuxlet <- adj.rand.index(eval_df$freemuxlet_best_assignment, eval_df$freemuxlet_best_assignment) + ARI_freemuxlet_vireo <- adj.rand.index(eval_df$freemuxlet_best_assignment, eval_df$vireo_best_assignment) + ARI_freemuxlet_demuxalot <- adj.rand.index(eval_df$freemuxlet_best_assignment, eval_df$demuxalot_best_assignment) + ARI_freemuxlet_souporcell <- adj.rand.index(eval_df$freemuxlet_best_assignment, eval_df$souporcell_best_assignment) + ## Demuxalot + ARI_demuxalot_demuxalot <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$demuxalot_best_assignment) + ARI_demuxalot_souporcell <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$souporcell_best_assignment) + ARI_demuxalot_freemuxlet <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$freemuxlet_best_assignment) + ARI_demuxalot_vireo <- adj.rand.index(eval_df$demuxalot_best_assignment, eval_df$vireo_best_assignment) + ## Souporcell + ARI_souporcell_souporcell <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$souporcell_best_assignment) + ARI_souporcell_vireo <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$vireo_best_assignment) + ARI_souporcell_freemuxlet <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$freemuxlet_best_assignment) + ARI_souporcell_demuxalot <- adj.rand.index(eval_df$souporcell_best_assignment, eval_df$demuxalot_best_assignment) + + ## Produce data frame + tool_1 <- c("Vireo","Vireo","Vireo","Vireo", + "Freemuxlet", "Freemuxlet", "Freemuxlet", "Freemuxlet", + 
"Demuxalot","Demuxalot","Demuxalot","Demuxalot", + "Souporcell","Souporcell","Souporcell", "Souporcell") + tool_2 <- c("Vireo", "Freemuxlet", "Demuxalot", "Souporcell", + "Freemuxlet", "Vireo","Demuxalot","Souporcell", + "Demuxalot", "Souporcell","Freemuxlet","Vireo", + "Souporcell","Vireo","Freemuxlet","Demuxalot") + ARI <- c(ARI_vireo_vireo, ARI_vireo_freemuxlet, ARI_vireo_demuxalot, ARI_vireo_souporcell, + ARI_freemuxlet_freemuxlet, ARI_freemuxlet_vireo, ARI_freemuxlet_demuxalot, ARI_freemuxlet_souporcell, + ARI_demuxalot_demuxalot, ARI_demuxalot_souporcell, ARI_demuxalot_freemuxlet, ARI_demuxalot_vireo, + ARI_souporcell_souporcell, ARI_souporcell_vireo, ARI_souporcell_freemuxlet, ARI_souporcell_demuxalot) + ARI_df <- data.frame(tool_1,tool_2,ARI ) + + ## Plot ARI heatmap + ggplot(ARI_df, aes(x = tool_1, y = tool_2, fill = ARI, label = round(ARI, digits = 3) )) +geom_tile() + theme_bw() + + scale_fill_gradient(low="white", high="darkblue") + + xlab("Demultiplexing tool") + + ylab("Demultiplexing tool") + + scale_x_discrete(expand = c(0,0)) + + scale_y_discrete(expand = c(0,0)) + geom_text() + ggsave(paste(par_output_dir,"/step1","/ARI_demultiplexing_tools.pdf", sep="")) + + #### Compute proxy balanced accuracies for each tool using consensus cells from the remaining tools + ### Vireo + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$freemuxlet_best_assignment & + eval_df$souporcell_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + vireo_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$vireo_best_assignment] <- "TN" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$vireo_best_assignment] <- "TP" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$vireo_best_assignment] <- "FP" + eval_df_ba$vireo_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$vireo_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$vireo_eval)) + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + vireo_BA <- BA + vireo_df <- data.frame(BA) + vireo_df$tool <- "Vireo" + + ### Freemuxlet + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$vireo_best_assignment & + eval_df$souporcell_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + freemuxlet_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$freemuxlet_best_assignment] <- "TN" + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment != "doublet" & 
eval_df_ba$souporcell_best_assignment == eval_df_ba$freemuxlet_best_assignment] <- "TP" + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$freemuxlet_best_assignment] <- "FP" + eval_df_ba$freemuxlet_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$freemuxlet_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$freemuxlet_eval)) + df_summary + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + freemuxlet_BA <- BA + freemuxlet_df <- data.frame(BA) + freemuxlet_df$tool <- "Freemuxlet" + + ### Demuxalot + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$souporcell_best_assignment == eval_df$vireo_best_assignment & + eval_df$souporcell_best_assignment == eval_df$freemuxlet_best_assignment & + eval_df$souporcell_best_assignment != "unassigned" ,] + demuxalot_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxalot_best_assignment] <- "TN" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment == eval_df_ba$demuxalot_best_assignment] <- "TP" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment != "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxalot_best_assignment] <- "FP" + eval_df_ba$demuxalot_eval[eval_df_ba$souporcell_best_assignment == "doublet" & eval_df_ba$souporcell_best_assignment != eval_df_ba$demuxalot_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$demuxalot_eval)) + df_summary + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + demuxalot_BA <- BA + demuxalot_df <- data.frame(BA) + demuxalot_df$tool <- "Demuxalot" + + ### Souporcell + ## Compute consensus cells + eval_df_ba <- eval_df[eval_df$demuxalot_best_assignment == eval_df$vireo_best_assignment & + eval_df$demuxalot_best_assignment == eval_df$freemuxlet_best_assignment & + eval_df$demuxalot_best_assignment != "unassigned" ,] + souporcell_n <- nrow(eval_df_ba) + + ## Compute balanced accuracy + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment == "doublet" & eval_df_ba$demuxalot_best_assignment == eval_df_ba$souporcell_best_assignment] <- "TN" + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment != "doublet" & eval_df_ba$demuxalot_best_assignment == eval_df_ba$souporcell_best_assignment] <- "TP" + 
eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment != "doublet" & eval_df_ba$demuxalot_best_assignment != eval_df_ba$souporcell_best_assignment] <- "FP" + eval_df_ba$souporcell_eval[eval_df_ba$demuxalot_best_assignment == "doublet" & eval_df_ba$demuxalot_best_assignment != eval_df_ba$souporcell_best_assignment] <- "FN" + df <- eval_df_ba + df_summary <- data.frame(table(df$souporcell_eval)) + df_summary + Var1 <- c("FN", "TN", "TP", "FP") + Freq<- c(0,0,0,0) + filler_frame <- data.frame(Var1, Freq) + df_summary <- rbind(df_summary, filler_frame) + df_summary <- df_summary %>% dplyr::group_by(Var1) %>% + dplyr::summarise(Freq = sum(Freq)) %>% as.data.frame() + + BA <- ((df_summary$Freq[df_summary$Var1 == "TP"]/(df_summary$Freq[df_summary$Var1 == "TP"] + df_summary$Freq[df_summary$Var1 == "FN"])) + + (df_summary$Freq[df_summary$Var1 == "TN"]/(df_summary$Freq[df_summary$Var1 == "TN"] + df_summary$Freq[df_summary$Var1 == "FP"])))/2 + + souporcell_BA <- BA + souporcell_df <- data.frame(BA) + souporcell_df$tool <- "Souporcell" + + #### Output summary information + ## Produce table with PWE information + Tool <- c("Vireo", "Freemuxlet", "Demuxalot", "Souporcell") + Balanced_accuracy <- c(vireo_BA, freemuxlet_BA, demuxalot_BA, souporcell_BA) + n_consensus_droplets <- c(vireo_n, freemuxlet_n, demuxalot_n, souporcell_n) + PWE_summary_df <- data.frame(Tool,Balanced_accuracy,n_consensus_droplets ) + write.csv(PWE_summary_df, paste(par_output_dir,"/step1",'/Balanced_accuracy_summary.csv', sep="")) + + ## Plot estimated balanced accuracy + ggplot(PWE_summary_df, aes(x = Tool, y = Balanced_accuracy, label = round(Balanced_accuracy, digits = 4), fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_text() + + xlab("Demultiplexing tool") + + ylab("Estimated Balanced Accuracy") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step1","/BA_demultiplexing_tools.pdf", sep="")) + + ## combine balanced accuracies into one data frame + BA_df <- rbind(vireo_df, + freemuxlet_df, + demuxalot_df, + souporcell_df) + + ### Place Vireo and Souporcell probabilties into proper format for downstream analsyes + ## Vireo + # Merge Vireo singlet and doublet probabilities -- take max + # Columns 8 and 9 are vireo_max and vireo_doublet, respectively + eval_df[, "vireo_max_probability"] <- apply(eval_df[, 12:13], 1, max) + + ## Souporcell -- convert log(prob) to prob + # Singlets + eval_df$souporcell_singlet_probability <- 1-(10^(eval_df$souporcell_log_probability_singlet)) + # Doublets + eval_df$souporcell_doublet_probability <- 1-(10^(eval_df$souporcell_log_probability_doublet)) + # Take souporcell max probability between singlets and doublets + # Columns 25 and 26 are souporcell_singlet and souporcell_doublet, respectively + eval_df[, "souporcell_max_probability"] <- apply(eval_df[, 26:27], 1, max) + + ### Multiply the assignment probabilities from each of the constituent demultiplexing tools by their respective estimated balanced accuracy for the dataset + ## Vireo + eval_df$vireo_weighted_probability <- eval_df$vireo_max_probability*BA_df$BA[BA_df$tool == "Vireo"] + ## Freemuxlet + eval_df$freemuxlet_weighted_probability <- eval_df$freemuxlet_max_probability*BA_df$BA[BA_df$tool == "Freemuxlet"] + ## Demuxalot + eval_df$demuxalot_weighted_probability <- eval_df$demuxalot_max_probability*BA_df$BA[BA_df$tool == "Demuxalot"] + ## Souporcell + eval_df$souporcell_weighted_probability <- 
eval_df$souporcell_max_probability*BA_df$BA[BA_df$tool == "Souporcell"] + + ############################################################################################################################################################ + ## Get weighted consensus assignment ## + ############################################################################################################################################################ + ## Rename dataframe and remove first column + practice_df <- eval_df + practice_df <- practice_df[,-1] + + ### Create a sample list, not including unassigned and doublet + ## Vireo + Vireo_sample_list <- unique(practice_df$vireo_best_assignment) + ## Demuxalot + Demuxalot_sample_list <- unique(practice_df$demuxalot_best_assignment) + ## Freemuxlet + Freemuxlet_sample_list <- unique(practice_df$freemuxlet_best_assignment) + ## Souporcell + Souporcell_sample_list <- unique(practice_df$souporcell_best_assignment) + + ## Identify all unique samples identified by each demultiplexing tool + sample_list <- unlist(append(Vireo_sample_list,Demuxalot_sample_list)) + sample_list <- unlist(append(sample_list,Freemuxlet_sample_list)) + sample_list <- unlist(append(sample_list,Souporcell_sample_list)) + remove_sample <- c("doublet", "unassigned") + sample_list_2 <- sample_list[!sample_list %in% remove_sample] + sample_list_2 <- unique(sample_list_2) + if (length(sample_list_2) == par_sample_size){ + message(paste0("Generating weighted-probabilistic ensemble assignments from ", length(sample_list_2), " Sample IDs.")) + } else { + message(paste0("WARNING: Generating weighted-probabilistic ensemble assignments from ", length(sample_list_2), " Sample IDs. This is not the number of pooled samples defined by the user.")) + } + + ## Compute weighted probability for each sample + for (i in sample_list_2) { + # Doublets + practice_df$doublet <- ifelse(practice_df$vireo_best_assignment == "doublet", practice_df$vireo_weighted_probability, 0) + practice_df$doublet <- ifelse(practice_df$souporcell_best_assignment == "doublet", practice_df$doublet + practice_df$souporcell_weighted_probability, practice_df$doublet+ 0) + practice_df$doublet <- ifelse(practice_df$demuxalot_best_assignment == "doublet", practice_df$doublet + practice_df$demuxalot_weighted_probability, practice_df$doublet+ 0) + practice_df$doublet <- ifelse(practice_df$freemuxlet_best_assignment == "doublet", practice_df$doublet + practice_df$freemuxlet_weighted_probability, practice_df$doublet+ 0) + + # Singlets + practice_df$sample <- ifelse(practice_df$vireo_best_assignment == i, practice_df$vireo_weighted_probability, 0) + practice_df$sample <- ifelse(practice_df$souporcell_best_assignment == i, practice_df$sample + practice_df$souporcell_weighted_probability, practice_df$sample+ 0) + practice_df$sample <- ifelse(practice_df$demuxalot_best_assignment == i, practice_df$sample + practice_df$demuxalot_weighted_probability, practice_df$sample+ 0) + practice_df$sample <- ifelse(practice_df$freemuxlet_best_assignment == i, practice_df$sample + practice_df$freemuxlet_weighted_probability, practice_df$sample+ 0) + colnames(practice_df)[ncol(practice_df)] <- i + } + + ## Select sample assignment with maximum weighted probability for each droplet + data_long <- gather(practice_df, key="ensemblex_assignment", value="stat", 32:ncol(practice_df)) + result <- data_long %>% + dplyr::group_by(barcode) %>% + dplyr::filter(stat == max(stat)) + + ## Get remaining probabilities for non-assigned samples + data_long <- gather(practice_df, 
key="ensemblex_assignment", value="stat", 32:ncol(practice_df)) + result_sum <- data_long %>% + dplyr::group_by(barcode) %>% + dplyr::summarise(total = sum(stat)) + result_test <- merge(result, result_sum, by = "barcode") + + ## Calculate ensemblex probability + result_test$ensemblex_probability <- result_test$stat/result_test$total + + ## Clean up dataframe + result_test <- dplyr::select(result_test, c("barcode", "ensemblex_assignment", "ensemblex_probability", "vireo_assignment", "souporcell_assignment", "freemuxlet_assignment", "demuxalot_assignment", "general_consensus", + "vireo_best_assignment", "souporcell_best_assignment", "freemuxlet_best_assignment", "demuxalot_best_assignment", "vireo_singlet_probability", "vireo_doublet_probability", + "vireo_n_vars", "vireo_best_doublet", "vireo_doublet_logLikRatio", "souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "freemuxlet_n_snps", + "freemuxlet_n_reads", "freemuxlet_max_probability", "freemuxlet_DIFF_LLK_SNG_DBL", "demuxalot_max_probability", "demuxalot_doublet_probability", "vireo_max_probability", + "vireo_weighted_probability", "freemuxlet_weighted_probability", + "demuxalot_weighted_probability", "souporcell_weighted_probability")) + + ## save PWE assignment dataframe + write.csv(result_test, paste(par_output_dir,"/step1",'/step1_cell_assignment.csv', sep="")) + result_test +} + +########################################################################################################################### +# GRAPH-BASED DOUBLET DETECTION +########################################################################################################################### +## FUNCTIONS +graph_based_doublet_detection_estimated_par <- function(result_test, par_expected_doublet_rate, par_output_dir){ + + ## Set seed + set.seed(1234) + + ## create an output directory + dir.create(paste(par_output_dir,"/step2",sep='')) + + ## load Balanced-accuracy dataset + result_2 <- result_test + + ### Perform principal component analysis with select variables + result_2_lim <- result_2 + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, freemuxlet_n_snps, reemuxlet_n_reads, vireo_doublet_logLikRatio, reemuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we dont use this for PCA) + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice = "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- 
data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper = TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Freemuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "freemuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(freemuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Freemuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "freemuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(freemuxlet_n_snps)) + + ## Organize Freemuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "freemuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(freemuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Freemuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(freemuxlet_DIFF_LLK_SNG_DBL)) + freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Freemuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(freemuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Freemuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(freemuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + ################################################################################################################################ + #### parameter sweep #### + 
################################################################################################################################ + ### Compute varying number of confident doublets (nCD) + ## 50 nCD + suspected_doublets_50 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_50){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_50 <- c(suspected_doublets_50, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_50 + } + if(length(suspected_doublets_50) >= 50 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_50 + } + + ## 100 nCD + suspected_doublets_100 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_100){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_100 <- c(suspected_doublets_100, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_100 + } + if(length(suspected_doublets_100) >= 100 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_100 + } + + ## 150 nCD + suspected_doublets_150 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, 
percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_150){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_150 <- c(suspected_doublets_150, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_150 + } + if(length(suspected_doublets_150) >= 150 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_150 + } + + ## 200 nCD + suspected_doublets_200 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_200){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_200 <- c(suspected_doublets_200, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_200 + } + if(length(suspected_doublets_200) >= 200 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_200 + } + + ## 250 nCD + suspected_doublets_250 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_250){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode 
& + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_250 <- c(suspected_doublets_250, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_250 + } + if(length(suspected_doublets_250) >= 250 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_250 + } + + ## 300 nCD + suspected_doublets_300 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_300){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_300 <- c(suspected_doublets_300, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_300 + } + if(length(suspected_doublets_300) >= 300 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_300 + } + + + ## Make a list of percentile threshold (pT) values based on expected doublet rate for the pool + message(paste0("Expected doublet rate is ", par_expected_doublet_rate, "; ", round(par_expected_doublet_rate*nrow(result_2), digits = 0), " droplets" )) + interval <- par_expected_doublet_rate/6 + pT_list <- rev(seq(1-par_expected_doublet_rate,1-interval, by = interval)) + + ## 50 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_50 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_50){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_50 <- rbind(filler_frame_50,distances_test_15) + filler_frame_50 + } + filler_frame_50 + } + + ## 100 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_100 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_100){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + 
colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_100 <- rbind(filler_frame_100,distances_test_15) + filler_frame_100 + } + filler_frame_100 + } + + ## 150 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_150 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_150){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_150 <- rbind(filler_frame_150,distances_test_15) + filler_frame_150 + } + filler_frame_150 + } + + ## 200 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_200 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_200){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_200 <- rbind(filler_frame_200,distances_test_15) + filler_frame_200 + } + filler_frame_200 + } + + ## 250 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_250 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_250){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_250 <- rbind(filler_frame_250,distances_test_15) + filler_frame_250 + } + filler_frame_250 + } + + ## 300 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_300 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_300){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + 
distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_300 <- rbind(filler_frame_300,distances_test_15) + filler_frame_300 + } + filler_frame_300 + } + + ### Compute nearest neighbour frequency and Kutosis of frequency distributions + ## 50 nCD + counts_50 <- filler_frame_50 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_50 <- counts_50 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 50 + pT <- 0 + kurtosis <- 0 + fill_frame_50 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_50$n[counts_50$pT == t]) + nCD <- 50 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_50 <- rbind(fill_frame_50,temp_frame) + fill_frame_50 <- subset(fill_frame_50, pT != 0) + fill_frame_50 + } + + ## 100 nCD + counts_100 <- filler_frame_100 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_100 <- counts_100 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 100 + pT <- 0 + kurtosis <- 0 + fill_frame_100 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_100$n[counts_100$pT == t]) + nCD <- 100 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_100 <- rbind(fill_frame_100,temp_frame) + fill_frame_100 <- subset(fill_frame_100, pT != 0) + fill_frame_100 + } + + ## 150 nCD + counts_150 <- filler_frame_150 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_150 <- counts_150 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 150 + pT <- 0 + kurtosis <- 0 + fill_frame_150 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_150$n[counts_150$pT == t]) + nCD <- 150 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_150 <- rbind(fill_frame_150,temp_frame) + fill_frame_150 <- subset(fill_frame_150, pT != 0) + fill_frame_150 + } + + ## 200 nCD + counts_200 <- filler_frame_200 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_200 <- counts_200 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 200 + pT <- 0 + kurtosis <- 0 + fill_frame_200 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_200$n[counts_200$pT == t]) + nCD <- 200 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_200 <- rbind(fill_frame_200,temp_frame) + fill_frame_200 <- subset(fill_frame_200, pT != 0) + fill_frame_200 + } + + ## 250 nCD + counts_250 <- filler_frame_250 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_250 <- counts_250 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 250 + pT <- 0 + kurtosis <- 0 + fill_frame_250 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_250$n[counts_250$pT == t]) + nCD <- 250 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_250 <- rbind(fill_frame_250,temp_frame) + fill_frame_250 <- subset(fill_frame_250, pT != 0) + fill_frame_250 + } + + ## 300 nCD + counts_300 <- filler_frame_300 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_300 <- counts_300 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 300 + pT <- 0 + kurtosis <- 0 + fill_frame_300 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_300$n[counts_300$pT == t]) + nCD <- 300 + pT <- t + kurtosis <- k + 
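## The counts_* / fill_frame_* blocks above reduce each (nCD, pT) pair to one
## number: the kurtosis of how often each barcode falls inside the pT
## distance-percentile neighbourhood of a confident doublet. A compact version
## of that reduction is sketched below as a toy example; `dist_mat` and
## `confident` are assumed inputs, and kurtosis() is assumed to come from the
## moments package.
library(moments)   # kurtosis()
nn_kurtosis <- function(dist_mat, confident, pT) {
  hits <- unlist(lapply(confident, function(j) {
    d  <- dist_mat[, j]
    pr <- 1 - (rank(d, ties.method = "min") - 1) / (length(d) - 1)  # nearest barcodes get the highest percentile
    rownames(dist_mat)[pr >= pT]
  }))
  moments::kurtosis(as.numeric(table(hits)))   # kurtosis of nearest-neighbour frequencies
}
## e.g. on random 2-D PCA coordinates:
## pcs  <- matrix(rnorm(400), nrow = 200, ncol = 2,
##                dimnames = list(paste0("BC", 1:200), c("PC1", "PC2")))
## dmat <- as.matrix(dist(pcs))
## nn_kurtosis(dmat, confident = c("BC1", "BC2", "BC3"), pT = 0.95)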
temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_300 <- rbind(fill_frame_300,temp_frame) + fill_frame_300 <- subset(fill_frame_300, pT != 0) + fill_frame_300 + } + + ################################################################################################################################ + ### Compute optimal parameters + ################################################################################################################################ + ## Bind all of the Kurtosis frames + kurtosis_bind <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + + ## Compute average Kurtosis for each pT + kurtosis_bind <- kurtosis_bind %>% + dplyr::group_by(pT) %>% + dplyr::summarize(Mean_k = mean(kurtosis, na.rm=TRUE)) + + optimal_pT <- kurtosis_bind$pT[kurtosis_bind$Mean_k == max(kurtosis_bind$Mean_k)] + + ## Plot kurtosis values + ggplot(kurtosis_bind, aes(x = pT, y = Mean_k)) + + geom_vline(xintercept = optimal_pT, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal pT: ", optimal_pT)) + ggsave(paste(par_output_dir,"/step2","/optimal_pT.pdf", sep="")) + + ## find optimal nCD + bind_nCD <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + bind_nCD <- subset(bind_nCD, pT == optimal_pT) + ncD <- data.frame(nCD = c(50, 100, 150, 200, 250, 300)) + ncD_list <- c(50, 100, 150, 200, 250, 300) + + ## Smooth line + fm1 <- smooth.spline(bind_nCD[,"ncD"], bind_nCD[,"kurtosis"], df = 3) + y2 <- predict(fm1, x = ncD) + y <- data.frame(y2$y) + y <- y$nCD + + ## Find elbow of the smoothed curve + df <- data.frame(ncd = ncD_list, kurtosis = y) + optimal_nCD <- find_curve_elbow(df, export_type = "row_num", plot_curve = FALSE) + + ## parse value + optimal_nCD <- df[optimal_nCD, 1] + + ## If cannot find slope, take point preceedinging the largest slope + if (is.na(optimal_nCD)) { + message("Could not determine optimal nCD based on the elbow, taking maximum kurtosis value.") + max <- max(bind_nCD$kurtosis) + optimal_nCD <- bind_nCD$ncD[bind_nCD$kurtosis == max] + #slope= diff(bind_nCD$ncD)/diff(bind_nCD$kurtosis) + #xx <- bind_nCD[which.max(slope),] + #optimal_nCD <- max(xx$ncD) + }else{ + optimal_nCD <- optimal_nCD + message("Determine optimal nCD based on the elbow.") + } + + ## plot + ggplot(bind_nCD, aes(x = ncD, y = kurtosis)) + + geom_vline(xintercept = optimal_nCD, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal nCD: ", optimal_nCD)) + ggsave(paste(par_output_dir,"/step2","/optimal_nCD.pdf", sep="")) + + ################################################################################################################################ + #### Compute graph-based doublet detection with optimal pT and nCD values + ################################################################################################################################ + ## Report optimal parameters + message(paste0("Using ", round(optimal_pT, digits = 4), " as optimal pT value")) + message(paste0("Using ", optimal_nCD, " as optimal nCD value")) + + ### Identify high confidence doublets + suspected_doublets <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- 
subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets <- c(suspected_doublets, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets + } + if(length(suspected_doublets) >= optimal_nCD ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets + } + + ### Identify graph-based suspected doublets + doublet <- "none" + percentile <- 0 + barcode <- "none" + filler_frame <- data.frame(doublet,percentile, barcode ) + + for (j in suspected_doublets){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= optimal_pT,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + + filler_frame <- rbind(filler_frame,distances_test_15) + filler_frame + } + + ## Nearest neighbour frequency of GBD-identified doublets + total_testerquester_count <- filler_frame %>% + dplyr::group_by(barcode) %>% + dplyr::mutate(n = n()) %>% + ungroup() + total_testerquester_count <- total_testerquester_count[!(duplicated(total_testerquester_count$barcode)),] + total_testerquester_count <- total_testerquester_count[order(total_testerquester_count$n, decreasing = TRUE),] + total_testerquester_count_lim <- total_testerquester_count[c(1:nrow(total_testerquester_count)),] + remove_dublets <- subset(total_testerquester_count_lim, barcode != "none") + + ## Plot kurtosis and save plot + den <- density(total_testerquester_count_lim$n) + k <- kurtosis(total_testerquester_count_lim$n) + k <- round(k, digits = 3) + + ### Plot PCA summary + ## PCA prior to doublet detection + result_2_lim <- result_2 + result_2_lim$is_doublet <- "singlet" + result_2_lim$is_doublet[result_2_lim$ensemblex_assignment == "doublet"] <- "doublet" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment prior to graph-based doublet detection") + + scale_colour_manual(values = c( "indianred2", "grey")) + ggsave(paste(par_output_dir,"/step2","/PCA1_graph_based_doublet_detection.pdf", sep="")) + + ## PCA highlighting confident doublets + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% suspected_doublets ] <- "yes" + colnames(result_2_lim) + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), 
] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle(paste0("High confidence doublets (nCD = ", optimal_nCD,")")) + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA2_graph_based_doublet_detection.pdf", sep="")) + + ## PCA after graph baed doublet detection + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% total_testerquester_count$barcode | result_2_lim$ensemblex_assignment == "doublet"] <- "yes" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment after graph-based doublet detection") + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA3_graph_based_doublet_detection.pdf", sep="")) + + ## Label graph-based expected doublets as doublets + result_2 <- result_test + result_2$ensemblex_assignment[result_2$barcode %in% remove_dublets$barcode] <- "doublet" + write.csv(result_2, paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep="")) + + result_2 +} +graph_based_doublet_detection_par_sweep <- function(result_test, par_expected_doublet_rate, par_output_dir){ + + ## Set seed + set.seed(1234) + + ## create an output directory + dir.create(paste(par_output_dir,"/step2",sep='')) + + ## load Balanced-accuracy dataset + result_2 <- result_test + + ### Perform principal component analysis with select variables + result_2_lim <- result_2 + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, freemuxlet_n_snps, freemuxlet_n_reads, vireo_doublet_logLikRatio, freemuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we dont use this for PCA) + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice = "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper 
= TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Freemuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "freemuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(freemuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Freemuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "freemuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(freemuxlet_n_snps)) + + ## Organize Freemuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "freemuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(freemuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Freemuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(freemuxlet_DIFF_LLK_SNG_DBL)) + freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Freemuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(freemuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Freemuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(freemuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + ################################################################################################################################ + #### parameter sweep #### + ################################################################################################################################ + ### Compute varying number of confident doublets (nCD) + ## 50 nCD + suspected_doublets_50 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + 
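## The percentile columns built above share one orientation convention: after
## percent_rank(), the two metrics for which this script treats a low raw value
## as doublet-like (freemuxlet_DIFF_LLK_SNG_DBL and
## souporcell_log_probability_doublet) are flipped with 1 - percentile, so a
## higher percentile always reads as "more doublet-like". Toy illustration with
## made-up values:
library(dplyr)
percent_rank(c(0.1, 0.5, 0.9))        # -> 0.0 0.5 1.0 (kept as is: high raw value is doublet-like)
1 - percent_rank(c(-30, -5, -0.1))    # -> 1.0 0.5 0.0 (flipped: low raw value is doublet-like)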
vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_50){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_50 <- c(suspected_doublets_50, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_50 + } + if(length(suspected_doublets_50) >= 50 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_50 + } + + ## 100 nCD + suspected_doublets_100 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_100){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_100 <- c(suspected_doublets_100, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_100 + } + if(length(suspected_doublets_100) >= 100 ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_100 + } + + ## 150 nCD + suspected_doublets_150 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if 
(i %in% suspected_doublets_150){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_150 <- c(suspected_doublets_150, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_150 + } + if(length(suspected_doublets_150) >= 150 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_150 + } + + ## 200 nCD + suspected_doublets_200 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_200){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_200 <- c(suspected_doublets_200, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_200 + } + if(length(suspected_doublets_200) >= 200 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_200 + } + + ## 250 nCD + suspected_doublets_250 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_250){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_250 <- c(suspected_doublets_250, i) + } else { + 
print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_250 + } + if(length(suspected_doublets_250) >= 250 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_250 + } + + ## 300 nCD + suspected_doublets_300 <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + vireo_doublet_2_percentile <- subset(vireo_doublet_2_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets_300){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% vireo_doublet_2_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets_300 <- c(suspected_doublets_300, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets_300 + } + if(length(suspected_doublets_300) >= 300 ) { #OG = 100 + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets_300 + } + + + ## Make a list of percentile threshold (pT) values based on expected doublet rate for the pool + message(paste0("Expected doublet rate is ", par_expected_doublet_rate, "; ", round(par_expected_doublet_rate*nrow(result_2), digits = 0), " droplets" )) + interval <- par_expected_doublet_rate/6 + pT_list <- rev(seq(1-par_expected_doublet_rate,1-interval, by = interval)) + + ## 50 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_50 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_50){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_50 <- rbind(filler_frame_50,distances_test_15) + filler_frame_50 + } + filler_frame_50 + } + + ## 100 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_100 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_100){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + 
distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_100 <- rbind(filler_frame_100,distances_test_15) + filler_frame_100 + } + filler_frame_100 + } + + ## 150 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_150 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_150){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_150 <- rbind(filler_frame_150,distances_test_15) + filler_frame_150 + } + filler_frame_150 + } + + ## 200 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_200 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_200){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_200 <- rbind(filler_frame_200,distances_test_15) + filler_frame_200 + } + filler_frame_200 + } + + ## 250 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_250 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_250){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_250 <- rbind(filler_frame_250,distances_test_15) + filler_frame_250 + } + filler_frame_250 + } + + ## 300 nCD pT sweep + doublet <- "none" + percentile <- 0 + barcode <- "none" + pT <- 0 + filler_frame_300 <- data.frame(doublet,percentile, barcode,pT ) + + for (t in pT_list){ + for (j in suspected_doublets_300){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= t,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + distances_test_15$pT <- t + + filler_frame_300 <- rbind(filler_frame_300,distances_test_15) + filler_frame_300 + } + filler_frame_300 + } + + ### Compute nearest neighbour frequency and Kutosis 
of frequency distributions + ## 50 nCD + counts_50 <- filler_frame_50 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_50 <- counts_50 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 50 + pT <- 0 + kurtosis <- 0 + fill_frame_50 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_50$n[counts_50$pT == t]) + nCD <- 50 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_50 <- rbind(fill_frame_50,temp_frame) + fill_frame_50 <- subset(fill_frame_50, pT != 0) + fill_frame_50 + } + + ## 100 nCD + counts_100 <- filler_frame_100 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_100 <- counts_100 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 100 + pT <- 0 + kurtosis <- 0 + fill_frame_100 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_100$n[counts_100$pT == t]) + nCD <- 100 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_100 <- rbind(fill_frame_100,temp_frame) + fill_frame_100 <- subset(fill_frame_100, pT != 0) + fill_frame_100 + } + + ## 150 nCD + counts_150 <- filler_frame_150 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_150 <- counts_150 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 150 + pT <- 0 + kurtosis <- 0 + fill_frame_150 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_150$n[counts_150$pT == t]) + nCD <- 150 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_150 <- rbind(fill_frame_150,temp_frame) + fill_frame_150 <- subset(fill_frame_150, pT != 0) + fill_frame_150 + } + + ## 200 nCD + counts_200 <- filler_frame_200 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_200 <- counts_200 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 200 + pT <- 0 + kurtosis <- 0 + fill_frame_200 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_200$n[counts_200$pT == t]) + nCD <- 200 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_200 <- rbind(fill_frame_200,temp_frame) + fill_frame_200 <- subset(fill_frame_200, pT != 0) + fill_frame_200 + } + + ## 250 nCD + counts_250 <- filler_frame_250 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_250 <- counts_250 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 250 + pT <- 0 + kurtosis <- 0 + fill_frame_250 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_250$n[counts_250$pT == t]) + nCD <- 250 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_250 <- rbind(fill_frame_250,temp_frame) + fill_frame_250 <- subset(fill_frame_250, pT != 0) + fill_frame_250 + } + + ## 300 nCD + counts_300 <- filler_frame_300 %>% + group_by(barcode,pT) %>% + dplyr::mutate(n = n()) %>% + ungroup() + + counts_300 <- counts_300 %>% + distinct(pT, barcode, .keep_all = TRUE) + + ncD <- 300 + pT <- 0 + kurtosis <- 0 + fill_frame_300 <- data.frame(ncD,pT, kurtosis) + + for (t in pT_list){ + k <- kurtosis(counts_300$n[counts_300$pT == t]) + nCD <- 300 + pT <- t + kurtosis <- k + temp_frame <- data.frame(ncD,pT, kurtosis) + fill_frame_300 <- rbind(fill_frame_300,temp_frame) + fill_frame_300 <- subset(fill_frame_300, pT != 0) + fill_frame_300 + } + + ### Compute optimal parameters + ## Bind all of the Kurtosis frames + kurtosis_bind <- 
rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + + ## Compute average Kurtosis for each pT + kurtosis_bind <- kurtosis_bind %>% + dplyr::group_by(pT) %>% + dplyr::summarize(Mean_k = mean(kurtosis, na.rm=TRUE)) + + optimal_pT <- kurtosis_bind$pT[kurtosis_bind$Mean_k == max(kurtosis_bind$Mean_k)] + + ## Plot kurtosis values + ggplot(kurtosis_bind, aes(x = pT, y = Mean_k)) + + geom_vline(xintercept = optimal_pT, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal pT: ", optimal_pT)) + ggsave(paste(par_output_dir,"/step2","/optimal_pT.pdf", sep="")) + + ## find optimal nCD + bind_nCD <- rbind(fill_frame_50,fill_frame_100,fill_frame_150,fill_frame_200,fill_frame_250, fill_frame_300) + bind_nCD <- subset(bind_nCD, pT == optimal_pT) + ncD <- data.frame(nCD = c(50, 100, 150, 200, 250, 300)) + ncD_list <- c(50, 100, 150, 200, 250, 300) + + ## Smooth line + fm1 <- smooth.spline(bind_nCD[,"ncD"], bind_nCD[,"kurtosis"], df = 3) + y2 <- predict(fm1, x = ncD) + y <- data.frame(y2$y) + y <- y$nCD + + ## Find elbow of the smoothed curve + df <- data.frame(ncd = ncD_list, kurtosis = y) + optimal_nCD <- find_curve_elbow(df, export_type = "row_num", plot_curve = FALSE) + + ## parse value + optimal_nCD <- df[optimal_nCD, 1] + + ## If cannot find slope, take point preceedinging the largest slope + if (is.na(optimal_nCD)) { + message("Could not determine optimal nCD based on the elbow, taking maximum kurtosis value.") + max <- max(bind_nCD$kurtosis) + optimal_nCD <- bind_nCD$ncD[bind_nCD$kurtosis == max] + #slope= diff(bind_nCD$ncD)/diff(bind_nCD$kurtosis) + #xx <- bind_nCD[which.max(slope),] + #optimal_nCD <- max(xx$ncD) + }else{ + optimal_nCD <- optimal_nCD + message("Determine optimal nCD based on the elbow.") + } + + ## plot + ggplot(bind_nCD, aes(x = ncD, y = kurtosis)) + + geom_vline(xintercept = optimal_nCD, col = "red") + + geom_point() + + geom_line() + + theme_classic() + + ggtitle(paste0("Estimated optimal nCD: ", optimal_nCD)) + ggsave(paste(par_output_dir,"/step2","/optimal_nCD.pdf", sep="")) + +} +graph_based_doublet_detection_manual_par <- function(result_test, par_expected_doublet_rate, par_output_dir){ + + ## Set seed + set.seed(1234) + + ## create an output directory + dir.create(paste(par_output_dir,"/step2",sep='')) + + ## load manually defined optimal parameters + optimal_pT <- par_ensemblex_pT + optimal_nCD <- par_ensemblex_nCD + + ## load Balanced-accuracy dataset + result_2 <- result_test + + ### Perform principal component analysis with select variables + result_2_lim <- result_2 + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13 )] #barcode, vireo_doublet_probability, souporcell_log_probability_doublet, freemuxlet_n_snps, reemuxlet_n_reads, vireo_doublet_logLikRatio, reemuxlet_DIFF_LLK_SNG_DBL, demuxalot_doublet_probability, vireo_singlet_probability (we dont use this for PCA) + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + ## scree plot + fviz_eig(res.pca) + ggsave(paste(par_output_dir,"/step2","/PCA_scree_plot.pdf", sep="")) + + ## PCA + fviz_pca_ind(res.pca, + col.ind = "black", + geom="point", + pointsize = 0.5 + ) + ggsave(paste(par_output_dir,"/step2","/PCA_plot.pdf", sep="")) + + ### variable contribution to variation + ## Plot contributions of variables to PC1 + fviz_contrib(res.pca, choice = "var", axes = 1, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC1") + 
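## The two selections above amount to: take the pT with the largest mean
## kurtosis across nCD settings, then take the nCD at the elbow of a df = 3
## smoothing spline fit to kurtosis versus nCD at that pT, falling back to the
## maximum-kurtosis nCD when no elbow is found. Condensed sketch with toy
## inputs; `sweep_df` stands in for the bound kurtosis frames (columns ncD, pT,
## kurtosis), and find_curve_elbow() is assumed to be pathviewr::find_curve_elbow().
library(dplyr)
library(pathviewr)   # assumed source of find_curve_elbow()
select_pT_nCD <- function(sweep_df, ncd_grid = c(50, 100, 150, 200, 250, 300)) {
  ## pT with the largest mean kurtosis across nCD settings
  mean_k     <- sweep_df %>% dplyr::group_by(pT) %>% dplyr::summarize(Mean_k = mean(kurtosis, na.rm = TRUE))
  optimal_pT <- mean_k$pT[which.max(mean_k$Mean_k)]
  ## elbow of the smoothed kurtosis-vs-nCD curve at that pT
  at_pT    <- subset(sweep_df, pT == optimal_pT)
  spl      <- smooth.spline(at_pT$ncD, at_pT$kurtosis, df = 3)
  smoothed <- data.frame(ncd = ncd_grid, kurtosis = predict(spl, x = ncd_grid)$y)
  elbow_row   <- find_curve_elbow(smoothed, export_type = "row_num", plot_curve = FALSE)
  optimal_nCD <- smoothed$ncd[elbow_row]
  if (length(optimal_nCD) == 0 || is.na(optimal_nCD)) {
    optimal_nCD <- at_pT$ncD[which.max(at_pT$kurtosis)]   # fallback used above: maximum kurtosis
  }
  list(pT = optimal_pT, nCD = optimal_nCD)
}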
ggsave(paste(par_output_dir,"/step2","/PC1_var_contrib.pdf", sep="")) + + ## Plot contributions of variables to PC2 + fviz_contrib(res.pca, choice = "var", axes = 2, top = 10) + theme_classic() + coord_flip() + xlab("Variable") + ggtitle("PC2") + ggsave(paste(par_output_dir,"/step2","/PC2_var_contrib.pdf", sep="")) + + ## Compute euclidean distance between points + rownames(result_2_lim) <- result_2_lim$barcode + res.pca <- prcomp(result_2_lim[,c(2,3,4,5,6,7,8)],scale = T) + df_1 <- data.frame(res.pca$x[,1]) + df_1$barcode <- rownames(df_1) + df_2 <- data.frame(res.pca$x[,2]) + df_2$barcode <- rownames(df_2) + df_merge_test <- merge(df_1, df_2, by = "barcode") + colnames(df_merge_test) <- c("barcode", "PC1", "PC2") + distances <- dist(df_merge_test[c("PC1", "PC2")], diag = TRUE, upper = TRUE) + distances <- as.matrix(distances) + colnames(distances) <- df_merge_test$barcode + rownames(distances) <- df_merge_test$barcode + + ### Organize parameters to identify most likely doublets + ## Organize Vireo doublet log_lik into ordered frame + vireo_doublet_df <- result_2 %>% select("barcode", "vireo_doublet_logLikRatio") + vireo_doublet_df <- vireo_doublet_df %>% arrange(desc(vireo_doublet_logLikRatio)) + + ## Organize Vireo doublet probability + vireo_doublet_2_df <- result_2 %>% select("barcode", "vireo_doublet_probability") + vireo_doublet_2_df <- vireo_doublet_2_df %>% arrange(desc(vireo_doublet_probability)) + + ## Organize Freemuxlet "freemuxlet_DIFF_LLK_SNG_DBL" in ordered frame + freemuxlet_doublet_df <- result_2 %>% select("barcode", "freemuxlet_DIFF_LLK_SNG_DBL") + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% arrange(freemuxlet_DIFF_LLK_SNG_DBL) + + ## Organize Souporcell log prob doublet + souporcell_doublet_df <- result_2 %>% select("barcode", "souporcell_log_probability_doublet") + souporcell_doublet_df <- souporcell_doublet_df %>% arrange(souporcell_log_probability_doublet) + + ## Organize Freemuxlet num snp + freemuxlet_snp_df <- result_2 %>% select("barcode", "freemuxlet_n_snps") + freemuxlet_snp_df <- freemuxlet_snp_df %>% arrange(desc(freemuxlet_n_snps)) + + ## Organize Freemuxlet num reads + freemuxlet_reads_df <- result_2 %>% select("barcode", "freemuxlet_n_reads") + freemuxlet_reads_df <- freemuxlet_reads_df %>% arrange(desc(freemuxlet_n_reads)) + + ## Organize Demuxalot probability + demuxalot_doublet_df <- result_2 %>% select("barcode", "demuxalot_doublet_probability") + demuxalot_doublet_df <- demuxalot_doublet_df %>% arrange(desc(demuxalot_doublet_probability)) + + ### Organize metrics by percentile + ## Vireo diff + vireo_doublet_df <- vireo_doublet_df %>% + mutate(percentile = percent_rank(vireo_doublet_logLikRatio)) + + ## Vireo doublet probability + vireo_doublet_2_df <- vireo_doublet_2_df %>% + mutate(percentile = percent_rank(vireo_doublet_probability)) + + ## Freemuxlet + freemuxlet_doublet_df <- freemuxlet_doublet_df %>% + mutate(percentile = percent_rank(freemuxlet_DIFF_LLK_SNG_DBL)) + freemuxlet_doublet_df$percentile <- 1 - freemuxlet_doublet_df$percentile + + ## Freemuxlet num reads + freemuxlet_reads_df <- freemuxlet_reads_df %>% + mutate(percentile = percent_rank(freemuxlet_n_reads)) + freemuxlet_reads_df$percentile <- freemuxlet_reads_df$percentile + + ## Freemuxlet num snps + freemuxlet_snp_df <- freemuxlet_snp_df %>% + mutate(percentile = percent_rank(freemuxlet_n_snps)) + freemuxlet_snp_df$percentile <- freemuxlet_snp_df$percentile + + ## Souporcell doublet prob + souporcell_doublet_df <- souporcell_doublet_df %>% + mutate(percentile = 
percent_rank(souporcell_log_probability_doublet)) + souporcell_doublet_df$percentile <- 1 - souporcell_doublet_df$percentile + + ## Demuxalot doublet prob + demuxalot_doublet_df <- demuxalot_doublet_df %>% + mutate(percentile = percent_rank(demuxalot_doublet_probability)) + demuxalot_doublet_df$percentile <- demuxalot_doublet_df$percentile + + + ################################################################################################################################ + #### Compute graph-based doublet detection with optimal pT and nCD values + ################################################################################################################################ + ## Report optimal parameters + message(paste0("Using ", round(optimal_pT, digits = 4), " as optimal pT value")) + message(paste0("Using ", optimal_nCD, " as optimal nCD value")) + + ### Identify high confidence doublets + suspected_doublets <- list() + s1 <- rev(seq(0, 1, by = 0.01)) + for (j in s1){ + vireo_doublet_percentile <- subset(vireo_doublet_df, percentile >= j) + freemuxlet_doublet_percentile <- subset(freemuxlet_doublet_df, percentile >= j) + souporcell_doublet_percentile <- subset(souporcell_doublet_df, percentile >= j) + freemuxlet_snp_percentile <- subset(freemuxlet_snp_df, percentile >= j) + freemuxlet_reads_percentile <- subset(freemuxlet_reads_df, percentile >= j) + demuxalot_doublet_percentile <- subset(demuxalot_doublet_df, percentile >= j) + + for (i in result_2$barcode){ + if (i %in% suspected_doublets){ + print("Barcode already doublet") + } else if (i %in% vireo_doublet_percentile$barcode & + i %in% freemuxlet_doublet_percentile$barcode & + i %in% souporcell_doublet_percentile$barcode & + i %in% freemuxlet_snp_percentile$barcode & + i %in% freemuxlet_reads_percentile$barcode & + i %in% demuxalot_doublet_percentile$barcode) { + print(paste0("adding barcode to doublet list at ", j, "percentile")) + suspected_doublets <- c(suspected_doublets, i) + } else { + print(paste0("barcode not a suspected doublet at", j, "percentile")) + } + suspected_doublets + } + if(length(suspected_doublets) >= optimal_nCD ) { + print(paste0("reached expected doublet rate at", j, "percentile")) + break + } + suspected_doublets + } + + ### Identify graph-based suspected doublets + doublet <- "none" + percentile <- 0 + barcode <- "none" + filler_frame <- data.frame(doublet,percentile, barcode ) + + for (j in suspected_doublets){ + distances_test_1 <-distances + distances_test_1 <- data.frame(distances_test_1[,j]) + colnames(distances_test_1) <- "doublet" + + distances_test_1 <- distances_test_1 %>% + mutate(percentile = percent_rank(doublet)) + distances_test_1$percentile <- 1 - distances_test_1$percentile + + distances_test_15 <- distances_test_1[distances_test_1$percentile >= optimal_pT,] + distances_test_15$barcode <- rownames(distances_test_15) + distances_test_15$doublet <- "temp" + + filler_frame <- rbind(filler_frame,distances_test_15) + filler_frame + } + + ## Nearest neighbour frequency of GBD-identified doublets + total_testerquester_count <- filler_frame %>% + dplyr::group_by(barcode) %>% + dplyr::mutate(n = n()) %>% + ungroup() + total_testerquester_count <- total_testerquester_count[!(duplicated(total_testerquester_count$barcode)),] + total_testerquester_count <- total_testerquester_count[order(total_testerquester_count$n, decreasing = TRUE),] + total_testerquester_count_lim <- total_testerquester_count[c(1:nrow(total_testerquester_count)),] + remove_dublets <- subset(total_testerquester_count_lim, barcode != 
"none") + + ## Plot kurtosis and save plot + den <- density(total_testerquester_count_lim$n) + k <- kurtosis(total_testerquester_count_lim$n) + k <- round(k, digits = 3) + + ### Plot PCA summary + ## PCA prior to doublet detection + result_2_lim <- result_2 + result_2_lim$is_doublet <- "singlet" + result_2_lim$is_doublet[result_2_lim$ensemblex_assignment == "doublet"] <- "doublet" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment prior to graph-based doublet detection") + + scale_colour_manual(values = c( "indianred2", "grey")) + ggsave(paste(par_output_dir,"/step2","/PCA1_graph_based_doublet_detection.pdf", sep="")) + + ## PCA highlighting confident doublets + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% suspected_doublets ] <- "yes" + colnames(result_2_lim) + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle(paste0("High confidence doublets (nCD = ", optimal_nCD,")")) + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA2_graph_based_doublet_detection.pdf", sep="")) + + ## PCA after graph baed doublet detection + result_2_lim <- result_2 + result_2_lim$is_confident_doublet <- "no" + result_2_lim$is_confident_doublet[result_2_lim$barcode %in% total_testerquester_count$barcode | result_2_lim$ensemblex_assignment == "doublet"] <- "yes" + result_2_lim <- result_2_lim[,c(1, 14, 19, 20, 21, 17, 23, 25, 13, 31)] + result_2_lim <- result_2_lim[complete.cases(result_2_lim), ] + res.pca <- prcomp(result_2_lim[,c(2:8)],scale = T) + + fviz_pca_ind(res.pca, + col.ind = result_2_lim$is_confident_doublet, + geom="point", + pointsize = 0.5 + ) + ggtitle("Assignment after graph-based doublet detection") + + scale_colour_manual(values = c("grey", "indianred2")) + ggsave(paste(par_output_dir,"/step2","/PCA3_graph_based_doublet_detection.pdf", sep="")) + + ## Label graph-based expected doublets as doublets + result_2 <- result_test + result_2$ensemblex_assignment[result_2$barcode %in% remove_dublets$barcode] <- "doublet" + write.csv(result_2, paste(par_output_dir,"/step2",'/Step2_cell_assignment.csv', sep="")) + + result_2 +} + + + +########################################################################################################################### +# ENSEMBLE-INDEPENDENT DOUBLET DETECTION +########################################################################################################################### +## FUNCTIONS +ensemble_independent_doublet_detections <- function(result_2, par_output_dir){ + ## Set seed + set.seed(1234) + + ## Create an output directory + dir.create(paste(par_output_dir,"/step3",sep='')) + + expected_doublets <- nrow(result_2)*par_expected_doublet_rate + + ### Proportion agreement bar plot with threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, freemuxlet_best_assignment, 
souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Freemuxlet + result_2_freemuxlet <- result_2[result_2$freemuxlet_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Freemuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + 
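## Each of the four blocks above asks the same question: for the droplets one
## tool calls doublets, how many of the other three tools agree? A compact
## equivalent is sketched below; `calls` is an assumed data.frame with one
## assignment column per tool ("doublet" or a sample label), not an object
## created by this script.
library(dplyr)
doublet_agreement <- function(calls) {
  tools <- colnames(calls)
  dplyr::bind_rows(lapply(tools, function(tool) {
    others <- calls[calls[[tool]] == "doublet", setdiff(tools, tool), drop = FALSE]
    data.frame(tool = rep(tool, nrow(others)),
               sum  = rowSums(others == "doublet"))   # 0-3 agreeing tools per doublet
  })) %>%
    dplyr::count(tool, sum)                           # doublet counts per agreement level
}
## e.g.:
## calls <- data.frame(Demuxalot  = c("doublet", "S1", "doublet"),
##                     Vireo      = c("doublet", "S1", "S2"),
##                     Freemuxlet = c("doublet", "doublet", "S2"),
##                     Souporcell = c("doublet", "S1", "S2"))
## doublet_agreement(calls)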
ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_threshold.pdf", sep="")) + + ### Proportion agreement bar plot without threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_best_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_best_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_best_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Freemuxlet + result_2_freemuxlet <- result_2[result_2$freemuxlet_best_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Freemuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", 
position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets (no threshold)") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_no_threshold.pdf", sep="")) + + ### Number of ensemblex droplets with EID of each tool with threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Freemuxlet + result_2_temp_freemuxlet <- result_2 + result_2_temp_freemuxlet$ensemblex_assignment[result_2_temp_freemuxlet$freemuxlet_assignment == "doublet"] <- "doublet" + n_freemuxlet <- result_2_temp_freemuxlet[result_2_temp_freemuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Freemuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_freemuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_threshold.pdf", sep="")) + + ### number of ensemblex droplets with EID of each tool with out threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_best_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_best_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Freemuxlet + result_2_temp_freemuxlet <- result_2 + result_2_temp_freemuxlet$ensemblex_assignment[result_2_temp_freemuxlet$freemuxlet_best_assignment == "doublet"] <- "doublet" + n_freemuxlet <- result_2_temp_freemuxlet[result_2_temp_freemuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_best_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", 
"Vireo", "Souporcell", "Freemuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_freemuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_no_threshold.pdf", sep="")) + + #################################################################################################################################################################################### + ## Ensemble inpendent doublet detection ## + #################################################################################################################################################################################### + ## Threshold + ## If Souporcell says doublet do doublet + if ((tolower(par_doublet_Souporcell_threshold))=="yes"){ + message("Labelling all doublets identified by Souporcell (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$souporcell_assignment == "doublet"] <- "doublet" + } + ## If Freemuxlet says doublet do doublet + if ((tolower(par_doublet_Freemuxlet_threshold))=="yes"){ + message("Labelling all doublets identified by Freemuxlet (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$freemuxlet_assignment == "doublet"] <- "doublet" + } + ## If Vireo says doublet do doublet + if ((tolower(par_doublet_Vireo_threshold))=="yes"){ + message("Labelling all doublets identified by Vireo (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$vireo_assignment == "doublet"] <- "doublet" + } + ## If Demuxalot says doublet do doublet + if ((tolower(par_doublet_Demuxalot_threshold))=="yes") { + message("Labelling all doublets identified by Demuxalot (threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxalot_assignment == "doublet"] <- "doublet" + } + + ### No threshold + ## If Souporcell says doublet do doublet + if ((tolower(par_doublet_Souporcell_no_threshold))=="yes") { + message("Labelling all doublets identified by Souporcell (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$souporcell_best_assignment == "doublet"] <- "doublet" + } + ## If Freemuxlet says doublet do doublet + if ((tolower(par_doublet_Freemuxlet_no_threshold))=="yes") { + message("Labelling all doublets identified by Freemuxlet (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$freemuxlet_best_assignment == "doublet"] <- "doublet" + } + ## If Vireo says doublet do doublet + if ((tolower(par_doublet_Vireo_no_threshold))=="yes") { + message("Labelling all doublets identified by Vireo (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$vireo_best_assignment == "doublet"] <- "doublet" + } + ## If Demuxalot says doublet do doublet + if ((tolower(par_doublet_Demuxalot_no_threshold))=="yes") { + message("Labelling all doublets identified by Demuxalot (no threshold) as doublets.") + result_2$ensemblex_assignment[result_2$demuxalot_best_assignment == "doublet"] <- "doublet" + } + + ## Write csv file + write.csv(result_2, paste(par_output_dir,"/step3",'/Step3_cell_assignment.csv', sep="")) + + result_2 +} +ensemble_independent_doublet_detections_prelim <- function(result_2, par_output_dir){ + ## Set seed + set.seed(1234) + + ## Create an 
output directory + dir.create(paste(par_output_dir,"/step3",sep='')) + + expected_doublets <- nrow(result_2)*par_expected_doublet_rate + + ### Proportion agreement bar plot with threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Freemuxlet + result_2_freemuxlet <- result_2[result_2$freemuxlet_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- "Freemuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + 
geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_threshold.pdf", sep="")) + + ### Proportion agreement bar plot without threshold + ## Demuxalot + result_2_demuxalot <- result_2[result_2$demuxalot_best_assignment == "doublet",] + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(vireo_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_demuxalot$one <- 0 + result_2_demuxalot$one[result_2_demuxalot$vireo_best_assignment == "doublet"] <- 1 + result_2_demuxalot$two <- 0 + result_2_demuxalot$two[result_2_demuxalot$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_demuxalot$three <- 0 + result_2_demuxalot$three[result_2_demuxalot$souporcell_best_assignment == "doublet"] <- 1 + result_2_demuxalot <- result_2_demuxalot %>% dplyr::select(one, two, three) + result_2_demuxalot$sum <- rowSums(result_2_demuxalot) + result_2_demuxalot$tool <- "Demuxalot" + ## Vireo + result_2_vireo <- result_2[result_2$vireo_best_assignment == "doublet",] + result_2_vireo <- result_2_vireo %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, souporcell_best_assignment) + result_2_vireo$one <- 0 + result_2_vireo$one[result_2_vireo$demuxalot_best_assignment == "doublet"] <- 1 + result_2_vireo$two <- 0 + result_2_vireo$two[result_2_vireo$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_vireo$three <- 0 + result_2_vireo$three[result_2_vireo$souporcell_best_assignment == "doublet"] <- 1 + result_2_vireo <- result_2_vireo %>% dplyr::select(one, two, three) + result_2_vireo$sum <- rowSums(result_2_vireo) + result_2_vireo$tool <- "Vireo" + ## Souporcell + result_2_souporcell <- result_2[result_2$souporcell_best_assignment == "doublet",] + result_2_souporcell <- result_2_souporcell %>% dplyr::select(demuxalot_best_assignment, freemuxlet_best_assignment, vireo_best_assignment) + result_2_souporcell$one <- 0 + result_2_souporcell$one[result_2_souporcell$demuxalot_best_assignment == "doublet"] <- 1 + result_2_souporcell$two <- 0 + result_2_souporcell$two[result_2_souporcell$freemuxlet_best_assignment == "doublet"] <- 1 + result_2_souporcell$three <- 0 + result_2_souporcell$three[result_2_souporcell$vireo_best_assignment == "doublet"] <- 1 + result_2_souporcell <- result_2_souporcell %>% dplyr::select(one, two, three) + result_2_souporcell$sum <- rowSums(result_2_souporcell) + result_2_souporcell$tool <- "Souporcell" + ## Freemuxlet + result_2_freemuxlet <- result_2[result_2$freemuxlet_best_assignment == "doublet",] + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(demuxalot_best_assignment, souporcell_best_assignment, vireo_best_assignment) + result_2_freemuxlet$one <- 0 + result_2_freemuxlet$one[result_2_freemuxlet$demuxalot_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$two <- 0 + result_2_freemuxlet$two[result_2_freemuxlet$souporcell_best_assignment == "doublet"] <- 1 + result_2_freemuxlet$three <- 0 + result_2_freemuxlet$three[result_2_freemuxlet$vireo_best_assignment == "doublet"] <- 1 + result_2_freemuxlet <- result_2_freemuxlet %>% dplyr::select(one, two, three) + result_2_freemuxlet$sum <- rowSums(result_2_freemuxlet) + result_2_freemuxlet$tool <- 
"Freemuxlet" + + ## merge and plot + merge_df <- rbind(result_2_demuxalot, result_2_vireo, result_2_souporcell, result_2_freemuxlet) + df2 <- merge_df %>% group_by(tool, sum) %>% count() + df2 <- df2 %>% + group_by(tool) %>% + mutate(label_y = cumsum(n) - 0.5 * n) + ggplot(df2, aes(x = tool, y = n, fill = as.character(sum), group = tool)) + + geom_bar(stat = "identity", position = "stack") +theme_classic() + + geom_text(aes( y = label_y, label = n), vjust = 1.5, colour = "black") + + geom_hline(yintercept = expected_doublets)+ + xlab("Demultiplexing tool") + + ylab("Number of doublets (no threshold)") + + scale_fill_manual(values = c("lightgrey", "#ffb9b9", "#ee7272", "#a31818")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Doublet_overlap_no_threshold.pdf", sep="")) + + ### Number of ensemblex droplets with EID of each tool with threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Freemuxlet + result_2_temp_freemuxlet <- result_2 + result_2_temp_freemuxlet$ensemblex_assignment[result_2_temp_freemuxlet$freemuxlet_assignment == "doublet"] <- "doublet" + n_freemuxlet <- result_2_temp_freemuxlet[result_2_temp_freemuxlet$ensemblex_assignment == "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Freemuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_freemuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_threshold.pdf", sep="")) + + ### number of ensemblex droplets with EID of each tool with out threshold + ## Souporcell + result_2_temp_souporcell <- result_2 + result_2_temp_souporcell$ensemblex_assignment[result_2_temp_souporcell$souporcell_best_assignment == "doublet"] <- "doublet" + n_souporcell <- result_2_temp_souporcell[result_2_temp_souporcell$ensemblex_assignment == "doublet",] %>% nrow() + + ## Vireo + result_2_temp_vireo <- result_2 + result_2_temp_vireo$ensemblex_assignment[result_2_temp_vireo$vireo_best_assignment == "doublet"] <- "doublet" + n_vireo <- result_2_temp_vireo[result_2_temp_vireo$ensemblex_assignment == "doublet",] %>% nrow() + + ## Freemuxlet + result_2_temp_freemuxlet <- result_2 + result_2_temp_freemuxlet$ensemblex_assignment[result_2_temp_freemuxlet$freemuxlet_best_assignment == "doublet"] <- "doublet" + n_freemuxlet <- result_2_temp_freemuxlet[result_2_temp_freemuxlet$ensemblex_assignment 
== "doublet",] %>% nrow() + + ## Demuxalot + result_2_temp_demuxalot <- result_2 + result_2_temp_demuxalot$ensemblex_assignment[result_2_temp_demuxalot$demuxalot_best_assignment == "doublet"] <- "doublet" + n_demuxalot <- result_2_temp_demuxalot[result_2_temp_demuxalot$ensemblex_assignment == "doublet",] %>% nrow() + + ## Plot + df <- data.frame(Tool = c("Demuxalot", "Vireo", "Souporcell", "Freemuxlet"), + n_doublets = c(n_demuxalot, n_vireo, n_souporcell, n_freemuxlet)) + + ggplot(df, aes(x = Tool, y = n_doublets, label = n_doublets, fill = Tool)) + + geom_bar(stat = "identity") +theme_classic() + + geom_hline(yintercept = expected_doublets)+ + geom_text() + + xlab("Demultiplexing tool") + + ylab("Number of doublets (threshold)") + + scale_fill_manual(values = c("#d95f02", "#e6ab02", "#7570b3", "#66a61e")) + + theme(legend.position = "right") + ggsave(paste(par_output_dir,"/step3","/Number_ensemblex_doublets_EID_no_threshold.pdf", sep="")) +} + + +########################################################################################################################### +# CONFIDENCE SCORE +########################################################################################################################### + +## FUNCTION +confidence_score <- function(result_2, par_output_dir, par_sample_size){ + + ## Set seed + set.seed(1234) + + ## Create an output directory + dir.create(paste(par_output_dir,"/confidence",sep='')) + + #### Calculate AUC singlet detection using consensus cells as proxy for ground truth + ### Vireo + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == freemuxlet_best_assignment & + souporcell_best_assignment == demuxalot_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$vireo_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. + temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Vireo AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Vireo AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$vireo_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + vireo_AUC <- roc_empirical$AUC + vireo_AUC_singlet <- vireo_AUC + + } else { + print(paste0("Insufficient droplets to compute Vireo AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Vireo AUC as 0.5 for confidence score computation.")) + vireo_AUC_singlet <- 0.5 + } + + ### Freemuxlet + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == vireo_best_assignment & + souporcell_best_assignment == demuxalot_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$freemuxlet_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. 
+ temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Freemuxlet AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Freemuxlet AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$freemuxlet_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + freemuxlet_AUC <- roc_empirical$AUC + freemuxlet_AUC_singlet <- freemuxlet_AUC + + } else { + print(paste0("Insufficient droplets to compute Freemuxlet AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Freemuxlet AUC as 0.5 for confidence score computation.")) + freemuxlet_AUC_singlet <- 0.5 + } + + ### Demuxalot + eval_df <- result_2 + eval_df_lim <- subset(eval_df, souporcell_best_assignment == vireo_best_assignment & + souporcell_best_assignment == freemuxlet_best_assignment & + souporcell_best_assignment != "unassigned" & + souporcell_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$demuxalot_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. + temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Demuxalot AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Demuxalot AUC.") + } + + roc_empirical <- rocit(score = log(eval_df_lim$demuxalot_max_probability), class = eval_df_lim$consensus_eval_ROC, + negref = "bad") + print(summary(roc_empirical)) + + demuxalot_AUC <- roc_empirical$AUC + demuxalot_AUC_singlet <- demuxalot_AUC + + } else { + print(paste0("Insufficient droplets to compute Demuxalot AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Demuxalot AUC as 0.5 for confidence score computation.")) + demuxalot_AUC_singlet <- 0.5 + } + + ### Souporcell + eval_df <- result_2 + eval_df_lim <- subset(eval_df, demuxalot_best_assignment == vireo_best_assignment & + demuxalot_best_assignment == freemuxlet_best_assignment & + demuxalot_best_assignment != "unassigned" & + demuxalot_best_assignment != "doublet") + + eval_df_lim$consensus_eval_ROC <- "bad" + eval_df_lim$consensus_eval_ROC[eval_df_lim$demuxalot_best_assignment == eval_df_lim$souporcell_best_assignment] <- "good" + + ## Check if we have sufficient values to compute AUC. 
+ temp_neg = subset(eval_df_lim, consensus_eval_ROC == "bad") + neg <- nrow(temp_neg) + temp_pos = subset(eval_df_lim, consensus_eval_ROC == "good") + pos <- nrow(temp_pos) + + if (pos != 0 & neg != 0){ + if (neg <=(0.01*pos) | pos <=(0.01*neg) ){ + print("Limited droplets obtained to compute Souporcell AUC; results may not be reflective of true AUC.") + } else { + print("Sufficient droplets obtained to compute Souporcell AUC.") + } + + roc_empirical <- rocit(score = log(1-(10^(eval_df_lim$souporcell_log_probability_singlet))), class = eval_df_lim$consensus_eval_ROC, ##change_here + negref = "bad") + print(summary(roc_empirical)) + souporcell_AUC <- roc_empirical$AUC + souporcell_AUC_singlet <- souporcell_AUC + + } else { + print(paste0("Insufficient droplets to compute Souporcell AUC. Observed ", neg, " incorrectly classified droplets and ", pos, " correctly classified droplets. Setting Souporcell AUC as 0.5 for confidence score computation.")) + souporcell_AUC_singlet <- 0.5 + } + + ### Compute ensemblex singlet confidence + eval_df$ensemblex_singlet_confidence <- eval_df$ensemblex_probability + + ## Vireo + eval_df$ensemblex_singlet_confidence[eval_df$vireo_singlet_probability >= 0.9 & eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_singlet_probability >= 0.9 & eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment == eval_df$ensemblex_assignment] + vireo_AUC_singlet + + ## Demuxalot + eval_df$ensemblex_singlet_confidence[eval_df$demuxalot_max_probability >= 0.9 & eval_df$demuxalot_best_assignment != "doublet" & + eval_df$demuxalot_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$demuxalot_max_probability >= 0.9 & eval_df$demuxalot_best_assignment != "doublet" & + eval_df$demuxalot_best_assignment == eval_df$ensemblex_assignment] + demuxalot_AUC_singlet + + ## Freemuxlet + eval_df$ensemblex_singlet_confidence[eval_df$freemuxlet_assignment != "unassigned" & eval_df$freemuxlet_best_assignment != "doublet" & + eval_df$freemuxlet_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$freemuxlet_assignment != "unassigned" & eval_df$freemuxlet_best_assignment != "doublet" & + eval_df$freemuxlet_best_assignment == eval_df$ensemblex_assignment] + freemuxlet_AUC_singlet + + ## Souporcell + eval_df$ensemblex_singlet_confidence[eval_df$souporcell_assignment != "unassigned" & eval_df$souporcell_best_assignment != "doublet" & + eval_df$souporcell_best_assignment == eval_df$ensemblex_assignment] <- eval_df$ensemblex_singlet_confidence[eval_df$souporcell_assignment != "unassigned" & eval_df$souporcell_best_assignment != "doublet" & + eval_df$souporcell_best_assignment == eval_df$ensemblex_assignment] + souporcell_AUC_singlet + + ## Unassignable cells + eval_df$ensemblex_singlet_confidence[eval_df$vireo_n_vars == 0] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_n_vars == 0]/par_sample_size + + ## Consensus + eval_df$ensemblex_singlet_confidence[eval_df$vireo_best_assignment == eval_df$demuxalot_best_assignment & + eval_df$vireo_best_assignment == eval_df$freemuxlet_best_assignment & + eval_df$vireo_best_assignment == eval_df$souporcell_best_assignment & + eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment != "unassigned"] <- eval_df$ensemblex_singlet_confidence[eval_df$vireo_best_assignment == eval_df$demuxalot_best_assignment & +
eval_df$vireo_best_assignment == eval_df$freemuxlet_best_assignment & + eval_df$vireo_best_assignment == eval_df$souporcell_best_assignment & + eval_df$vireo_best_assignment != "doublet" & + eval_df$vireo_best_assignment != "unassigned"] +1 + + ## Set ensemblex best assignment + eval_df$ensemblex_best_assignment <- eval_df$ensemblex_assignment + ## Set ensemblex assignment + eval_df$ensemblex_assignment[eval_df$ensemblex_singlet_confidence < 1 & eval_df$ensemblex_assignment != "doublet" ] <- "unassigned" + + + eval_df <- dplyr::select(eval_df, c("barcode", "ensemblex_assignment","ensemblex_best_assignment", "ensemblex_probability", "ensemblex_singlet_confidence" , "vireo_assignment","souporcell_assignment","freemuxlet_assignment","demuxalot_assignment","general_consensus" , + "vireo_best_assignment","souporcell_best_assignment","freemuxlet_best_assignment","demuxalot_best_assignment","vireo_singlet_probability","vireo_doublet_probability","vireo_n_vars","vireo_best_doublet", + "vireo_doublet_logLikRatio","souporcell_log_probability_singlet", "souporcell_log_probability_doublet", "freemuxlet_n_snps", "freemuxlet_n_reads","freemuxlet_max_probability", "freemuxlet_DIFF_LLK_SNG_DBL","demuxalot_max_probability" , + "demuxalot_doublet_probability", "vireo_max_probability","vireo_weighted_probability","freemuxlet_weighted_probability","demuxalot_weighted_probability" , + "souporcell_weighted_probability" )) + + write.csv(eval_df, paste(par_output_dir,"/confidence",'/ensemblex_final_cell_assignment.csv', sep="")) + eval_df +} diff --git a/ensemblex.pip/nogt/scripts/ensemblexing/pipeline_ensemblexing.sh b/ensemblex.pip/nogt/scripts/ensemblexing/pipeline_ensemblexing.sh new file mode 100644 index 0000000..343cff1 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/ensemblexing/pipeline_ensemblexing.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini +if [ -d $OUTPUT_DIR/ensemblex ]; then + rm -rf $OUTPUT_DIR/ensemblex + mkdir -p $OUTPUT_DIR/ensemblex +else + mkdir -p $OUTPUT_DIR/ensemblex +fi + + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step ensemblex No-GT submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* PAR_ensemblex_sample_size $PAR_ensemblex_sample_size" +echo "* PAR_ensemblex_expected_doublet_rate $PAR_ensemblex_expected_doublet_rate" +echo "* PAR_ensemblex_merge_constituents $PAR_ensemblex_merge_constituents" +echo "* PAR_ensemblex_probabilistic_weighted_ensemble $PAR_ensemblex_probabilistic_weighted_ensemble" +echo "* PAR_ensemblex_preliminary_parameter_sweep $PAR_ensemblex_preliminary_parameter_sweep" +echo "* PAR_ensemblex_graph_based_doublet_detection $PAR_ensemblex_graph_based_doublet_detection" +echo "* PAR_ensemblex_preliminary_ensemble_independent_doublet $PAR_ensemblex_preliminary_ensemble_independent_doublet" +echo "* PAR_ensemblex_independent_doublet $PAR_ensemblex_independent_doublet" +echo "* PAR_ensemblex_doublet_Demuxalot_threshold $PAR_ensemblex_doublet_Demuxalot_threshold" +echo "* PAR_ensemblex_doublet_Demuxalot_no_threshold $PAR_ensemblex_doublet_Demuxalot_no_threshold" 
+echo "* PAR_ensemblex_doublet_Freemuxlet_threshold $PAR_ensemblex_doublet_Freemuxlet_threshold" +echo "* PAR_ensemblex_doublet_Freemuxlet_no_threshold $PAR_ensemblex_doublet_Freemuxlet_no_threshold" +echo "* PAR_ensemblex_doublet_Souporcell_threshold $PAR_ensemblex_doublet_Souporcell_threshold" +echo "* PAR_ensemblex_doublet_Souporcell_no_threshold $PAR_ensemblex_doublet_Souporcell_no_threshold" +echo "* PAR_ensemblex_doublet_Vireo_threshold $PAR_ensemblex_doublet_Vireo_threshold" +echo "* PAR_ensemblex_doublet_Vireo_no_threshold $PAR_ensemblex_doublet_Vireo_no_threshold" +echo "* PAR_ensemblex_compute_singlet_confidence $PAR_ensemblex_compute_singlet_confidence" +echo "* PAR_ensemblex_nCD $PAR_ensemblex_nCD" +echo "* PAR_ensemblex_pT $PAR_ensemblex_pT" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# + +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} gunzip $OUTPUT_DIR/freemuxlet/outs.clust1.samples.gz + + +echo "Start of ensemblexing" +$CONTAINER_CMD exec --bind $OUTPUT_DIR,$PIPELINE_HOME ${CONTAINER1} Rscript $PIPELINE_HOME/nogt/scripts/ensemblexing/ensemblexing_nogt.R $PIPELINE_HOME $OUTPUT_DIR $PAR_ensemblex_sample_size $PAR_ensemblex_expected_doublet_rate $PAR_ensemblex_merge_constituents $PAR_ensemblex_probabilistic_weighted_ensemble $PAR_ensemblex_preliminary_parameter_sweep $PAR_ensemblex_graph_based_doublet_detection $PAR_ensemblex_preliminary_ensemble_independent_doublet $PAR_ensemblex_independent_doublet $PAR_ensemblex_doublet_Demuxalot_threshold $PAR_ensemblex_doublet_Demuxalot_no_threshold $PAR_ensemblex_doublet_Freemuxlet_threshold $PAR_ensemblex_doublet_Freemuxlet_no_threshold $PAR_ensemblex_doublet_Souporcell_threshold $PAR_ensemblex_doublet_Souporcell_no_threshold $PAR_ensemblex_doublet_Vireo_threshold $PAR_ensemblex_doublet_Vireo_no_threshold $PAR_ensemblex_compute_singlet_confidence $PAR_ensemblex_nCD $PAR_ensemblex_pT +echo "End of ensemblexing" + +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} gzip $OUTPUT_DIR/freemuxlet/outs.clust1.samples + +exit 0 diff --git a/ensemblex.pip/nogt/scripts/freemuxlet/pipeline_freemuxlet.sh b/ensemblex.pip/nogt/scripts/freemuxlet/pipeline_freemuxlet.sh new file mode 100644 index 0000000..424fc57 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/freemuxlet/pipeline_freemuxlet.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +if [ -d $OUTPUT_DIR/ensemblex ]; then + rm -rf $OUTPUT_DIR/freemuxlet + mkdir -p $OUTPUT_DIR/freemuxlet +else + mkdir -p $OUTPUT_DIR/freemuxlet +fi + +#----------------------------------------------------------------# +# # +# INITIALIZE VARIABLES # +# # +#----------------------------------------------------------------# +echo "-------------------------------------------" +echo "* step Freemuxlet No-GT submitted at `date +%FT%H.%M.%S`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: $PIPELINE_HOME" +echo "* OUTPUT_DIR: $OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* PAR_freemuxlet_field: $PAR_freemuxlet_field" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" 
+CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "Start of pileup step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} /opt/popscle/bin/popscle dsc-pileup \ +--sam $OUTPUT_DIR/input_files/pooled_bam.bam \ +--vcf $OUTPUT_DIR/input_files/reference.vcf \ +--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \ +--out $OUTPUT_DIR/freemuxlet/pileup + +echo "End of Step pileup" + +echo "Start of Freemuxlet step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR ${CONTAINER1} /opt/popscle/bin/popscle freemuxlet \ +--plp $OUTPUT_DIR/freemuxlet/pileup \ +--nsample $PAR_freemuxlet_nsample \ +--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \ +--out $OUTPUT_DIR/freemuxlet/outs + +echo "End of Freemuxlet step" +exit 0 diff --git a/ensemblex.pip/nogt/scripts/souporcell/pipeline_souporcell_generate.sh b/ensemblex.pip/nogt/scripts/souporcell/pipeline_souporcell_generate.sh new file mode 100644 index 0000000..942f433 --- /dev/null +++ b/ensemblex.pip/nogt/scripts/souporcell/pipeline_souporcell_generate.sh @@ -0,0 +1,140 @@ +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + + +TEMPFILERUN=$JOB_OUTPUT_DIR/.tmp/pipeline_souporcell.sh +cat <<EOF > $TEMPFILERUN +#!/bin/bash + +umask 002 +source \$OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source \$OUTPUT_DIR/job_info/.tmp/temp_config.ini + +if [ -d $OUTPUT_DIR/souporcell ]; then + rm -rf $OUTPUT_DIR/souporcell + mkdir -p $OUTPUT_DIR/souporcell +else + mkdir -p $OUTPUT_DIR/souporcell +fi + +cd $OUTPUT_DIR/souporcell + +echo "-------------------------------------------" +echo "* step souporcell No-GT submitted at \`date +%FT%H.%M.%S\`" +echo "-------------------------------------------" +echo "* PIPELINE_HOME: \$PIPELINE_HOME" +echo "* OUTPUT_DIR: \$OUTPUT_DIR" +echo "-------------------------------------------" +echo "------Parameters used in this step---------" +echo "* souporcell_N: \$PAR_souporcell_N" +echo "* souporcell_h: \$PAR_souporcell_h" +echo "* souporcell_threads: \$PAR_souporcell_threads" +echo "-------------------------------------------" +echo -e "------Output of Run------------------------\n\n" +CONTAINER1=\$PIPELINE_HOME/soft/ensemblex.sif +SOFT_SOUP=/opt/souporcell +#----------------------------------------------------------------# +# START PIPELINE # +#----------------------------------------------------------------# +echo "#----------------------------------------------------------------#" + +echo "Start of Renamer step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/renamer.py \\ + --bam $OUTPUT_DIR/input_files/pooled_bam.bam \\ + --barcodes $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + --out $OUTPUT_DIR/souporcell/fq.fq + +echo "End of Renamer Step" + + +echo "#----------------------------------------------------------------#" +echo "Start of Re-align step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} minimap2 $PAR_minimap2 \\ +$OUTPUT_DIR/input_files/reference.fa $OUTPUT_DIR/souporcell/fq.fq > $OUTPUT_DIR/souporcell/minimap.sam + +echo "End of Step Re-align " + +echo "#----------------------------------------------------------------#" +echo "Start of Retag step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/retag.py --sam $OUTPUT_DIR/souporcell/minimap.sam --out $OUTPUT_DIR/souporcell/minitagged.bam +echo "End of Retag step" + + +echo
"#----------------------------------------------------------------#" +echo "Start of Sorting step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} samtools sort $OUTPUT_DIR/souporcell/minitagged.bam -o $OUTPUT_DIR/souporcell/minitagged_sorted.bam +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} samtools index $OUTPUT_DIR/souporcell/minitagged_sorted.bam + +echo "End of Sorting step" + +echo "#----------------------------------------------------------------#" +echo "Start of Call variants step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} freebayes -f $OUTPUT_DIR/input_files/reference.fa \\ +$PAR_freebayes $OUTPUT_DIR/souporcell/minitagged_sorted.bam > $OUTPUT_DIR/souporcell/Pool.vcf +echo "End of Call variants step" + +echo "#----------------------------------------------------------------#" +echo "Start of Vartrix step" +$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} vartrix \\ + -v $OUTPUT_DIR/souporcell/Pool.vcf \\ + -b $OUTPUT_DIR/input_files/pooled_bam.bam \\ + -f $OUTPUT_DIR/input_files/reference.fa \\ + -c $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + --ref-matrix $OUTPUT_DIR/souporcell/ref.mtx \\ + --out-matrix $OUTPUT_DIR/souporcell/alt.mtx \\ + --scoring-method coverage \\ +EOF + +if [[ $PAR_vartrix_umi ]]; then + echo " --umi \\" >> $TEMPFILERUN +fi + +if [[ $PAR_vartrix_mapq ]] ; then + echo " --mapq $PAR_vartrix_mapq \\" >> $TEMPFILERUN +fi +if [[ $PAR_vartrix_threads ]] ; then + echo " --threads $PAR_vartrix_threads \\" >> $TEMPFILERUN +fi +echo " " >> $TEMPFILERUN +echo " echo \" End of Vartrix \" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \"Start of Clustering cells by genotype step\" " >> $TEMPFILERUN +echo " $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/souporcell/target/release/souporcell \\" >> $TEMPFILERUN +echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " -b $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\" >> $TEMPFILERUN +echo " -k $PAR_souporcell_k \\" >> $TEMPFILERUN +echo " -t $PAR_souporcell_t > $OUTPUT_DIR/souporcell/clusters_tmp.tsv " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo " echo \"End Clustering cells by genotype step\" " >> $TEMPFILERUN + +echo " " >> $TEMPFILERUN +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \"Start of Step Calling doublets step\" " >> $TEMPFILERUN +echo " $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/troublet/target/release/troublet \\" >> $TEMPFILERUN +echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " --clusters $OUTPUT_DIR/souporcell/clusters_tmp.tsv > $OUTPUT_DIR/souporcell/clusters.tsv " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo " echo \"End of Calling doublets step\" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "#----------------------------------------------------------------#" >> $TEMPFILERUN +echo " echo \" Start of Genotype and ambient RNA coinference \" " >> $TEMPFILERUN +echo "$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} \$SOFT_SOUP/consensus.py \\" >> $TEMPFILERUN +echo " -c $OUTPUT_DIR/souporcell/clusters.tsv \\" >> $TEMPFILERUN +echo " -a $OUTPUT_DIR/souporcell/alt.mtx \\" >> $TEMPFILERUN +echo " -r $OUTPUT_DIR/souporcell/ref.mtx \\" >> $TEMPFILERUN +echo " --soup_out $OUTPUT_DIR/souporcell/soup.txt \\" 
>> $TEMPFILERUN +echo " -v $OUTPUT_DIR/souporcell/Pool.vcf \\" >> $TEMPFILERUN +echo " --vcf_out $OUTPUT_DIR/souporcell/cluster_genotypes.vcf \\" >> $TEMPFILERUN +echo " --output_dir $OUTPUT_DIR/souporcell " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN + +echo "echo \"End of Genotype and ambient RNA coinference step\" " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN +echo "exit 0 " >> $TEMPFILERUN +echo " " >> $TEMPFILERUN diff --git a/ensemblex.pip/nogt/scripts/vireo/pipeline_vireo.sh b/ensemblex.pip/nogt/scripts/vireo/pipeline_vireo.sh new file mode 100644 index 0000000..e15e2fa --- /dev/null +++ b/ensemblex.pip/nogt/scripts/vireo/pipeline_vireo.sh @@ -0,0 +1,61 @@ +umask 002 +source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini +source $OUTPUT_DIR/job_info/.tmp/temp_config.ini + +TEMPFILERUN=$OUTPUT_DIR/job_info/.tmp/pipeline_vireo.sh + +cat <<EOF > $TEMPFILERUN + #!/bin/bash + + umask 002 + source \$OUTPUT_DIR/job_info/configs/ensemblex_config.ini + source \$OUTPUT_DIR/job_info/.tmp/temp_config.ini + if [ -d $OUTPUT_DIR/ensemblex ]; then + rm -rf $OUTPUT_DIR/vireo + mkdir -p $OUTPUT_DIR/vireo + else + mkdir -p $OUTPUT_DIR/vireo + fi + echo "-------------------------------------------" + echo "* step vireo No-GT submitted at \`date +%FT%H.%M.%S\`" + echo "-------------------------------------------" + echo "* PIPELINE_HOME: $PIPELINE_HOME" + echo "* OUTPUT_DIR: $OUTPUT_DIR" + echo "-------------------------------------------" + echo "------Parameters used in this step---------" + echo "* PAR_vireo_N: $PAR_vireo_N" + echo "* PAR_vireo_h: $PAR_vireo_h" + echo "* PAR_vireo_processes: $PAR_vireo_processes" + echo "* PAR_vireo_minMAF: $PAR_vireo_minMAF" + echo "* PAR_vireo_minCOUNT: $PAR_vireo_minCOUNT" + echo "* PAR_vireo_forcelearnGT: $PAR_vireo_forcelearnGT" + echo "-------------------------------------------" + echo -e "------Output of Run------------------------\n\n" + CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif + echo "Start of cellSNP step" + $CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} cellSNP -s $OUTPUT_DIR/input_files/pooled_bam.bam \\ + -b $OUTPUT_DIR/input_files/pooled_barcodes.tsv \\ + -O $OUTPUT_DIR/vireo \\ + -R $OUTPUT_DIR/input_files/reference.vcf \\ + -p $PAR_vireo_processes \\ + --minMAF $PAR_vireo_minMAF \\ + --minCOUNT $PAR_vireo_minCOUNT +EOF + echo "" >> $TEMPFILERUN + echo "End of cellSNP step" >> $TEMPFILERUN + echo "" >> $TEMPFILERUN + echo "Start of vireo step" >> $TEMPFILERUN + echo "$CONTAINER_CMD exec --bind $OUTPUT_DIR \${CONTAINER1} vireo -c $OUTPUT_DIR/vireo \\" >> $TEMPFILERUN + echo " --forceLearnGT \\" >> $TEMPFILERUN + echo " -t GT \\" >> $TEMPFILERUN + echo " -o $OUTPUT_DIR/vireo \\" >> $TEMPFILERUN + if [[ $PAR_vireo_N ]] ;then + echo " -N $PAR_vireo_N \\" >> $TEMPFILERUN + fi + if [[ $PAR_vireo_h ]] ;then + echo " -h $PAR_vireo_h \\" >> $TEMPFILERUN + fi + echo "" >> $TEMPFILERUN + echo "End of vireo step" >> $TEMPFILERUN + echo "" >> $TEMPFILERUN + \ No newline at end of file diff --git a/ensemblex.pip/tools/sort_vcf_same_as_bam.sh b/ensemblex.pip/tools/sort_vcf_same_as_bam.sh new file mode 100755 index 0000000..968f93f --- /dev/null +++ b/ensemblex.pip/tools/sort_vcf_same_as_bam.sh @@ -0,0 +1,201 @@ +#!/bin/bash +# +# Copyright (C): 2020-2021 - Gert Hulselmans +# +# Purpose: Sort VCF file in the same order as the BAM file, so it can be used with popscle. + + + +# Function to check if any of the programs in a pipe failed.
+check_exit_codes () { + local GET_PIPESTATUS="${PIPESTATUS[@]}"; + local exit_code; + + for exit_code in ${GET_PIPESTATUS} ; do + if [ ${exit_code} -ne 0 ] ; then + return ${exit_code}; + fi + done + + return 0; +} + + + +# Check if necessary programs are installed. +check_if_programs_exists () { + local exit_code=0; + + # Check if awk is installed. + if ! type awk > /dev/null 2>&1 ; then + printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + # Check if bcftools is installed. + if ! type bcftools > /dev/null 2>&1 ; then + printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + # Check if samtools is installed. + if ! type samtools > /dev/null 2>&1 ; then + printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + return ${exit_code}; +} + + + +# Get order of the contigs (chromosomes) and their length from the BAM header. +get_contig_order_from_bam () { + local bam_input_file="${1}"; + local output_type="${2}"; + + if [ ${#@} -ne 2 ] ; then + printf 'Usage: get_contig_order_from_bam BAM_file output_type\n\n'; + printf 'Arguments:\n'; + printf ' - BAM_file: BAM file from which to get the contig order and contig lengths.\n'; + printf ' - output_type:\n'; + printf ' - "names": Return contig names.\n'; + printf ' - "chrom_sizes": Return contig names and contig lengths.\n'; + printf ' - "vcf": Return VCF header section for contigs.\n\n'; + return 1; + fi + + case "${output_type}" in + 'names') + ;; + 'chrom_sizes') + ;; + 'vcf') + ;; + *) + printf 'Error: output_type "%s" is not supported.\n' "${output_type}" > /dev/stderr; + return 1; + ;; + esac + + check_if_programs_exists || return $?; + + # Get the order of the contigs from the BAM header. + samtools view -H "${bam_input_file}" \ + | awk \ + -F '\t' \ + -v output_type="${output_type}" \ + ' + { + # Only look at sequence header fields. + if ($1 == "@SQ") { + contig_idx += 1; + contig_name = ""; + contig_length = ""; + # Extract contig (chromosome) name and contig (chromosome) length. + for (i = 2; i <= NF; i++) { + if ($i ~ /^SN:/) { + contig_name = substr($i, 4); + } + if ($i ~ /^LN:/) { + contig_length = substr($i, 4); + } + # Create contig order to name and contig order to length and vcf contig mappings. + contig_idx_to_name[contig_idx] = contig_name; + contig_idx_to_length[contig_idx] = contig_length; + contig_idx_to_vcf_contig[contig_idx] = sprintf("##contig=<ID=%s,length=%s>", contig_name, contig_length); + } + } + } END { + if (contig_idx == 0) { + printf "Error: No \"@SQ\" header line found in BAM file.\n" > "/dev/stderr"; + exit(1); + } else if (output_type == "names") { + contig_names = ""; + for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) { + contig_names = contig_names " " contig_idx_to_name[contig_idx]; + } + # Print all contig names (without leading space). + print substr(contig_names, 2); + } else if (output_type == "chrom_sizes") { + # Print all contig names with their length in a TAB separated fashion. + for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) { + print contig_idx_to_name[contig_idx] "\t" contig_idx_to_length[contig_idx]; + } + } else if (output_type == "vcf") { + # Print VCF header section for contigs.
+ for (contig_idx = 1; contig_idx <= length(contig_idx_to_vcf_contig); contig_idx++) { + print contig_idx_to_vcf_contig[contig_idx]; + } + } + }' + + check_exit_codes; + + return $?; +} + + + +# Sort VCF file in the same order as the BAM file, so it can be used with popscle. +sort_vcf_same_as_bam () { + local bam_input_file="${1}"; + local vcf_input_file="${2}"; + local vcf_type="${3:-v}"; + + if [ ${#@} -lt 2 ] ; then + printf 'Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]\n\n'; + printf 'Arguments:\n'; + printf ' - BAM_file: BAM file from which to get the contig order to sort the VCF file.\n'; + printf ' - VCF_file: VCF file to sort by contig order as defined in the BAM file.\n'; + printf ' - VCF_type: VCF ouput file type (default: same as input VCF file type):\n'; + printf ' v: uncompressed VCF, z: compressed VCF,\n'; + printf ' u: uncompressed BCF, b: compressed BCF\n\n'; + printf 'Purpose:\n'; + printf ' Sort VCF file in the same order as the BAM file, so it can be used with popscle.\n\n'; + return 1; + fi + + check_if_programs_exists || return $?; + + # If VCF type is not specified, try to guess it from the filename extension. + if [ ${#@} -eq 2 ] ; then + if [ "${vcf_input_file%.vcf.gz}" != "${vcf_input_file}" ] ; then + vcf_type='z'; + elif [ "${vcf_input_file%.bcf}" != "${vcf_input_file}" ] ; then + vcf_type='b'; + fi + fi + + # Sort VCF file by same chromosome order as BAM file. + cat <( + # Create new VCF header: + # - Get VCF header of VCF input file. + # - Remove all contig header lines and "#CHROM" line from the VCF header. + # - Append contig headers in the order they appear in the input BAM file. + # - Add "#CHROM" line as last line of the new VCF header. + bcftools view -h "${vcf_input_file}" \ + | awk \ + ' + { + if ($1 !~ /^##contig=/ && $1 !~ /^#CHROM/) { + # Remove all contig header lines and "#CHROM" line. + print $0; + } + }' \ + | cat \ + - \ + <(get_contig_order_from_bam "${bam_input_file}" 'vcf') \ + <(bcftools view -h "${vcf_input_file}" | tail -n 1) \ + ) \ + <(bcftools view -H -O v "${vcf_input_file}") \ + | bcftools sort -O "${vcf_type}"; + + check_exit_codes; + + return $?; +} + + + +sort_vcf_same_as_bam "${@}"; diff --git a/ensemblex.pip/tools/utils.sh b/ensemblex.pip/tools/utils.sh new file mode 100644 index 0000000..67942fd --- /dev/null +++ b/ensemblex.pip/tools/utils.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +function call_parameter () { + a=$1 + my_string=${a#*=} + my_array=($(echo $my_string | tr "," "\n")) + for i in "${my_array[@]}" + do + eval $i + done +} + +function remove_argument () { + X=(ACCOUNT MODULEUSE= THREADS MEM_ARRAY WALLTIME_ARRAY CELLRANGER R_VERSIO step integrate) + for item in ${X[@]}; do + sed -i $1 -e "/${item}/d" + done +} +
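
Usage note for the two helper tools added in this patch: the lines below are a minimal, illustrative sketch, not part of the pipeline itself. The input file names (pooled_bam.bam, reference.vcf, reference_sorted.vcf.gz) and the parameter string passed to call_parameter are placeholders chosen for this example.

# Sort a VCF into the same contig order as the pooled BAM so popscle dsc-pileup/freemuxlet can use it.
# Invocation follows the script's own help text: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type].
./ensemblex.pip/tools/sort_vcf_same_as_bam.sh pooled_bam.bam reference.vcf z > reference_sorted.vcf.gz

# call_parameter (from utils.sh) strips everything up to the first '=', splits the remainder on commas,
# and eval's each KEY=value pair, so this call would set the shell variables THREADS=8 and MEM=16g.
source ./ensemblex.pip/tools/utils.sh
call_parameter "--extra_par=THREADS=8,MEM=16g"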