ver.0.0.2

neurobioinfo · Jun 13, 2024 · 6f5198e · 6f5198e
1 parent c1c193b
commit 6f5198e
Show file tree

Hide file tree

Showing 31 changed files with 8,800 additions and 0 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/README.md b/README.md
@@ -1,5 +1,8 @@
 # Ensemblex: an accuracy-weighted ensemble genetic demultiplexing framework for single-cell RNA sequencing
 
+[![](https://img.shields.io/badge/Documentation-scrnabox-blue)](https://neurobioinfo.github.io/ensemblex/site/) 
+<!-- [![biorxiv](https://img.shields.io/badge/biorxiv-manuscript-blue)](https://www.biorxiv.org/content/biorxiv/early/2023/11/15/2023.11.13.566851.full.pdf)  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7768427.svg)](https://doi.org/10.5281/zenodo.7768427)  -->
+
 -------------
 ## Contents
 - [Introduction](#introduction)

diff --git a/ensemblex.pip/.DS_Store b/ensemblex.pip/.DS_Store
diff --git a/ensemblex.pip/README b/ensemblex.pip/README
@@ -0,0 +1,12 @@
+
+# Ensembleux pipeline 
+Ensemblex is an accuracy-weighted ensemble framework for genetic demultiplexing of pooled single-cell RNA seqeuncing (scRNAseq) data. Ensemblex can be used to demultiplex pools **with** or **without** prior genotype information. It is compatible with Linux workstations. The detail of installing and running this pipeline can be found in tutorial in [ensembleux](https://neurobioinfo.github.io/ensembleux/site/). 
+
+## Contributing
+Any contributions or suggestions for improving the Ensemblex pipeline are welcomed and appreciated. If you encounter any issues, please open an issue in the [GitHub repository](https://github.com/neurobioinfo/ensemblex). Alternatively, you are welcomed to email the developers directly; for any questions please contact Michael Fiorini: michael.fiorini@mail.mcgill.ca
+
+## License
+This project is licensed under the MIT License - see the [LICENSE.md](https://github.com/neurobioinfo/ensembleux/blob/main/LICENSE) file for details
+
+## Acknowledgement
+The Ensemblex pipeline was produced for projects funded by the Canadian Institute of Health Research and Michael J. Fox Foundation Parkinson's Progression Markers Initiative (MJFF PPMI) in collaboration with The Neuro's Early Drug Discovery Unit (EDDU), McGill University. It is written by [Michael Fiorini](https://github.com/fiorini9) and [Saeid Amiri](https://github.com/saeidamiri1) with supervision from [Rhalena Thomas](https://github.com/RhalenaThomas) and Sali Farhan at the Montreal Neurological Institute-Hospital. Copyright belongs MNI BIOINFO CORE.
diff --git a/ensemblex.pip/gt/.DS_Store b/ensemblex.pip/gt/.DS_Store
diff --git a/ensemblex.pip/gt/configs/ensemblex_config.ini b/ensemblex.pip/gt/configs/ensemblex_config.ini
@@ -0,0 +1,121 @@
+############# Config Trace
+############# [DEFAULT]
+############# Depending on system and your requirments,  
+############# change the defaults
+
+########################
+## General parameters ##
+########################
+METHOD=GT
+CONTAINER_CMD=singularity
+
+###############
+## Demuxalot ##
+###############
+## List of Sample ID's in the sample VCF file (e.g., 'Sample_1,Sample_2,Sample_3')
+PAR_demuxalot_genotype_names=''
+## Read prior strength
+PAR_demuxalot_prior_strength=100
+## Minimum read coverage
+PAR_demuxalot_minimum_coverage=200
+## Minimum alternative read coverage
+PAR_demuxalot_minimum_alternative_coverage=10
+## Number of best snps for each donor to use for demultiplexing
+PAR_demuxalot_n_best_snps_per_donor=100
+## Genotype prior strength
+PAR_demuxalot_genotypes_prior_strength=1
+## Doublet prior strength
+PAR_demuxalot_doublet_prior=0.25
+
+##############
+## Demuxlet ##
+##############
+## Field to extract the genotypes (GT), genotype likelihood (PL), or posterior probability (GP) from the sample .vcf file
+PAR_demuxlet_field=GT
+
+###########
+## Vireo ##
+###########
+## Number of pooled samples
+PAR_vireo_N=
+## Field to extract the genotypes (GT), genotype likelihood (PL), or posterior probability (GP) from the sample .vcf file
+PAR_vireo_type=GT
+## Number of subprocesses for computing
+PAR_vireo_processes=20
+## Minimum minor allele frequency
+PAR_vireo_minMAF=0.1
+## Minimum aggregated count
+PAR_vireo_minCOUNT=20
+## Whether or not to treat donor GT as prior only
+PAR_vireo_forcelearnGT=T
+
+################
+## Souporcell ##
+################
+## Minimap2 parameters. For information regarding the minimap2 parameters, please see the documentation: 
+PAR_minimap2='-ax splice -t 8 -G50k -k 21 -w 11 --sr -A2 -B8 -O12,32 -E2,1 -r200 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g2000 -2K50m --secondary=no'
+## Freebayes parameters. For information regarding the freebayes parameters, please see the documentation: 
+PAR_freebayes=' -iXu -C 2 -q 20 -n 3 -E 1 -m 30 --min-coverage 6'
+## Whether or no to consider UMI information when populating coverage matrices
+PAR_vartrix_umi=TRUE
+## Minimum read mapping quality
+PAR_vartrix_mapq=30
+## Number of threads for computing
+PAR_vartrix_threads=8
+## Number of pooled samples
+PAR_souporcell_k=
+## Number of threads for computing
+PAR_souporcell_t=8
+
+#########################
+## ensemblex algorithm ##
+#########################
+#== Pool parameters ==#
+## Number of pooled samples
+PAR_ensemblex_sample_size=
+## Expected doublet rate for the pool. If using 10X Genomics, the expected doublet rate can be estimated based on the number of recovered cells. 
+PAR_ensemblex_expected_doublet_rate=0.12
+
+#== Set up parameters ==#
+## Whether or not to merge the output files of the constituent demultiplexing tools. If running ensemblex on a pool for the first time, this parameter should be set to "Yes".
+PAR_ensemblex_merge_constituents=yes
+
+#== Step 1 parameters: Probabilistic-weighted ensemble ==#
+## Whether or not to perform Step 1: Probabilistic-weighted ensemble. If running ensemblex on a pool for the first time, this parameter should be set to "Yes".
+PAR_ensemblex_probabilistic_weighted_ensemble=yes
+
+#== Step 2 parameters: Graph-based doublet detection ==#
+## Whether or not to perform a preliminary parameter sweep for Step 2: Graph-based doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define the number of confident doublets in the pool (nCD) and the percentile threshold of the nearest neighour frequency (pT), which can be defined in the following two parameters, respectively.
+PAR_ensemblex_preliminary_parameter_sweep=no
+## Manually defined number of confident doublets in the pool (nCD). To manually define nCD, uncomment the parament and enter the value (e.g., PAR_ensemblex_nCD=200)
+#PAR_ensemblex_nCD=
+## Manually defined percentile threshold of the nearest neighour frequency (pT. To manually define pT, uncomment the parament and enter the value (e.g., PAR_ensemblex_pT=0.9)
+#PAR_ensemblex_pT=
+## Whether or not to perform Step 2: Graph-based doublet detection. If PAR_ensemblex_nCD and PAR_ensemblex_pT are not defined by the user (NULL), ensemblex will automatically determine the optimal parameter values using an unsupervised parameter sweep. If PAR_ensemblex_nCD and PAR_ensemblex_pT are defined by the user, graph-based doublet detection will be performed with the user-defined values.
+PAR_ensemblex_graph_based_doublet_detection=yes
+
+#== Step 3 parameters: Ensemble-independent doublet detection ==#
+## Whether or not to perform a preliminary parameter sweep for Step 3: Ensemble-independent doublet detection. Users should utilize the preliminary parameter sweep if they wish to manually define which constituent tools to utilize for ensemble-independent doublet detection. Users can define which tools to utilize for ensemble-independent doublet detection in the following parameters.
+PAR_ensemblex_preliminary_ensemble_independent_doublet=no
+## Whether or not to perform Step 3: Ensemble-independent doublet detection.
+PAR_ensemblex_ensemble_independent_doublet=yes
+## Whether or not to label doublets identified by Demuxalot as doublets. Only doublets with assignment probabilities exceeding Demuxalot's recommended probability threshold will be labeled as doublets by ensemblex.
+PAR_ensemblex_doublet_Demuxalot_threshold=yes
+## Whether or not to label doublets identified by Demuxalot as doublets, regardless of the corresponding assignment probability.
+PAR_ensemblex_doublet_Demuxalot_no_threshold=no
+## Whether or not to label doublets identified by Demuxlet as doublets. Only doublets with assignment probabilities exceeding Demuxlet's recommended probability threshold will be labeled as doublets by ensemblex.
+PAR_ensemblex_doublet_Demuxlet_threshold=no
+## Whether or not to label doublets identified by Demuxlet as doublets, regardless of the corresponding assignment probability.
+PAR_ensemblex_doublet_Demuxlet_no_threshold=no
+## Whether or not to label doublets identified by Souporcell as doublets. Only doublets with assignment probabilities exceeding Souporcell's recommended probability threshold will be labeled as doublets by ensemblex.
+PAR_ensemblex_doublet_Souporcell_threshold=no
+## Whether or not to label doublets identified by Souporcell as doublets, regardless of the corresponding assignment probability.
+PAR_ensemblex_doublet_Souporcell_no_threshold=no
+## Whether or not to label doublets identified by Vireo as doublets. Only doublets with assignment probabilities exceeding Vireo's recommended probability threshold will be labeled as doublets by ensemblex.
+PAR_ensemblex_doublet_Vireo_threshold=yes
+## Whether or not to label doublets identified by Vireo as doublets, regardless of the corresponding assignment probability.
+PAR_ensemblex_doublet_Vireo_no_threshold=no
+
+#== Confidence score parameters ==#
+## Whether or not to compute ensemblex's singlet confidence score. This will define low confidence assignments which should be removed from downstream analyses.
+PAR_ensemblex_compute_singlet_confidence=yes
diff --git a/ensemblex.pip/gt/scripts/.DS_Store b/ensemblex.pip/gt/scripts/.DS_Store
diff --git a/ensemblex.pip/gt/scripts/demuxalot/pipeline_demuxalot.sh b/ensemblex.pip/gt/scripts/demuxalot/pipeline_demuxalot.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+umask 002
+source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini
+source $OUTPUT_DIR/job_info/.tmp/temp_config.ini
+
+#----------------------------------------------------------------#
+#                                                     #
+# INITIALIZE VARIABLES                                #
+#                                                     #
+#----------------------------------------------------------------#
+echo "-------------------------------------------"
+echo "* step Swmuxalot GT submitted at `date +%FT%H.%M.%S`"
+echo "-------------------------------------------"
+echo "* PIPELINE_HOME:           $PIPELINE_HOME"
+echo "* OUTPUT_DIR:              $OUTPUT_DIR"
+echo "-------------------------------------------"
+echo "------Parameters used in this step---------"
+echo "* PAR_demuxalot_genotype_names  $PAR_demuxalot_genotype_names"
+echo "* PAR_demuxalot_prior_strength  $PAR_demuxalot_prior_strength"
+echo "* PAR_demuxalot_minimum_coverage  $PAR_demuxalot_minimum_coverage"
+echo "* PAR_demuxalot_minimum_alternative_coverage  $PAR_demuxalot_minimum_alternative_coverage"
+echo "* PAR_demuxalot_n_best_snps_per_donor  $PAR_demuxalot_n_best_snps_per_donor"
+echo "* PAR_demuxalot_genotypes_prior_strength  $PAR_demuxalot_genotypes_prior_strength"
+echo "* PAR_demuxalot_doublet_prior  $PAR_demuxalot_doublet_prior"
+echo "-------------------------------------------"
+echo -e "------Output of Run------------------------\n\n"
+CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif
+#----------------------------------------------------------------#
+# START PIPELINE                                      #
+#----------------------------------------------------------------#
+echo "Start of demuxalot"
+$CONTAINER_CMD exec  --bind  $OUTPUT_DIR,$PIPELINE_HOME ${CONTAINER1} python3 $PIPELINE_HOME/gt/scripts/demuxalot/pipline_demuxalot.py -fl $OUTPUT_DIR -p1 $PAR_demuxalot_genotype_names -p2 $PAR_demuxalot_prior_strength -p3 $PAR_demuxalot_minimum_coverage -p4 $PAR_demuxalot_minimum_alternative_coverage -p5 $PAR_demuxalot_n_best_snps_per_donor -p6 $PAR_demuxalot_genotypes_prior_strength -p7 $PAR_demuxalot_doublet_prior
+echo "End of demuxalot"
+exit 0
diff --git a/ensemblex.pip/gt/scripts/demuxalot/pipline_demuxalot.py b/ensemblex.pip/gt/scripts/demuxalot/pipline_demuxalot.py
@@ -0,0 +1,101 @@
+import argparse
+import sys
+import os
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-fl', '--folder',action='store', type=str,required=False, help='outputfolder') 
+parser.add_argument('-p1', '--pdgn',action='store',required=False, help='PAR_demuxalot_genotype_names') 
+parser.add_argument('-p2', '--pdps',action='store', type=int,required=False, help='PAR_demuxalot_prior_strength') 
+parser.add_argument('-p3', '--pdmc',action='store', type=int,required=False, help='PAR_demuxalot_minimum_coverage') 
+parser.add_argument('-p4', '--pdmac',action='store', type=int,required=False, help='PAR_demuxalot_minimum_alternative_coverage') 
+parser.add_argument('-p5', '--pdnbspd',action='store', type=int, required=False, help='PAR_demuxalot_n_best_snps_per_donor')
+parser.add_argument('-p6', '--pdgps',action='store',type=int,required=False, help='PAR_demuxalot_genotypes_prior_strength') 
+parser.add_argument('-p7', '--pddp',action='store', type=float,required=False, help='PAR_demuxalot_doublet_prior') 
+args = parser.parse_args()
+folder=args.folder
+PAR_demuxalot_genotype_names=args.pdgn.split(",")
+
+PAR_demuxalot_prior_strength=args.pdps
+PAR_demuxalot_minimum_coverage=args.pdmc
+PAR_demuxalot_minimum_alternative_coverage=args.pdmac
+PAR_demuxalot_n_best_snps_per_donor=args.pdnbspd
+PAR_demuxalot_genotypes_prior_strength=args.pdgps
+PAR_demuxalot_doublet_prior=args.pddp
+
+
+print(PAR_demuxalot_doublet_prior)
+
+from pathlib import Path
+import pandas as pd
+import pysam
+# usr/local/lib/python3.10/dist-packages
+from demuxalot.utils import download_file
+from demuxalot import BarcodeHandler, ProbabilisticGenotypes, Demultiplexer, count_snps, detect_snps_positions
+from pysam import VariantFile
+import pandas as pd
+import io
+from demuxalot import utils
+
+print('Part I')
+handler = BarcodeHandler.from_file(os.path.join(folder,'input_files/pooled_barcodes.tsv')) ### modify
+
+
+
+genotype_names = PAR_demuxalot_genotype_names
+
+genotype_names.sort()
+genotypes = ProbabilisticGenotypes(genotype_names=genotype_names)
+
+genotypes.add_vcf(os.path.join(folder,'input_files/pooled_samples.vcf'), prior_strength=100) ### modify
+
+print('Part II')
+
+pysam.index(os.path.join(folder,'input_files/pooled_bam.bam')) ### modify
+print('Part III')
+counts = count_snps(
+    bamfile_location=os.path.join(folder,'input_files/pooled_bam.bam'), ### modify
+    chromosome2positions=genotypes.get_chromosome2positions(),
+    barcode_handler=handler,
+)
+
+utils.summarize_counted_SNPs(counts)
+print('Part IV')
+
+new_snps_filename = 'new_snps_single_file.betas'
+_ = detect_snps_positions(
+    bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify
+    genotypes=genotypes,
+    barcode_handler=handler,
+    minimum_coverage=PAR_demuxalot_minimum_coverage,
+    minimum_alternative_coverage=PAR_demuxalot_minimum_alternative_coverage,
+    result_beta_prior_filename=str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')),
+    n_best_snps_per_donor=PAR_demuxalot_n_best_snps_per_donor,
+)
+
+print('Part III')
+
+genotypes_with_new_snps = genotypes.clone()
+genotypes_with_new_snps.add_prior_betas(str(os.path.join(folder,'demuxalot/new_snps_single_file.betas')), prior_strength=PAR_demuxalot_genotypes_prior_strength)
+
+counts_enriched = count_snps(
+    bamfile_location=str(os.path.join(folder,'input_files/pooled_bam.bam')), ### modify
+    chromosome2positions=genotypes_with_new_snps.get_chromosome2positions(),
+    barcode_handler=handler,
+)
+
+print('Part VI')
+
+learnt_enriched_genotypes, probs_learning_new_snps = Demultiplexer.learn_genotypes(
+    counts_enriched,
+    genotypes_with_new_snps,
+    barcode_handler=handler,
+    doublet_prior=PAR_demuxalot_doublet_prior,
+)
+
+print('Part V')
+
+probs_learning_new_snps.head()
+
+
+probs_learning_new_snps.to_csv(os.path.join(folder,'demuxalot/Demuxalot_result.csv')) ### modify
diff --git a/ensemblex.pip/gt/scripts/demuxlet/pipeline_demuxlet.sh b/ensemblex.pip/gt/scripts/demuxlet/pipeline_demuxlet.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+umask 002
+source $OUTPUT_DIR/job_info/configs/ensemblex_config.ini
+source $OUTPUT_DIR/job_info/.tmp/temp_config.ini
+
+#----------------------------------------------------------------#
+#                                                     #
+# INITIALIZE VARIABLES                                #
+#                                                     #
+#----------------------------------------------------------------#
+echo "-------------------------------------------"
+echo "* step Souporcell submitted at `date +%FT%H.%M.%S`"
+echo "-------------------------------------------"
+echo "* PIPELINE_HOME:           $PIPELINE_HOME"
+echo "* OUTPUT_DIR:              $OUTPUT_DIR"
+echo "-------------------------------------------"
+echo "------Parameters used in this step---------"
+echo "* PAR_demuxlet_field:             $PAR_demuxlet_field"
+echo "-------------------------------------------"
+echo -e "------Output of Run------------------------\n\n"
+CONTAINER1=$PIPELINE_HOME/soft/ensemblex.sif
+# SOFT_SOUP=/opt/souporcell
+#----------------------------------------------------------------#
+# START PIPELINE                                      #
+#----------------------------------------------------------------#
+echo "Start of pileup step"
+$CONTAINER_CMD exec  --bind  $OUTPUT_DIR ${CONTAINER1}  /opt/popscle/bin/popscle dsc-pileup \
+--sam $OUTPUT_DIR/input_files/pooled_bam.bam \
+--vcf $OUTPUT_DIR/input_files/pooled_samples.vcf \
+--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \
+--out $OUTPUT_DIR/demuxlet/pileup
+echo "End of pileup step"
+
+echo "Start of demuxlet step"
+$CONTAINER_CMD exec  --bind  $OUTPUT_DIR ${CONTAINER1}  /opt/popscle/bin/popscle demuxlet  \
+--plp $OUTPUT_DIR/demuxlet/pileup \
+--vcf $OUTPUT_DIR/input_files/pooled_samples.vcf \
+--field $PAR_demuxlet_field \
+--group-list $OUTPUT_DIR/input_files/pooled_barcodes.tsv \
+--out $OUTPUT_DIR/demuxlet/outs
+echo "End of demuxlet step"
+
+exit 0