From 227d08c06115bd02dcaca80fbb87a87504c06c26 Mon Sep 17 00:00:00 2001 From: Shadi Date: Fri, 7 Jun 2024 19:17:12 -0400 Subject: [PATCH] Fixed bug in how LD window thresholds that are passed to plink are computed/processed. --- CHANGELOG.md | 1 + magenpy/stats/ld/c_utils.pyx | 5 +++ magenpy/stats/ld/estimator.py | 59 +++++++++++++++++++++++++++++++++++ magenpy/stats/ld/utils.py | 33 +++++--------------- 4 files changed, 72 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef33edd..1612055 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 not work well for very large datasets with millions of variants and it causes overflow errors. - Updated the way we determine the `pandas` chunksize when converting from `plink` tables to `zarr`. - Simplified the way we compute the quantization scale in `model_utils`. +- Fixed major bug in how LD window thresholds that are passed to `plink1.9` are computed. ### Added diff --git a/magenpy/stats/ld/c_utils.pyx b/magenpy/stats/ld/c_utils.pyx index 0eb7dcd..3b04823 100644 --- a/magenpy/stats/ld/c_utils.pyx +++ b/magenpy/stats/ld/c_utils.pyx @@ -464,6 +464,11 @@ cpdef find_windowed_ld_boundaries(floating[:] pos, double max_dist): Find the LD boundaries for the windowed estimator of LD, i.e., the indices of the leftmost and rightmost neighbors for each SNP. + .. note:: + To match plink's behavior, the bounds here are inclusive, i.e., + if the distance between two SNPs is exactly equal to the maximum distance, + they are considered neighbors. + :param pos: A vector with the position of each genetic variant. :param max_dist: The maximum distance between SNPs to consider them neighbors. """ diff --git a/magenpy/stats/ld/estimator.py b/magenpy/stats/ld/estimator.py index b577470..058e053 100644 --- a/magenpy/stats/ld/estimator.py +++ b/magenpy/stats/ld/estimator.py @@ -56,6 +56,60 @@ def compute_ld_boundaries(self): m = self.genotype_matrix.n_snps return np.array((np.zeros(m), np.ones(m)*m)).astype(np.int64) + def compute_plink_window_thresholds(self, ld_boundaries=None): + """ + Computes the LD window thresholds to pass to plink1.9 for computing LD matrices. + Unfortunately, plink1.9 sets some default values for the window size and it + is important to set all the thresholds to obtain the desired shape for the + LD matrix. + + :param ld_boundaries: The LD boundaries for which to compute the thresholds. If not passed, + we compute the LD boundaries using the `compute_ld_boundaries` method. + + :return: A dictionary containing the window size thresholds for plink1.9. + + """ + + if ld_boundaries is None: + ld_boundaries = self.compute_ld_boundaries() + + threshold_dict = {} + + # (1) Determine maximum window size (Maximum number of neighbors on each side): + try: + threshold_dict['window_size'] = getattr(self, "window_size") + assert threshold_dict['window_size'] is not None + except (AttributeError, AssertionError): + threshold_dict['window_size'] = np.abs(ld_boundaries - + np.arange(ld_boundaries.shape[1])).max() + + # (2) Determine the maximum window size in kilobases + Centi Morgan (if available): + + positional_bounds = np.array([ld_boundaries[0, :], ld_boundaries[1, :] - 1]) + + try: + threshold_dict['kb_window_size'] = getattr(self, "kb_window_size") + assert threshold_dict['kb_window_size'] is not None + except (AttributeError, AssertionError): + kb_pos = .001 * self.genotype_matrix.bp_pos + kb_bounds = kb_pos[positional_bounds] + threshold_dict['kb_window_size'] = np.abs(kb_bounds - kb_pos).max() + + # (3) centi Morgan: + try: + threshold_dict['cm_window_size'] = getattr(self, "cm_window_size") + assert threshold_dict['cm_window_size'] is not None + except (AttributeError, AssertionError): + try: + # Checks if cm_pos is available in the genotype matrix: + cm_pos = self.genotype_matrix.cm_pos + cm_bounds = self.genotype_matrix.cm_pos[positional_bounds] + threshold_dict['cm_window_size'] = np.abs(cm_bounds - cm_pos).max() + except KeyError: + del threshold_dict['cm_window_size'] + + return threshold_dict + def compute(self, output_dir, temp_dir='temp', @@ -100,9 +154,14 @@ def compute(self, compressor_name=compressor_name, compression_level=compression_level) elif isinstance(self.genotype_matrix, plinkBEDGenotypeMatrix): + + # Compute the window size thresholds to pass to plink 1.9: + window_size_thersh = self.compute_plink_window_thresholds(ld_boundaries) + ld_mat = compute_ld_plink1p9(self.genotype_matrix, ld_boundaries, output_dir, + window_size_thersh, temp_dir=temp_dir, overwrite=overwrite, dtype=dtype, diff --git a/magenpy/stats/ld/utils.py b/magenpy/stats/ld/utils.py index e1cae77..603ad28 100644 --- a/magenpy/stats/ld/utils.py +++ b/magenpy/stats/ld/utils.py @@ -253,6 +253,7 @@ def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128): def compute_ld_plink1p9(genotype_matrix, ld_boundaries, output_dir, + window_size_thresholds, temp_dir='temp', overwrite=True, dtype='int8', @@ -263,8 +264,9 @@ def compute_ld_plink1p9(genotype_matrix, Compute LD matrices using plink 1.9. :param genotype_matrix: A plinkBEDGenotypeMatrix object - :param ld_boundaries: An array of LD boundaries for every SNP + :param ld_boundaries: An 2xM matrix of LD boundaries for every variant. :param output_dir: The output directory for the final LD matrix file (after processing). + :param window_size_thresholds: A dictionary of the window size thresholds to pass to plink1.9. :param temp_dir: A temporary directory to store intermediate files (e.g. files created for and by plink). :param overwrite: If True, it overwrites any LD matrices in `output_dir`. :param dtype: The data type for the entries of the LD matrix (supported data types are float32, float64 @@ -291,27 +293,6 @@ def compute_ld_plink1p9(genotype_matrix, plink_output = osp.join(temp_dir, f'chr_{str(genotype_matrix.chromosome)}') - # Set the window sizes in various units: - - # (1) Number of neighboring SNPs: - window_size = (ld_boundaries - np.arange(genotype_matrix.m)).max() + 10 - - # (2) Kilobases: - positional_bounds = np.clip(np.array([ld_boundaries[0, :] - 1, ld_boundaries[1, :]]), - a_min=0, a_max=ld_boundaries.shape[1] - 1) - - kb_pos = .001*genotype_matrix.bp_pos - kb_bounds = kb_pos[positional_bounds] - kb_window_size = (kb_bounds - kb_pos).max() + .01 - - # (3) centi Morgan: - try: - cm_pos = genotype_matrix.cm_pos - cm_bounds = genotype_matrix.cm_pos[positional_bounds] - cm_window_size = (cm_bounds - cm_pos).max() + .01 - except Exception: - cm_window_size = None - cmd = [ f"--bfile {genotype_matrix.bed_file.replace('.bed', '')}", f"--keep {keep_file}", @@ -319,12 +300,12 @@ def compute_ld_plink1p9(genotype_matrix, "--keep-allele-order", f"--out {plink_output}", "--r gz", - f"--ld-window {window_size}", - f"--ld-window-kb {kb_window_size}" + f"--ld-window {window_size_thresholds['window_size']}", + f"--ld-window-kb {window_size_thresholds['kb_window_size']}" ] - if cm_window_size is not None: - cmd.append(f"--ld-window-cm {cm_window_size}") + if 'cm_window_size' in window_size_thresholds: + cmd.append(f"--ld-window-cm {window_size_thresholds['cm_window_size']}") # --------------------------------------------------------- # Test if plink1.9 version is compatible with setting the --ld-window-r2 flag: