Major update to add new features to LDMatrix data structure + suppo…

…rt for `LDLinearOperator`.
shz9 · Jul 30, 2024 · 7c3f4d9 · 7c3f4d9
1 parent 8a566e0
commit 7c3f4d9
Show file tree

Hide file tree

Showing 23 changed files with 1,487 additions and 387 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
-## [0.1.4] - TBD
+## [0.1.4] - 2024-07-29
 
 ### Changed
 
@@ -18,10 +18,17 @@ not work well for very large datasets with millions of variants and it causes ov
 - Fixed in-place `fillna` in `from_plink_table` in `LDMatrix` to conform to latest `pandas` API.
 - Update `run_shell_script` to check for and capture errors.
 - Refactored code to slightly reduce import/load times.
+- Cleaned up `load_data` method of `LDMatrix` and subsumed functionality in `load_rows`.
 
 ### Added
 
 - Added extra validation checks in `LDMatrix` to ensure that the index pointer is formatted correctly.
+- `LDLinearOperator` class to allow for efficient linear algebra operations on the LD matrix without
+representing the full symmetric matrix in memory.
+- Added utility methods to `LDMatrix` class to allow for computing eigenvalues, performing SVD, etc.
+- Added `Min eigenvalue` to the attributes of LD matrices.
+- Added support to slice/retrieve entries of LD matrix by using SNP rsIDs.
+- Added support to reading LD matrices from AWS s3 storage.
 
 ## [0.1.3] - 2024-05-21
 

diff --git a/bin/magenpy_ld b/bin/magenpy_ld
@@ -85,6 +85,13 @@ parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
                     default='int8', help='The data type for the entries of the LD matrix.',
                     choices={'float32', 'float64', 'int16', 'int8'})
 
+# Other options:
+parser.add_argument('--compute-spectral-properties', dest='compute_spectral',
+                    default=False,
+                    action='store_true',
+                    help='Compute and store the spectral properties of the '
+                         'LD matrix (e.g. eigenvalues, eigenvectors).')
+
 # Add arguments for the compressor:
 parser.add_argument('--compressor', dest='compressor', type=str,
                     default='zstd', help='The compressor name or compression algorithm to use for the LD matrix.',
@@ -226,6 +233,7 @@ ld_mat = g.compute_ld(args.estimator,
                       dtype=args.storage_dtype,
                       compressor_name=args.compressor,
                       compression_level=args.compression_level,
+                      compute_spectral_properties=args.compute_spectral,
                       **ld_kwargs)
 
 # Store metadata (if provided):

diff --git a/docs/commandline/magenpy_ld.md b/docs/commandline/magenpy_ld.md
@@ -17,36 +17,36 @@ Which outputs the following help message:
 
 ```bash
 
-**********************************************                            
- _ __ ___   __ _  __ _  ___ _ __  _ __  _   _ 
-| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
-| | | | | | (_| | (_| |  __/ | | | |_) | |_| |
-|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
-                 |___/           |_|    |___/
-Modeling and Analysis of Genetics data in python
-Version: 0.1.0 | Release date: April 2024
-Author: Shadi Zabad, McGill University
-**********************************************
-< Compute LD matrix and output in Zarr format >
+        **********************************************                            
+         _ __ ___   __ _  __ _  ___ _ __  _ __  _   _ 
+        | '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
+        | | | | | | (_| | (_| |  __/ | | | |_) | |_| |
+        |_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
+                         |___/           |_|    |___/
+        Modeling and Analysis of Genetics data in python
+        Version: 0.1.4 | Release date: June 2024
+        Author: Shadi Zabad, McGill University
+        **********************************************
+        < Compute LD matrix and store in Zarr format >
 
-usage: magenpy_ld [-h] [--estimator {shrinkage,windowed,block,sample}] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE]
-                  [--backend {plink,xarray}] [--temp-dir TEMP_DIR] --output-dir OUTPUT_DIR [--min-maf MIN_MAF] [--min-mac MIN_MAC]
-                  [--genome-build GENOME_BUILD] [--metadata METADATA] [--storage-dtype {int8,float32,int16,float64}]
-                  [--compressor {zstd,lz4,zlib,gzip}] [--compression-level COMPRESSION_LEVEL] [--ld-window LD_WINDOW] [--ld-window-kb LD_WINDOW_KB]
-                  [--ld-window-cm LD_WINDOW_CM] [--ld-blocks LD_BLOCKS] [--genmap-Ne GENMAP_NE] [--genmap-sample-size GENMAP_SS]
-                  [--shrinkage-cutoff SHRINK_CUTOFF]
+usage: magenpy_ld [-h] [--estimator {shrinkage,block,windowed,sample}] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE]
+                  [--backend {xarray,plink}] [--temp-dir TEMP_DIR] --output-dir OUTPUT_DIR [--min-maf MIN_MAF] [--min-mac MIN_MAC]
+                  [--genome-build GENOME_BUILD] [--metadata METADATA] [--storage-dtype {float64,float32,int16,int8}]
+                  [--compute-spectral-properties] [--compressor {lz4,zlib,zstd,gzip}] [--compression-level COMPRESSION_LEVEL]
+                  [--ld-window LD_WINDOW] [--ld-window-kb LD_WINDOW_KB] [--ld-window-cm LD_WINDOW_CM] [--ld-blocks LD_BLOCKS]
+                  [--genmap-Ne GENMAP_NE] [--genmap-sample-size GENMAP_SS] [--shrinkage-cutoff SHRINK_CUTOFF]
 
-Commandline arguments for LD matrix computation
+Commandline arguments for LD matrix computation and storage
 
 options:
   -h, --help            show this help message and exit
-  --estimator {shrinkage,windowed,block,sample}
+  --estimator {shrinkage,block,windowed,sample}
                         The LD estimator (windowed, shrinkage, block, sample)
   --bfile BED_FILE      The path to a plink BED file.
   --keep KEEP_FILE      A plink-style keep file to select a subset of individuals to compute the LD matrices.
   --extract EXTRACT_FILE
                         A plink-style extract file to select a subset of SNPs to compute the LD matrix for.
-  --backend {plink,xarray}
+  --backend {xarray,plink}
                         The backend software used to compute the Linkage-Disequilibrium between variants.
   --temp-dir TEMP_DIR   The temporary directory where we store intermediate files.
   --output-dir OUTPUT_DIR
@@ -55,12 +55,15 @@ options:
   --min-mac MIN_MAC     The minimum minor allele count for variants included in the LD matrix.
   --genome-build GENOME_BUILD
                         The genome build for the genotype data (recommend storing as metadata).
-  --metadata METADATA   A comma-separated string with metadata keys and values. This is used to store information about the genotype data from which
-                        the LD matrix was computed, such as the biobank/samples, cohort characteristics (e.g. ancestry), etc. Keys and values should
-                        be separated by =, such that inputs are in the form of:--metadata Biobank=UKB,Ancestry=EUR,Date=April2024
-  --storage-dtype {int8,float32,int16,float64}
+  --metadata METADATA   A comma-separated string with metadata keys and values. This is used to store information about the genotype data
+                        from which the LD matrix was computed, such as the biobank/samples, cohort characteristics (e.g. ancestry), etc.
+                        Keys and values should be separated by =, such that inputs are in the form of:--metadata
+                        Biobank=UKB,Ancestry=EUR,Date=April2024
+  --storage-dtype {float64,float32,int16,int8}
                         The data type for the entries of the LD matrix.
-  --compressor {zstd,lz4,zlib,gzip}
+  --compute-spectral-properties
+                        Compute and store the spectral properties of the LD matrix (e.g. eigenvalues, eigenvectors).
+  --compressor {lz4,zlib,zstd,gzip}
                         The compressor name or compression algorithm to use for the LD matrix.
   --compression-level COMPRESSION_LEVEL
                         The compression level to use for the entries of the LD matrix (1-9).
@@ -78,5 +81,6 @@ options:
                         The sample size for the dataset used to infer the genetic map.
   --shrinkage-cutoff SHRINK_CUTOFF
                         The cutoff value below which we assume that the correlation between variants is zero.
+
  
 ```
diff --git a/docs/commandline/magenpy_simulate.md b/docs/commandline/magenpy_simulate.md
@@ -18,22 +18,22 @@ Which outputs the following help message:
 
 ```bash
 
-**********************************************                            
- _ __ ___   __ _  __ _  ___ _ __  _ __  _   _ 
-| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
-| | | | | | (_| | (_| |  __/ | | | |_) | |_| |
-|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
-                 |___/           |_|    |___/
-Modeling and Analysis of Genetics data in python
-Version: 0.1.0 | Release date: April 2024
-Author: Shadi Zabad, McGill University
-**********************************************
-< Simulate complex quantitative or case-control traits >
+        ********************************************************                            
+             _ __ ___   __ _  __ _  ___ _ __  _ __  _   _ 
+            | '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
+            | | | | | | (_| | (_| |  __/ | | | |_) | |_| |
+            |_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
+                             |___/           |_|    |___/
+            Modeling and Analysis of Genetics data in python
+            Version: 0.1.4 | Release date: June 2024
+            Author: Shadi Zabad, McGill University
+        ********************************************************
+        < Simulate complex quantitative or case-control traits >
 
 usage: magenpy_simulate [-h] --bfile BED_FILE [--keep KEEP_FILE] [--extract EXTRACT_FILE] [--backend {plink,xarray}] [--temp-dir TEMP_DIR]
-                        --output-file OUTPUT_FILE [--output-simulated-beta] [--min-maf MIN_MAF] [--min-mac MIN_MAC] --h2 H2 [--mix-prop MIX_PROP]
-                        [--prop-causal PROP_CAUSAL] [--var-mult VAR_MULT] [--phenotype-likelihood {binomial,gaussian}] [--prevalence PREVALENCE]
-                        [--seed SEED]
+                        --output-file OUTPUT_FILE [--output-simulated-beta] [--min-maf MIN_MAF] [--min-mac MIN_MAC] --h2 H2
+                        [--mix-prop MIX_PROP] [--prop-causal PROP_CAUSAL] [--var-mult VAR_MULT]
+                        [--phenotype-likelihood {gaussian,binomial}] [--prevalence PREVALENCE] [--seed SEED]
 
 Commandline arguments for the complex trait simulator
 
@@ -53,13 +53,13 @@ options:
   --min-maf MIN_MAF     The minimum minor allele frequency for variants included in the simulation.
   --min-mac MIN_MAC     The minimum minor allele count for variants included in the simulation.
   --h2 H2               Trait heritability. Ranges between 0. and 1., inclusive.
-  --mix-prop MIX_PROP   Mixing proportions for the mixture density (comma separated). For example, for the spike-and-slab mixture density, with the
-                        proportion of causal variants set to 0.1, you can specify: "--mix-prop 0.9,0.1 --var-mult 0,1".
+  --mix-prop MIX_PROP   Mixing proportions for the mixture density (comma separated). For example, for the spike-and-slab mixture density,
+                        with the proportion of causal variants set to 0.1, you can specify: "--mix-prop 0.9,0.1 --var-mult 0,1".
   --prop-causal PROP_CAUSAL, -p PROP_CAUSAL
                         The proportion of causal variants in the simulation. See --mix-prop for more complex architectures specification.
   --var-mult VAR_MULT, -d VAR_MULT
                         Multipliers on the variance for each mixture component.
-  --phenotype-likelihood {binomial,gaussian}
+  --phenotype-likelihood {gaussian,binomial}
                         The likelihood for the simulated trait: gaussian (e.g. quantitative) or binomial (e.g. case-control).
   --prevalence PREVALENCE
                         The prevalence of cases (or proportion of positives) for binary traits. Ranges between 0. and 1.

diff --git a/magenpy/GWADataLoader.py b/magenpy/GWADataLoader.py
@@ -84,7 +84,7 @@ def __init__(self,
         :param drop_duplicated: If True, drop SNPs with duplicated rsID.
         :param phenotype_likelihood: The likelihood of the phenotype (e.g. `gaussian`, `binomial`).
         :param sumstats_files: The path to the summary statistics file(s). The path may be a wildcard.
-        :param sumstats_format: The format for the summary statistics. Currently supports the following
+        :param sumstats_format: The format for the summary statistics. Currently, supports the following
         formats: `plink1.9`, `plink2`, `magenpy`, `fastGWA`, `COJO`, `SAIGE`, or `GWASCatalog` for the standard
         summary statistics format (also known as `ssf` or `gwas-ssf`).
         :param ld_store_files: The path to the LD matrices. This may be a wildcard to accommodate reading data
@@ -592,7 +592,11 @@ def read_ld(self, ld_store_paths):
             return
 
         if not iterable(ld_store_paths):
-            ld_store_files = get_filenames(ld_store_paths, extension='.zgroup')
+            if 's3://' in ld_store_paths:
+                from .utils.system_utils import glob_s3_path
+                ld_store_files = glob_s3_path(ld_store_paths)
+            else:
+                ld_store_files = get_filenames(ld_store_paths, extension='.zgroup')
         else:
             ld_store_files = ld_store_paths
 
@@ -634,6 +638,7 @@ def compute_ld(self,
                    dtype='int8',
                    compressor_name='zstd',
                    compression_level=7,
+                   compute_spectral_properties=False,
                    **ld_kwargs):
         """
         Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson
@@ -650,6 +655,7 @@ def compute_ld(self,
         and integer quantized data types int8 and int16).
         :param compressor_name: The name of the compression algorithm to use for the LD matrix.
         :param compression_level: The compression level to use for the entries of the LD matrix (1-9).
+        :param compute_spectral_properties: If True, compute the spectral properties of the LD matrix.
         :param ld_kwargs: keyword arguments for the various LD estimators. Consult
         the implementations of `WindowedLD`, `ShrinkageLD`, and `BlockLD` for details.
         """
@@ -663,6 +669,7 @@ def compute_ld(self,
                             dtype=dtype,
                             compressor_name=compressor_name,
                             compression_level=compression_level,
+                            compute_spectral_properties=compute_spectral_properties,
                             **ld_kwargs)
             for c, g in tqdm(sorted(self.genotype.items(), key=lambda x: x[0]),
                              total=len(self.genotype),
@@ -672,7 +679,8 @@ def compute_ld(self,
 
     def get_ld_matrices(self):
         """
-        :return: The LD matrices computed for each chromosome.
+        :return: A dictionary containing the chromosome ID as key and corresponding LD matrices
+        as value.
         """
         return self.ld
 
@@ -849,6 +857,7 @@ def to_individual_table(self):
         :return: A plink-style dataframe of individual IDs, in the form of
         Family ID (FID) and Individual ID (IID).
         """
+        assert self.sample_table is not None
 
         return self.sample_table.get_individual_table()
 
@@ -858,6 +867,8 @@ def to_phenotype_table(self):
         Individual ID (IID), and phenotype value.
         """
 
+        assert self.sample_table is not None
+
         return self.sample_table.get_phenotype_table()
 
     def to_snp_table(self, col_subset=None, per_chromosome=False, resource='auto'):
@@ -879,6 +890,7 @@ def to_snp_table(self, col_subset=None, per_chromosome=False, resource='auto'):
 
         # Sanity checks:
         assert resource in ('auto', 'genotype', 'ld', 'sumstats')
+
         if resource != 'auto':
             if resource == 'genotype' and self.genotype is None:
                 raise ValueError("Genotype matrix is not available!")

diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
@@ -343,6 +343,7 @@ def compute_ld(self,
                    dtype='int8',
                    compressor_name='zstd',
                    compression_level=7,
+                   compute_spectral_properties=False,
                    **ld_kwargs):
         """
 
@@ -359,6 +360,8 @@ def compute_ld(self,
         :param compression_level: The compression level for the Zarr array (1-9)
         :param ld_kwargs: keyword arguments for the various LD estimators. Consult
         the implementations of `WindowedLD`, `ShrinkageLD`, and `BlockLD` for details.
+        :param compute_spectral_properties: If True, compute and store information about the eigenvalues of
+        the LD matrix.
         """
 
         from .stats.ld.estimator import SampleLD, WindowedLD, ShrinkageLD, BlockLD
@@ -382,7 +385,8 @@ def compute_ld(self,
                               temp_dir=tmp_ld_dir.name,
                               dtype=dtype,
                               compressor_name=compressor_name,
-                              compression_level=compression_level)
+                              compression_level=compression_level,
+                              compute_spectral_properties=compute_spectral_properties)
 
     def set_sample_table(self, sample_table):
         """