
Commit

Small update to optimize memory usage and fix a few bugs.
shz9 committed May 16, 2024
1 parent 9fde47f commit 2e8013c
Showing 12 changed files with 123 additions and 45 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.1.3] - 2024-05-15

### Changed

- Updated the logic of `detect_outliers` in the phenotype transforms so that it matches the function
name (previously it returned `True` for inliers).
- Updated `quantize` and `dequantize` to minimize data copying as much as possible.
- Updated `LDMatrix.load_rows()` method to minimize data copying.
- Fixed bug in `LDMatrix.n_neighbors` implementation.
- Updated `dask` version in `requirements.txt` to avoid installing `dask-expr`.


### Added

- Added `get_peak_memory_usage` to `system_utils` to inspect peak memory usage of a process.
- Added a placeholder method to perform QC on `SumstatsTable` objects (still to be implemented).

## [0.1.2] - 2024-04-24

### Changed
22 changes: 11 additions & 11 deletions bin/magenpy_ld
@@ -31,17 +31,17 @@ from magenpy.utils.system_utils import valid_url
from magenpy.GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix

print(fr"""
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Compute LD matrix and output in Zarr format >
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Compute LD matrix and output in Zarr format >
""")

parser = argparse.ArgumentParser(description="""
22 changes: 11 additions & 11 deletions bin/magenpy_simulate
@@ -33,17 +33,17 @@ import argparse


print(fr"""
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Simulate complex quantitative or case-control traits >
********************************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
********************************************************
< Simulate complex quantitative or case-control traits >
""")

# --------- Options ---------
23 changes: 16 additions & 7 deletions magenpy/LDMatrix.py
@@ -122,6 +122,8 @@ def from_csr(cls,
"""
Initialize an LDMatrix object from a sparse CSR matrix.
TODO: Determine the chunksize based on the avg neighborhood size?
:param csr_mat: The sparse CSR matrix.
:param store_path: The path to the Zarr LD store where the data will be stored.
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -173,8 +175,9 @@ def from_plink_table(cls,
compressor_name='lz4',
compression_level=5):
"""
Construct a Zarr LD matrix using output tables from plink1.9.
This class method takes the following inputs:
Construct a Zarr LD matrix using LD tables generated by plink1.9.
TODO: Determine the chunksize based on the avg neighborhood size?
:param plink_ld_file: The path to the plink LD table file.
:param snps: An iterable containing the list of SNPs in the LD matrix.
@@ -265,6 +268,8 @@ def from_dense_zarr_matrix(cls,
useful for converting a dense LD matrix computed using Dask (or other distributed computing
software) to a sparse or banded one.
TODO: Determine the chunksize based on the avg neighborhood size?
:param dense_zarr: The path to the dense Zarr array object.
:param ld_boundaries: The LD boundaries for each SNP in the LD matrix (delineates the indices of
the leftmost and rightmost neighbors of each SNP).
@@ -364,6 +369,8 @@ def from_ragged_zarr_matrix(cls,
This utility function will also copy some of the stored attributes
associated with the matrix in the old format.
TODO: Determine the chunksize based on the avg neighborhood size?
:param ragged_zarr: The path to the ragged Zarr array object.
:param store_path: The path where to store the new LD matrix.
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -722,7 +729,7 @@ def n_neighbors(self):
:return: The number of variants in the LD window for each SNP.
"""
return self.window_size()
return self.window_size

@property
def csr_matrix(self):
@@ -1150,8 +1157,10 @@ def low_memory_load(self, dtype=None):
from .stats.ld.c_utils import filter_ut_csr_matrix_low_memory

data_mask, indptr = filter_ut_csr_matrix_low_memory(indptr, mask)
# Unfortunately, .vindex is very slow in Zarr right now (~order of magnitude)
# So for now, we load the entire data array before performing the mask selection:
# .oindex and .vindex are slow and likely convert to integer indices in the background,
# which unnecessarily increases memory usage. Unfortunately, here we have to load the entire
# data array and index it using the boolean array afterward.
# Something to be improved in the future...
data = self._zg['matrix/data'][:][data_mask]
else:
data = self._zg['matrix/data'][:]
@@ -1282,7 +1291,7 @@ def load_rows(self,
mat.data[mat.data == 0] = invalid_value

# Add the matrix transpose to make it symmetric:
mat = (mat + mat.T).astype(dtype)
mat += mat.T

# If the user requested filling the diagonals, do it here:
if fill_diag:
@@ -1458,7 +1467,7 @@ def __iter__(self):
TODO: Add a flag to allow for chunked iterator, with limited memory footprint.
"""
self.index = 0
self.load(return_symmetric=self.is_symmetric)
self.load(return_symmetric=self.is_symmetric, fill_diag=self.is_symmetric)
return self

def __next__(self):
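The new comment in `low_memory_load` describes a general Zarr access pattern: read the chunked array into memory once and apply the boolean mask with NumPy, rather than going through Zarr's `.oindex`/`.vindex` fancy indexing. A standalone sketch of that pattern is shown below (the store path, array shape, and chunk size are made up for illustration):

```python
# Sketch of the access pattern referenced in low_memory_load:
# one bulk read of the Zarr array, then in-memory boolean masking with NumPy.
import numpy as np
import zarr

# Hypothetical 1D array standing in for 'matrix/data' (path and sizes are illustrative only):
data = zarr.open("example_ld_store.zarr", mode="w",
                 shape=(1_000_000,), chunks=(100_000,), dtype="float64")
data[:] = np.random.rand(1_000_000)

keep_mask = np.random.rand(data.shape[0]) > 0.5  # boolean selection of entries to keep

# Fancy indexing via data.vindex / data.oindex tends to be slower here, so instead:
selected = data[:][keep_mask]                    # bulk read, then mask the NumPy array
```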
10 changes: 10 additions & 0 deletions magenpy/SumstatsTable.py
@@ -378,6 +378,16 @@ def set_sample_size(self, n):
"""
self.table['N'] = n

def run_quality_control(self, reference_table=None):
"""
Run quality control checks on the summary statistics table.
TODO: Implement quality control checks following recommendations given by Prive et al.:
https://doi.org/10.1016/j.xhgg.2022.100136
Give the user fine-grained control over which checks to run and which to skip.
Maybe move parts of this implementation to a module in `stats` (TBD)
"""
pass

def match(self, reference_table, correct_flips=True):
"""
Match the summary statistics table with a reference table,
4 changes: 2 additions & 2 deletions magenpy/__init__.py
@@ -16,8 +16,8 @@

from .utils.data_utils import *

__version__ = '0.1.2'
__release_date__ = 'April 2024'
__version__ = '0.1.3'
__release_date__ = 'May 2024'


config = configparser.ConfigParser()
13 changes: 8 additions & 5 deletions magenpy/stats/transforms/phenotype.py
@@ -35,7 +35,7 @@ def rint(phenotype, offset=3./8):
return norm.ppf((ranked_pheno - offset) / (len(ranked_pheno) - 2 * offset + 1))


def detect_outliers(phenotype, sigma_threshold=5):
def detect_outliers(phenotype, sigma_threshold=5, nan_policy='omit'):
"""
Detect samples with outlier phenotype values.
This function takes a vector of quantitative phenotypes,
@@ -45,11 +45,14 @@ def detect_outliers(phenotype, sigma_threshold=5):
:param phenotype: A numpy vector of continuous or quantitative phenotypes.
:param sigma_threshold: The multiple of standard deviations or sigmas after
which we consider the phenotypic value an outlier.
:param nan_policy: The policy to use when encountering NaN values in the phenotype vector.
By default, we compute the z-scores ignoring NaN values.
:return: A boolean array indicating whether the phenotype value is an outlier.
:return: A boolean array indicating whether the phenotype value is an outlier (i.e.
True indicates outlier).
"""
from scipy.stats import zscore
return np.abs(zscore(phenotype)) < sigma_threshold
return np.abs(zscore(phenotype, nan_policy=nan_policy)) > sigma_threshold


def standardize(phenotype):
@@ -109,8 +112,8 @@ def chained_transform(sample_table,
# Remove outlier samples whose phenotypes are more than `threshold` standard deviations from the mean:
if outlier_sigma_threshold is not None:
# Find outliers:
mask = detect_outliers(phenotype, outlier_sigma_threshold)
# Filter phenotype vector:
mask = ~detect_outliers(phenotype, outlier_sigma_threshold)
# Filter phenotype vector to exclude outliers:
phenotype = phenotype[mask]

return phenotype, mask
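A minimal sketch of the corrected `detect_outliers` semantics follows: the returned mask is `True` for outliers, so callers negate it to keep inliers, as `chained_transform` now does. The toy phenotype values are illustrative only.

```python
# Sketch: detect_outliers flags outliers with True; negate the mask to keep inliers.
import numpy as np
from magenpy.stats.transforms.phenotype import detect_outliers

# Toy phenotype: 10,000 standard-normal draws plus one extreme value.
phenotype = np.concatenate([np.random.normal(0., 1., size=10_000), [50.]])

outlier_mask = detect_outliers(phenotype, sigma_threshold=5)
print(outlier_mask.sum())           # typically 1: only the injected extreme value

inliers = phenotype[~outlier_mask]  # keep the non-outlier samples
```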
16 changes: 12 additions & 4 deletions magenpy/utils/model_utils.py
@@ -374,16 +374,21 @@ def quantize(floats, int_dtype=np.int8):
# See discussions on Scale Quantization Mapping.
scale = 2. / (info.max - (info.min + 1))

# Quantize the floats to int
return np.clip((floats / scale).round(), info.min, info.max).astype(int_dtype)
# Use in-place operations as much as possible
# (Currently, we copy the data twice)
scaled_floats = floats / scale
np.round(scaled_floats, out=scaled_floats)
np.clip(scaled_floats, info.min, info.max, out=scaled_floats)

return scaled_floats.astype(int_dtype)


def dequantize(ints, float_dtype=np.float32):
"""
Dequantize integers to the specified floating point type.
NOTE: Assumes original floats are in the range [-1, 1].
:param ints: A numpy array of integers
:param float_dtype: The floating point type to dequantize to.
:param float_dtype: The floating point data type to dequantize the integers to.
"""

# Infer the boundaries from the integer type
@@ -394,7 +399,10 @@ def dequantize(ints, float_dtype=np.float32):
# See discussions on Scale Quantization Mapping.
scale = 2. / (info.max - (info.min + 1))

return ints.astype(float_dtype) * scale
dq = ints.astype(float_dtype)
dq *= scale # in-place multiplication

return dq


def multinomial_rvs(n, p):
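A quick round-trip sketch of the updated quantization helpers, assuming (per the docstrings) that the input floats lie in [-1, 1]; the error bound follows from the int8 scale of 1/127 and is illustrative:

```python
# Round-trip sketch: quantize floats in [-1, 1] to int8 codes and dequantize back.
import numpy as np
from magenpy.utils.model_utils import quantize, dequantize

floats = np.random.uniform(-1., 1., size=1_000).astype(np.float32)

codes = quantize(floats, int_dtype=np.int8)            # int8 codes in [-127, 127]
recovered = dequantize(codes, float_dtype=np.float32)  # floats on a 1/127 grid

# With int8, the worst-case round-trip error is half a quantization step (~0.004):
assert np.max(np.abs(recovered - floats)) <= 0.5 / 127 + 1e-6
```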
34 changes: 32 additions & 2 deletions magenpy/utils/system_utils.py
@@ -4,6 +4,7 @@
import subprocess
import glob
import psutil
import sys


def available_cpu():
@@ -13,9 +14,38 @@ def available_cpu():
return psutil.cpu_count() - 1


def get_peak_memory_usage(include_children=False):
"""
Get the peak memory usage of the running process in megabytes (MB).
!!! warning
This function is only available on Unix-based systems for now.
:param include_children: A boolean flag to include the memory usage of the child processes.
:return: The peak memory usage of the running process in megabytes (MB).
"""

try:
import resource
except ImportError:
return

mem_usage_self = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

if include_children:
mem_usage_self = max(mem_usage_self, resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss)

if sys.platform != 'darwin':
mem_usage_self /= 1024
else:
mem_usage_self /= 1024**2

return mem_usage_self


def get_memory_usage():
"""
Get the memory usage of the current process in Mega Bytes (MB)
Get the current memory usage of the running process in megabytes (MB)
"""
process = psutil.Process(os.getpid())
mem_info = process.memory_info()
@@ -145,5 +175,5 @@ def delete_temp_files(prefix):
for f in glob.glob(f"{prefix}*"):
try:
os.remove(f)
except Exception as e:
except Exception:
continue
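A small usage sketch for the new `get_peak_memory_usage` helper (Unix only; the reported number depends on the workload and platform and is illustrative):

```python
# Sketch: report peak memory usage (in MB) after running a memory-heavy workload.
import numpy as np
from magenpy.utils.system_utils import get_peak_memory_usage

# Allocate roughly 80 MB so the peak is clearly visible:
arr = np.random.rand(10_000_000)

peak_mb = get_peak_memory_usage(include_children=True)
if peak_mb is not None:  # the helper returns None if the `resource` module is unavailable
    print(f"Peak memory usage: {peak_mb:.1f} MB")
```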
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
dask
dask<=2024.1.0 # Seen installation issues with newer versions
scipy
numpy
pandas
2 changes: 1 addition & 1 deletion setup.py
@@ -79,7 +79,7 @@ def no_cythonize(extensions, **_ignore):

setup(
name="magenpy",
version="0.1.2",
version="0.1.3",
author="Shadi Zabad",
author_email="shadi.zabad@mail.mcgill.ca",
description="Modeling and Analysis of Statistical Genetics data in python",
2 changes: 1 addition & 1 deletion tests/conda_manual_testing.sh
@@ -29,7 +29,7 @@ do

# Install magenpy
make clean
python -m pip install -v -e .[test]
python -m pip install --no-cache-dir -v -e .[test]

# List the installed packages:
python -m pip list

