
Commit

Small update to optimize memory usage and fix a few bugs.
shz9 committed May 16, 2024
1 parent 9fde47f commit 2e8013c
Showing 12 changed files with 123 additions and 45 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.1.3] - 2024-05-15

### Changed

- Updated the logic of `detect_outliers` in the phenotype transforms so that it matches the function
name (previously it returned `True` for inliers).
- Updated `quantize` and `dequantize` to minimize data copying as much as possible.
- Updated `LDMatrix.load_rows()` method to minimize data copying.
- Fixed bug in `LDMatrix.n_neighbors` implementation.
- Updated `dask` version in `requirements.txt` to avoid installing `dask-expr`.


### Added

- Added `get_peak_memory_usage` to `system_utils` to inspect peak memory usage of a process.
- Added a placeholder method to perform QC on `SumstatsTable` objects (still to be implemented).

## [0.1.2] - 2024-04-24

### Changed
22 changes: 11 additions & 11 deletions bin/magenpy_ld
@@ -31,17 +31,17 @@ from magenpy.utils.system_utils import valid_url
from magenpy.GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix

print(fr"""
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Compute LD matrix and output in Zarr format >
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Compute LD matrix and output in Zarr format >
""")

parser = argparse.ArgumentParser(description="""
22 changes: 11 additions & 11 deletions bin/magenpy_simulate
@@ -33,17 +33,17 @@ import argparse


print(fr"""
**********************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
**********************************************
< Simulate complex quantitative or case-control traits >
********************************************************
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
|___/ |_| |___/
Modeling and Analysis of Genetics data in python
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
Author: Shadi Zabad, McGill University
********************************************************
< Simulate complex quantitative or case-control traits >
""")

# --------- Options ---------
23 changes: 16 additions & 7 deletions magenpy/LDMatrix.py
@@ -122,6 +122,8 @@ def from_csr(cls,
"""
Initialize an LDMatrix object from a sparse CSR matrix.
TODO: Determine the chunksize based on the avg neighborhood size?
:param csr_mat: The sparse CSR matrix.
:param store_path: The path to the Zarr LD store where the data will be stored.
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -173,8 +175,9 @@ def from_plink_table(cls,
compressor_name='lz4',
compression_level=5):
"""
Construct a Zarr LD matrix using output tables from plink1.9.
This class method takes the following inputs:
Construct a Zarr LD matrix using LD tables generated by plink1.9.
TODO: Determine the chunksize based on the avg neighborhood size?
:param plink_ld_file: The path to the plink LD table file.
:param snps: An iterable containing the list of SNPs in the LD matrix.
@@ -265,6 +268,8 @@ def from_dense_zarr_matrix(cls,
useful for converting a dense LD matrix computed using Dask (or other distributed computing
software) to a sparse or banded one.
TODO: Determine the chunksize based on the avg neighborhood size?
:param dense_zarr: The path to the dense Zarr array object.
:param ld_boundaries: The LD boundaries for each SNP in the LD matrix (delineates the indices of
the leftmost and rightmost neighbors of each SNP).
@@ -364,6 +369,8 @@ def from_ragged_zarr_matrix(cls,
This utility function will also copy some of the stored attributes
associated with the matrix in the old format.
TODO: Determine the chunksize based on the avg neighborhood size?
:param ragged_zarr: The path to the ragged Zarr array object.
:param store_path: The path where to store the new LD matrix.
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -722,7 +729,7 @@ def n_neighbors(self):
:return: The number of variants in the LD window for each SNP.
"""
return self.window_size()
return self.window_size

@property
def csr_matrix(self):
@@ -1150,8 +1157,10 @@ def low_memory_load(self, dtype=None):
from .stats.ld.c_utils import filter_ut_csr_matrix_low_memory

data_mask, indptr = filter_ut_csr_matrix_low_memory(indptr, mask)
# Unfortunately, .vindex is very slow in Zarr right now (~order of magnitude)
# So for now, we load the entire data array before performing the mask selection:
# .oindex and .vindex are slow and likely convert to integer indices in the background,
# which unnecessarily increases memory usage. Unfortunately, here we have to load the entire
# data array and index it using the boolean array afterward.
# Something to be improved in the future...
data = self._zg['matrix/data'][:][data_mask]
else:
data = self._zg['matrix/data'][:]
@@ -1282,7 +1291,7 @@ def load_rows(self,
mat.data[mat.data == 0] = invalid_value

# Add the matrix transpose to make it symmetric:
mat = (mat + mat.T).astype(dtype)
mat += mat.T

# If the user requested filling the diagonals, do it here:
if fill_diag:
@@ -1458,7 +1467,7 @@ def __iter__(self):
TODO: Add a flag to allow for chunked iterator, with limited memory footprint.
"""
self.index = 0
self.load(return_symmetric=self.is_symmetric)
self.load(return_symmetric=self.is_symmetric, fill_diag=self.is_symmetric)
return self

def __next__(self):
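The new comment in `low_memory_load` describes a general Zarr access pattern: read the chunked array into memory once and apply the boolean mask with NumPy, rather than going through Zarr's `.oindex`/`.vindex` fancy indexing. A standalone sketch of that pattern is shown below (the store path, array shape, and chunk size are made up for illustration):

```python
# Sketch of the access pattern referenced in low_memory_load:
# one bulk read of the Zarr array, then in-memory boolean masking with NumPy.
import numpy as np
import zarr

# Hypothetical 1D array standing in for 'matrix/data' (path and sizes are illustrative only):
data = zarr.open("example_ld_store.zarr", mode="w",
                 shape=(1_000_000,), chunks=(100_000,), dtype="float64")
data[:] = np.random.rand(1_000_000)

keep_mask = np.random.rand(data.shape[0]) > 0.5  # boolean selection of entries to keep

# Fancy indexing via data.vindex / data.oindex tends to be slower here, so instead:
selected = data[:][keep_mask]                    # bulk read, then mask the NumPy array
```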
10 changes: 10 additions & 0 deletions magenpy/SumstatsTable.py
@@ -378,6 +378,16 @@ def set_sample_size(self, n):
"""
self.table['N'] = n

def run_quality_control(self, reference_table=None):
"""
Run quality control checks on the summary statistics table.
TODO: Implement quality control checks following recommendations given by Prive et al.:
https://doi.org/10.1016/j.xhgg.2022.100136
Give the user fine-grained control over which checks to run and which to skip.
Maybe move parts of this implementation to a module in `stats` (TBD)
"""
pass

def match(self, reference_table, correct_flips=True):
"""
Match the summary statistics table with a reference table,
4 changes: 2 additions & 2 deletions magenpy/__init__.py
@@ -16,8 +16,8 @@

from .utils.data_utils import *

__version__ = '0.1.2'
__release_date__ = 'April 2024'
__version__ = '0.1.3'
__release_date__ = 'May 2024'


config = configparser.ConfigParser()
13 changes: 8 additions & 5 deletions magenpy/stats/transforms/phenotype.py
@@ -35,7 +35,7 @@ def rint(phenotype, offset=3./8):
return norm.ppf((ranked_pheno - offset) / (len(ranked_pheno) - 2 * offset + 1))


def detect_outliers(phenotype, sigma_threshold=5):
def detect_outliers(phenotype, sigma_threshold=5, nan_policy='omit'):
"""
Detect samples with outlier phenotype values.
This function takes a vector of quantitative phenotypes,
@@ -45,11 +45,14 @@ def detect_outliers(phenotype, sigma_threshold=5):
:param phenotype: A numpy vector of continuous or quantitative phenotypes.
:param sigma_threshold: The multiple of standard deviations or sigmas after
which we consider the phenotypic value an outlier.
:param nan_policy: The policy to use when encountering NaN values in the phenotype vector.
By default, we compute the z-scores ignoring NaN values.
:return: A boolean array indicating whether the phenotype value is an outlier.
:return: A boolean array indicating whether the phenotype value is an outlier (i.e.
True indicates outlier).
"""
from scipy.stats import zscore
return np.abs(zscore(phenotype)) < sigma_threshold
return np.abs(zscore(phenotype, nan_policy=nan_policy)) > sigma_threshold


def standardize(phenotype):
@@ -109,8 +112,8 @@ def chained_transform(sample_table,
# Remove outlier samples whose phenotypes are more than `threshold` standard deviations from the mean:
if outlier_sigma_threshold is not None:
# Find outliers:
mask = detect_outliers(phenotype, outlier_sigma_threshold)
# Filter phenotype vector:
mask = ~detect_outliers(phenotype, outlier_sigma_threshold)
# Filter phenotype vector to exclude outliers:
phenotype = phenotype[mask]

return phenotype, mask
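A minimal sketch of the corrected `detect_outliers` semantics follows: the returned mask is `True` for outliers, so callers negate it to keep inliers, as `chained_transform` now does. The toy phenotype values are illustrative only.

```python
# Sketch: detect_outliers flags outliers with True; negate the mask to keep inliers.
import numpy as np
from magenpy.stats.transforms.phenotype import detect_outliers

# Toy phenotype: 10,000 standard-normal draws plus one extreme value.
phenotype = np.concatenate([np.random.normal(0., 1., size=10_000), [50.]])

outlier_mask = detect_outliers(phenotype, sigma_threshold=5)
print(outlier_mask.sum())           # typically 1: only the injected extreme value

inliers = phenotype[~outlier_mask]  # keep the non-outlier samples
```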
16 changes: 12 additions & 4 deletions magenpy/utils/model_utils.py
@@ -374,16 +374,21 @@ def quantize(floats, int_dtype=np.int8):
# See discussions on Scale Quantization Mapping.
scale = 2. / (info.max - (info.min + 1))

# Quantize the floats to int
return np.clip((floats / scale).round(), info.min, info.max).astype(int_dtype)
# Use in-place operations as much as possible
# (Currently, we copy the data twice)
scaled_floats = floats / scale
np.round(scaled_floats, out=scaled_floats)
np.clip(scaled_floats, info.min, info.max, out=scaled_floats)

return scaled_floats.astype(int_dtype)


def dequantize(ints, float_dtype=np.float32):
"""
Dequantize integers to the specified floating point type.
NOTE: Assumes original floats are in the range [-1, 1].
:param ints: A numpy array of integers
:param float_dtype: The floating point type to dequantize to.
:param float_dtype: The floating point data type to dequantize the integers to.
"""

# Infer the boundaries from the integer type
@@ -394,7 +399,10 @@ def dequantize(ints, float_dtype=np.float32):
# See discussions on Scale Quantization Mapping.
scale = 2. / (info.max - (info.min + 1))

return ints.astype(float_dtype) * scale
dq = ints.astype(float_dtype)
dq *= scale # in-place multiplication

return dq


def multinomial_rvs(n, p):
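A quick round-trip sketch of the updated quantization helpers, assuming (per the docstrings) that the input floats lie in [-1, 1]; the error bound follows from the int8 scale of 1/127 and is illustrative:

```python
# Round-trip sketch: quantize floats in [-1, 1] to int8 codes and dequantize back.
import numpy as np
from magenpy.utils.model_utils import quantize, dequantize

floats = np.random.uniform(-1., 1., size=1_000).astype(np.float32)

codes = quantize(floats, int_dtype=np.int8)            # int8 codes in [-127, 127]
recovered = dequantize(codes, float_dtype=np.float32)  # floats on a 1/127 grid

# With int8, the worst-case round-trip error is half a quantization step (~0.004):
assert np.max(np.abs(recovered - floats)) <= 0.5 / 127 + 1e-6
```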
34 changes: 32 additions & 2 deletions magenpy/utils/system_utils.py
@@ -4,6 +4,7 @@
import subprocess
import glob
import psutil
import sys


def available_cpu():
@@ -13,9 +14,38 @@ def available_cpu():
return psutil.cpu_count() - 1


def get_peak_memory_usage(include_children=False):
"""
Get the peak memory usage of the running process in megabytes (MB).
!!! warning
This function is only available on Unix-based systems for now.
:param include_children: A boolean flag to include the memory usage of the child processes.
:return: The peak memory usage of the running process in megabytes (MB).
"""

try:
import resource
except ImportError:
return

mem_usage_self = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

if include_children:
mem_usage_self = max(mem_usage_self, resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss)

if sys.platform != 'darwin':
mem_usage_self /= 1024
else:
mem_usage_self /= 1024**2

return mem_usage_self


def get_memory_usage():
"""
Get the memory usage of the current process in Mega Bytes (MB)
Get the current memory usage of the running process in megabytes (MB)
"""
process = psutil.Process(os.getpid())
mem_info = process.memory_info()
@@ -145,5 +175,5 @@ def delete_temp_files(prefix):
for f in glob.glob(f"{prefix}*"):
try:
os.remove(f)
except Exception as e:
except Exception:
continue
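A small usage sketch for the new `get_peak_memory_usage` helper (Unix only; the reported number depends on the workload and platform and is illustrative):

```python
# Sketch: report peak memory usage (in MB) after running a memory-heavy workload.
import numpy as np
from magenpy.utils.system_utils import get_peak_memory_usage

# Allocate roughly 80 MB so the peak is clearly visible:
arr = np.random.rand(10_000_000)

peak_mb = get_peak_memory_usage(include_children=True)
if peak_mb is not None:  # the helper returns None if the `resource` module is unavailable
    print(f"Peak memory usage: {peak_mb:.1f} MB")
```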
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
dask
dask<=2024.1.0 # Seen installation issues with newer versions
scipy
numpy
pandas
2 changes: 1 addition & 1 deletion setup.py
@@ -79,7 +79,7 @@ def no_cythonize(extensions, **_ignore):

setup(
name="magenpy",
version="0.1.2",
version="0.1.3",
author="Shadi Zabad",
author_email="shadi.zabad@mail.mcgill.ca",
description="Modeling and Analysis of Statistical Genetics data in python",
2 changes: 1 addition & 1 deletion tests/conda_manual_testing.sh
@@ -29,7 +29,7 @@ do

# Install magenpy
make clean
python -m pip install -v -e .[test]
python -m pip install --no-cache-dir -v -e .[test]

# List the installed packages:
python -m pip list

