From e6d4481194909dd1c146d6000a93ecd7c67b3287 Mon Sep 17 00:00:00 2001
From: Shadi <shadizabad1@gmail.com>
Date: Tue, 28 May 2024 12:38:23 -0400
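Subject: [PATCH] Updating defaults for LD matrix construction

Change the default storage options for LD matrices from
dtype='int16', compressor 'lz4', compression level 5 to
dtype='int8', compressor 'zstd', compression level 7. The new defaults
are applied consistently to the magenpy_ld script, the compute_ld
methods, the LDMatrix constructors, the LD estimators, and the helpers
in stats/ld/utils.py (including estimate_rows_per_chunk).

Other changes included in this patch:

- magenpy_ld now stores a 'Date' attribute (YYYY-MM-DD) on the LD
  store, unless a 'Date' entry was already given via --metadata.
- compute_ld_plink1p9 now writes the final Zarr store to
  <output_dir>/chr_<chromosome> instead of
  <output_dir>/ld/chr_<chromosome>. Callers that relied on the old
  'ld' sub-directory need to update their paths.
- Progress messages in GWADataLoader.read_genotypes now say
  "Reading genotype metadata" instead of "Reading BED file(s)".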
---
 bin/magenpy_ld                | 14 +++++++++++---
 magenpy/GWADataLoader.py      | 10 +++++-----
 magenpy/GenotypeMatrix.py     |  6 +++---
 magenpy/LDMatrix.py           | 24 ++++++++++++------------
 magenpy/stats/ld/estimator.py | 24 ++++++++++++------------
 magenpy/stats/ld/utils.py     | 16 ++++++++--------
 6 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/bin/magenpy_ld b/bin/magenpy_ld
index 22a1989..81b30c4 100644
--- a/bin/magenpy_ld
+++ b/bin/magenpy_ld
@@ -82,16 +82,16 @@ parser.add_argument('--metadata', dest='metadata', type=str,
 
 # Argument for the float precision:
 parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
-                    default='int16', help='The data type for the entries of the LD matrix.',
+                    default='int8', help='The data type for the entries of the LD matrix.',
                     choices={'float32', 'float64', 'int16', 'int8'})
 
 # Add arguments for the compressor:
 parser.add_argument('--compressor', dest='compressor', type=str,
-                    default='lz4', help='The compressor name or compression algorithm to use for the LD matrix.',
+                    default='zstd', help='The compressor name or compression algorithm to use for the LD matrix.',
                     choices={'lz4', 'zstd', 'gzip', 'zlib'})
 
 parser.add_argument('--compression-level', dest='compression_level', type=int,
-                    default=5, help='The compression level to use for the entries of the LD matrix (1-9).')
+                    default=7, help='The compression level to use for the entries of the LD matrix (1-9).')
 
 # Options for the various LD estimators:
 
@@ -229,6 +229,7 @@ ld_mat = g.compute_ld(args.estimator,
                       **ld_kwargs)
 
 # Store metadata (if provided):
+
 if args.metadata is not None:
     parsed_metadata = {
         k: v for entry in args.metadata.split(',') for k, v in [entry.strip().split('=')]
@@ -239,6 +240,13 @@ if args.metadata is not None:
         for k, v in parsed_metadata.items():
             ld_mat.set_store_attr(k, v)
 
+    if 'Date' not in parsed_metadata:
+        # Store the date when the computation was done:
+        ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
+
+else:
+    # Store the date when the computation was done:
+    ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
 
 # Clean up all intermediate files and directories:
 g.cleanup()
diff --git a/magenpy/GWADataLoader.py b/magenpy/GWADataLoader.py
index d27b1a1..ca6bb6c 100644
--- a/magenpy/GWADataLoader.py
+++ b/magenpy/GWADataLoader.py
@@ -415,13 +415,13 @@ def read_genotypes(self,
             gmat_class = plinkBEDGenotypeMatrix
 
         if self.verbose and len(bed_files) < 2:
-            print("> Reading BED file...")
+            print("> Reading genotype metadata...")
 
         self.genotype = {}
 
         for bfile in tqdm(bed_files,
                           total=len(bed_files),
-                          desc="Reading BED files",
+                          desc="Reading genotype metadata",
                           disable=not self.verbose or len(bed_files) < 2):
             # Read BED file and update the genotypes dictionary:
             self.genotype.update(gmat_class.from_file(bfile,
@@ -615,9 +615,9 @@ def release_ld(self):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
         Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson
diff --git a/magenpy/GenotypeMatrix.py b/magenpy/GenotypeMatrix.py
index df1ce34..c08eae6 100644
--- a/magenpy/GenotypeMatrix.py
+++ b/magenpy/GenotypeMatrix.py
@@ -341,9 +341,9 @@ def get_snp_attribute(self, attr):
     def compute_ld(self,
                    estimator,
                    output_dir,
-                   dtype='int16',
-                   compressor_name='lz4',
-                   compression_level=5,
+                   dtype='int8',
+                   compressor_name='zstd',
+                   compression_level=7,
                    **ld_kwargs):
         """
 
diff --git a/magenpy/LDMatrix.py b/magenpy/LDMatrix.py
index 0639fe3..5c8dfa5 100644
--- a/magenpy/LDMatrix.py
+++ b/magenpy/LDMatrix.py
@@ -116,9 +116,9 @@ def from_csr(cls,
                  csr_mat,
                  store_path,
                  overwrite=False,
-                 dtype='int16',
-                 compressor_name='lz4',
-                 compression_level=5):
+                 dtype='int8',
+                 compressor_name='zstd',
+                 compression_level=7):
         """
         Initialize an LDMatrix object from a sparse CSR matrix.
 
@@ -171,9 +171,9 @@ def from_plink_table(cls,
                          store_path,
                          pandas_chunksize=None,
                          overwrite=False,
-                         dtype='int16',
-                         compressor_name='lz4',
-                         compression_level=5):
+                         dtype='int8',
+                         compressor_name='zstd',
+                         compression_level=7):
         """
         Construct a Zarr LD matrix using LD tables generated by plink1.9.
 
@@ -260,9 +260,9 @@ def from_dense_zarr_matrix(cls,
                                store_path,
                                overwrite=False,
                                delete_original=False,
-                               dtype='int16',
-                               compressor_name='lz4',
-                               compression_level=5):
+                               dtype='int8',
+                               compressor_name='zstd',
+                               compression_level=7):
         """
          Initialize a new LD matrix object using a Zarr array object. This method is
          useful for converting a dense LD matrix computed using Dask (or other distributed computing
@@ -359,9 +359,9 @@ def from_ragged_zarr_matrix(cls,
                                 store_path,
                                 overwrite=False,
                                 delete_original=False,
-                                dtype='int16',
-                                compressor_name='lz4',
-                                compression_level=5):
+                                dtype='int8',
+                                compressor_name='zstd',
+                                compression_level=7):
         """
         Initialize a new LD matrix object using a Zarr array object
         conforming to the old LD Matrix format from magenpy v<=0.0.12.
diff --git a/magenpy/stats/ld/estimator.py b/magenpy/stats/ld/estimator.py
index f1ae261..b577470 100644
--- a/magenpy/stats/ld/estimator.py
+++ b/magenpy/stats/ld/estimator.py
@@ -61,9 +61,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
         A utility method to compute the LD matrix and store in Zarr array format.
         The computes the LD matrix and stores it in Zarr array format, set its attributes,
@@ -238,9 +238,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
 
         Compute the windowed LD matrix and store in Zarr array format.
@@ -346,9 +346,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5,
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7,
                 chunk_size=1000):
         """
 
@@ -465,9 +465,9 @@ def compute(self,
                 temp_dir='temp',
                 overwrite=True,
                 delete_original=True,
-                dtype='int16',
-                compressor_name='lz4',
-                compression_level=5):
+                dtype='int8',
+                compressor_name='zstd',
+                compression_level=7):
         """
 
         Compute the block-based LD matrix and store in Zarr array format.
diff --git a/magenpy/stats/ld/utils.py b/magenpy/stats/ld/utils.py
index 4bdb28a..060d57b 100644
--- a/magenpy/stats/ld/utils.py
+++ b/magenpy/stats/ld/utils.py
@@ -233,7 +233,7 @@ def harmonic_series_sum(n):
     return ld_mat_obj
 
 
-def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
+def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
     """
     Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
     The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
@@ -255,9 +255,9 @@ def compute_ld_plink1p9(genotype_matrix,
                         output_dir,
                         temp_dir='temp',
                         overwrite=True,
-                        dtype='int16',
-                        compressor_name='lz4',
-                        compression_level=5):
+                        dtype='int8',
+                        compressor_name='zstd',
+                        compression_level=7):
 
     """
     Compute LD matrices using plink 1.9.
@@ -354,7 +354,7 @@ def compute_ld_plink1p9(genotype_matrix,
     plink1.execute(cmd)
 
     # Convert from PLINK LD files to Zarr:
-    fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome))
+    fin_ld_store = osp.join(output_dir, 'chr_' + str(genotype_matrix.chromosome))
 
     # Compute the pandas chunk_size
     # The goal of this is to process chunks of the LD table without overwhelming memory resources:
@@ -382,9 +382,9 @@ def compute_ld_xarray(genotype_matrix,
                       temp_dir='temp',
                       overwrite=True,
                       delete_original=True,
-                      dtype='int16',
-                      compressor_name='lz4',
-                      compression_level=5):
+                      dtype='int8',
+                      compressor_name='zstd',
+                      compression_level=7):
 
     """
     Compute the Linkage Disequilibrium matrix or snp-by-snp