|
| 1 | +############################################################################### |
| 2 | +# Caterva2 - On demand access to remote Blosc2 data repositories |
| 3 | +# |
| 4 | +# Copyright (c) 2023 ironArray SLU <contact@ironarray.io> |
| 5 | +# https://www.blosc.org |
| 6 | +# License: GNU Affero General Public License v3.0 |
| 7 | +# See LICENSE.txt for details about copyright and rights to use. |
| 8 | +############################################################################### |
| 9 | + |
| 10 | +# This script encodes the first nframes of the kevlar HDF5 dataset using Blosc2 with different codecs, |
| 11 | +# including the GROK codec for JPEG2000 compression. The output is saved to a new HDF5 file |
| 12 | +# that can be opened with h5py and hdf5plugin; use b2h5py/PyTables to enable optimized slicing. |
| 13 | +# |
| 14 | +# For this to work, you will need to download the kevlar.h5 file from the kevlar repository: |
| 15 | +# http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5 |
| 16 | +# More info on the kevlar dataset can be found here: |
| 17 | +# https://hdf5plugin.readthedocs.io/en/stable/hdf5plugin_EuropeanHUG2022.html#A-look-at-performances-on-a-single-use-case |
| 18 | + |
| 19 | +import os |
| 20 | +import sys |
| 21 | + |
| 22 | +import blosc2 |
| 23 | +import blosc2_grok |
| 24 | +import h5py |
| 25 | +import hdf5plugin |
| 26 | +import numpy as np |
| 27 | + |
# ---------------------------------------------------------------------------
# Script configuration (module-level names are read by the code further down).
# ---------------------------------------------------------------------------
clevel = 5  # compression level, e.g., 0-9, where 0 is no compression and 9 is maximum compression
cratio = 10  # compression ratio for JPEG2000 (grok)
nframes = 3  # number of frames to encode by default, e.g., 3, 10, 100, etc.

# An optional first command-line argument overrides the default frame count.
# Only strictly positive integers are accepted: a negative value would act as a
# "from the end" bound when the input is sliced with `dset[:nframes]`, and zero
# frames leaves nothing to encode. Rejected values keep the default.
if len(sys.argv) > 1:
    try:
        requested = int(sys.argv[1])
        if requested < 1:
            raise ValueError(f"non-positive frame count: {requested}")
    except ValueError:
        print(f"Invalid number of frames: {sys.argv[1]}. Using default: {nframes} frames.")
    else:
        nframes = requested

fname_in = "kevlar.h5"  # input file with the kevlar dataset
fname_out = f"kevlar-blosc2-{nframes}frames.h5"  # output file, named after the frame count

# Fail early, with download instructions, when the input dataset is missing.
if not os.path.exists(fname_in):
    raise FileNotFoundError(
        f"Input file {fname_in} does not exist\n"
        "Please download it from the kevlar repository at:"
        " http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5"
    )
| 46 | + |
| 47 | + |
def encode_frames_grok(dset, fw):
    """Encode the leading frames of *dset* with blosc2-grok and store them in *fw*.

    Each frame is compressed on its own with the GROK (JPEG2000) codec and
    written as a pre-compressed chunk into the ``cname-grok`` dataset of the
    ``/data`` group of *fw* (the group is created if it does not exist yet).

    Parameters
    ----------
    dset : h5py dataset
        Source dataset; its first axis is treated as the frame index.
    fw : h5py.File
        Output file, opened for writing.

    Returns
    -------
    The newly created ``cname-grok`` dataset.

    Notes
    -----
    Reads the module-level ``nframes`` (frames to encode) and ``cratio``
    (target compression ratio). The number of frames is limited to what
    *dset* actually contains.
    """
    # Define the compression parameters for Blosc2.
    # Disable the filters and the splitmode, because these don't work with grok.
    cparams = {
        "codec": blosc2.Codec.GROK,
        "filters": [],
        "splitmode": blosc2.SplitMode.NEVER_SPLIT,
    }
    # Set the parameters that will be used by grok
    grok_params = {
        "cod_format": blosc2_grok.GrkFileFmt.GRK_FMT_JP2,
        "num_threads": 1,  # this does not have any effect (grok should work in multithreading mode)
        "quality_mode": "rates",
        "quality_layers": np.array([cratio], dtype=np.float64),
    }
    blosc2_grok.set_params_defaults(**grok_params)

    # Never try to encode more frames than the source holds.
    frame_count = min(nframes, dset.shape[0])
    frame_shape = tuple(dset.shape[1:])

    # Write through the file handle that was passed in, not a module global.
    group = fw.require_group("/data")

    b2comp = hdf5plugin.Blosc2()  # just for identification, no compression algorithm specified
    dset_out = group.create_dataset(
        "cname-grok",
        (frame_count,) + frame_shape,
        dset.dtype,
        chunks=(1,) + frame_shape,  # chunk size of 1 frame
        **b2comp,
    )

    # Chunk offsets are (frame index, 0, 0, ...) for whatever rank the data has.
    trailing_zeros = (0,) * len(frame_shape)
    for i in range(frame_count):
        im = dset[i : i + 1]
        # Transform the numpy array to a blosc2 array. This is where compression happens.
        b2im = blosc2.asarray(im, chunks=im.shape, blocks=im.shape, cparams=cparams)
        # Write the already-compressed frame to disk as one raw chunk.
        dset_out.id.write_direct_chunk((i,) + trailing_zeros, b2im.schunk.to_cframe())

    return dset_out
| 82 | + |
| 83 | + |
# Read the source frames and write one dataset per codec into the output file.
with h5py.File(fname_in, "r") as fr:
    dset = fr["/entry/data/data"]
    with h5py.File(fname_out, "w") as fw:
        g = fw.create_group("/data")
        single_frame_chunk = (1,) + dset.shape[1:]  # chunk size of 1 frame
        for cname in ("blosclz", "lz4", "zstd", "grok"):
            if cname != "grok":
                # Regular Blosc2 codecs are handled by the hdf5plugin.Blosc2 filter.
                codec_filter = hdf5plugin.Blosc2(
                    cname=cname,
                    clevel=clevel,
                    filters=hdf5plugin.Blosc2.BITSHUFFLE,
                )
                dset_out = g.create_dataset(
                    f"cname-{cname}",
                    data=dset[:nframes],
                    dtype=dset.dtype,
                    chunks=single_frame_chunk,
                    **codec_filter,
                )
            else:
                # The grok codec is not available in the hdf5plugin.Blosc2 class,
                # so the frames are encoded with blosc2 directly.
                dset_out = encode_frames_grok(dset, fw)
            print("dset ready:", fw.filename, dset_out)
0 commit comments