Skip to content

Commit 03ea726

Browse files
committed
Add an example of encoding HDF5 data with Blosc2 (including jpeg2000)
1 parent 9fba477 commit 03ea726

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

examples/encode-hdf5-kevlar.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
###############################################################################
2+
# Caterva2 - On demand access to remote Blosc2 data repositories
3+
#
4+
# Copyright (c) 2023 ironArray SLU <contact@ironarray.io>
5+
# https://www.blosc.org
6+
# License: GNU Affero General Public License v3.0
7+
# See LICENSE.txt for details about copyright and rights to use.
8+
###############################################################################
9+
10+
# This script encodes the first nframes of the kevlar HDF5 dataset using Blosc2 with different codecs,
11+
# including the GROK codec for JPEG2000 compression. The output is saved to a new HDF5 file
12+
# that can be opened with h5py and hdf5plugin; use b2h5py/PyTables to enable optimized slicing.
13+
#
14+
# For this to work, you will need to download the kevlar.h5 file from the kevlar repository:
15+
# http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5
16+
# More info on the kevlar dataset can be found here:
17+
# https://hdf5plugin.readthedocs.io/en/stable/hdf5plugin_EuropeanHUG2022.html#A-look-at-performances-on-a-single-use-case
18+
19+
import os
20+
import sys
21+
22+
import blosc2
23+
import blosc2_grok
24+
import h5py
25+
import hdf5plugin
26+
import numpy as np
27+
28+
clevel = 5  # compression level, e.g., 0-9, where 0 is no compression and 9 is maximum compression
cratio = 10  # compression ratio for JPEG2000 (grok)
nframes = 3  # number of frames to encode by default, e.g., 3, 10, 100, etc.
if len(sys.argv) > 1:
    # An optional first command-line argument overrides the default frame count.
    try:
        requested = int(sys.argv[1])
    except ValueError:
        print(f"Invalid number of frames: {sys.argv[1]}. Using default: {nframes} frames.")
    else:
        if requested > 0:
            nframes = requested
        else:
            # Zero or negative counts cannot mean "the first N frames": a negative
            # value would turn `dset[:nframes]` into an end-relative slice.
            # Fall back to the default, exactly as for non-numeric input.
            print(f"Invalid number of frames: {sys.argv[1]}. Using default: {nframes} frames.")

fname_in = "kevlar.h5"  # input file with the kevlar dataset
fname_out = f"kevlar-blosc2-{nframes}frames.h5"  # output name records how many frames were encoded

# Fail early, with download instructions, when the input dataset is missing.
if not os.path.exists(fname_in):
    raise FileNotFoundError(
        f"Input file {fname_in} does not exist\n"
        "Please download it from the kevlar repository at:"
        " http://www.silx.org/pub/pyFAI/pyFAI_UM_2020/data_ID13/kevlar.h5"
    )
46+
47+
48+
def encode_frames_grok(dset, fw):
    """Encode frames with blosc2-grok and save to HDF5 file.

    The first ``nframes`` frames of *dset* are compressed one at a time with
    the Blosc2 GROK (JPEG2000) codec, and each serialized Blosc2 frame is
    written directly as one chunk of the output dataset.

    The module-level settings ``nframes`` (number of frames) and ``cratio``
    (grok rate) are read by this function.

    Parameters
    ----------
    dset
        Source dataset; its first axis is iterated frame by frame.
    fw
        Open, writable ``h5py.File``. The output dataset is created as
        ``cname-grok`` inside its ``/data`` group, which is created if it
        does not exist yet.

    Returns
    -------
    The newly created ``cname-grok`` dataset.
    """
    # Define the compression and decompression parameters for Blosc2.
    # Disable the filters and the splitmode, because these don't work with grok.
    cparams = {
        "codec": blosc2.Codec.GROK,
        "filters": [],
        "splitmode": blosc2.SplitMode.NEVER_SPLIT,
    }
    # Set the parameters that will be used by grok
    kwargs = {
        "cod_format": blosc2_grok.GrkFileFmt.GRK_FMT_JP2,
        "num_threads": 1,  # this does not have any effect (grok should work in multithreading mode)
        "quality_mode": "rates",
        "quality_layers": np.array([cratio], dtype=np.float64),
    }
    blosc2_grok.set_params_defaults(**kwargs)

    frame_shape = dset.shape[1:]
    # Resolve the destination group from the file handle we were given rather
    # than depending on a module-level global defined by the caller.
    group = fw.require_group("/data")

    b2comp = hdf5plugin.Blosc2()  # just for identification, no compression algorithm specified
    dset_out = group.create_dataset(
        "cname-grok",
        (nframes,) + frame_shape,
        dset.dtype,
        chunks=(1,) + frame_shape,  # chunk size of 1 frame
        **b2comp,
    )

    # Every chunk starts at the origin of the per-frame axes; only the frame
    # index varies. Deriving the zeros from the rank avoids hard-coding 3-D.
    chunk_origin = (0,) * len(frame_shape)
    for i in range(nframes):
        im = dset[i : i + 1]
        # Transform the numpy array to a blosc2 array. This is where compression happens.
        b2im = blosc2.asarray(im, chunks=im.shape, blocks=im.shape, cparams=cparams)
        # Write the already-compressed frame to disk as a single chunk
        dset_out.id.write_direct_chunk((i,) + chunk_origin, b2im.schunk.to_cframe())

    return dset_out
82+
83+
84+
# Encode the first `nframes` frames once per codec, one output dataset each.
with h5py.File(fname_in, "r") as fr:
    dset = fr["/entry/data/data"]
    with h5py.File(fname_out, "w") as fw:
        g = fw.create_group("/data")
        # Loop-invariant inputs: read the frames and compute the chunk shape
        # once instead of repeating both for every codec.
        frames = dset[:nframes]
        frame_chunks = (1,) + dset.shape[1:]  # chunk size of 1 frame
        for cname in ("blosclz", "lz4", "zstd", "grok"):
            if cname == "grok":
                # For grok, we need to encode the frames with the grok codec.
                # The grok codec is not available in the hdf5plugin.Blosc2 class,
                # so we use blosc2 directly.
                dset_out = encode_frames_grok(dset, fw)
            else:
                # For other codecs, we can use the hdf5plugin.Blosc2 class.
                b2comp = hdf5plugin.Blosc2(cname=cname, clevel=clevel, filters=hdf5plugin.Blosc2.BITSHUFFLE)
                dset_out = g.create_dataset(
                    f"cname-{cname}",
                    data=frames,
                    dtype=dset.dtype,
                    chunks=frame_chunks,
                    **b2comp,
                )
            # Report each dataset as soon as it has been written.
            print("dset ready:", fw.filename, dset_out)

0 commit comments

Comments
 (0)