From c06253f98e455632da9e11c7594e00700afebd7c Mon Sep 17 00:00:00 2001 From: Mateusz Jakub Fila <37295697+m-fila@users.noreply.github.com> Date: Mon, 17 Feb 2025 11:36:28 +0100 Subject: [PATCH] add globbing support in `makeReader` and `CreateDataSource` (#729) * add globbing support in `Reader` and `DataSource` * add docstrings, error handling, throwing on unsupported platforms * exclude test from sanitizers since otherwise its dependencies are not met * remove obsolete include * add reading multiple files and glob patterns with `get_reader` * fix broken syntax in branch for no-glob platforms * move glob to utils * Revert "add reading multiple files and glob patterns with `get_reader`" This reverts commit 60c9b023d7e746a646643f61241206d8b2ce0550. * test standalone glob utility * add reading multiple files in `get_reader`, glob utilities accessible in python * import whole root in utils * fix typos, comments, loading glob header in python * fix glob namespace in selection.xml * define PODIO_HAS_GLOB_SUPPORT --- include/podio/DataSource.h | 3 +- include/podio/Reader.h | 5 ++- include/podio/utilities/Glob.h | 42 ++++++++++++++++++++ python/podio/reading.py | 30 +++++++++----- python/podio/utils.py | 29 ++++++++++++++ src/CMakeLists.txt | 2 + src/DataSource.cc | 5 +-- src/Glob.cc | 64 ++++++++++++++++++++++++++++++ src/Reader.cc | 4 +- src/selection.xml | 3 ++ tests/CTestCustom.cmake | 2 + tests/root_io/CMakeLists.txt | 6 +++ tests/root_io/read_datasource.py | 4 ++ tests/root_io/read_glob.cpp | 40 +++++++++++++++++++ tests/root_io/read_multiple.py | 13 ++++++ tests/root_io/write_frame_root.cpp | 8 +++- 16 files changed, 242 insertions(+), 18 deletions(-) create mode 100644 include/podio/utilities/Glob.h create mode 100644 src/Glob.cc create mode 100644 tests/root_io/read_glob.cpp create mode 100644 tests/root_io/read_multiple.py diff --git a/include/podio/DataSource.h b/include/podio/DataSource.h index ba00fe4e4..d44518c20 100644 --- a/include/podio/DataSource.h +++ 
b/include/podio/DataSource.h @@ -152,9 +152,10 @@ class DataSource : public ROOT::RDF::RDataSource { ROOT::RDataFrame CreateDataFrame(const std::vector& filePathList); /// -/// @brief Create RDataFrame from a Podio file. +/// @brief Create RDataFrame from a Podio file or glob pattern matching multiple Podio files. /// /// @param[in] filePath File path from which the RDataFrame will be created. +/// The file path can include glob patterns to match multiple files. /// @return RDataFrame created from input file list. /// ROOT::RDataFrame CreateDataFrame(const std::string& filePath); diff --git a/include/podio/Reader.h b/include/podio/Reader.h index d8209197e..246e9cddd 100644 --- a/include/podio/Reader.h +++ b/include/podio/Reader.h @@ -206,12 +206,13 @@ class Reader { } }; -/// Create a Reader is able to read the file +/// Create a Reader that is able to read the file or files matching a glob pattern /// /// This will inspect the filename as well as peek at the file contents to /// instantiate the correct low level reader to open and read the file /// -/// @param filename The (path to the) file to read from +/// @param filename The (path to the) file to read from. +/// The file path can include glob patterns to match multiple files. /// /// @returns A Reader that has been initialized and that can be used for reading /// data from the passed file diff --git a/include/podio/utilities/Glob.h b/include/podio/utilities/Glob.h new file mode 100644 index 000000000..6b5d752ac --- /dev/null +++ b/include/podio/utilities/Glob.h @@ -0,0 +1,42 @@ +#ifndef PODIO_UTILITIES_GLOB_H +#define PODIO_UTILITIES_GLOB_H +#include +#include + +// Support for glob expansion. +#if __has_include() + #define PODIO_HAS_GLOB_SUPPORT 1 +#else + #define PODIO_HAS_GLOB_SUPPORT 0 +#endif + +namespace podio::utils { +/// @brief Expands a given glob pattern into a list of matching file paths. 
+/// +/// This function takes a glob pattern as input and returns a vector of strings +/// containing the paths that match the pattern. It supports standard glob rules +/// extended with tilde expansion and brace expansion. If the pattern doesn't +/// contain any wildcards then it is placed in the returned vector as is. Paths +/// that cannot be accessed are displayed on std::cerr, but the expansion process +/// is not aborted. On platforms without <glob.h> no expansion is done: a plain +/// path is returned as the only element, a glob pattern raises an exception. +/// +/// @param pattern The glob pattern to expand. +/// @return A vector of strings containing the matching file paths. +/// +/// @throws std::runtime_error If no matches are found or if there is an error +/// during glob expansion. +std::vector expand_glob(const std::string& pattern); + +/// @brief Checks if a given pattern is a glob pattern. +/// +/// This function determines whether the provided pattern contains any standard +/// glob or brace expansion wildcards. +/// +/// @param pattern The pattern to check. +/// @return true if the pattern is a glob pattern, false otherwise. +bool is_glob_pattern(const std::string& pattern); + +} // namespace podio::utils + +#endif // PODIO_UTILITIES_GLOB_H diff --git a/python/podio/reading.py b/python/podio/reading.py index 0ae43facf..cd750f34b 100644 --- a/python/podio/reading.py +++ b/python/podio/reading.py @@ -50,32 +50,42 @@ def _determine_root_format(filename): return RootFileFormat.RNTUPLE -def get_reader(filename): - """Get an appropriate reader for the passed file. +def get_reader(filenames): + """Get an appropriate reader for the passed files. + + The reader is inferred from the first file if multiple are given. + All files are assumed to be of the same I/O format. Args: - filename (str): The input file + filenames (str or list[str]): The input file(s) Returns: root_io.[Legacy]Reader, sio_io.[Legacy]Reader: an initialized reader that - is able to process the input file. 
+ is able to process the input file(s). Raises: - ValueError: If the file cannot be recognized, or if podio has not been + ValueError: If the files cannot be recognized, or if podio has not been built with the necessary backend I/O support + IndexError: If filenames is an empty list """ + + if isinstance(filenames, str): + filename = filenames + else: + filename = filenames[0] + if filename.endswith(".sio"): if _is_frame_sio_file(filename): - return sio_io.Reader(filename) - return sio_io.LegacyReader(filename) + return sio_io.Reader(filenames) + return sio_io.LegacyReader(filenames) if filename.endswith(".root"): root_flavor = _determine_root_format(filename) if root_flavor == RootFileFormat.TTREE: - return root_io.Reader(filename) + return root_io.Reader(filenames) if root_flavor == RootFileFormat.RNTUPLE: - return root_io.RNTupleReader(filename) + return root_io.RNTupleReader(filenames) if root_flavor == RootFileFormat.LEGACY: - return root_io.LegacyReader(filename) + return root_io.LegacyReader(filenames) raise ValueError("file must end on .root or .sio") diff --git a/python/podio/utils.py b/python/podio/utils.py index 5b7221c88..57e8e0c9d 100644 --- a/python/podio/utils.py +++ b/python/podio/utils.py @@ -5,6 +5,8 @@ from collections.abc import Iterable from pathlib import Path +from ROOT import podio + def convert_to_str_paths(filenames): """Converts filenames to string paths, handling both string and pathlib.Path objects and @@ -22,3 +24,30 @@ def convert_to_str_paths(filenames): return [os.fspath(fn) for fn in filenames] return [os.fspath(filenames)] + + +def expand_glob(pattern): + """ + Expands a given glob pattern into a list of matching file paths. + + This function takes a glob pattern as input and returns a list of strings + containing the paths that match the pattern. It supports standard glob rules + extended with tilde expansion and brace expansion. If the pattern doesn't + contain any wildcards, it is placed in the returned list as is. 
Paths that + cannot be accessed are displayed on stderr, but the expansion process is not + aborted. + + Args: + pattern (str): The glob pattern to expand. + + Returns: + list of str: A list of strings containing the matching file paths. + + Raises: + cppyy.gbl.std.runtime_error: If no matches are found or if there is an error during glob + expansion. + """ + return [str(x) for x in podio.utils.expand_glob(pattern)] + + +is_glob_pattern = podio.utils.is_glob_pattern diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0c07f674b..eec259834 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,6 +56,7 @@ SET(core_sources CollectionBufferFactory.cc MurmurHash3.cpp SchemaEvolution.cc + Glob.cc ) SET(core_headers @@ -69,6 +70,7 @@ SET(core_headers ${PROJECT_SOURCE_DIR}/include/podio/utilities/DatamodelRegistryIOHelpers.h ${PROJECT_SOURCE_DIR}/include/podio/GenericParameters.h ${PROJECT_SOURCE_DIR}/include/podio/LinkCollection.h + ${PROJECT_SOURCE_DIR}/include/podio/utilities/Glob.h ) PODIO_ADD_LIB_AND_DICT(podio "${core_headers}" "${core_sources}" selection.xml) diff --git a/src/DataSource.cc b/src/DataSource.cc index 4b8fac0af..bdff155fc 100644 --- a/src/DataSource.cc +++ b/src/DataSource.cc @@ -1,5 +1,6 @@ #include "podio/DataSource.h" #include "podio/Reader.h" +#include "podio/utilities/Glob.h" // podio #include @@ -13,9 +14,7 @@ #include namespace podio { -DataSource::DataSource(const std::string& filePath, int nEvents) : m_nSlots{1} { - m_filePathList.emplace_back(filePath); - SetupInput(nEvents); +DataSource::DataSource(const std::string& filePath, int nEvents) : DataSource(utils::expand_glob(filePath), nEvents) { } DataSource::DataSource(const std::vector& filePathList, int nEvents) : diff --git a/src/Glob.cc b/src/Glob.cc new file mode 100644 index 000000000..25d634b17 --- /dev/null +++ b/src/Glob.cc @@ -0,0 +1,64 @@ +#include "podio/utilities/Glob.h" +#include +#include +#include +#include +#if __has_include() + #include +#else + #include 
+#endif + +namespace podio::utils { + +bool is_glob_pattern(const std::string& pattern) { + bool escape = false; + for (auto c : pattern) { + if (escape) { + escape = false; + } else if (c == '\\') { + escape = true; + } else if (c == '*' || c == '?' || c == '[' || c == '{') { + return true; + } + } + return false; +} + +#if __has_include(<glob.h>) + +int glob_err_handler(const char* epath, int eerrno) { + std::cerr << "Glob expansion error accessing path: " << epath << " (error code: " << eerrno << ")\n"; + return 0; +} + +std::vector<std::string> expand_glob(const std::string& pattern) { + glob_t glob_result; + auto retv = glob(pattern.c_str(), GLOB_TILDE | GLOB_BRACE | GLOB_NOMAGIC, glob_err_handler, &glob_result); + if (retv == GLOB_NOMATCH) { + throw std::runtime_error("Glob expansion found no matches for pattern: " + pattern); + } else if (retv != 0) { + globfree(&glob_result); + throw std::runtime_error("Glob expansion error"); + } + std::vector<std::string> results; + results.reserve(glob_result.gl_pathc); + for (size_t i = 0; i < glob_result.gl_pathc; ++i) { + results.emplace_back(glob_result.gl_pathv[i]); + } + globfree(&glob_result); + return results; +} + +#else + +std::vector<std::string> expand_glob(const std::string& pattern) { + if (is_glob_pattern(pattern)) { + throw std::runtime_error("Glob expansion is not supported on this platform"); + } + return {pattern}; +} + +#endif // __has_include(<glob.h>) + +} // namespace podio::utils diff --git a/src/Reader.cc b/src/Reader.cc index 5aeb367f5..71bf19b6c 100644 --- a/src/Reader.cc +++ b/src/Reader.cc @@ -8,6 +8,8 @@ #include "podio/SIOReader.h" #endif +#include "podio/utilities/Glob.h" + #include "TFile.h" #include "TKey.h" #include @@ -19,7 +21,7 @@ Reader::Reader(std::unique_ptr reader) : m_self(std::make_unique{filename}); + return makeReader(utils::expand_glob(filename)); } Reader makeReader(const std::vector& filenames) { diff --git a/src/selection.xml b/src/selection.xml index 477c9f7fa..c3541060f 100644 --- a/src/selection.xml +++ b/src/selection.xml 
@@ -46,5 +46,8 @@ + + + diff --git a/tests/CTestCustom.cmake b/tests/CTestCustom.cmake index bce6fc5fd..628f296d8 100644 --- a/tests/CTestCustom.cmake +++ b/tests/CTestCustom.cmake @@ -23,6 +23,8 @@ if ((NOT "@FORCE_RUN_ALL_TESTS@" STREQUAL "ON") AND (NOT "@USE_SANITIZER@" STREQ write_frame_root read_frame_root + read_glob + read_python_multiple write_interface_root read_interface_root diff --git a/tests/root_io/CMakeLists.txt b/tests/root_io/CMakeLists.txt index b32889e49..d1da23370 100644 --- a/tests/root_io/CMakeLists.txt +++ b/tests/root_io/CMakeLists.txt @@ -8,6 +8,7 @@ set(root_dependent_tests read_and_write_frame_root.cpp write_interface_root.cpp read_interface_root.cpp + read_glob.cpp ) if(ENABLE_RNTUPLE) set(root_dependent_tests @@ -39,11 +40,16 @@ set_tests_properties( read_frame_root read_frame_root_multiple read_and_write_frame_root + read_glob PROPERTIES DEPENDS write_frame_root ) +add_test(NAME read_python_multiple COMMAND python3 ${PROJECT_SOURCE_DIR}/tests/root_io/read_multiple.py) +PODIO_SET_TEST_ENV(read_python_multiple) +set_property(TEST read_python_multiple PROPERTY DEPENDS write_frame_root) + if(ENABLE_RNTUPLE) set_property(TEST read_rntuple PROPERTY DEPENDS write_rntuple) set_property(TEST read_interface_rntuple PROPERTY DEPENDS write_interface_rntuple) diff --git a/tests/root_io/read_datasource.py b/tests/root_io/read_datasource.py index 4b6318dac..b8528568d 100644 --- a/tests/root_io/read_datasource.py +++ b/tests/root_io/read_datasource.py @@ -10,3 +10,7 @@ rdf = CreateDataFrame("example_frame.root") assert rdf.Count().GetValue() == 10 + +rdf = CreateDataFrame("example_frame_?.root") + +assert rdf.Count().GetValue() == 20 diff --git a/tests/root_io/read_glob.cpp b/tests/root_io/read_glob.cpp new file mode 100644 index 000000000..41fa0a51e --- /dev/null +++ b/tests/root_io/read_glob.cpp @@ -0,0 +1,40 @@ +#include "podio/Reader.h" +#include "podio/utilities/Glob.h" +#if PODIO_ENABLE_DATASOURCE + #include "podio/DataSource.h" +#endif + 
+#define ASSERT(condition, msg) \ + if (!(condition)) { \ + throw std::runtime_error(msg); \ + } + +int main() { + const auto pattern = "example_frame_?.root"; + const auto expected_events = 20; + // standalone globbing + + ASSERT(podio::utils::is_glob_pattern(pattern), "Failed to recognize glob pattern"); + auto files = podio::utils::expand_glob(pattern); + ASSERT(files.size() == 2, "Glob expanded to a wrong number of files"); + ASSERT(files[0] == "example_frame_0.root", "Glob expanded to an unexpected file"); + ASSERT(files[1] == "example_frame_1.root", "Glob expanded to an unexpected file"); + { + // externally resolved glob + const auto reader = podio::makeReader(files); + ASSERT((reader.getEvents() == expected_events), "Reader read invalid number of events"); +#if PODIO_ENABLE_DATASOURCE + auto rdf = podio::CreateDataFrame(files); + ASSERT(rdf.Count().GetValue() == expected_events, "DataSource read invalid number of events"); +#endif // PODIO_ENABLE_DATASOURCE + } + { + // implicit globbing + const auto reader = podio::makeReader(pattern); + ASSERT((reader.getEvents() == expected_events), "Reader read invalid number of events"); +#if PODIO_ENABLE_DATASOURCE + auto rdf = podio::CreateDataFrame(pattern); + ASSERT(rdf.Count().GetValue() == expected_events, "DataSource read invalid number of events"); +#endif // PODIO_ENABLE_DATASOURCE + } +} diff --git a/tests/root_io/read_multiple.py b/tests/root_io/read_multiple.py new file mode 100644 index 000000000..a1be9b24a --- /dev/null +++ b/tests/root_io/read_multiple.py @@ -0,0 +1,13 @@ +"""Small test case for checking get_reader working with +a single file, list of files, and a glob pattern""" + +import podio + +assert podio.utils.is_glob_pattern("example_frame_?.root") +files = podio.utils.expand_glob("example_frame_?.root") +assert files == ["example_frame_0.root", "example_frame_1.root"] + +reader = podio.reading.get_reader("example_frame.root") +assert len(reader.get("events")) == 10 +reader = 
podio.reading.get_reader(files) +assert len(reader.get("events")) == 20 diff --git a/tests/root_io/write_frame_root.cpp b/tests/root_io/write_frame_root.cpp index 2ad648e36..cda73f007 100644 --- a/tests/root_io/write_frame_root.cpp +++ b/tests/root_io/write_frame_root.cpp @@ -2,7 +2,13 @@ #include "podio/ROOTWriter.h" +#include + int main(int, char**) { - write_frames("example_frame.root"); + const auto filename = "example_frame.root"; + write_frames(filename); + // copy file multiple times for tests with glob + std::filesystem::copy_file(filename, "example_frame_0.root", std::filesystem::copy_options::overwrite_existing); + std::filesystem::copy_file(filename, "example_frame_1.root", std::filesystem::copy_options::overwrite_existing); return 0; }