Skip to content

Commit

Permalink
refac: Introduced parallel processing in gather_files for faster extr…
Browse files Browse the repository at this point in the history
…action of dataframe
  • Loading branch information
pmugudas committed Feb 19, 2025
1 parent 104470e commit 52a1ec0
Showing 1 changed file with 20 additions and 3 deletions.
23 changes: 20 additions & 3 deletions src/eotransform_pandas/filesystem/gather.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import os
from collections import defaultdict, deque
from concurrent.futures.process import ProcessPoolExecutor
from functools import partial
from pathlib import Path
from typing import Dict, Callable, Optional, Sequence, AnyStr, Pattern
from typing import Dict, Callable, Optional, Sequence, AnyStr, Pattern, Generator

import pandas as pd

Expand All @@ -10,16 +13,30 @@ def gather_files(root: Path, naming_convention: Callable[[str], Dict],
index: Optional[str] = None) -> pd.DataFrame:
directories = list()
_add_sub_folders(root, deque(sub_folder_structure or []), directories)
files = _files_generator(directories)
process_func = partial(_process_file, naming_convention=naming_convention)
with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
files_and_metadata = executor.map(process_func, files)

files = [(file, naming_convention(file.name)) for directory in directories for file in directory.glob("*.*")]
data = defaultdict(list)
for file, meta_data in files:
for file, meta_data in files_and_metadata:
if meta_data:
_add_file_and_meta_data(data, file, meta_data)

return _make_data_frame_from(data, index)


def _process_file(file, naming_convention):
    """Pair a file with the metadata parsed from its file name.

    Kept as a plain module-level function: ``gather_files`` binds
    ``naming_convention`` with ``functools.partial`` and hands the result to
    ``ProcessPoolExecutor.map`` as the per-file worker.

    :param file: path-like object exposing a ``name`` attribute.
    :param naming_convention: callable applied to ``file.name``.
    :return: tuple ``(file, naming_convention(file.name))``.
    """
    meta_data = naming_convention(file.name)
    return file, meta_data


def _files_generator(directories: list) -> Generator[Path, None, None]:
    """Lazily yield the regular files found directly inside each directory.

    Directories are visited in the order given. Only immediate children are
    inspected (no recursion), and entries for which ``is_file()`` is false,
    such as sub-directories, are skipped.

    :param directories: directories (``Path`` objects) to scan.
    :return: generator over the ``Path`` of every file found.
    """
    for folder in directories:
        yield from (entry for entry in folder.iterdir() if entry.is_file())


def _add_sub_folders(current: Path, sub_folders: deque, file_list: list):
if sub_folders:
sub_pattern = sub_folders.popleft()
Expand Down

1 comment on commit 52a1ec0

@SwamyDev
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Really well done! This is such a nice performance improvement and you've implemented it in a simple and elegant way! Excellent work, nothing else to say :).

Please sign in to comment.