|
11 | 11 | # See the License for the specific language governing permissions and
|
12 | 12 | # limitations under the License.
|
13 | 13 |
|
| 14 | +import concurrent.futures |
14 | 15 | import inspect
|
15 | 16 | import os
|
16 | 17 | from datetime import datetime
|
17 | 18 | from functools import partial
|
18 | 19 | from pathlib import Path
|
19 | 20 | from types import FunctionType
|
20 |
| -from typing import Any, Callable, Dict, Optional, Sequence, Union |
| 21 | +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union |
21 | 22 |
|
22 | 23 | import torch
|
23 | 24 |
|
@@ -286,3 +287,51 @@ def optimize(
|
286 | 287 | num_nodes,
|
287 | 288 | machine,
|
288 | 289 | )
|
| 290 | + |
| 291 | + |
def _listdir(folder: str) -> Tuple[str, List[str]]:
    """List ``folder`` and return it together with its entries.

    Returning the folder alongside the listing lets the caller pair a completed future with the
    directory it was submitted for.

    Args:
        folder: Path of the directory to list.

    Returns:
        A ``(folder, entries)`` tuple where ``entries`` is the result of ``os.listdir(folder)``.

    """
    return folder, os.listdir(folder)


class walk:
    """This class is an optimized version of os.walk for listing files and folders from cloud filesystem.

    Directories are listed concurrently on a thread pool, so slow ``listdir`` calls overlap instead of
    running one after another.

    Note: The order in which folders are yielded isn't depth-first anymore due to the asynchronous listing
    calls.

    Args:
        folder: Root directory to traverse.
        max_workers: Number of threads used for listing. ``None`` or ``0`` falls back to a single worker.

    Yields:
        ``(folder, folders, filenames)`` tuples, where ``folders`` and ``filenames`` hold the entry names
        (not full paths) found directly inside ``folder``.

    Raises:
        OSError: If listing any directory fails; the error from ``os.listdir`` is propagated.

    """

    def __init__(self, folder: str, max_workers: Optional[int] = os.cpu_count()) -> None:
        self.folders = [folder]
        self.max_workers = max_workers or 1
        # Retained for backward compatibility. Traversal state now lives inside ``__iter__`` so that the
        # same object can be iterated more than once.
        self.futures: List[concurrent.futures.Future] = []

    def __iter__(self) -> Any:
        """Traverse the tree, dispatching the listdir calls across multiple workers."""
        from collections import deque

        # Local state: ``self.folders`` is only read, so every iteration starts from the root again.
        pending = deque(self.folders)
        in_flight = set()
        # Keep roughly two listings queued per worker so threads never starve while results are consumed.
        max_in_flight = self.max_workers * 2

        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            while pending or in_flight:
                while pending and len(in_flight) <= max_in_flight:
                    in_flight.add(executor.submit(_listdir, pending.popleft()))

                done, in_flight = concurrent.futures.wait(
                    in_flight, return_when=concurrent.futures.FIRST_COMPLETED
                )

                for future in done:
                    folder, entries = future.result()
                    folders = []
                    filenames = []

                    for entry in entries:
                        path = os.path.join(folder, entry)
                        # Only real directories are descended into. Anything else (regular files,
                        # dangling symlinks, sockets, ...) is reported as a file, like os.walk does,
                        # instead of being queued for a listdir call that would fail.
                        if os.path.isdir(path):
                            folders.append(entry)
                            pending.append(path)
                        else:
                            filenames.append(entry)

                    yield folder, folders, filenames
0 commit comments