Skip to content

Commit 0a75d3b

Browse files
authored
tiny improvement (#19341)
1 parent 577bd85 commit 0a75d3b

File tree

2 files changed

+43
-13
lines changed

2 files changed

+43
-13
lines changed

src/lightning/data/streaming/downloader.py

+30-13
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# limitations under the License.
1313
import os
1414
import shutil
15+
import subprocess
1516
from abc import ABC
1617
from typing import Any, Dict, List
1718
from urllib import parse
@@ -40,29 +41,45 @@ def download_file(self, remote_chunkpath: str, local_chunkpath: str) -> None:
4041
class S3Downloader(Downloader):
    """Download files from S3, using the ``s5cmd`` CLI when it can be run and boto3 otherwise."""

    def __init__(self, remote_dir: str, cache_dir: str, chunks: List[Dict[str, Any]]):
        super().__init__(remote_dir, cache_dir, chunks)
        # Probe for s5cmd: a zero exit status from invoking it bare is treated as "available".
        self._s5cmd_available = os.system("s5cmd > /dev/null 2>&1") == 0

        # The S3 client is only needed by the fallback path, so it is not created when s5cmd is used.
        if not self._s5cmd_available:
            self._client = S3Client()

    def download_file(self, remote_filepath: str, local_filepath: str) -> None:
        """Fetch ``remote_filepath`` into ``local_filepath`` unless the local file already exists.

        Args:
            remote_filepath: Source URL; its scheme must be ``s3``.
            local_filepath: Destination path. ``local_filepath + ".lock"`` is used as the lock file.

        Raises:
            ValueError: If ``remote_filepath`` does not use the ``s3`` scheme.

        """
        obj = parse.urlparse(remote_filepath)

        if obj.scheme != "s3":
            raise ValueError(f"Expected obj.scheme to be `s3`, instead, got {obj.scheme} for remote={remote_filepath}")

        # Cheap early exit that avoids touching the lock at all.
        if os.path.exists(local_filepath):
            return

        try:
            # NOTE(review): timeout=0 presumably means a held lock raises `Timeout` straight away
            # instead of waiting -- confirm against the filelock documentation.
            with FileLock(local_filepath + ".lock", timeout=0):
                # Re-check under the lock: another process may have completed the download between
                # the check above and the lock acquisition.
                if os.path.exists(local_filepath):
                    return

                if self._s5cmd_available:
                    # Pass an argument list without a shell so that spaces or shell metacharacters
                    # in either path cannot split or alter the command. The output is never read,
                    # so it is discarded rather than piped.
                    proc = subprocess.Popen(
                        ["s5cmd", "--numworkers", "64", "cp", remote_filepath, local_filepath],
                        stdout=subprocess.DEVNULL,
                    )
                    # NOTE(review): the exit status is ignored, so a failed copy is silent here.
                    proc.wait()
                else:
                    from boto3.s3.transfer import TransferConfig

                    extra_args: Dict[str, Any] = {}

                    # Issue: https://github.com/boto/boto3/issues/3113
                    self._client.client.download_file(
                        obj.netloc,
                        obj.path.lstrip("/"),
                        local_filepath,
                        ExtraArgs=extra_args,
                        Config=TransferConfig(use_threads=False),
                    )
        except Timeout:
            # another process is responsible to download that file, continue
            pass
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,13 @@
1+
import os
2+
from unittest.mock import MagicMock
3+
4+
from lightning.data.streaming.downloader import S3Downloader, subprocess
5+
6+
7+
def test_s3_downloader_fast(tmpdir, monkeypatch):
    """When the s5cmd probe succeeds, one process is spawned for the requested copy and waited on."""
    remote_filepath = "s3://random_bucket/a.txt"
    local_filepath = os.path.join(str(tmpdir), "a.txt")

    # Make the availability probe report success so the subprocess path is taken.
    monkeypatch.setattr(os, "system", MagicMock(return_value=0))
    popen_mock = MagicMock()
    popen_mock.wait.return_value = 0
    popen_factory = MagicMock(return_value=popen_mock)
    monkeypatch.setattr(subprocess, "Popen", popen_factory)

    downloader = S3Downloader(str(tmpdir), str(tmpdir), [])
    downloader.download_file(remote_filepath, local_filepath)

    popen_factory.assert_called_once()
    # The command may be given as a single string or as an argv list; accept either form.
    command = popen_factory.call_args[0][0]
    rendered = command if isinstance(command, str) else " ".join(command)
    assert remote_filepath in rendered
    assert local_filepath in rendered
    popen_mock.wait.assert_called()

0 commit comments

Comments
 (0)