
Commit 71c8204

Merge pull request #4 from AllenInstitute/feature/add-size-only-field-to-s3-sync-paths
expose and use size-only in data sync s3 sync paths
2 parents d9e5c8f + 13573e1 commit 71c8204

2 files changed (+63, -22)

src/aibs_informatics_aws_utils/data_sync/operations.py (+14, -7)
@@ -57,7 +57,8 @@ def sync_local_to_s3(self, source_path: LocalPath, destination_path: S3URI):
             destination_path=destination_path,
             transfer_config=self.s3_transfer_config,
             config=self.botocore_config,
-            force=False,
+            force=self.config.force,
+            size_only=self.config.size_only,
             delete=True,
         )
         if not self.config.retain_source_data:
@@ -93,7 +94,8 @@ def sync_paths_with_lock(*args, **kwargs):
             destination_path=destination_path,
             transfer_config=self.s3_transfer_config,
             config=self.botocore_config,
-            force=False,
+            force=self.config.force,
+            size_only=self.config.size_only,
             delete=True,
         )

@@ -131,7 +133,8 @@ def sync_s3_to_s3(
             source_path_prefix=source_path_prefix,
             transfer_config=self.s3_transfer_config,
             config=self.botocore_config,
-            force=False,
+            force=self.config.force,
+            size_only=self.config.size_only,
             delete=True,
         )
         if not self.config.retain_source_data:
@@ -195,21 +198,25 @@ def sync_data(
     max_concurrency: int = 10,
     retain_source_data: bool = True,
     require_lock: bool = False,
+    force: bool = False,
+    size_only: bool = False,
 ):
     request = DataSyncRequest(
         source_path=source_path,
         destination_path=destination_path,
-        source_path_prefix=source_path_prefix,
+        source_path_prefix=S3KeyPrefix(source_path_prefix) if source_path_prefix else None,
         max_concurrency=max_concurrency,
         retain_source_data=retain_source_data,
         require_lock=require_lock,
+        force=force,
+        size_only=size_only,
     )
     return DataSyncOperations.sync_request(request=request)


 def refresh_local_path__mtime(path: Path, min_mtime: Union[int, float]):
     paths = find_all_paths(path, include_dirs=False, include_files=True)
-    for path in paths:
-        path_stats = os.stat(path)
+    for subpath in paths:
+        path_stats = os.stat(subpath)
         if path_stats.st_mtime < min_mtime:
-            os.utime(path, times=(path_stats.st_atime, min_mtime))
+            os.utime(subpath, times=(path_stats.st_atime, min_mtime))
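
For orientation, the net effect of the operations.py changes is that callers of sync_data can now opt into force and size_only, which previously were hard-coded to False deep inside the sync methods. A minimal sketch of the new call, assuming the package is importable; the bucket and local path are placeholders, and plain strings are shown for brevity where the real signature may expect LocalPath/S3URI types:

    from aibs_informatics_aws_utils.data_sync.operations import sync_data

    sync_data(
        source_path="/data/session_001",                # placeholder local source
        destination_path="s3://my-bucket/session_001",  # placeholder S3 destination
        force=False,     # transfer only what should_sync() reports as changed
        size_only=True,  # compare source/destination by size alone (new flag)
    )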

src/aibs_informatics_aws_utils/s3.py (+49, -15)
@@ -374,7 +374,7 @@ def upload_folder(

 @retry(ResponseStreamingError)
 def upload_file(
-    local_path: Path,
+    local_path: Union[str, Path],
     s3_path: S3URI,
     extra_args: Optional[Dict[str, Any]] = None,
     transfer_config: Optional[TransferConfig] = None,
@@ -384,7 +384,7 @@ def upload_file(
 ):
     s3_client = get_s3_client(**kwargs)
     if force or should_sync(
-        source_path=local_path, destination_path=s3_path, size_only=size_only, **kwargs
+        source_path=Path(local_path), destination_path=s3_path, size_only=size_only, **kwargs
     ):
         s3_client.upload_file(
             Filename=str(local_path),
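
Since upload_file now takes Union[str, Path] and wraps the value with Path(...) only where should_sync needs it, string paths no longer have to be converted by the caller. A hedged sketch (the S3URI import location is assumed from this diff; bucket and file names are placeholders):

    from pathlib import Path
    from aibs_informatics_aws_utils.s3 import S3URI, upload_file

    # Both forms are now accepted; previously only the Path form matched the signature.
    upload_file(local_path="/tmp/report.csv", s3_path=S3URI("s3://my-bucket/report.csv"))
    upload_file(local_path=Path("/tmp/report.csv"), s3_path=S3URI("s3://my-bucket/report.csv"))
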
@@ -527,15 +527,19 @@ def get_s3_path_stats(s3_path: S3URI, **kwargs) -> S3PathStats:
     )


+# TODO: Two things
+# 1. allow for a way to specify `size_only` for this function when transferring large number of files
+# 2. add flag for failing if no source data exists.
 def sync_paths(
     source_path: Union[Path, S3URI],
     destination_path: Union[Path, S3URI],
-    source_path_prefix: str = None,
+    source_path_prefix: Optional[str] = None,
     include: Optional[List[Pattern]] = None,
     exclude: Optional[List[Pattern]] = None,
     extra_args: Optional[Dict[str, Any]] = None,
     transfer_config: Optional[TransferConfig] = None,
     force: bool = False,
+    size_only: bool = False,
     delete: bool = False,
     **kwargs,
 ) -> List[S3TransferResponse]:
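
The motivation for size_only (spelled out in the new TODO) is cost: deciding whether each object needs syncing by size alone avoids fetching and comparing modification times or checksums, which adds up when transferring a large number of files. A rough standalone illustration of the difference, using hypothetical helpers rather than this library's should_sync:

    import os

    def needs_sync_size_only(local_file: str, remote_size: int) -> bool:
        # size_only-style check: a single stat per file, no mtime or checksum work.
        return os.stat(local_file).st_size != remote_size

    def needs_sync_full(local_file: str, remote_size: int, remote_mtime: float) -> bool:
        # Full check also compares modification times, so an equal-sized but
        # newer local file is still re-uploaded.
        stats = os.stat(local_file)
        return stats.st_size != remote_size or stats.st_mtime > remote_mtime
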
@@ -576,12 +580,17 @@ def sync_paths(
             destination_path=destination_path,
             source_path_prefix=source_path_prefix,
             extra_args=extra_args,
-            force=force,
         )
         for nested_source_path in nested_source_paths
     ]

-    responses = process_transfer_requests(*requests, transfer_config=transfer_config, **kwargs)
+    responses = process_transfer_requests(
+        *requests,
+        transfer_config=transfer_config,
+        force=force,
+        size_only=size_only,
+        **kwargs,
+    )

     if delete:
         logger.info(f"Sync: checking for files to delete following sync")
@@ -613,7 +622,6 @@ def generate_transfer_request(
     destination_path: Union[Path, S3URI],
     source_path_prefix: Optional[str] = None,
     extra_args: Optional[Dict[str, Any]] = None,
-    force: bool = True,
 ) -> S3TransferRequest:
     """Create a S3 transfer request

@@ -641,26 +649,43 @@ def generate_transfer_request(
         # This will be sanitized by S3URI class (removing double slashes)
         new_destination_path = S3URI(destination_path + relative_source_path)
         if isinstance(source_path, S3URI):
-            return S3CopyRequest(source_path, new_destination_path, force, extra_args=extra_args)
+            return S3CopyRequest(source_path, new_destination_path, extra_args=extra_args)
         else:
-            return S3UploadRequest(source_path, new_destination_path, force, extra_args=extra_args)
+            return S3UploadRequest(source_path, new_destination_path, extra_args=extra_args)
     elif isinstance(source_path, S3URI) and isinstance(destination_path, Path):
         local_destination_path: Path = (
             Path(get_path_with_root(relative_source_path, destination_path))
             if relative_source_path
            else destination_path
         )
-        return S3DownloadRequest(source_path, local_destination_path, force)
+        return S3DownloadRequest(source_path, local_destination_path)
     else:
         raise ValueError("Local to local transfer is not ")


 def process_transfer_requests(
     *transfer_requests: S3TransferRequest,
     transfer_config: Optional[TransferConfig] = None,
+    force: bool = False,
+    size_only: bool = False,
     suppress_errors: bool = False,
     **kwargs,
 ) -> List[S3TransferResponse]:
+    """Process a list of S3 transfer requests
+
+    Args:
+        transfer_config (Optional[TransferConfig]): transfer config.
+            Defaults to None.
+        force (bool): Whether to force the transfer.
+            Defaults to False.
+        size_only (bool): Whether to only check size when transferring.
+            Defaults to False.
+        suppress_errors (bool): Whether to suppress errors.
+            Defaults to False.
+
+    Returns:
+        List[S3TransferResponse]: List of transfer responses
+    """
     transfer_responses = []

     for request in transfer_requests:
@@ -671,7 +696,8 @@ def process_transfer_requests(
                     destination_path=request.destination_path,
                     extra_args=request.extra_args,
                     transfer_config=transfer_config,
-                    force=request.force,
+                    force=force,
+                    size_only=size_only,
                     **kwargs,
                 )
             elif isinstance(request, S3UploadRequest):
@@ -680,7 +706,8 @@ def process_transfer_requests(
                     s3_path=request.destination_path,
                     extra_args=request.extra_args,
                     transfer_config=transfer_config,
-                    force=request.force,
+                    force=force,
+                    size_only=size_only,
                     **kwargs,
                 )
             elif isinstance(request, S3DownloadRequest):
elif isinstance(request, S3DownloadRequest):
@@ -697,7 +724,8 @@ def process_transfer_requests(
697724
s3_path=request.source_path,
698725
local_path=request.destination_path,
699726
transfer_config=transfer_config,
700-
force=request.force,
727+
force=force,
728+
size_only=size_only,
701729
**kwargs,
702730
)
703731
transfer_responses.append(S3TransferResponse(request, False))
@@ -716,7 +744,12 @@ def process_transfer_requests(
 copy_s3_path = sync_paths


-@retry(ClientError, [lambda ex: client_error_code_check(ex, "SlowDown")])
+client_error_code_check__SlowDown: Callable[[Exception], bool] = lambda ex: isinstance(
+    ex, ClientError
+) and client_error_code_check(ex, "SlowDown")
+
+
+@retry(ClientError, [client_error_code_check__SlowDown])
 def copy_s3_object(
     source_path: S3URI,
     destination_path: S3URI,
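
Replacing the inline lambda with a named, annotated predicate makes the retry filter reusable and adds an explicit isinstance guard before the error code is inspected. A self-contained sketch of how such a predicate behaves; it uses botocore directly, with an inline error-code check standing in for this repo's client_error_code_check helper:

    from typing import Callable

    from botocore.exceptions import ClientError

    # True only for a ClientError whose service error code is "SlowDown"
    # (S3's throttling response); everything else falls through the retry filter.
    is_slow_down: Callable[[Exception], bool] = lambda ex: isinstance(
        ex, ClientError
    ) and ex.response.get("Error", {}).get("Code") == "SlowDown"

    err = ClientError({"Error": {"Code": "SlowDown", "Message": "Reduce your rate"}}, "CopyObject")
    assert is_slow_down(err)
    assert not is_slow_down(ValueError("unrelated"))
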
@@ -890,7 +923,7 @@ def list_s3_paths(
     s3 = get_s3_client(**kwargs)

     def match_results(value: str, patterns: List[Pattern]) -> List[bool]:
-        return [_.match(value) for _ in patterns]
+        return [_.match(value) is not None for _ in patterns]

     paginator = s3.get_paginator("list_objects_v2")
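
The match_results change is a correctness fix for the annotation: re.Pattern.match returns Optional[re.Match], so the old comprehension produced a list of Match objects and Nones rather than the declared List[bool] (truthiness made it work with any/all, but the types were wrong). A quick standalone illustration:

    import re

    patterns = [re.compile(r"^data/"), re.compile(r"\.csv$")]

    old = [p.match("data/a.csv") for p in patterns]              # [<re.Match ...>, None]
    new = [p.match("data/a.csv") is not None for p in patterns]  # [True, False]

    assert all(isinstance(x, bool) for x in new)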

@@ -1013,6 +1046,7 @@ def should_sync(
             dest_local_path, multipart_chunk_size_bytes, multipart_threshold_bytes
         )
     else:
+        logger.warning(f"Destination path {destination_path} does not exist as a file or object.")
         return True

     if isinstance(source_path, S3URI) and is_object(source_path):
11561190
print(
11571191
f"debug: current storage class: {s3_obj.storage_class}, target: {target_storage_class}"
11581192
)
1159-
current_storage_class = S3StorageClass.from_boto_s3_obj(s3_obj)
1193+
current_storage_class = S3StorageClass.from_boto_s3_obj(s3_obj) # type: ignore[arg-type]
11601194
# Current storage class matches target: No-op
11611195
if current_storage_class == target_storage_class:
11621196
continue
