|
14 | 14 | # KIND, either express or implied. See the License for the
|
15 | 15 | # specific language governing permissions and limitations
|
16 | 16 | # under the License.
|
| 17 | +from typing import Iterator, Optional |
17 | 18 |
|
18 | 19 | from pyiceberg.exceptions import ValidationException
|
19 |
| -from pyiceberg.manifest import ManifestContent, ManifestFile |
| 20 | +from pyiceberg.expressions import BooleanExpression |
| 21 | +from pyiceberg.expressions.visitors import ROWS_CANNOT_MATCH, _InclusiveMetricsEvaluator |
| 22 | +from pyiceberg.manifest import ManifestContent, ManifestEntry, ManifestEntryStatus, ManifestFile |
20 | 23 | from pyiceberg.table import Table
|
21 | 24 | from pyiceberg.table.snapshots import Operation, Snapshot, ancestors_between
|
| 25 | +from pyiceberg.typedef import Record |
| 26 | + |
| 27 | +VALIDATE_DATA_FILES_EXIST_OPERATIONS = {Operation.OVERWRITE, Operation.REPLACE, Operation.DELETE} |
22 | 28 |
|
23 | 29 |
|
24 | 30 | def validation_history(
|
@@ -69,3 +75,78 @@ def validation_history(
|
69 | 75 | raise ValidationException("No matching snapshot found.")
|
70 | 76 |
|
71 | 77 | return manifests_files, snapshots
|
| 78 | + |
| 79 | + |
def _deleted_data_files(
    table: Table,
    starting_snapshot: Snapshot,
    data_filter: Optional[BooleanExpression],
    partition_set: Optional[dict[int, set[Record]]],
    parent_snapshot: Optional[Snapshot],
) -> Iterator[ManifestEntry]:
    """Lazily yield deleted data-file entries recorded since a starting snapshot.

    The manifests and snapshot ids to inspect come from ``validation_history``,
    restricted to ``VALIDATE_DATA_FILES_EXIST_OPERATIONS`` and data manifests.
    An entry is yielded only when all of the following hold:

    * its ``snapshot_id`` is one of the snapshot ids returned by ``validation_history``;
    * its status is ``ManifestEntryStatus.DELETED``;
    * ``data_filter`` is ``None``, or the inclusive metrics evaluator does not
      report ``ROWS_CANNOT_MATCH`` for its data file;
    * ``partition_set`` is ``None``, or the data file's partition is listed
      under its spec id in ``partition_set``.

    Args:
        table: Table to validate
        starting_snapshot: Snapshot current at the start of the operation
        data_filter: Expression used to find deleted data files, or None to skip metrics filtering
        partition_set: dict of {spec_id: set[partition]} to filter on, or None to skip partition filtering
        parent_snapshot: Ending snapshot on the branch being validated

    Yields:
        Conflicting manifest entries. Nothing is yielded when ``parent_snapshot`` is None.
    """
    # Without a current table state there is nothing that could have been deleted.
    if parent_snapshot is None:
        return

    manifests, snapshot_ids = validation_history(
        table,
        parent_snapshot,
        starting_snapshot,
        VALIDATE_DATA_FILES_EXIST_OPERATIONS,
        ManifestContent.DATA,
    )

    # Only build the metrics evaluator when there is a filter to evaluate.
    metrics_eval = None
    if data_filter is not None:
        metrics_eval = _InclusiveMetricsEvaluator(table.schema(), data_filter).eval

    def _is_conflict(candidate: ManifestEntry) -> bool:
        # Entry must belong to one of the snapshots under validation and be a delete.
        if candidate.snapshot_id not in snapshot_ids or candidate.status != ManifestEntryStatus.DELETED:
            return False

        # Entry is irrelevant when the evaluator reports that no rows can match.
        if metrics_eval is not None and metrics_eval(candidate.data_file) is ROWS_CANNOT_MATCH:
            return False

        if partition_set is None:
            return True

        spec_id = candidate.data_file.spec_id
        partition = candidate.data_file.partition
        return spec_id in partition_set and partition in partition_set[spec_id]

    for manifest in manifests:
        # discard_deleted=False: the deleted entries are exactly what we are looking for.
        for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False):
            if _is_conflict(entry):
                yield entry
| 132 | + |
| 133 | + |
def _validate_deleted_data_files(
    table: Table,
    starting_snapshot: Snapshot,
    data_filter: Optional[BooleanExpression],
    parent_snapshot: Snapshot,
) -> None:
    """Validate that no files matching a filter have been deleted from the table since a starting snapshot.

    Args:
        table: Table to validate
        starting_snapshot: Snapshot current at the start of the operation
        data_filter: Expression used to find deleted data files
        parent_snapshot: Ending snapshot on the branch being validated

    Raises:
        ValidationException: If at least one deleted data file matching the filter is found.
            The message lists the snapshot ids of all conflicting entries.
    """
    conflicting_entries = _deleted_data_files(table, starting_snapshot, data_filter, None, parent_snapshot)

    # `_deleted_data_files` is a generator, so it can only be walked once.
    # Probing it with `any()` first would consume the leading entry and drop
    # its snapshot id from the error message (leaving an empty set when there
    # is a single conflict). Collect the ids in one pass instead.
    conflicting_snapshots = {entry.snapshot_id for entry in conflicting_entries}
    if conflicting_snapshots:
        raise ValidationException(f"Deleted data files were found matching the filter for snapshots {conflicting_snapshots}!")
0 commit comments