113
113
SnapshotLogEntry ,
114
114
SnapshotSummaryCollector ,
115
115
Summary ,
116
+ ancestors_between ,
117
+ is_parent_ancestor_of ,
116
118
update_snapshot_summaries ,
117
119
)
118
120
from pyiceberg .table .sorting import UNSORTED_SORT_ORDER , SortOrder
@@ -1703,10 +1705,6 @@ def snapshot(self) -> Optional[Snapshot]:
1703
1705
return self .table_metadata .snapshot_by_id (self .snapshot_id )
1704
1706
return self .table_metadata .current_snapshot ()
1705
1707
1706
- def _build_manifest_evaluator (self , spec_id : int ) -> Callable [[ManifestFile ], bool ]:
1707
- spec = self .table_metadata .specs ()[spec_id ]
1708
- return manifest_evaluator (spec , self .table_metadata .schema (), self .partition_filters [spec_id ], self .case_sensitive )
1709
-
1710
1708
def projection (self ) -> Schema :
1711
1709
current_schema = self .table_metadata .schema ()
1712
1710
if self .snapshot_id is not None :
@@ -1727,41 +1725,6 @@ def projection(self) -> Schema:
1727
1725
1728
1726
return current_schema .select (* self .selected_fields , case_sensitive = self .case_sensitive )
1729
1727
1730
- def _build_partition_evaluator (self , spec_id : int ) -> Callable [[DataFile ], bool ]:
1731
- spec = self .table_metadata .specs ()[spec_id ]
1732
- partition_type = spec .partition_type (self .table_metadata .schema ())
1733
- partition_schema = Schema (* partition_type .fields )
1734
- partition_expr = self .partition_filters [spec_id ]
1735
-
1736
- # The lambda created here is run in multiple threads.
1737
- # So we avoid creating _EvaluatorExpression methods bound to a single
1738
- # shared instance across multiple threads.
1739
- return lambda data_file : expression_evaluator (partition_schema , partition_expr , self .case_sensitive )(data_file .partition )
1740
-
1741
- def _check_sequence_number (self , min_data_sequence_number : int , manifest : ManifestFile ) -> bool :
1742
- """Ensure that no manifests are loaded that contain deletes that are older than the data.
1743
-
1744
- Args:
1745
- min_data_sequence_number (int): The minimal sequence number.
1746
- manifest (ManifestFile): A ManifestFile that can be either data or deletes.
1747
-
1748
- Returns:
1749
- Boolean indicating if it is either a data file, or a relevant delete file.
1750
- """
1751
- return manifest .content == ManifestContent .DATA or (
1752
- # Not interested in deletes that are older than the data
1753
- manifest .content == ManifestContent .DELETES
1754
- and (manifest .sequence_number or INITIAL_SEQUENCE_NUMBER ) >= min_data_sequence_number
1755
- )
1756
-
1757
- def use_ref (self : S , name : str ) -> S :
1758
- if self .snapshot_id : # type: ignore
1759
- raise ValueError (f"Cannot override ref, already set snapshot id={ self .snapshot_id } " ) # type: ignore
1760
- if snapshot := self .table_metadata .snapshot_by_name (name ):
1761
- return self .update (snapshot_id = snapshot .snapshot_id )
1762
-
1763
- raise ValueError (f"Cannot scan unknown ref={ name } " )
1764
-
1765
1728
def plan_files (self ) -> Iterable [FileScanTask ]:
1766
1729
"""Plans the relevant files by filtering on the PartitionSpecs.
1767
1730
@@ -1849,6 +1812,14 @@ def to_arrow(self) -> pa.Table:
1849
1812
def to_pandas (self , ** kwargs : Any ) -> pd .DataFrame :
1850
1813
return self .to_arrow ().to_pandas (** kwargs )
1851
1814
1815
+ def use_ref (self : S , name : str ) -> S :
1816
+ if self .snapshot_id : # type: ignore
1817
+ raise ValueError (f"Cannot override ref, already set snapshot id={ self .snapshot_id } " ) # type: ignore
1818
+ if snapshot := self .table_metadata .snapshot_by_name (name ):
1819
+ return self .update (snapshot_id = snapshot .snapshot_id )
1820
+
1821
+ raise ValueError (f"Cannot scan unknown ref={ name } " )
1822
+
1852
1823
def to_duckdb (self , table_name : str , connection : Optional [DuckDBPyConnection ] = None ) -> DuckDBPyConnection :
1853
1824
import duckdb
1854
1825
@@ -1864,6 +1835,13 @@ def to_ray(self) -> ray.data.dataset.Dataset:
1864
1835
1865
1836
1866
1837
class BaseIncrementalScan (TableScan ):
1838
+ """Base class for incremental scans.
1839
+
1840
+ Args:
1841
+ to_snapshot_id: The end snapshot ID (inclusive).
1842
+ from_snapshot_id_exclusive: The start snapshot ID (exclusive).
1843
+ """
1844
+
1867
1845
to_snapshot_id : Optional [int ]
1868
1846
from_snapshot_id_exclusive : Optional [int ]
1869
1847
@@ -4015,35 +3993,3 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
4015
3993
table_partitions : List [TablePartition ] = _get_table_partitions (arrow_table , spec , schema , slice_instructions )
4016
3994
4017
3995
return table_partitions
4018
-
4019
-
4020
- def ancestors_between (to_snapshot : int , from_snapshot : Optional [int ], table_metadata : TableMetadata ) -> Iterable [Snapshot ]:
4021
- if from_snapshot is not None :
4022
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (to_snapshot ), table_metadata ): # type: ignore
4023
- if snapshot .snapshot_id == from_snapshot :
4024
- break
4025
- yield snapshot
4026
- else :
4027
- yield from ancestors_of (table_metadata .snapshot_by_id (to_snapshot ), table_metadata ) # type: ignore
4028
-
4029
-
4030
- def is_parent_ancestor_of (snapshot_id : int , ancestor_parent_snapshot_id : int , table_metadata : TableMetadata ) -> bool :
4031
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (snapshot_id ), table_metadata ): # type: ignore
4032
- if snapshot .parent_snapshot_id and snapshot .parent_snapshot_id == ancestor_parent_snapshot_id :
4033
- return True
4034
- return False
4035
-
4036
-
4037
- def oldest_ancestor_of (snapshot_id : int , table_metadata : TableMetadata ) -> Optional [int ]:
4038
- last_snapshot = None
4039
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (snapshot_id ), table_metadata ): # type: ignore
4040
- last_snapshot = snapshot .snapshot_id
4041
- return last_snapshot
4042
-
4043
-
4044
- def ancestors_of (latest_snapshot : Snapshot , table_metadata : TableMetadata ) -> Iterable [Snapshot ]:
4045
- if latest_snapshot :
4046
- yield latest_snapshot
4047
- if latest_snapshot .parent_snapshot_id :
4048
- if parent := table_metadata .snapshot_by_id (latest_snapshot .parent_snapshot_id ):
4049
- yield from ancestors_of (parent , table_metadata )
0 commit comments