-
Notifications
You must be signed in to change notification settings - Fork 275
Added ExpireSnapshots Feature #1880
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 22 commits
0a94d96
5f0b62b
f995daa
65365e1
e28815f
4628ede
e80c41c
cb9f0c9
ebcff2b
97399bf
95e5af2
5ab5890
5acd690
d30a08c
e62ab58
1af3258
352b48f
382e0ea
549c183
386cb15
12729fa
ce3515c
28fce4b
2c3153e
27c3ece
05793c0
8ec1889
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,7 +66,9 @@ | |
AddSnapshotUpdate, | ||
AssertRefSnapshotId, | ||
RemoveSnapshotRefUpdate, | ||
RemoveSnapshotsUpdate, | ||
SetSnapshotRefUpdate, | ||
TableMetadata, | ||
TableRequirement, | ||
TableUpdate, | ||
U, | ||
|
@@ -82,7 +84,11 @@ | |
from pyiceberg.utils.properties import property_as_bool, property_as_int | ||
|
||
if TYPE_CHECKING: | ||
from pyiceberg.table import Transaction | ||
pass | ||
|
||
|
||
from pyiceberg.table.metadata import Snapshot, TableMetadata | ||
from pyiceberg.table.snapshots import Snapshot | ||
ForeverAngry marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
def _new_manifest_file_name(num: int, commit_uuid: uuid.UUID) -> str: | ||
|
@@ -238,7 +244,7 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: | |
previous_summary=previous_snapshot.summary if previous_snapshot is not None else None, | ||
) | ||
|
||
def _commit(self) -> UpdatesAndRequirements: | ||
def _commit(self, base_metadata: TableMetadata) -> UpdatesAndRequirements: | ||
ForeverAngry marked this conversation as resolved.
Show resolved
Hide resolved
|
||
new_manifests = self._manifests() | ||
next_sequence_number = self._transaction.table_metadata.next_sequence_number() | ||
|
||
|
@@ -739,11 +745,11 @@ class ManageSnapshots(UpdateTableMetadata["ManageSnapshots"]): | |
ms.create_tag(snapshot_id1, "Tag_A").create_tag(snapshot_id2, "Tag_B") | ||
""" | ||
|
||
_snapshot_ids_to_expire = set() | ||
_updates: Tuple[TableUpdate, ...] = () | ||
_requirements: Tuple[TableRequirement, ...] = () | ||
|
||
def _commit(self) -> UpdatesAndRequirements:
    """Hand the staged table updates and requirements back to the caller."""
    staged_updates = self._updates
    staged_requirements = self._requirements
    return staged_updates, staged_requirements
|
||
def _remove_ref_snapshot(self, ref_name: str) -> ManageSnapshots: | ||
|
@@ -843,3 +849,64 @@ def remove_branch(self, branch_name: str) -> ManageSnapshots: | |
This for method chaining | ||
""" | ||
return self._remove_ref_snapshot(ref_name=branch_name) | ||
|
||
class ExpireSnapshots(UpdateTableMetadata["ExpireSnapshots"]):
    """Expire snapshots by ID or by timestamp.

    Use table.expire_snapshots().<operation>().commit() to run a specific operation.
    Use table.expire_snapshots().<operation-one>().<operation-two>().commit() to run multiple operations.
    Pending changes are applied on commit.
    """

    _updates: Tuple[TableUpdate, ...] = ()
    _requirements: Tuple[TableRequirement, ...] = ()

    def __init__(self, *args, **kwargs) -> None:
        """Create the operation with its own, empty set of staged snapshot IDs.

        All arguments are forwarded unchanged to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level ``set()`` would be shared by every
        # ExpireSnapshots object, so IDs staged by one operation would leak
        # into the next one.
        self._snapshot_ids_to_expire = set()
        self._updates = ()
        self._requirements = ()

    def _commit(self) -> UpdatesAndRequirements:
        """Build the updates and requirements that remove the staged snapshots.

        Returns:
            Tuple of updates and requirements to be committed,
            as required by the calling parent apply functions.
        """
        update = RemoveSnapshotsUpdate(snapshot_ids=self._snapshot_ids_to_expire)
        # Return a new tuple instead of mutating self._updates, so calling this
        # method more than once does not append duplicate removal updates.
        return self._updates + (update,), self._requirements

    def expire_snapshot_by_id(self, snapshot_id: int) -> ExpireSnapshots:
        """Stage a single snapshot for expiration.

        Args:
            snapshot_id (int): The ID of the snapshot to expire.

        Returns:
            This for method chaining.

        Raises:
            ValueError: If the table metadata has no snapshot with the given ID.
        """
        if self._transaction.table_metadata.snapshot_by_id(snapshot_id) is None:
            raise ValueError(f"Snapshot with ID {snapshot_id} does not exist.")
        self._snapshot_ids_to_expire.add(snapshot_id)
        return self

    def expire_snapshots_older_than(self, timestamp_ms: int) -> ExpireSnapshots:
        """Stage every snapshot whose timestamp is strictly before the given one.

        NOTE(review): selection is based on ``timestamp_ms`` alone. Review
        feedback on this change points out that this is not sufficient: the
        reference implementation applies additional rules before a snapshot
        may be removed, and expiring on time alone might break tables. See
        https://github.com/apache/iceberg/blob/3f661d5c6657542538a1e944db57405efdefea29/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java#L179

        Args:
            timestamp_ms (int): The timestamp in milliseconds. Snapshots older than this will be expired.

        Returns:
            This for method chaining.
        """
        # Collect IDs of snapshots to be expired.
        self._snapshot_ids_to_expire.update(
            snapshot.snapshot_id
            for snapshot in self._transaction.table_metadata.snapshots
            if snapshot.timestamp_ms < timestamp_ms
        )
        return self
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from unittest.mock import MagicMock | ||
ForeverAngry marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from uuid import uuid4 | ||
|
||
from pyiceberg.table import CommitTableResponse, Table | ||
|
||
|
||
def test_expire_snapshot(table_v2: Table) -> None:
    """Expiring one snapshot by ID commits once and leaves only the other snapshot."""
    expire_snapshot_id = 3051729675574597004
    keep_snapshot_id = 3055729675574597004

    # Assert fixture data to validate test assumptions
    assert len(table_v2.metadata.snapshots) == 2
    assert len(table_v2.metadata.snapshot_log) == 2
    assert len(table_v2.metadata.refs) == 2

    # Build the mocked post-commit metadata from the real snapshot objects, so
    # the metadata keeps its normal shape (snapshot objects, not bare ints).
    kept_snapshots = [
        snapshot for snapshot in table_v2.metadata.snapshots if snapshot.snapshot_id == keep_snapshot_id
    ]
    mock_response = CommitTableResponse(
        metadata=table_v2.metadata.model_copy(update={"snapshots": kept_snapshots}),
        metadata_location="mock://metadata/location",
        uuid=uuid4(),
    )

    # Mock the catalog's commit_table method to return the mock response
    table_v2.catalog.commit_table = MagicMock(return_value=mock_response)

    # Expire the snapshot directly without using a transaction. Any exception
    # propagates with its full traceback and fails the test.
    table_v2.expire_snapshots().expire_snapshot_by_id(expire_snapshot_id).commit()

    # Assert that commit_table was called once
    table_v2.catalog.commit_table.assert_called_once()

    # Compare snapshot IDs, not snapshot objects against an int
    remaining_ids = {snapshot.snapshot_id for snapshot in table_v2.metadata.snapshots}
    assert expire_snapshot_id not in remaining_ids
    assert remaining_ids == {keep_snapshot_id}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This `if` is there to break circular dependencies. If it is not needed anymore, we can remove it 👍