diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java index f77f7a0cb..2743c6579 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java @@ -59,22 +59,24 @@ public class OpenHouseInternalCatalog extends BaseMetastoreCatalog { @Autowired StorageType storageType; - @Autowired SnapshotInspector snapshotInspector; - @Autowired HouseTableMapper houseTableMapper; @Autowired MeterRegistry meterRegistry; @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { + FileIO fileIO = resolveFileIO(tableIdentifier); + MetricsReporter metricsReporter = + new MetricsReporter(this.meterRegistry, METRICS_PREFIX, Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); return new OpenHouseInternalTableOperations( houseTableRepository, - resolveFileIO(tableIdentifier), - snapshotInspector, + fileIO, houseTableMapper, tableIdentifier, - new MetricsReporter(this.meterRegistry, METRICS_PREFIX, Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); } @Override diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index d9fa34257..d96d9d6b1 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -23,28 +23,19 @@ import java.io.IOException; import java.time.Clock; import java.time.Instant; -import java.util.ArrayList; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.collections.MapUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.iceberg.BaseMetastoreTableOperations; import org.apache.iceberg.PartitionField; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; import org.apache.iceberg.SortOrder; @@ -60,7 +51,6 @@ import org.apache.iceberg.expressions.Term; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.springframework.data.util.Pair; @AllArgsConstructor @Slf4j @@ -70,8 +60,6 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio FileIO fileIO; - SnapshotInspector snapshotInspector; - HouseTableMapper houseTableMapper; TableIdentifier tableIdentifier; @@ -80,6 +68,8 @@ public class 
OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio FileIOManager fileIOManager; + SnapshotDiffApplier snapshotDiffApplier; + private static final Gson GSON = new Gson(); private static final Cache CACHE = @@ -229,6 +219,8 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { metadata = rebuildTblMetaWithSchema(metadata, CatalogConstants.EVOLVED_SCHEMA_KEY, true); } + metadata = snapshotDiffApplier.applySnapshots(base, metadata); + int version = currentVersion() + 1; CommitStatus commitStatus = CommitStatus.FAILURE; @@ -260,8 +252,6 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { if (properties.containsKey(CatalogConstants.EVOLVED_SCHEMA_KEY)) { properties.remove(CatalogConstants.EVOLVED_SCHEMA_KEY); } - String serializedSnapshotsToPut = properties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); - String serializedSnapshotRefs = properties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); boolean isStageCreate = Boolean.parseBoolean(properties.remove(CatalogConstants.IS_STAGE_CREATE_KEY)); String sortOrderJson = properties.remove(CatalogConstants.SORT_ORDER_KEY); @@ -274,24 +264,6 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { updatedMetadata = updatedMetadata.replaceSortOrder(sortOrder); } - if (serializedSnapshotsToPut != null) { - List snapshotsToPut = - SnapshotsUtil.parseSnapshots(fileIO, serializedSnapshotsToPut); - Pair, List> snapshotsDiff = - SnapshotsUtil.symmetricDifferenceSplit(snapshotsToPut, updatedMetadata.snapshots()); - List appendedSnapshots = snapshotsDiff.getFirst(); - List deletedSnapshots = snapshotsDiff.getSecond(); - snapshotInspector.validateSnapshotsUpdate( - updatedMetadata, appendedSnapshots, deletedSnapshots); - Map snapshotRefs = - serializedSnapshotRefs == null - ? 
new HashMap<>() - : SnapshotsUtil.parseSnapshotRefs(serializedSnapshotRefs); - updatedMetadata = - maybeAppendSnapshots(updatedMetadata, appendedSnapshots, snapshotRefs, true); - updatedMetadata = maybeDeleteSnapshots(updatedMetadata, deletedSnapshots); - } - final TableMetadata updatedMtDataRef = updatedMetadata; long metadataUpdateStartTime = System.currentTimeMillis(); try { @@ -531,125 +503,6 @@ private void failIfRetryUpdate(Map properties) { } } - public TableMetadata maybeDeleteSnapshots( - TableMetadata metadata, List snapshotsToDelete) { - TableMetadata result = metadata; - if (CollectionUtils.isNotEmpty(snapshotsToDelete)) { - Set snapshotIds = - snapshotsToDelete.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Map updatedProperties = new HashMap<>(result.properties()); - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - snapshotsToDelete.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(","))); - result = - TableMetadata.buildFrom(result) - .setProperties(updatedProperties) - .build() - .removeSnapshotsIf(s -> snapshotIds.contains(s.snapshotId())); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, snapshotsToDelete.size()); - } - return result; - } - - public TableMetadata maybeAppendSnapshots( - TableMetadata metadata, - List snapshotsToAppend, - Map snapshotRefs, - boolean recordAction) { - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); - List appendedSnapshots = new ArrayList<>(); - List stagedSnapshots = new ArrayList<>(); - List cherryPickedSnapshots = new ArrayList<>(); - // Throw an exception if client sent request that included non-main branches in the - // snapshotRefs. - for (Map.Entry entry : snapshotRefs.entrySet()) { - if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { - throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); - } - } - /** - * First check if there are new snapshots to be appended to current TableMetadata. If yes, - * following are the cases to be handled: - * - *

[1] A regular (non-wap) snapshot is being added to the MAIN branch. - * - *

[2] A staged (wap) snapshot is being created on top of current snapshot as its base. - * Recognized by STAGED_WAP_ID_PROP. - * - *

[3] A staged (wap) snapshot is being cherry picked to the MAIN branch wherein current - * snapshot in the MAIN branch is not the same as the base snapshot the staged (wap) snapshot - * was created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward - * cherry pick. - * - *

In case no new snapshots are to be appended to current TableMetadata, there could be a - * cherrypick of a staged (wap) snapshot on top of the current snapshot in the MAIN branch which - * is the same as the base snapshot the staged (wap) snapshot was created on. This case is - * called fast forward cherry pick. - */ - if (CollectionUtils.isNotEmpty(snapshotsToAppend)) { - for (Snapshot snapshot : snapshotsToAppend) { - snapshotInspector.validateSnapshot(snapshot); - if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { - // a stage only snapshot using wap.id - metadataBuilder.addSnapshot(snapshot); - stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { - // a snapshot created on a non fast-forward cherry-pick snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - cherryPickedSnapshots.add( - String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); - } else { - // a regular snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } - } - } else if (MapUtils.isNotEmpty(snapshotRefs)) { - // Updated ref in the main branch with no new snapshot means this is a - // fast-forward cherry-pick or rollback operation. - long newSnapshotId = snapshotRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - // Either the current snapshot is null or the current snapshot is not equal - // to the new snapshot indicates an update. The first case happens when the - // stage/wap snapshot being cherry-picked is the first snapshot. - if (MapUtils.isEmpty(metadata.refs()) - || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { - metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); - cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); - } - } - if (recordAction) { - Map updatedProperties = new HashMap<>(metadata.properties()); - if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - appendedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); - } - if (CollectionUtils.isNotEmpty(stagedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - stagedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); - } - if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - cherryPickedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, - cherryPickedSnapshots.size()); - } - metadataBuilder.setProperties(updatedProperties); - } - return metadataBuilder.build(); - } - /** Helper function to dump contents for map in debugging mode. 
*/ private void logPropertiesMap(Map map) { log.debug(" === Printing the table properties within doCommit method === "); diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java new file mode 100644 index 000000000..7663836e0 --- /dev/null +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -0,0 +1,717 @@ +package com.linkedin.openhouse.internal.catalog; + +import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; + +import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import lombok.AllArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.TableMetadata; + +/** + * Service responsible for applying snapshot changes to Iceberg table metadata. + * + *

The main entry point applySnapshots() has a clear flow: parse input → compute diff → validate + * → apply. + */ +@AllArgsConstructor +@Slf4j +public class SnapshotDiffApplier { + + private final MetricsReporter metricsReporter; + + /** + * Applies snapshot updates from metadata properties. Simple and clear: parse input, compute diff, + * validate, apply, record metrics, build. + * + * @param existingMetadata The existing table metadata (may be null for table creation) + * @param providedMetadata The new metadata with properties containing snapshot updates + * @return Updated metadata with snapshots applied + * @throws NullPointerException if providedMetadata is null + */ + public TableMetadata applySnapshots( + TableMetadata existingMetadata, TableMetadata providedMetadata) { + // Validate at system boundary + Objects.requireNonNull(providedMetadata, "providedMetadata cannot be null"); + + String snapshotsJson = providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); + Map providedRefs = + Optional.ofNullable(providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) + .map(SnapshotsUtil::parseSnapshotRefs) + .orElse(new HashMap<>()); + + if (snapshotsJson == null) { + return providedMetadata; + } + + // Parse input + List providedSnapshots = SnapshotsUtil.parseSnapshots(null, snapshotsJson); + + List existingSnapshots = + existingMetadata != null ? existingMetadata.snapshots() : Collections.emptyList(); + Map existingRefs = + existingMetadata != null ? existingMetadata.refs() : Collections.emptyMap(); + + // Compute diff (all maps created once in factory method) + SnapshotDiff diff = + SnapshotDiff.create( + metricsReporter, + existingMetadata, + providedMetadata, + existingSnapshots, + providedSnapshots, + existingRefs, + providedRefs); + + // Validate, apply, record metrics (in correct order) + diff.validate(); + TableMetadata result = diff.apply(); + diff.recordMetrics(); + return result; + } + + /** + * State object that computes and caches all snapshot analysis. Computes all maps once in the + * factory method to avoid redundant operations. Provides clear methods for validation and + * application. + */ + private static class SnapshotDiff { + // Injected dependency + private final MetricsReporter metricsReporter; + + // Input state + private final TableMetadata existingMetadata; + private final TableMetadata providedMetadata; + private final String databaseId; + private final Map providedRefs; + + // Computed maps (created once) + private final Map providedSnapshotByIds; + private final List newSnapshots; + private final List deletedSnapshots; + + // Categorized snapshots + private final List newStagedSnapshots; + private final List cherryPickedSnapshots; + + // Changes + private final Map branchUpdates; + private final Map> newSnapshotsByBranch; + private final Set staleRefs; + private final List unreferencedNewSnapshots; + + // Application state (pre-computed in create) + private final List autoAppendedToMainSnapshots; + private final List mainBranchSnapshotsForMetrics; + private final Set snapshotIdsToAdd; + + /** + * Creates a SnapshotDiff by computing all snapshot analysis from the provided inputs. + * + *
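For illustration, the "compute diff" step above reduces to a symmetric difference of the two snapshot lists keyed by snapshot id. A minimal standalone sketch, using plain JDK types and a hypothetical helper class (the type parameter S stands in for org.apache.iceberg.Snapshot):

import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

class SnapshotSetDiffSketch {
  // Snapshots present in the provided list but absent from the existing table, keyed by id.
  static <S> List<S> newSnapshots(List<S> provided, List<S> existing, Function<S, Long> idOf) {
    Set<Long> existingIds = existing.stream().map(idOf).collect(Collectors.toSet());
    return provided.stream()
        .filter(s -> !existingIds.contains(idOf.apply(s)))
        .collect(Collectors.toList());
  }

  // Snapshots present in the existing table but absent from the provided list (to be removed).
  static <S> List<S> deletedSnapshots(List<S> provided, List<S> existing, Function<S, Long> idOf) {
    return newSnapshots(existing, provided, idOf);
  }
}

Calling newSnapshots(provided, existing, Snapshot::snapshotId) and deletedSnapshots(...) yields the same two lists the applier stores as newSnapshots and deletedSnapshots before categorizing them further.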

Preconditions: All parameters except existingMetadata must be non-null. Collections should + * be empty rather than null. + * + * @param metricsReporter Metrics reporter for recording snapshot operations + * @param existingMetadata The existing table metadata (may be null for table creation) + * @param providedMetadata The new metadata with properties containing snapshot updates + * @param existingSnapshots Snapshots currently in the table + * @param providedSnapshots Snapshots provided in the update + * @param existingRefs Snapshot refs currently in the table + * @param providedRefs Snapshot refs provided in the update + * @return A new SnapshotDiff with all analysis computed + */ + static SnapshotDiff create( + MetricsReporter metricsReporter, + TableMetadata existingMetadata, + TableMetadata providedMetadata, + List existingSnapshots, + List providedSnapshots, + Map existingRefs, + Map providedRefs) { + + // Compute all index maps once + Map providedSnapshotByIds = + providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + Map existingSnapshotByIds = + existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + + // Compute changes + List newSnapshots = + providedSnapshots.stream() + .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) + .collect(Collectors.toList()); + List deletedSnapshots = + existingSnapshots.stream() + .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) + .collect(Collectors.toList()); + + // Categorize snapshots - process in dependency order + // 1. Cherry-picked has highest priority (includes WAP being published) + // 2. WAP snapshots (staged, not published) + // 3. Regular snapshots (everything else) + Set existingBranchRefIds = + existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); + Set providedBranchRefIds = + providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); + + List cherryPickedSnapshots = + computeCherryPickedSnapshots( + providedSnapshots, existingSnapshotByIds, existingBranchRefIds, providedBranchRefIds); + Set cherryPickedIds = + cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + List newStagedSnapshots = + computeWapSnapshots( + providedSnapshots, cherryPickedIds, existingBranchRefIds, providedBranchRefIds); + Set wapIds = + newStagedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + List regularSnapshots = + computeRegularSnapshots(providedSnapshots, cherryPickedIds, wapIds); + + // Compute branch updates (refs that have changed) + Map branchUpdates = + providedRefs.entrySet().stream() + .filter( + entry -> { + SnapshotRef existing = existingRefs.get(entry.getKey()); + return existing == null + || existing.snapshotId() != entry.getValue().snapshotId(); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + // Compute new snapshots by branch (only regular snapshots) + Set regularSnapshotIds = + regularSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Map> newSnapshotsByBranch = + computeNewSnapshotsByBranch( + branchUpdates, providedSnapshotByIds, existingSnapshotByIds, regularSnapshotIds); + + // Compute derived changes for multi-branch support + Set staleRefs = new HashSet<>(existingRefs.keySet()); + staleRefs.removeAll(providedRefs.keySet()); + + // Collect all snapshot IDs that are in branch lineages (will be added via setBranchSnapshot) + Set snapshotsInBranchLineages = + 
newSnapshotsByBranch.values().stream() + .flatMap(List::stream) + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + + // Compute existing snapshot IDs after deletion + Set deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set existingAfterDeletionIds = new HashSet<>(existingSnapshotByIds.keySet()); + existingAfterDeletionIds.removeAll(deletedIds); + + // Compute metadata snapshot IDs + Set metadataSnapshotIds = + providedMetadata.snapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + + List unreferencedNewSnapshots = + providedSnapshots.stream() + .filter( + s -> + !existingAfterDeletionIds.contains(s.snapshotId()) + && !providedBranchRefIds.contains(s.snapshotId()) + && !metadataSnapshotIds.contains(s.snapshotId()) + && !snapshotsInBranchLineages.contains(s.snapshotId())) + .collect(Collectors.toList()); + + // Compute auto-appended snapshots for MAIN branch (backward compatibility) + final List autoAppendedToMainSnapshots; + if (!providedRefs.containsKey(SnapshotRef.MAIN_BRANCH) + && !unreferencedNewSnapshots.isEmpty()) { + autoAppendedToMainSnapshots = + unreferencedNewSnapshots.stream() + .filter(s -> !wapIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } else { + autoAppendedToMainSnapshots = Collections.emptyList(); + } + + // Main branch snapshots for metrics (includes auto-appended) + List mainForMetrics = + new ArrayList<>( + newSnapshotsByBranch.getOrDefault(SnapshotRef.MAIN_BRANCH, Collections.emptyList())); + mainForMetrics.addAll(autoAppendedToMainSnapshots); + + // Global set of snapshot IDs to add (deduplicated upfront) + Set snapshotIdsToAdd = new HashSet<>(); + autoAppendedToMainSnapshots.stream().map(Snapshot::snapshotId).forEach(snapshotIdsToAdd::add); + newSnapshotsByBranch.values().stream() + .flatMap(List::stream) + .map(Snapshot::snapshotId) + .forEach(snapshotIdsToAdd::add); + // Remove any that already exist and aren't deleted + snapshotIdsToAdd.removeIf(id -> metadataSnapshotIds.contains(id) && !deletedIds.contains(id)); + + // Extract database ID from metadata properties + String databaseId = + providedMetadata.properties().get(CatalogConstants.OPENHOUSE_DATABASEID_KEY); + + return new SnapshotDiff( + metricsReporter, + existingMetadata, + providedMetadata, + databaseId, + providedRefs, + providedSnapshotByIds, + newSnapshots, + deletedSnapshots, + newStagedSnapshots, + cherryPickedSnapshots, + branchUpdates, + newSnapshotsByBranch, + staleRefs, + unreferencedNewSnapshots, + autoAppendedToMainSnapshots, + mainForMetrics, + snapshotIdsToAdd); + } + + /** Private constructor that accepts all pre-computed values. Use {@link #create} instead. 
*/ + private SnapshotDiff( + MetricsReporter metricsReporter, + TableMetadata existingMetadata, + TableMetadata providedMetadata, + String databaseId, + Map providedRefs, + Map providedSnapshotByIds, + List newSnapshots, + List deletedSnapshots, + List newStagedSnapshots, + List cherryPickedSnapshots, + Map branchUpdates, + Map> newSnapshotsByBranch, + Set staleRefs, + List unreferencedNewSnapshots, + List autoAppendedToMainSnapshots, + List mainBranchSnapshotsForMetrics, + Set snapshotIdsToAdd) { + this.metricsReporter = metricsReporter; + this.existingMetadata = existingMetadata; + this.providedMetadata = providedMetadata; + this.databaseId = databaseId; + this.providedRefs = providedRefs; + this.providedSnapshotByIds = providedSnapshotByIds; + this.newSnapshots = newSnapshots; + this.deletedSnapshots = deletedSnapshots; + this.newStagedSnapshots = newStagedSnapshots; + this.cherryPickedSnapshots = cherryPickedSnapshots; + this.branchUpdates = branchUpdates; + this.newSnapshotsByBranch = newSnapshotsByBranch; + this.staleRefs = staleRefs; + this.unreferencedNewSnapshots = unreferencedNewSnapshots; + this.autoAppendedToMainSnapshots = autoAppendedToMainSnapshots; + this.mainBranchSnapshotsForMetrics = mainBranchSnapshotsForMetrics; + this.snapshotIdsToAdd = snapshotIdsToAdd; + } + + /** + * Computes staged WAP snapshots that are not yet published to any branch. + * + *

Depends on: cherry-picked IDs (to exclude WAP snapshots being published) + * + * @param providedSnapshots All snapshots provided in the update + * @param excludeCherryPicked Set of cherry-picked snapshot IDs to exclude + * @param existingBranchRefIds Set of snapshot IDs referenced by existing branches + * @param providedBranchRefIds Set of snapshot IDs referenced by provided branches + * @return List of staged WAP snapshots + */ + private static List computeWapSnapshots( + List providedSnapshots, + Set excludeCherryPicked, + Set existingBranchRefIds, + Set providedBranchRefIds) { + Set allBranchRefIds = + java.util.stream.Stream.concat( + existingBranchRefIds.stream(), providedBranchRefIds.stream()) + .collect(Collectors.toSet()); + + return providedSnapshots.stream() + .filter(s -> !excludeCherryPicked.contains(s.snapshotId())) + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) + && !allBranchRefIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } + + private static List computeCherryPickedSnapshots( + List providedSnapshots, + Map existingSnapshotByIds, + Set existingBranchRefIds, + Set providedBranchRefIds) { + Set cherryPickSourceIds = + providedSnapshots.stream() + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .map(s -> Long.parseLong(s.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))) + .collect(Collectors.toSet()); + + return providedSnapshots.stream() + .filter( + provided -> { + // Only consider EXISTING snapshots as cherry-picked + Snapshot existing = existingSnapshotByIds.get(provided.snapshotId()); + if (existing == null) { + return false; + } + + // Parent changed (moved to different branch) + if (!Objects.equals(provided.parentId(), existing.parentId())) { + return true; + } + + // Is source of cherry-pick + if (cherryPickSourceIds.contains(provided.snapshotId())) { + return true; + } + + // WAP snapshot being published (staged → branch) + boolean hasWapId = + provided.summary() != null + && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + boolean wasStaged = !existingBranchRefIds.contains(provided.snapshotId()); + boolean isNowOnBranch = providedBranchRefIds.contains(provided.snapshotId()); + return hasWapId && wasStaged && isNowOnBranch; + }) + .collect(Collectors.toList()); + } + + /** + * Computes regular snapshots that are neither cherry-picked nor staged WAP. + * + *

Depends on: cherry-picked and WAP IDs (everything else is regular)
+   *
+   * @param providedSnapshots All snapshots provided in the update
+   * @param excludeCherryPicked Set of cherry-picked snapshot IDs to exclude
+   * @param excludeWap Set of WAP snapshot IDs to exclude
+   * @return List of regular snapshots
+   */
+  private static List<Snapshot> computeRegularSnapshots(
+      List<Snapshot> providedSnapshots, Set<Long> excludeCherryPicked, Set<Long> excludeWap) {
+    return providedSnapshots.stream()
+        .filter(s -> !excludeCherryPicked.contains(s.snapshotId()))
+        .filter(s -> !excludeWap.contains(s.snapshotId()))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Computes new regular snapshots grouped by branch.
+   *
+   *

For each branch that has been updated, walks backward from the new branch head following + * parent pointers until reaching an existing snapshot (merge base). Collects only new regular + * snapshots along the way. + * + * @param branchUpdates Map of branch names to their new refs (only branches that changed) + * @param providedSnapshotByIds Index of all provided snapshots by ID + * @param existingSnapshotByIds Index of all existing snapshots by ID + * @param regularSnapshotIds Set of IDs for regular (non-WAP, non-cherry-picked) snapshots + * @return Map of branch names to lists of new regular snapshots added to each branch + */ + private static Map> computeNewSnapshotsByBranch( + Map branchUpdates, + Map providedSnapshotByIds, + Map existingSnapshotByIds, + Set regularSnapshotIds) { + + Map> result = new HashMap<>(); + + branchUpdates.forEach( + (branchName, newRef) -> { + java.util.Deque branchNewSnapshots = new java.util.ArrayDeque<>(); + long currentId = newRef.snapshotId(); + + // Walk backwards from new branch head until we hit an existing snapshot (merge base) + while (currentId != -1) { + Snapshot snapshot = providedSnapshotByIds.get(currentId); + if (snapshot == null) { + break; + } + + // Stop at existing snapshot (merge base or old branch head) + if (existingSnapshotByIds.containsKey(currentId)) { + break; + } + + // Collect new regular snapshots in chronological order (add to front) + if (regularSnapshotIds.contains(currentId)) { + branchNewSnapshots.addFirst(snapshot); + } + + currentId = snapshot.parentId() != null ? snapshot.parentId() : -1; + } + + if (!branchNewSnapshots.isEmpty()) { + result.put(branchName, new ArrayList<>(branchNewSnapshots)); + } + }); + + return result; + } + + /** + * Validates all snapshot changes before applying them to table metadata. + * + * @throws InvalidIcebergSnapshotException if any validation check fails + */ + void validate() { + validateCurrentSnapshotNotDeleted(); + validateDeletedSnapshotsNotReferenced(); + } + + /** + * Validates that the current snapshot is not deleted without providing replacement snapshots. + * + * @throws InvalidIcebergSnapshotException if the current snapshot is being deleted without + * replacements + */ + private void validateCurrentSnapshotNotDeleted() { + if (this.existingMetadata == null || this.existingMetadata.currentSnapshot() == null) { + return; + } + if (!this.newSnapshots.isEmpty()) { + return; + } + long latestSnapshotId = this.existingMetadata.currentSnapshot().snapshotId(); + // Check if the last deleted snapshot is the current one (snapshots are ordered by time) + if (!this.deletedSnapshots.isEmpty() + && this.deletedSnapshots.get(this.deletedSnapshots.size() - 1).snapshotId() + == latestSnapshotId) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete the current snapshot %s without adding replacement snapshots.", + latestSnapshotId)); + } + } + + /** + * Validates that snapshots being deleted are not still referenced by any branches or tags. This + * prevents data loss and maintains referential integrity by ensuring that all branch and tag + * pointers reference valid snapshots that will continue to exist after the commit. 
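To make that invariant concrete, here is a hedged sketch of the same check against plain JDK maps (ref name to snapshot id) instead of Iceberg's SnapshotRef; the class and exception are placeholders, not the real implementation:

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

class DeletedSnapshotRefCheck {
  // refs: ref name -> snapshot id it points to; deletedIds: snapshot ids about to be removed.
  static void validate(Map<String, Long> refs, Set<Long> deletedIds) {
    // Invert the ref map so each snapshot id lists the refs that still point to it.
    Map<Long, List<String>> referencedBy =
        refs.entrySet().stream()
            .collect(
                Collectors.groupingBy(
                    Map.Entry::getValue,
                    Collectors.mapping(Map.Entry::getKey, Collectors.toList())));
    List<String> conflicts =
        deletedIds.stream()
            .filter(referencedBy::containsKey)
            .map(id -> "snapshot " + id + " (referenced by: " + String.join(", ", referencedBy.get(id)) + ")")
            .collect(Collectors.toList());
    if (!conflicts.isEmpty()) {
      throw new IllegalStateException(
          "Cannot delete snapshots still referenced by branches/tags: " + String.join("; ", conflicts));
    }
  }
}

Inverting the ref map first lets the error message name every branch or tag that still points at a doomed snapshot, mirroring the message produced by the production check.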
+ * + * @throws InvalidIcebergSnapshotException if any deleted snapshot is still referenced by a + * branch or tag + */ + private void validateDeletedSnapshotsNotReferenced() { + Map> referencedIdsToRefs = + providedRefs.entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + Set deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + List invalidDeleteDetails = + deletedIds.stream() + .filter(referencedIdsToRefs::containsKey) + .map( + id -> + String.format( + "snapshot %s (referenced by: %s)", + id, String.join(", ", referencedIdsToRefs.get(id)))) + .collect(Collectors.toList()); + + if (!invalidDeleteDetails.isEmpty()) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete snapshots that are still referenced by branches/tags: %s", + String.join("; ", invalidDeleteDetails))); + } + } + + /** + * Applies snapshot changes to the table metadata. + * + *

Application strategy: + * + *

[1] Remove deleted snapshots to clean up old data + * + *

[2] Remove stale branch references that are no longer needed + * + *

[3] Add unreferenced snapshots (auto-append to MAIN for backward compat, then staged WAP) + * + *

[4] Set branch pointers for all provided refs. Uses pre-computed deduplication set to + * determine whether to call setBranchSnapshot (for new snapshots) or setRef (for existing + * snapshots being fast-forwarded or cherry-picked). + * + *

[5] Set properties using pre-computed snapshot lists for tracking + * + *

This order ensures referential integrity is maintained throughout the operation. + */ + TableMetadata apply() { + TableMetadata.Builder builder = TableMetadata.buildFrom(this.providedMetadata); + + // [1] Remove deletions + if (!this.deletedSnapshots.isEmpty()) { + Set deletedIds = + this.deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + builder.removeSnapshots(deletedIds); + } + this.staleRefs.forEach(builder::removeRef); + + // [2] Add unreferenced snapshots (auto-append to MAIN for backward compat, then WAP) + this.autoAppendedToMainSnapshots.forEach( + s -> builder.setBranchSnapshot(s, SnapshotRef.MAIN_BRANCH)); + this.unreferencedNewSnapshots.stream() + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .forEach(builder::addSnapshot); + + // [3] Apply branch operations (using pre-computed deduplication set) + // First, apply new snapshots for branches that have them + this.newSnapshotsByBranch.forEach( + (branchName, branchSnapshots) -> { + // Add each new snapshot in lineage order + for (Snapshot snapshot : branchSnapshots) { + if (this.snapshotIdsToAdd.contains(snapshot.snapshotId())) { + builder.setBranchSnapshot(snapshot, branchName); + } + } + }); + + // Then, set refs for all provided refs (branches without new snapshots, and tags) + this.providedRefs.forEach( + (refName, ref) -> { + // Skip if we already added snapshots for this branch + if (this.newSnapshotsByBranch.containsKey(refName) + && !this.newSnapshotsByBranch.get(refName).isEmpty()) { + return; + } + + // For refs without new snapshots (fast-forward, cherry-pick, or tags) + Snapshot refSnapshot = this.providedSnapshotByIds.get(ref.snapshotId()); + if (refSnapshot == null) { + throw new InvalidIcebergSnapshotException( + String.format( + "Ref %s references non-existent snapshot %s", refName, ref.snapshotId())); + } + builder.setRef(refName, ref); + }); + + // [4] Set properties + setSnapshotProperties(builder); + + return builder.build(); + } + + /** + * Sets snapshot-related properties on the metadata builder. + * + *

Records snapshot IDs in table properties for tracking and cleanup of temporary input + * properties used for snapshot transfer. + * + * @param builder The metadata builder to set properties on + */ + private void setSnapshotProperties(TableMetadata.Builder builder) { + // First, remove the temporary transfer properties + builder.removeProperties( + new HashSet<>( + java.util.Arrays.asList( + CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); + + // Then set the tracking properties + if (!this.mainBranchSnapshotsForMetrics.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + formatSnapshotIds(this.mainBranchSnapshotsForMetrics))); + } + if (!this.newStagedSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + formatSnapshotIds(this.newStagedSnapshots))); + } + if (!this.cherryPickedSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + formatSnapshotIds(this.cherryPickedSnapshots))); + } + if (!this.deletedSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + formatSnapshotIds(this.deletedSnapshots))); + } + } + + /** + * Records metrics for snapshot operations. + * + *

Tracks counts of: - Regular snapshots added (new commits and cherry-pick results) - Staged + * snapshots (WAP) - Cherry-picked source snapshots - Deleted snapshots + */ + void recordMetrics() { + // Count only MAIN branch snapshots for backward compatibility + int appendedCount = + this.newSnapshotsByBranch + .getOrDefault(SnapshotRef.MAIN_BRANCH, Collections.emptyList()) + .size(); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, this.newStagedSnapshots.size()); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, + this.cherryPickedSnapshots.size()); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, this.deletedSnapshots.size()); + } + + /** + * Helper method to record a metric with database tag if count is greater than zero. + * + * @param metricName The name of the metric to record + * @param count The count value to record + */ + private void recordMetricWithDatabaseTag(String metricName, int count) { + if (count > 0) { + // Only add database tag if databaseId is present; otherwise record metric without tag + if (this.databaseId != null) { + this.metricsReporter.count( + metricName, count, InternalCatalogMetricsConstant.DATABASE_TAG, this.databaseId); + } else { + this.metricsReporter.count(metricName, count); + } + } + } + + /** + * Helper method to format a list of snapshots into a comma-separated string of snapshot IDs. + * + * @param snapshots List of snapshots to format + * @return Comma-separated string of snapshot IDs + */ + private static String formatSnapshotIds(List snapshots) { + return snapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")); + } + } +} diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java deleted file mode 100644 index dc7dd06c2..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.linkedin.openhouse.internal.catalog; - -import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.function.Consumer; -import java.util.function.Supplier; -import java.util.stream.StreamSupport; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - -/** - * A inspector class providing functionalities that inspect components of {@link Snapshot} provided - * by clients and decide if OpenHouse need to take additional steps to incorporate it or decide - * whether to incorporate at all. - * - *
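For context on how such counters are emitted, a hedged Micrometer sketch of "count with an optional database tag"; the tag key and class name are assumptions, and the real MetricsReporter wrapper may differ:

import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;

class TaggedCounterSketch {
  // Increments a counter, adding a database tag only when a database id is known.
  static void count(MeterRegistry registry, String metricName, int count, String databaseId) {
    if (count <= 0) {
      return; // nothing to record
    }
    if (databaseId != null) {
      registry.counter(metricName, "database", databaseId).increment(count);
    } else {
      registry.counter(metricName).increment(count);
    }
  }

  public static void main(String[] args) {
    count(new SimpleMeterRegistry(), "snapshots_added_ctr", 3, "db1");
  }
}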

Instance of this class will be injected into {@link OpenHouseInternalTableOperations} in - * runtime. - */ -@Component -public class SnapshotInspector { - @Autowired private Consumer> fileSecurer; - /** - * TODO: ADD Validation for snapshot: Sequence-number based, schema-id based, see iceberg spec for - * details. Throwing exceptions when failures occurred. - * - * @param providedSnapshot deserialized {@link Snapshot} object that clients provided. - * @throws InvalidIcebergSnapshotException Exception thrown from the process validating the - * snapshot provided by client. - */ - void validateSnapshot(Snapshot providedSnapshot) throws InvalidIcebergSnapshotException { - // TODO: Fill this method. - } - - void validateSnapshotsUpdate( - TableMetadata metadata, List addedSnapshots, List deletedSnapshots) { - if (metadata.currentSnapshot() == null) { - // no need to verify attempt to delete current snapshot if it doesn't exist - // deletedSnapshots is necessarily empty when original snapshots list is empty - return; - } - if (!addedSnapshots.isEmpty()) { - // latest snapshot can be deleted if new snapshots are added. - return; - } - long latestSnapshotId = metadata.currentSnapshot().snapshotId(); - if (!deletedSnapshots.isEmpty() - && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { - throw new InvalidIcebergSnapshotException( - String.format("Cannot delete the latest snapshot %s", latestSnapshotId)); - } - } - - /** - * A sister method to {@link #validateSnapshot(Snapshot)} that change the file-level permission to - * be OpenHouse exclusive to avoid unexpected changes from unauthorized parties. Throwing - * exceptions when failures occurred. - * - * @param providedSnapshot deserialized {@link Snapshot} object that clients provided. - * @param fileIO {@link FileIO} object - * @throws UncheckedIOException Exception thrown from the process securing the files associated - * with {@param providedSnapshot}. 
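As background on the hook being removed, a rough sketch of what a permission-tightening Consumer<Supplier<Path>> could look like; this is an assumption about the wiring, not the actual OpenHouse fileSecurer bean:

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.function.Consumer;
import java.util.function.Supplier;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;

class FileSecurerSketch {
  // Returns a consumer that restricts each supplied path to the service owner (rwx------),
  // in the spirit of the "OpenHouse exclusive" permissions described in the removed javadoc.
  static Consumer<Supplier<Path>> fileSecurer(Configuration conf) {
    return pathSupplier -> {
      Path path = pathSupplier.get();
      try {
        FileSystem fs = path.getFileSystem(conf);
        fs.setPermission(path, new FsPermission(FsAction.ALL, FsAction.NONE, FsAction.NONE));
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    };
  }
}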
- */ - @VisibleForTesting - void secureSnapshot(Snapshot providedSnapshot, FileIO fileIO) throws UncheckedIOException { - secureDataFile(providedSnapshot.addedDataFiles(fileIO)); - secureDeleteFile(providedSnapshot.addedDeleteFiles(fileIO)); - secureManifestFile(providedSnapshot.allManifests(fileIO)); - } - - private void secureDataFile(Iterable dataFiles) { - StreamSupport.stream(dataFiles.spliterator(), false) - .map(x -> (Supplier) (() -> new Path(x.path().toString()))) - .forEach(fileSecurer); - } - - private void secureDeleteFile(Iterable deleteFiles) { - StreamSupport.stream(deleteFiles.spliterator(), false) - .map(x -> (Supplier) (() -> new Path(x.path().toString()))) - .forEach(fileSecurer); - } - - private void secureManifestFile(List manifestFiles) throws UncheckedIOException { - manifestFiles.stream() - .map(x -> (Supplier) (() -> new Path(x.path()))) - .forEach(fileSecurer); - } -} diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java index d4fd6efaa..1c50bb0e2 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java @@ -16,6 +16,8 @@ public final class IcebergTestUtil { private static final String SNAPSHOTS_FILE = "serialized_snapshots.json"; private static final String FUTURE_SNAPSHOTS_FILE = "future_serialized_snapshots.json"; private static final String EXTRA_SNAPSHOTS_FILE = "extra_serialized_snapshots.json"; + private static final String EXTRA_LINEAR_SNAPSHOTS_FILE = + "extra_linear_serialized_snapshots.json"; private static final String WAP_SNAPSHOTS_FILE = "wap_serialized_snapshots.json"; private IcebergTestUtil() {} @@ -32,6 +34,10 @@ public static List getExtraSnapshots() throws IOException { return loadSnapshots(EXTRA_SNAPSHOTS_FILE); } + public static List getExtraLinearSnapshots() throws IOException { + return loadSnapshots(EXTRA_LINEAR_SNAPSHOTS_FILE); + } + public static List getWapSnapshots() throws IOException { return loadSnapshots(WAP_SNAPSHOTS_FILE); } @@ -44,15 +50,14 @@ private static List loadSnapshots(String snapshotFile) throws IOExcept return SnapshotsUtil.parseSnapshots(null, data); } - public static Map obtainSnapshotRefsFromSnapshot(Snapshot snapshot) { + public static Map createMainBranchRefPointingTo(Snapshot snapshot) { Map snapshotRefs = new HashMap<>(); SnapshotRef snapshotRef = SnapshotRef.branchBuilder(snapshot.snapshotId()).build(); snapshotRefs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(snapshotRef)); return snapshotRefs; } - public static Map obtainSnapshotRefsFromSnapshot( - Snapshot snapshot, String branch) { + public static Map createBranchRefPointingTo(Snapshot snapshot, String branch) { Map snapshotRefs = new HashMap<>(); SnapshotRef snapshotRef = SnapshotRef.branchBuilder(snapshot.snapshotId()).build(); snapshotRefs.put(branch, SnapshotRefParser.toJson(snapshotRef)); diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index f484b60ae..3f160fc45 100644 --- 
a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -8,6 +8,7 @@ import com.linkedin.openhouse.cluster.storage.StorageType; import com.linkedin.openhouse.cluster.storage.local.LocalStorage; import com.linkedin.openhouse.cluster.storage.local.LocalStorageClient; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -26,6 +27,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.UUID; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -41,6 +43,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotRefParser; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.TableMetadata; @@ -100,32 +103,41 @@ void setup() { Mockito.when(mockHouseTableMapper.toHouseTable(Mockito.any(TableMetadata.class), Mockito.any())) .thenReturn(mockHouseTable); HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + MetricsReporter metricsReporter = + new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); openHouseInternalTableOperations = new OpenHouseInternalTableOperations( mockHouseTableRepository, fileIO, - Mockito.mock(SnapshotInspector.class), mockHouseTableMapper, TEST_TABLE_IDENTIFIER, - new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); // Create a separate instance with mock metrics reporter for testing metrics + SnapshotDiffApplier snapshotDiffApplierWithMockMetrics = + new SnapshotDiffApplier(mockMetricsReporter); openHouseInternalTableOperationsWithMockMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, fileIO, - Mockito.mock(SnapshotInspector.class), mockHouseTableMapper, TEST_TABLE_IDENTIFIER, mockMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplierWithMockMetrics); LocalStorage localStorage = mock(LocalStorage.class); when(fileIOManager.getStorage(fileIO)).thenReturn(localStorage); when(localStorage.getType()).thenReturn(StorageType.LOCAL); } + /** + * Tests committing snapshots to a table with no existing snapshots (initial version). Verifies + * that all snapshots are appended and tracked in table properties. 
+ */ @Test void testDoCommitAppendSnapshotsInitialVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -137,7 +149,7 @@ void testDoCommitAppendSnapshotsInitialVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); @@ -161,6 +173,10 @@ void testDoCommitAppendSnapshotsInitialVersion() throws IOException { } } + /** + * Tests committing additional snapshots to a table that already has existing snapshots. Verifies + * that only new snapshots are appended and tracked appropriately. + */ @Test void testDoCommitAppendSnapshotsExistingVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -178,7 +194,7 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -190,7 +206,7 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { Assertions.assertEquals( 5, updatedProperties - .size()); /*write.parquet.compression-codec, location, lastModifiedTime, version and deleted_snapshots*/ + .size()); /*write.parquet.compression-codec, location, lastModifiedTime, version and appended_snapshots*/ Assertions.assertEquals( TEST_LOCATION, updatedProperties.get(getCanonicalFieldName("tableVersion"))); @@ -205,6 +221,10 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { } } + /** + * Tests committing changes that both append new snapshots and delete existing ones. Verifies that + * both appended and deleted snapshots are correctly tracked in properties. + */ @Test void testDoCommitAppendAndDeleteSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -229,7 +249,7 @@ void testDoCommitAppendAndDeleteSnapshots() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( newSnapshots.get(newSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -245,9 +265,12 @@ void testDoCommitAppendAndDeleteSnapshots() throws IOException { Assertions.assertEquals( TEST_LOCATION, updatedProperties.get(getCanonicalFieldName("tableVersion"))); - // verify only 4 snapshots are added + // verify only snapshots in MAIN's lineage are tracked as appended + // extraTestSnapshots has 4 snapshots, but only the last 2 (snapshots 7,8) are in MAIN's + // lineage + // snapshots 5,6 form a separate branch not connected to MAIN (their parent was deleted) Assertions.assertEquals( - extraTestSnapshots.stream() + extraTestSnapshots.subList(2, 4).stream() .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")), updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); @@ -263,6 +286,10 @@ void testDoCommitAppendAndDeleteSnapshots() throws IOException { } } + /** + * Tests that metadata file updates are performed for replicated table initial version commits. 
+ * Verifies that updateMetadataField is called with the correct parameters for replicated tables. + */ @Test void testDoCommitUpdateMetadataForInitalVersionCommit() throws IOException { Map properties = new HashMap<>(); @@ -323,6 +350,10 @@ void testDoCommitUpdateMetadataForInitalVersionCommit() throws IOException { verify(mockLocalStorageClient).getNativeClient(); } + /** + * Tests that metadata file updates are not performed for non-replicated tables. Verifies that + * updateMetadataField is never called when the table is not replicated. + */ @Test void testDoCommitUpdateMetadataNotCalledForNonReplicatedTable() throws IOException { Map properties = new HashMap<>(); @@ -349,6 +380,10 @@ void testDoCommitUpdateMetadataNotCalledForNonReplicatedTable() throws IOExcepti Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.any(HouseTable.class)); } + /** + * Tests that metadata file updates are not performed for non-initial version commits. Verifies + * that updateMetadataField is only called during table creation, not for subsequent updates. + */ @Test void testDoCommitUpdateMetadataNotCalledForNonInitialVersionCommit() throws IOException { Map properties = new HashMap<>(); @@ -382,6 +417,10 @@ void testDoCommitUpdateMetadataNotCalledForNonInitialVersionCommit() throws IOEx Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.any(HouseTable.class)); } + /** + * Tests committing changes that delete some snapshots while keeping others. Verifies that deleted + * snapshots are properly tracked in table properties. + */ @Test void testDoCommitDeleteSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -403,7 +442,7 @@ void testDoCommitDeleteSnapshots() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -430,6 +469,10 @@ void testDoCommitDeleteSnapshots() throws IOException { } } + /** + * Tests that commits to staged tables do not persist to the repository. Verifies that table + * metadata is set locally but save() and findById() are never called. + */ @Test void testDoCommitDoesntPersistForStagedTable() { TableMetadata metadata = @@ -451,6 +494,106 @@ void testDoCommitDoesntPersistForStagedTable() { .get()); } + /** + * Tests staged table creation with no snapshots (initial version). Verifies that the table + * metadata is set locally but no persistence occurs to the repository. 
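As a reminder of what "staged" means in these tests, a simplified sketch of the classification rule using Iceberg's SnapshotSummary keys (hypothetical class, reduced from the applier's full cherry-pick detection):

import java.util.Map;
import java.util.Set;
import org.apache.iceberg.SnapshotSummary;

class SnapshotKindSketch {
  enum Kind { STAGED_WAP, CHERRY_PICK_RESULT, REGULAR }

  // A snapshot is "staged" when it carries a wap id and is not the head of any branch;
  // one carrying a source-snapshot-id is the result of a non-fast-forward cherry-pick.
  static Kind classify(long snapshotId, Map<String, String> summary, Set<Long> branchHeadIds) {
    if (summary.containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) {
      return Kind.CHERRY_PICK_RESULT;
    }
    if (summary.containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) && !branchHeadIds.contains(snapshotId)) {
      return Kind.STAGED_WAP;
    }
    return Kind.REGULAR;
  }
}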
+ */ + @Test + void testStagedTableCreationWithoutSnapshots() throws IOException { + Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); + properties.put(CatalogConstants.IS_STAGE_CREATE_KEY, "true"); + + TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { + openHouseInternalTableOperations.doCommit(null, metadata); + + // Verify TableMetadata is set locally + Assertions.assertNotNull(openHouseInternalTableOperations.currentMetadataLocation()); + Assertions.assertNotNull(openHouseInternalTableOperations.current()); + + // Verify no snapshots were added + Assertions.assertEquals(0, openHouseInternalTableOperations.current().snapshots().size()); + + // Verify no persistence to repository + verify(mockHouseTableRepository, times(0)).save(any()); + + // Verify no snapshot properties were set + Map resultProperties = + openHouseInternalTableOperations.current().properties(); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("deleted_snapshots"))); + } + } + + /** + * Tests staged table creation with staged (WAP) snapshots. Verifies that staged snapshots are + * added to the table but no persistence occurs to the repository. + */ + @Test + void testStagedTableCreationWithStagedSnapshots() throws IOException { + List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 2); + Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); + properties.put(CatalogConstants.IS_STAGE_CREATE_KEY, "true"); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(testWapSnapshots)); + + TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { + openHouseInternalTableOperations.doCommit(null, metadata); + + // Verify TableMetadata is set locally + Assertions.assertNotNull(openHouseInternalTableOperations.currentMetadataLocation()); + Assertions.assertNotNull(openHouseInternalTableOperations.current()); + + // Verify staged snapshots were added + TableMetadata currentMetadata = openHouseInternalTableOperations.current(); + Assertions.assertEquals( + testWapSnapshots.size(), + currentMetadata.snapshots().size(), + "Staged snapshots should be added"); + + // Verify all snapshots are staged (have WAP ID) + for (Snapshot snapshot : currentMetadata.snapshots()) { + Assertions.assertTrue( + snapshot.summary().containsKey(org.apache.iceberg.SnapshotSummary.STAGED_WAP_ID_PROP), + "All snapshots should be staged with WAP ID"); + } + + // Verify no branch references exist (staged snapshots should not be on main) + Assertions.assertTrue( + currentMetadata.refs().isEmpty() + || !currentMetadata.refs().containsKey(SnapshotRef.MAIN_BRANCH), + "Staged snapshots should not have main branch reference"); + + // Verify no persistence to repository + verify(mockHouseTableRepository, times(0)).save(any()); + + // Verify snapshot properties tracking + Map resultProperties = currentMetadata.properties(); + Assertions.assertEquals( + testWapSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + 
.collect(Collectors.joining(",")), + resultProperties.get(getCanonicalFieldName("staged_snapshots")), + "Staged snapshots should be tracked in properties"); + Assertions.assertNull( + resultProperties.get(getCanonicalFieldName("appended_snapshots")), + "No snapshots should be appended to main"); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("deleted_snapshots"))); + } + } + + /** + * Tests that repository exceptions are properly converted to Iceberg exceptions. Verifies that + * various repository exceptions map to CommitFailedException or CommitStateUnknownException. + */ @Test void testDoCommitExceptionHandling() { TableMetadata base = BASE_TABLE_METADATA; @@ -479,38 +622,61 @@ void testDoCommitExceptionHandling() { () -> openHouseInternalTableOperations.doCommit(base, metadata)); } + /** + * Tests that attempting to delete a snapshot that is still referenced by a branch throws an + * exception. Verifies that InvalidIcebergSnapshotException is thrown when snapshot refs conflict + * with deletions. + */ @Test - void testDoCommitSnapshotsValidationExceptionHandling() throws IOException { + void testDoCommitSnapshotsValidationThrowsException() throws IOException { TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(ImmutableMap.of("random", "value")); List testSnapshots = IcebergTestUtil.getSnapshots(); Map properties = new HashMap<>(metadata.properties()); + + // The key issue: SNAPSHOTS_JSON_KEY says to keep only snapshot 2, but snapshot 1 is referenced + // by main + // This creates a conflict - we're trying to delete snapshot 1 but it's still referenced properties.put( CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(testSnapshots.subList(1, 3))); + SnapshotsUtil.serializedSnapshots( + testSnapshots.subList(2, 3))); // Only snapshot 2 should remain properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - testSnapshots.get(testSnapshots.size() - 1)))); + IcebergTestUtil.createMainBranchRefPointingTo( + testSnapshots.get(1)))); // But main refs snapshot 1 properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); metadata = metadata.replaceProperties(properties); + + // Create initial metadata with snapshots 1 and 2, where snapshot 1 is referenced by main TableMetadata metadataWithSnapshots = TableMetadata.buildFrom(metadata) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) // snapshot 1 -> main + .addSnapshot(testSnapshots.get(2)) // snapshot 2 exists but unreferenced initially .build(); + + // Target metadata tries to delete snapshot 1 (not in SNAPSHOTS_JSON_KEY) but main still refs it TableMetadata metadataWithSnapshotsDeleted = TableMetadata.buildFrom(metadata) - .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .setBranchSnapshot( + testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) // main still points to snapshot 1 .build(); - Assertions.assertDoesNotThrow( + // This should throw exception because snapshot 1 is marked for deletion but still referenced by + // main + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, () -> openHouseInternalTableOperations.doCommit( - metadataWithSnapshots, metadataWithSnapshotsDeleted)); + metadataWithSnapshots, 
metadataWithSnapshotsDeleted), + "Should throw exception when trying to delete referenced snapshots"); } + /** + * Tests committing WAP (write-audit-publish) staged snapshots to an initial version table. + * Verifies that snapshots are marked as staged but not appended to the main branch. + */ @Test void testDoCommitAppendStageOnlySnapshotsInitialVersion() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 2); @@ -532,16 +698,18 @@ void testDoCommitAppendStageOnlySnapshotsInitialVersion() throws IOException { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")), updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } + /** + * Tests committing WAP staged snapshots to a table with existing snapshots. Verifies that new + * snapshots are tracked as staged without being appended to main. + */ @Test void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -563,7 +731,7 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(newSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -577,60 +745,66 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")), updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } - @Test - void testDoCommitAppendSnapshotsToNonMainBranch() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(testSnapshots.subList(0, 1))); - properties.put( - 
CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testSnapshots.get(0), "branch"))); - properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); - - TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); - // verify throw an error when committing to non-main branch. - Assertions.assertThrows( - CommitStateUnknownException.class, - () -> openHouseInternalTableOperations.doCommit(BASE_TABLE_METADATA, metadata)); - } - } - + /** + * Tests validation that rejects appending snapshots older than the current metadata timestamp. + * Verifies that IllegalArgumentException is thrown for stale snapshots unless newer ones are + * included. + */ @Test void testAppendSnapshotsWithOldSnapshots() throws IOException { - TableMetadata metadata = + // Create base metadata (existing table state) + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setPreviousFileLocation("tmp_location") + .setPreviousFileLocation("tmp_location") // this is key .setLocation(BASE_TABLE_METADATA.metadataFileLocation()) .build(); + // all snapshots are from the past and snapshots add should fail the validation List snapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + Assertions.assertThrows( IllegalArgumentException.class, () -> - openHouseInternalTableOperations.maybeAppendSnapshots( - metadata, snapshots, ImmutableMap.of(), false)); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata)); + // the latest snapshots have larger timestamp than the previous metadata timestamp, so it should // pass the validation snapshots.addAll(IcebergTestUtil.getFutureSnapshots()); - openHouseInternalTableOperations.maybeAppendSnapshots( - metadata, snapshots, ImmutableMap.of(), false); + Map propertiesWithFuture = new HashMap<>(baseMetadata.properties()); + propertiesWithFuture.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + propertiesWithFuture.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadataWithFuture = baseMetadata.replaceProperties(propertiesWithFuture); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadataWithFuture); } + /** + * Tests cherry-picking a staged snapshot to main when the base snapshot hasn't changed. Verifies + * that the existing staged snapshot is promoted without creating a new snapshot. 
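 * <p>This and several tests below build the {@code SNAPSHOTS_REFS_KEY} payload through
 * {@code IcebergTestUtil#createMainBranchRefPointingTo}, whose body is not shown in this diff.
 * A minimal sketch of such a helper, assuming it only maps the main branch to the given
 * snapshot as serialized ref JSON:
 * <pre>{@code
 * static Map<String, String> createMainBranchRefPointingTo(Snapshot snapshot) {
 *   SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshot.snapshotId()).build();
 *   return Collections.singletonMap(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(mainRef));
 * }
 * }</pre>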
+ */ @Test void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -653,7 +827,7 @@ void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -662,19 +836,20 @@ void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify the staged snapshot is cherry picked by use the existing one - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(0).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } + /** + * Tests cherry-picking a staged snapshot when the base has changed since staging. Verifies that a + * new snapshot is created and appended to track the rebased changes. 
+ */ @Test void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); @@ -687,13 +862,13 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { Map properties = new HashMap<>(base.properties()); try (MockedStatic ignoreWriteMock = Mockito.mockStatic(TableMetadataParser.class)) { - // cherry pick the staged snapshot whose base has changed + // cherry-pick the staged snapshot whose base has changed properties.put( CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(testWapSnapshots)); properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testWapSnapshots.get(2)))); // new snapshot properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -702,21 +877,23 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); Map updatedProperties = tblMetadataCaptor.getValue().properties(); - // verify the staged snapshot is cherry picked by creating a new snapshot and append it - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + // verify the staged snapshot is cherry-picked by creating a new snapshot and append it + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(2).snapshotId()), updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(1).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } + /** + * Tests cherry-picking the first staged snapshot (with no parent) to the main branch. Verifies + * that the staged snapshot is promoted directly without creating a new snapshot. 
+ */ @Test void testDoCommitCherryPickFirstSnapshot() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 1); @@ -732,7 +909,7 @@ void testDoCommitCherryPickFirstSnapshot() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -741,19 +918,20 @@ void testDoCommitCherryPickFirstSnapshot() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify the staged snapshot is cherry picked by using the existing one - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(0).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } + /** + * Tests deleting the last staged snapshot when no references point to it. Verifies that no + * snapshot operations are tracked since the snapshot was unreferenced. + */ @Test void testDoCommitDeleteLastStagedSnapshotWhenNoRefs() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 1); @@ -772,18 +950,19 @@ void testDoCommitDeleteLastStagedSnapshotWhenNoRefs() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify nothing happens - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } + /** + * Tests rebuilding an unpartitioned table's partition spec with a new schema. Verifies that the + * rebuilt spec remains unpartitioned. + */ @Test void testRebuildPartitionSpecUnpartitioned() { Schema originalSchema = @@ -798,6 +977,10 @@ void testRebuildPartitionSpecUnpartitioned() { Assertions.assertTrue(rebuiltSpec.isUnpartitioned()); } + /** + * Tests rebuilding partition spec when the new schema has the same field IDs as the original. 
+ * Verifies that partition fields are correctly mapped using matching field IDs. + */ @Test void testRebuildPartitionSpec_NewSchemaSameFieldIds() { Schema originalSchema = @@ -835,6 +1018,11 @@ void testRebuildPartitionSpec_NewSchemaSameFieldIds() { Assertions.assertEquals(3, rebuiltSpec.fields().get(2).sourceId()); } + /** + * Tests rebuilding partition spec when the new schema has different field IDs for same field + * names. Verifies that partition fields are correctly remapped to new field IDs based on field + * names. + */ @Test void testRebuildPartitionSpec_NewSchemaDifferentFieldIds() { Schema originalSchema = @@ -880,6 +1068,10 @@ void testRebuildPartitionSpec_NewSchemaDifferentFieldIds() { Assertions.assertEquals(2, rebuiltSpec.fields().get(2).sourceId()); } + /** + * Tests rebuilding partition spec when a partition field is missing from the new schema. Verifies + * that an IllegalArgumentException is thrown for the missing field. + */ @Test void testRebuildPartitionSpec_fieldMissingInNewSchema() { Schema originalSchema = @@ -901,6 +1093,10 @@ void testRebuildPartitionSpec_fieldMissingInNewSchema() { "Field field1 does not exist in the new schema", exception.getMessage()); } + /** + * Tests rebuilding sort order when the new schema has the same field IDs as the original. + * Verifies that sort fields are correctly mapped using matching field IDs. + */ @Test void testRebuildSortOrder_NewSchemaSameFieldIds() { Schema originalSchema = @@ -927,6 +1123,10 @@ void testRebuildSortOrder_NewSchemaSameFieldIds() { Assertions.assertEquals(2, rebuiltSortOrder.fields().get(1).sourceId()); } + /** + * Tests rebuilding sort order when the new schema has different field IDs for same field names. + * Verifies that sort fields are correctly remapped to new field IDs based on field names. + */ @Test void testRebuildSortOrder_NewSchemaDifferentFieldIds() { Schema originalSchema = @@ -953,6 +1153,10 @@ void testRebuildSortOrder_NewSchemaDifferentFieldIds() { Assertions.assertEquals(1, rebuiltSortOrder.fields().get(1).sourceId()); } + /** + * Tests rebuilding sort order when a sort field is missing from the new schema. Verifies that an + * IllegalArgumentException is thrown for the missing field. + */ @Test void testRebuildSortOrder_fieldMissingInNewSchema() { Schema originalSchema = @@ -971,6 +1175,10 @@ void testRebuildSortOrder_fieldMissingInNewSchema() { "Field field1 does not exist in the new schema", exception.getMessage()); } + /** + * Tests that refresh metadata operations record metrics with database tag but not table tag. + * Verifies that only the database dimension is included to avoid high cardinality. + */ @Test void testRefreshMetadataIncludesDatabaseTag() { testMetricIncludesDatabaseTag( @@ -980,6 +1188,10 @@ void testRefreshMetadataIncludesDatabaseTag() { "Timer should not have table tag (removed because the table tag has super high cardinality and overloads metric emission max size)"); } + /** + * Tests that commit metadata update operations record metrics with database tag but not table + * tag. Verifies that only the database dimension is included to avoid high cardinality. + */ @Test void testCommitMetadataUpdateIncludesDatabaseTag() { testMetricIncludesDatabaseTag( @@ -989,6 +1201,10 @@ void testCommitMetadataUpdateIncludesDatabaseTag() { "Timer should not have table tag (only database dimension should be included)"); } + /** + * Tests that refresh metadata latency timer has histogram buckets configured. 
Verifies that the + * metrics can be used for histogram-based monitoring and alerting. + */ @Test void testRefreshMetadataLatencyHasHistogramBuckets() { testMetricHasHistogramBuckets( @@ -997,6 +1213,10 @@ void testRefreshMetadataLatencyHasHistogramBuckets() { this::executeRefreshMetadata); } + /** + * Tests that commit metadata update latency timer has histogram buckets configured. Verifies that + * the metrics can be used for histogram-based monitoring and alerting. + */ @Test void testCommitMetadataUpdateLatencyHasHistogramBuckets() { testMetricHasHistogramBuckets( @@ -1023,17 +1243,19 @@ private void testMetricIncludesDatabaseTag( SimpleMeterRegistry meterRegistry = new SimpleMeterRegistry(); MetricsReporter realMetricsReporter = new MetricsReporter(meterRegistry, "TEST_CATALOG", Lists.newArrayList()); + HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(realMetricsReporter); // Create instance with real metrics reporter OpenHouseInternalTableOperations operationsWithRealMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - Mockito.mock(SnapshotInspector.class), + fileIO, mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplier); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1086,17 +1308,19 @@ private void testMetricHasHistogramBuckets( MetricsReporter realMetricsReporter = new MetricsReporter(meterRegistry, "TEST_CATALOG", Lists.newArrayList()); + HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(realMetricsReporter); // Create instance with real metrics reporter OpenHouseInternalTableOperations operationsWithRealMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - Mockito.mock(SnapshotInspector.class), + fileIO, mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplier); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1225,4 +1449,1221 @@ private void verifyMetricHistogramBuckets( Assertions.assertFalse(Double.isNaN(totalTime), "Timer total time should not be NaN"); Assertions.assertFalse(Double.isNaN(maxTime), "Timer max time should not be NaN"); } + + /** + * Tests that attempting to delete a snapshot referenced by the main branch throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown with appropriate error message. 
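 * <p>A minimal sketch of the guard this exercises, assuming the real check in
 * {@code SnapshotDiffApplier} compares the incoming snapshot list against the refs of the base
 * metadata (names below are illustrative):
 * <pre>{@code
 * Set<Long> incomingIds =
 *     newSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet());
 * for (Snapshot existing : base.snapshots()) {
 *   boolean dropped = !incomingIds.contains(existing.snapshotId());
 *   boolean referenced =
 *       base.refs().values().stream().anyMatch(r -> r.snapshotId() == existing.snapshotId());
 *   if (dropped && referenced) {
 *     // rejected with InvalidIcebergSnapshotException in the applier under test
 *   }
 * }
 * }</pre>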
+ */ + @Test + void testDeleteSnapshotWithMainReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with multiple snapshots + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(2)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Get the current head snapshot that is referenced by main branch + Snapshot referencedSnapshot = testSnapshots.get(3); + + // Create new metadata that attempts to delete the referenced snapshot + // The SNAPSHOTS_JSON_KEY will only include first 3 snapshots (excluding the referenced one) + // But SNAPSHOTS_REFS_KEY will still reference snapshot 3, causing a conflict + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots( + testSnapshots.subList(0, 3))); // Only snapshots 0-2, excluding referenced snapshot 3 + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo( + referencedSnapshot))); // Still references snapshot 3 + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // This MUST throw IllegalArgumentException for referenced snapshots + InvalidIcebergSnapshotException exception = + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete referenced snapshot"); + + // Verify error message mentions the reference + String expectedMessage = + "Cannot delete the current snapshot " + + referencedSnapshot.snapshotId() + + " without adding replacement snapshots"; + Assertions.assertTrue( + exception.getMessage().contains(expectedMessage), + "Error message should indicate snapshot is still referenced: " + exception.getMessage()); + } + + /** + * Tests that unreferenced snapshots can be successfully deleted from the table. Verifies that + * deleted snapshots are removed from metadata and tracked in properties. 
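 * <p>The tracking property asserted below is assumed to be the comma-joined ids of the dropped
 * snapshots, mirroring how the staged/appended properties are built elsewhere in this test class:
 * <pre>{@code
 * String deleted =
 *     deletedSnapshots.stream()
 *         .map(s -> Long.toString(s.snapshotId()))
 *         .collect(Collectors.joining(","));
 * // stored under getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)
 * }</pre>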
+ */ + @Test + void testDeleteSnapshotWithNoReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with multiple snapshots + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(2)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Delete unreferenced snapshots (first two snapshots) + // New metadata keeps snapshots 2 and 3 + Snapshot referencedSnapshot = testSnapshots.get(3); + List remainingSnapshots = testSnapshots.subList(2, 4); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(referencedSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + TableMetadata result = + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); + + // Verify unreferenced snapshots were removed + List unreferencedSnapshots = testSnapshots.subList(0, 2); + for (Snapshot unreferenced : unreferencedSnapshots) { + boolean snapshotExists = + result.snapshots().stream().anyMatch(s -> s.snapshotId() == unreferenced.snapshotId()); + Assertions.assertFalse( + snapshotExists, + "Unreferenced snapshot " + unreferenced.snapshotId() + " should be deleted"); + } + + // Verify referenced snapshot still exists + boolean referencedExists = + result.snapshots().stream() + .anyMatch(s -> s.snapshotId() == referencedSnapshot.snapshotId()); + Assertions.assertTrue(referencedExists, "Referenced snapshot should still exist"); + + // Verify deletion tracking + Map resultProperties = result.properties(); + String deletedSnapshotsStr = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshotsStr); + + for (Snapshot unreferenced : unreferencedSnapshots) { + Assertions.assertTrue( + deletedSnapshotsStr.contains(Long.toString(unreferenced.snapshotId())), + "Unreferenced snapshot should be tracked as deleted"); + } + } + + /** + * Tests that attempting to delete a snapshot referenced by multiple branches throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown indicating the snapshot is still + * referenced. 
+ */ + @Test + void testDeleteSnapshotWithMultipleReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced + Snapshot sharedSnapshot = testSnapshots.get(0); // This will be referenced by both branches + Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced + + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(sharedSnapshot) + .addSnapshot(mainSnapshot) + .setRef( + SnapshotRef.MAIN_BRANCH, + SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) + .setRef( + "feature_branch", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + .setRef( + "feature_branch1", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + .build(); + + // Attempt to delete the shared snapshot by creating new metadata without it + // Keep the unreferenced snapshot so we're not deleting everything + List remainingSnapshots = List.of(mainSnapshot); + + // Keep refs pointing to the shared snapshot (causing conflict) + Map refs = baseMetadata.refs(); + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches + InvalidIcebergSnapshotException exception = + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); + + // Verify error message mentions the snapshot is still referenced + String exceptionMessage = exception.getMessage(); + Assertions.assertTrue( + exceptionMessage.contains("Still referenced by refs") + || exceptionMessage.contains("still referenced"), + "Error message should indicate snapshot is still referenced by branches: " + + exceptionMessage); + } + + /** + * Tests that attempting to delete a snapshot referenced by a tag throws an exception. Verifies + * that InvalidIcebergSnapshotException is thrown with branch/tag reference details. 
+ */ + @Test + void testDeleteSnapshotWithBranchReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with snapshots - build up MAIN branch lineage first + Snapshot taggedSnapshot = testSnapshots.get(0); + TableMetadata baseMetadata = BASE_TABLE_METADATA; + // Add all snapshots to MAIN branch in order to avoid sequence number conflicts + for (Snapshot snapshot : testSnapshots) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + // Now add the tag reference to the first snapshot + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setRef("feature_branch", SnapshotRef.tagBuilder(taggedSnapshot.snapshotId()).build()) + .build(); + + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + + // Attempt to delete snapshot that has a tag reference by creating new metadata without it + List remainingSnapshots = + finalBaseMetadata.snapshots().stream() + .filter(s -> s.snapshotId() != taggedSnapshot.snapshotId()) + .collect(Collectors.toList()); + + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + // Keep refs pointing to the tagged snapshot (causing conflict) + Map serializedRefs = + finalBaseMetadata.refs().entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); + + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by tags + InvalidIcebergSnapshotException exception = + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by tag"); + + // Verify error message mentions tag reference + String exceptionMessage = exception.getMessage(); + String expectedMessage = + "Cannot delete snapshots that are still referenced by branches/tags: snapshot " + + taggedSnapshot.snapshotId() + + " (referenced by: feature_branch)"; + Assertions.assertTrue( + exceptionMessage.contains(expectedMessage), + "Error message should indicate snapshot is still referenced by branches: " + + exceptionMessage); + } + + /** + * Tests that attempting to delete an empty list of snapshots makes no changes to the table. + * Verifies that no snapshots are deleted and no deletion properties are set. 
+ */ + @Test + void testDeleteEmptySnapshotList() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Delete empty list - new metadata is same as base (no snapshots deleted) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + TableMetadata result = + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); + + // Verify no changes were made + Assertions.assertEquals( + baseMetadata.snapshots().size(), + result.snapshots().size(), + "No snapshots should be deleted when list is empty"); + + // Verify no deletion tracking properties were added + Map resultProperties = result.properties(); + String deletedSnapshots = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); + } + + /** + * Tests that attempting to delete a null list of snapshots makes no changes to the table. + * Verifies that no snapshots are deleted and no deletion properties are set. + */ + @Test + void testDeleteNullSnapshotList() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Delete null list - new metadata is same as base (no snapshots deleted) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + TableMetadata result = + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); + + // Verify no changes were made + Assertions.assertEquals( + baseMetadata.snapshots().size(), + result.snapshots().size(), + "No snapshots should be deleted when list is null"); + + // Verify no deletion tracking properties were added + Map resultProperties = result.properties(); + String deletedSnapshots = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); + } + + /** + * Tests that attempting to delete a snapshot that doesn't exist in the metadata has no effect. + * Verifies that snapshot count remains unchanged and no deletion tracking occurs. 
+ */ + @Test + void testDeleteNonExistentSnapshot() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Create a snapshot that doesn't exist in the metadata + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + Snapshot nonExistentSnapshot = extraSnapshots.get(0); + + // New metadata is same as base (non-existent snapshot can't be removed) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + TableMetadata result = + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); + + // Verify original snapshots are unchanged + Assertions.assertEquals( + baseMetadata.snapshots().size(), + result.snapshots().size(), + "Snapshot count should be unchanged when deleting non-existent snapshot"); + + // Verify deletion is not tracked (since no actual deletion occurred) + Map resultProperties = result.properties(); + String deletedSnapshots = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots should be tracked"); + } + + /** + * Tests that snapshot deletion operations record the correct metrics. Verifies that + * SNAPSHOTS_DELETED_CTR counter is incremented by the number of deleted snapshots. + */ + @Test + void testDeleteSnapshotMetricsRecorded() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); + } + + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + + // Delete some snapshots (first two snapshots) + List remainingSnapshots = testSnapshots.subList(2, testSnapshots.size()); + + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(new HashMap<>())); // No refs since all are unreferenced + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata); + + // Verify metrics were recorded + Mockito.verify(mockMetricsReporter) + .count( + eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), + eq((double) 2)); // 2 snapshots deleted + } + + /** + * Tests that snapshot deletion metrics are recorded when deleting unreferenced snapshots. + * Verifies that SNAPSHOTS_DELETED_CTR counter tracks deletions with branch references present. 
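 * <p>A sketch of the emission being verified, assuming the applier counts once per commit with
 * the number of snapshots it removed:
 * <pre>{@code
 * if (!deletedSnapshots.isEmpty()) {
 *   metricsReporter.count(
 *       InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size());
 * }
 * }</pre>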
+ */ + @Test + void testDeleteSnapshotMetricsRecordedBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with snapshots that have branch references + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Delete unreferenced snapshots (first two snapshots) + Snapshot referencedSnapshot = testSnapshots.get(2); + List remainingSnapshots = List.of(referencedSnapshot); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(referencedSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); + + // Verify metrics were recorded for the basic deletion + Mockito.verify(mockMetricsReporter) + .count( + eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), + eq((double) 2)); // 2 snapshots deleted + } + + /** + * Tests that snapshot deletion metrics are not recorded when no actual deletion occurs. Verifies + * that SNAPSHOTS_DELETED_CTR counter is not called for non-existent snapshots. + */ + @Test + void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + + // Create a snapshot that doesn't exist in the metadata + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + Snapshot nonExistentSnapshot = extraSnapshots.get(0); + + // New metadata is same as base (non-existent snapshot can't be removed) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(finalBaseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata); + + // Verify metrics are not recorded for non-existent snapshots (no actual deletion) + Mockito.verify(mockMetricsReporter, Mockito.never()) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), Mockito.anyDouble()); + } + + /** + * Tests that attempting to delete all snapshots fails when the main branch references a snapshot. 
+ * Verifies that InvalidIcebergSnapshotException is thrown to prevent deleting referenced + * snapshots. + */ + @Test + void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced + Snapshot unreferencedSnapshot = + testSnapshots.get(0); // This will be referenced by both branches + Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced + + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(unreferencedSnapshot) + .addSnapshot(mainSnapshot) + .setRef( + SnapshotRef.MAIN_BRANCH, + SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) + .build(); + + // Attempt to delete the shared snapshot by creating new metadata without it + // Keep the unreferenced snapshot so we're not deleting everything + List remainingSnapshots = List.of(mainSnapshot); + + // Keep refs pointing to the shared snapshot (causing conflict) + Map refs = baseMetadata.refs(); + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(List.of())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(mainSnapshot))); + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches + InvalidIcebergSnapshotException exception = + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); + + // Verify error message mentions the snapshot is still referenced + String exceptionMessage = exception.getMessage(); + String expectedMessage = + "Cannot delete the current snapshot " + + mainSnapshot.snapshotId() + + " without adding replacement snapshots."; + Assertions.assertTrue(exceptionMessage.contains(expectedMessage)); + } + + /** + * Tests that deleting all unreferenced snapshots succeeds without errors. Verifies that all + * snapshots can be deleted when no branches or tags reference them. 
+ */ + @Test + void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with unreferenced snapshots only (no main branch or other refs) + TableMetadata baseMetadata = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); + } + // Note: No setBranchSnapshot or setRef calls - all snapshots are unreferenced + + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + + // Attempt to delete all unreferenced snapshots + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(List.of())); // Empty - all snapshots deleted + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(new HashMap<>())); // No refs + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); + + // This should succeed since no snapshots are referenced by any branch/tag + TableMetadata result = + Assertions.assertDoesNotThrow( + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata), + "Should succeed when deleting all unreferenced snapshots"); + + // Verify all snapshots were removed from the metadata + Assertions.assertEquals( + 0, + result.snapshots().size(), + "All unreferenced snapshots should be deleted, resulting in empty snapshots list"); + + // Verify deletion tracking shows all snapshots were deleted + Map resultProperties = result.properties(); + String deletedSnapshots = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshots, "Deleted snapshots should be tracked"); + + for (Snapshot snapshot : testSnapshots) { + Assertions.assertTrue( + deletedSnapshots.contains(Long.toString(snapshot.snapshotId())), + "Snapshot " + snapshot.snapshotId() + " should be tracked as deleted"); + } + } + + /** + * Tests that multiple branches can point to different snapshots without conflicts. Verifies that + * commits with multiple valid branch references succeed without exceptions. 
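 * <p>Branch references travel through {@code SNAPSHOTS_REFS_KEY} as a map of ref name to
 * serialized {@code SnapshotRef} JSON, so a valid multi-branch payload looks roughly like this
 * (ids are placeholders):
 * <pre>{@code
 * Map<String, String> refs = new HashMap<>();
 * refs.put("branch_a", SnapshotRefParser.toJson(SnapshotRef.branchBuilder(idA).build()));
 * refs.put("branch_b", SnapshotRefParser.toJson(SnapshotRef.branchBuilder(idB).build()));
 * properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refs));
 * }</pre>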
+ */ + @Test + void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // New metadata includes all snapshots (base + new ones) + List allSnapshots = testSnapshots.subList(0, 4); // snapshots 0, 1, 2, 3 + + // Create snapshotRefs where each branch points to a DIFFERENT snapshot (valid scenario) + Map validRefs = new HashMap<>(); + validRefs.put("branch_a", SnapshotRef.branchBuilder(testSnapshots.get(1).snapshotId()).build()); + validRefs.put("branch_b", SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build()); + validRefs.put("branch_c", SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build()); + + // Serialize the refs + Map serializedRefs = + validRefs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // This should NOT throw an exception + Assertions.assertDoesNotThrow( + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), + "Should NOT throw exception when branches target different snapshots"); + } + + /** + * Tests the standard Write-Audit-Publish (WAP) workflow where a staged snapshot becomes main. + * Verifies that pulling a WAP snapshot into the main branch succeeds without errors. 
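 * <p>For reference, the assertions in these WAP tests treat a snapshot as staged when its
 * summary carries the WAP id and no ref points at it; roughly:
 * <pre>{@code
 * boolean isStaged =
 *     snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)
 *         && metadata.refs().values().stream()
 *             .noneMatch(r -> r.snapshotId() == snapshot.snapshotId());
 * }</pre>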
+ */ + @Test + void testStandardWAPScenario() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Create base with existing snapshots and a WAP snapshot + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(wapSnapshots.get(0)) // WAP snapshot (not referenced by any branch) + .build(); + + // Standard WAP scenario: pull the WAP snapshot into main branch + Snapshot wapSnapshot = wapSnapshots.get(0); + + // New metadata keeps the same snapshots but changes the main branch ref to point to WAP + // snapshot + List allSnapshots = List.of(testSnapshots.get(0), wapSnapshot); + + // Create refs to pull WAP snapshot into main branch + Map refs = new HashMap<>(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(wapSnapshot.snapshotId()).build()); + + // Serialize the refs + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should succeed - standard WAP workflow where WAP snapshot becomes the new main + Assertions.assertDoesNotThrow( + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), + "Should successfully pull WAP snapshot into main branch"); + } + + /** + * Tests committing metadata that has diverged multiple versions from the base (N to N+3). + * Verifies that "jump" commits succeed with all snapshots and references correctly applied. 
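 * <p>A divergent commit is resolved from the id-level difference between the two metadata
 * versions; a simplified version of that bookkeeping (not the applier's actual code):
 * <pre>{@code
 * Set<Long> baseIds =
 *     base.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet());
 * List<Snapshot> appended =
 *     incoming.stream().filter(s -> !baseIds.contains(s.snapshotId())).collect(Collectors.toList());
 * // snapshots present in base but absent from `incoming` become the deletion candidates
 * }</pre>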
+ */ + @Test + void testMultipleDiffCommit() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 4 snapshots ========== + // Simulate evolving through N+1 and N+2 without committing + TableMetadata intermediate1 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata intermediate2 = + TableMetadata.buildFrom(intermediate1) + .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(intermediate2) + .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .build(); + + // Add custom properties for commit + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(snapshots4.get(3)))); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify the divergent commit contains all 4 snapshots + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit should contain all 4 snapshots despite jumping from base with 1 snapshot"); + + Set expectedSnapshotIds = + snapshots4.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set actualSnapshotIds = + capturedMetadata.snapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + Assertions.assertEquals( + expectedSnapshotIds, + actualSnapshotIds, + "All snapshot IDs should be present after divergent commit"); + + // Verify main ref points to the expected snapshot (the 4th snapshot) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to the 4th snapshot after divergent commit"); + } + } + + /** + * Tests divergent commit (N to N+3) with multiple branches pointing to different snapshots. + * Verifies that divergent commits succeed when branch references are valid and non-conflicting. 
+ */ + @Test + void testMultipleDiffCommitWithValidBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 4 snapshots and 2 branches ========== + TableMetadata intermediate1 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata intermediate2 = + TableMetadata.buildFrom(intermediate1) + .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(intermediate2) + .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .build(); + + // Add custom properties for commit with multiple branches + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + + // Create refs for both MAIN (pointing to snapshot 3) and feature_a (pointing to snapshot 2) + Map multipleRefs = new HashMap<>(); + multipleRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + multipleRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build())); + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(multipleRefs)); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Should succeed with multiple valid branches ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify all 4 snapshots are present + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit with multiple branches should contain all 4 snapshots"); + + // Verify main ref points to the expected snapshot + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to the 4th snapshot"); + + // Verify feature_a ref points to the expected snapshot + SnapshotRef featureRef = capturedMetadata.ref("feature_a"); + Assertions.assertNotNull(featureRef, "Feature_a branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(2).snapshotId(), + featureRef.snapshotId(), + "Feature_a branch should point to the 3rd snapshot"); + } + } + + /** + * Tests committing with multiple branches advancing forward, each pointing to different + * snapshots. Verifies that complex multi-branch commits succeed when each branch has a unique + * target snapshot. 
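 * <p>The invariant exercised here, and violated in {@code testMultipleDiffCommitWithInvalidBranch}
 * further below, is that each snapshot may be claimed by at most one branch per commit; a sketch
 * of such a check (illustrative only):
 * <pre>{@code
 * Map<Long, List<String>> branchesBySnapshot = new HashMap<>();
 * newRefs.forEach(
 *     (name, ref) ->
 *         branchesBySnapshot.computeIfAbsent(ref.snapshotId(), id -> new ArrayList<>()).add(name));
 * branchesBySnapshot.values().removeIf(names -> names.size() <= 1);
 * // any remaining entry is ambiguous and is rejected with InvalidIcebergSnapshotException
 * }</pre>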
+ */ + @Test + void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws IOException { + // Combine regular snapshots (4) + extra snapshots (4) to get 8 total snapshots + List testSnapshots = IcebergTestUtil.getSnapshots(); + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + List allSnapshots = new ArrayList<>(); + allSnapshots.addAll(testSnapshots); + allSnapshots.addAll(extraSnapshots); + + // ========== Create base metadata with 2 branches ========== + // Base has snapshots 0, 1, 2, 3 with MAIN at snapshot 0 and feature_a at snapshot 1 + TableMetadata.Builder baseBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); + baseBuilder.addSnapshot(allSnapshots.get(0)); + baseBuilder.addSnapshot(allSnapshots.get(1)); + baseBuilder.addSnapshot(allSnapshots.get(2)); + baseBuilder.addSnapshot(allSnapshots.get(3)); + baseBuilder.setBranchSnapshot(allSnapshots.get(0).snapshotId(), SnapshotRef.MAIN_BRANCH); + baseBuilder.setBranchSnapshot(allSnapshots.get(1).snapshotId(), "feature_a"); + TableMetadata baseMetadata = baseBuilder.build(); + + // Add custom properties with base snapshots + Map baseProperties = new HashMap<>(baseMetadata.properties()); + List baseSnapshots = allSnapshots.subList(0, 4); + baseProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(baseSnapshots)); + + Map baseRefs = new HashMap<>(); + baseRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(0).snapshotId()).build())); + baseRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(1).snapshotId()).build())); + + baseProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(baseRefs)); + + TableMetadata finalBaseMetadata = baseMetadata.replaceProperties(baseProperties); + + // ========== Create new metadata with 3 branches, all advanced 2 snapshots further ========== + // New metadata has snapshots 0-7 with MAIN at snapshot 2, feature_a at snapshot 3, feature_b at + // snapshot 4 + TableMetadata.Builder newBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); + for (int i = 0; i < 8; i++) { + newBuilder.addSnapshot(allSnapshots.get(i)); + } + newBuilder.setBranchSnapshot(allSnapshots.get(2).snapshotId(), SnapshotRef.MAIN_BRANCH); + newBuilder.setBranchSnapshot(allSnapshots.get(3).snapshotId(), "feature_a"); + newBuilder.setBranchSnapshot(allSnapshots.get(4).snapshotId(), "feature_b"); + TableMetadata newMetadata = newBuilder.build(); + + // Add custom properties with new snapshots + Map newProperties = new HashMap<>(newMetadata.properties()); + List newSnapshots = allSnapshots.subList(0, 8); + newProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(newSnapshots)); + + Map newRefs = new HashMap<>(); + newRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(2).snapshotId()).build())); + newRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(3).snapshotId()).build())); + newRefs.put( + "feature_b", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(4).snapshotId()).build())); + + newProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(newRefs)); + + TableMetadata finalNewMetadata = newMetadata.replaceProperties(newProperties); + + // commit should succeed + openHouseInternalTableOperations.doCommit(finalBaseMetadata, finalNewMetadata); + 
Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify all 8 snapshots are present + Assertions.assertEquals( + 8, capturedMetadata.snapshots().size(), "Commit should contain all 8 snapshots"); + + // Verify MAIN branch advanced 2 snapshots (from snapshot 0 to snapshot 2) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(2).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to snapshot 2 (advanced 2 snapshots from snapshot 0)"); + + // Verify feature_a branch advanced 2 snapshots (from snapshot 1 to snapshot 3) + SnapshotRef featureARef = capturedMetadata.ref("feature_a"); + Assertions.assertNotNull(featureARef, "Feature_a branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(3).snapshotId(), + featureARef.snapshotId(), + "Feature_a branch should point to snapshot 3 (advanced 2 snapshots from snapshot 1)"); + + // Verify feature_b branch exists and points to snapshot 4 (new branch in this commit) + SnapshotRef featureBRef = capturedMetadata.ref("feature_b"); + Assertions.assertNotNull(featureBRef, "Feature_b branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(4).snapshotId(), + featureBRef.snapshotId(), + "Feature_b branch should point to snapshot 4"); + + // Verify correct lineage: snapshots should be in order + List capturedSnapshots = capturedMetadata.snapshots(); + for (int i = 0; i < 8; i++) { + Assertions.assertEquals( + allSnapshots.get(i).snapshotId(), + capturedSnapshots.get(i).snapshotId(), + "Snapshot " + i + " should be preserved in correct order"); + } + } + + /** + * Tests that committing with multiple branches pointing to the same snapshot throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown for ambiguous branch configurations. + */ + @Test + void testMultipleDiffCommitWithInvalidBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create metadata with 4 snapshots but only snapshot 0 in refs ========== + // Build metadata with all 4 snapshots added, but keep MAIN pointing to snapshot 0 + TableMetadata.Builder builder = TableMetadata.buildFrom(baseAtN); + // Add snapshots 1, 2, 3 without assigning them to any branch + builder.addSnapshot(testSnapshots.get(1)); + builder.addSnapshot(testSnapshots.get(2)); + builder.addSnapshot(testSnapshots.get(3)); + TableMetadata metadataWithAllSnapshots = builder.build(); + + // Add custom properties with AMBIGUOUS branch refs - both pointing to same snapshot + Map divergentProperties = + new HashMap<>(metadataWithAllSnapshots.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + + // Create INVALID refs: both MAIN and feature_a pointing to the SAME snapshot (ambiguous!) 
+ Map ambiguousRefs = new HashMap<>(); + ambiguousRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + ambiguousRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()) + .build())); // Same snapshot! + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(ambiguousRefs)); + + TableMetadata finalDivergentMetadata = + metadataWithAllSnapshots.replaceProperties(divergentProperties); + + InvalidIcebergSnapshotException exception = + Assertions.assertThrows( + InvalidIcebergSnapshotException.class, + () -> openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata), + "Should throw InvalidIcebergSnapshotException when multiple branches point to same snapshot"); + + // Verify error message indicates the ambiguous commit + String exceptionMessage = exception.getMessage(); + String expectedMessage = + "Ambiguous commit: snapshot " + + testSnapshots.get(3).snapshotId() + + " is referenced by multiple branches [feature_a, main] in a single commit. Each snapshot can only be referenced by one branch per commit."; + Assertions.assertTrue( + exceptionMessage.contains(expectedMessage), + "Error message should indicate multiple branches targeting same snapshot: " + + exceptionMessage); + } + } + + /** + * Tests divergent commit (N to N+3) that includes both regular snapshots and WAP staged + * snapshots. Verifies that staged snapshots remain properly tracked as staged even during a + * multi-version jump commit. + */ + @Test + void testMultipleDiffCommitWithWAPSnapshots() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 2 regular + 2 WAP snapshots ========== + // Simulate evolving through N+1 and N+2 without committing + // The new metadata will have: + // - testSnapshots[0] (existing in base, main branch) + // - testSnapshots[1] (new, main branch will advance here) + // - wapSnapshots[0] (new, staged - no branch reference) + // - wapSnapshots[1] (new, staged - no branch reference) + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .addSnapshot(wapSnapshots.get(0)) + .addSnapshot(wapSnapshots.get(1)) + .build(); + + // Add custom properties for commit + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + + // Include 2 regular snapshots (0, 1) and 2 WAP snapshots (0, 1) + List allSnapshots = new ArrayList<>(); + allSnapshots.add(testSnapshots.get(0)); + allSnapshots.add(testSnapshots.get(1)); + allSnapshots.add(wapSnapshots.get(0)); + allSnapshots.add(wapSnapshots.get(1)); + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + + // Only main branch ref pointing to testSnapshots[1], WAP snapshots have no refs + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo(testSnapshots.get(1)))); + 
divergentProperties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + Map updatedProperties = capturedMetadata.properties(); + + // Verify the divergent commit contains all 4 snapshots + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit should contain all 4 snapshots (2 regular + 2 WAP)"); + + Set expectedSnapshotIds = + allSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set actualSnapshotIds = + capturedMetadata.snapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + Assertions.assertEquals( + expectedSnapshotIds, + actualSnapshotIds, + "All snapshot IDs (regular + WAP) should be present after divergent commit"); + + // Verify main ref points to the expected snapshot (testSnapshots[1]) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(1).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to testSnapshots[1] after divergent commit"); + + // Verify WAP snapshots are tracked as staged + String stagedSnapshots = updatedProperties.get(getCanonicalFieldName("staged_snapshots")); + Assertions.assertNotNull(stagedSnapshots, "Staged snapshots should be tracked"); + Set stagedSnapshotIds = Set.of(stagedSnapshots.split(",")); + Assertions.assertTrue( + stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(0).snapshotId())), + "WAP snapshot 0 should be tracked as staged"); + Assertions.assertTrue( + stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(1).snapshotId())), + "WAP snapshot 1 should be tracked as staged"); + + // Verify regular snapshot is tracked as appended (not testSnapshots[0] since it was in base) + String appendedSnapshots = updatedProperties.get(getCanonicalFieldName("appended_snapshots")); + Assertions.assertNotNull(appendedSnapshots, "Appended snapshots should be tracked"); + Assertions.assertEquals( + Long.toString(testSnapshots.get(1).snapshotId()), + appendedSnapshots, + "testSnapshots[1] should be tracked as appended"); + + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots")), + "No snapshots should be cherry-picked in this scenario"); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("deleted_snapshots")), + "No snapshots should be deleted in this scenario"); + + Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); + } + } } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java new file mode 100644 index 000000000..78693eaf6 --- /dev/null +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -0,0 +1,1287 @@ +package com.linkedin.openhouse.internal.catalog; + +import static 
com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import lombok.SneakyThrows; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotRefParser; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +/** + * Unit tests for {@link SnapshotDiffApplier}. Tests the refactored snapshot logic with multi-branch + * support that extends the base implementation. + */ +public class SnapshotDiffApplierTest { + + private SnapshotDiffApplier snapshotDiffApplier; + private MetricsReporter mockMetricsReporter; + private TableMetadata baseMetadata; + private static final String TEST_TABLE_LOCATION = getTempLocation(); + + @SneakyThrows + private static String getTempLocation() { + return Files.createTempDirectory(UUID.randomUUID().toString()).toString(); + } + + @BeforeEach + void setup() { + mockMetricsReporter = Mockito.mock(MetricsReporter.class); + snapshotDiffApplier = new SnapshotDiffApplier(mockMetricsReporter); + + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + baseMetadata = + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + TEST_TABLE_LOCATION, + new HashMap<>()); + } + + // ========== Helper Methods ========== + + /** + * Creates metadata with snapshots and refs properties for testing. + * + * @param base Base metadata to start from + * @param snapshots Snapshots to include + * @param refs Snapshot refs to include (nullable) + * @return Metadata with properties set + */ + private TableMetadata createMetadataWithSnapshots( + TableMetadata base, List snapshots, Map refs) { + Map properties = new HashMap<>(base.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + if (refs != null) { + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refs)); + } + return base.replaceProperties(properties); + } + + /** + * Creates metadata with snapshots pointing to the last snapshot as main branch. + * + * @param base Base metadata to start from + * @param snapshots Snapshots to include + * @return Metadata with snapshots and main branch ref + */ + private TableMetadata createMetadataWithSnapshotsAndMainRef( + TableMetadata base, List snapshots) { + Map refs = + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)); + return createMetadataWithSnapshots(base, snapshots, refs); + } + + // ========== Basic Functionality Tests ========== + + /** Verifies that when no snapshot JSON is provided, metadata is returned unmodified. 
*/ + @Test + void testApplySnapshots_noSnapshotsJson_returnsUnmodified() { + TableMetadata result = snapshotDiffApplier.applySnapshots(null, baseMetadata); + + assertEquals(baseMetadata, result); + verifyNoInteractions(mockMetricsReporter); + } + + /** Verifies that table creation (null base) with main branch is handled correctly. */ + @Test + void testApplySnapshots_nullBase_handlesTableCreationWithMainBranch() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + assertNotNull(result); + assertEquals(snapshots.size(), result.snapshots().size()); + } + + /** Verifies that new snapshots are added correctly to branches. */ + @Test + void testApplySnapshots_addNewSnapshots_success() throws IOException { + List initialSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); + + List allSnapshots = new ArrayList<>(initialSnapshots); + allSnapshots.addAll(IcebergTestUtil.getExtraLinearSnapshots()); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo( + allSnapshots.get(allSnapshots.size() - 1)))); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertTrue(result.snapshots().size() > baseWithSnapshots.snapshots().size()); + + verify(mockMetricsReporter, atLeastOnce()).count(anyString(), anyDouble()); + } + + /** Verifies that new snapshots are added correctly to the main branch. */ + @Test + void testApplySnapshots_addNewSnapshotsToMainBranch_success() throws IOException { + List initialSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); + + List allSnapshots = new ArrayList<>(initialSnapshots); + allSnapshots.addAll(IcebergTestUtil.getExtraLinearSnapshots()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, allSnapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertTrue(result.snapshots().size() > baseWithSnapshots.snapshots().size()); + verify(mockMetricsReporter, atLeastOnce()).count(anyString(), anyDouble()); + } + + /** + * Verifies that snapshot-tracking properties are added without overwriting existing properties + * and that transfer properties are removed after applying the diff. 
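+ *
+ * <p>In short, the expectation exercised here (a restatement of the assertions below, with
+ * {@code result} being the metadata returned by the applier):
+ *
+ * <pre>{@code
+ * // transfer keys are consumed by the applier ...
+ * assertFalse(result.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY));
+ * assertFalse(result.properties().containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY));
+ * // ... while tracking keys and pre-existing properties are kept
+ * assertNotNull(result.properties().get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)));
+ * assertEquals("custom-value", result.properties().get("custom-property"));
+ * }</pre>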
+ */ + @Test + void testApplySnapshots_preservesExistingPropertiesAndSetsTracking() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + Map baseProperties = new HashMap<>(baseWithSnapshots.properties()); + baseProperties.put("custom-property", "custom-value"); + baseWithSnapshots = baseWithSnapshots.replaceProperties(baseProperties); + + List extraSnapshots = IcebergTestUtil.getExtraLinearSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(extraSnapshots); + + Map providedProperties = new HashMap<>(baseWithSnapshots.properties()); + providedProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + providedProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.createMainBranchRefPointingTo( + allSnapshots.get(allSnapshots.size() - 1)))); + + TableMetadata providedMetadata = baseWithSnapshots.replaceProperties(providedProperties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, providedMetadata); + + Map resultProperties = result.properties(); + + assertEquals("custom-value", resultProperties.get("custom-property")); + assertFalse(resultProperties.containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY)); + assertFalse(resultProperties.containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY)); + + String appendedSnapshots = + resultProperties.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots); + for (Snapshot extraSnapshot : extraSnapshots) { + assertTrue( + appendedSnapshots.contains(Long.toString(extraSnapshot.snapshotId())), + "Expected extra snapshot to be tracked as appended"); + } + } + + /** Verifies that deleting snapshots from main branch works correctly. */ + @Test + void testApplySnapshots_deleteSnapshotsFromMainBranch_success() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, remainingSnapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(remainingSnapshots.size(), result.snapshots().size()); + } + + /** Verifies that updating main branch references works correctly. */ + @Test + void testApplySnapshots_mainBranchUpdates_success() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot newBranchTarget = snapshots.get(1); + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(newBranchTarget); + TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, snapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertNotNull(result.currentSnapshot()); + assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); + } + + /** + * Verifies that when snapshots are provided out of JSON order, the parent pointers are trusted + * and snapshots are correctly reordered based on lineage before being added to Iceberg. 
+ */ + @Test + void testApplySnapshots_snapshotsOutOfOrder_reordersBasedOnParentPointers() throws IOException { + List initialSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); + + // Add extra snapshots which may have different timestamps + List extraSnapshots = IcebergTestUtil.getExtraLinearSnapshots(); + List allSnapshots = new ArrayList<>(initialSnapshots); + allSnapshots.addAll(extraSnapshots); + + // Deliberately create out-of-order snapshots (hardcoded random order) + // Original order: [0, 1, 2, 3, 4, ...] + // Out-of-order: [2, 0, 4, 1, 3, 5, 7, 6] + List outOfOrderSnapshots = new ArrayList<>(); + int[] indices = {2, 0, 4, 1, 3, 5, 7, 6}; // Hardcoded scrambled indices + for (int i = 0; i < Math.min(indices.length, allSnapshots.size()); i++) { + if (indices[i] < allSnapshots.size()) { + outOfOrderSnapshots.add(allSnapshots.get(indices[i])); + } + } + // Add any remaining snapshots + for (int i = indices.length; i < allSnapshots.size(); i++) { + outOfOrderSnapshots.add(allSnapshots.get(i)); + } + + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, outOfOrderSnapshots); + + // Should succeed: parent pointers are trusted and snapshots are reordered correctly + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + // Verify operation succeeded and MAIN branch points to the correct snapshot + assertNotNull(result); + assertNotNull(result.currentSnapshot()); + + // The MAIN branch should point to the last snapshot in the out-of-order list + // (which is used by createMetadataWithSnapshotsAndMainRef) + Snapshot expectedHead = outOfOrderSnapshots.get(outOfOrderSnapshots.size() - 1); + assertEquals(expectedHead.snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify that new snapshots were added (base had 4, we're adding extras) + assertTrue(result.snapshots().size() > initialSnapshots.size()); + } + + // ========== Validation Tests ========== + + /** Verifies that deleting the current snapshot without replacements throws an exception. */ + @Test + void testValidateCurrentSnapshotNotDeleted_whenCurrentDeleted_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(Collections.emptyList())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(new HashMap<>())); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + } + + /** + * Verifies that multiple branches can reference the same snapshot in a single commit. This is + * valid in Iceberg (e.g., after a merge or when creating a branch from an existing point). 
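+ *
+ * <p>Sketch of the shape being exercised (a condensed view of the setup below; {@code target}
+ * and {@code refs} stand in for the locals used in the test):
+ *
+ * <pre>{@code
+ * String sharedRef =
+ *     SnapshotRefParser.toJson(SnapshotRef.branchBuilder(target.snapshotId()).build());
+ * refs.put("branch1", sharedRef);
+ * refs.put("branch2", sharedRef); // same snapshot id on purpose - this must be accepted
+ * }</pre>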
+ */ + @Test + void testApplySnapshots_multipleBranchesReferenceSameSnapshot_succeeds() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot targetSnapshot = snapshots.get(0); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef ref = SnapshotRef.branchBuilder(targetSnapshot.snapshotId()).build(); + snapshotRefs.put("branch1", org.apache.iceberg.SnapshotRefParser.toJson(ref)); + snapshotRefs.put("branch2", org.apache.iceberg.SnapshotRefParser.toJson(ref)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + // Should succeed - multiple branches can point to the same snapshot + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(targetSnapshot.snapshotId(), result.ref("branch1").snapshotId()); + assertEquals(targetSnapshot.snapshotId(), result.ref("branch2").snapshotId()); + } + + /** + * Verifies that applying snapshots succeeds when existing metadata already contains multiple + * branches pointing to the same snapshot. + */ + @Test + void testApplySnapshots_existingDuplicateBranchRefs_allowed() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + TableMetadata.Builder existingBuilder = TableMetadata.buildFrom(baseMetadata); + for (Snapshot snapshot : snapshots) { + existingBuilder.addSnapshot(snapshot); + } + Snapshot sharedSnapshot = snapshots.get(0); + SnapshotRef sharedRef = SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build(); + existingBuilder.setRef("branch1", sharedRef); + existingBuilder.setRef("branch2", sharedRef); + + TableMetadata existingMetadata = existingBuilder.build(); + + Map refsJson = new HashMap<>(); + refsJson.put("branch1", SnapshotRefParser.toJson(sharedRef)); + refsJson.put("branch2", SnapshotRefParser.toJson(sharedRef)); + + Map properties = new HashMap<>(existingMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refsJson)); + + TableMetadata providedMetadata = existingMetadata.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(existingMetadata, providedMetadata); + + assertNotNull(result); + assertEquals(sharedSnapshot.snapshotId(), result.ref("branch1").snapshotId()); + assertEquals(sharedSnapshot.snapshotId(), result.ref("branch2").snapshotId()); + } + + /** + * Verifies that attempting to delete a snapshot that is still referenced by a branch or tag + * throws an exception. 
+ */ + @Test + void + testValidateDeletedSnapshotsNotReferenced_whenDeletedSnapshotStillReferenced_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot snapshotToDelete = snapshots.get(0); + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef ref = SnapshotRef.branchBuilder(snapshotToDelete.snapshotId()).build(); + snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(ref)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete snapshots")); + assertTrue(exception.getMessage().contains("still referenced")); + } + + /** + * Verifies that deleting the current snapshot from main branch without replacements throws an + * exception. + */ + @Test + void testApplySnapshots_deletingCurrentSnapshotFromMainBranchWithoutReplacement_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, Collections.emptyList(), new HashMap<>()); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + } + + /** Verifies that duplicate snapshot IDs in provided snapshots throw an exception. */ + @Test + void testApplySnapshots_duplicateSnapshotIds_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Create a list with duplicate snapshots (same snapshot ID appears twice) + List duplicateSnapshots = new ArrayList<>(); + duplicateSnapshots.add(snapshots.get(0)); + duplicateSnapshots.add(snapshots.get(0)); // Duplicate + + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, duplicateSnapshots); + + // Should throw IllegalStateException due to duplicate keys in toMap collector + assertThrows( + IllegalStateException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + } + + /** + * Verifies that providing a branch ref pointing to a non-existent snapshot ID causes an + * exception. This tests a critical bug where no validation exists before calling + * setBranchSnapshot. 
+ */ + @Test + void testApplySnapshots_refPointingToNonExistentSnapshot_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Create a ref pointing to a snapshot ID that doesn't exist in the snapshot list + long nonExistentSnapshotId = 999999999L; + Map refs = new HashMap<>(); + SnapshotRef invalidRef = SnapshotRef.branchBuilder(nonExistentSnapshotId).build(); + refs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(invalidRef)); + + TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, snapshots, refs); + + // SnapshotDiffApplier should throw InvalidIcebergSnapshotException when snapshot doesn't exist + assertThrows( + com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies that attempting to set a ref to a snapshot being deleted throws an exception. The + * validation correctly catches this case where a commit attempts to both delete a snapshot and + * set the main branch to point to that deleted snapshot. This prevents leaving the table in an + * invalid state. + */ + @Test + void testApplySnapshots_settingRefToDeletedSnapshot_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Try to delete the first snapshot, then point main branch to the first (deleted) one + Snapshot snapshotToDelete = snapshots.get(0); + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + + // Create refs pointing to the snapshot we're trying to delete + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshotToDelete.snapshotId()).build(); + refs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); + + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, remainingSnapshots, refs); + + // This should throw an exception because we're trying to delete a snapshot + // while setting a branch reference to it + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue( + exception + .getMessage() + .contains("Cannot delete snapshots that are still referenced by branches/tags")); + assertTrue(exception.getMessage().contains("snapshot " + snapshotToDelete.snapshotId())); + assertTrue(exception.getMessage().contains("main")); + } + + /** + * Verifies that a snapshot with an invalid (non-numeric) source snapshot ID in cherry-pick causes + * JsonSyntaxException during parsing. NOTE: This fails at the JSON parsing stage due to Iceberg's + * strict validation, not at the cherry-pick categorization stage. 
+ */ + @Test + void testApplySnapshots_invalidCherryPickSourceSnapshotId_failsAtParsingStage() { + // Create a custom snapshot JSON with invalid source-snapshot-id using Gson + // Note: Iceberg validates snapshot structure strictly, so this fails at Gson parsing + Gson gson = new Gson(); + JsonObject snapshotJson = new JsonObject(); + snapshotJson.addProperty("snapshot-id", 1234567890123456789L); + snapshotJson.addProperty("timestamp-ms", 1669126937912L); + JsonObject summary = new JsonObject(); + summary.addProperty("operation", "append"); + summary.addProperty("source-snapshot-id", "not-a-number"); + snapshotJson.add("summary", summary); + snapshotJson.addProperty("manifest-list", "/tmp/test.avro"); + snapshotJson.addProperty("schema-id", 0); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "[" + gson.toJson(snapshotJson) + "]"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException when Gson tries to parse the invalid source-snapshot-id + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies that a snapshot with null summary is handled correctly during WAP detection. Tests + * lines 172, 180, 202 which check snapshot.summary(). NOTE: This currently fails at Iceberg's + * parsing stage due to strict validation. + */ + @Test + void testApplySnapshots_snapshotWithNullSummary_failsAtParsingStage() { + // Create a custom snapshot JSON with null/missing summary using Gson + // Note: Iceberg validates snapshot structure strictly, so this fails at parsing + Gson gson = new Gson(); + JsonObject snapshotJson = new JsonObject(); + snapshotJson.addProperty("snapshot-id", 1234567890123456789L); + snapshotJson.addProperty("timestamp-ms", 1669126937912L); + snapshotJson.addProperty("manifest-list", "/tmp/test.avro"); + snapshotJson.addProperty("schema-id", 0); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "[" + gson.toJson(snapshotJson) + "]"); + + // Add a main branch ref pointing to this snapshot + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(1234567890123456789L).build(); + refs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException during Iceberg parsing due to missing required summary + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies behavior when provided snapshots are empty but refs are not. Tests that a ref pointing + * to nothing causes an exception. 
+ */ + @Test + void testApplySnapshots_emptySnapshotsWithNonEmptyRefs_throwsException() { + // Create refs pointing to a snapshot that doesn't exist + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(123456789L).build(); + refs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); + + TableMetadata newMetadata = + createMetadataWithSnapshots(baseMetadata, Collections.emptyList(), refs); + + // Should throw InvalidIcebergSnapshotException because ref points to non-existent snapshot + assertThrows( + com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** Verifies that null providedMetadata throws NullPointerException. */ + @Test + void testApplySnapshots_nullProvidedMetadata_throwsNullPointerException() { + NullPointerException exception = + assertThrows( + NullPointerException.class, + () -> snapshotDiffApplier.applySnapshots(baseMetadata, null)); + + assertTrue(exception.getMessage().contains("providedMetadata cannot be null")); + } + + /** Verifies that malformed JSON in SNAPSHOTS_JSON_KEY property throws exception. */ + @Test + void testApplySnapshots_malformedSnapshotsJson_throwsException() { + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "{ invalid json {{"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException or similar from Gson + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** Verifies that malformed JSON in SNAPSHOTS_REFS_KEY property throws exception. */ + @Test + void testApplySnapshots_malformedRefsJson_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, "{ invalid json {{"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException or similar from Gson + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies behavior when attempting to delete all snapshots with no replacement. This should be + * caught by the existing validation. + */ + @Test + void testApplySnapshots_deletingAllSnapshotsWithNoReplacement_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Try to delete all snapshots without providing replacements + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, Collections.emptyList(), new HashMap<>()); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + } + + // ========== Metrics Tests ========== + + /** Verifies that staged snapshots (not on main branch) trigger the correct metrics. 
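+ *
+ * <p>"Staged" here means a WAP snapshot that is present in the provided snapshot list but not
+ * reachable from any ref (the main ref below deliberately keeps pointing at the base head). The
+ * metric expectation, mirroring the verification in the test body:
+ *
+ * <pre>{@code
+ * verify(mockMetricsReporter)
+ *     .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR), anyDouble());
+ * }</pre>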
*/ + @Test + void testMetrics_addStagedSnapshots_recordsStagedCounter() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + // Use only the first snapshot as base to avoid sequence number conflicts with WAP snapshots + List baseSnapshotsList = List.of(baseSnapshots.get(0)); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshotsList); + + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshotsList); + allSnapshots.addAll(wapSnapshots); + + Map refs = + IcebergTestUtil.createMainBranchRefPointingTo( + baseSnapshotsList.get(baseSnapshotsList.size() - 1)); + TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, allSnapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + verify(mockMetricsReporter) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR), anyDouble()); + } + + /** Verifies that deleting snapshots from main branch triggers the correct metrics. */ + @Test + void testMetrics_deleteSnapshotsFromMainBranch_recordsDeletedCounter() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, remainingSnapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(remainingSnapshots.size(), result.snapshots().size()); + verify(mockMetricsReporter) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), eq(1.0)); + } + + // ========== Property Management Tests ========== + + /** Verifies that appended snapshot IDs to main branch are recorded in properties. */ + @Test + void testProperties_appendedSnapshotsToMainBranch_recordedCorrectly() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + List newSnapshotsList = IcebergTestUtil.getExtraLinearSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(newSnapshotsList); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, allSnapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + String appendedSnapshots = + result.properties().get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots, "Appended snapshots should be recorded in properties"); + + // Verify actual snapshot IDs are present + for (Snapshot newSnapshot : newSnapshotsList) { + assertTrue( + appendedSnapshots.contains(String.valueOf(newSnapshot.snapshotId())), + "Snapshot ID " + newSnapshot.snapshotId() + " should be in appended_snapshots"); + } + } + + /** + * Verifies that temporary snapshot processing keys are removed from final properties when adding + * to main branch. 
+ */ + @Test + void testProperties_tempKeysRemovedForMainBranch_success() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + assertNotNull(result); + assertFalse( + result.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY), + "Temp snapshots JSON key should be removed"); + assertFalse( + result.properties().containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY), + "Temp snapshots refs key should be removed"); + } + + // ========== Edge Case Tests ========== + + /** + * Verifies transition from table with unreferenced snapshots to having a MAIN branch. Tests + * ref-only update without snapshot changes. + */ + @Test + void testApplySnapshots_baseWithUnreferencedSnapshotsOnly_addFirstMainBranch() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Create base with snapshots but no refs (all unreferenced) + TableMetadata base = baseMetadata; + for (Snapshot snapshot : snapshots) { + base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + } + // Verify no refs in base + assertTrue(base.refs().isEmpty() || !base.refs().containsKey(SnapshotRef.MAIN_BRANCH)); + + // Provided: same snapshots + MAIN ref to one of them + Snapshot mainSnapshot = snapshots.get(2); + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(mainSnapshot); + TableMetadata newMetadata = createMetadataWithSnapshots(base, snapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); + + // Verify MAIN ref is set + assertNotNull(result.currentSnapshot()); + assertEquals(mainSnapshot.snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify no add/delete operations (ref-only update) + assertEquals(snapshots.size(), result.snapshots().size()); + Map resultProps = result.properties(); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS))); + } + + /** + * Verifies table creation with no snapshots (empty state). Tests that an empty table can be + * created successfully. + */ + @Test + void testApplySnapshots_nullBaseEmptySnapshotsEmptyRefs_createsEmptyTable() { + // Provided: empty snapshots list, empty refs + TableMetadata newMetadata = + createMetadataWithSnapshots(baseMetadata, Collections.emptyList(), new HashMap<>()); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + // Verify empty table created + assertNotNull(result); + assertEquals(0, result.snapshots().size()); + assertNull(result.currentSnapshot()); + assertTrue(result.refs().isEmpty() || !result.refs().containsKey(SnapshotRef.MAIN_BRANCH)); + + // Verify no snapshot operations tracked + Map resultProps = result.properties(); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS))); + } + + /** + * Verifies adding both regular and staged snapshots in a single commit. Tests that snapshot + * categorization correctly handles mixed types. 
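+ *
+ * <p>Expected bookkeeping after such a mixed commit (a restatement of the assertions below;
+ * {@code props}, {@code regular} and {@code wap} are stand-ins for the locals in the test):
+ *
+ * <pre>{@code
+ * Map<String, String> props = result.properties();
+ * // the regular snapshot (referenced by MAIN) is tracked as appended
+ * assertTrue(props.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS))
+ *     .contains(Long.toString(regular.snapshotId())));
+ * // the WAP snapshot (no ref) is tracked as staged
+ * assertTrue(props.get(getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS))
+ *     .contains(Long.toString(wap.snapshotId())));
+ * }</pre>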
+ */ + @Test + void testApplySnapshots_addRegularAndStagedSimultaneously() throws IOException { + // Start from empty base (no existing snapshots) + // Simulate a commit that adds both regular and staged snapshots simultaneously + + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + + // Create a custom WAP snapshot without hardcoded sequence number to avoid conflicts + // Build snapshot JSON manually and wrap it in a Gson array + String wapSnapshotJson = + String.format( + "{\"snapshot-id\":%d,\"timestamp-ms\":%d,\"summary\":%s,\"manifest-list\":\"%s\",\"schema-id\":%d}", + 999940701710231339L, + 1669126937912L, + new Gson() + .toJson( + Map.of( + "operation", "append", + "wap.id", "test-wap", + "spark.app.id", "local-1669126906634", + "added-data-files", "1", + "added-records", "1")), + "/data/test.avro", + 0); + String wapSnapshotArrayJson = new Gson().toJson(List.of(wapSnapshotJson)); + List customWapSnapshots = SnapshotsUtil.parseSnapshots(null, wapSnapshotArrayJson); + + List allSnapshots = new ArrayList<>(); + allSnapshots.add(extraSnapshots.get(0)); // New regular snapshot + allSnapshots.add(customWapSnapshots.get(0)); // New staged snapshot + + // MAIN ref points to the new regular snapshot + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(extraSnapshots.get(0)); + TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, allSnapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + // Verify both snapshots added + assertEquals(2, result.snapshots().size()); + + // Verify regular snapshot is on MAIN + assertNotNull(result.currentSnapshot()); + assertEquals(extraSnapshots.get(0).snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify tracking: regular appended, staged tracked separately + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + String stagedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS)); + + assertNotNull(appendedSnapshotsStr); + assertTrue(appendedSnapshotsStr.contains(Long.toString(extraSnapshots.get(0).snapshotId()))); + + assertNotNull(stagedSnapshotsStr); + assertTrue(stagedSnapshotsStr.contains(Long.toString(customWapSnapshots.get(0).snapshotId()))); + } + + /** + * Verifies cherry-picking a staged snapshot while adding a new snapshot in the same commit. Tests + * compound operation tracking. 
+ */ + @Test + void testApplySnapshots_cherryPickAndAddNewSimultaneously() throws IOException { + List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Base: MAIN snapshot + staged snapshot + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(testWapSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(testWapSnapshots.get(1)) // Staged snapshot + .build(); + + // Provided: existing + new snapshot becomes MAIN, staged is cherry-picked + List allSnapshots = new ArrayList<>(); + allSnapshots.add(testWapSnapshots.get(0)); + allSnapshots.add(testWapSnapshots.get(1)); // Was staged, now cherry-picked + allSnapshots.add(testWapSnapshots.get(2)); // New snapshot + + // MAIN ref points to new snapshot + Map refs = + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(2)); + TableMetadata newMetadata = createMetadataWithSnapshots(base, allSnapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); + + // Verify new snapshot is on MAIN + assertNotNull(result.currentSnapshot()); + assertEquals(testWapSnapshots.get(2).snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify both operations tracked + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + String cherryPickedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + + // New snapshot should be appended + assertNotNull(appendedSnapshotsStr); + assertTrue(appendedSnapshotsStr.contains(Long.toString(testWapSnapshots.get(2).snapshotId()))); + + // Staged snapshot should be cherry-picked + assertNotNull(cherryPickedSnapshotsStr); + assertTrue( + cherryPickedSnapshotsStr.contains(Long.toString(testWapSnapshots.get(1).snapshotId()))); + } + + /** + * Verifies that attempting to delete the current snapshot while unreferenced snapshots exist + * throws an exception. Tests current snapshot protection. + */ + @Test + void testApplySnapshots_attemptDeleteCurrentWithUnreferencedPresent_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Base: MAIN snapshot + 2 unreferenced snapshots + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .addSnapshot(snapshots.get(0)) // Unreferenced + .addSnapshot(snapshots.get(1)) // Unreferenced + .setBranchSnapshot(snapshots.get(2), SnapshotRef.MAIN_BRANCH) // Current snapshot + .build(); + + // Provided: only the 2 unreferenced (delete MAIN), no new snapshots + List remainingSnapshots = snapshots.subList(0, 2); + TableMetadata newMetadata = + createMetadataWithSnapshots(base, remainingSnapshots, new HashMap<>()); + + // Should throw exception because current snapshot is being deleted without replacement + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(base, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + assertTrue(exception.getMessage().contains(Long.toString(snapshots.get(2).snapshotId()))); + } + + /** + * Verifies adding regular (non-WAP) snapshots with empty refs. historically, such snapshots were + * automatically added to MAIN branch and tracked as APPENDED_SNAPSHOTS. This test validates + * backward compatibility with that behavior. 
NOTE: The semantics here are questionable - + * snapshots with no refs should arguably not be "appended" to MAIN, but this preserves the + * original behavior. + */ + @Test + void testApplySnapshots_regularSnapshotsWithEmptyRefs_autoAppendedToMain() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + // Provided: existing + new snapshots, but empty refs map (no MAIN branch) + List extraSnapshots = IcebergTestUtil.getExtraLinearSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(extraSnapshots); + + // Empty refs - no MAIN branch + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, allSnapshots, new HashMap<>()); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + // Verify new snapshots added + assertEquals(allSnapshots.size(), result.snapshots().size()); + + // Verify MAIN branch points to the latest snapshot (auto-appended to main) + assertNotNull(result.ref(SnapshotRef.MAIN_BRANCH)); + assertEquals( + allSnapshots.get(allSnapshots.size() - 1).snapshotId(), + result.ref(SnapshotRef.MAIN_BRANCH).snapshotId()); + + // Verify new snapshots tracked as appended (even though unreferenced, they're not staged WAP) + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + + assertNotNull(appendedSnapshotsStr); + for (Snapshot extraSnapshot : extraSnapshots) { + assertTrue( + appendedSnapshotsStr.contains(Long.toString(extraSnapshot.snapshotId())), + "Snapshot " + extraSnapshot.snapshotId() + " should be tracked as appended"); + } + } + + /** + * Verifies cherry-picking multiple staged snapshots in sequence, testing both fast-forward and + * rebase scenarios. wap1 and wap2 both have the same parent. Cherry-picking wap1 first is a + * fast-forward (no new snapshot). Cherry-picking wap2 after main has moved requires a rebase (new + * snapshot created). 
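+ *
+ * <p>Condensed expectations for the two steps (a restatement of the assertions below;
+ * {@code props} stands in for the resulting metadata properties at each step):
+ *
+ * <pre>{@code
+ * // fast-forward: only the source id is tracked, nothing new is appended
+ * props.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); // contains wap1
+ * props.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS));      // null
+ *
+ * // rebase: the source is cherry-picked and the newly created snapshot is appended
+ * props.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); // contains wap2
+ * props.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS));      // contains the rebased snapshot
+ * }</pre>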
+ */ + @Test + void testApplySnapshots_cherryPickMultipleStagedSnapshotsOutOfOrder() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Setup: MAIN snapshot + 2 staged WAP snapshots (wap1, wap2) + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(testWapSnapshots.get(0)) // wap1 (wap.id="wap1") + .addSnapshot(testWapSnapshots.get(1)) // wap2 (wap.id="wap2") + .build(); + + // Step 1: Fast-forward cherry-pick wap1 + // wap1's parent == current main, so it's promoted directly (no new snapshot) + List allSnapshots1 = new ArrayList<>(); + allSnapshots1.add(testSnapshots.get(0)); + allSnapshots1.add(testWapSnapshots.get(0)); // wap1 now on main + allSnapshots1.add(testWapSnapshots.get(1)); // wap2 still staged + + // Set MAIN branch to point to wap1 + Map refs1 = + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)); + TableMetadata newMetadata1 = createMetadataWithSnapshots(base, allSnapshots1, refs1); + + TableMetadata result1 = snapshotDiffApplier.applySnapshots(base, newMetadata1); + + // Verify fast-forward: only cherry_picked tracked, no new snapshot appended + assertNotNull(result1.currentSnapshot()); + assertEquals(testWapSnapshots.get(0).snapshotId(), result1.currentSnapshot().snapshotId()); + + Map resultProps1 = result1.properties(); + String cherryPickedSnapshots1 = + resultProps1.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + assertNotNull(cherryPickedSnapshots1); + assertTrue( + cherryPickedSnapshots1.contains(Long.toString(testWapSnapshots.get(0).snapshotId())), + "wap1 should be tracked as cherry-picked"); + assertNull( + resultProps1.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)), + "No new snapshot for fast-forward"); + + // Step 2: Rebase cherry-pick wap2 + // wap2's parent != current main (which is now wap1), so a new snapshot is created + // New snapshot has: parent=wap1, source-snapshot-id=wap2, published.wap.id="wap2" + List allSnapshots2 = new ArrayList<>(); + allSnapshots2.add(testSnapshots.get(0)); + allSnapshots2.add(testWapSnapshots.get(0)); // wap1 + allSnapshots2.add(testWapSnapshots.get(1)); // wap2 (source) + allSnapshots2.add(testWapSnapshots.get(2)); // New rebased snapshot + + Map refs2 = + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(2)); + TableMetadata newMetadata2 = createMetadataWithSnapshots(result1, allSnapshots2, refs2); + + TableMetadata result2 = snapshotDiffApplier.applySnapshots(result1, newMetadata2); + + // Verify rebase: both cherry_picked (source) and appended (new snapshot) tracked + assertNotNull(result2.currentSnapshot()); + assertEquals(testWapSnapshots.get(2).snapshotId(), result2.currentSnapshot().snapshotId()); + + Map resultProps2 = result2.properties(); + + String cherryPickedSnapshots2 = + resultProps2.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + assertNotNull(cherryPickedSnapshots2); + assertTrue( + cherryPickedSnapshots2.contains(Long.toString(testWapSnapshots.get(1).snapshotId())), + "wap2 should be tracked as cherry-picked (source)"); + + String appendedSnapshots2 = + resultProps2.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots2); + assertTrue( + appendedSnapshots2.contains(Long.toString(testWapSnapshots.get(2).snapshotId())), + "New rebased snapshot should be tracked as appended"); + + // 
Verify all 4 snapshots present + assertEquals(4, result2.snapshots().size()); + verify(mockMetricsReporter, atLeastOnce()) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR), anyDouble()); + } + + // ========== Branch Update Tests ========== + + /** Verifies that updating branch references works correctly. */ + @Test + void testApplySnapshots_branchUpdates_appliesCorrectly() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot newBranchTarget = snapshots.get(1); + Map snapshotRefs = + IcebergTestUtil.createMainBranchRefPointingTo(newBranchTarget); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertNotNull(result.currentSnapshot()); + assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); + } + + /** + * Verifies that multiple branch updates can be applied simultaneously. This is a PR2-specific + * test for multi-branch support. + */ + @Test + void testApplySnapshots_multipleBranchUpdates_success() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshots.get(0).snapshotId()).build(); + SnapshotRef devRef = SnapshotRef.branchBuilder(snapshots.get(1).snapshotId()).build(); + snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); + snapshotRefs.put("dev", org.apache.iceberg.SnapshotRefParser.toJson(devRef)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(2, result.refs().size()); + } + + // ========== Helper Methods ========== + + /** + * Adds snapshots to metadata and sets main branch to the last snapshot. + * + * @param metadata Base metadata + * @param snapshots Snapshots to add + * @return Updated metadata + */ + private TableMetadata addSnapshotsToMetadata(TableMetadata metadata, List snapshots) { + TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); + for (Snapshot snapshot : snapshots) { + builder.addSnapshot(snapshot); + } + if (!snapshots.isEmpty()) { + Snapshot lastSnapshot = snapshots.get(snapshots.size() - 1); + SnapshotRef ref = SnapshotRef.branchBuilder(lastSnapshot.snapshotId()).build(); + builder.setRef(SnapshotRef.MAIN_BRANCH, ref); + } + return builder.build(); + } + + // ========== Multi-Branch Commit Tests ========== + + /** + * Verifies that when new snapshots are committed to multiple branches simultaneously, + * APPENDED_SNAPSHOTS only tracks the MAIN branch snapshots (backward compatibility). + * + *
<p>
This test simulates a single commit that updates both MAIN and branch-A: - MAIN branch: gets + * 2 new commits - branch-A: gets 2 new commits + * + *
<p>
Expected: APPENDED_SNAPSHOTS property should only contain the MAIN branch snapshots, not the + * branch-A snapshots. + */ + @Test + void testMultiBranchCommit_newSnapshotsOnMainAndBranchA_appendedSnapshotsOnlyContainsMain() + throws IOException { + // 1. Create base metadata with one snapshot on main + List snapshots = IcebergTestUtil.getSnapshots(); + Snapshot baseSnapshot = snapshots.get(0); + + TableMetadata.Builder baseBuilder = TableMetadata.buildFrom(baseMetadata); + baseBuilder.setBranchSnapshot(baseSnapshot, SnapshotRef.MAIN_BRANCH); + TableMetadata baseWithSnapshots = baseBuilder.build(); + + // 2. Create new metadata from base with: + // - 2 more snapshots on main + // - 2 snapshots on feature branch + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + Snapshot mainSnapshot1 = extraSnapshots.get(0); + Snapshot mainSnapshot2 = extraSnapshots.get(1); + Snapshot featureSnapshot1 = extraSnapshots.get(2); + Snapshot featureSnapshot2 = extraSnapshots.get(3); + + TableMetadata.Builder newBuilder = TableMetadata.buildFrom(baseWithSnapshots); + newBuilder.setBranchSnapshot(mainSnapshot1, SnapshotRef.MAIN_BRANCH); + newBuilder.setBranchSnapshot(mainSnapshot2, SnapshotRef.MAIN_BRANCH); + newBuilder.setBranchSnapshot(featureSnapshot1, "branch-A"); + newBuilder.setBranchSnapshot(featureSnapshot2, "branch-A"); + + TableMetadata newMetadataWithSnapshots = newBuilder.build(); + + // Add properties with snapshot JSON and refs + List allSnapshots = + List.of(baseSnapshot, mainSnapshot1, mainSnapshot2, featureSnapshot1, featureSnapshot2); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(mainSnapshot2.snapshotId()).build(); + snapshotRefs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(mainRef)); + SnapshotRef branchARef = SnapshotRef.branchBuilder(featureSnapshot2.snapshotId()).build(); + snapshotRefs.put("branch-A", SnapshotRefParser.toJson(branchARef)); + + Map properties = new HashMap<>(newMetadataWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = newMetadataWithSnapshots.replaceProperties(properties); + + // 3. Verify newMetadata structure is as expected + assertEquals(5, newMetadata.snapshots().size(), "Should have 1 base + 4 new snapshots"); + assertEquals( + mainSnapshot2.snapshotId(), + newMetadata.ref(SnapshotRef.MAIN_BRANCH).snapshotId(), + "Main should point to mainSnapshot2"); + assertEquals( + featureSnapshot2.snapshotId(), + newMetadata.ref("branch-A").snapshotId(), + "branch-A should point to featureSnapshot2"); + + // 4. Apply snapshots + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + // 5. 
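The assertions below pin down the backward-compatibility contract: APPENDED_SNAPSHOTS reflects only what became newly reachable from MAIN. One way to express that filtering, purely as an illustration and not necessarily how SnapshotDiffApplier computes it (MainLineageSketch and its methods are hypothetical names), is to diff the MAIN lineages of the old and new metadata:

import java.util.HashSet;
import java.util.Set;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.TableMetadata;

final class MainLineageSketch {
  private MainLineageSketch() {}

  /** Snapshot ids reachable from the new MAIN tip but not from the old one. */
  static Set<Long> appendedOnMain(TableMetadata base, TableMetadata updated) {
    Set<Long> appended = lineage(updated);
    appended.removeAll(lineage(base));
    return appended; // snapshots committed only to branch-A never appear here
  }

  /** Walks parent pointers from the MAIN ref back to the root. */
  private static Set<Long> lineage(TableMetadata metadata) {
    Set<Long> ids = new HashSet<>();
    SnapshotRef mainRef = metadata.ref(SnapshotRef.MAIN_BRANCH);
    Long id = mainRef == null ? null : mainRef.snapshotId();
    while (id != null && metadata.snapshot(id) != null) {
      ids.add(id);
      id = metadata.snapshot(id).parentId();
    }
    return ids;
  }
}

+ // 5.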
Verify result is combination of baseMetadata and newMetadata + assertNotNull(result); + + // Result should have all 5 snapshots (1 from base + 4 new) + assertEquals(5, result.snapshots().size(), "Result should have all snapshots from newMetadata"); + + // Result should have both branch refs + assertEquals( + mainSnapshot2.snapshotId(), + result.ref(SnapshotRef.MAIN_BRANCH).snapshotId(), + "Main should point to mainSnapshot2"); + assertEquals( + featureSnapshot2.snapshotId(), + result.ref("branch-A").snapshotId(), + "branch-A should point to featureSnapshot2"); + + // APPENDED_SNAPSHOTS should only contain the 2 main branch snapshots (not feature branch) + String appendedSnapshots = + result.properties().get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots, "APPENDED_SNAPSHOTS property should exist"); + + // Should contain both main snapshots + assertTrue( + appendedSnapshots.contains(String.valueOf(mainSnapshot1.snapshotId())), + "APPENDED_SNAPSHOTS should contain mainSnapshot1"); + assertTrue( + appendedSnapshots.contains(String.valueOf(mainSnapshot2.snapshotId())), + "APPENDED_SNAPSHOTS should contain mainSnapshot2"); + + // Should NOT contain feature branch snapshots + assertFalse( + appendedSnapshots.contains(String.valueOf(featureSnapshot1.snapshotId())), + "APPENDED_SNAPSHOTS should NOT contain featureSnapshot1"); + assertFalse( + appendedSnapshots.contains(String.valueOf(featureSnapshot2.snapshotId())), + "APPENDED_SNAPSHOTS should NOT contain featureSnapshot2"); + } +} diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java deleted file mode 100644 index 3fb9ced17..000000000 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java +++ /dev/null @@ -1,171 +0,0 @@ -package com.linkedin.openhouse.internal.catalog; - -import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; -import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapperTest; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.attribute.FileAttribute; -import java.nio.file.attribute.PosixFilePermission; -import java.nio.file.attribute.PosixFilePermissions; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.UUID; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.hadoop.HadoopOutputFile; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; 
-import org.mockito.Mockito; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.annotation.Import; - -@SpringBootTest -@Import(HouseTableMapperTest.MockConfiguration.class) -class SnapshotInspectorTest { - - @Autowired SnapshotInspector snapshotInspector; - - @TempDir static Path tempDir; - - private static final TableMetadata NO_SNAPSHOTS_METADATA = - TableMetadata.newTableMetadata( - new Schema( - Types.NestedField.required(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone())), - PartitionSpec.unpartitioned(), - UUID.randomUUID().toString(), - ImmutableMap.of()); - - @Test - void testValidateSnapshotsUpdateWithNoSnapshotMetadata() throws IOException { - - List testSnapshots = IcebergTestUtil.getSnapshots(); - // No exception since added as well deleted snapshots are allowed to support replication - // use case which performs table commit with added and deleted snapshots. - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, testSnapshots.subList(0, 1), testSnapshots.subList(1, 4))); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, testSnapshots, Collections.emptyList())); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, Collections.emptyList(), testSnapshots)); - } - - @Test - void testValidateSnapshotsUpdateWithSnapshotMetadata() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - List extraTestSnapshots = IcebergTestUtil.getExtraSnapshots(); - TableMetadata metadataWithSnapshots = - TableMetadata.buildFrom(NO_SNAPSHOTS_METADATA) - .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) - .build(); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, testSnapshots, Collections.emptyList())); - // No validation error if snapshots are added and deleted - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, testSnapshots, testSnapshots)); - // No validation error if snapshots are added and deleted - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, extraTestSnapshots, testSnapshots)); - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, Collections.emptyList(), testSnapshots)); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, - Collections.emptyList(), - testSnapshots.subList(0, testSnapshots.size() - 1))); - } - - @Test - void testSecureSnapshot() throws IOException { - // The default file attribute that sets the permission as 777 when a file is created. 
- FileAttribute> attr = - PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxrwxrwx")); - - // Mock DataFile and ManifestFile - Snapshot mockSnapshot = Mockito.mock(org.apache.iceberg.Snapshot.class); - Path tempFile1 = Files.createFile(tempDir.resolve("data1.parquet"), attr); - Path tempFile2 = Files.createFile(tempDir.resolve("data2.parquet"), attr); - Path tempFile3 = Files.createFile(tempDir.resolve("manifest"), attr); - - // Mock FileIO - FileIO fileIO = Mockito.mock(org.apache.iceberg.io.FileIO.class); - - List dataFileList = - ImmutableList.of( - createDataFile(tempFile1.toString()), createDataFile(tempFile2.toString())); - - ManifestWriter manifestWriter = - ManifestFiles.write( - PartitionSpec.unpartitioned(), - HadoopOutputFile.fromLocation(tempFile3.toString(), new Configuration())); - manifestWriter.close(); - - Mockito.when(mockSnapshot.allManifests(fileIO)) - .thenReturn(ImmutableList.of(manifestWriter.toManifestFile())); - Mockito.when(mockSnapshot.addedDataFiles(fileIO)).thenReturn(dataFileList); - snapshotInspector.secureSnapshot(mockSnapshot, fileIO); - - /* Verify the perms of files are modified as com.linkedin.openhouse.internal.catalog.MockApplication.perm does */ - FileSystem fileSystem = FileSystem.get(new Configuration()); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile1.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile2.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile3.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - } - - public static DataFile createDataFile(String dataPath) throws IOException { - Files.write(Paths.get(dataPath), Lists.newArrayList(), StandardCharsets.UTF_8); - return DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(dataPath) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - } -} diff --git a/iceberg/openhouse/internalcatalog/src/test/resources/extra_linear_serialized_snapshots.json b/iceberg/openhouse/internalcatalog/src/test/resources/extra_linear_serialized_snapshots.json new file mode 100644 index 000000000..8fc804818 --- /dev/null +++ b/iceberg/openhouse/internalcatalog/src/test/resources/extra_linear_serialized_snapshots.json @@ -0,0 +1,7 @@ +[ + "{ \"snapshot-id\" : 5151407017102313398, \"parent-snapshot-id\" : 4151407017102313398, \"sequence-number\" : 5, \"timestamp-ms\" : 1669126937916, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"5\", \"total-files-size\" : \"3365\", \"total-data-files\" : \"5\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-5151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 6151407017102313398, \"parent-snapshot-id\" : 5151407017102313398, \"sequence-number\" : 6, \"timestamp-ms\" : 1669126937917, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", 
\"changed-partition-count\" : \"1\", \"total-records\" : \"6\", \"total-files-size\" : \"4038\", \"total-data-files\" : \"6\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-6151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 7151407017102313398, \"parent-snapshot-id\" : 6151407017102313398, \"sequence-number\" : 7, \"timestamp-ms\" : 1669126937918, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"7\", \"total-files-size\" : \"4711\", \"total-data-files\" : \"7\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-7151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 8151407017102313398, \"parent-snapshot-id\" : 7151407017102313398, \"sequence-number\" : 8, \"timestamp-ms\" : 1669126937919, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"8\", \"total-files-size\" : \"5384\", \"total-data-files\" : \"8\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-8151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" +] + diff --git a/iceberg/openhouse/internalcatalog/src/test/resources/extra_serialized_snapshots.json b/iceberg/openhouse/internalcatalog/src/test/resources/extra_serialized_snapshots.json index 08e74a1ba..8eb894d64 100644 --- a/iceberg/openhouse/internalcatalog/src/test/resources/extra_serialized_snapshots.json +++ b/iceberg/openhouse/internalcatalog/src/test/resources/extra_serialized_snapshots.json @@ -1,6 +1,6 @@ [ - "{ \"snapshot-id\" : 5151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 6151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : 
\"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 7151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 8151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" + "{ \"snapshot-id\" : 5151407017102313398, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 5, \"timestamp-ms\" : 1669126937916, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"2\", \"total-files-size\" : \"1346\", \"total-data-files\" : \"2\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-5151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 6151407017102313398, \"parent-snapshot-id\" : 5151407017102313398, \"sequence-number\" : 6, \"timestamp-ms\" : 1669126937917, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"3\", \"total-files-size\" : \"2019\", \"total-data-files\" : \"3\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-6151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 7151407017102313398, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 7, \"timestamp-ms\" : 1669126937918, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"2\", \"total-files-size\" : \"1346\", \"total-data-files\" : \"2\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, 
\"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-7151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 8151407017102313398, \"parent-snapshot-id\" : 7151407017102313398, \"sequence-number\" : 8, \"timestamp-ms\" : 1669126937919, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"3\", \"total-files-size\" : \"2019\", \"total-data-files\" : \"3\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-8151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" ] diff --git a/iceberg/openhouse/internalcatalog/src/test/resources/future_serialized_snapshots.json b/iceberg/openhouse/internalcatalog/src/test/resources/future_serialized_snapshots.json index 8d04dbd59..636e459ab 100644 --- a/iceberg/openhouse/internalcatalog/src/test/resources/future_serialized_snapshots.json +++ b/iceberg/openhouse/internalcatalog/src/test/resources/future_serialized_snapshots.json @@ -1,3 +1,3 @@ [ - "{ \"snapshot-id\" : 4151407017102313399, \"timestamp-ms\" : 2655481960000, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" + "{ \"snapshot-id\" : 4151407017102313399, \"parent-snapshot-id\" : 4151407017102313398, \"sequence-number\" : 9, \"timestamp-ms\" : 2655481960000, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"5\", \"total-files-size\" : \"3365\", \"total-data-files\" : \"5\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-4151407017102313399-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" ] \ No newline at end of file diff --git a/iceberg/openhouse/internalcatalog/src/test/resources/serialized_snapshots.json b/iceberg/openhouse/internalcatalog/src/test/resources/serialized_snapshots.json index 7fbc6666b..55d28da09 100644 --- a/iceberg/openhouse/internalcatalog/src/test/resources/serialized_snapshots.json +++ b/iceberg/openhouse/internalcatalog/src/test/resources/serialized_snapshots.json @@ -1,6 +1,6 @@ [ - "{ \"snapshot-id\" : 1151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", 
\"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 2151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 3151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 4151407017102313398, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" + "{ \"snapshot-id\" : 1151407017102313398, \"sequence-number\" : 1, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-1151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 2151407017102313398, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 2, \"timestamp-ms\" : 1669126937913, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"2\", \"total-files-size\" : \"1346\", \"total-data-files\" : \"2\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", 
\"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 3151407017102313398, \"parent-snapshot-id\" : 2151407017102313398, \"sequence-number\" : 3, \"timestamp-ms\" : 1669126937914, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"3\", \"total-files-size\" : \"2019\", \"total-data-files\" : \"3\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-3151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 4151407017102313398, \"parent-snapshot-id\" : 3151407017102313398, \"sequence-number\" : 4, \"timestamp-ms\" : 1669126937915, \"summary\" : { \"operation\" : \"append\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"4\", \"total-files-size\" : \"2692\", \"total-data-files\" : \"4\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-4151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" ] diff --git a/iceberg/openhouse/internalcatalog/src/test/resources/wap_serialized_snapshots.json b/iceberg/openhouse/internalcatalog/src/test/resources/wap_serialized_snapshots.json index 15107d1fe..720ad99aa 100644 --- a/iceberg/openhouse/internalcatalog/src/test/resources/wap_serialized_snapshots.json +++ b/iceberg/openhouse/internalcatalog/src/test/resources/wap_serialized_snapshots.json @@ -1,5 +1,5 @@ [ - "{ \"snapshot-id\" : 1151407017102313399, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 1, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"wap.id\" : \"wap1\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 2151407017102313399, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 2, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"wap.id\" : \"wap2\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : 
\"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", - "{ \"snapshot-id\" : 3151407017102313399, \"parent-snapshot-id\" : 1151407017102313399, \"sequence-number\" : 3, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"source-snapshot-id\" : \"2151407017102313399\", \"operation\" : \"append\", \"published.wap.id\" : \"wap2\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" + "{ \"snapshot-id\" : 1151407017102313399, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 5, \"timestamp-ms\" : 1669126937912, \"summary\" : { \"operation\" : \"append\", \"wap.id\" : \"wap1\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 2151407017102313399, \"parent-snapshot-id\" : 1151407017102313398, \"sequence-number\" : 6, \"timestamp-ms\" : 1669126937913, \"summary\" : { \"operation\" : \"append\", \"wap.id\" : \"wap2\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}", + "{ \"snapshot-id\" : 3151407017102313399, \"parent-snapshot-id\" : 1151407017102313399, \"sequence-number\" : 7, \"timestamp-ms\" : 1669126937914, \"summary\" : { \"source-snapshot-id\" : \"2151407017102313399\", \"operation\" : \"append\", \"published.wap.id\" : \"wap2\", \"spark.app.id\" : \"local-1669126906634\", \"added-data-files\" : \"1\", \"added-records\" : \"1\", \"added-files-size\" : \"673\", \"changed-partition-count\" : \"1\", \"total-records\" : \"1\", \"total-files-size\" : \"673\", \"total-data-files\" : \"1\", \"total-delete-files\" : \"0\", \"total-position-deletes\" : \"0\", \"total-equality-deletes\" : \"0\" }, \"manifest-list\" : \"/data/openhouse/db/test-7a9e8c95-1a62-4d29-9621-d8784047fc6b/metadata/snap-2151407017102313398-1-aa0dcbb9-707f-4f53-9df8-394bad8563f2.avro\", \"schema-id\" : 0}" ] diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java 
b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java new file mode 100644 index 000000000..478059289 --- /dev/null +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -0,0 +1,2370 @@ +package com.linkedin.openhouse.spark.catalogtest; + +import static org.junit.jupiter.api.Assertions.*; + +import com.linkedin.openhouse.tablestest.OpenHouseSparkITest; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.MethodOrderer; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; + +/** + * Comprehensive tests for multi-branch WAP operations in Spark 3.5. Tests validate the enhanced + * applySnapshotOperations functionality that supports: - Non-main branch operations (add/expire + * snapshots from any branch) - WAP.id staging with multi-branch support - Cherry picking between + * any branches - Fast forward merges for all branches - Backward compatibility with main-only + * workflows - Forward compatibility for future wap.branch features + */ +@TestMethodOrder(MethodOrderer.MethodName.class) +@Execution(ExecutionMode.SAME_THREAD) +public class BranchTestSpark3_5 extends OpenHouseSparkITest { + + /** + * Comprehensive cleanup method to prevent configuration and table bleed-over between tests. This + * ensures WAP configurations are properly reset and all test tables are dropped. + */ + @AfterEach + public void cleanupAfterTest() { + try (SparkSession spark = getSparkSession()) { + // Clear WAP configurations to prevent bleed-over between tests + spark.conf().unset("spark.wap.id"); + spark.conf().unset("spark.wap.branch"); + + // Drop all test tables to ensure clean state for next test + // Get all tables in the d1 database that start with branch_test_ or similar patterns + try { + List tables = spark.sql("SHOW TABLES IN openhouse.d1").collectAsList(); + for (Row table : tables) { + String tableName = table.getString(1); // table name is in second column + if (tableName.startsWith("branch_test_") || tableName.startsWith("test_")) { + String fullTableName = "openhouse.d1." + tableName; + spark.sql("DROP TABLE IF EXISTS " + fullTableName); + } + } + } catch (Exception e) { + // If SHOW TABLES fails, try to drop common test table patterns + // This is a fallback in case the database doesn't exist yet + for (String pattern : new String[] {"branch_test_", "test_"}) { + for (int i = 0; i < 10; i++) { // Try a few recent timestamps + long timestamp = System.currentTimeMillis() - (i * 1000); + String tableName = "openhouse.d1." + pattern + timestamp; + try { + spark.sql("DROP TABLE IF EXISTS " + tableName); + } catch (Exception ignored) { + // Ignore failures for non-existent tables + } + } + } + } + } catch (Exception e) { + // Log but don't fail the test for cleanup issues + System.err.println("Warning: Failed to cleanup after test: " + e.getMessage()); + } + } + + // ===== BASIC BRANCH OPERATIONS ===== + + @Test + public void testBasicBranchOperations() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Add initial data to main + spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Write to feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data2')"); + + // Verify branch isolation + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature-a has 3 rows + + // Verify refs exist for both branches + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refs.size()); + Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(refNames.contains("feature_a")); + assertTrue(refNames.contains("main")); + } + } + + // ===== WAP STAGING WITH MULTI-BRANCH SUPPORT ===== + + @Test + public void testWapStagingWithBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup main and feature branches + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data')"); + + // Stage WAP snapshot (should not affect any branch) + spark.conf().set("spark.wap.id", "multi-branch-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.staged.data')"); + + // Verify WAP staging doesn't affect branch visibility + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature-a unchanged + + // Verify WAP snapshot exists but no new refs + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + ".snapshots") + .collectAsList() + .size()); // 1 main + 1 feature + 1 wap + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + ".refs") + .collectAsList() + .size()); // main + feature-a only + + // Verify WAP snapshot has correct properties + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'multi-branch-wap'") + .collectAsList(); + assertEquals(1, wapSnapshots.size()); + } + } + + @Test + public void testWapIdAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_id_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Enable WAP on the table + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Verify no branches exist yet (empty table has no branches) + List initialRefs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(0, initialRefs.size(), "Empty table should have no branches initially"); + + // ===== WAP STAGING ON EMPTY TABLE ===== + + // 1. Create WAP staged data on empty table (should create staging snapshot) + spark.conf().set("spark.wap.id", "wap-stage-1"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_1')"); + spark.conf().unset("spark.wap.id"); + + // Verify WAP snapshot was created + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id, summary FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-1'") + .collectAsList(); + assertEquals(1, wapSnapshots.size(), "Should have 1 WAP staged snapshot"); + + // Verify no branches exist yet (WAP staging doesn't create branches) + List refsAfterWapStaging = + spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(0, refsAfterWapStaging.size(), "WAP staging should not create branches"); + + // Verify WAP data is not visible in main queries (no branch exists) + assertEquals( + 0, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 0 rows - no branches exist, WAP data is staged"); + + // ===== WAP PUBLISHING TO CREATE MAIN BRANCH ===== + + // 2. Publish WAP data to create main branch + String wapSnapshotId = String.valueOf(wapSnapshots.get(0).getLong(0)); + spark.sql( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', " + + wapSnapshotId + + ")"); + + // Verify main branch now exists + List refsAfterPublishing = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, refsAfterPublishing.size(), "Should have main branch after publishing WAP data"); + assertEquals("main", refsAfterPublishing.get(0).getString(0), "Should have main branch"); + + // Verify WAP data is now visible in main branch + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row after WAP publishing"); + + List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals( + "wap_staged_data_1", mainData.get(0).getString(0), "Should see published WAP data"); + + // ===== MULTI-WAP OPERATIONS ===== + + // 3. 
Create multiple WAP staged data sets + spark.conf().set("spark.wap.id", "wap-stage-2"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_2')"); + spark.conf().unset("spark.wap.id"); + + spark.conf().set("spark.wap.id", "wap-stage-3"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_3')"); + spark.conf().unset("spark.wap.id"); + + // Verify multiple WAP snapshots exist + List allWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] IS NOT NULL") + .collectAsList(); + assertEquals(3, allWapSnapshots.size(), "Should have 3 WAP staged snapshots"); + + // Verify main branch is unchanged (WAP data is staged) + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (staged WAP not visible)"); + + // ===== SELECTIVE WAP PUBLISHING ===== + + // 4. Publish second WAP data set only + List wap2Snapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-2'") + .collectAsList(); + String wap2SnapshotId = String.valueOf(wap2Snapshots.get(0).getLong(0)); + spark.sql( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', " + + wap2SnapshotId + + ")"); + + // Verify main branch now has both published datasets + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 2 rows after second WAP publishing"); + + List publishedData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "wap_staged_data_1", + publishedData.get(0).getString(0), + "First row should be first WAP data"); + assertEquals( + "wap_staged_data_2", + publishedData.get(1).getString(0), + "Second row should be second WAP data"); + + // ===== UNPUBLISHED WAP DATA VERIFICATION ===== + + // 5. Verify third WAP data remains unpublished + List wap3Snapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-3'") + .collectAsList(); + assertEquals(1, wap3Snapshots.size(), "Third WAP snapshot should still exist"); + + // Verify unpublished WAP data is not visible + List currentData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertFalse( + currentData.stream().anyMatch(row -> "wap_staged_data_3".equals(row.getString(0))), + "Unpublished WAP data should not be visible in main branch"); + + // ===== REGULAR DATA VS WAP DATA ===== + + // 6. Add regular (non-WAP) data to main branch + spark.sql("INSERT INTO " + tableName + " VALUES ('regular_data')"); + + // Verify main branch now has mixed data + assertEquals( + 3, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 3 rows (2 published WAP + 1 regular)"); + + List finalData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("regular_data", finalData.get(0).getString(0), "Should contain regular data"); + assertEquals( + "wap_staged_data_1", finalData.get(1).getString(0), "Should contain first WAP data"); + assertEquals( + "wap_staged_data_2", finalData.get(2).getString(0), "Should contain second WAP data"); + + // ===== SNAPSHOT HISTORY VERIFICATION ===== + + // 7. 
Verify snapshot counts and types + List totalSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertTrue( + totalSnapshots.size() >= 4, "Should have at least 4 snapshots (3 WAP + 1 regular)"); + + // Verify WAP snapshots still exist in metadata + List remainingWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] IS NOT NULL") + .collectAsList(); + assertEquals( + 3, remainingWapSnapshots.size(), "All 3 WAP snapshots should still exist in metadata"); + + // Verify main branch has the latest published snapshot (points to regular INSERT snapshot) + List mainSnapshotRef = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshotRef.size(), "Main branch should exist and point to a snapshot"); + } + } + + @Test + public void testBranchAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Create branch on table with no existing snapshots + // According to Iceberg specification, this should succeed and create an empty snapshot + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_on_empty"); + + // Verify that an empty snapshot was created for the branch + List snapshotsAfterBranchCreation = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 1, + snapshotsAfterBranchCreation.size(), + "Should have 1 empty snapshot after branch creation"); + + // Verify the empty snapshot properties + Row emptySnapshot = snapshotsAfterBranchCreation.get(0); + // The parent_id should be null for the empty snapshot + assertNull( + emptySnapshot.get(emptySnapshot.fieldIndex("parent_id")), + "Empty snapshot should have no parent"); + + // Verify the branch was created successfully + List refsAfterBranchCreation = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, + refsAfterBranchCreation.size(), + "Should have feature_on_empty branch (main doesn't exist yet)"); + assertEquals( + "feature_on_empty", + refsAfterBranchCreation.get(0).getString(0), + "Should have feature_on_empty branch"); + + // Verify that main branch still doesn't exist (as expected) + boolean hasMainBranch = + refsAfterBranchCreation.stream().anyMatch(row -> "main".equals(row.getString(0))); + assertFalse(hasMainBranch, "Main branch should not exist on empty table"); + + // Now insert data to create a data snapshot + spark.sql("INSERT INTO " + tableName + " VALUES ('initial.data')"); + + // Verify we now have 2 snapshots (empty + data) + List snapshotsAfterInsert = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 2, snapshotsAfterInsert.size(), "Should have 2 snapshots after insert (empty + data)"); + + // Now we should have main branch as well + List refsAfterInsert = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refsAfterInsert.size(), "Should have feature_on_empty and 
main branches"); + + // Create another branch after data exists - this should also succeed + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_after_snapshot"); + + // Verify we now have 3 branches (feature_on_empty, main, feature_after_snapshot) + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(3, refs.size(), "Should have 3 branches total"); + + // Verify all expected branches exist + Set branchNames = + refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(branchNames.contains("feature_on_empty"), "feature_on_empty branch should exist"); + assertTrue(branchNames.contains("main"), "main branch should exist"); + assertTrue( + branchNames.contains("feature_after_snapshot"), + "feature_after_snapshot branch should exist"); + + // ===== BRANCH ISOLATION TESTING ===== + + // 1. Test initial state: main and feature_after_snapshot should have the same data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should have 1 row"); + + // 2. Test feature_on_empty branch should be empty (points to empty snapshot) + assertEquals( + 0, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should have 0 rows (points to empty snapshot)"); + + // 3. Add data to feature_on_empty branch only + spark.sql( + "INSERT INTO " + tableName + ".branch_feature_on_empty VALUES ('empty_branch_data')"); + + // Verify isolation: feature_on_empty now has data, others unchanged + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should now have 1 row"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should still have 1 row (unchanged)"); + + // 4. Add different data to feature_after_snapshot branch + spark.sql( + "INSERT INTO " + + tableName + + ".branch_feature_after_snapshot VALUES ('snapshot_branch_data')"); + + // Verify isolation: each branch has its own data + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should still have 1 row"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should now have 2 rows"); + + // 5. 
Add data to main branch + spark.sql("INSERT INTO " + tableName + " VALUES ('main_branch_data')"); + + // Verify complete isolation: each branch maintains its own data + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should still have 1 row"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should now have 2 rows"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should still have 2 rows (unchanged)"); + + // 6. Verify data content isolation + List featureOnEmptyData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_on_empty' ORDER BY name") + .collectAsList(); + assertEquals( + "empty_branch_data", + featureOnEmptyData.get(0).getString(0), + "feature_on_empty should contain its specific data"); + + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "initial.data", mainData.get(0).getString(0), "main should contain initial data"); + assertEquals( + "main_branch_data", + mainData.get(1).getString(0), + "main should contain its specific data"); + + List featureAfterSnapshotData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_after_snapshot' ORDER BY name") + .collectAsList(); + assertEquals( + "initial.data", + featureAfterSnapshotData.get(0).getString(0), + "feature_after_snapshot should contain initial data"); + assertEquals( + "snapshot_branch_data", + featureAfterSnapshotData.get(1).getString(0), + "feature_after_snapshot should contain its specific data"); + + // 7. Verify snapshot isolation: each branch should have different snapshot histories + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List featureOnEmptySnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_on_empty'") + .collectAsList(); + List featureAfterSnapshotSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".refs WHERE name = 'feature_after_snapshot'") + .collectAsList(); + + assertNotEquals( + mainSnapshots.get(0).getLong(0), + featureOnEmptySnapshots.get(0).getLong(0), + "main and feature_on_empty should point to different snapshots"); + assertNotEquals( + mainSnapshots.get(0).getLong(0), + featureAfterSnapshotSnapshots.get(0).getLong(0), + "main and feature_after_snapshot should point to different snapshots"); + assertNotEquals( + featureOnEmptySnapshots.get(0).getLong(0), + featureAfterSnapshotSnapshots.get(0).getLong(0), + "feature_on_empty and feature_after_snapshot should point to different snapshots"); + } + } + + @Test + public void testWapBranchAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Enable WAP on the table + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Create branch on table with no existing snapshots + // According to Iceberg specification, this should succeed and create an empty snapshot + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_empty"); + + // Verify that an empty snapshot was created for the branch + List snapshotsAfterBranchCreation = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 1, + snapshotsAfterBranchCreation.size(), + "Should have 1 empty snapshot after branch creation"); + + // Verify the branch was created successfully + List refsAfterBranchCreation = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, + refsAfterBranchCreation.size(), + "Should have feature_empty branch (main doesn't exist yet)"); + assertEquals( + "feature_empty", + refsAfterBranchCreation.get(0).getString(0), + "Should have feature_empty branch"); + + // ===== WAP BRANCH TESTING ===== + + // 1. Set WAP branch and insert data - should go to the feature_empty branch + spark.conf().set("spark.wap.branch", "feature_empty"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_1')"); + + // Verify WAP branch data is visible when spark.wap.branch is set + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 1 row when spark.wap.branch=feature_empty"); + + List wapBranchData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals( + "wap_branch_data_1", wapBranchData.get(0).getString(0), "Should see WAP branch data"); + + // Verify feature_empty branch directly + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 1 row"); + + // Unset WAP branch - queries should now see main branch (which doesn't exist yet, so empty) + spark.conf().unset("spark.wap.branch"); + assertEquals( + 0, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 0 rows when spark.wap.branch is unset (main doesn't exist)"); + + // ===== MULTI-BRANCH WAP TESTING ===== + + // 2. Create main branch with regular data + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Now we should have main branch + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refs.size(), "Should have feature_empty and main branches"); + + // Verify main branch data when spark.wap.branch is unset + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Should see main branch data"); + + // 3. 
Create another branch and test WAP branch functionality + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_wap_test"); + + // Set WAP branch to feature_wap_test and add data + spark.conf().set("spark.wap.branch", "feature_wap_test"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_2')"); + + // Verify WAP branch data is visible when spark.wap.branch=feature_wap_test + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 2 rows when spark.wap.branch=feature_wap_test (main_data + wap_branch_data_2)"); + + // ===== COMPREHENSIVE WAP BRANCH ISOLATION VERIFICATION ===== + + // Verify each branch has independent data + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row when WAP branch is unset"); + + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 1 row"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") + .collectAsList() + .size(), + "feature_wap_test branch should have 2 rows"); + + // Verify data content isolation + List finalMainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", finalMainData.get(0).getString(0), "main should contain main_data"); + + List finalFeatureEmptyData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'feature_empty' ORDER BY name") + .collectAsList(); + assertEquals( + "wap_branch_data_1", + finalFeatureEmptyData.get(0).getString(0), + "feature_empty should contain wap_branch_data_1"); + + List finalFeatureWapTestData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_wap_test' ORDER BY name") + .collectAsList(); + assertEquals( + "main_data", + finalFeatureWapTestData.get(0).getString(0), + "feature_wap_test should contain main_data"); + assertEquals( + "wap_branch_data_2", + finalFeatureWapTestData.get(1).getString(0), + "feature_wap_test should contain wap_branch_data_2"); + + // ===== WAP BRANCH SWITCHING BEHAVIOR ===== + + // 4. Test switching between WAP branches + spark.conf().set("spark.wap.branch", "feature_empty"); + List switchToFeatureEmpty = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "wap_branch_data_1", + switchToFeatureEmpty.get(0).getString(0), + "Should see feature_empty data when switched"); + + spark.conf().set("spark.wap.branch", "feature_wap_test"); + List switchToFeatureWapTest = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + 2, switchToFeatureWapTest.size(), "Should see 2 rows when switched to feature_wap_test"); + assertEquals( + "main_data", switchToFeatureWapTest.get(0).getString(0), "First row should be main_data"); + assertEquals( + "wap_branch_data_2", + switchToFeatureWapTest.get(1).getString(0), + "Second row should be wap_branch_data_2"); + + // 5. 
Test INSERT behavior with WAP branch set + spark.conf().set("spark.wap.branch", "feature_empty"); + spark.sql("INSERT INTO " + tableName + " VALUES ('additional_wap_data')"); + + // Verify the insert went to the WAP branch + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 2 rows in feature_empty after additional insert"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 2 rows after additional insert"); + + // Verify other branches are unchanged + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") + .collectAsList() + .size(), + "feature_wap_test branch should still have 2 rows (unchanged)"); + + // ===== SNAPSHOT HISTORY VERIFICATION ===== + + // 6. Verify that each branch points to different snapshots + List finalMainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List finalFeatureEmptySnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_empty'") + .collectAsList(); + List finalFeatureWapTestSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_wap_test'") + .collectAsList(); + + assertNotEquals( + finalMainSnapshots.get(0).getLong(0), + finalFeatureEmptySnapshots.get(0).getLong(0), + "main and feature_empty should point to different snapshots"); + assertNotEquals( + finalMainSnapshots.get(0).getLong(0), + finalFeatureWapTestSnapshots.get(0).getLong(0), + "main and feature_wap_test should point to different snapshots"); + assertNotEquals( + finalFeatureEmptySnapshots.get(0).getLong(0), + finalFeatureWapTestSnapshots.get(0).getLong(0), + "feature_empty and feature_wap_test should point to different snapshots"); + + // Clean up WAP branch configuration + spark.conf().unset("spark.wap.branch"); + } + } + + @Test + public void testWapBranchCommitWithMultipleBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_multi_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + // Create table and enable WAP + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Step 1: Start with main at snapshotX + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Verify main branch exists and get its snapshot + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshots.size(), "Main branch should exist"); + long snapshotX = mainSnapshots.get(0).getLong(0); + System.out.println("SnapshotX (main): " + snapshotX); + + // Step 2: Create branchA from main → branchA also points to snapshotX + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); + + // Verify branchA points to same snapshot as main + List branchASnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + assertEquals(1, branchASnapshots.size(), "BranchA should exist"); + long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); + assertEquals( + snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); + + // Step 3: Set branchA as the WAP branch and commit data + spark.conf().set("spark.wap.branch", "branchA"); + spark.sql("INSERT INTO " + tableName + " VALUES ('branchA_data')"); + + // Step 4: Verify branchA now points to snapshotY (child of snapshotX) + List branchASnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); + System.out.println("SnapshotY (branchA after commit): " + snapshotY); + + // Verify branchA has both main_data and branchA_data + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows after commit"); + + // Verify main still points to snapshotX and has only main_data + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row"); + + // Step 5: Create branchB from branchA → branchB points to snapshotY + // First create the branch, then set it to point to the same snapshot as branchA + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); + + // Verify branchB points to snapshotY + List branchBSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); + assertEquals( + snapshotY, + branchBSnapshotAfterCreation, + "BranchB should point to snapshotY (same as branchA)"); + + // Step 6: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) + // Use direct branch syntax to target branchB specifically + spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); + + // Verify branchB now points to snapshotZ + List branchBSnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); + 
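+      // A minimal lineage check (sketch): snapshotZ was committed on branchB, whose head was
+      // snapshotY, so its parent_id in Iceberg's snapshots metadata table should be snapshotY.
+      List<Row> parentOfZ =
+          spark
+              .sql(
+                  "SELECT parent_id FROM "
+                      + tableName
+                      + ".snapshots WHERE snapshot_id = "
+                      + snapshotZ)
+              .collectAsList();
+      assertEquals(
+          snapshotY, parentOfZ.get(0).getLong(0), "SnapshotZ should be a child of snapshotY");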
assertNotEquals( + snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); + System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); + + // ===== VERIFICATION OF FINAL STATE ===== + + // Verify all three branches exist and point to different snapshots + List allRefs = + spark + .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") + .collectAsList(); + assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); + + // Verify snapshot relationships + List mainFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List branchAFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); + long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); + long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); + + assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); + assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); + assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); + + // Verify data isolation between branches + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should have 3 rows"); + + // Verify content + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); + + List branchAData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); + assertEquals( + "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); + + List branchBData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); + assertEquals( + "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); + assertEquals( + "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); + + // Verify parent-child relationships in snapshot metadata + List allSnapshots = + spark + .sql( + "SELECT snapshot_id, parent_id FROM " + + tableName + + ".snapshots ORDER BY committed_at") + .collectAsList(); + assertTrue(allSnapshots.size() >= 3, "Should have at least 3 snapshots"); + + // Clean up WAP configuration + spark.conf().unset("spark.wap.branch"); + } + } + + @Test + public void testRegularCommitWithMultipleBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "regular_multi_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + // Create table (no WAP needed for this test) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Step 1: Start with main at snapshotX + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Verify main branch exists and get its snapshot + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshots.size(), "Main branch should exist"); + long snapshotX = mainSnapshots.get(0).getLong(0); + System.out.println("SnapshotX (main): " + snapshotX); + + // Step 2: Create branchA from main → branchA also points to snapshotX + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); + + // Verify branchA points to same snapshot as main + List branchASnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + assertEquals(1, branchASnapshots.size(), "BranchA should exist"); + long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); + assertEquals( + snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); + + // Step 3: Commit some data on branchA → branchA now points to snapshotY (child of snapshotX) + spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('branchA_data')"); + + // Verify branchA now points to snapshotY (child of snapshotX) + List branchASnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); + System.out.println("SnapshotY (branchA after commit): " + snapshotY); + + // Verify branchA has both main_data and branchA_data + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows after commit"); + + // Verify main still points to snapshotX and has only main_data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row"); + + // Step 4: Create branchB from branchA → branchB points to snapshotY + // First create the branch, then set it to point to the same snapshot as branchA + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); + + // Verify branchB points to snapshotY + List branchBSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); + assertEquals( + snapshotY, + branchBSnapshotAfterCreation, + "BranchB should point to snapshotY (same as branchA)"); + + // Step 5: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) + spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); + + // Verify branchB now points to snapshotZ + List branchBSnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); + System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); + + // ===== 
VERIFICATION OF FINAL STATE ===== + + // Verify all three branches exist and point to different snapshots + List allRefs = + spark + .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") + .collectAsList(); + assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); + + // Verify snapshot relationships + List mainFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List branchAFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); + long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); + long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); + + assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); + assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); + assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); + + // Verify all snapshots are different + assertNotEquals( + finalSnapshotX, finalSnapshotY, "SnapshotX and snapshotY should be different"); + assertNotEquals( + finalSnapshotY, finalSnapshotZ, "SnapshotY and snapshotZ should be different"); + assertNotEquals( + finalSnapshotX, finalSnapshotZ, "SnapshotX and snapshotZ should be different"); + + // Verify data isolation between branches + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should have 3 rows"); + + // Verify content + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); + + List branchAData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); + assertEquals( + "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); + + List branchBData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); + assertEquals( + "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); + assertEquals( + "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); + + // ===== TEST THE SPECIFIC SCENARIO THAT WOULD HAVE BEEN AMBIGUOUS ===== + + // At this point, we have: + // - main points to snapshotX + // - branchA points to snapshotY + // - branchB points to snapshotZ + // + // If we were to commit a new snapshot as child of snapshotY, our fixed logic should work + // because only the explicitly targeted branch (via branch-specific insert syntax) should be + // considered + + // Verify that we can still commit to branchA even though multiple branches exist + 
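+      // (Branch-targeted writes use Iceberg's branch identifier syntax, e.g.
+      //  INSERT INTO <table>.branch_branchA VALUES (...), so only branchA's ref is expected to
+      //  advance while main and branchB stay where they are.)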
spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('additional_branchA_data')"); + + // Verify branchA advanced but branchB didn't + List branchAFinalSnapshots2 = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots2 = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotY2 = branchAFinalSnapshots2.get(0).getLong(0); + long finalSnapshotZ2 = branchBFinalSnapshots2.get(0).getLong(0); + + assertNotEquals(snapshotY, finalSnapshotY2, "BranchA should have advanced to a new snapshot"); + assertEquals(snapshotZ, finalSnapshotZ2, "BranchB should remain at the same snapshot"); + + // Verify data counts + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should now have 3 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should still have 3 rows (unchanged)"); + } + } + + // ===== CHERRY PICKING BETWEEN BRANCHES ===== + + @Test + public void testCherryPickToMainWithFeatureBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup branches + spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Create WAP snapshot + spark.conf().set("spark.wap.id", "feature-target-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.for.feature')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-target-wap'") + .first() + .mkString(); + + // CRITICAL: Unset WAP ID before advancing main branch to force non-fast-forward cherry-pick + spark.conf().unset("spark.wap.id"); + spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); + + // Cherry-pick WAP to main branch (this tests our enhanced applySnapshotOperations) + // Main should have 2 rows now (main.base + main.advance) + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify cherry-pick worked - 3 rows of data should appear in main (main.base + main.advance + // + wap.for.feature) + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify published WAP snapshot properties + List publishedSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['published-wap-id'] = 'feature-target-wap'") + .collectAsList(); + assertTrue( + publishedSnapshots.size() >= 1, + "Should find at least one snapshot with published-wap-id"); + } + } + + // ===== FAST FORWARD MERGES ===== + + @Test + public void testFastForwardMergeToMain() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + 
System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); + + // Verify initial state + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature has 3 rows + + // Fast-forward main to feature_a + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); + + // Verify fast-forward worked - main should now have same data as feature_a + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify both branches point to same snapshot + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertEquals(mainSnapshot, featureSnapshot); + } + } + + @Test + public void testFastForwardMergeToFeature() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance main branch (feature_a stays at base) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data1')"); + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data2')"); + + // Verify initial state + assertEquals( + 3, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 3 rows + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature has 1 row + + // Fast-forward feature_a to main + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'feature_a', 'main')"); + + // Verify fast-forward worked - feature_a should now have same data as main + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify both branches point to same snapshot + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertEquals(mainSnapshot, featureSnapshot); + } + } + + @Test + public void testFastForwardFeatureToMainAndWapId() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Create WAP snapshot + spark.conf().set("spark.wap.id", "test-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.data')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'test-wap'") + .first() + .mkString(); + + // Unset WAP ID before advancing feature branch normally (not using WAP - else WAP staged + // snapshot will apply to feature branch) + spark.conf().unset("spark.wap.id"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data')"); + + // Verify WAP snapshot doesn't interfere with fast-forward + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature advanced + + // Fast-forward main to feature_a should work despite WAP presence + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); + + // Verify fast-forward worked + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify WAP snapshot is still available for cherry-pick + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'test-wap'") + .collectAsList(); + assertEquals(1, wapSnapshots.size()); + assertEquals(wapSnapshotId, wapSnapshots.get(0).mkString()); + } + } + + @Test + public void testFastForwardMergeBetweenTwoFeatureBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create two feature branches from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_b"); + + // Advance feature_a + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data2')"); + + // Verify initial state + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a has 3 rows + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") + .collectAsList() + .size()); // feature_b has 1 row + + // Fast-forward feature_b to feature_a + spark.sql( + "CALL openhouse.system.fast_forward('" + tableName + "', 'feature_b', 'feature_a')"); + + // Verify fast-forward worked + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a unchanged + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") + .collectAsList() + .size()); // feature_b now matches feature_a + + // Verify feature_a and feature_b point to same snapshot + String featureASnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + String featureBSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_b'") + .first() + .mkString(); + assertEquals(featureASnapshot, featureBSnapshot); + } + } + + @Test + public void testFastForwardMergeIncompatibleLineage() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance both branches independently (creating divergent history) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.divergent')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.divergent')"); + + // Verify divergent state + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 2 rows + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a has 2 rows (different) + + // Attempt fast-forward should fail due to incompatible lineage + assertThrows( + Exception.class, + () -> + spark.sql( + "CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"), + "Fast-forward should fail when branches have divergent history"); + + // Verify branches remain unchanged after failed fast-forward + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify snapshots are still different + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertNotEquals(mainSnapshot, featureSnapshot); + } + } + + // ===== SNAPSHOT EXPIRATION FROM NON-MAIN BRANCHES ===== + + @Test + public void testSnapshotExpirationFromFeatureBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup: Create multiple snapshots to have some that can be expired + + // 1. Create initial main data + spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); + + // 2. Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // 3. Add multiple snapshots to feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data3')"); + + // 4. 
Query metadata tables to find snapshots that are NOT current branch heads
+
+      // Get all snapshots
+      List<Row> allSnapshots =
+          spark
+              .sql("SELECT snapshot_id FROM " + tableName + ".snapshots ORDER BY committed_at")
+              .collectAsList();
+      assertTrue(allSnapshots.size() >= 4, "Should have at least 4 snapshots");
+
+      // Get current branch head snapshots from refs table
+      List<Row> branchHeads =
+          spark.sql("SELECT snapshot_id FROM " + tableName + ".refs").collectAsList();
+      Set<String> referencedSnapshots =
+          branchHeads.stream().map(row -> row.mkString()).collect(Collectors.toSet());
+
+      System.out.println(
+          "DEBUG: All snapshots: "
+              + allSnapshots.stream().map(Row::mkString).collect(Collectors.toList()));
+      System.out.println("DEBUG: Referenced snapshots (branch heads): " + referencedSnapshots);
+
+      // Find snapshots that are NOT referenced by any branch head
+      List<String> unreferencedSnapshots =
+          allSnapshots.stream()
+              .map(Row::mkString)
+              .filter(snapshotId -> !referencedSnapshots.contains(snapshotId))
+              .collect(Collectors.toList());
+
+      System.out.println("DEBUG: Unreferenced snapshots: " + unreferencedSnapshots);
+
+      // We should have at least one unreferenced snapshot (intermediate feature snapshots)
+      assertFalse(
+          unreferencedSnapshots.isEmpty(),
+          "Should have at least one unreferenced snapshot to expire");
+
+      // Select the first unreferenced snapshot to expire
+      String snapshotToExpire = unreferencedSnapshots.get(0);
+
+      // Verify this snapshot exists in the snapshots table
+      List<Row> beforeExpiration =
+          spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList();
+      assertTrue(
+          beforeExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)),
+          "Snapshot to expire should exist before expiration");
+
+      // Expire the unreferenced snapshot
+      spark.sql(
+          String.format(
+              "CALL openhouse.system.expire_snapshots(table => '"
+                  + tableName.replace("openhouse.", "")
+                  + "', snapshot_ids => Array(%s))",
+              snapshotToExpire));
+
+      // Verify snapshot is gone
+      List<Row> afterExpiration =
+          spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList();
+      assertFalse(
+          afterExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)),
+          "Expired snapshot should no longer exist");
+
+      // Verify branches are still intact after expiration
+      // Main should have: main.initial = 1 row
+      assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size());
+
+      // Feature_a should have: main.initial + feature.data1 + feature.data2 + feature.data3 = 4
+      // rows
+      assertEquals(
+          4,
+          spark
+              .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'")
+              .collectAsList()
+              .size());
+    }
+  }
+
+  @Test
+  public void testWapSnapshotExpirationWithMultipleBranches() throws Exception {
+    try (SparkSession spark = getSparkSession()) {
+      String tableId = "branch_test_" + System.currentTimeMillis();
+      String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup multi-branch environment + spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.base')"); + + // Create multiple WAP snapshots + spark.conf().set("spark.wap.id", "wap-to-keep"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.keep.data')"); + + spark.conf().set("spark.wap.id", "wap-to-expire"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.expire.data')"); + String expireWapId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") + .first() + .mkString(); + + // Expire specific WAP snapshot + spark.sql( + String.format( + "CALL openhouse.system.expire_snapshots(table => '" + + tableName.replace("openhouse.", "") + + "', snapshot_ids => Array(%s))", + expireWapId)); + + // Verify selective WAP expiration + List remainingWaps = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-keep'") + .collectAsList(); + assertEquals(1, remainingWaps.size()); + + List expiredWaps = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") + .collectAsList(); + assertEquals(0, expiredWaps.size()); + + // Verify branches unchanged + assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + } + } + + // ===== BACKWARD COMPATIBILITY ===== + + @Test + public void testWapIdOnFeatureBranchAndMainBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." 
+ tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data in main branch + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'main_base')"); + + // Create feature branch and add base data to it + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (10, 'feature_base')"); + + // Verify initial state - main has 1 row, feature has 2 rows + assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size()); + + // Create WAP staged snapshot (invisible to normal reads) + spark.conf().set("spark.wap.id", "shared-wap-snapshot"); + spark.sql("INSERT INTO " + tableName + " VALUES (99, 'wap_staged_data')"); + + // Get the WAP snapshot ID + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") + .first() + .mkString(); + + // Verify WAP staging doesn't affect normal reads (principle 2: invisible until published) + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main should not see WAP staged data"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature should not see WAP staged data"); + + // Clear WAP ID to avoid contamination + spark.conf().unset("spark.wap.id"); + + // Cherry-pick the same WAP snapshot to MAIN branch + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify cherry-pick to main worked - main should now have the WAP data + List mainAfterCherryPick = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(2, mainAfterCherryPick.size(), "Main should have base + cherry-picked WAP data"); + boolean mainHasWapData = + mainAfterCherryPick.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); + assertTrue(mainHasWapData, "Main should contain cherry-picked WAP data"); + + // Verify feature branch is still unaffected + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature branch should be unchanged"); + + // Demonstrate that WAP snapshots work independently on different branches by + // creating a separate WAP snapshot while on the feature branch context + + // Create another WAP snapshot that could be applied to feature branch + spark.conf().set("spark.wap.id", "feature-specific-wap"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (50, 'feature_wap_data')"); + + String featureWapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") + .first() + .mkString(); + + // Clear WAP ID again + spark.conf().unset("spark.wap.id"); + + // Verify that both WAP snapshots exist but are invisible to normal reads + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main should still only show cherry-picked data"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature should not show new WAP data yet"); + + // Show that we can cherry-pick the 
feature WAP to main as well (demonstrating cross-branch + // capability) + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + featureWapSnapshotId)); + + // Verify main now has both cherry-picked WAP snapshots + List finalMain = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(3, finalMain.size(), "Main should have base + first WAP + second WAP data"); + + boolean hasOriginalWap = + finalMain.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); + boolean hasFeatureWap = + finalMain.stream().anyMatch(row -> "feature_wap_data".equals(row.getString(1))); + assertTrue(hasOriginalWap, "Main should contain first cherry-picked WAP data"); + assertTrue(hasFeatureWap, "Main should contain second cherry-picked WAP data"); + + // Verify feature branch is still independent and unchanged by main's cherry-picks + List finalFeature = + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList(); + assertEquals( + 2, finalFeature.size(), "Feature should still only have base + feature_base data"); + + // Verify that both original WAP snapshots are still available in metadata + List originalWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") + .collectAsList(); + List featureWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") + .collectAsList(); + assertEquals(1, originalWapSnapshots.size(), "Original WAP snapshot should still exist"); + assertEquals(1, featureWapSnapshots.size(), "Feature WAP snapshot should still exist"); + } + } + + @Test + public void testBackwardCompatibilityMainBranchOnly() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Traditional main-only workflow (should work exactly as before) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.1')"); + spark.sql("INSERT INTO " + tableName + " VALUES ('main.2')"); + + // WAP staging (traditional) + spark.conf().set("spark.wap.id", "compat-test-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('compat.wap.data')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'compat-test-wap'") + .first() + .mkString(); + + // Traditional cherry-pick to main + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify traditional behavior preserved + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + List refs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(1, refs.size()); + Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(refNames.contains("main")); + + // Traditional snapshot queries should work + assertTrue( + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList().size() >= 3); + } + } + + // ===== WAP BRANCH TESTING ===== + // These tests validate the intended WAP branch functionality. 
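+  // A typical staging session, as exercised by these tests (sketch; 'staging' and the table
+  // name are placeholders):
+  //   spark.conf().set("spark.wap.branch", "staging");    // unqualified reads/writes target 'staging'
+  //   spark.sql("INSERT INTO openhouse.d1.t VALUES (...)"); // lands on 'staging', not main
+  //   spark.conf().unset("spark.wap.branch");              // plain table name resolves to main again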
+ // WAP branch should stage writes to a specific branch without affecting main. + + @Test + public void testStagedChangesVisibleViaConf() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'base_data')"); + + // Create WAP branch and insert staged data + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap_branch"); + spark.conf().set("spark.wap.branch", "wap_branch"); + spark.sql("INSERT INTO " + tableName + " VALUES (2, 'staged_data')"); + + // When spark.wap.branch is set, SELECT should see WAP branch data (2 rows) + List wapVisible = spark.sql("SELECT * FROM " + tableName).collectAsList(); + assertEquals( + 2, wapVisible.size(), "Should see both base and staged data when wap.branch is set"); + + // When spark.wap.branch is unset, SELECT should see only main data (1 row) + spark.conf().unset("spark.wap.branch"); + List mainOnly = spark.sql("SELECT * FROM " + tableName).collectAsList(); + assertEquals(1, mainOnly.size(), "Should see only base data when wap.branch is unset"); + } + } + + @Test + public void testStagedChangesHidden() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create WAP branch for staged operations + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap"); + + // Set WAP branch for staged testing + spark.conf().set("spark.wap.branch", "wap"); + + // INSERT INTO table -> inserts to the WAP branch + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_data')"); + + // When spark.wap.branch is set: + // ✅ SELECT * FROM table → reads from the WAP branch + List tableData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 2, + tableData.size(), + "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); + boolean hasBase = tableData.stream().anyMatch(row -> "base".equals(row.getString(1))); + boolean hasStaged = + tableData.stream().anyMatch(row -> "staged_data".equals(row.getString(1))); + assertTrue(hasBase, "WAP branch should contain base data"); + assertTrue(hasStaged, "WAP branch should contain staged data"); + + // ✅ SELECT * FROM table.branch_wap → explicitly reads from WAP branch + List wapBranchData = + spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); + assertEquals(2, wapBranchData.size(), "Explicit WAP branch select should show staged data"); + + // ✅ SELECT * FROM table.branch_main → explicitly reads from main branch + List mainBranchData = + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); + assertEquals( + 1, mainBranchData.size(), "Explicit main branch select should only show base data"); + assertEquals( + "base", mainBranchData.get(0).getString(1), "Main branch should only contain base data"); + + // Now unset spark.wap.branch and ensure main branch is the referenced data + 
spark.conf().unset("spark.wap.branch"); + + // When spark.wap.branch is unset, SELECT * FROM table should read from main branch + List afterUnsetData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 1, + afterUnsetData.size(), + "SELECT * FROM table should read from main branch when spark.wap.branch is unset"); + assertEquals( + "base", + afterUnsetData.get(0).getString(1), + "After unsetting wap.branch, should read from main"); + + // INSERT INTO table should go to main branch when spark.wap.branch is unset + spark.sql("INSERT INTO " + tableName + " VALUES (2, 'main_data')"); + List finalMainData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 2, finalMainData.size(), "Main branch should now have 2 rows after unsetting wap.branch"); + boolean hasMainData = + finalMainData.stream().anyMatch(row -> "main_data".equals(row.getString(1))); + assertTrue(hasMainData, "Main branch should contain the newly inserted data"); + + // WAP branch should remain unchanged + List finalWapData = + spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); + assertEquals( + 2, finalWapData.size(), "WAP branch should remain unchanged with base + staged data"); + } + } + + @Test + public void testPublishWapBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create staging branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); + + // Stage changes to WAP branch + spark.conf().set("spark.wap.branch", "staging"); + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_for_publish')"); + + // When spark.wap.branch is set, SELECT * FROM table should read from WAP branch + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'staging'") + .collectAsList() + .size(), + "Staging should have staged data"); + + // Verify main branch still only has base data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList().size(), + "Main branch should not have staged data"); + + // Fast-forward main branch to staging branch to publish the staged changes + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'staging')"); + + // Verify data is now published to main branch (need to explicitly check main branch) + List publishedData = + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); + assertEquals(2, publishedData.size(), "Main branch should now have published data"); + + boolean hasPublished = + publishedData.stream().anyMatch(row -> "staged_for_publish".equals(row.getString(1))); + assertTrue(hasPublished, "Main branch should contain the published staged data"); + + // Verify that with wap.branch still set, SELECT * FROM table still reads from WAP branch + List wapData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(2, wapData.size(), "SELECT * FROM table should still read from WAP branch"); + } + } + + @Test + 
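+  // spark.wap.id (audit publishing) and spark.wap.branch (branch staging) are treated as
+  // mutually exclusive write configs, so a write attempted with both set is expected to fail
+  // validation rather than stage to an ambiguous target.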
+  public void testWapIdAndWapBranchIncompatible() throws Exception {
+    try (SparkSession spark = getSparkSession()) {
+      String tableId = "wap_branch_test_" + System.currentTimeMillis();
+      String tableName = "openhouse.d1." + tableId;
+
+      spark.sql("CREATE TABLE " + tableName + " (id int, data string)");
+      spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')");
+
+      // Setup base data
+      spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')");
+
+      // Create staging branch
+      spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging");
+
+      // Set both WAP ID and WAP branch - this should be invalid
+      spark.conf().set("spark.wap.id", "test-wap-id");
+      spark.conf().set("spark.wap.branch", "staging");
+
+      // Attempt to write with both configurations should fail
+      assertThrows(
+          Exception.class,
+          () -> spark.sql("INSERT INTO " + tableName + " VALUES (1, 'invalid')"),
+          "Cannot use both wap.id and wap.branch simultaneously");
+    }
+  }
+
+  @Test
+  public void testCannotWriteToBothBranches() throws Exception {
+    try (SparkSession spark = getSparkSession()) {
+      String tableId = "wap_branch_test_" + System.currentTimeMillis();
+      String tableName = "openhouse.d1." + tableId;
+
+      spark.sql("CREATE TABLE " + tableName + " (id int, data string)");
+      spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')");
+
+      // Setup base data
+      spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')");
+
+      // Create branches
+      spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature");
+      spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging");
+
+      // Set WAP branch
+      spark.conf().set("spark.wap.branch", "staging");
+
+      // ❌ INVALID: Cannot write to both normal branch and WAP branch
+      assertThrows(
+          Exception.class,
+          () -> spark.sql("INSERT INTO " + tableName + ".branch_feature VALUES (1, 'invalid')"),
+          "Cannot write to explicit branch when wap.branch is set");
+    }
+  }
+
+  // ===== ERROR SCENARIOS =====
+
+  @Test
+  public void testErrorInsertToNonExistentBranch() throws Exception {
+    try (SparkSession spark = getSparkSession()) {
+      String tableId = "branch_test_" + System.currentTimeMillis();
+      String tableName = "openhouse.d1." + tableId;
+
+      spark.sql("CREATE TABLE " + tableName + " (name string)");
+
+      // Setup base data
+      spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')");
+
+      // Create one valid branch
+      spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a");
+
+      // Verify valid branch works
+      spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('valid.data')");
+      assertEquals(
+          2,
+          spark
+              .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'")
+              .collectAsList()
+              .size());
+
+      // Attempt to insert into non-existent branch should fail
+      assertThrows(
+          Exception.class,
+          () ->
+              spark.sql("INSERT INTO " + tableName + ".branch_nonexistent VALUES ('invalid.data')"),
+          "Insert to non-existent branch should fail");
+
+      // Verify table state unchanged after failed insert
+      assertEquals(
+          1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged
+      assertEquals(
+          2,
+          spark
+              .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'")
+              .collectAsList()
+              .size()); // feature_a unchanged
+
+      // Verify only valid branches exist
+      List<Row> refs =
+          spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList();
+      assertEquals(2, refs.size());
+      Set<String> refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet());
+      assertTrue(refNames.contains("feature_a"));
+      assertTrue(refNames.contains("main"));
+    }
+  }
+
+  @Test
+  public void testErrorCherryPickNonExistentWapId() throws Exception {
+    try (SparkSession spark = getSparkSession()) {
+      String tableId = "branch_test_" + System.currentTimeMillis();
+      String tableName = "openhouse.d1." + tableId;
+
+      spark.sql("CREATE TABLE " + tableName + " (name string)");
+      spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')");
+
+      // Setup base data and branch
+      spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')");
+      spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a");
+
+      // Create a valid WAP snapshot
+      spark.conf().set("spark.wap.id", "valid-wap");
+      spark.sql("INSERT INTO " + tableName + " VALUES ('valid.wap.data')");
+      String validWapId =
+          spark
+              .sql(
+                  "SELECT snapshot_id FROM "
+                      + tableName
+                      + ".snapshots WHERE summary['wap.id'] = 'valid-wap'")
+              .first()
+              .mkString();
+
+      // Verify valid WAP cherry-pick works
+      spark.sql(
+          String.format(
+              "CALL openhouse.system.cherrypick_snapshot('"
+                  + tableName.replace("openhouse.", "")
+                  + "', %s)",
+              validWapId));
+      assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size());
+
+      // Attempt to cherry-pick non-existent snapshot ID should fail
+      long nonExistentSnapshotId = 999999999L;
+      assertThrows(
+          Exception.class,
+          () ->
+              spark.sql(
+                  String.format(
+                      "CALL openhouse.system.cherrypick_snapshot('"
+                          + tableName.replace("openhouse.", "")
+                          + "', %s)",
+                      nonExistentSnapshotId)),
+          "Cherry-pick of non-existent snapshot should fail");
+
+      // Attempt to cherry-pick with malformed snapshot ID should fail
+      assertThrows(
+          Exception.class,
+          () ->
+              spark.sql(
+                  String.format(
+                      "CALL openhouse.system.cherrypick_snapshot('"
+                          + tableName.replace("openhouse.", "")
+                          + "', %s)",
+                      "invalid-id")),
+          "Cherry-pick with invalid snapshot ID should fail");
+
+      // Verify table state unchanged after failed cherry-picks
+      assertEquals(
+          2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged
+      assertEquals(
+          1,
+          spark
+              .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'")
+              .collectAsList()
+              .size()); // feature_a unchanged
+
+      // Verify valid WAP snapshot still exists
+      List<Row> validWaps =
+          spark
+              .sql(
+                  "SELECT snapshot_id FROM "
+                      + tableName
+                      + ".snapshots WHERE summary['wap.id'] = 'valid-wap'")
+              .collectAsList();
+      assertEquals(1, validWaps.size());
+    }
+  }
+}
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java
index 85044da5e..ceb3f9e26 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java
@@ -8,7 +8,7 @@
 import com.linkedin.openhouse.cluster.storage.StorageManager;
 import com.linkedin.openhouse.common.test.cluster.PropertyOverrideContextInitializer;
 import com.linkedin.openhouse.internal.catalog.OpenHouseInternalTableOperations;
-import com.linkedin.openhouse.internal.catalog.SnapshotInspector;
+import com.linkedin.openhouse.internal.catalog.SnapshotDiffApplier;
 import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager;
 import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper;
 import com.linkedin.openhouse.internal.catalog.model.HouseTable;
@@ -60,12 +60,12 @@ public class RepositoryTestWithSettableComponents {

   @Autowired FileIOManager fileIOManager;

-  @Autowired SnapshotInspector snapshotInspector;
-
   @Autowired HouseTableMapper houseTableMapper;

   @Autowired MeterRegistry meterRegistry;

+  @Autowired SnapshotDiffApplier snapshotDiffApplier;
+
   FileIO fileIO;

   @PostConstruct
@@ -97,15 +97,17 @@ void testNoRetryInternalRepo() {

     // construct a real table object to prepare subsequent client call for table-update (that they
     // will fail)
+    MetricsReporter metricsReporter =
+        new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList());
     OpenHouseInternalTableOperations actualOps =
         new OpenHouseInternalTableOperations(
             houseTablesRepository,
             fileIO,
-            snapshotInspector,
             houseTableMapper,
             tableIdentifier,
-            new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()),
-            fileIOManager);
+            metricsReporter,
+            fileIOManager,
+            snapshotDiffApplier);
     ((SettableCatalogForTest) catalog).setOperation(actualOps);
     TableDto creationDTO = TABLE_DTO.toBuilder().tableVersion(INITIAL_TABLE_VERSION).build();
     creationDTO = openHouseInternalRepository.save(creationDTO);
@@ -114,15 +116,17 @@ void testNoRetryInternalRepo() {

     // injecting mocked htsRepo within a tableOperation that fails doCommit method.
     // The requirement to trigger htsRepo.save call are: Detectable updates in Transaction itself.
+    MetricsReporter metricsReporter2 =
+        new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList());
     OpenHouseInternalTableOperations mockOps =
         new OpenHouseInternalTableOperations(
             htsRepo,
             fileIO,
-            snapshotInspector,
             houseTableMapper,
             tableIdentifier,
-            new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()),
-            fileIOManager);
+            metricsReporter2,
+            fileIOManager,
+            snapshotDiffApplier);
     OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps);
     doReturn(actualOps.current()).when(spyOperations).refresh();
     BaseTable spyOptsMockedTable = Mockito.spy(new BaseTable(spyOperations, realTable.name()));
@@ -195,15 +199,17 @@ void testFailedHtsRepoWhenGet() {
     for (Class c : exs) {
       HouseTableRepository htsRepo = provideFailedHtsRepoWhenGet(c);

+      MetricsReporter metricsReporter =
+          new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList());
       OpenHouseInternalTableOperations mockOps =
           new OpenHouseInternalTableOperations(
               htsRepo,
               fileIO,
-              snapshotInspector,
               houseTableMapper,
               tableIdentifier,
-              new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()),
-              fileIOManager);
+              metricsReporter,
+              fileIOManager,
+              snapshotDiffApplier);
       OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps);
       BaseTable spyOptsMockedTable =
           Mockito.spy(
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java
index d845e1b39..7cf0528ec 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java
@@ -5,9 +5,6 @@
 import com.linkedin.openhouse.common.audit.model.ServiceAuditEvent;
 import com.linkedin.openhouse.tables.audit.DummyTableAuditHandler;
 import com.linkedin.openhouse.tables.audit.model.TableAuditEvent;
-import java.util.function.Consumer;
-import java.util.function.Supplier;
-import org.apache.hadoop.fs.Path;
 import org.mockito.Mockito;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
@@ -17,7 +14,6 @@
 import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.ComponentScan;
-import org.springframework.context.annotation.Primary;

 @SpringBootApplication
 @ComponentScan(
@@ -53,19 +49,6 @@ public static void main(String[] args) {
     SpringApplication.run(SpringH2Application.class, args);
   }

-  /**
-   * File secure used for testing purpose. We cannot directly use the actual
-   * SnapshotInspector#fileSecurer as that changes file to a user group that is not guaranteed to
-   * exist across different platforms thus creating environment dependencies for unit tests.
-   */
-  @Bean
-  @Primary
-  Consumer<Supplier<Path>> provideTestFileSecurer() {
-    return pathSupplier -> {
-      // This is a no-op Consumer. It does nothing with the supplied Path.
-    };
-  }
-
   @Bean
   public AuditHandler<ServiceAuditEvent> serviceAuditHandler() {
     return Mockito.mock(DummyServiceAuditHandler.class);
diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java
index 400b92b0f..f7d4f0124 100644
--- a/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java
+++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java
@@ -1,8 +1,12 @@
 package com.linkedin.openhouse.tables.settable;

+import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter;
+import com.linkedin.openhouse.internal.catalog.SnapshotDiffApplier;
 import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository;
 import com.linkedin.openhouse.tables.repository.impl.SettableInternalRepositoryForTest;
+import io.micrometer.core.instrument.MeterRegistry;
 import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.springframework.boot.test.context.TestConfiguration;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Primary;
@@ -20,4 +24,11 @@ public Catalog provideTestCatalog() {
   public OpenHouseInternalRepository provideTestInternalRepo() {
     return new SettableInternalRepositoryForTest();
   }
+
+  @Bean
+  public SnapshotDiffApplier snapshotDiffApplier(MeterRegistry meterRegistry) {
+    MetricsReporter metricsReporter =
+        new MetricsReporter(meterRegistry, "test", Lists.newArrayList());
+    return new SnapshotDiffApplier(metricsReporter);
+  }
 }
diff --git a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java
index 343c38d0d..0d85a24b0 100644
--- a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java
+++ b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java
@@ -1,17 +1,12 @@
 package com.linkedin.openhouse.tablestest;

-import java.util.function.Consumer;
-import java.util.function.Supplier;
-import org.apache.hadoop.fs.Path;
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration;
 import org.springframework.boot.autoconfigure.EnableAutoConfiguration;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 import org.springframework.boot.autoconfigure.domain.EntityScan;
 import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration;
-import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.ComponentScan;
-import org.springframework.context.annotation.Primary;

 @SpringBootApplication
 @ComponentScan(
@@ -47,17 +42,4 @@ public class SpringH2TestApplication {
   public static void main(String[] args) {
     SpringApplication.run(SpringH2TestApplication.class, args);
   }
-
-  /**
-   * File secure used for testing purpose. We cannot directly use the actual
-   * SnapshotInspector#fileSecurer as that changes file to a user group that is not guaranteed to
-   * exist across different platforms thus creating environment dependencies for unit tests.
-   */
-  @Bean
-  @Primary
-  Consumer<Supplier<Path>> provideTestFileSecurer() {
-    return pathSupplier -> {
-      // This is a no-op Consumer. It does nothing with the supplied Path.
-    };
-  }
 }