diff --git a/spark/unitycatalog/src/test/java/io/sparkuctest/UCDeltaStreamingEdgeDataReadTest.java b/spark/unitycatalog/src/test/java/io/sparkuctest/UCDeltaStreamingEdgeDataReadTest.java
new file mode 100644
index 00000000000..ed48751694c
--- /dev/null
+++ b/spark/unitycatalog/src/test/java/io/sparkuctest/UCDeltaStreamingEdgeDataReadTest.java
@@ -0,0 +1,305 @@
+/*
+ * Copyright (2026) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.sparkuctest;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * Reproduces the doc-claimed MANAGED-specific edge-data streaming bugs (NPE in
+ * OnHeapColumnVector.putNotNulls for null columns / boolean nulls / complex types) over the Unity
+ * Catalog catalog path, alongside the EXTERNAL counterparts.
+ *
+ * <p>Task D (`V2StreamingEdgeDataReadTest`) already proved these don't fire on EXTERNAL via
+ * file-path access ({@code dsv2.delta.`<path>`}). This suite exercises the same shapes through
+ * Unity Catalog's {@code unity.default.<table>} access path for both `EXTERNAL` and `MANAGED`
+ * table types.
+ *
+ * Each test runs twice via `@TestAllTableTypes`. The MANAGED variant is the relevant repro
+ * target; the EXTERNAL variant via UC catalog is a useful control (UC catalog read path may differ
+ * from raw file-path read path even for external tables).
+ */
+public class UCDeltaStreamingEdgeDataReadTest extends UCDeltaTableIntegrationBaseTest {
+
+ @TempDir private Path tempDir;
+ private int checkpointCount;
+
+ /**
+ * Allocates a fresh local checkpoint directory. Checkpoints must be on local FS since UC server
+ * holds the cloud credentials, not Spark.
+ */
+ private String checkpoint() throws IOException {
+ Path ckDir = tempDir.resolve("ck-" + checkpointCount++);
+ Files.createDirectory(ckDir);
+ return ckDir.toString();
+ }
+
+ // -------------------------------------------------------------------------
+ // Case 2: All-null INT column
+ // Doc claim: NPE in OnHeapColumnVector.putNotNulls (this.nulls uninitialized)
+ // on MANAGED. EXTERNAL was clean (Task D).
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedNullsInColumns(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_all_null_col",
+ "v INT",
+ tableType,
+ tableName -> {
+ sql("INSERT INTO %s VALUES (NULL), (NULL), (NULL)", tableName);
+
+ String queryName =
+ "edge_all_null_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ Dataset input = spark().readStream().format("delta").table(tableName);
+
+ // AvailableNow gives a deterministic single batch over the existing 3 nulls.
+ input
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .option("checkpointLocation", checkpoint())
+ .start()
+ .awaitTermination();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(3, rows.size(), "expected 3 null rows; got: " + rows);
+ for (Row r : rows) {
+ assertEquals(true, r.isNullAt(0), "expected null value, got: " + r);
+ }
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ });
+ }
+
+ // -------------------------------------------------------------------------
+ // Case 3: BOOLEAN column with nulls
+ // Doc claim: same NPE through bit-packing path on MANAGED.
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedBooleanNulls(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_bool_nulls",
+ "b BOOLEAN",
+ tableType,
+ tableName -> {
+ sql("INSERT INTO %s VALUES (true), (false), (NULL), (true), (NULL)", tableName);
+
+ String queryName =
+ "edge_bool_nulls_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ spark()
+ .readStream()
+ .format("delta")
+ .table(tableName)
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .option("checkpointLocation", checkpoint())
+ .start()
+ .awaitTermination();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(5, rows.size(), "expected 5 rows; got: " + rows);
+ long nullCount = rows.stream().filter(r -> r.isNullAt(0)).count();
+ assertEquals(2L, nullCount, "expected 2 null booleans; got: " + rows);
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ });
+ }
+
+ // -------------------------------------------------------------------------
+ // Case 4a: Complex types initial-snapshot (ARRAY / MAP / STRUCT)
+ // Sanity: Task D showed EXTERNAL passes initial-snapshot. Confirm same on UC.
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedComplexTypesInitialSnapshot(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_complex_initial",
+ "id INT, arr ARRAY, mp MAP, st STRUCT",
+ tableType,
+ tableName -> {
+ sql(
+ "INSERT INTO %s VALUES "
+ + "(1, array(1, 2, 3), map('k1', 10), named_struct('a', 1, 'b', 'foo')), "
+ + "(2, array(), map(), named_struct('a', 2, 'b', 'bar')), "
+ + "(3, NULL, NULL, NULL)",
+ tableName);
+
+ String queryName =
+ "edge_complex_init_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ spark()
+ .readStream()
+ .format("delta")
+ .table(tableName)
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .option("checkpointLocation", checkpoint())
+ .start()
+ .awaitTermination();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(3, rows.size(), "expected 3 rows; got: " + rows);
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ });
+ }
+
+ // -------------------------------------------------------------------------
+ // Case 4b: Complex types INCREMENTAL (commits mid-stream)
+ // Doc claim: NPE in SparkMicroBatchStreaming for INCREMENTAL on MANAGED.
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedComplexTypesIncremental(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_complex_incr",
+ "id INT, arr ARRAY, mp MAP, st STRUCT",
+ tableType,
+ tableName -> {
+ // Seed one initial row.
+ sql(
+ "INSERT INTO %s VALUES (1, array(1,2), map('a',1), named_struct('a',1,'b','x'))",
+ tableName);
+
+ String queryName =
+ "edge_complex_incr_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ StreamingQuery query = null;
+ try {
+ // Continuous stream: process initial commit, then commit two more rows mid-stream
+ // to exercise the INCREMENTAL micro-batch path the doc flagged.
+ query =
+ spark()
+ .readStream()
+ .format("delta")
+ .table(tableName)
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .option("checkpointLocation", checkpoint())
+ .start();
+ query.processAllAvailable();
+
+ sql(
+ "INSERT INTO %s VALUES (2, array(3,4,5), map('b',2,'c',3), "
+ + "named_struct('a',2,'b','y'))",
+ tableName);
+ query.processAllAvailable();
+
+ sql("INSERT INTO %s VALUES (3, NULL, NULL, NULL)", tableName);
+ query.processAllAvailable();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(3, rows.size(), "expected 3 rows; got: " + rows);
+ } finally {
+ if (query != null) {
+ query.processAllAvailable();
+ query.stop();
+ query.awaitTermination(10000);
+ }
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ }
+ });
+ }
+
+ // -------------------------------------------------------------------------
+ // Bonus 1: Empty table — initial snapshot
+ // Task D showed EXTERNAL passes; confirm UC catalog (both types) is also clean.
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedEmptyTableInitialSnapshot(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_empty_table",
+ "v INT",
+ tableType,
+ tableName -> {
+ // No data inserted (table is empty) -- but for MANAGED we need at least the
+ // catalogManaged metadata commit, which CREATE TABLE produces.
+ String queryName = "edge_empty_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ spark()
+ .readStream()
+ .format("delta")
+ .table(tableName)
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .option("checkpointLocation", checkpoint())
+ .start()
+ .awaitTermination();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(0, rows.size(), "empty table should yield 0 rows; got: " + rows);
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ });
+ }
+
+ // -------------------------------------------------------------------------
+ // Bonus 2: NULL partition value (HIVE_DEFAULT_PARTITION sentinel)
+ // Task D's external file-path test passed; UC catalog's path-style table read may differ.
+ // -------------------------------------------------------------------------
+
+ @TestAllTableTypes
+ public void testManagedNullPartitionValue(TableType tableType) throws Exception {
+ withNewTable(
+ "edge_null_part",
+ "id INT, part STRING",
+ "part",
+ tableType,
+ null,
+ tableName -> {
+ sql("INSERT INTO %s VALUES (1, 'a'), (2, NULL), (3, 'c')", tableName);
+
+ String queryName =
+ "edge_null_part_" + tableType.name().toLowerCase() + "_" + checkpointCount;
+ spark()
+ .readStream()
+ .format("delta")
+ .table(tableName)
+ .writeStream()
+ .format("memory")
+ .queryName(queryName)
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .option("checkpointLocation", checkpoint())
+ .start()
+ .awaitTermination();
+
+ List rows = spark().sql("SELECT * FROM " + queryName).collectAsList();
+ assertEquals(3, rows.size(), "expected 3 rows; got: " + rows);
+ long nullParts = rows.stream().filter(r -> r.isNullAt(r.fieldIndex("part"))).count();
+ assertEquals(1L, nullParts, "expected 1 null partition value; got: " + rows);
+ spark().sql("DROP VIEW IF EXISTS " + queryName);
+ });
+ }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2PartitionValueBoundaryTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2PartitionValueBoundaryTest.java
new file mode 100644
index 00000000000..ddb987e9b58
Binary files /dev/null and b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2PartitionValueBoundaryTest.java differ
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingColumnMappingAddColumnRestartTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingColumnMappingAddColumnRestartTest.java
new file mode 100644
index 00000000000..b3098cb90c2
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingColumnMappingAddColumnRestartTest.java
@@ -0,0 +1,830 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.streaming.DataStreamWriter;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * Failing tests for Bug #29: DSv2 streaming rejects checkpoint recovery after an additive (ADD
+ * COLUMN) or non-additive (RENAME COLUMN) schema change on a column-mapped table.
+ *
+ * Repro shape: create a CM-name table, drain a first micro-batch, alter the schema, write
+ * post-change rows, then restart the same query from the same checkpoint with {@code
+ * schemaTrackingLocation}. DSv1 handles this - additive changes are absorbed by the schema-tracking
+ * log, non-additive changes are unblocked by {@code allowSourceColumnRename=always}. DSv2 raises
+ * {@code AnalysisException: This query does not support recovering from checkpoint location}
+ * because the {@link io.delta.spark.internal.v2.read.MetadataEvolutionHandler} path added in PR
+ * #6697 only wires schemaTrackingLocation for the non-additive rename/drop branch; additive ADD
+ * COLUMN (and the rename leg here, depending on Spark's resolution order) falls through to Spark's
+ * {@code ResolveWriteToStream}, which rejects the checkpoint.
+ *
+ * <p>Each test runs the V1 leg first to (a) establish the expected behavior and (b) drive the
+ * schema change so the shared on-disk table is in the right state for the V2 leg. The V2 leg is the
+ * failing-test assertion: it captures the divergent current behavior so the test is RED until Bug
+ * #29 is fixed. When DSv2 starts succeeding, the {@code assertThrows} will fail and force the test
+ * to be re-classified to PASS.
+ */
+public class V2StreamingColumnMappingAddColumnRestartTest extends V2TestBase {
+
+  /**
+   * Bug #29 - ADD COLUMN mid-stream + restart from checkpoint.
+   *
+   * <p>V1 succeeds via schemaTrackingLocation; V2 throws "does not support recovering from
+   * checkpoint location".
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table, checkpoints, and
+   *     sink output for both legs
+   */
+  @Test
+  public void testCmNameTable_addColumnMidStream_restartRejected(@TempDir File deltaTablePath)
+      throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name', "
+                + " 'delta.minReaderVersion' = '2', "
+                + " 'delta.minWriterVersion' = '5')",
+            tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('0', '0'), ('1', '1')", tablePath));
+
+    // V1 leg: drain, ADD COLUMN, write new rows, restart. Expect success.
+    File v1Checkpoint = new File(deltaTablePath, "_v1_checkpoint");
+    File v1SchemaTracking = new File(v1Checkpoint, "_schema_tracking");
+    File v1Output = new File(deltaTablePath, "_v1_output");
+
+    Dataset<Row> v1Df1 =
+        spark
+            .readStream()
+            .format("delta")
+            .option("schemaTrackingLocation", v1SchemaTracking.getAbsolutePath())
+            .load(tablePath);
+    runOnceWithParquetSink(v1Df1, v1Output, v1Checkpoint);
+    DeltaLog.clearCache();
+
+    // Additive schema change between V1 runs.
+    spark.sql(str("ALTER TABLE delta.`%s` ADD COLUMN extra INT", tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('2', '2', 2), ('3', '3', 3)", tablePath));
+
+    Dataset<Row> v1Df2 =
+        spark
+            .readStream()
+            .format("delta")
+            .option("schemaTrackingLocation", v1SchemaTracking.getAbsolutePath())
+            .load(tablePath);
+    runOnceWithParquetSink(v1Df2, v1Output, v1Checkpoint);
+    DeltaLog.clearCache();
+
+    long v1Total = spark.read().parquet(v1Output.getAbsolutePath()).count();
+    assertEquals(
+        4L,
+        v1Total,
+        () -> "V1 should ingest 2 pre-add + 2 post-add rows across the restart, got " + v1Total);
+
+    // V2 leg: drain a first batch on the already-evolved table, then attempt to restart from a
+    // fresh checkpoint advanced past the ADD COLUMN. Currently expected to throw at restart time.
+    File v2Checkpoint = new File(deltaTablePath, "_v2_checkpoint");
+    File v2SchemaTracking = new File(v2Checkpoint, "_schema_tracking");
+    File v2Output = new File(deltaTablePath, "_v2_output");
+
+    // First V2 run: read from version 0 up through (and including) the ADD COLUMN commit and
+    // the post-add inserts so the checkpoint is positioned past the schema change.
+    Dataset<Row> v2Df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", v2SchemaTracking.getAbsolutePath())
+            .table(dsv2TableRef);
+    runOnceWithParquetSink(v2Df1, v2Output, v2Checkpoint);
+    DeltaLog.clearCache();
+
+    // Additional post-restart writes to give the second run something to do.
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('4', '4', 4), ('5', '5', 5)", tablePath));
+
+    // Second V2 run: restart from the same checkpoint + schema-tracking log. Bug #29 fires here.
+    Throwable v2Err =
+        assertThrows(
+            Throwable.class,
+            () -> {
+              Dataset<Row> v2Df2 =
+                  spark
+                      .readStream()
+                      .option("schemaTrackingLocation", v2SchemaTracking.getAbsolutePath())
+                      .table(dsv2TableRef);
+              runOnceWithParquetSink(v2Df2, v2Output, v2Checkpoint);
+            },
+            "DSv2 is currently expected to reject checkpoint recovery across an ADD COLUMN. "
+                + "If this assertion fails, Bug #29 has been fixed - re-classify this test to "
+                + "an end-to-end success assertion matching the V1 leg.");
+    // The expected text may sit on a wrapped cause, so match against the flattened messages.
+    String v2Msg = unwrapMessages(v2Err);
+    assertTrue(
+        v2Msg.contains("does not support recovering from checkpoint location"),
+        () ->
+            "Expected DSv2 'does not support recovering from checkpoint location' error, got: "
+                + v2Msg);
+  }
+
+  /**
+   * Bug #29 (non-additive variant) - RENAME COLUMN mid-stream + restart from checkpoint.
+   *
+   * <p>V1 succeeds with schemaTrackingLocation + allowSourceColumnRename=always; V2 throws "does
+   * not support recovering from checkpoint location".
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table, checkpoints, and
+   *     sink output for both legs
+   */
+  @Test
+  public void testCmNameTable_renameColumnMidStream_restartRejected(@TempDir File deltaTablePath)
+      throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name', "
+                + " 'delta.minReaderVersion' = '2', "
+                + " 'delta.minWriterVersion' = '5')",
+            tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('0', '0'), ('1', '1')", tablePath));
+
+    // V1 leg: drain, RENAME COLUMN, write new rows, restart with allowSourceColumnRename=always.
+    File v1Checkpoint = new File(deltaTablePath, "_v1_checkpoint");
+    File v1SchemaTracking = new File(v1Checkpoint, "_schema_tracking");
+    File v1Output = new File(deltaTablePath, "_v1_output");
+
+    Dataset<Row> v1Df1 =
+        spark
+            .readStream()
+            .format("delta")
+            .option("schemaTrackingLocation", v1SchemaTracking.getAbsolutePath())
+            .load(tablePath);
+    runOnceWithParquetSink(v1Df1, v1Output, v1Checkpoint);
+    DeltaLog.clearCache();
+
+    // Non-additive schema change between V1 runs.
+    spark.sql(str("ALTER TABLE delta.`%s` RENAME COLUMN value TO value2", tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('2', '2'), ('3', '3')", tablePath));
+
+    Dataset<Row> v1Df2 =
+        spark
+            .readStream()
+            .format("delta")
+            .option("schemaTrackingLocation", v1SchemaTracking.getAbsolutePath())
+            .option("allowSourceColumnRename", "always")
+            .load(tablePath);
+    runOnceWithParquetSink(v1Df2, v1Output, v1Checkpoint);
+    DeltaLog.clearCache();
+
+    long v1Total = spark.read().parquet(v1Output.getAbsolutePath()).count();
+    assertEquals(
+        4L,
+        v1Total,
+        () ->
+            "V1 should ingest 2 pre-rename + 2 post-rename rows across the restart, got "
+                + v1Total);
+
+    // V2 leg: same shape, expected to throw at restart time.
+    File v2Checkpoint = new File(deltaTablePath, "_v2_checkpoint");
+    File v2SchemaTracking = new File(v2Checkpoint, "_schema_tracking");
+    File v2Output = new File(deltaTablePath, "_v2_output");
+
+    Dataset<Row> v2Df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", v2SchemaTracking.getAbsolutePath())
+            .table(dsv2TableRef);
+    runOnceWithParquetSink(v2Df1, v2Output, v2Checkpoint);
+    DeltaLog.clearCache();
+
+    // Extra rows so the restarted V2 query has new data to process.
+    spark.sql(str("INSERT INTO delta.`%s` VALUES ('4', '4'), ('5', '5')", tablePath));
+
+    Throwable v2Err =
+        assertThrows(
+            Throwable.class,
+            () -> {
+              Dataset<Row> v2Df2 =
+                  spark
+                      .readStream()
+                      .option("schemaTrackingLocation", v2SchemaTracking.getAbsolutePath())
+                      .option("allowSourceColumnRename", "always")
+                      .table(dsv2TableRef);
+              runOnceWithParquetSink(v2Df2, v2Output, v2Checkpoint);
+            },
+            "DSv2 is currently expected to reject checkpoint recovery across a RENAME COLUMN, "
+                + "even with allowSourceColumnRename=always. If this assertion fails, Bug #29 "
+                + "(rename leg) has been fixed - re-classify this test to an end-to-end success "
+                + "assertion matching the V1 leg.");
+    // The expected text may sit on a wrapped cause, so match against the flattened messages.
+    String v2Msg = unwrapMessages(v2Err);
+    assertTrue(
+        v2Msg.contains("does not support recovering from checkpoint location"),
+        () ->
+            "Expected DSv2 'does not support recovering from checkpoint location' error, got: "
+                + v2Msg);
+  }
+
+  /**
+   * Bug #29 (CM name) - ADD COLUMN mid-stream + restart from checkpoint. Mirrors DSv1 "column
+   * mapping + streaming - allowed workflows - column addition". DSv2 rejects the restart with "This
+   * query does not support recovering from checkpoint location".
+   *
+   * <p>The assertions describe the desired outcome (only the post-ADD COLUMN rows are ingested
+   * after the restart), so the test fails while DSv2 still rejects the restart.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMapping_addColumnMidStream(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')",
+            tablePath));
+    for (int i = 0; i < 5; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    Dataset<Row> streamingDF = spark.readStream().table(dsv2TableRef);
+
+    // First run only advances the checkpoint past rows 0..4; its output is discarded.
+    StreamingQuery q1 =
+        streamingDF
+            .writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // Add column then write data in new schema
+    spark.sql(str("ALTER TABLE delta.`%s` ADD COLUMN (value2 STRING)", tablePath));
+    for (int i = 5; i < 10; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d', '%d')", tablePath, i, i, i));
+    }
+
+    // Restart with a fresh DataFrame matching new schema. DSv1 expects only 5..9 to be ingested
+    // because the sink is reinitialized.
+    Dataset<Row> streamingDF2 = spark.readStream().table(dsv2TableRef);
+    List<Row> actualRows =
+        processStreamingQueryFromCheckpoint(streamingDF2, "test_cm_add_column_mid", checkpointDir);
+
+    List<Row> expected =
+        Arrays.asList(
+            RowFactory.create("5", "5", "5"),
+            RowFactory.create("6", "6", "6"),
+            RowFactory.create("7", "7", "7"),
+            RowFactory.create("8", "8", "8"),
+            RowFactory.create("9", "9", "9"));
+    assertDataEquals(actualRows, expected);
+  }
+
+  /**
+   * Bug #29 (CM name) - DROP COLUMN with unsafe flag + schema-tracking on. DSv2 rejects the restart
+   * even with schemaTrackingLocation set.
+   *
+   * <p>The assertions describe the desired outcome (post-drop rows 5..9 with only {@code id}), so
+   * the test fails while DSv2 still rejects the restart.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMapping_dropColumnUnsafe(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')",
+            tablePath));
+    for (int i = 0; i < 5; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    File schemaTrackingDir = new File(checkpointDir, "_schema_tracking");
+
+    // First stream just to advance the checkpoint
+    Dataset<Row> df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+            .table(dsv2TableRef);
+    StreamingQuery q1 =
+        df1.writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // Drop column
+    spark.sql(str("ALTER TABLE delta.`%s` DROP COLUMN value", tablePath));
+    for (int i = 5; i < 10; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d')", tablePath, i));
+    }
+
+    // Restart with unsafe flag enabled
+    withSQLConf(
+        "spark.databricks.delta.streaming.unsafeReadOnIncompatibleColumnMappingSchemaChanges.enabled",
+        "true",
+        () -> {
+          try {
+            Dataset<Row> df2 =
+                spark
+                    .readStream()
+                    .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+                    .table(dsv2TableRef);
+            List<Row> actualRows =
+                processStreamingQueryFromCheckpoint(df2, "test_cm_drop_col_unsafe", checkpointDir);
+            // After drop, post-drop rows have only id; we expect 5..9 with single column
+            List<Row> expected =
+                Arrays.asList(
+                    RowFactory.create("5"),
+                    RowFactory.create("6"),
+                    RowFactory.create("7"),
+                    RowFactory.create("8"),
+                    RowFactory.create("9"));
+            assertDataEquals(actualRows, expected);
+          } catch (Exception e) {
+            // Rethrow unchecked, keeping the cause, so the failure leaves the lambda.
+            throw new RuntimeException(e);
+          }
+        });
+  }
+
+  /**
+   * Bug #29 (CM name) - RENAME COLUMN with unsafe flag + schema-tracking on. DSv2 rejects the
+   * restart even with schemaTrackingLocation set.
+   *
+   * <p>The assertions describe the desired outcome (post-rename rows 5..9 are ingested), so the
+   * test fails while DSv2 still rejects the restart.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMapping_renameColumnUnsafe(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')",
+            tablePath));
+    for (int i = 0; i < 5; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    File schemaTrackingDir = new File(checkpointDir, "_schema_tracking");
+
+    // First stream only advances the checkpoint past rows 0..4; its output is discarded.
+    Dataset<Row> df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+            .table(dsv2TableRef);
+    StreamingQuery q1 =
+        df1.writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // Rename column
+    spark.sql(str("ALTER TABLE delta.`%s` RENAME COLUMN value TO value2", tablePath));
+    for (int i = 5; i < 10; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    // Restart with unsafe flag enabled
+    withSQLConf(
+        "spark.databricks.delta.streaming.unsafeReadOnIncompatibleColumnMappingSchemaChanges.enabled",
+        "true",
+        () -> {
+          try {
+            Dataset<Row> df2 =
+                spark
+                    .readStream()
+                    .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+                    .table(dsv2TableRef);
+            List<Row> actualRows =
+                processStreamingQueryFromCheckpoint(
+                    df2, "test_cm_rename_col_unsafe", checkpointDir);
+            List<Row> expected =
+                Arrays.asList(
+                    RowFactory.create("5", "5"),
+                    RowFactory.create("6", "6"),
+                    RowFactory.create("7", "7"),
+                    RowFactory.create("8", "8"),
+                    RowFactory.create("9", "9"));
+            assertDataEquals(actualRows, expected);
+          } catch (Exception e) {
+            // Rethrow unchecked, keeping the cause, so the failure leaves the lambda.
+            throw new RuntimeException(e);
+          }
+        });
+  }
+
+  /**
+   * Bug #29 (CM name) - restart from checkpoint with no schema change. DSv2 rejects the restart
+   * because the V2 read path does not advertise checkpoint-recovery support for CM tables.
+   *
+   * <p>The assertions describe the desired outcome (the restarted query sees only the rows written
+   * after the first run), so the test fails while DSv2 still rejects the restart.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMapping_restart(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id INT, name STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')",
+            tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 'Alice'), (2, 'Bob')", tablePath));
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+
+    // First stream - drains 1, 2
+    Dataset<Row> df1 = spark.readStream().table(dsv2TableRef);
+    StreamingQuery q1 =
+        df1.writeStream()
+            .format("memory")
+            .queryName("test_cm_restart_1")
+            .outputMode("append")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // New writes
+    spark.sql(str("INSERT INTO delta.`%s` VALUES (3, 'Charlie'), (4, 'Dave')", tablePath));
+
+    // Restart from same checkpoint with fresh DataFrame
+    Dataset<Row> df2 = spark.readStream().table(dsv2TableRef);
+    StreamingQuery q2 =
+        df2.writeStream()
+            .format("memory")
+            .queryName("test_cm_restart_2")
+            .outputMode("append")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q2.processAllAvailable();
+      List<Row> rows = spark.sql("SELECT * FROM test_cm_restart_2").collectAsList();
+      List<Row> expected =
+          Arrays.asList(RowFactory.create(3, "Charlie"), RowFactory.create(4, "Dave"));
+      assertDataEquals(rows, expected);
+    } finally {
+      q2.stop();
+    }
+  }
+
+  /**
+   * Bug #29 (CM name) - ADD COLUMN then RENAME COLUMN (sequential schema changes). DSv2 rejects the
+   * restart across the sequential schema changes.
+   *
+   * <p>The assertions describe the desired outcome (rows 3..5 with three columns are ingested
+   * after the restart), so the test fails while DSv2 still rejects the restart.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMapping_addThenRename(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')",
+            tablePath));
+    for (int i = 0; i < 3; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    File schemaTrackingDir = new File(checkpointDir, "_schema_tracking");
+
+    // First stream only advances the checkpoint past rows 0..2; its output is discarded.
+    Dataset<Row> df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+            .table(dsv2TableRef);
+    StreamingQuery q1 =
+        df1.writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // Sequential schema changes: add new col then rename it
+    spark.sql(str("ALTER TABLE delta.`%s` ADD COLUMN extra STRING", tablePath));
+    spark.sql(str("ALTER TABLE delta.`%s` RENAME COLUMN extra TO extra2", tablePath));
+    for (int i = 3; i < 6; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d', '%d')", tablePath, i, i, i));
+    }
+
+    // With unsafe flag
+    withSQLConf(
+        "spark.databricks.delta.streaming.unsafeReadOnIncompatibleColumnMappingSchemaChanges.enabled",
+        "true",
+        () -> {
+          try {
+            Dataset<Row> df2 =
+                spark
+                    .readStream()
+                    .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+                    .table(dsv2TableRef);
+            List<Row> rows =
+                processStreamingQueryFromCheckpoint(df2, "test_cm_add_then_rename", checkpointDir);
+            List<Row> expected =
+                Arrays.asList(
+                    RowFactory.create("3", "3", "3"),
+                    RowFactory.create("4", "4", "4"),
+                    RowFactory.create("5", "5", "5"));
+            assertDataEquals(rows, expected);
+          } catch (Exception e) {
+            // Rethrow unchecked, keeping the cause, so the failure leaves the lambda.
+            throw new RuntimeException(e);
+          }
+        });
+  }
+
+  /**
+   * Bug #29 (CM id) - ADD COLUMN mid-stream + restart from checkpoint on an id-mode table. Mirrors
+   * the name-mode variant.
+   *
+   * <p>The assertions describe the desired outcome (only the post-ADD COLUMN rows 5..9 are
+   * ingested after the restart).
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the table and the checkpoint
+   */
+  @Test
+  public void testColumnMappingId_addColumnMidStream(@TempDir File deltaTablePath)
+      throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'id', "
+                + " 'delta.minReaderVersion' = '2', "
+                + " 'delta.minWriterVersion' = '5')",
+            tablePath));
+    for (int i = 0; i < 5; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    Dataset<Row> streamingDF = spark.readStream().table(dsv2TableRef);
+
+    // First run only advances the checkpoint past rows 0..4; its output is discarded.
+    StreamingQuery q1 =
+        streamingDF
+            .writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even if draining fails so the query does not outlive the test.
+      q1.stop();
+    }
+
+    // Additive schema change, then rows written with the new three-column schema.
+    spark.sql(str("ALTER TABLE delta.`%s` ADD COLUMN (value2 STRING)", tablePath));
+    for (int i = 5; i < 10; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d', '%d')", tablePath, i, i, i));
+    }
+
+    Dataset<Row> streamingDF2 = spark.readStream().table(dsv2TableRef);
+    List<Row> actualRows =
+        processStreamingQueryFromCheckpoint(
+            streamingDF2, "test_cm_id_add_column_mid", checkpointDir);
+
+    List<Row> expected =
+        Arrays.asList(
+            RowFactory.create("5", "5", "5"),
+            RowFactory.create("6", "6", "6"),
+            RowFactory.create("7", "7", "7"),
+            RowFactory.create("8", "8", "8"),
+            RowFactory.create("9", "9", "9"));
+    assertDataEquals(actualRows, expected);
+  }
+
+  /**
+   * Bug #29 (CM id) - RENAME COLUMN with unsafe flag on an id-mode table. DSv2 rejects the restart
+   * even with schemaTrackingLocation set.
+   *
+   * <p>The assertions describe the expected behavior: after renaming {@code value} to {@code
+   * value2} and inserting five more rows, a restart from the same checkpoint (with the unsafe
+   * column-mapping read flag enabled) should surface exactly those five rows.
+   *
+   * @param deltaTablePath JUnit-provided temp directory holding the table, the checkpoint and the
+   *     schema tracking log
+   * @throws Exception if table setup or either streaming run fails
+   */
+  @Test
+  public void testColumnMappingId_rename(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id STRING, value STRING) USING delta "
+                + "TBLPROPERTIES ('delta.columnMapping.mode' = 'id', "
+                + " 'delta.minReaderVersion' = '2', "
+                + " 'delta.minWriterVersion' = '5')",
+            tablePath));
+    for (int i = 0; i < 5; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    File checkpointDir = new File(deltaTablePath, "_checkpoint");
+    File schemaTrackingDir = new File(checkpointDir, "_schema_tracking");
+
+    // First run: drain the pre-rename rows so the checkpoint and schema log are initialized.
+    Dataset<Row> df1 =
+        spark
+            .readStream()
+            .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+            .table(dsv2TableRef);
+    StreamingQuery q1 =
+        df1.writeStream()
+            .format("noop")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      // Stop even when draining fails, so a broken first run cannot leak an active query.
+      q1.stop();
+    }
+
+    spark.sql(str("ALTER TABLE delta.`%s` RENAME COLUMN value TO value2", tablePath));
+    for (int i = 5; i < 10; i++) {
+      spark.sql(str("INSERT INTO delta.`%s` VALUES ('%d', '%d')", tablePath, i, i));
+    }
+
+    withSQLConf(
+        "spark.databricks.delta.streaming.unsafeReadOnIncompatibleColumnMappingSchemaChanges.enabled",
+        "true",
+        () -> {
+          try {
+            Dataset<Row> df2 =
+                spark
+                    .readStream()
+                    .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+                    .table(dsv2TableRef);
+            List<Row> actualRows =
+                processStreamingQueryFromCheckpoint(
+                    df2, "test_cm_id_rename_col_unsafe", checkpointDir);
+            List<Row> expected =
+                Arrays.asList(
+                    RowFactory.create("5", "5"),
+                    RowFactory.create("6", "6"),
+                    RowFactory.create("7", "7"),
+                    RowFactory.create("8", "8"),
+                    RowFactory.create("9", "9"));
+            assertDataEquals(actualRows, expected);
+          } catch (Exception e) {
+            // The conf-scoped callback cannot throw checked exceptions; keep the cause attached.
+            throw new RuntimeException(e);
+          }
+        });
+  }
+
+ /**
+ * Bug #29 (CM id) - nullability toggle (DROP NOT NULL) mid-stream. With schemaTrackingLocation,
+ * the restart from checkpoint must adopt the relaxed nullability and surface the post-toggle
+ * null-value row instead of raising DELTA_STREAMING_SCHEMA_MISMATCH_ON_RESTART.
+ */
+ @Test
+ public void testColumnMappingId_nullabilityToggle(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+ File checkpointDir = new File(deltaTablePath, "_checkpoint");
+ File schemaTrackingDir = new File(checkpointDir, "_schema_tracking");
+ File outputDir = new File(deltaTablePath, "_out");
+
+ // CM-id table with a NOT NULL `value` column.
+ spark.sql(
+ str(
+ "CREATE TABLE delta.`%s` (id INT, value STRING NOT NULL) USING delta "
+ + "TBLPROPERTIES ('delta.columnMapping.mode' = 'id', "
+ + " 'delta.minReaderVersion' = '2', "
+ + " 'delta.minWriterVersion' = '5')",
+ tablePath));
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 'a'), (2, 'b')", tablePath));
+
+ // First run drains the 2 pre-toggle rows.
+ Dataset df1 =
+ spark
+ .readStream()
+ .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+ .table(dsv2TableRef);
+ StreamingQuery q1 =
+ df1.writeStream()
+ .format("parquet")
+ .outputMode("append")
+ .option("path", outputDir.getAbsolutePath())
+ .option("checkpointLocation", checkpointDir.getAbsolutePath())
+ .start();
+ try {
+ q1.processAllAvailable();
+ } finally {
+ q1.stop();
+ DeltaLog.clearCache();
+ }
+ long firstRunRows = spark.read().parquet(outputDir.getAbsolutePath()).count();
+ assertEquals(2L, firstRunRows, "first run should emit 2 pre-toggle rows");
+
+ // Drop NOT NULL on `value`, then INSERT a row with NULL value.
+ spark.sql(str("ALTER TABLE delta.`%s` ALTER COLUMN value DROP NOT NULL", tablePath));
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (3, NULL)", tablePath));
+
+ // Restart from the same checkpoint + schema tracking log; the schema evolution handler must
+ // adopt the relaxed nullability and not raise SCHEMA_MISMATCH_ON_RESTART.
+ Dataset df2 =
+ spark
+ .readStream()
+ .option("schemaTrackingLocation", schemaTrackingDir.getAbsolutePath())
+ .table(dsv2TableRef);
+ StreamingQuery q2 =
+ df2.writeStream()
+ .format("parquet")
+ .outputMode("append")
+ .option("path", outputDir.getAbsolutePath())
+ .option("checkpointLocation", checkpointDir.getAbsolutePath())
+ .start();
+ try {
+ q2.processAllAvailable();
+ final StreamingQuery finalQ = q2;
+ assertTrue(
+ finalQ.exception().isEmpty(),
+ () ->
+ "Restart should not raise SCHEMA_MISMATCH_ON_RESTART: "
+ + (finalQ.exception().isDefined() ? finalQ.exception().get().toString() : ""));
+ } finally {
+ q2.stop();
+ DeltaLog.clearCache();
+ }
+
+ // Sink should now contain all 3 rows including the post-toggle null-value row.
+ List sinkRows = spark.read().parquet(outputDir.getAbsolutePath()).collectAsList();
+ assertEquals(3, sinkRows.size(), () -> "expected 3 total rows, got " + sinkRows);
+ assertTrue(
+ sinkRows.stream().anyMatch(r -> r.isNullAt(1)),
+ () -> "post-toggle null-value row must surface, got " + sinkRows);
+ }
+
+  /**
+   * Resumes {@code streamingDF} from an existing checkpoint, drains it into a memory sink named
+   * {@code queryName}, and returns everything the sink holds.
+   *
+   * @param streamingDF streaming dataset to run
+   * @param queryName name of the memory sink table the rows are read back from
+   * @param checkpointDir checkpoint directory the query writes to and resumes from
+   * @return all rows selected from the memory sink after the query has caught up
+   * @throws Exception if the query fails to start or to process the available data
+   */
+  private List<Row> processStreamingQueryFromCheckpoint(
+      Dataset<Row> streamingDF, String queryName, File checkpointDir) throws Exception {
+    // If start() itself throws there is no query to clean up, so it sits outside the try.
+    StreamingQuery running =
+        streamingDF
+            .writeStream()
+            .format("memory")
+            .queryName(queryName)
+            .outputMode("append")
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      running.processAllAvailable();
+      Dataset<Row> sinkContents = spark.sql("SELECT * FROM " + queryName);
+      return sinkContents.collectAsList();
+    } finally {
+      running.stop();
+      DeltaLog.clearCache();
+    }
+  }
+
+  /**
+   * Runs a streaming query against a parquet sink + checkpoint with {@link Trigger#AvailableNow()},
+   * drains everything, then stops. Parquet sink supports checkpoint recovery across restarts;
+   * memory sink does not.
+   *
+   * <p>Fails the calling test if the query has not terminated within 60 seconds, so a stalled run
+   * cannot pass with only part of the data written to the sink.
+   *
+   * @param streamingDF streaming dataset to run
+   * @param outputDir directory the parquet sink writes to
+   * @param checkpointDir checkpoint directory for the query
+   * @throws Exception if the query fails to start, fails while processing, or is interrupted
+   */
+  private void runOnceWithParquetSink(Dataset<Row> streamingDF, File outputDir, File checkpointDir)
+      throws Exception {
+    // If start() itself throws there is no query to clean up, so it sits outside the try.
+    StreamingQuery query =
+        streamingDF
+            .writeStream()
+            .format("parquet")
+            .outputMode("append")
+            .option("path", outputDir.getAbsolutePath())
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .trigger(Trigger.AvailableNow())
+            .start();
+    try {
+      query.processAllAvailable();
+      // awaitTermination(timeout) reports whether the query actually finished; ignoring it would
+      // hide a timed-out run.
+      assertTrue(query.awaitTermination(60_000L), "AvailableNow did not terminate within 60s");
+    } finally {
+      query.stop();
+    }
+  }
+
+  /**
+   * Walks the cause chain and concatenates messages so .contains() checks can match across wrapped
+   * exceptions.
+   *
+   * <p>Each throwable contributes one line of the form {@code <class name>: <message>}, with an
+   * empty message when {@link Throwable#getMessage()} is null. The walk stops at the first
+   * throwable it has already visited, so a cyclic cause chain terminates instead of looping
+   * forever.
+   *
+   * @param t outermost throwable; may be null
+   * @return the concatenated lines, or an empty string when {@code t} is null
+   */
+  private static String unwrapMessages(Throwable t) {
+    StringBuilder sb = new StringBuilder();
+    // Identity-based so two distinct-but-equal throwables are both reported.
+    java.util.Set<Throwable> visited =
+        java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<>());
+    for (Throwable cur = t; cur != null && visited.add(cur); cur = cur.getCause()) {
+      sb.append(cur.getClass().getName())
+          .append(": ")
+          .append(cur.getMessage() == null ? "" : cur.getMessage())
+          .append("\n");
+    }
+    return sb.toString();
+  }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDeletionVectorVariantTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDeletionVectorVariantTest.java
new file mode 100644
index 00000000000..c43850e0a99
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDeletionVectorVariantTest.java
@@ -0,0 +1,210 @@
+/*
+ * Copyright (2026) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.Comparator;
+import java.util.List;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * End-to-end DSv1 + DSv2 streaming repro for the post-PR-#6578 silent-corruption bug in {@link
+ * io.delta.spark.internal.v2.read.deletionvector.ColumnVectorWithFilter#getChild(int)}.
+ *
+ * Bug: For non-Struct top-level types (e.g. VARIANT) the {@code getChild()} non-Struct branch
+ * returns the unwrapped delegate child, dropping the row-id mapping. Since Spark implements {@code
+ * ColumnVector.getVariant(rowId)} by calling {@code getChild(0).getBinary(rowId)} and {@code
+ * getChild(1).getBinary(rowId)}, the variant value reads from the original (pre-DV-filter) row, not
+ * the live row at the mapped position. Under a DV-only delete this returns the wrong variant
+ * payload silently - no exception, just wrong data.
+ *
+ *
+ * <p>Each test exercises BOTH the DSv1 and DSv2 streaming paths over the same Delta table and
+ * asserts that the two sides agree row-for-row. DSv1 is the oracle (its streaming read does not go
+ * through {@code ColumnVectorWithFilter}); a V1/V2 mismatch implicates the V2 path.
+ *
+ *
+ * <p>The DSv1 mirror at {@code DeltaSourceDeletionVectorsSuite} only verifies "no
+ * ClassCastException" - it does not assert variant values match the row identity. This file does
+ * the value-level assertion at the user-visible {@code spark.readStream} level.
+ *
+ *
+ * <p>Companion to the unit-level test {@code ColumnVectorWithFilterTypeFanoutTest}.
+ */
+public class V2StreamingDeletionVectorVariantTest extends V2TestBase {
+
+  /**
+   * E2E silent-corruption repro: DV + VARIANT.
+   *
+   * <p>Every row's variant is built as {@code parse_json('{"row":<id>}')}, which makes the variant
+   * payload a function of the row's id. After a DV-only delete (half the rows removed, file kept),
+   * each surviving row must still satisfy {@code variant_get(v,'$.row','int') == id}. If the
+   * row-id mapping is dropped on {@code getChild()}, variants show up on the wrong rows and the
+   * assertion fails.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the Delta table
+   * @throws Exception if table setup or either streaming read fails
+   */
+  @Test
+  public void testStreamingReadWithDeletionVectorAndVariant(@TempDir File deltaTablePath)
+      throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+
+    // Step 1: DV-enabled table with a VARIANT column.
+    String createTable =
+        str(
+            "CREATE TABLE delta.`%s` (id INT, v VARIANT) USING delta "
+                + "TBLPROPERTIES ('delta.enableDeletionVectors' = 'true')",
+            tablePath);
+    spark.sql(createTable);
+
+    // Step 2: ten rows with v = parse_json('{"row":<id>}'), so v.row == id by construction.
+    // coalesce(1) keeps them in a single Parquet file; the DELETE below is then meant to be a
+    // DV-only delete (no file rewrite), the path that reads through ColumnVectorWithFilter.
+    Dataset<Row> seedRows =
+        spark
+            .range(1, 11)
+            .selectExpr("cast(id as int) as id", "parse_json(concat('{\"row\":', id, '}')) as v")
+            .coalesce(1);
+    seedRows.write().format("delta").mode("append").save(tablePath);
+
+    // Step 3: delete the even ids; the file stays, only a DV is expected to be written.
+    spark.sql(str("DELETE FROM delta.`%s` WHERE id %% 2 = 0", tablePath));
+
+    // Step 4: read through DSv1 first, then DSv2.
+    List<Row> v1Rows =
+        collectVariantStreamingRows(tablePath, "dv_variant_repro_v1", /* v2= */ false);
+    List<Row> v2Rows =
+        collectVariantStreamingRows(tablePath, "dv_variant_repro_v2", /* v2= */ true);
+
+    // Step 5: validate the DSv1 oracle on its own. Survivors of the DELETE are {1, 3, 5, 7, 9}.
+    assertEquals(5, v1Rows.size(), () -> "Expected 5 surviving rows from V1, got " + v1Rows);
+    for (Row survivor : v1Rows) {
+      int id = survivor.getInt(0);
+      Object projected = survivor.get(1);
+      assertNotNull(
+          projected,
+          () -> "V1 oracle: variant_get returned NULL for id=" + id + " — variant payload missing");
+      int payloadRow = ((Number) projected).intValue();
+      assertEquals(id, payloadRow, () -> "V1 oracle row identity failed at id=" + id);
+    }
+
+    // Step 6: DSv2 must agree with the oracle row for row.
+    assertV1V2Parity(v1Rows, v2Rows, "dv_variant_repro");
+  }
+
+  /**
+   * Control: same shape WITHOUT deletion vectors. Without DVs the read path skips
+   * ColumnVectorWithFilter entirely, so the bug should NOT surface here. If this control fails the
+   * harness itself is broken and the DV-test result is unreliable.
+   *
+   * @param deltaTablePath JUnit-provided temp directory that holds the Delta table
+   * @throws Exception if table setup or either streaming read fails
+   */
+  @Test
+  public void testStreamingReadWithVariantControl(@TempDir File deltaTablePath) throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+
+    // No DV property, no DELETE - table is plain. Read path does not wrap columns with
+    // ColumnVectorWithFilter, so variant reads must be correct on master.
+    spark.sql(str("CREATE TABLE delta.`%s` (id INT, v VARIANT) USING delta", tablePath));
+
+    spark
+        .range(1, 11)
+        .selectExpr("cast(id as int) as id", "parse_json(concat('{\"row\":', id, '}')) as v")
+        .coalesce(1)
+        .write()
+        .format("delta")
+        .mode("append")
+        .save(tablePath);
+
+    List<Row> v1Rows =
+        collectVariantStreamingRows(tablePath, "dv_variant_control_v1", /* v2= */ false);
+    List<Row> v2Rows =
+        collectVariantStreamingRows(tablePath, "dv_variant_control_v2", /* v2= */ true);
+
+    // Oracle assertions on DSv1.
+    assertEquals(10, v1Rows.size(), () -> "Expected 10 rows from V1, got " + v1Rows);
+    for (Row row : v1Rows) {
+      int id = row.getInt(0);
+      // Read the projection as an Object first: a NULL variant_get result should fail with a
+      // message naming the row, not with a bare NullPointerException from Row.getInt.
+      Object vRowObj = row.get(1);
+      assertNotNull(vRowObj, () -> "V1 control: variant_get returned NULL for id=" + id);
+      int vRow = ((Number) vRowObj).intValue();
+      assertEquals(id, vRow, () -> "V1 control failed at id=" + id);
+    }
+
+    // V1 vs V2 parity.
+    assertV1V2Parity(v1Rows, v2Rows, "dv_variant_control");
+  }
+
+ // ---------------------------------------------------------------------------
+ // INTERVAL coverage note.
+ //
+ // ColumnVectorWithFilter.getChild() also drops the row-id mapping for CalendarIntervalType
+ // (non-Struct, 3 child columns), so getInterval() reads from the wrong rows on a DV-filtered
+ // batch. We attempted to repro this end-to-end via spark.readStream() but Delta OSS rejects
+ // top-level interval columns: SchemaUtils.findUnsupportedDataTypesRecursively flags both
+ // YearMonthIntervalType and DayTimeIntervalType as UnsupportedDataType, and CalendarIntervalType
+ // is not exposed as a creatable SQL column type. There is no path through CREATE TABLE / INSERT
+ // that lands an INTERVAL value into a Delta Parquet file, so the bug cannot be exercised at the
+ // user-visible spark.readStream() level for INTERVAL on this codebase. The unit-level
+ // ColumnVectorWithFilterTypeFanoutTest covers it directly via OnHeapColumnVector.
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Streams {@code tablePath} through either the DSv1 or the DSv2 reader into a memory sink named
+   * {@code queryName}, then projects {@code (id, variant_get(v,'$.row','int'))} out of that sink.
+   * The result is ordered by id so V1 and V2 output can be compared directly.
+   *
+   * @param tablePath filesystem path of the Delta table
+   * @param queryName memory sink name; also the table name the projection selects from
+   * @param v2 if true, read via {@code spark.readStream().table("dsv2.delta.`<path>`")}; otherwise
+   *     {@code spark.readStream().format("delta").load(path)}.
+   * @return the projected rows sorted ascending by id
+   * @throws Exception if the streaming query fails
+   */
+  private List<Row> collectVariantStreamingRows(String tablePath, String queryName, boolean v2)
+      throws Exception {
+    Dataset<Row> streamingDF;
+    if (v2) {
+      streamingDF = spark.readStream().table(str("dsv2.delta.`%s`", tablePath));
+    } else {
+      streamingDF = spark.readStream().format("delta").load(tablePath);
+    }
+    assertTrue(streamingDF.isStreaming());
+
+    // The sink receives full rows (VARIANT column included); projection happens afterwards.
+    processStreamingQuery(streamingDF, queryName);
+
+    String projection =
+        "SELECT id, variant_get(v, '$.row', 'int') AS v_row FROM " + queryName + " ORDER BY id";
+    List<Row> projected = spark.sql(projection).collectAsList();
+
+    // Belt and braces: re-sort locally rather than rely solely on the SQL ORDER BY.
+    Comparator<Row> byId = Comparator.comparingInt(r -> r.getInt(0));
+    projected.sort(byId);
+    return projected;
+  }
+
+  /**
+   * Checks that the DSv1 and DSv2 reads produced the same projected rows, treating V1 as the
+   * oracle. Both lists are sorted in place by their first (int) column and then compared through
+   * their string renderings.
+   *
+   * @param v1Rows rows from the DSv1 read (expected side); sorted in place
+   * @param v2Rows rows from the DSv2 read (actual side); sorted in place
+   * @param tag label prefixed to the failure message
+   */
+  private void assertV1V2Parity(List<Row> v1Rows, List<Row> v2Rows, String tag) {
+    Comparator<Row> byId = Comparator.comparingInt(r -> r.getInt(0));
+    v1Rows.sort(byId);
+    v2Rows.sort(byId);
+    String expectedRendering = v1Rows.toString();
+    String actualRendering = v2Rows.toString();
+    assertEquals(
+        expectedRendering,
+        actualRendering,
+        () -> tag + ": V1 vs V2 row mismatch.\nV1=" + v1Rows + "\nV2=" + v2Rows);
+  }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDvIgnoreDeletesTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDvIgnoreDeletesTest.java
new file mode 100644
index 00000000000..b6994d43a38
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingDvIgnoreDeletesTest.java
@@ -0,0 +1,614 @@
+/*
+ * Copyright (2026) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.StreamingQueryProgress;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * DSv1 vs DSv2 streaming parity for {@code ignoreDeletes=true} on a DV-enabled table.
+ *
+ * Bug #28: When streaming a DV-enabled table with {@code ignoreDeletes=true}, DSv2 emits fewer
+ * rows than expected because DV-based delete commits are not skipped.
+ *
+ *
+ * <p>On a table with {@code delta.enableDeletionVectors=true}, a {@code DELETE} produces an {@code
+ * AddFile} with an attached deletion vector (rewriting the file's metadata, not the file itself)
+ * rather than a plain {@code RemoveFile}. DSv2's {@code ignoreDeletes} guard only skips commits
+ * that consist solely of {@code RemoveFile} actions, so DV-based delete commits bypass the skip
+ * logic, the rewritten {@code AddFile} is read as new data, and the surviving (non-deleted) rows
+ * are emitted to the stream a second time. The rows that were deleted disappear; the rows
+ * that survived may be emitted twice depending on how the post-DELETE {@code AddFile} is consumed.
+ *
+ *
+ * <p>DSv1 (oracle) handles this correctly: the V1 streaming source classifies DV-attached
+ * AddFile-only commits as deletes for the purpose of {@code ignoreDeletes} and skips them, so only
+ * the original INSERT rows and any post-DELETE INSERT rows are emitted.
+ *
+ *
+ * <p>Each test below writes the same table twice (once for the V1 read, once for the V2 read so the
+ * temp dirs do not collide), runs both streaming reads, and asserts row-for-row parity. Until Bug
+ * #28 is fixed, the parity assertion fails because V2 emits a different row set than V1.
+ */
+public class V2StreamingDvIgnoreDeletesTest extends V2TestBase {
+
+  /**
+   * Basic stream: two identically built DV-enabled tables (INSERT into two partitions, DELETE one
+   * whole partition, INSERT more rows) are read with {@code ignoreDeletes=true}, one through DSv1
+   * and one through DSv2, and the results must match. Per the Bug #28 description, V1 skips the
+   * DELETE commit while V2 does not, so this parity assertion is expected to fail until the bug is
+   * fixed.
+   *
+   * @param baseDir JUnit-provided temp directory; the two tables live in its v1/v2 subdirectories
+   * @throws Exception if table setup or either streaming read fails
+   */
+  @Test
+  public void testDvTable_ignoreDeletes_basicStream(@TempDir File baseDir) throws Exception {
+    File oracleDir = new File(baseDir, "v1");
+    File candidateDir = new File(baseDir, "v2");
+    assertTrue(oracleDir.mkdirs(), "Failed to create v1 dir");
+    assertTrue(candidateDir.mkdirs(), "Failed to create v2 dir");
+
+    String v1TablePath = oracleDir.getAbsolutePath();
+    String v2TablePath = candidateDir.getAbsolutePath();
+
+    // Build both fixtures the same way, V1 table first.
+    for (String path : new String[] {v1TablePath, v2TablePath}) {
+      createDvPartitionedTable(path);
+      seedAndDeleteAndAppend(path);
+    }
+
+    List<Row> v1Rows =
+        processStreamingQuery(
+            spark
+                .readStream()
+                .format("delta")
+                .option("ignoreDeletes", "true")
+                .load(v1TablePath)
+                .selectExpr("id", "p"),
+            "dv_ignore_deletes_basic_v1");
+
+    List<Row> v2Rows =
+        processStreamingQuery(
+            spark
+                .readStream()
+                .option("ignoreDeletes", "true")
+                .table(str("dsv2.delta.`%s`", v2TablePath))
+                .selectExpr("id", "p"),
+            "dv_ignore_deletes_basic_v2");
+
+    // Expected to fail while Bug #28 is open: V2 does not skip DV-based delete commits under
+    // ignoreDeletes=true, so its row set differs from V1's (original INSERT rows plus the
+    // post-DELETE INSERT rows).
+    assertDataEquals(v2Rows, v1Rows);
+  }
+
+  /**
+   * DV + {@code ignoreDeletes=true} parity check with {@code maxFilesPerTrigger=1} set on both
+   * readers. Per the Bug #28 description, V1 skips the DELETE commit while V2 admits the
+   * DV-rewritten AddFile and diverges, so the parity assertion is expected to fail until the bug
+   * is fixed.
+   *
+   * @param baseDir JUnit-provided temp directory; the two tables live in its v1/v2 subdirectories
+   * @throws Exception if table setup or either streaming read fails
+   */
+  @Test
+  public void testDvTable_ignoreDeletes_withMaxFilesPerTrigger(@TempDir File baseDir)
+      throws Exception {
+    File oracleDir = new File(baseDir, "v1");
+    File candidateDir = new File(baseDir, "v2");
+    assertTrue(oracleDir.mkdirs(), "Failed to create v1 dir");
+    assertTrue(candidateDir.mkdirs(), "Failed to create v2 dir");
+
+    String v1TablePath = oracleDir.getAbsolutePath();
+    String v2TablePath = candidateDir.getAbsolutePath();
+
+    // Build both fixtures the same way, V1 table first.
+    for (String path : new String[] {v1TablePath, v2TablePath}) {
+      createDvPartitionedTable(path);
+      seedAndDeleteAndAppend(path);
+    }
+
+    List<Row> v1Rows =
+        processStreamingQuery(
+            spark
+                .readStream()
+                .format("delta")
+                .option("ignoreDeletes", "true")
+                .option("maxFilesPerTrigger", "1")
+                .load(v1TablePath)
+                .selectExpr("id", "p"),
+            "dv_ignore_deletes_mft_v1");
+
+    List<Row> v2Rows =
+        processStreamingQuery(
+            spark
+                .readStream()
+                .option("ignoreDeletes", "true")
+                .option("maxFilesPerTrigger", "1")
+                .table(str("dsv2.delta.`%s`", v2TablePath))
+                .selectExpr("id", "p"),
+            "dv_ignore_deletes_mft_v2");
+
+    // Expected to fail while Bug #28 is open: V2 does not skip DV-based delete commits under
+    // ignoreDeletes=true.
+    assertDataEquals(v2Rows, v1Rows);
+  }
+
+  /**
+   * DV + {@code ignoreDeletes=true} parity check driven by {@code Trigger.AvailableNow}, so each
+   * stream terminates on its own once it has drained the available data. Per the Bug #28
+   * description, V1 skips the DELETE commit while V2 diverges, so the parity assertion is expected
+   * to fail until the bug is fixed.
+   *
+   * @param baseDir JUnit-provided temp directory; the two tables live in its v1/v2 subdirectories
+   * @throws Exception if table setup or either streaming read fails
+   */
+  @Test
+  public void testDvTable_ignoreDeletes_withAvailableNow(@TempDir File baseDir) throws Exception {
+    File oracleDir = new File(baseDir, "v1");
+    File candidateDir = new File(baseDir, "v2");
+    assertTrue(oracleDir.mkdirs(), "Failed to create v1 dir");
+    assertTrue(candidateDir.mkdirs(), "Failed to create v2 dir");
+
+    String v1TablePath = oracleDir.getAbsolutePath();
+    String v2TablePath = candidateDir.getAbsolutePath();
+
+    // Build both fixtures the same way, V1 table first.
+    for (String path : new String[] {v1TablePath, v2TablePath}) {
+      createDvPartitionedTable(path);
+      seedAndDeleteAndAppend(path);
+    }
+
+    Dataset<Row> v1Stream =
+        spark
+            .readStream()
+            .format("delta")
+            .option("ignoreDeletes", "true")
+            .load(v1TablePath)
+            .selectExpr("id", "p");
+    List<Row> v1Rows = runAvailableNowMemoryRows(v1Stream, "dv_ignore_deletes_avail_v1");
+
+    Dataset<Row> v2Stream =
+        spark
+            .readStream()
+            .option("ignoreDeletes", "true")
+            .table(str("dsv2.delta.`%s`", v2TablePath))
+            .selectExpr("id", "p");
+    List<Row> v2Rows = runAvailableNowMemoryRows(v2Stream, "dv_ignore_deletes_avail_v2");
+
+    // Expected to fail while Bug #28 is open: V2 does not skip DV-based delete commits under
+    // ignoreDeletes=true.
+    assertDataEquals(v2Rows, v1Rows);
+  }
+
+ // -- helpers --
+
+  /**
+   * Creates a Delta table at {@code path} with columns {@code (id INT, p INT)}, partitioned by
+   * {@code p}, with {@code delta.enableDeletionVectors} set to true.
+   *
+   * @param path filesystem location of the table to create
+   */
+  private void createDvPartitionedTable(String path) {
+    String ddl =
+        str(
+            "CREATE TABLE delta.`%s` (id INT, p INT) USING delta PARTITIONED BY (p) "
+                + "TBLPROPERTIES ('delta.enableDeletionVectors' = 'true')",
+            path);
+    spark.sql(ddl);
+  }
+
+  /**
+   * Standard fixture for Bug #28, applied as four commits in order: INSERT two rows into p=1,
+   * INSERT two rows into p=2, DELETE everything in p=1, then INSERT two more rows into p=2.
+   *
+   * <p>Per this suite's Bug #28 description, on a DV-enabled table the DELETE is expected to
+   * commit an AddFile carrying a deletion vector rather than a plain RemoveFile, which is the
+   * case the parity tests target.
+   *
+   * @param tablePath filesystem location of an existing table with columns {@code (id, p)}
+   */
+  private void seedAndDeleteAndAppend(String tablePath) {
+    String[] commitsInOrder = {
+      "INSERT INTO delta.`%s` VALUES (1, 1), (2, 1)",
+      "INSERT INTO delta.`%s` VALUES (10, 2), (11, 2)",
+      // Whole-partition delete on p=1: the commit Bug #28 is about.
+      "DELETE FROM delta.`%s` WHERE p = 1",
+      "INSERT INTO delta.`%s` VALUES (20, 2), (21, 2)"
+    };
+    for (String template : commitsInOrder) {
+      spark.sql(str(template, tablePath));
+    }
+  }
+
+  /**
+   * Runs {@code streamingDF} with {@code Trigger.AvailableNow} into a memory sink named {@code
+   * queryName}, requires the query to terminate within 60 seconds, and returns the sink contents.
+   *
+   * @param streamingDF streaming dataset to run
+   * @param queryName memory sink name; also the table name the rows are read back from
+   * @return all rows selected from the memory sink
+   * @throws Exception if the query fails to start, fails while processing, or is interrupted
+   */
+  private List<Row> runAvailableNowMemoryRows(Dataset<Row> streamingDF, String queryName)
+      throws Exception {
+    // If start() itself throws there is no query to clean up, so it sits outside the try.
+    StreamingQuery running =
+        streamingDF
+            .writeStream()
+            .format("memory")
+            .queryName(queryName)
+            .outputMode("append")
+            .trigger(Trigger.AvailableNow())
+            .start();
+    try {
+      running.processAllAvailable();
+      boolean finished = running.awaitTermination(60_000L);
+      assertTrue(finished, "AvailableNow did not terminate within 60s");
+      Dataset<Row> sinkContents = spark.sql("SELECT * FROM " + queryName);
+      return sinkContents.collectAsList();
+    } finally {
+      running.stop();
+      DeltaLog.clearCache();
+    }
+  }
+
+ // -- Bug #28 surfaces in additional compound-feature and option-combination scenarios. The tests
+ // below are copied from V2StreamingCmRtDvTest, V2StreamingOptionCombinationsRateLimitTest,
+ // V2StreamingOptionCombinationsTriggerStartingTest, V2StreamingOptionCombinationsFilterFaultTest,
+ // and V2StreamingSchemaEvoLongTailTest. Each fails due to Bug #28 (DV + ignoreDeletes /
+ // ignoreChanges / skipChangeCommits wrong row counts).
+
+  /**
+   * Compound CM-name + RT + DV + partitions x {@code ignoreDeletes=true}. Commit sequence: CREATE,
+   * INSERT across two partitions, whole-partition DELETE (file-granular, ignoreDeletes-friendly),
+   * INSERT more. The DELETE commit is expected to be skipped, so only the four INSERT rows reach
+   * the stream, each with a name that still lines up with its id.
+   *
+   * @param tempDir JUnit-provided temp directory that holds the Delta table
+   * @throws Exception if table setup or the streaming read fails
+   */
+  @Test
+  public void testCompound_ignoreDeletes(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createCmNameRtDvPartitionedTable(tablePath);
+
+    String firstInsert =
+        str("INSERT INTO delta.`%s` VALUES (1, 'name-1', 'x'), (2, 'name-2', 'y')", tablePath);
+    spark.sql(firstInsert);
+    // Removes the whole p='y' partition: file-granular, which ignoreDeletes permits.
+    spark.sql(str("DELETE FROM delta.`%s` WHERE p = 'y'", tablePath));
+    String secondInsert =
+        str("INSERT INTO delta.`%s` VALUES (3, 'name-3', 'x'), (4, 'name-4', 'z')", tablePath);
+    spark.sql(secondInsert);
+
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .option("ignoreDeletes", "true")
+            .table(str("dsv2.delta.`%s`", tablePath))
+            .selectExpr("id", "name", "p");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "compound_ignore_deletes");
+    // All four inserted rows are expected; the DELETE commit contributes nothing.
+    assertEquals(
+        4,
+        rows.size(),
+        () -> "expected 4 INSERT rows with DELETE skipped under compound features, got: " + rows);
+    Set<Long> seenIds = new HashSet<>();
+    for (Row row : rows) {
+      long id = row.getLong(0);
+      seenIds.add(id);
+      assertEquals(
+          "name-" + id, row.getString(1), "name column must align with id under CM rewrite");
+    }
+    assertEquals(Set.of(1L, 2L, 3L, 4L), seenIds);
+  }
+
+  /**
+   * Compound CM-name + RT + DV + partitions x {@code ignoreChanges=true}. Commit sequence: CREATE,
+   * INSERT, UPDATE, INSERT more. Under ignoreChanges the UPDATE rewrite is expected to be
+   * re-emitted as an append rather than failing the stream, giving five rows in total, exactly one
+   * of which carries the updated name.
+   *
+   * @param tempDir JUnit-provided temp directory that holds the Delta table
+   * @throws Exception if table setup or the streaming read fails
+   */
+  @Test
+  public void testCompound_ignoreChanges(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createCmNameRtDvPartitionedTable(tablePath);
+
+    String firstInsert =
+        str("INSERT INTO delta.`%s` VALUES (1, 'name-1', 'x'), (2, 'name-2', 'y')", tablePath);
+    spark.sql(firstInsert);
+    // Touches the p='y' row; ignoreChanges is expected to surface the rewritten AddFile as an
+    // append instead of failing the stream.
+    spark.sql(str("UPDATE delta.`%s` SET name = 'updated-2' WHERE id = 2", tablePath));
+    String secondInsert =
+        str("INSERT INTO delta.`%s` VALUES (3, 'name-3', 'x'), (4, 'name-4', 'z')", tablePath);
+    spark.sql(secondInsert);
+
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .option("ignoreChanges", "true")
+            .table(str("dsv2.delta.`%s`", tablePath))
+            .selectExpr("id", "name", "p");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "compound_ignore_changes");
+    // 2 initial rows + 1 row re-emitted by the UPDATE rewrite + 2 final rows = 5.
+    assertEquals(
+        5,
+        rows.size(),
+        () ->
+            "expected 5 rows (2 initial INSERTs + UPDATE rewrite re-emitted + 2 final INSERTs)"
+                + " under ignoreChanges, got: "
+                + rows);
+
+    // Every inserted id must appear at least once; duplicates collapse in the set.
+    Set<Long> seenIds = new HashSet<>();
+    for (Row row : rows) {
+      seenIds.add(row.getLong(0));
+    }
+    assertEquals(Set.of(1L, 2L, 3L, 4L), seenIds);
+
+    long updatedRowCount = 0;
+    for (Row row : rows) {
+      if ("updated-2".equals(row.getString(1))) {
+        updatedRowCount++;
+      }
+    }
+    assertEquals(
+        1,
+        updatedRowCount,
+        () ->
+            "expected exactly one row to carry the updated name under ignoreChanges, got: " + rows);
+  }
+
+  /**
+   * Compound CM-name + RT + DV + partitions x {@code skipChangeCommits=true}. Commit sequence:
+   * CREATE, INSERT, UPDATE (a change commit, expected to be dropped entirely), INSERT more. Only
+   * the four INSERT rows are expected on the stream, none of them carrying the updated name.
+   *
+   * @param tempDir JUnit-provided temp directory that holds the Delta table
+   * @throws Exception if table setup or the streaming read fails
+   */
+  @Test
+  public void testCompound_skipChangeCommits(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createCmNameRtDvPartitionedTable(tablePath);
+
+    String firstInsert =
+        str("INSERT INTO delta.`%s` VALUES (1, 'name-1', 'x'), (2, 'name-2', 'y')", tablePath);
+    spark.sql(firstInsert);
+    // Change commit that skipChangeCommits is expected to drop.
+    spark.sql(str("UPDATE delta.`%s` SET name = 'updated-2' WHERE id = 2", tablePath));
+    String secondInsert =
+        str("INSERT INTO delta.`%s` VALUES (3, 'name-3', 'x'), (4, 'name-4', 'z')", tablePath);
+    spark.sql(secondInsert);
+
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .option("skipChangeCommits", "true")
+            .table(str("dsv2.delta.`%s`", tablePath))
+            .selectExpr("id", "name", "p");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "compound_skip_change_commits");
+    // Four inserted rows, all with their original names.
+    assertEquals(
+        4,
+        rows.size(),
+        () ->
+            "expected 4 INSERT rows with UPDATE commit dropped under skipChangeCommits, got: "
+                + rows);
+    Set<Long> seenIds = new HashSet<>();
+    for (Row row : rows) {
+      long id = row.getLong(0);
+      seenIds.add(id);
+      assertEquals(
+          "name-" + id,
+          row.getString(1),
+          () -> "skipChangeCommits must not surface the UPDATE's rewritten rows, got: " + rows);
+    }
+    assertEquals(Set.of(1L, 2L, 3L, 4L), seenIds);
+  }
+
+ /**
+  * INSERT, whole-file DELETE, INSERT - streamed with {@code maxFilesPerTrigger=1} and {@code
+  * ignoreDeletes=true}. The table is partitioned by {@code id} so the DELETE removes whole files
+  * (ignoreDeletes only applies to file-granular deletes, not row-level rewrites). Asserts that
+  * four non-empty batches of exactly one row each are produced, that the sink holds the four
+  * INSERT rows, and that the stream does not crash on the DELETE commit.
+  *
+  * <p>NOTE(review): all commits precede stream start and no starting version is set; Delta
+  * documents that the stream then begins from the current snapshot, where id=2 is already
+  * deleted - confirm the expected count of 4 and the id set {1,2,3,4}.
+  *
+  * @param deltaTablePath JUnit-managed temp directory used as the table location
+  * @throws Exception if a SQL statement or the streaming query fails
+  */
+ @Test
+ public void testMaxFilesPerTrigger_withIgnoreDeletes(@TempDir File deltaTablePath)
+     throws Exception {
+   String tablePath = deltaTablePath.getAbsolutePath();
+   spark.sql(
+       str(
+           "CREATE TABLE delta.`%s` (id INT, name STRING) USING delta PARTITIONED BY (id)",
+           tablePath));
+   // Initial INSERTs across separate partitions (one file per partition).
+   spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 'Alice')", tablePath));
+   spark.sql(str("INSERT INTO delta.`%s` VALUES (2, 'Bob')", tablePath));
+   // Whole-file DELETE on partition id=2.
+   spark.sql(str("DELETE FROM delta.`%s` WHERE id = 2", tablePath));
+   // More INSERTs after the DELETE, into separate partitions.
+   spark.sql(str("INSERT INTO delta.`%s` VALUES (3, 'Carol')", tablePath));
+   spark.sql(str("INSERT INTO delta.`%s` VALUES (4, 'Dave')", tablePath));
+
+   String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+   Dataset<Row> streamingDF =
+       spark
+           .readStream()
+           .option("maxFilesPerTrigger", "1")
+           .option("ignoreDeletes", "true")
+           .table(dsv2TableRef);
+
+   StreamingQuery q =
+       streamingDF
+           .writeStream()
+           .format("memory")
+           .queryName("maxFiles_ignoreDeletes")
+           .outputMode("append")
+           .start();
+   try {
+     q.processAllAvailable();
+     StreamingQueryProgress[] progress = nonEmptyProgress(q);
+     // 4 INSERTs -> 4 files admitted; ignoreDeletes drops the DELETE commit entirely.
+     // Arrays.toString is required here: concatenating the array itself would only print its
+     // identity hash, which is useless in a failure message.
+     assertEquals(
+         4,
+         progress.length,
+         () -> "expected 4 batches (one per INSERT file), got: " + Arrays.toString(progress));
+     for (StreamingQueryProgress p : progress) {
+       assertEquals(1L, p.numInputRows(), "maxFilesPerTrigger=1 should admit exactly 1 row");
+     }
+     List<Row> rows = spark.sql("SELECT * FROM maxFiles_ignoreDeletes").collectAsList();
+     assertEquals(4, rows.size(), () -> "expected 4 INSERT rows, got: " + rows);
+     // Sanity check: the surfaced ids are exactly those written by the four INSERT commits
+     // (id=2 included, because the later DELETE commit is ignored rather than applied).
+     Set<Integer> ids = new HashSet<>();
+     for (Row r : rows) {
+       ids.add(r.getInt(0));
+     }
+     assertEquals(
+         new HashSet<>(Arrays.asList(1, 2, 3, 4)),
+         ids,
+         () -> "expected ids {1,2,3,4} from the INSERT commits; got: " + ids);
+   } finally {
+     // Always stop the query and drop cached DeltaLog instances so later tests start clean.
+     q.stop();
+     DeltaLog.clearCache();
+   }
+ }
+
+ /**
+ * D2. AvailableNow + ignoreDeletes=true: whole-file DELETE between INSERTs must not error the
+ * stream, and only INSERT rows are visible at the sink. AvailableNow self-terminates.
+ *
+ * <p>NOTE(review): all commits precede stream start and no starting version is set; Delta
+ * documents that the stream then begins from the current snapshot, where partition {@code part=0}
+ * is already deleted - confirm the expected count of 6.
+ *
+ * @param deltaTablePath JUnit-managed temp directory used as the table location
+ * @throws Exception if a SQL statement or the streaming query fails
+ */
+ @Test
+ public void testAvailableNow_withIgnoreDeletes(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ String dsv2Ref = str("dsv2.delta.`%s`", tablePath);
+ spark.sql(
+ str(
+ "CREATE TABLE delta.`%s` (id INT, part INT) USING delta PARTITIONED BY (part)",
+ tablePath));
+ // INSERT, whole-file DELETE (by partition), then more INSERTs. With ignoreDeletes=true the
+ // stream sees v1 (3 rows in part=0) and v3 (3 rows in part=1), but not the DELETE.
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 0), (2, 0), (3, 0)", tablePath)); // v=1
+ spark.sql(str("DELETE FROM delta.`%s` WHERE part = 0", tablePath)); // v=2 (whole-file)
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (10, 1), (11, 1), (12, 1)", tablePath)); // v=3
+
+ // `rows` is the sink's COUNT(*), not a row list.
+ long rows =
+ runAvailableNowMemoryCount(
+ spark.readStream().option("ignoreDeletes", "true").table(dsv2Ref),
+ "avail_ignore_deletes");
+ // 3 rows from v1 + 3 rows from v3 = 6. The DELETE is silently dropped by ignoreDeletes.
+ assertEquals(
+ 6L, rows, () -> "AvailableNow + ignoreDeletes should see 6 INSERT rows; got: " + rows);
+ }
+
+ /**
+ * {@code maxBytesPerTrigger=1b} x {@code ignoreDeletes=true}.
+ *
+ * <p>Partitioned table so DELETE is whole-file. v0 CREATE, v1 INSERT into two partitions, v2
+ * DELETE one partition (whole-file delete), v3 INSERT more. ignoreDeletes drops the DELETE commit
+ * and the byte-rate limit (at least one file per batch) admits the INSERT files correctly.
+ *
+ * <p>The same options are run through the DSv2 table reference and the DSv1 path-based source,
+ * and the two row sets are compared with {@code assertDataEquals}.
+ *
+ * <p>NOTE(review): all commits precede stream start and no starting version is set; Delta
+ * documents that the stream then begins from the current snapshot, where {@code p='y'} is already
+ * deleted - confirm the expected count of 4.
+ *
+ * @param deltaTablePath JUnit-managed temp directory used as the table location
+ * @throws Exception if a SQL statement or either streaming query fails
+ */
+ @Test
+ public void testMaxBytesPerTrigger_withIgnoreDeletes(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ // Partition by p so DELETE removes whole files (ignoreDeletes only allows file-granular
+ // deletes; a row-level rewrite would still error).
+ spark.sql(
+ str(
+ "CREATE TABLE delta.`%s` (id INT, p STRING) USING delta PARTITIONED BY (p)",
+ tablePath));
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 'x'), (2, 'y')", tablePath));
+ // Whole-partition delete: removes the file for p='y'.
+ spark.sql(str("DELETE FROM delta.`%s` WHERE p = 'y'", tablePath));
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (3, 'x'), (4, 'z')", tablePath));
+
+ // DSv2 read through the `dsv2.delta` table reference.
+ Dataset v2Stream =
+ spark
+ .readStream()
+ .option("ignoreDeletes", "true")
+ .option("maxBytesPerTrigger", "1b")
+ .table(str("dsv2.delta.`%s`", tablePath));
+ List v2Rows = processStreamingQuery(v2Stream, "mbpt_id_v2");
+
+ // DSv1 read of the same table with identical options, used as the parity reference.
+ Dataset v1Stream =
+ spark
+ .readStream()
+ .format("delta")
+ .option("ignoreDeletes", "true")
+ .option("maxBytesPerTrigger", "1b")
+ .load(tablePath);
+ List v1Rows = processStreamingQuery(v1Stream, "mbpt_id_v1");
+
+ assertDataEquals(v2Rows, v1Rows);
+ // 4 INSERT rows (id=1,2,3,4) - DELETE commit is dropped.
+ assertEquals(
+ 4, v2Rows.size(), () -> "expected 4 INSERT rows with DELETE skipped, got: " + v2Rows);
+ }
+
+ /**
+ * S9 Type widening x {@code ignoreDeletes=true}. Commit order: v0 CREATE (partitioned by {@code
+ * id}, type widening enabled), v1 INSERT two INT rows, v2 widen {@code data} INT->LONG, v3 INSERT
+ * two rows whose values only fit in LONG, v4 DELETE the {@code id=2} partition (file-granular
+ * remove). Without ignoreDeletes the stream would fail on the DELETE; with ignoreDeletes=true the
+ * DELETE is dropped and all four INSERT rows are expected at the sink. DSv1 parity check.
+ *
+ * <p>NOTE(review): all commits precede stream start and no starting version is set; Delta
+ * documents that the stream then begins from the current snapshot, where {@code id=2} is already
+ * deleted - confirm the expected count of 4.
+ *
+ * @param deltaTablePath JUnit-managed temp directory used as the table location
+ * @throws Exception if a SQL statement or either streaming query fails
+ */
+ @Test
+ public void testTypeWidening_intToLong_ignoreDeletes(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ // Partition by id so DELETE removes a whole file (ignoreDeletes only applies to file-granular
+ // deletes, not DV / row-level rewrites).
+ spark.sql(
+ str(
+ "CREATE TABLE delta.`%s` (id INT, data INT) USING delta PARTITIONED BY (id) "
+ + "TBLPROPERTIES ('delta.enableTypeWidening' = 'true')",
+ tablePath));
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 10), (2, 20)", tablePath));
+ spark.sql(str("ALTER TABLE delta.`%s` CHANGE COLUMN data data LONG", tablePath));
+ // These values exceed the INT range, so they can only be stored after the widening above.
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (3, 3000000000), (4, 4000000000)", tablePath));
+ // Partition delete: drops the file for id=2 wholesale, no DV rewrite.
+ spark.sql(str("DELETE FROM delta.`%s` WHERE id = 2", tablePath));
+
+ Dataset v2Stream =
+ spark.readStream().option("ignoreDeletes", "true").table(str("dsv2.delta.`%s`", tablePath));
+ List v2Rows = processStreamingQuery(v2Stream, "widen_ignore_deletes_v2");
+
+ Dataset v1Stream =
+ spark.readStream().format("delta").option("ignoreDeletes", "true").load(tablePath);
+ List v1Rows = processStreamingQuery(v1Stream, "widen_ignore_deletes_v1");
+
+ assertDataEquals(v2Rows, v1Rows);
+ // All four INSERT rows (pre- and post-widening) are expected; the DELETE commit is suppressed.
+ assertEquals(
+ 4,
+ v2Rows.size(),
+ () -> "Type widening + ignoreDeletes should emit 4 surviving rows; got: " + v2Rows);
+ // Stream schema must carry the widened LONG type.
+ assertEquals(
+ org.apache.spark.sql.types.DataTypes.LongType,
+ v2Stream.schema().apply("data").dataType(),
+ () -> "Stream schema for `data` should be LONG; got: " + v2Stream.schema().apply("data"));
+ }
+
+ // -- helpers for the appended tests --
+
+ /**
+  * Issues the CREATE TABLE used by the compound-feature tests: columns {@code (id, name, p)},
+  * partitioned by {@code p}, with column mapping in name mode, row tracking and deletion vectors
+  * switched on through table properties.
+  *
+  * @param path filesystem location of the table to create
+  */
+ private void createCmNameRtDvPartitionedTable(String path) {
+   String ddlTemplate =
+       "CREATE TABLE delta.`%s` (id LONG, name STRING, p STRING) USING delta "
+           + "PARTITIONED BY (p) "
+           + "TBLPROPERTIES ("
+           + " 'delta.columnMapping.mode' = 'name',"
+           + " 'delta.enableRowTracking' = 'true',"
+           + " 'delta.enableDeletionVectors' = 'true',"
+           + " 'delta.minReaderVersion' = '3',"
+           + " 'delta.minWriterVersion' = '7')";
+   String ddl = str(ddlTemplate, path);
+   spark.sql(ddl);
+ }
+
+ /**
+  * Collects, in their original order, the entries of {@code q.recentProgress()} whose {@code
+  * numInputRows()} is non-zero.
+  */
+ private static StreamingQueryProgress[] nonEmptyProgress(StreamingQuery q) {
+   List<StreamingQueryProgress> withRows = new java.util.ArrayList<>();
+   for (StreamingQueryProgress entry : q.recentProgress()) {
+     if (entry.numInputRows() == 0L) {
+       continue; // batch carried no input rows
+     }
+     withRows.add(entry);
+   }
+   return withRows.toArray(new StreamingQueryProgress[0]);
+ }
+
+ /**
+  * Starts {@code streamingDF} as an AvailableNow query writing to a memory sink named {@code
+  * queryName}, waits up to 60 seconds for it to terminate, and returns {@code COUNT(*)} of the
+  * sink table. The query is stopped and the DeltaLog cache cleared on the way out.
+  */
+ private long runAvailableNowMemoryCount(Dataset<Row> streamingDF, String queryName)
+     throws Exception {
+   // If start() itself throws there is no query to stop, so nothing needs guarding before it.
+   StreamingQuery running =
+       streamingDF
+           .writeStream()
+           .format("memory")
+           .queryName(queryName)
+           .outputMode("append")
+           .trigger(Trigger.AvailableNow())
+           .start();
+   try {
+     running.processAllAvailable();
+     boolean finished = running.awaitTermination(60_000L);
+     assertTrue(finished, "AvailableNow did not terminate within 60s");
+     Row countRow = spark.sql("SELECT COUNT(*) FROM " + queryName).collectAsList().get(0);
+     return countRow.getLong(0);
+   } finally {
+     running.stop();
+     DeltaLog.clearCache();
+   }
+ }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingIctTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingIctTest.java
new file mode 100644
index 00000000000..ff66e5ef7ab
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingIctTest.java
@@ -0,0 +1,529 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import io.delta.spark.internal.v2.utils.IctTestUtils;
+import java.io.File;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TimeZone;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.streaming.DataStreamReader;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.StreamingQueryException;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * End-to-end streaming tests for tables with {@code delta.enableInCommitTimestamps}, run through
+ * BOTH the DSv1 and DSv2 streaming readers and asserted for parity.
+ *
+ * The cross-product test matrix only covers ICT helper-level behavior; this suite exercises full
+ * streaming queries against ICT-enabled tables to find bugs in {@code startingTimestamp}/{@code
+ * startingVersion} resolution, restart semantics, sub-second skew, mtime drift, mid-table ICT
+ * enablement, AvailableNow trigger, deletion vectors, and column mapping.
+ *
+ * <p>DSv1 reference: {@code DeltaSourceSuite.testQuietly("startingTimestamp")} in {@code
+ * spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceSuite.scala}. DSv1 is treated as the
+ * oracle: every test runs both readers with identical options and compares the resulting rows.
+ */
+public class V2StreamingIctTest extends V2TestBase {
+
+ /** Renders epoch milliseconds as {@code yyyy-MM-dd HH:mm:ss.SSS} in the session time zone. */
+ private String formatTs(long millis) {
+   SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
+   // Use the Spark session's configured zone so the string parses back to the same instant.
+   TimeZone sessionZone =
+       TimeZone.getTimeZone(spark.sessionState().conf().sessionLocalTimeZone());
+   formatter.setTimeZone(sessionZone);
+   return formatter.format(new Date(millis));
+ }
+
+ /**
+  * Creates an ICT-enabled table at {@code tablePath} and appends one commit per entry of {@code
+  * timestamps}. Commit {@code i} (table version {@code i + 1}; version 0 is the CREATE TABLE)
+  * writes ids {@code [10*i, 10*i + 10)} and is then passed to {@code
+  * IctTestUtils.modifyCommitTimestamp} with {@code timestamps[i]}.
+  */
+ private void createIctTableWithCommits(String tablePath, long[] timestamps) {
+   String ddl =
+       str(
+           "CREATE TABLE delta.`%s` (id BIGINT) USING delta "
+               + "TBLPROPERTIES ('delta.enableInCommitTimestamps' = 'true')",
+           tablePath);
+   spark.sql(ddl);
+   DeltaLog deltaLog = DeltaLog.forTable(spark, tablePath);
+   int commitIndex = 0;
+   for (long commitTs : timestamps) {
+     long firstId = commitIndex * 10L;
+     spark.range(firstId, firstId + 10).write().format("delta").mode("append").save(tablePath);
+     IctTestUtils.modifyCommitTimestamp(deltaLog, /* version= */ commitIndex + 1, commitTs);
+     commitIndex++;
+   }
+ }
+
+ /** Returns a fresh mutable map whose only entry is {@code "startingTimestamp" -> ts}. */
+ private static Map<String, String> startingTimestamp(String ts) {
+   Map<String, String> singleOption = new HashMap<>();
+   singleOption.put("startingTimestamp", ts);
+   return singleOption;
+ }
+
+ /**
+  * Reads the table as a stream twice with the same {@code options} - first through the DSv1
+  * {@code "delta"} format, then through the {@code dsv2.delta.`<path>`} table reference - and
+  * asserts that the id-sorted row lists have equal string forms. DSv1 is the oracle.
+  *
+  * @param tablePath table location
+  * @param tag prefix for the two query names ({@code tag + "_v1"}, {@code tag + "_v2"}) and for
+  *     the failure message
+  * @param options reader options applied to both streams
+  * @return the DSv1 rows, sorted by id
+  */
+ private List<Row> assertV1V2StreamingParity(
+     String tablePath, String tag, Map<String, String> options) throws Exception {
+   // Oracle run: DSv1 path-based source.
+   DataStreamReader legacyReader = spark.readStream().format("delta");
+   for (String key : options.keySet()) {
+     legacyReader = legacyReader.option(key, options.get(key));
+   }
+   Dataset<Row> legacyStream = legacyReader.load(tablePath);
+   List<Row> v1Rows = sortedById(processStreamingQuery(legacyStream, tag + "_v1"));
+
+   // Candidate run: DSv2 catalog-style table reference with the very same options.
+   DataStreamReader candidateReader = spark.readStream();
+   for (String key : options.keySet()) {
+     candidateReader = candidateReader.option(key, options.get(key));
+   }
+   Dataset<Row> candidateStream = candidateReader.table(str("dsv2.delta.`%s`", tablePath));
+   List<Row> v2Rows = sortedById(processStreamingQuery(candidateStream, tag + "_v2"));
+
+   String expected = v1Rows.toString();
+   String actual = v2Rows.toString();
+   assertEquals(expected, actual, tag + ": V1 vs V2 row mismatch");
+   return v1Rows;
+ }
+
+ /**
+  * Returns a copy of {@code rows} ordered ascending by the long in column 0 ("id"), so that
+  * memory-sink ordering cannot affect comparisons. The input list is left untouched.
+  */
+ private static List<Row> sortedById(List<Row> rows) {
+   List<Row> ordered = new ArrayList<>(rows);
+   ordered.sort((left, right) -> Long.compare(left.getLong(0), right.getLong(0)));
+   return ordered;
+ }
+
+ // ===================================================================================
+ // Case 1: ICT + startingTimestamp == known commit's ICT -> stream starts from that commit
+ // ===================================================================================
+ /** startingTimestamp equal to v2's ICT: both readers must emit exactly v2's ten rows. */
+ @Test
+ public void case1_startingTimestampEqualsCommitIct(@TempDir File deltaTablePath)
+     throws Exception {
+   final String location = deltaTablePath.getAbsolutePath();
+   final long firstIct = 1700000000000L;
+   final long secondIct = firstIct + 60_000L;
+   createIctTableWithCommits(location, new long[] {firstIct, secondIct});
+
+   Map<String, String> options = startingTimestamp(formatTs(secondIct));
+   List<Row> rows = assertV1V2StreamingParity(location, "ict_case1", options);
+   // Only version 2 (ids 10..19) is expected.
+   assertEquals(10, rows.size(), () -> "Rows: " + rows);
+ }
+
+ // ===================================================================================
+ // Case 2: ICT + startingTimestamp between two commits -> next-commit semantics
+ // ===================================================================================
+ /** startingTimestamp strictly between two ICTs: the stream must start at the later commit. */
+ @Test
+ public void case2_startingTimestampBetweenCommits(@TempDir File deltaTablePath) throws Exception {
+   final String location = deltaTablePath.getAbsolutePath();
+   final long firstIct = 1700000000000L;
+   final long secondIct = firstIct + 120_000L;
+   createIctTableWithCommits(location, new long[] {firstIct, secondIct});
+
+   // 30 s after v1 and 90 s before v2.
+   final long midpoint = firstIct + 30_000L;
+   Map<String, String> options = startingTimestamp(formatTs(midpoint));
+   List<Row> rows = assertV1V2StreamingParity(location, "ict_case2", options);
+
+   // Next-commit semantics: the timestamp resolves to v2, i.e. ids 10..19.
+   assertEquals(
+       10,
+       rows.size(),
+       () -> "Expected next-commit (10 rows from v2) but got " + rows.size() + ": " + rows);
+ }
+
+ // ===================================================================================
+ // Case 3: ICT sub-second skew: two commits whose ICTs are exactly 1 ms apart
+ // ===================================================================================
+ /**
+ * Two commits with ICTs {@code sameMs} and {@code sameMs + 1}. Starting at {@code sameMs} must
+ * read both commits (20 rows); starting at {@code sameMs + 1} must read only the second (10
+ * rows). Both expectations are also checked for DSv1/DSv2 parity.
+ */
+ @Test
+ public void case3_subsecondSameMillisDisambiguation(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ long sameMs = 1700000000000L;
+ // ICT requires monotonicity, so two commits cannot share the same ICT millisecond. The
+ // closest spacing is used instead: v1 gets ICT sameMs and v2 gets sameMs+1, which models
+ // commits that landed "in the same millisecond" of wall-clock time.
+ createIctTableWithCommits(tablePath, new long[] {sameMs, sameMs + 1});
+
+ // Asking for the exact ms should land on v1 (first commit at or after the timestamp), so
+ // both commits are read.
+ List rowsA =
+ assertV1V2StreamingParity(tablePath, "ict_case3_a", startingTimestamp(formatTs(sameMs)));
+ assertEquals(20, rowsA.size(), () -> "starting@sameMs should include both commits: " + rowsA);
+
+ // Asking for sameMs+1 should land on v2 (next-commit).
+ List rowsB =
+ assertV1V2StreamingParity(
+ tablePath, "ict_case3_b", startingTimestamp(formatTs(sameMs + 1)));
+ assertEquals(10, rowsB.size(), () -> "starting@sameMs+1 should include only v2: " + rowsB);
+ }
+
+ // ===================================================================================
+ // Case 4: ICT vs filesystem mtime drift -> DSv2 must use ICT, not mtime
+ // ===================================================================================
+ /**
+  * v2's commit-file mtime is pushed to before v1's ICT; startingTimestamp resolution must still
+  * follow the ICT and start at v2.
+  */
+ @Test
+ public void case4_mtimeDriftUsesIct(@TempDir File deltaTablePath) throws Exception {
+   final String location = deltaTablePath.getAbsolutePath();
+   final long firstIct = 1700000000000L;
+   final long secondIct = firstIct + 60_000L;
+   createIctTableWithCommits(location, new long[] {firstIct, secondIct});
+
+   // Tamper: move v2's filesystem mtime to BEFORE firstIct (e.g., a restore overwrote it).
+   final long driftedMtime = firstIct - 600_000L;
+   DeltaLog deltaLog = DeltaLog.forTable(spark, location);
+   IctTestUtils.setFileMtimeOnly(deltaLog, /* version= */ 2, driftedMtime);
+
+   // Asking for secondIct: an ICT-based lookup still resolves to v2 (10 rows). An mtime-based
+   // lookup would resolve earlier, because v2's mtime now precedes firstIct, and would return
+   // more rows (the entire table).
+   Map<String, String> options = startingTimestamp(formatTs(secondIct));
+   List<Row> rows = assertV1V2StreamingParity(location, "ict_case4", options);
+   assertEquals(
+       10,
+       rows.size(),
+       () -> "DSv2 should consult ICT (10 rows from v2); mtime-based would return more: " + rows);
+ }
+
+ // ===================================================================================
+ // Case 5: ICT + startingTimestamp = future -> error consistent with DSv1
+ // ===================================================================================
+ /**
+ * A startingTimestamp roughly ten years past the last commit must fail the query for both
+ * readers. Each failure is expected to surface as a {@link StreamingQueryException} and is
+ * checked with {@code assertAfterLatestError}.
+ */
+ @Test
+ public void case5_startingTimestampFutureErrors(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ long t1 = 1700000000000L;
+ long t2 = t1 + 60_000L;
+ createIctTableWithCommits(tablePath, new long[] {t1, t2});
+
+ // 10 * 365 days in milliseconds; the 365L literal keeps the whole product in long arithmetic.
+ long future = t2 + 10 * 365L * 24 * 3600 * 1000L; // far future
+ String futureTs = formatTs(future);
+
+ // V1: should error with a DSv1-style "after the latest" message. V1 is the oracle for the
+ // error shape.
+ Dataset v1Df =
+ spark.readStream().format("delta").option("startingTimestamp", futureTs).load(tablePath);
+ StreamingQueryException v1Ex =
+ assertThrows(
+ StreamingQueryException.class, () -> processStreamingQuery(v1Df, "ict_case5_v1"));
+ assertAfterLatestError("V1", v1Ex);
+
+ // V2: should error with an equivalent "after the latest" message.
+ Dataset v2Df =
+ spark
+ .readStream()
+ .option("startingTimestamp", futureTs)
+ .table(str("dsv2.delta.`%s`", tablePath));
+ StreamingQueryException v2Ex =
+ assertThrows(
+ StreamingQueryException.class, () -> processStreamingQuery(v2Df, "ict_case5_v2"));
+ assertAfterLatestError("V2", v2Ex);
+ }
+
+ /** Assert a streaming query failure carries a DSv1-style "after the latest" message. */
+ private static void assertAfterLatestError(String label, StreamingQueryException ex) {
+ String msg = ex.getMessage() == null ? "" : ex.getMessage();
+ String causeMsg = ex.getCause() == null ? "" : String.valueOf(ex.getCause());
+ assertTrue(
+ msg.contains("after the latest")
+ || msg.contains("is after")
+ || causeMsg.contains("after the latest")
+ || causeMsg.contains("is after"),
+ () ->
+ label
+ + ": expected DSv1-style 'after the latest' error, got: "
+ + msg
+ + " / "
+ + causeMsg);
+ }
+
+ // ===================================================================================
+ // Case 6: Restart with startingTimestamp set -> should ignore and resume from checkpoint
+ // ===================================================================================
+ /**
+ * Restart semantics: each reader first runs with {@code startingTimestamp = t1} against its own
+ * checkpoint and parquet output directory (expected 20 rows), then a third commit is appended and
+ * each query is restarted with the same option and checkpoint. The restart is expected to add
+ * only the 10 rows of v3, and the added row counts of DSv1 and DSv2 must match.
+ */
+ @Test
+ public void case6_restartIgnoresStartingTimestamp(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ long t1 = 1700000000000L;
+ long t2 = t1 + 60_000L;
+ createIctTableWithCommits(tablePath, new long[] {t1, t2});
+
+ // Each reader gets its own checkpoint + output dir so the V1 and V2 streams don't fight over
+ // state. We compare the per-reader "second-run" delta to assert parity.
+ // The checkpoint/output dirs are created as children of the table directory itself.
+ String startTs = formatTs(t1);
+ String dsv2Ref = str("dsv2.delta.`%s`", tablePath);
+
+ long v1Initial =
+ runRestartFirstPass(
+ spark.readStream().format("delta").option("startingTimestamp", startTs).load(tablePath),
+ new File(deltaTablePath, "_chk_v1"),
+ new File(deltaTablePath, "_out_v1"));
+ long v2Initial =
+ runRestartFirstPass(
+ spark.readStream().option("startingTimestamp", startTs).table(dsv2Ref),
+ new File(deltaTablePath, "_chk_v2"),
+ new File(deltaTablePath, "_out_v2"));
+ assertEquals(20, v1Initial, "V1 initial run should process 20 rows");
+ assertEquals(v1Initial, v2Initial, "V1 vs V2 initial run row count mismatch");
+
+ // Append a new commit (v3).
+ long t3 = t2 + 60_000L;
+ spark.range(20, 30).write().format("delta").mode("append").save(tablePath);
+ IctTestUtils.modifyCommitTimestamp(DeltaLog.forTable(spark, tablePath), 3L, t3);
+
+ // Restart each query against its own checkpoint. Per DSv1 semantics, restart with checkpoint
+ // ignores startingTimestamp and resumes from where we left off (i.e., should only read v3).
+ long v1Restart =
+ runRestartSecondPass(
+ spark.readStream().format("delta").option("startingTimestamp", startTs).load(tablePath),
+ new File(deltaTablePath, "_chk_v1"),
+ new File(deltaTablePath, "_out_v1"),
+ v1Initial);
+ long v2Restart =
+ runRestartSecondPass(
+ spark.readStream().option("startingTimestamp", startTs).table(dsv2Ref),
+ new File(deltaTablePath, "_chk_v2"),
+ new File(deltaTablePath, "_out_v2"),
+ v2Initial);
+
+ // Parity first, then the absolute expectation against the DSv1 oracle.
+ assertEquals(
+ v1Restart,
+ v2Restart,
+ () -> "V1 restart added " + v1Restart + " rows, V2 added " + v2Restart);
+ assertEquals(
+ 10,
+ v1Restart,
+ () ->
+ "Restart should ignore startingTimestamp and only process v3 (10 rows), got "
+ + v1Restart);
+ }
+
+ /**
+  * Starts {@code df} as a streaming write to a parquet sink at {@code outputDir}, checkpointing
+  * under {@code checkpointDir}, drains all available input, and returns the number of rows then
+  * readable from the sink directory. The query is always stopped and the DeltaLog cache cleared.
+  */
+ private long runRestartFirstPass(Dataset<Row> df, File checkpointDir, File outputDir)
+     throws Exception {
+   String sinkPath = outputDir.getAbsolutePath();
+   String checkpointPath = checkpointDir.getAbsolutePath();
+   StreamingQuery running =
+       df.writeStream()
+           .format("parquet")
+           .option("path", sinkPath)
+           .option("checkpointLocation", checkpointPath)
+           .outputMode("append")
+           .start();
+   try {
+     running.processAllAvailable();
+     long written = spark.read().parquet(outputDir.getAbsolutePath()).count();
+     return written;
+   } finally {
+     running.stop();
+     DeltaLog.clearCache();
+   }
+ }
+
+ /**
+  * Restarts the query against the same checkpoint and parquet sink and returns how many rows
+  * this pass added, i.e. the sink's total row count afterwards minus {@code prior}.
+  */
+ private long runRestartSecondPass(Dataset<Row> df, File checkpointDir, File outputDir, long prior)
+     throws Exception {
+   // The start/drain/count/stop sequence is the same as the first pass; only the return value
+   // differs, so reuse that routine and subtract the earlier total.
+   long totalAfterRestart = runRestartFirstPass(df, checkpointDir, outputDir);
+   return totalAfterRestart - prior;
+ }
+
+ // ===================================================================================
+ // Case 7: ICT enabled mid-history (commits 0-2 non-ICT, commits 3+ ICT)
+ // ===================================================================================
+ /**
+ * Table history: v0 CREATE (no ICT), v1 and v2 data appends (no ICT, mtimes pinned), v3 ALTER
+ * TABLE enabling ICT, v4 data append with ICT. Checks startingTimestamp resolution on both sides
+ * of the enablement point: at v2's mtime (expects 20 rows) and at v3's ICT (expects 10 rows),
+ * each with DSv1/DSv2 parity.
+ */
+ @Test
+ public void case7_ictEnabledMidHistory(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ // Create non-ICT table first.
+ spark.sql(str("CREATE TABLE delta.`%s` (id BIGINT) USING delta", tablePath));
+ // v1, v2 = non-ICT appends.
+ spark.range(0, 10).write().format("delta").mode("append").save(tablePath);
+ spark.range(10, 20).write().format("delta").mode("append").save(tablePath);
+ // Set deterministic mtimes for v0, v1, v2 so the non-ICT search has a
+ // monotonically-increasing wall-clock history. (v0's natural mtime is "now()" — far in the
+ // future relative to the synthetic v1/v2 mtimes, which would corrupt monotonization.)
+ DeltaLog log = DeltaLog.forTable(spark, tablePath);
+ long base = 1700000000000L;
+ long v0Mtime = base - 60_000L;
+ long v1Mtime = base;
+ long v2Mtime = base + 60_000L;
+ IctTestUtils.setFileMtimeOnly(log, 0L, v0Mtime);
+ IctTestUtils.setFileMtimeOnly(log, 1L, v1Mtime);
+ IctTestUtils.setFileMtimeOnly(log, 2L, v2Mtime);
+ // v3: ALTER TABLE turns on ICT (this commit itself enables ICT mid-history).
+ spark.sql(
+ str(
+ "ALTER TABLE delta.`%s` SET TBLPROPERTIES ('delta.enableInCommitTimestamps' = 'true')",
+ tablePath));
+ // Keep the synthetic timeline increasing across the non-ICT -> ICT boundary.
+ long v3Ict = v2Mtime + 120_000L;
+ IctTestUtils.modifyCommitTimestamp(log, 3L, v3Ict);
+ // v4: post-ICT data append.
+ spark.range(20, 30).write().format("delta").mode("append").save(tablePath);
+ long v4Ict = v3Ict + 60_000L;
+ IctTestUtils.modifyCommitTimestamp(log, 4L, v4Ict);
+
+ // 7a: startingTimestamp = v2's mtime should land on v2 (read v2..v4 = 20 rows).
+ List rowsA =
+ assertV1V2StreamingParity(tablePath, "ict_case7_a", startingTimestamp(formatTs(v2Mtime)));
+ assertEquals(
+ 20,
+ rowsA.size(),
+ () -> "starting@v2.mtime should read v2..v4 (20 rows), got: " + rowsA.size());
+
+ // 7b: startingTimestamp = v3's ICT should land on v3 (read v3..v4 = 10 rows; v3 is metadata-
+ // only so 10 rows from v4).
+ List rowsB =
+ assertV1V2StreamingParity(tablePath, "ict_case7_b", startingTimestamp(formatTs(v3Ict)));
+ assertEquals(
+ 10,
+ rowsB.size(),
+ () -> "starting@v3.ICT should read v3..v4 (10 rows from v4), got: " + rowsB.size());
+ }
+
+ // ===================================================================================
+ // Case 8: ICT x Trigger.AvailableNow
+ // ===================================================================================
+ /**
+  * AvailableNow trigger with {@code startingTimestamp} equal to v2's ICT on a three-commit ICT
+  * table: both readers must report the same sink row count, and that count must be 20.
+  */
+ @Test
+ public void case8_ictAvailableNow(@TempDir File deltaTablePath) throws Exception {
+   final String location = deltaTablePath.getAbsolutePath();
+   final long firstIct = 1700000000000L;
+   final long secondIct = firstIct + 60_000L;
+   final long thirdIct = secondIct + 60_000L;
+   createIctTableWithCommits(location, new long[] {firstIct, secondIct, thirdIct});
+
+   final String startTs = formatTs(secondIct);
+
+   // DSv1 oracle, with its own checkpoint directory.
+   Dataset<Row> legacyStream =
+       spark.readStream().format("delta").option("startingTimestamp", startTs).load(location);
+   long v1Rows = runAvailableNow(legacyStream, "ict_case8_v1", new File(deltaTablePath, "_chk_v1"));
+
+   // DSv2 candidate, with a separate checkpoint directory.
+   Dataset<Row> candidateStream =
+       spark
+           .readStream()
+           .option("startingTimestamp", startTs)
+           .table(str("dsv2.delta.`%s`", location));
+   long v2Rows =
+       runAvailableNow(candidateStream, "ict_case8_v2", new File(deltaTablePath, "_chk_v2"));
+
+   assertEquals(v1Rows, v2Rows, () -> "V1=" + v1Rows + " V2=" + v2Rows);
+   // Starting at v2's ICT means v2 + v3 are read: 20 rows.
+   assertEquals(
+       20, v1Rows, () -> "AvailableNow + ICT starting@t2 should read 20 rows, got " + v1Rows);
+ }
+
+ /**
+  * Starts {@code df} as an AvailableNow query writing to a memory sink named {@code queryName}
+  * with the given checkpoint directory, waits up to 60 seconds for it to terminate, and returns
+  * {@code COUNT(*)} of the sink table. The query is always stopped and the DeltaLog cache
+  * cleared.
+  */
+ private long runAvailableNow(Dataset<Row> df, String queryName, File checkpointDir)
+     throws Exception {
+   String checkpointPath = checkpointDir.getAbsolutePath();
+   StreamingQuery running =
+       df.writeStream()
+           .format("memory")
+           .queryName(queryName)
+           .option("checkpointLocation", checkpointPath)
+           .outputMode("append")
+           .trigger(Trigger.AvailableNow())
+           .start();
+   try {
+     running.processAllAvailable();
+     boolean finished = running.awaitTermination(60_000L);
+     assertTrue(finished, queryName + ": AvailableNow query should terminate");
+     Row countRow = spark.sql("SELECT COUNT(*) FROM " + queryName).collectAsList().get(0);
+     return countRow.getLong(0);
+   } finally {
+     running.stop();
+     DeltaLog.clearCache();
+   }
+ }
+
+ // ===================================================================================
+ // Case 9: ICT x Deletion Vectors
+ // ===================================================================================
+ /**
+ * ICT table with deletion vectors enabled: v1 appends ids 0..9 as a single file, v2 deletes
+ * {@code id < 3}. Streams from v1's ICT and expects 7 rows with DSv1/DSv2 parity. Currently
+ * disabled; see the {@code @Disabled} reason below.
+ */
+ @Test
+ @Disabled(
+ "KNOWN-GAP: DSv2 streaming does not yet honor DV-only deletes when the source has a DELETE "
+ + "commit between the starting timestamp and the latest version; the stream surfaces "
+ + "DELTA_SOURCE_TABLE_IGNORE_CHANGES instead of applying the DV against the snapshot. "
+ + "Re-enable when DSv2 streaming treats DV-only deletes the way DSv1 does.")
+ public void case9_ictWithDeletionVectors(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ spark.sql(
+ str(
+ "CREATE TABLE delta.`%s` (id BIGINT) USING delta TBLPROPERTIES ("
+ + "'delta.enableInCommitTimestamps' = 'true',"
+ + "'delta.enableDeletionVectors' = 'true')",
+ tablePath));
+ long t1 = 1700000000000L;
+ // coalesce(1) puts all ten rows in one data file, so the DELETE below touches that file only.
+ spark.range(0, 10).coalesce(1).write().format("delta").mode("append").save(tablePath);
+ DeltaLog log = DeltaLog.forTable(spark, tablePath);
+ IctTestUtils.modifyCommitTimestamp(log, 1L, t1);
+
+ // Apply DV: delete id=0,1,2.
+ long t2 = t1 + 60_000L;
+ spark.sql(str("DELETE FROM delta.`%s` WHERE id < 3", tablePath));
+ IctTestUtils.modifyCommitTimestamp(log, 2L, t2);
+
+ // Stream from t1 with ICT-enabled DV table.
+ List rows =
+ assertV1V2StreamingParity(tablePath, "ict_case9", startingTimestamp(formatTs(t1)));
+ // v1 produces 10 rows in initial snapshot; the DV at v2 filters 0,1,2 leaving 7 rows.
+ assertEquals(7, rows.size(), () -> "ICT + DV: expected 7 rows after DV applied, got: " + rows);
+ }
+
+ // ===================================================================================
+ // Case 10: ICT x Column mapping
+ // ===================================================================================
+ /**
+  * ICT table with column mapping in name mode: two appends get ICTs one minute apart, and a
+  * stream starting at the second ICT must emit exactly the second append's ten rows from both
+  * readers.
+  */
+ @Test
+ public void case10_ictWithColumnMapping(@TempDir File deltaTablePath) throws Exception {
+   final String location = deltaTablePath.getAbsolutePath();
+   String ddl =
+       str(
+           "CREATE TABLE delta.`%s` (id BIGINT) USING delta TBLPROPERTIES ("
+               + "'delta.enableInCommitTimestamps' = 'true',"
+               + "'delta.columnMapping.mode' = 'name',"
+               + "'delta.minReaderVersion' = '2',"
+               + "'delta.minWriterVersion' = '5')",
+           location);
+   spark.sql(ddl);
+
+   final long firstIct = 1700000000000L;
+   final long secondIct = firstIct + 60_000L;
+   // Both appends are written first; their commit timestamps are rewritten afterwards.
+   spark.range(0, 10).write().format("delta").mode("append").save(location);
+   spark.range(10, 20).write().format("delta").mode("append").save(location);
+   DeltaLog deltaLog = DeltaLog.forTable(spark, location);
+   IctTestUtils.modifyCommitTimestamp(deltaLog, 1L, firstIct);
+   IctTestUtils.modifyCommitTimestamp(deltaLog, 2L, secondIct);
+
+   // Starting at secondIct lands on v2, so 10 rows are expected.
+   Map<String, String> options = startingTimestamp(formatTs(secondIct));
+   List<Row> rows = assertV1V2StreamingParity(location, "ict_case10", options);
+   assertEquals(
+       10, rows.size(), () -> "ICT + column-mapping: expected 10 rows from v2, got: " + rows);
+ }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingLogRetentionFailOnDataLossTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingLogRetentionFailOnDataLossTest.java
new file mode 100644
index 00000000000..543515b913a
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingLogRetentionFailOnDataLossTest.java
@@ -0,0 +1,258 @@
+/*
+ * Copyright (2026) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * Bug #27: DSv2 does not honor {@code failOnDataLoss=false} when {@code logRetentionDuration} has
+ * pruned old commit JSON files.
+ *
+ * When Delta log retention prunes old commit JSON files, DSv2 surfaces {@code
+ * InvalidTableException: Missing delta files - versions are not contiguous} from Kernel's {@code
+ * CommitRangeFactory}. DSv1 honors {@code failOnDataLoss=false} by skipping over the gap. DSv2
+ * propagates the Kernel exception as a hard {@code StreamingQueryException} regardless of the
+ * option.
+ *
+ *
Each test in this file uses DSv1 as the oracle (expected to succeed and emit rows) and pins
+ * the DSv2 failure shape so the divergence is documented. When the bug is fixed, the V2 assertion
+ * blocks here will need to be inverted to match V1.
+ */
+public class V2StreamingLogRetentionFailOnDataLossTest extends V2TestBase {
+
+ /** Force a checkpoint so the snapshot can be reconstructed without the pruned commit JSON. */
+ @SuppressWarnings("deprecation")
+ private void checkpoint(String tablePath) {
+ DeltaLog.forTable(spark, tablePath).checkpoint();
+ }
+
+ /**
+ * Simulate {@code logRetentionDuration} expiry by deleting the commit JSON for {@code version}
+ * (and its CRC sibling) under {@code _delta_log/}.
+ */
+ private void pruneCommitJson(String tablePath, long version) throws Exception {
+ String name = String.format("%020d.json", version);
+ Path json = Paths.get(tablePath, "_delta_log", name);
+ Files.delete(json);
+ Path crc = Paths.get(tablePath, "_delta_log", String.format("%020d.crc", version));
+ if (Files.exists(crc)) {
+ Files.delete(crc);
+ }
+ DeltaLog.clearCache();
+ }
+
+ /** Build a table of 4 single-row commits (v1..v4 INSERT after v0 CREATE) at {@code tablePath}. */
+ private void buildFourCommitTable(String tablePath) {
+ spark.sql(str("CREATE TABLE delta.`%s` (id INT) USING delta", tablePath));
+ for (int i = 1; i <= 4; i++) {
+ spark.sql(str("INSERT INTO delta.`%s` VALUES (%d)", tablePath, i));
+ }
+ }
+
+ /**
+ * Assert that {@code ex} (a Throwable thrown by a DSv2 streaming query) indicates the Bug #27
+ * "not contiguous" / InvalidTableException failure shape. Checks the exception itself and its
+ * cause chain.
+ */
+ private static void assertBug27FailureShape(Throwable ex) {
+ String top = ex.toString();
+ Throwable cause = ex.getCause();
+ String causeStr = cause == null ? "" : cause.toString();
+ assertTrue(
+ top.contains("not contiguous")
+ || top.contains("InvalidTable")
+ || causeStr.contains("not contiguous")
+ || causeStr.contains("InvalidTable"),
+ () -> "Expected InvalidTableException / not-contiguous error but got: " + ex);
+ }
+
+ /**
+ * Test 1. Basic streaming read with a pruned middle commit JSON.
+ *
+ *
Create a table with 4 INSERT commits (v0=CREATE, v1..v4=INSERT), force a checkpoint, then
+ * delete the v1 commit JSON + CRC. Stream with {@code failOnDataLoss=false} from {@code
+ * startingVersion=0}.
+ *
+ *
V1 (oracle): succeeds and emits rows from the reconstructed snapshot. V2 (Bug #27):
+ * propagates Kernel's {@code InvalidTableException}.
+ */
+ @Test
+ public void testLogRetentionPrune_basicStream(@TempDir File deltaTablePath) throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ buildFourCommitTable(tablePath);
+
+ checkpoint(tablePath);
+ pruneCommitJson(tablePath, /* version= */ 1L);
+
+ String tag = "basic";
+
+ // V1 (oracle): should succeed and emit rows.
+ List v1Rows = null;
+ try {
+ v1Rows =
+ processStreamingQuery(
+ spark
+ .readStream()
+ .format("delta")
+ .option("failOnDataLoss", "false")
+ .option("startingVersion", "0")
+ .load(tablePath),
+ "v1_" + tag);
+ } catch (Exception e) {
+ fail("V1 should honor failOnDataLoss=false but threw: " + e);
+ }
+ assertFalse(v1Rows.isEmpty(), () -> "V1 should emit rows from reconstructed snapshot");
+
+ // V2 (Bug #27): currently throws InvalidTableException. When the bug is fixed, this assertion
+ // block must be inverted to match V1's behavior.
+ try {
+ processStreamingQuery(
+ spark
+ .readStream()
+ .option("failOnDataLoss", "false")
+ .option("startingVersion", "0")
+ .table(str("dsv2.delta.`%s`", tablePath)),
+ "v2_" + tag);
+ fail(
+ "Expected V2 to fail with InvalidTableException / not-contiguous - if this passes, Bug"
+ + " #27 is fixed and this test should be updated to assert V1 parity.");
+ } catch (Exception e) {
+ assertBug27FailureShape(e);
+ }
+ }
+
+ /**
+ * Test 2. Pruned middle commit JSON composed with {@code excludeRegex}.
+ *
+ * Same setup as Test 1, but also passes an {@code excludeRegex} option to the stream.
+ *
+ *
V1 (oracle): succeeds. V2 (Bug #27): still propagates the Kernel exception - the regex
+ * filter does not change the underlying commit-range failure.
+ */
+ @Test
+ public void testLogRetentionPrune_withExcludeRegex(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ buildFourCommitTable(tablePath);
+
+ checkpoint(tablePath);
+ pruneCommitJson(tablePath, /* version= */ 1L);
+
+ String tag = "excl";
+
+ // V1 (oracle): should succeed and emit rows.
+ List v1Rows = null;
+ try {
+ v1Rows =
+ processStreamingQuery(
+ spark
+ .readStream()
+ .format("delta")
+ .option("failOnDataLoss", "false")
+ .option("excludeRegex", "nonmatching_regex_xyz")
+ .option("startingVersion", "0")
+ .load(tablePath),
+ "v1_" + tag);
+ } catch (Exception e) {
+ fail("V1 should honor failOnDataLoss=false but threw: " + e);
+ }
+ assertFalse(v1Rows.isEmpty(), () -> "V1 should emit rows from reconstructed snapshot");
+
+ // V2 (Bug #27): currently throws InvalidTableException.
+ try {
+ processStreamingQuery(
+ spark
+ .readStream()
+ .option("failOnDataLoss", "false")
+ .option("excludeRegex", "nonmatching_regex_xyz")
+ .option("startingVersion", "0")
+ .table(str("dsv2.delta.`%s`", tablePath)),
+ "v2_" + tag);
+ fail(
+ "Expected V2 to fail with InvalidTableException / not-contiguous - if this passes, Bug"
+ + " #27 is fixed and this test should be updated to assert V1 parity.");
+ } catch (Exception e) {
+ assertBug27FailureShape(e);
+ }
+ }
+
+ /**
+ * Test 3. Pruned middle commit JSON composed with {@code maxFilesPerTrigger=1}.
+ *
+ * Same setup as Test 1, but also passes {@code maxFilesPerTrigger=1} to the stream so each
+ * micro-batch admits exactly one file.
+ *
+ *
V1 (oracle): succeeds; rate limit does not interact with the pruned commit. V2 (Bug #27):
+ * still propagates the Kernel exception from CommitRangeFactory.
+ */
+ @Test
+ public void testLogRetentionPrune_withMaxFilesPerTrigger(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ buildFourCommitTable(tablePath);
+
+ checkpoint(tablePath);
+ pruneCommitJson(tablePath, /* version= */ 1L);
+
+ String tag = "mfpt";
+
+ // V1 (oracle): should succeed and emit rows.
+ List v1Rows = null;
+ try {
+ v1Rows =
+ processStreamingQuery(
+ spark
+ .readStream()
+ .format("delta")
+ .option("failOnDataLoss", "false")
+ .option("maxFilesPerTrigger", "1")
+ .option("startingVersion", "0")
+ .load(tablePath),
+ "v1_" + tag);
+ } catch (Exception e) {
+ fail("V1 should honor failOnDataLoss=false but threw: " + e);
+ }
+ assertFalse(v1Rows.isEmpty(), () -> "V1 should emit rows from reconstructed snapshot");
+
+ // V2 (Bug #27): currently throws InvalidTableException.
+ try {
+ processStreamingQuery(
+ spark
+ .readStream()
+ .option("failOnDataLoss", "false")
+ .option("maxFilesPerTrigger", "1")
+ .option("startingVersion", "0")
+ .table(str("dsv2.delta.`%s`", tablePath)),
+ "v2_" + tag);
+ fail(
+ "Expected V2 to fail with InvalidTableException / not-contiguous - if this passes, Bug"
+ + " #27 is fixed and this test should be updated to assert V1 parity.");
+ } catch (Exception e) {
+ assertBug27FailureShape(e);
+ }
+ }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingMidPriorityScenarios13to18Test.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingMidPriorityScenarios13to18Test.java
new file mode 100644
index 00000000000..dcadf0c5275
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingMidPriorityScenarios13to18Test.java
@@ -0,0 +1,536 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.math.BigDecimal;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.Trigger;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructType;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * DSv2 streaming tests for medium-priority scenarios 13-18 from the brainstorm doc {@code
+ * ~/markdown/testgap/scenario_brainstorm.md}. Each test targets one suspected bug; the goal is to
+ * surface failures that DSv1 either prevents or surfaces differently. Patterns mirror {@code
+ * V2StreamingEdgeDataReadTest}.
+ */
+public class V2StreamingMidPriorityScenarios13to18Test extends V2TestBase {
+
+ /**
+  * Scenario 13: excludeRegex against a %XX-encoded partition path.
+  *
+  * <p>SMS:443-447 applies excludeRegex to AddFile.getPath(), which for partitioned tables holds
+  * the URL-encoded directory segment (e.g. "p=a%3Db/part-0000.parquet"). DSv1 (DeltaSource.scala)
+  * matches against the same encoded path, so this is a parity check. A bug surfaces if DSv2
+  * decodes the path before regex match, diverging from DSv1.
+  */
+ @Test
+ public void testScenario13_excludeRegex_encodedPartitionPath(@TempDir File deltaTablePath)
+     throws Exception {
+   String tablePath = deltaTablePath.getAbsolutePath();
+   spark.sql(
+       str(
+           "CREATE TABLE delta.`%s` (id INT, p STRING) USING delta PARTITIONED BY (p)",
+           tablePath));
+
+   // 'a=b' partition value -> directory "p=a%3Db". Use DataFrame to avoid SQL escaping.
+   List<Row> rows =
+       Arrays.asList(
+           RowFactory.create(1, "a=b"), RowFactory.create(2, "ok"), RowFactory.create(3, "a=b"));
+   StructType schema =
+       DataTypes.createStructType(
+           Arrays.asList(
+               DataTypes.createStructField("id", DataTypes.IntegerType, false),
+               DataTypes.createStructField("p", DataTypes.StringType, true)));
+   spark
+       .createDataFrame(rows, schema)
+       .write()
+       .format("delta")
+       .mode("append")
+       .partitionBy("p")
+       .save(tablePath);
+
+   // Regex matches the URL-ENCODED form. DSv1 parity expected. Both engines get the same literal.
+   final String encodedPartitionRegex = "p=a%3Db";
+
+   Dataset<Row> dsv2Stream =
+       spark
+           .readStream()
+           .option("excludeRegex", encodedPartitionRegex)
+           .table(str("dsv2.delta.`%s`", tablePath));
+   List<Row> dsv2Rows = processStreamingQuery(dsv2Stream, "scn13_dsv2_encoded");
+
+   Dataset<Row> dsv1Stream =
+       spark
+           .readStream()
+           .format("delta")
+           .option("excludeRegex", encodedPartitionRegex)
+           .load(tablePath);
+   List<Row> dsv1Rows = processStreamingQuery(dsv1Stream, "scn13_dsv1_encoded");
+
+   // Row-count check first: its message prints both result sets when the engines diverge.
+   assertEquals(
+       dsv1Rows.size(),
+       dsv2Rows.size(),
+       () ->
+           "DSv1 vs DSv2 row count diverge for excludeRegex against %XX-encoded partition path."
+               + " DSv1="
+               + dsv1Rows
+               + " DSv2="
+               + dsv2Rows);
+   assertDataEquals(dsv2Rows, dsv1Rows);
+ }
+
+ /**
+  * Scenario 14: Stream restart after table dropped + recreated with CM and DV.
+  *
+  * <p>Extends DSv1 DeltaSourceSuite ":845". SMS:214 captures tableId only at init from the initial
+  * snapshot. After restart, DeltaSourceOffset.apply(tableId, json) compares the new tableId
+  * against the offset's reservoirId; mismatch should raise DELTA_RESERVOIR_ID_MISMATCH (or DSv2
+  * equivalent), NOT silently resume reading the new table from the old offset. We accept either a
+  * clean error or a re-bootstrap, but reject silent data loss.
+  */
+ @Test
+ public void testScenario14_streamRestart_afterDropAndRecreate_withCMAndDV(
+     @TempDir File deltaTablePath) throws Exception {
+   String tablePath = deltaTablePath.getAbsolutePath();
+   // Use sibling checkpoint dirs so the drop-and-recreate of the table directory does not also
+   // nuke the checkpoint state we need to restart from.
+   File checkpointV2 =
+       new File(deltaTablePath.getParentFile(), "_ckpt_v2_" + deltaTablePath.getName());
+   File checkpointV1 =
+       new File(deltaTablePath.getParentFile(), "_ckpt_v1_" + deltaTablePath.getName());
+
+   // Same DDL for the original table and for its re-creation at the same path.
+   String createTableSql =
+       str(
+           "CREATE TABLE delta.`%s` (id INT, v STRING) USING delta TBLPROPERTIES ("
+               + "'delta.columnMapping.mode' = 'name', "
+               + "'delta.enableDeletionVectors' = 'true')",
+           tablePath);
+   String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+   try {
+     spark.sql(createTableSql);
+     spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 'a')", tablePath));
+     spark.sql(str("INSERT INTO delta.`%s` VALUES (2, 'b')", tablePath));
+
+     // First-run: stream both engines through the initial table so each checkpoint captures the
+     // pre-drop tableId / reservoirId.
+     scn14RunFirstPass(
+         scn14NoopWriter(spark.readStream().table(dsv2TableRef), "scn14_first_dsv2", checkpointV2));
+     scn14RunFirstPass(
+         scn14NoopWriter(
+             spark.readStream().format("delta").load(tablePath),
+             "scn14_first_dsv1",
+             checkpointV1));
+     org.apache.spark.sql.delta.DeltaLog.clearCache();
+
+     // Drop and recreate at the same path.
+     deleteRecursively(deltaTablePath);
+     assertTrue(deltaTablePath.mkdirs(), "Failed to recreate table directory");
+     spark.sql(createTableSql);
+     spark.sql(str("INSERT INTO delta.`%s` VALUES (99, 'NEW')", tablePath));
+
+     long dsv2Rows =
+         runSecondRunAndCountRows(
+             scn14NoopWriter(
+                 spark.readStream().table(dsv2TableRef), "scn14_second_dsv2", checkpointV2),
+             "DSv2");
+
+     long dsv1Rows =
+         runSecondRunAndCountRows(
+             scn14NoopWriter(
+                 spark.readStream().format("delta").load(tablePath),
+                 "scn14_second_dsv1",
+                 checkpointV1),
+             "DSv1");
+
+     // Parity check: whatever behavior DSv1 picks (clean error -> rows == -1, or re-bootstrap
+     // from the new table -> rows >= 1), DSv2 must match.
+     assertEquals(
+         dsv1Rows,
+         dsv2Rows,
+         () ->
+             "DSv1 vs DSv2 diverge after drop+recreate restart. DSv1 rows="
+                 + dsv1Rows
+                 + " DSv2 rows="
+                 + dsv2Rows
+                 + " (-1 = structured error, 0 = silent skip / data loss, >=1 = re-bootstrap)");
+     // Parity alone would pass if BOTH engines silently read nothing. A zero means the tableId
+     // mismatch was not enforced and the new row was never read, which this scenario rejects.
+     assertNotEquals(
+         0L,
+         dsv2Rows,
+         () ->
+             "Both engines silently resumed after drop+recreate without reading the new row"
+                 + " (0 input rows, no structured error): silent data loss.");
+   } finally {
+     // The checkpoint dirs are siblings of the @TempDir, so JUnit does not clean them up.
+     deleteRecursively(checkpointV2);
+     deleteRecursively(checkpointV1);
+   }
+ }
+
+ /** Builds the noop-sink, AvailableNow write-stream shared by every query in scenario 14. */
+ private static org.apache.spark.sql.streaming.DataStreamWriter<Row> scn14NoopWriter(
+     Dataset<Row> source, String queryName, File checkpointDir) {
+   return source
+       .writeStream()
+       .format("noop")
+       .queryName(queryName)
+       .option("checkpointLocation", checkpointDir.getAbsolutePath())
+       .outputMode("append")
+       .trigger(Trigger.AvailableNow());
+ }
+
+ /** Starts the query and waits for it to terminate; stops it even if waiting throws. */
+ private static void scn14RunFirstPass(
+     org.apache.spark.sql.streaming.DataStreamWriter<Row> writer) throws Exception {
+   StreamingQuery query = writer.start();
+   try {
+     query.awaitTermination();
+   } finally {
+     query.stop();
+   }
+ }
+
+ /**
+ * Starts the given write-stream, awaits termination, and returns the total {@code numInputRows}
+ * across all batches. Returns {@code -1} if the query terminated with an exception (a structured
+ * error). Asserts that any exception is NOT a raw NPE.
+ */
+ private static long runSecondRunAndCountRows(
+ org.apache.spark.sql.streaming.DataStreamWriter writer, String engine) throws Exception {
+ StreamingQuery q = null;
+ Throwable thrown = null;
+ try {
+ q = writer.start();
+ q.awaitTermination();
+ } catch (Throwable t) {
+ thrown = t;
+ } finally {
+ if (q != null) q.stop();
+ org.apache.spark.sql.delta.DeltaLog.clearCache();
+ }
+ if (thrown != null || (q != null && q.exception().isDefined())) {
+ String msg = (thrown != null ? thrown.toString() : q.exception().get().toString());
+ assertFalse(
+ msg.contains("NullPointerException"),
+ () -> engine + ": drop+recreate restart raised NPE instead of structured error: " + msg);
+ return -1L;
+ }
+ long rows = 0;
+ for (org.apache.spark.sql.streaming.StreamingQueryProgress p : q.recentProgress()) {
+ rows += p.numInputRows();
+ }
+ return rows;
+ }
+
+ /**
+ * Scenario 15: maxBytesPerTrigger when limit equals the larger of two files.
+ *
+ * Exercises the per-file admission boundary in {@code DeltaSourceAdmissionBase.admit}: after
+ * the first file is admitted via the deadlock guard, the second must actually fit in the
+ * remaining capacity. With limit == max(f1,f2), neither file can ever fit alongside the other, so
+ * each must land in its own batch regardless of which file streaming sees first.
+ */
+ @Test
+ public void testScenario15_maxBytesPerTrigger_fileSizeEqualsLimit(@TempDir File deltaTablePath)
+ throws Exception {
+ String tablePath = deltaTablePath.getAbsolutePath();
+ spark.sql(str("CREATE TABLE delta.`%s` (id INT, name STRING) USING delta", tablePath));
+
+ spark
+ .range(50)
+ .selectExpr("cast(id as int) as id", "concat('row', cast(id as string)) as name")
+ .coalesce(1)
+ .write()
+ .format("delta")
+ .mode("append")
+ .save(tablePath);
+
+ spark
+ .range(50, 100)
+ .selectExpr("cast(id as int) as id", "concat('row', cast(id as string)) as name")
+ .coalesce(1)
+ .write()
+ .format("delta")
+ .mode("append")
+ .save(tablePath);
+
+ org.apache.spark.sql.delta.DeltaLog deltaLog =
+ org.apache.spark.sql.delta.DeltaLog.forTable(
+ spark, new org.apache.hadoop.fs.Path(tablePath));
+ org.apache.spark.sql.delta.actions.AddFile[] addsArr =
+ (org.apache.spark.sql.delta.actions.AddFile[])
+ deltaLog.update(false, scala.Option.empty(), scala.Option.empty()).allFiles().collect();
+ assertEquals(2, addsArr.length, "Expected exactly 2 AddFiles for this scenario.");
+ long file1Size = addsArr[0].size();
+ long file2Size = addsArr[1].size();
+ // Use max() so the limit is deterministic regardless of allFiles() listing order: with
+ // limit == max(f1,f2), the two files must split across batches no matter which one is read
+ // first, so the bug repros 5/5 instead of being masked by listing order.
+ long limit = Math.max(file1Size, file2Size);
+
+ String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+ String maxBytes = limit + "b";
+
+ // Bug #24 is SHARED between V1 and V2: both engines route through the common
+ // `DeltaSourceAdmissionBase.admit` code, so a V1==V2 differential would pass with both wrong
+ // the same way and fail to expose the bug. We instead assert each engine independently against
+ // the oracle (>= 2 non-empty batches when maxBytesPerTrigger == max(file1, file2)).
+ StreamingQuery qV2 =
+ spark
+ .readStream()
+ .option("maxBytesPerTrigger", maxBytes)
+ .table(dsv2TableRef)
+ .writeStream()
+ .format("noop")
+ .queryName("scn15_eq_limit_dsv2")
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .start();
+ qV2.awaitTermination();
+ qV2.stop();
+
+ StreamingQuery qV1 =
+ spark
+ .readStream()
+ .format("delta")
+ .option("maxBytesPerTrigger", maxBytes)
+ .load(tablePath)
+ .writeStream()
+ .format("noop")
+ .queryName("scn15_eq_limit_dsv1")
+ .outputMode("append")
+ .trigger(Trigger.AvailableNow())
+ .start();
+ qV1.awaitTermination();
+ qV1.stop();
+
+ assertOracleAdmitSplit(qV2, "DSv2", file1Size, file2Size, limit);
+ assertOracleAdmitSplit(qV1, "DSv1", file1Size, file2Size, limit);
+ }
+
+ /**
+  * Checks one engine's finished query against the scenario-15 oracle: the batches in {@code
+  * recentProgress()} must add up to 100 input rows, and at least two of them must be non-empty.
+  *
+  * @param q terminated streaming query whose progress is inspected
+  * @param engine label ("DSv1" / "DSv2") prefixed to assertion messages
+  * @param file1Size size of the first data file; only used in the failure message
+  * @param file2Size size of the second data file; only used in the failure message
+  * @param limit the maxBytesPerTrigger value used; only used in the failure message
+  */
+ private static void assertOracleAdmitSplit(
+     StreamingQuery q, String engine, long file1Size, long file2Size, long limit) {
+   // Input-row count of every reported batch, in reporting order.
+   final long[] rowsPerBatch =
+       Arrays.stream(q.recentProgress()).mapToLong(p -> p.numInputRows()).toArray();
+   final long totalRows = Arrays.stream(rowsPerBatch).sum();
+   final long nonEmptyBatches = Arrays.stream(rowsPerBatch).filter(n -> n > 0).count();
+
+   assertEquals(100L, totalRows, () -> engine + ": total rows should equal 100 across batches.");
+   assertTrue(
+       nonEmptyBatches >= 2,
+       () ->
+           engine
+               + ": expected at least 2 non-empty batches when maxBytesPerTrigger equals"
+               + " max(file1,file2) (file1="
+               + file1Size
+               + " file2="
+               + file2Size
+               + " limit="
+               + limit
+               + "); got "
+               + nonEmptyBatches
+               + ". Indicates per-file admit re-fires the deadlock guard for every file with any"
+               + " positive remaining capacity, instead of only for the first file in the batch.");
+ }
+
+ /**
+  * Scenario 16: CCv2 commit-coordinator handoff mid-stream.
+  *
+  * <p>SS:443 / SMS:309 leave a TODO; CCv2 in OSS DSv2 is wired only via the Unity Catalog code
+  * path (UCCommitCoordinatorBuilder, UCCatalogManagedClient). There is no path-based way to
+  * construct a CCv2-managed table outside UC, so a meaningful test requires either UC or a
+  * SparkInjectingCommitCoordinator hook, neither of which is present in OSS test infra.
+  *
+  * <p>The test is a disabled placeholder: its body fails unconditionally so that re-enabling it
+  * without an implementation is noticed.
+  */
+ @Disabled(
+     "KNOWN-GAP (2026-05-04): CCv2 commit-coordinator handoff requires Unity Catalog wiring; "
+         + "no OSS path-based way to construct a CCv2-managed Delta table. See "
+         + "scenario_brainstorm.md #16. Re-enable when CCv2 lands with an OSS test harness.")
+ @Test
+ public void testScenario16_ccv2_commitCoordinatorHandoffMidStream(@TempDir File deltaTablePath)
+     throws Exception {
+   // Intentionally unimplemented; see the @Disabled reason above.
+   fail("Not implementable in OSS - see @Disabled reason.");
+ }
+
+ /**
+  * Scenario 17: {@code MAP<STRUCT<a: INT, b: STRING>, INT>} with a fully-NULL composite key.
+  *
+  * <p>ColumnarMap key/value DV-wrapped paths are uncovered. The fix in dea78c848 wraps non-Struct
+  * child vectors so getChild(0)/getChild(1) on a MAP-typed column applies the DV row-id mapping to
+  * the keyArray and valueArray. The composite STRUCT key adds another layer: keyArray's children
+  * are the struct fields. Without the fix, getChild on the MAP returns a raw delegate child whose
+  * null bitmap is keyed on input row ids - a NULL key in the input may surface at the wrong row in
+  * output.
+  *
+  * <p>Spark does not allow NULL as a map key directly; we use a struct key whose fields are all
+  * NULL.
+  */
+ @Test
+ public void testScenario17_mapStructIntKey_nullCompositeKey(@TempDir File deltaTablePath)
+     throws Exception {
+   String tablePath = deltaTablePath.getAbsolutePath();
+   // Key struct fields (a INT, b STRING) match the named_struct('a', ..., 'b', ...) keys inserted
+   // below.
+   spark.sql(
+       str(
+           "CREATE TABLE delta.`%s` ("
+               + "id INT, "
+               + "m MAP<STRUCT<a: INT, b: STRING>, INT>) USING delta TBLPROPERTIES ("
+               + "'delta.enableDeletionVectors' = 'true')",
+           tablePath));
+
+   // Row 3 carries the composite key whose fields are all NULL.
+   spark.sql(
+       str(
+           "INSERT INTO delta.`%s` VALUES "
+               + "(1, map(named_struct('a', 1, 'b', 'k1'), 100)), "
+               + "(2, map(named_struct('a', 2, 'b', 'k2'), 200)), "
+               + "(3, map(named_struct('a', cast(NULL as int), 'b', cast(NULL as string)), 300)), "
+               + "(4, map(named_struct('a', 4, 'b', 'k4'), 400))",
+           tablePath));
+
+   // Remove a row that precedes the NULL-key row.
+   spark.sql(str("DELETE FROM delta.`%s` WHERE id = 2", tablePath));
+
+   String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+   Dataset<Row> dsv2Stream = spark.readStream().table(dsv2TableRef);
+   List<Row> dsv2Rows = processStreamingQuery(dsv2Stream, "scn17_dsv2_map_struct_null");
+
+   Dataset<Row> dsv1Stream = spark.readStream().format("delta").load(tablePath);
+   List<Row> dsv1Rows = processStreamingQuery(dsv1Stream, "scn17_dsv1_map_struct_null");
+
+   assertEquals(
+       3,
+       dsv2Rows.size(),
+       () -> "Expected 3 rows after DELETE (id=2 removed via DV); got " + dsv2Rows);
+   assertDataEquals(dsv2Rows, dsv1Rows);
+ }
+
+ /**
+  * Scenario 18: Decimal(38, 38) byte[]-storage column under DV.
+  *
+  * <p>Type-fanout coverage only included Decimal(30, 6). Spark stores any decimal with precision
+  * greater than Decimal.MAX_LONG_DIGITS (=18) as BigDecimal-backed byte[] (BinaryType in Parquet),
+  * which routes through the byte[] accessor path of ColumnVectorWithFilter. With DV applied, the
+  * remapping must apply to that byte[] read - dea78c848 ensures non-Struct children are wrapped.
+  *
+  * <p>Decimal(38, 38) means scale == precision: legal values are in (-1, 1) with up to 38 digits
+  * after the decimal point. Bug indicator: wrong-row decimal returned for an undeleted row.
+  */
+ @Test
+ public void testScenario18_decimal38_38_underDV(@TempDir File deltaTablePath) throws Exception {
+   String tablePath = deltaTablePath.getAbsolutePath();
+   spark.sql(
+       str(
+           "CREATE TABLE delta.`%s` (id INT, d DECIMAL(38, 38)) USING delta TBLPROPERTIES ("
+               + "'delta.enableDeletionVectors' = 'true')",
+           tablePath));
+
+   // One distinct 38-digit value per id, so a misaligned read is visible in the value itself.
+   BigDecimal d1 = new BigDecimal("0." + repeat("1", 38));
+   BigDecimal d2 = new BigDecimal("0." + repeat("2", 38));
+   BigDecimal d3 = new BigDecimal("0." + repeat("3", 38));
+   BigDecimal d4 = new BigDecimal("0." + repeat("4", 38));
+
+   List<Row> seedRows =
+       Arrays.asList(
+           RowFactory.create(1, d1),
+           RowFactory.create(2, d2),
+           RowFactory.create(3, d3),
+           RowFactory.create(4, d4));
+   StructType schema =
+       DataTypes.createStructType(
+           Arrays.asList(
+               DataTypes.createStructField("id", DataTypes.IntegerType, false),
+               DataTypes.createStructField("d", DataTypes.createDecimalType(38, 38), true)));
+   spark.createDataFrame(seedRows, schema).write().format("delta").mode("append").save(tablePath);
+
+   spark.sql(str("DELETE FROM delta.`%s` WHERE id = 2", tablePath));
+
+   String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+   Dataset<Row> dsv2Stream = spark.readStream().table(dsv2TableRef);
+   List<Row> dsv2Rows = processStreamingQuery(dsv2Stream, "scn18_dsv2_decimal38_38_dv");
+
+   // DSv1 is the oracle for decimal-under-DV: it has shipped this path for years. We expect DSv2
+   // to match DSv1 row-for-row; divergence indicates the byte[] decimal accessor missed the DV
+   // row-id remap.
+   Dataset<Row> dsv1Stream = spark.readStream().format("delta").load(tablePath);
+   List<Row> dsv1Rows = processStreamingQuery(dsv1Stream, "scn18_dsv1_decimal38_38_dv");
+
+   assertEquals(3, dsv2Rows.size(), () -> "Expected 3 surviving rows; got: " + dsv2Rows);
+
+   Map<Integer, BigDecimal> expected = new HashMap<>();
+   expected.put(1, d1);
+   expected.put(3, d3);
+   expected.put(4, d4);
+
+   Map<Integer, BigDecimal> dsv2Actual = scn18IdToDecimal(dsv2Rows);
+   Map<Integer, BigDecimal> dsv1Actual = scn18IdToDecimal(dsv1Rows);
+
+   // Validate the oracle first so a DSv1 regression is not misreported as a DSv2 bug.
+   assertEquals(
+       expected,
+       dsv1Actual,
+       () ->
+           "DSv1 oracle returned wrong Decimal(38,38) values under DV. Expected="
+               + expected
+               + " DSv1="
+               + dsv1Actual);
+   assertEquals(
+       expected,
+       dsv2Actual,
+       () ->
+           "Decimal(38,38) values misaligned with ids after DV remap on DSv2. Indicates row-id "
+               + "mapping missing on byte[] decimal storage path. Expected="
+               + expected
+               + " DSv2="
+               + dsv2Actual);
+   assertDataEquals(dsv2Rows, dsv1Rows);
+ }
+
+ /** Indexes {@code (id INT, d DECIMAL)} rows by id; a repeated id keeps the last value seen. */
+ private static Map<Integer, BigDecimal> scn18IdToDecimal(List<Row> rows) {
+   Map<Integer, BigDecimal> byId = new HashMap<>();
+   for (Row row : rows) {
+     byId.put(row.getInt(0), row.getDecimal(1));
+   }
+   return byId;
+ }
+
+ private static String repeat(String s, int n) {
+ StringBuilder sb = new StringBuilder(s.length() * n);
+ for (int i = 0; i < n; i++) sb.append(s);
+ return sb.toString();
+ }
+
+ private static void deleteRecursively(File f) {
+ if (f == null) return;
+ if (f.isDirectory()) {
+ File[] children = f.listFiles();
+ if (children != null) {
+ for (File c : children) {
+ deleteRecursively(c);
+ }
+ }
+ }
+ f.delete();
+ }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRaceLifecycleTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRaceLifecycleTest.java
new file mode 100644
index 00000000000..af8d7ea1225
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRaceLifecycleTest.java
@@ -0,0 +1,552 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.delta.sources.DeltaSourceOffset;
+import org.apache.spark.sql.delta.sources.DeltaSourceOffset$;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.StreamingQueryProgress;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * Tests for V2 streaming under race conditions and lifecycle scenarios.
+ *
+ * <p>Each test corresponds to a scenario from {@code testgap/scenario_brainstorm.md} and is
+ * designed to surface bugs in DSv2 streaming. Each test exercises the scenario through BOTH the
+ * DSv1 and DSv2 streaming paths and asserts the two sides agree (matching rows, or matching
+ * exceptions). DSv1 is the oracle.
+ */
+public class V2StreamingRaceLifecycleTest extends V2TestBase {
+
+  /**
+   * Scenario 2: Concurrent commit between {@code latestOffset()} and {@code planInputPartitions()}.
+   *
+   * <p>SMS:317 captures endOffset; SMS:425 builds files independently - nothing pins the snapshot.
+   * If a writer commits a new file between the two phases, a phantom AddFile beyond endOffset could
+   * leak into the batch.
+   *
+   * <p>The two micro-batch phases cannot be synchronized reliably without a Spark-internal test
+   * hook, so this is a best-effort race: a writer commits in a tight loop while a reader runs with
+   * maxFilesPerTrigger=1 to maximize the gap. The per-engine helper then checks that the query did
+   * not fail, that no batch reports more than one input row, and that no reported end offset
+   * points past the latest commit + 1. DSv1 and DSv2 are run separately and must each satisfy the
+   * invariants (V1 is the oracle).
+   */
+  @Test
+  public void testScenario2_ConcurrentCommitBetweenLatestOffsetAndPlanPartitions(
+      @TempDir File deltaTablePath) throws Exception {
+    // DSv1 (the oracle) first, then DSv2, each on its own table under the temp directory.
+    for (boolean v2 : new boolean[] {false, true}) {
+      runScenario2OnEngine(new File(deltaTablePath, v2 ? "v2" : "v1"), v2);
+    }
+  }
+
+  /**
+   * Runs the scenario-2 race against a single engine and asserts its invariants.
+   *
+   * <p>Seeds a one-row table under {@code tableDir}, starts a memory-sink stream with
+   * maxFilesPerTrigger=1, and appends single-row commits from a background thread while the stream
+   * is running. A writer failure is rethrown instead of being swallowed, so the test cannot pass
+   * on partially written data. If the writer is still running after 60 seconds the invariants are
+   * checked against whatever has been committed so far.
+   *
+   * @param tableDir directory that holds the Delta table and the stream checkpoint
+   * @param v2 {@code true} to read through the {@code dsv2} catalog, {@code false} for DSv1
+   * @throws Exception if the writer or the streaming query fails
+   */
+  private void runScenario2OnEngine(File tableDir, boolean v2) throws Exception {
+    assertTrue(tableDir.mkdirs() || tableDir.isDirectory());
+    String tablePath = tableDir.getAbsolutePath();
+    File checkpointDir = new File(tableDir, "_checkpoint");
+    String engine = v2 ? "DSv2" : "DSv1";
+    String queryName = v2 ? "scenario2_race_v2" : "scenario2_race_v1";
+
+    // Seed the table so streaming has something to read.
+    spark
+        .createDataFrame(Arrays.asList(RowFactory.create(0, "init", 0.0)), TEST_SCHEMA)
+        .write()
+        .format("delta")
+        .save(tablePath);
+
+    Dataset<Row> streamingDF =
+        v2
+            ? spark
+                .readStream()
+                .option("maxFilesPerTrigger", "1")
+                .table(str("dsv2.delta.`%s`", tablePath))
+            : spark.readStream().format("delta").option("maxFilesPerTrigger", "1").load(tablePath);
+
+    // Starts at 1 to account for the seed row written above.
+    AtomicLong rowsWritten = new AtomicLong(1);
+    int totalRowsToWrite = 50;
+
+    StreamingQuery query =
+        streamingDF
+            .writeStream()
+            .format("memory")
+            .queryName(queryName)
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .outputMode("append")
+            .start();
+
+    // Writer runs in parallel: many small commits while micro-batches advance.
+    ExecutorService writer = Executors.newSingleThreadExecutor();
+    boolean writerFinished;
+    try {
+      // No catch inside the task: a failure is captured by the Future and rethrown below.
+      Future<?> writerTask =
+          writer.submit(
+              () -> {
+                for (int i = 1; i < totalRowsToWrite; i++) {
+                  spark
+                      .createDataFrame(
+                          Arrays.asList(RowFactory.create(i, "row" + i, (double) i)), TEST_SCHEMA)
+                      .write()
+                      .format("delta")
+                      .mode("append")
+                      .save(tablePath);
+                  rowsWritten.incrementAndGet();
+                }
+              });
+
+      // Let writer + reader race until the writer is done (or has failed), or the deadline hits.
+      long deadline = System.currentTimeMillis() + 60000L;
+      while (!writerTask.isDone() && System.currentTimeMillis() < deadline) {
+        Thread.sleep(100);
+      }
+      writerFinished = writerTask.isDone();
+      if (writerFinished) {
+        // Surfaces a writer failure as ExecutionException; returns immediately on success.
+        writerTask.get();
+      }
+      // Allow reader to drain remaining batches.
+      query.processAllAvailable();
+    } finally {
+      try {
+        writer.shutdownNow();
+        writer.awaitTermination(5, TimeUnit.SECONDS);
+      } finally {
+        // Runs even if waiting for the writer thread is interrupted.
+        query.stop();
+        DeltaLog.clearCache();
+      }
+    }
+
+    // Verify no exception was thrown by the streaming query.
+    assertTrue(
+        query.exception().isEmpty(),
+        () ->
+            engine
+                + " streaming query failed unexpectedly: "
+                + (query.exception().isDefined() ? query.exception().get().toString() : ""));
+
+    // Overall invariant: once the writer has finished, the sink must hold exactly the rows that
+    // were committed. Skipped on timeout because the writer was interrupted mid-flight.
+    if (writerFinished) {
+      long rowsRead = spark.table(queryName).count();
+      assertEquals(
+          rowsWritten.get(),
+          rowsRead,
+          () ->
+              engine
+                  + " stream delivered "
+                  + rowsRead
+                  + " rows but the writer committed "
+                  + rowsWritten.get());
+    }
+
+    // Per-batch invariant: numInputRows <= numFilesAdmittedThisBatch (= maxFilesPerTrigger=1
+    // means at most 1 file per batch). With 1 row per file, numInputRows must be 0 or 1.
+    StreamingQueryProgress[] progresses = query.recentProgress();
+    for (StreamingQueryProgress p : progresses) {
+      long numRows = p.numInputRows();
+      assertTrue(
+          numRows <= 1,
+          () ->
+              engine
+                  + " batch "
+                  + p.batchId()
+                  + " produced "
+                  + numRows
+                  + " rows but maxFilesPerTrigger=1 with 1 row per file."
+                  + " Phantom AddFile beyond endOffset suspected. Progress: "
+                  + p.json());
+    }
+
+    // Per-batch end-offset invariant: planInputPartitions must not include files past endOffset.
+    // We approximate this by checking each progress' endOffset version matches a real commit.
+    DeltaLog deltaLog = DeltaLog.forTable(spark, tablePath);
+    long latestVersion = deltaLog.snapshot().version();
+    String tableId = deltaLog.tableId();
+    for (StreamingQueryProgress p : progresses) {
+      if (p.sources().length == 0) {
+        continue;
+      }
+      String endOffsetJson = p.sources()[0].endOffset();
+      if (endOffsetJson == null) {
+        continue;
+      }
+      DeltaSourceOffset endOffset = DeltaSourceOffset$.MODULE$.apply(tableId, endOffsetJson);
+      assertTrue(
+          endOffset.reservoirVersion() <= latestVersion + 1,
+          () ->
+              engine
+                  + " endOffset reservoirVersion="
+                  + endOffset.reservoirVersion()
+                  + " exceeds latestVersion+1="
+                  + (latestVersion + 1));
+    }
+  }
+
+  /**
+   * Scenario 5: Protocol upgrade mid-stream (writer feature appears at v=N).
+   *
+   * <p>SMS:1009 ({@code validateCommitAndDecideSkipping}) handles AddFile / RemoveFile / Metadata
+   * but does not consult Protocol actions. SMS:631 only validates protocol at startup, not
+   * per-commit. Per scenario_brainstorm.md, we expect a clean {@code
+   * UnsupportedTableFeatureException} (or DeltaUnsupportedTableFeatureException) when the stream
+   * encounters a commit that introduces a writer feature it cannot read - not silent skip or NPE.
+   *
+   * <p>Stream a non-DV table; mid-stream enable {@code delta.enableDeletionVectors} and DELETE rows
+   * to actually produce a DV. V1 (oracle) throws an unsupported-feature error; V2 must match. If V2
+   * swallows it (Bug #23), divergence surfaces here.
+   */
+  @Test
+  public void testScenario5_ProtocolUpgradeMidStream(@TempDir File deltaTablePath)
+      throws Exception {
+    Throwable v1Error = runScenario5OnEngine(new File(deltaTablePath, "v1"), /* v2= */ false);
+    Throwable v2Error = runScenario5OnEngine(new File(deltaTablePath, "v2"), /* v2= */ true);
+
+    boolean v1Rejected = hasUnsupportedFeatureCause(v1Error);
+    boolean v2Rejected = hasUnsupportedFeatureCause(v2Error);
+
+    // The oracle itself must reject the upgrade, otherwise the comparison below proves nothing.
+    assertTrue(
+        v1Rejected,
+        () ->
+            "DSv1 (oracle) was expected to throw a DeltaUnsupportedTableFeatureException for "
+                + "mid-stream DV protocol upgrade. Got: "
+                + describeChain(v1Error));
+
+    assertEquals(
+        v1Rejected,
+        v2Rejected,
+        () ->
+            String.format(
+                "DSv1 vs DSv2 divergence on mid-stream DV protocol upgrade.\n"
+                    + "DSv1 threw expected=%s chain=%s\nDSv2 threw expected=%s chain=%s",
+                v1Rejected, describeChain(v1Error), v2Rejected, describeChain(v2Error)));
+  }
+
+  /**
+   * Runs the scenario-5 sequence (drain initial snapshot, enable DV + DELETE, restart stream) on
+   * either the DSv1 or DSv2 path.
+   *
+   * @param tableDir directory that holds the Delta table and the stream checkpoint
+   * @param v2 {@code true} to read through the {@code dsv2} catalog, {@code false} for DSv1
+   * @return the exception raised by the restarted stream, or {@code null} if it completed without
+   *     error
+   * @throws Exception if table setup or the pre-upgrade run fails
+   */
+  private Throwable runScenario5OnEngine(File tableDir, boolean v2) throws Exception {
+    assertTrue(tableDir.mkdirs() || tableDir.isDirectory());
+    String tablePath = tableDir.getAbsolutePath();
+    File checkpointDir = new File(tableDir, "_checkpoint");
+
+    // v=0: create non-DV table with a few rows.
+    spark.sql(str("CREATE TABLE delta.`%s` (value INT) USING delta", tablePath));
+    spark
+        .range(10)
+        .selectExpr("cast(id as int) as value")
+        .coalesce(1)
+        .write()
+        .format("delta")
+        .mode("append")
+        .save(tablePath);
+
+    Dataset<Row> streamingDF =
+        v2
+            ? spark.readStream().table(str("dsv2.delta.`%s`", tablePath))
+            : spark.readStream().format("delta").load(tablePath);
+    String tag = v2 ? "v2" : "v1";
+
+    try {
+      // First run: drain initial snapshot using noop sink (memory sink doesn't support
+      // checkpoint recovery across separate query instances).
+      StreamingQuery preUpgrade =
+          streamingDF
+              .writeStream()
+              .format("noop")
+              .queryName("scenario5_pre_upgrade_" + tag)
+              .option("checkpointLocation", checkpointDir.getAbsolutePath())
+              .outputMode("append")
+              .trigger(Trigger.AvailableNow())
+              .start();
+      try {
+        preUpgrade.awaitTermination();
+      } finally {
+        // Stop the query even when the pre-upgrade run itself fails.
+        preUpgrade.stop();
+      }
+
+      // Mid-stream: enable DV writer feature + perform a DELETE that materializes a DV.
+      spark.sql(
+          str(
+              "ALTER TABLE delta.`%s` SET TBLPROPERTIES ('delta.enableDeletionVectors' = 'true')",
+              tablePath));
+      spark.sql(str("DELETE FROM delta.`%s` WHERE value = 0", tablePath));
+
+      // Restart stream: should encounter the protocol upgrade + DV commit.
+      Throwable thrown = null;
+      StreamingQuery postUpgrade = null;
+      try {
+        postUpgrade =
+            streamingDF
+                .writeStream()
+                .format("noop")
+                .queryName("scenario5_post_upgrade_" + tag)
+                .option("checkpointLocation", checkpointDir.getAbsolutePath())
+                .outputMode("append")
+                .trigger(Trigger.AvailableNow())
+                .start();
+        try {
+          postUpgrade.awaitTermination();
+        } catch (Throwable awaitErr) {
+          thrown = awaitErr;
+        }
+        if (thrown == null && postUpgrade.exception().isDefined()) {
+          thrown = postUpgrade.exception().get();
+        }
+      } catch (Throwable t) {
+        thrown = t;
+      } finally {
+        if (postUpgrade != null) {
+          try {
+            postUpgrade.stop();
+          } catch (Throwable ignored) {
+            // Cleanup only: a stop() failure must not replace the stream error captured above.
+          }
+        }
+      }
+      return thrown;
+    } finally {
+      DeltaLog.clearCache();
+    }
+  }
+
+  /**
+   * Returns true if {@code t} or any exception in its cause chain has a class name containing
+   * {@code UnsupportedTableFeature} or {@code DeltaUnsupportedOperation}. A {@code null} argument
+   * yields false.
+   */
+  private static boolean hasUnsupportedFeatureCause(Throwable t) {
+    for (Throwable link = t; link != null; link = link.getCause()) {
+      String typeName = link.getClass().getName();
+      boolean featureError = typeName.contains("UnsupportedTableFeature");
+      boolean operationError = typeName.contains("DeltaUnsupportedOperation");
+      if (featureError || operationError) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Renders {@code t} and its causes for diagnostics, one {@code "\n -> <class>: <message>"} entry
+   * per exception (a null message is rendered as empty). Returns a fixed placeholder for {@code
+   * null}.
+   */
+  private static String describeChain(Throwable t) {
+    if (t == null) {
+      return "(no exception thrown)";
+    }
+    StringBuilder rendered = new StringBuilder();
+    for (Throwable link = t; link != null; link = link.getCause()) {
+      rendered.append("\n -> ").append(link.getClass().getName()).append(": ");
+      String message = link.getMessage();
+      if (message != null) {
+        rendered.append(message);
+      }
+    }
+    return rendered.toString();
+  }
+
+  /**
+   * Scenario 6: {@code Trigger.AvailableNow} twice on same checkpoint, no new data.
+   *
+   * <p>SMS:149-151 caches {@code lastOffsetForTriggerAvailableNow} per-stream; {@code
+   * isLastOffsetForTriggerAvailableNowInitialized} is reset across instances. The second invocation
+   * should produce 0 batches (or 1 trivially-empty progress) - no duplicate, no error. The input
+   * row counts of both runs are asserted for DSv1 and then compared against DSv2.
+   */
+  @Test
+  public void testScenario6_AvailableNowTwiceNoNewData(@TempDir File deltaTablePath)
+      throws Exception {
+    long[] v1Runs = runScenario6OnEngine(new File(deltaTablePath, "v1"), /* v2= */ false);
+    long[] v2Runs = runScenario6OnEngine(new File(deltaTablePath, "v2"), /* v2= */ true);
+    long v1First = v1Runs[0];
+    long v1Second = v1Runs[1];
+    long v2First = v2Runs[0];
+    long v2Second = v2Runs[1];
+
+    // V1 is the oracle: first run drains all 3 rows, second run drains 0.
+    assertEquals(3L, v1First, "DSv1 first AvailableNow run should drain all 3 rows.");
+    assertEquals(0L, v1Second, "DSv1 second AvailableNow run should produce 0 rows.");
+
+    // V2 must match V1 row-for-row on both runs.
+    assertEquals(
+        v1First,
+        v2First,
+        () -> "DSv1 vs DSv2 first-run row count mismatch. V1=" + v1First + " V2=" + v2First);
+    assertEquals(
+        v1Second,
+        v2Second,
+        () ->
+            "DSv1 vs DSv2 second-run row count mismatch. V1="
+                + v1Second
+                + " V2="
+                + v2Second
+                + ". Indicates duplicate replay (cf. Task K row-duplication).");
+  }
+
+  /**
+   * Runs two back-to-back AvailableNow streams on the same checkpoint.
+   *
+   * @param tableDir directory that holds the Delta table and the shared checkpoint
+   * @param v2 {@code true} to read through the {@code dsv2} catalog, {@code false} for DSv1
+   * @return a two-element array: input rows reported by the first run, then by the second run
+   * @throws Exception if either streaming query fails
+   */
+  private long[] runScenario6OnEngine(File tableDir, boolean v2) throws Exception {
+    assertTrue(tableDir.mkdirs() || tableDir.isDirectory());
+    String tablePath = tableDir.getAbsolutePath();
+    File checkpointDir = new File(tableDir, "_checkpoint");
+
+    spark
+        .createDataFrame(
+            Arrays.asList(
+                RowFactory.create(1, "Alice", 10.0),
+                RowFactory.create(2, "Bob", 20.0),
+                RowFactory.create(3, "Charlie", 30.0)),
+            TEST_SCHEMA)
+        .write()
+        .format("delta")
+        .save(tablePath);
+
+    Dataset<Row> streamingDF =
+        v2
+            ? spark.readStream().table(str("dsv2.delta.`%s`", tablePath))
+            : spark.readStream().format("delta").load(tablePath);
+    String tag = v2 ? "v2" : "v1";
+    String engine = v2 ? "DSv2" : "DSv1";
+
+    try {
+      long firstRunRows =
+          runAvailableNowAndCountRows(
+              streamingDF, "scenario6_first_" + tag, checkpointDir, engine + " first");
+      long secondRunRows =
+          runAvailableNowAndCountRows(
+              streamingDF, "scenario6_second_" + tag, checkpointDir, engine + " second");
+      return new long[] {firstRunRows, secondRunRows};
+    } finally {
+      // Clear the cache even when one of the runs fails.
+      DeltaLog.clearCache();
+    }
+  }
+
+  /**
+   * Runs one AvailableNow query with a noop sink to completion and returns the sum of {@code
+   * numInputRows} over its recent progress entries.
+   *
+   * @param streamingDF streaming source to drain
+   * @param queryName name given to the streaming query
+   * @param checkpointDir checkpoint location for the query
+   * @param runLabel prefix used in the failure message, e.g. {@code "DSv2 second"}
+   * @throws Exception if the query terminates with an error
+   */
+  private long runAvailableNowAndCountRows(
+      Dataset<Row> streamingDF, String queryName, File checkpointDir, String runLabel)
+      throws Exception {
+    StreamingQuery query =
+        streamingDF
+            .writeStream()
+            .format("noop")
+            .queryName(queryName)
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .outputMode("append")
+            .trigger(Trigger.AvailableNow())
+            .start();
+    try {
+      query.awaitTermination();
+    } finally {
+      query.stop();
+    }
+    assertTrue(
+        query.exception().isEmpty(),
+        () -> runLabel + " AvailableNow failed: " + query.exception().get().toString());
+
+    long inputRows = 0;
+    for (StreamingQueryProgress p : query.recentProgress()) {
+      inputRows += p.numInputRows();
+    }
+    return inputRows;
+  }
+
+  /**
+   * Scenario 8: Initial snapshot exceeding {@code maxInitialSnapshotFiles} (SMS:226).
+   *
+   * <p>{@code InitialSnapshotCache} (SMS:172) is asserted at limit in {@code
+   * loadAndValidateSnapshot}. We lower the conf to 5 and write 10 files in v=0; expect a clean
+   * {@code DELTA_STREAMING_INITIAL_SNAPSHOT_TOO_LARGE} error - not OOM, not NPE. DSv1 is the
+   * oracle; DSv2 must surface the same structured error.
+   */
+  @Test
+  public void testScenario8_InitialSnapshotExceedsMaxFiles(@TempDir File deltaTablePath)
+      throws Exception {
+    Throwable v1Error = runScenario8OnEngine(new File(deltaTablePath, "v1"), /* v2= */ false);
+    Throwable v2Error = runScenario8OnEngine(new File(deltaTablePath, "v2"), /* v2= */ true);
+
+    boolean v1TooLarge = hasInitialSnapshotTooLargeCause(v1Error);
+    boolean v2TooLarge = hasInitialSnapshotTooLargeCause(v2Error);
+
+    // The oracle must hit the limit first; only then is the parity check meaningful.
+    assertTrue(
+        v1TooLarge,
+        () ->
+            "DSv1 (oracle) expected DELTA_STREAMING_INITIAL_SNAPSHOT_TOO_LARGE. Got: "
+                + describeChain(v1Error));
+    assertEquals(
+        v1TooLarge,
+        v2TooLarge,
+        () ->
+            String.format(
+                "DSv1 vs DSv2 divergence on initial-snapshot-too-large.\n"
+                    + "DSv1 chain=%s\nDSv2 chain=%s",
+                describeChain(v1Error), describeChain(v2Error)));
+  }
+
+  /**
+   * Runs scenario 8 on one engine: writes a table repartitioned to 10 partitions, lowers {@code
+   * spark.databricks.delta.streaming.initialSnapshotMaxFiles} to 5, and starts an AvailableNow
+   * stream over it.
+   *
+   * @param tableDir directory that holds the Delta table and the stream checkpoint
+   * @param v2 {@code true} to read through the {@code dsv2} catalog, {@code false} for DSv1
+   * @return the exception raised while starting or running the stream, or {@code null} if none
+   */
+  private Throwable runScenario8OnEngine(File tableDir, boolean v2) throws Exception {
+    assertTrue(tableDir.mkdirs() || tableDir.isDirectory());
+    String tablePath = tableDir.getAbsolutePath();
+
+    // Write 10 small files in v=0 by repartitioning to 10 partitions.
+    spark
+        .range(10)
+        .selectExpr(
+            "cast(id as int) as id", "cast(id as string) as name", "cast(id as double) as value")
+        .repartition(10)
+        .write()
+        .format("delta")
+        .save(tablePath);
+
+    // Verify we actually have multiple files in v=0.
+    long numFilesInV0 = DeltaLog.forTable(spark, tablePath).snapshot().allFiles().count();
+    assertTrue(
+        numFilesInV0 > 5,
+        () -> "Expected >5 files in v=0 to trigger the limit, but got " + numFilesInV0);
+
+    String tag = v2 ? "v2" : "v1";
+    // One-slot holder so the lambda below can hand its result back to this method.
+    Throwable[] failureHolder = new Throwable[1];
+
+    withSQLConf(
+        "spark.databricks.delta.streaming.initialSnapshotMaxFiles",
+        "5",
+        () -> {
+          Dataset<Row> streamingDF =
+              v2
+                  ? spark.readStream().table(str("dsv2.delta.`%s`", tablePath))
+                  : spark.readStream().format("delta").load(tablePath);
+          File checkpointDir = new File(tableDir, "_checkpoint_s8");
+
+          Throwable failure = null;
+          StreamingQuery query = null;
+          try {
+            query =
+                streamingDF
+                    .writeStream()
+                    .format("memory")
+                    .queryName("scenario8_too_large_" + tag)
+                    .option("checkpointLocation", checkpointDir.getAbsolutePath())
+                    .outputMode("append")
+                    .trigger(Trigger.AvailableNow())
+                    .start();
+            // A failure from awaitTermination() lands in the catch below; otherwise fall back to
+            // whatever the query recorded.
+            query.awaitTermination();
+            if (query.exception().isDefined()) {
+              failure = query.exception().get();
+            }
+          } catch (Throwable t) {
+            failure = t;
+          } finally {
+            if (query != null) {
+              try {
+                query.stop();
+              } catch (Throwable stopErr) {
+                // ignore stop errors during cleanup
+              }
+            }
+            DeltaLog.clearCache();
+          }
+          failureHolder[0] = failure;
+        });
+
+    return failureHolder[0];
+  }
+
+  /**
+   * Returns true if {@code t} or any exception in its cause chain looks like the
+   * DELTA_STREAMING_INITIAL_SNAPSHOT_TOO_LARGE error: its message mentions {@code
+   * DELTA_STREAMING_INITIAL_SNAPSHOT_TOO_LARGE} or {@code initialSnapshotMaxFiles}, or its class
+   * name contains {@code DeltaUnsupportedOperation}. A {@code null} argument yields false.
+   */
+  private static boolean hasInitialSnapshotTooLargeCause(Throwable t) {
+    for (Throwable link = t; link != null; link = link.getCause()) {
+      String message = link.getMessage();
+      if (message != null
+          && (message.contains("DELTA_STREAMING_INITIAL_SNAPSHOT_TOO_LARGE")
+              || message.contains("initialSnapshotMaxFiles"))) {
+        return true;
+      }
+      if (link.getClass().getName().contains("DeltaUnsupportedOperation")) {
+        return true;
+      }
+    }
+    return false;
+  }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingReadTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingReadTest.java
index 5696027a075..778950e1cd2 100644
--- a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingReadTest.java
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingReadTest.java
@@ -471,4 +471,66 @@ public void testNestedNullabilityRelaxDetectedOnRestart(@TempDir File deltaTable
ex.getMessage().contains("DELTA_STREAMING_SCHEMA_MISMATCH_ON_RESTART"),
"Expected DELTA_STREAMING_SCHEMA_MISMATCH_ON_RESTART but got: " + ex.getMessage());
}
+
+  /**
+   * Regression test for the V2 partition-column read NPE referenced in PR #6583's DSv1 test comment
+   * ("Data writes would trip a separate V2 partition-column read NPE (OnHeapColumnVector.getLong),
+   * tracked out-of-band").
+   *
+   * <p>The table schema declares the partition column in the MIDDLE of the column list: {@code (id
+   * LONG, part LONG, col3 INT) PARTITIONED BY (part)}. The V2 scan's {@link
+   * io.delta.spark.internal.v2.read.SparkScan#readSchema()} naively appends partition columns to
+   * the data columns, producing {@code (id, col3, part)} — the partition column lands in the wrong
+   * ordinal. When Spark's vectorized Parquet reader builds its {@code OnHeapColumnVector} batch
+   * using the table-schema ordinal for {@code part}, it dereferences a partition-value vector that
+   * the data-only Parquet file does not actually contain, hitting an NPE in {@code
+   * OnHeapColumnVector.getLong}.
+   *
+   * <p>The test streams the same table through DSv1 and DSv2 and requires identical rows.
+   */
+  @Test
+  public void testStreamingRead_partitionColumnInMiddle(@TempDir File deltaTablePath)
+      throws Exception {
+    String tablePath = deltaTablePath.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    // Create table with partition column declared IN THE MIDDLE: (id, part, col3)
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id LONG, part LONG, col3 INT) "
+                + "USING delta PARTITIONED BY (part)",
+            tablePath));
+
+    // Insert actual data across two partition values.
+    spark.sql(str("INSERT INTO delta.`%s` VALUES (1, 10, 100), (2, 10, 200)", tablePath));
+    spark.sql(str("INSERT INTO delta.`%s` VALUES (3, 20, 300), (4, 20, 400)", tablePath));
+
+    // Stream via V1 (oracle); it must succeed and return the inserted rows.
+    List<Row> v1Rows =
+        processStreamingQuery(
+            spark.readStream().format("delta").load(tablePath), "test_partition_col_middle_v1");
+    List<Row> expectedRows =
+        Arrays.asList(
+            RowFactory.create(1L, 10L, 100),
+            RowFactory.create(2L, 10L, 200),
+            RowFactory.create(3L, 20L, 300),
+            RowFactory.create(4L, 20L, 400));
+    assertDataEquals(v1Rows, expectedRows);
+
+    // Stream via V2.
+    Dataset<Row> v2StreamingDF = spark.readStream().table(dsv2TableRef);
+    assertTrue(v2StreamingDF.isStreaming(), "Dataset should be streaming");
+    List<Row> v2Rows = processStreamingQuery(v2StreamingDF, "test_partition_col_middle_v2");
+
+    // Order both sides by id (column 0) before comparing V1 with V2.
+    assertEquals(sortedByLongColumnZero(v1Rows), sortedByLongColumnZero(v2Rows));
+  }
+
+  /** Returns a new list with {@code rows} ordered by the long value in column 0. */
+  private static List<Row> sortedByLongColumnZero(List<Row> rows) {
+    return rows.stream()
+        .sorted((a, b) -> Long.compare(a.getLong(0), b.getLong(0)))
+        .collect(Collectors.toList());
+  }
}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRowTrackingTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRowTrackingTest.java
new file mode 100644
index 00000000000..21464da9da6
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingRowTrackingTest.java
@@ -0,0 +1,675 @@
+/*
+ * Copyright (2026) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import org.apache.spark.sql.AnalysisException;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.delta.DeltaLog;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.streaming.Trigger;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import scala.Option;
+
+/**
+ * Integration tests for DSv2 streaming reads on row-tracking-enabled Delta tables.
+ *
+ * <p>Mirrors {@link V2RowTrackingReadTest} (which is batch-only) but exercises the streaming
+ * micro-batch path. Each test launches a streaming query against {@code dsv2.delta.`<path>`},
+ * projects {@code _metadata.row_id} / {@code _metadata.row_commit_version} where relevant, and
+ * validates that the row-tracking metadata reaches the consumer.
+ *
+ * <p>Each test ALSO runs the same scenario through DSv1 streaming ({@code
+ * spark.readStream().format("delta").load(path)}) and asserts that the rows produced by V1 and V2
+ * match. V1 is the oracle for parity; divergence here indicates a DSv2 streaming bug.
+ *
+ * <p>Failures here indicate bugs in DSv2 streaming row-tracking integration. Tests are
+ * intentionally thin so each one isolates a single hypothesis.
+ */
+public class V2StreamingRowTrackingTest extends V2TestBase {
+
+ // ---------------------------------------------------------------------------
+ // Case 1: stream from row-tracked table, basic — verify rows arrive
+ // ---------------------------------------------------------------------------
+
+  /** Streams a three-row row-tracked table through DSv2 and checks that every id arrives. */
+  @Test
+  public void testStreamFromRowTrackedTableBasic(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')");
+
+    Dataset<Row> v2Stream = spark.readStream().table(str("dsv2.delta.`%s`", tablePath));
+    assertTrue(v2Stream.isStreaming());
+
+    List<Row> received = processStreamingQuery(v2Stream, "rt_basic");
+    assertEquals(3, received.size());
+    Set<Long> seenIds = new HashSet<>();
+    received.forEach(row -> seenIds.add(row.getLong(0)));
+    assertEquals(Set.of(1L, 2L, 3L), seenIds);
+
+    // V1 vs V2 streaming parity: same projection, sort by id, compare.
+    assertV1V2StreamingParity(tablePath, "rt_basic", /* projection= */ null);
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 2: project _metadata.row_id — verify ids stable across batches
+ // ---------------------------------------------------------------------------
+
+  /** Projects {@code _metadata.row_id} through a DSv2 stream and expects row ids 0, 1 and 2. */
+  @Test
+  public void testStreamProjectsRowId(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')");
+
+    Dataset<Row> v2Stream =
+        spark
+            .readStream()
+            .table(str("dsv2.delta.`%s`", tablePath))
+            .selectExpr("id", "_metadata.row_id AS row_id");
+
+    List<Row> received = processStreamingQuery(v2Stream, "rt_row_id");
+    assertEquals(3, received.size());
+
+    // Key by id first so a duplicated id collapses and fails the set comparison below.
+    Map<Long, Long> rowIdById = new HashMap<>();
+    received.forEach(row -> rowIdById.put(row.getLong(0), row.getLong(1)));
+    Set<Long> distinctRowIds = new HashSet<>(rowIdById.values());
+    assertEquals(Set.of(0L, 1L, 2L), distinctRowIds, "Expected row_ids 0,1,2");
+
+    assertV1V2StreamingParity(
+        tablePath, "rt_row_id", df -> df.selectExpr("id", "_metadata.row_id AS row_id"));
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 3: project _metadata.row_commit_version — verify monotonic
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Inserts three rows in three separate commits and checks that the DSv2 stream reports {@code
+   * _metadata.row_commit_version} 1, 2 and 3 for them respectively.
+   */
+  @Test
+  public void testStreamProjectsRowCommitVersionMonotonic(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    for (String values : new String[] {"(1, 'Alice')", "(2, 'Bob')", "(3, 'Charlie')"}) {
+      insert(tablePath, values);
+    }
+
+    Dataset<Row> v2Stream =
+        spark
+            .readStream()
+            .table(str("dsv2.delta.`%s`", tablePath))
+            .selectExpr("id", "_metadata.row_commit_version AS rcv");
+
+    List<Row> received = processStreamingQuery(v2Stream, "rt_rcv_monotonic");
+    assertEquals(3, received.size());
+    Map<Long, Long> commitVersionById = new HashMap<>();
+    received.forEach(row -> commitVersionById.put(row.getLong(0), row.getLong(1)));
+
+    // Each insert is its own commit (1, 2, 3). Row tracking commit version should reflect that.
+    assertEquals(1L, commitVersionById.get(1L), "Alice was inserted in commit 1");
+    assertEquals(2L, commitVersionById.get(2L), "Bob was inserted in commit 2");
+    assertEquals(3L, commitVersionById.get(3L), "Charlie was inserted in commit 3");
+
+    assertV1V2StreamingParity(
+        tablePath,
+        "rt_rcv_monotonic",
+        df -> df.selectExpr("id", "_metadata.row_commit_version AS rcv"));
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 4: row tracking × Trigger.AvailableNow
+ // ---------------------------------------------------------------------------
+
+ @Test
+ public void testRowTrackingWithAvailableNowTrigger(@TempDir File tempDir) throws Exception {
+ String tablePath = tempDir.getAbsolutePath();
+ createRowTrackedTable(tablePath);
+ insert(tablePath, "(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')");
+
+ String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+ File checkpointDir = new File(tempDir, "_checkpoint");
+ String memoryName = "rt_avail_now";
+
+ Dataset streamingDF =
+ spark
+ .readStream()
+ .table(dsv2TableRef)
+ .selectExpr("id", "_metadata.row_id AS row_id", "_metadata.row_commit_version AS rcv");
+
+ StreamingQuery query =
+ streamingDF
+ .writeStream()
+ .format("memory")
+ .queryName(memoryName)
+ .outputMode("append")
+ .option("checkpointLocation", checkpointDir.getAbsolutePath())
+ .trigger(Trigger.AvailableNow())
+ .start();
+ try {
+ query.awaitTermination();
+ } finally {
+ query.stop();
+ DeltaLog.clearCache();
+ }
+
+ List rows = spark.sql("SELECT * FROM " + memoryName).collectAsList();
+ assertEquals(3, rows.size(), "AvailableNow should drain all available rows");
+ Set rowIds = new HashSet<>();
+ for (Row r : rows) {
+ rowIds.add(r.getLong(1));
+ }
+ assertEquals(Set.of(0L, 1L, 2L), rowIds);
+
+ // V1 parity: run the same AvailableNow stream through DSv1 with a separate checkpoint,
+ // then compare the collected rows (sorted by id).
+ File v1CheckpointDir = new File(tempDir, "_checkpoint_v1");
+ String v1MemoryName = "rt_avail_now_v1";
+ Dataset v1StreamingDF =
+ spark
+ .readStream()
+ .format("delta")
+ .load(tablePath)
+ .selectExpr("id", "_metadata.row_id AS row_id", "_metadata.row_commit_version AS rcv");
+ StreamingQuery v1Query =
+ v1StreamingDF
+ .writeStream()
+ .format("memory")
+ .queryName(v1MemoryName)
+ .outputMode("append")
+ .option("checkpointLocation", v1CheckpointDir.getAbsolutePath())
+ .trigger(Trigger.AvailableNow())
+ .start();
+ try {
+ v1Query.awaitTermination();
+ } finally {
+ v1Query.stop();
+ DeltaLog.clearCache();
+ }
+ List v1Rows = spark.sql("SELECT * FROM " + v1MemoryName).collectAsList();
+ assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_avail_now");
+ }
+
+ // ---------------------------------------------------------------------------
+ // Case 5: row tracking × restart — start, stop, append, restart, verify row_id consistency
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Streams a row-tracked table into a parquet sink via DSv2, stops, appends two rows, restarts
+   * from the same checkpoint, and checks the row_id written for each id. A DSv1 stream is then
+   * run and restarted against its own checkpoint and its output is compared with the DSv2 output.
+   */
+  @Test
+  public void testRowTrackingAcrossStreamRestart(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob')");
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    File checkpointDir = new File(tempDir, "_checkpoint");
+    File outputDir = new File(tempDir, "_out");
+
+    // Parquet sink (instead of memory sink) so checkpoint recovery is supported across restart.
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .table(dsv2TableRef)
+            .selectExpr("id", "_metadata.row_id AS row_id", "_metadata.row_commit_version AS rcv");
+
+    StreamingQuery q1 =
+        streamingDF
+            .writeStream()
+            .format("parquet")
+            .outputMode("append")
+            .option("path", outputDir.getAbsolutePath())
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q1.processAllAvailable();
+    } finally {
+      q1.stop();
+    }
+
+    // Append 2 more rows, restart the same stream from checkpoint.
+    insert(tablePath, "(3, 'Charlie'), (4, 'Dave')");
+
+    StreamingQuery q2 =
+        streamingDF
+            .writeStream()
+            .format("parquet")
+            .outputMode("append")
+            .option("path", outputDir.getAbsolutePath())
+            .option("checkpointLocation", checkpointDir.getAbsolutePath())
+            .start();
+    try {
+      q2.processAllAvailable();
+    } finally {
+      q2.stop();
+      DeltaLog.clearCache();
+    }
+
+    List<Row> rows = spark.read().parquet(outputDir.getAbsolutePath()).collectAsList();
+    assertEquals(4, rows.size(), "All 4 rows should arrive across restart");
+    // Column 0 is id, column 1 is row_id (see the selectExpr above).
+    Map<Long, Long> idToRowId = new HashMap<>();
+    for (Row r : rows) {
+      idToRowId.put(r.getLong(0), r.getLong(1));
+    }
+    // Row tracking row_ids should be unique and continue from high watermark across restart.
+    assertEquals(0L, idToRowId.get(1L));
+    assertEquals(1L, idToRowId.get(2L));
+    assertEquals(2L, idToRowId.get(3L));
+    assertEquals(3L, idToRowId.get(4L));
+
+    // V1 parity: drive a separate DSv1 stream with its own checkpoint + output dir, restart it
+    // across the same gap, then assert V1's full output equals V2's full output.
+    File v1CheckpointDir = new File(tempDir, "_checkpoint_v1");
+    File v1OutputDir = new File(tempDir, "_out_v1");
+    Dataset<Row> v1StreamingDF =
+        spark
+            .readStream()
+            .format("delta")
+            .load(tablePath)
+            .selectExpr("id", "_metadata.row_id AS row_id", "_metadata.row_commit_version AS rcv");
+    // V1 first run: should consume rows present at start of stream. Because we already appended
+    // (3,4) above, V1's "first run" will see all 4 rows in the first batch. That is fine - we
+    // only assert the FINAL union after both runs.
+    StreamingQuery v1q1 =
+        v1StreamingDF
+            .writeStream()
+            .format("parquet")
+            .outputMode("append")
+            .option("path", v1OutputDir.getAbsolutePath())
+            .option("checkpointLocation", v1CheckpointDir.getAbsolutePath())
+            .start();
+    try {
+      v1q1.processAllAvailable();
+    } finally {
+      v1q1.stop();
+    }
+    // Restart against the same checkpoint - there is nothing more to consume, so this is a no-op
+    // that exercises the checkpoint recovery code path.
+    StreamingQuery v1q2 =
+        v1StreamingDF
+            .writeStream()
+            .format("parquet")
+            .outputMode("append")
+            .option("path", v1OutputDir.getAbsolutePath())
+            .option("checkpointLocation", v1CheckpointDir.getAbsolutePath())
+            .start();
+    try {
+      v1q2.processAllAvailable();
+    } finally {
+      v1q2.stop();
+      DeltaLog.clearCache();
+    }
+    List<Row> v1Rows = spark.read().parquet(v1OutputDir.getAbsolutePath()).collectAsList();
+    assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_restart");
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 6: row tracking × DV — DELETE rows, stream, verify surviving row_ids unchanged
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Deletes the even ids from a 1000-row table that has row tracking and deletion vectors
+   * enabled, then checks that a DSv2 stream returns only the odd ids with row_id equal to id, and
+   * that a DSv1 stream with {@code ignoreDeletes} produces the same rows.
+   */
+  @Test
+  public void testRowTrackingWithDeletionVectorsPreservesIds(@TempDir File tempDir)
+      throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id LONG, name STRING) USING delta TBLPROPERTIES "
+                + "('delta.enableRowTracking' = 'true', 'delta.enableDeletionVectors' = 'true')",
+            tablePath));
+    spark
+        .range(1000)
+        .selectExpr("id", "cast(id as string) as name")
+        .write()
+        .format("delta")
+        .mode("append")
+        .save(tablePath);
+
+    spark.sql(str("DELETE FROM delta.`%s` WHERE id %% 2 = 0", tablePath));
+
+    // Guard: the DELETE must have produced deletion vectors, otherwise the rest of the test
+    // would not exercise the DV read path.
+    DeltaLog deltaLog = DeltaLog.forTable(spark, tablePath);
+    long numDVs =
+        (long)
+            deltaLog
+                .update(false, Option.empty(), Option.empty())
+                .numDeletionVectorsOpt()
+                .getOrElse(() -> 0L);
+    assertTrue(numDVs > 0, "Expected DVs to be created");
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    Dataset<Row> streamingDF =
+        spark.readStream().table(dsv2TableRef).selectExpr("id", "_metadata.row_id AS row_id");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "rt_dv");
+    assertEquals(500, rows.size(), "Expected only odd ids (DV-filtered)");
+    for (Row r : rows) {
+      long id = r.getLong(0);
+      long rowId = r.getLong(1);
+      assertEquals(1L, id % 2, "Only odd IDs should survive deletion");
+      // With stable physical row positions row_id == id for the initial single-file insert.
+      assertEquals(id, rowId, "row_id should be preserved across DV-filtered streaming reads");
+    }
+
+    // V1 parity: DELETE produces a non-append commit, so V1 streaming requires ignoreChanges
+    // (or ignoreDeletes) to consume past it. The initial snapshot read still sees only surviving
+    // rows; we project the same columns and compare.
+    Dataset<Row> v1StreamingDF =
+        spark
+            .readStream()
+            .format("delta")
+            .option("ignoreDeletes", "true")
+            .load(tablePath)
+            .selectExpr("id", "_metadata.row_id AS row_id");
+    List<Row> v1Rows = processStreamingQuery(v1StreamingDF, "rt_dv_v1");
+    assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_dv");
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 7: row tracking × column mapping (combine top-2 cross products)
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Streams a table that has both row tracking and name-mode column mapping enabled through DSv2,
+   * checks that the three inserted rows carry row_ids {0, 1, 2}, and then checks DSv1/DSv2
+   * parity for the same projection.
+   */
+  @Test
+  public void testRowTrackingWithColumnMapping(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    spark.sql(
+        str(
+            "CREATE TABLE delta.`%s` (id LONG, name STRING) USING delta TBLPROPERTIES "
+                + "('delta.enableRowTracking' = 'true', 'delta.columnMapping.mode' = 'name', "
+                + "'delta.minReaderVersion' = '2', 'delta.minWriterVersion' = '5')",
+            tablePath));
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')");
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .table(dsv2TableRef)
+            .selectExpr("id", "name", "_metadata.row_id AS row_id");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "rt_cm");
+    assertEquals(3, rows.size());
+    // Column 0 is id, column 2 is row_id (column 1 is name).
+    Map<Long, Long> idToRowId = new HashMap<>();
+    for (Row r : rows) {
+      idToRowId.put(r.getLong(0), r.getLong(2));
+    }
+    assertEquals(Set.of(0L, 1L, 2L), new HashSet<>(idToRowId.values()));
+
+    assertV1V2StreamingParity(
+        tablePath, "rt_cm", df -> df.selectExpr("id", "name", "_metadata.row_id AS row_id"));
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 8: row tracking × INSERT OVERWRITE — should row_ids change for rewritten rows?
+ //
+ // Per Delta semantics, INSERT OVERWRITE is a logical replacement. We make a streaming
+ // query consume the table after the overwrite and verify the rows produced by the stream
+ // (an initial snapshot read) carry the post-overwrite row_ids from the high watermark.
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Inserts two rows, overwrites them with two new rows, and checks that a DSv2 stream started
+   * afterwards sees only the new rows, each with a distinct row_id of at least 2. A DSv1 stream
+   * with {@code ignoreChanges} must produce the same rows.
+   */
+  @Test
+  public void testRowTrackingWithInsertOverwrite(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob')");
+    // Overwrite — replaces all data; new rows should get fresh row_ids continuing from watermark.
+    spark.sql(str("INSERT OVERWRITE TABLE delta.`%s` VALUES (10, 'X'), (20, 'Y')", tablePath));
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    Dataset<Row> streamingDF =
+        spark.readStream().table(dsv2TableRef).selectExpr("id", "_metadata.row_id AS row_id");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "rt_overwrite");
+    assertEquals(2, rows.size(), "Stream should see only post-overwrite rows");
+    Set<Long> ids = new HashSet<>();
+    Set<Long> rowIds = new HashSet<>();
+    for (Row r : rows) {
+      ids.add(r.getLong(0));
+      rowIds.add(r.getLong(1));
+    }
+    assertEquals(Set.of(10L, 20L), ids, "Only overwritten ids should be visible");
+    assertEquals(2, rowIds.size(), "row_ids should be unique per surviving row");
+    // The two original rows were assigned row_ids {0, 1}, so the rows written by the overwrite
+    // must carry row_ids >= 2. Only the lower bound is asserted, not the exact values.
+    for (long rid : rowIds) {
+      assertTrue(rid >= 2L, "Expected row_id >= 2 after overwrite; got " + rid);
+    }
+
+    // V1 parity: INSERT OVERWRITE is a non-append commit; DSv1 streaming requires
+    // ignoreChanges to start a fresh stream after such a commit. Snapshot read sees only the
+    // post-overwrite rows.
+    Dataset<Row> v1StreamingDF =
+        spark
+            .readStream()
+            .format("delta")
+            .option("ignoreChanges", "true")
+            .load(tablePath)
+            .selectExpr("id", "_metadata.row_id AS row_id");
+    List<Row> v1Rows = processStreamingQuery(v1StreamingDF, "rt_overwrite_v1");
+    assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_overwrite");
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 9: row tracking × MERGE → ignoreChanges, verify row_id preserved across reused files
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Records row_ids with a batch query, runs a MERGE that updates id=1 and inserts id=99, then
+   * streams the table through DSv2 with {@code ignoreChanges} and checks that the untouched rows
+   * (id=2, id=3) kept their row_ids. A DSv1 stream with the same option must produce the same
+   * rows.
+   */
+  @Test
+  public void testRowTrackingWithMergeIgnoreChanges(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    createRowTrackedTable(tablePath);
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')");
+
+    // Capture pre-MERGE row_ids via a batch query.
+    List<Row> beforeRows =
+        spark
+            .sql(
+                str(
+                    "SELECT id, _metadata.row_id AS row_id FROM dsv2.delta.`%s` ORDER BY id",
+                    tablePath))
+            .collectAsList();
+    Map<Long, Long> beforeIdToRowId = new HashMap<>();
+    for (Row r : beforeRows) {
+      beforeIdToRowId.put(r.getLong(0), r.getLong(1));
+    }
+
+    // Set up a source for MERGE.
+    spark.sql("DROP VIEW IF EXISTS rt_merge_src");
+    spark
+        .sql("SELECT 1L AS id, 'ALICE' AS name UNION ALL SELECT 99L AS id, 'New' AS name")
+        .createOrReplaceTempView("rt_merge_src");
+
+    spark.sql(
+        str(
+            "MERGE INTO delta.`%s` t USING rt_merge_src s ON t.id = s.id "
+                + "WHEN MATCHED THEN UPDATE SET name = s.name "
+                + "WHEN NOT MATCHED THEN INSERT (id, name) VALUES (s.id, s.name)",
+            tablePath));
+
+    // ignoreChanges allows streaming over a table with non-append commits.
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    Dataset<Row> streamingDF =
+        spark
+            .readStream()
+            .option("ignoreChanges", "true")
+            .table(dsv2TableRef)
+            .selectExpr("id", "_metadata.row_id AS row_id");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "rt_merge_ignore");
+    Map<Long, Long> afterIdToRowId = new HashMap<>();
+    for (Row r : rows) {
+      afterIdToRowId.put(r.getLong(0), r.getLong(1));
+    }
+    // Rows id=2 and id=3 were untouched in MERGE (Bob, Charlie). If MERGE rewrites the file
+    // (file-rewrite path), row tracking must preserve their original row_ids.
+    assertTrue(afterIdToRowId.containsKey(2L), "Bob (id=2) should still be present");
+    assertTrue(afterIdToRowId.containsKey(3L), "Charlie (id=3) should still be present");
+    assertEquals(
+        beforeIdToRowId.get(2L),
+        afterIdToRowId.get(2L),
+        "Bob's row_id should be preserved across MERGE rewrites");
+    assertEquals(
+        beforeIdToRowId.get(3L),
+        afterIdToRowId.get(3L),
+        "Charlie's row_id should be preserved across MERGE rewrites");
+
+    // V1 parity: same option + same projection.
+    Dataset<Row> v1StreamingDF =
+        spark
+            .readStream()
+            .format("delta")
+            .option("ignoreChanges", "true")
+            .load(tablePath)
+            .selectExpr("id", "_metadata.row_id AS row_id");
+    List<Row> v1Rows = processStreamingQuery(v1StreamingDF, "rt_merge_ignore_v1");
+    assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_merge_ignore");
+  }
+
+ // ---------------------------------------------------------------------------
+ // Case 10: row tracking on a snapshot started without it (enabled later)
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Creates a table without row tracking, inserts two rows, enables row tracking via ALTER TABLE,
+   * inserts a third row, and checks that a DSv2 stream returns all three rows with distinct
+   * row_ids. A DSv1 stream with {@code ignoreChanges} must produce the same rows.
+   */
+  @Test
+  public void testRowTrackingEnabledAfterTableCreate(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    spark.sql(str("CREATE TABLE delta.`%s` (id LONG, name STRING) USING delta", tablePath));
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob')");
+    // Enable row tracking after data is already present.
+    spark.sql(
+        str(
+            "ALTER TABLE delta.`%s` SET TBLPROPERTIES ('delta.enableRowTracking' = 'true')",
+            tablePath));
+    insert(tablePath, "(3, 'Charlie')");
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    Dataset<Row> streamingDF =
+        spark.readStream().table(dsv2TableRef).selectExpr("id", "_metadata.row_id AS row_id");
+
+    List<Row> rows = processStreamingQuery(streamingDF, "rt_late_enable");
+    assertEquals(3, rows.size(), "All rows should arrive even though RT was enabled mid-life");
+    // Only uniqueness is asserted; the concrete row_id values are not pinned here.
+    Set<Long> rowIds = new HashSet<>();
+    for (Row r : rows) {
+      rowIds.add(r.getLong(1));
+    }
+    assertEquals(3, rowIds.size(), "row_ids should be unique across the whole table");
+
+    // V1 parity: ALTER TABLE produces a non-append (metadata) commit; use ignoreChanges so
+    // V1 streaming can replay past it.
+    Dataset<Row> v1StreamingDF =
+        spark
+            .readStream()
+            .format("delta")
+            .option("ignoreChanges", "true")
+            .load(tablePath)
+            .selectExpr("id", "_metadata.row_id AS row_id");
+    List<Row> v1Rows = processStreamingQuery(v1StreamingDF, "rt_late_enable_v1");
+    assertRowsEqualSortedByFirstCol(v1Rows, rows, "rt_late_enable");
+  }
+
+ // ---------------------------------------------------------------------------
+ // Sanity: project _metadata struct on a non-RT table through DSv2 streaming
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Checks that projecting {@code _metadata.row_id} from a streaming read of a table without row
+   * tracking throws an {@code AnalysisException} whose message mentions {@code row_id} or {@code
+   * _metadata}, for both the DSv2 and the DSv1 read path.
+   */
+  @Test
+  public void testStreamMetadataStructOnNonRowTrackedTable(@TempDir File tempDir) throws Exception {
+    String tablePath = tempDir.getAbsolutePath();
+    spark.sql(str("CREATE TABLE delta.`%s` (id LONG, name STRING) USING delta", tablePath));
+    insert(tablePath, "(1, 'Alice'), (2, 'Bob')");
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+    // Accessing _metadata.row_id on a non-RT table must surface a clear analysis error.
+    AnalysisException ex =
+        assertThrows(
+            AnalysisException.class,
+            () -> spark.readStream().table(dsv2TableRef).selectExpr("_metadata.row_id AS row_id"));
+    // Locale.ROOT keeps the case-folding independent of the JVM default locale (e.g. Turkish,
+    // where 'I' lower-cases to a dotless i and "ROW_ID" would no longer match "row_id").
+    String v2Msg = ex.getMessage().toLowerCase(java.util.Locale.ROOT);
+    assertTrue(
+        v2Msg.contains("row_id") || v2Msg.contains("_metadata"),
+        "Expected analysis error mentioning row_id or _metadata; got: " + ex.getMessage());
+
+    // V1 parity: DSv1 streaming must also reject _metadata.row_id on a non-row-tracked table.
+    AnalysisException v1Ex =
+        assertThrows(
+            AnalysisException.class,
+            () ->
+                spark
+                    .readStream()
+                    .format("delta")
+                    .load(tablePath)
+                    .selectExpr("_metadata.row_id AS row_id"));
+    String v1Msg = v1Ex.getMessage().toLowerCase(java.util.Locale.ROOT);
+    assertTrue(
+        v1Msg.contains("row_id") || v1Msg.contains("_metadata"),
+        "V1: expected analysis error mentioning row_id or _metadata; got: " + v1Ex.getMessage());
+  }
+
+ // ---------------------------------------------------------------------------
+ // Helpers
+ // ---------------------------------------------------------------------------
+
+  /**
+   * Issues a CREATE TABLE for a path-based Delta table with columns {@code (id LONG, name
+   * STRING)} and the {@code delta.enableRowTracking} table property set to {@code true}.
+   *
+   * @param path value substituted into the {@code delta.`%s`} table identifier
+   */
+  private void createRowTrackedTable(String path) {
+    String ddl =
+        str(
+            "CREATE TABLE delta.`%s` (id LONG, name STRING) USING delta "
+                + "TBLPROPERTIES ('delta.enableRowTracking' = 'true')",
+            path);
+    spark.sql(ddl);
+  }
+
+  /**
+   * Issues an {@code INSERT INTO ... VALUES} statement against a path-based Delta table.
+   *
+   * @param path value substituted into the {@code delta.`%s`} table identifier
+   * @param values SQL text placed after {@code VALUES}, e.g. {@code "(1, 'Alice'), (2, 'Bob')"}
+   */
+  private void insert(String path, String values) {
+    String statement = str("INSERT INTO delta.`%s` VALUES %s", path, values);
+    spark.sql(statement);
+  }
+
+  /**
+   * Runs the same streaming projection against DSv1 (file path) and DSv2 (catalog table) and
+   * asserts the produced rows match (sorted by the first column's toString).
+   *
+   * <p>Use this for tests whose underlying table has only append commits - i.e., V1 streaming can
+   * consume it without {@code ignoreChanges}/{@code ignoreDeletes}. For non-append-only tables
+   * (DV/overwrite/merge/alter), inline a V1 stream with the appropriate option instead.
+   *
+   * @param tablePath path of the Delta table; loaded directly for DSv1 and referenced as {@code
+   *     dsv2.delta.`<tablePath>`} for DSv2
+   * @param tag label used in the failure message; {@code tag + "_v1"} and {@code tag + "_v2"} are
+   *     passed to {@code processStreamingQuery}
+   * @param projection transformation applied to both streaming DataFrames before they are run;
+   *     {@code null} compares the unprojected sources
+   */
+  private void assertV1V2StreamingParity(
+      String tablePath, String tag, Function<Dataset<Row>, Dataset<Row>> projection)
+      throws Exception {
+    Dataset<Row> v1 = spark.readStream().format("delta").load(tablePath);
+    if (projection != null) {
+      v1 = projection.apply(v1);
+    }
+    List<Row> v1Rows = processStreamingQuery(v1, tag + "_v1");
+
+    Dataset<Row> v2 = spark.readStream().table(str("dsv2.delta.`%s`", tablePath));
+    if (projection != null) {
+      v2 = projection.apply(v2);
+    }
+    List<Row> v2Rows = processStreamingQuery(v2, tag + "_v2");
+
+    assertRowsEqualSortedByFirstCol(v1Rows, v2Rows, tag);
+  }
+
+  /**
+   * Asserts that two row lists contain the same rows, ignoring the order in which they arrived.
+   *
+   * <p>Both lists are copied (the inputs are not modified), sorted by the string form of the
+   * first column, and compared through their {@code toString()}. Rows whose first columns are
+   * equal are further ordered by the string form of the whole row, so the comparison does not
+   * depend on arrival order when the first column is not unique.
+   *
+   * @param v1Rows rows produced by the DSv1 stream
+   * @param v2Rows rows produced by the DSv2 stream
+   * @param tag label prepended to the failure message
+   */
+  private static void assertRowsEqualSortedByFirstCol(
+      List<Row> v1Rows, List<Row> v2Rows, String tag) {
+    List<Row> v1Sorted = new ArrayList<>(v1Rows);
+    List<Row> v2Sorted = new ArrayList<>(v2Rows);
+    Comparator<Row> byFirstCol =
+        Comparator.comparing((Row r) -> String.valueOf(r.get(0)))
+            .thenComparing((Row r) -> String.valueOf(r));
+    v1Sorted.sort(byFirstCol);
+    v2Sorted.sort(byFirstCol);
+    assertEquals(
+        v1Sorted.toString(), v2Sorted.toString(), tag + ": V1 vs V2 streaming row mismatch");
+  }
+}
diff --git a/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingSchemaRejectionTest.java b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingSchemaRejectionTest.java
new file mode 100644
index 00000000000..2dd126146b5
--- /dev/null
+++ b/spark/v2/src/test/java/io/delta/spark/internal/v2/V2StreamingSchemaRejectionTest.java
@@ -0,0 +1,431 @@
+/*
+ * Copyright (2025) The Delta Lake Project Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.delta.spark.internal.v2;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicReference;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.streaming.StreamingQuery;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructType;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * DSv2 mirrors of DSv1 streaming schema-rejection / CREATE-source DDL tests in {@code
+ * DeltaSourceSuite.scala} (lines 96-220).
+ *
+ * <p>Each test corresponds 1:1 to a DSv1 case and asserts the *current* DSv2 behavior, with a
+ * result classification embedded in the javadoc:
+ *
+ * <ul>
+ *   <li>PASS — DSv2 behaves like DSv1 (the assertion captures the matching contract).
+ *   <li>FAIL — DSv2 differs from DSv1 (the assertion captures the divergent DSv2 behavior; the
+ *       comment names the parity gap and the DSv1 contract that's missing).
+ *   <li>CANT-CONSTRUCT — the DSv1 input cannot be expressed via the DSv2 catalog API.
+ * </ul>
+ *
+ * <p>The tests are written to pass against current DSv2 so they serve as a regression suite — if
+ * DSv2 ever starts matching DSv1, the differing tests will fail and force re-classification.
+ */
+public class V2StreamingSchemaRejectionTest extends V2TestBase {
+
+  /**
+   * Mirrors DSv1 {@code "streaming delta source should not drop null columns"} (line 96).
+   *
+   * <p>DSv1 contract: with {@code DELTA_STREAMING_CREATE_DATAFRAME_DROP_NULL_COLUMNS = false}, the
+   * source preserves a {@code VOID}/NullType column and the stream proceeds (the user's {@code
+   * .drop("nullTypeCol")} succeeds before the writer sees the schema).
+   *
+   * <p>Classification: FAIL — parity gap. DSv2 cannot even load a table that contains
+   * a VOID column: {@code SparkTable} eagerly calls {@code snapshot.getSchema()} from Kernel, which
+   * raises {@code KernelException("Failed to parse the schema. Encountered unsupported Delta data
+   * type: VOID")}. So the test never reaches the streaming entrypoint, let alone exercises the
+   * DROP_NULL_COLUMNS feature flag. The DSv1 "no drop" contract is unobservable on DSv2.
+   *
+   * <p>Bug shape: Kernel-backed DSv2 catalog rejects any Delta table whose committed schema
+   * mentions VOID/NullType, even though such tables are valid in DSv1 (Spark wrote them). This is
+   * also a stronger blocker than DSv1: a user with a pre-existing table containing a VOID column
+   * cannot read it via DSv2 at all.
+   */
+  @Test
+  public void testCase1_streamingShouldNotDropNullColumns_v1Flag_false(
+      @TempDir File sourceDir, @TempDir File sinkDir, @TempDir File checkpointDir) {
+    String sourcePath = sourceDir.getAbsolutePath();
+    // Both legs toggle the same flag, so they share one key. The DSv2 leg fails at table load
+    // whatever the value is; setting it there only mirrors the DSv1 test setup.
+    // NOTE(review): key assumed to be DELTA_STREAMING_CREATE_DATAFRAME_DROP_NULL_COLUMNS.key -
+    // confirm against DeltaSQLConf.
+    String dropNullColumnsConf =
+        "spark.databricks.delta.streaming.createDataFrame.dropNullColumns";
+
+    spark
+        .sql("select CAST(null as VOID) as nullTypeCol, id from range(10)")
+        .write()
+        .format("delta")
+        .mode("append")
+        .save(sourcePath);
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", sourcePath);
+
+    // ---- DSv2 leg: KernelException at loadTable. ----
+    AtomicReference<Throwable> caught = new AtomicReference<>();
+    withSQLConf(
+        dropNullColumnsConf,
+        "false",
+        () -> {
+          try {
+            spark.readStream().table(dsv2TableRef);
+          } catch (Throwable t) {
+            caught.set(t);
+          }
+        });
+
+    assertNotNull(caught.get(), "DSv2 should currently fail to even load a VOID-bearing table.");
+    String msg = unwrapMessages(caught.get());
+    assertTrue(
+        msg.contains("VOID") && msg.contains("unsupported Delta data type"),
+        "Expected Kernel 'unsupported Delta data type: VOID' error, got: " + msg);
+
+    // ---- DSv1 leg (documents the divergence): with the flag = false, DSv1 preserves the VOID
+    // column and `.drop("nullTypeCol")` succeeds before the writer sees the schema; the stream
+    // runs to completion.
+    withSQLConf(
+        dropNullColumnsConf,
+        "false",
+        () ->
+            assertDoesNotThrow(
+                () -> {
+                  Dataset<Row> v1Df =
+                      spark.readStream().format("delta").load(sourcePath).drop("nullTypeCol");
+                  StreamingQuery q =
+                      v1Df.writeStream()
+                          .option("checkpointLocation", checkpointDir.getAbsolutePath())
+                          .format("delta")
+                          .start(sinkDir.getAbsolutePath());
+                  try {
+                    q.processAllAvailable();
+                  } finally {
+                    q.stop();
+                  }
+                },
+                "DSv1 with flag=false should run to completion (VOID column preserved, dropped by "
+                    + "user)."));
+  }
+
+  /**
+   * Mirrors DSv1 {@code "streaming delta source should drop null columns without feature flag"}
+   * (line 100).
+   *
+   * <p>DSv1 contract: with the flag {@code true}, the source materializes the VOID column away; the
+   * user's later {@code .drop("nullTypeCol")} fails because the column is already gone, surfacing
+   * as {@code STREAM_FAILED} with {@code "assertion failed: Invalid batch: nullTypeCol"}.
+   *
+   * <p>Classification: FAIL — parity gap. Same root cause as Case 1: DSv2 fails at {@code
+   * loadTable} with {@code KernelException("unsupported Delta data type: VOID")} before reaching
+   * the streaming source, so the DROP_NULL_COLUMNS flag (DSv1-only conf) cannot be observed on
+   * DSv2.
+   *
+   * <p>Bug shape: DSv2 misses the DROP_NULL_COLUMNS read-time materialization behavior; the
+   * DSv1 conf {@code DELTA_STREAMING_CREATE_DATAFRAME_DROP_NULL_COLUMNS} is unwired in the DSv2
+   * read path.
+   */
+  @Test
+  public void testCase2_streamingShouldDropNullColumns_v1Flag_true(
+      @TempDir File sourceDir, @TempDir File sinkDir, @TempDir File checkpointDir) {
+    String sourcePath = sourceDir.getAbsolutePath();
+    // Both legs toggle the same flag, so they share one key. The DSv2 leg fails at table load
+    // whatever the value is; setting it there only mirrors the DSv1 test setup.
+    // NOTE(review): key assumed to be DELTA_STREAMING_CREATE_DATAFRAME_DROP_NULL_COLUMNS.key -
+    // confirm against DeltaSQLConf.
+    String dropNullColumnsConf =
+        "spark.databricks.delta.streaming.createDataFrame.dropNullColumns";
+
+    spark
+        .sql("select CAST(null as VOID) as nullTypeCol, id from range(10)")
+        .write()
+        .format("delta")
+        .mode("append")
+        .save(sourcePath);
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", sourcePath);
+
+    // ---- DSv2 leg: KernelException at loadTable. ----
+    AtomicReference<Throwable> caught = new AtomicReference<>();
+    withSQLConf(
+        dropNullColumnsConf,
+        "true",
+        () -> {
+          try {
+            spark.readStream().table(dsv2TableRef);
+          } catch (Throwable t) {
+            caught.set(t);
+          }
+        });
+
+    assertNotNull(caught.get(), "DSv2 should currently fail to even load a VOID-bearing table.");
+    String msg = unwrapMessages(caught.get());
+    assertTrue(
+        msg.contains("VOID") && msg.contains("unsupported Delta data type"),
+        "Expected Kernel 'unsupported Delta data type: VOID' error, got: " + msg);
+
+    // ---- DSv1 leg (documents the divergence): with the flag = true, DSv1 materializes the VOID
+    // column away; the user's later `.drop("nullTypeCol")` fails because the column is already
+    // gone, surfacing as "Invalid batch: nullTypeCol" via StreamingQueryException.
+    Throwable v1Err =
+        assertThrows(
+            Throwable.class,
+            () ->
+                withSQLConf(
+                    dropNullColumnsConf,
+                    "true",
+                    () -> {
+                      Dataset<Row> v1Df =
+                          spark.readStream().format("delta").load(sourcePath).drop("nullTypeCol");
+                      StreamingQuery q = null;
+                      try {
+                        q =
+                            v1Df.writeStream()
+                                .option("checkpointLocation", checkpointDir.getAbsolutePath())
+                                .format("delta")
+                                .start(sinkDir.getAbsolutePath());
+                        q.processAllAvailable();
+                      } catch (Exception e) {
+                        // Rethrow unchecked with the cause attached so the message check below
+                        // can still find the original failure.
+                        throw new RuntimeException(e);
+                      } finally {
+                        if (q != null) {
+                          try {
+                            q.stop();
+                          } catch (Exception ignored) {
+                            // Already failed - best-effort cleanup.
+                          }
+                        }
+                      }
+                    }),
+            "DSv1 with flag=true should fail because the dropped column is already gone.");
+    String v1Msg = unwrapMessages(v1Err);
+    assertTrue(
+        v1Msg.contains("Invalid batch: nullTypeCol"),
+        "Expected DSv1 'Invalid batch: nullTypeCol' error, got: " + v1Msg);
+  }
+
+  /**
+   * Mirrors DSv1 {@code "no schema should throw an exception"} (line 104).
+   *
+   * <p>DSv1 contract: pointing the streaming reader at a directory with an empty {@code _delta_log}
+   * raises {@code AnalysisException} with both {@code "Table schema is not set"} and {@code "CREATE
+   * TABLE"} — a user-actionable error.
+   *
+   * <p>Classification: FAIL — parity gap on exception type and message. DSv2 raises {@code
+   * RuntimeException("Failed to load table: delta.`<path>`")} (wrapping a Kernel error) from {@code
+   * TestCatalog.loadTable}. The exception type is a plain {@code RuntimeException} (not {@code
+   * AnalysisException}) and the message contains neither {@code "Table schema is not set"} nor
+   * {@code "CREATE TABLE"}.
+   *
+   * <p>Bug shape: DSv2 throws a different exception type than DSv1 for the same input; the
+   * message is also less actionable. Note: the {@code "Failed to load table"} prefix is specific to
+   * {@code TestCatalog} (the test catalog used here) — production DSv2 catalogs may surface a
+   * Kernel exception unwrapped, which is a separate, related parity gap.
+   */
+  @Test
+  public void testCase3_noSchemaShouldThrow(@TempDir File inputDir) {
+    // The scenario depends on an empty _delta_log directory existing; fail fast if it could not
+    // be created instead of silently testing a plain empty directory.
+    assertTrue(
+        new File(inputDir, "_delta_log").mkdir(), "Failed to create empty _delta_log directory");
+    String inputPath = inputDir.getAbsolutePath();
+    String dsv2TableRef = str("dsv2.delta.`%s`", inputPath);
+
+    // ---- DSv2 leg: throws, but with a non-actionable message. ----
+    Throwable t =
+        assertThrows(
+            Throwable.class, () -> spark.readStream().table(dsv2TableRef).writeStream().toString());
+
+    String msg = unwrapMessages(t);
+    // DSv2 currently surfaces the test-catalog wrapping message, not the DSv1-style message.
+    assertTrue(
+        msg.contains("Failed to load table"),
+        "Expected DSv2 'Failed to load table' message, got: " + msg);
+    assertFalse(
+        msg.contains("Table schema is not set") && msg.contains("CREATE TABLE"),
+        "DSv2 currently does NOT match DSv1's actionable 'Table schema is not set / CREATE TABLE' "
+            + "message. If this assertion fails, parity has been restored - re-classify Case 3 as "
+            + "PASS.");
+
+    // ---- DSv1 leg: also throws, with the actionable message DSv2 lacks. ----
+    Throwable v1Err =
+        assertThrows(Throwable.class, () -> spark.readStream().format("delta").load(inputPath));
+    String v1Msg = unwrapMessages(v1Err);
+    assertTrue(
+        v1Msg.contains("Table schema is not set") && v1Msg.contains("CREATE TABLE"),
+        "Expected DSv1 'Table schema is not set' + 'CREATE TABLE' error, got: " + v1Msg);
+  }
+
+  /**
+   * Mirrors DSv1 {@code "disallow user specified schema"} (line 116) — the *mismatched* schema
+   * variant.
+   *
+   * <p>DSv1 contract: providing a user schema that differs from the Delta table schema raises
+   * {@code AnalysisException} with {@code "The schema provided for the source read doesn't match
+   * the schema of the Delta table"} (error class {@code DELTA_READ_SOURCE_SCHEMA_CONFLICT}).
+   *
+   * <p>Classification: FAIL — bug found (silent acceptance). DSv2 silently accepts a
+   * mismatched user schema via {@code spark.readStream().schema(userSchema).table(dsv2TableRef)};
+   * no exception is thrown at planning time. This is exactly the bug shape called out in the task
+   * spec: "DSv2 silently accepts a user-specified schema that doesn't match (no error)".
+   *
+   * <p>Whether the divergent schema then causes a runtime read error is a separate question; the
+   * parity gap with DSv1 is at the planning/check stage.
+   */
+  @Test
+  public void testCase4a_disallowUserSchema_mismatched(@TempDir File inputDir) {
+    String tablePath = inputDir.getAbsolutePath();
+    spark.sql(str("CREATE TABLE delta.`%s` (value STRING) USING delta", tablePath));
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    // Deliberately different from the table's single `value STRING` column.
+    StructType userSchema =
+        DataTypes.createStructType(
+            Arrays.asList(
+                DataTypes.createStructField("a", DataTypes.IntegerType, true),
+                DataTypes.createStructField("b", DataTypes.StringType, true)));
+
+    // ---- DSv2 leg: silently accepts the mismatched user schema (BUG). ----
+    Dataset<Row> df =
+        assertDoesNotThrow(
+            () -> spark.readStream().schema(userSchema).table(dsv2TableRef),
+            "DSv2 currently silently accepts a mismatched user-specified schema. If this "
+                + "assertion fails (DSv2 now throws), the parity gap with DSv1's "
+                + "DELTA_READ_SOURCE_SCHEMA_CONFLICT has been closed - re-classify Case 4a as "
+                + "PASS.");
+    assertNotNull(df);
+    assertTrue(df.isStreaming());
+
+    // ---- DSv1 leg: rejects with DELTA_READ_SOURCE_SCHEMA_CONFLICT. ----
+    Throwable v1Err =
+        assertThrows(
+            Throwable.class,
+            () -> spark.readStream().schema(userSchema).format("delta").load(tablePath),
+            "DSv1 should reject a mismatched user-specified schema.");
+    String v1Msg = unwrapMessages(v1Err);
+    assertTrue(
+        v1Msg.contains(
+            "The schema provided for the source read doesn't match the schema of the Delta "
+                + "table"),
+        "Expected DSv1 DELTA_READ_SOURCE_SCHEMA_CONFLICT error, got: " + v1Msg);
+  }
+
+  /**
+   * Mirrors DSv1 {@code "disallow user specified schema"} (line 116) — the *matching* schema
+   * variant.
+   *
+   * <p>DSv1 contract: even when the user-supplied schema *matches* the table schema, the public
+   * {@code spark.readStream.schema(...).format("delta").load(...)} entry point still rejects with
+   * {@code "does not support user-specified schema"}.
+   *
+   * <p>Classification: FAIL — bug found (silent acceptance). DSv2's {@code .table(...)}
+   * entry point silently accepts {@code .schema(...)}; no exception is thrown.
+   *
+   * <p>Note: DSv1 and DSv2 use different DataStreamReader entry points ({@code .load(path)} vs
+   * {@code .table(name)}), so Spark itself routes them differently. The parity question is whether
+   * the *Delta* connector enforces "no user schema" on its DSv2 path; currently it does not.
+   */
+  @Test
+  public void testCase4b_disallowUserSchema_matching(@TempDir File inputDir) {
+    String tablePath = inputDir.getAbsolutePath();
+    spark.sql(str("CREATE TABLE delta.`%s` (value STRING) USING delta", tablePath));
+
+    String dsv2TableRef = str("dsv2.delta.`%s`", tablePath);
+
+    // Same single `value STRING` column as the table created above.
+    StructType matchingSchema =
+        DataTypes.createStructType(
+            Arrays.asList(DataTypes.createStructField("value", DataTypes.StringType, true)));
+
+    // ---- DSv2 leg: silently accepts the user-specified schema (BUG). ----
+    Dataset<Row> df =
+        assertDoesNotThrow(
+            () -> spark.readStream().schema(matchingSchema).table(dsv2TableRef),
+            "DSv2 currently silently accepts a (matching) user-specified schema. If this "
+                + "assertion fails (DSv2 now throws), the parity gap with DSv1's 'does not "
+                + "support user-specified schema' contract has been closed - re-classify Case "
+                + "4b as PASS.");
+    assertNotNull(df);
+    assertTrue(df.isStreaming());
+
+    // ---- DSv1 leg: rejects even a matching user schema via .load(path). ----
+    Throwable v1Err =
+        assertThrows(
+            Throwable.class,
+            () -> spark.readStream().schema(matchingSchema).format("delta").load(tablePath),
+            "DSv1 should reject any user-specified schema on the .load(path) entry point.");
+    String v1Msg = unwrapMessages(v1Err);
+    assertTrue(
+        v1Msg.contains("does not support user-specified schema"),
+        "Expected DSv1 'does not support user-specified schema' error, got: " + v1Msg);
+  }
+
+ /**
+ * Mirrors DSv1 {@code "allow user specified schema if consistent: v1 source"} (line 144).
+ *
+ * DSv1 contract: an internal {@code DataSource(spark, userSpecifiedSchema=Some(...),
+ * className="delta", ...)} succeeds when the user schema matches. This is a Spark-internal
+ * advanced-plugin API.
+ *
+ *