Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ trait DeltaV2SourceSchemaEvolutionSuiteBase extends V2ForceTest {
// ========== Schema evolution scenarios ==========
"consecutive schema evolutions without schema merging",
"consecutive schema evolutions",
"consecutive schema evolutions with protocol-only tail",
"upgrade and downgrade",
"multiple sources with schema evolution",
"schema evolution with Delta sink",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -765,28 +765,44 @@ object DeltaSourceMetadataEvolutionSupport extends Logging {
(!hasFileAction && (metadataAction.isDefined || protocolAction.isDefined),
version, metadataAction, protocolAction)
}.takeWhile(_._1)
DeltaSource.iteratorLast(untilMetadataChange.toClosable)
.flatMap { case (_, version, metadataOpt, protocolOpt) =>
if (version == currentMetadataVersion) {
None
} else {
log.info(s"Looked ahead from version $currentMetadataVersion and " +
s"will use metadata at version $version to read Delta stream.")
Some(
currentMetadata.copy(
deltaCommitVersion = version,
dataSchemaJson =
metadataOpt.map(_.schema.json).getOrElse(currentMetadata.dataSchemaJson),
partitionSchemaJson =
metadataOpt.map(_.partitionSchema.json)
.getOrElse(currentMetadata.partitionSchemaJson),
tableConfigurations = metadataOpt.map(_.configuration)
.orElse(currentMetadata.tableConfigurations),
protocolJson = protocolOpt.map(_.json).orElse(currentMetadata.protocolJson)
)
// Fold the chain so the merged entry tracks the latest Metadata and Protocol seen
// anywhere in the run, not just the last commit's actions -- otherwise a (Metadata,
// Protocol-only) tail would advance deltaCommitVersion while losing the schema change.
var lastVersion: Option[Long] = None
var latestMetadata: Option[Metadata] = None
var latestProtocol: Option[Protocol] = None
val chainIter = untilMetadataChange.toClosable
try {
while (chainIter.hasNext) {
val (_, version, metadataOpt, protocolOpt) = chainIter.next()
lastVersion = Some(version)
metadataOpt.foreach(m => latestMetadata = Some(m))
protocolOpt.foreach(p => latestProtocol = Some(p))
}
} finally {
chainIter.close()
}
lastVersion.flatMap { version =>
if (version == currentMetadataVersion) {
None
} else {
log.info(s"Looked ahead from version $currentMetadataVersion and " +
s"will use metadata at version $version to read Delta stream.")
Some(
currentMetadata.copy(
deltaCommitVersion = version,
dataSchemaJson =
latestMetadata.map(_.schema.json).getOrElse(currentMetadata.dataSchemaJson),
partitionSchemaJson =
latestMetadata.map(_.partitionSchema.json)
.getOrElse(currentMetadata.partitionSchemaJson),
tableConfigurations = latestMetadata.map(_.configuration)
.orElse(currentMetadata.tableConfigurations),
protocolJson = latestProtocol.map(_.json).orElse(currentMetadata.protocolJson)
)
}
)
}
}
}

// scalastyle:off
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import java.nio.charset.Charset
import scala.collection.JavaConverters._
import scala.util.Try

import org.apache.spark.sql.delta.actions.{Metadata, Protocol}
import org.apache.spark.sql.delta.actions.{Action, Metadata, Protocol}
import org.apache.spark.sql.delta.sources._
import org.apache.spark.sql.delta.test.{DeltaColumnMappingSelectedTestMixin, DeltaSQLCommandTest}
import org.apache.spark.sql.delta.test.DeltaTestImplicits._
import org.apache.spark.sql.delta.util.JsonUtils
import org.apache.commons.io.FileUtils
import org.apache.commons.lang3.exception.ExceptionUtils
Expand All @@ -49,6 +50,7 @@ trait StreamingSchemaEvolutionSuiteBase extends ColumnMappingStreamingTestUtils
"trigger.Once with deferred commit should work",
"trigger.AvailableNow should work",
"consecutive schema evolutions",
"consecutive schema evolutions with protocol-only tail",
"latestOffset should not progress before schema evolved"
)

Expand Down Expand Up @@ -1521,6 +1523,51 @@ trait StreamingSchemaEvolutionSuiteBase extends ColumnMappingStreamingTestUtils
)
}

testSchemaEvolution("consecutive schema evolutions with protocol-only tail") {
  implicit log =>
    // Scenario: a run of metadata commits that finishes with a commit carrying only a
    // Protocol action. The starting schema, the schema persisted as currentMetadata, and
    // the final schema are all different from each other, so a faulty merger cannot pass
    // by coincidence.
    val startVersion = log.update().version // schema here: <a, b>
    renameColumn("b", "c") // <a, c> -- what the schema log holds when the merger runs
    addColumn("d") // <a, c, d>
    dropColumn("c") // <a, d> -- schema of the final snapshot
    val upgradedProtocol = log.update().protocol.merge(
      Action.supportedProtocolVersion(featuresToExclude = Seq(CatalogOwnedTableFeature)))
    log.upgradeProtocol(upgradedProtocol) // the protocol-only commit ending the chain
    val tailVersion = log.update().version
    addData(5 until 6) // a file action, which bounds the merger chain

    // Re-evaluated on every reference so that each use builds a fresh streaming read.
    def sourceDf: DataFrame = readStream(
      schemaLocation = Some(getDefaultSchemaLocation.toString),
      startingVersion = Some(startVersion))

    // Re-reads the schema log on every call, exactly like spelling the chain out inline.
    def persistedEntry = getDefaultSchemaLog().getLatestMetadata.get

    // Run 1: the schema log gets initialized at the starting version with <a, b>.
    testStream(sourceDf)(
      StartStream(checkpointLocation = getDefaultCheckpoint.toString),
      ProcessAllAvailableIgnoreError,
      ExpectMetadataEvolutionExceptionFromInitialization
    )
    assert(persistedEntry.deltaCommitVersion == startVersion)
    assert(persistedEntry.dataSchema.fieldNames.sameElements(Array("a", "b")))

    // Run 2: the first schema change (one commit after the start) is persisted as <a, c>
    // and the stream fails with the metadata evolution exception.
    testStream(sourceDf)(
      StartStream(checkpointLocation = getDefaultCheckpoint.toString),
      ProcessAllAvailableIgnoreError,
      CheckAnswer(Seq("4").map(value => (value, value)): _*),
      ExpectMetadataEvolutionException
    )
    assert(persistedEntry.deltaCommitVersion == startVersion + 1)
    assert(persistedEntry.dataSchema.fieldNames.sameElements(Array("a", "c")))

    // Restart: building the DataFrame runs the merger, which should move the entry to the
    // protocol-only tail version while exposing the <a, d> schema.
    val latestDf = sourceDf
    assert(persistedEntry.deltaCommitVersion == tailVersion)
    assert(latestDf.schema.fieldNames.sameElements(Array("a", "d")),
      s"got ${latestDf.schema.fieldNames.toSeq}")
}

testSchemaEvolution("upgrade and downgrade") { implicit log =>
val ckpt = getDefaultCheckpoint.toString
val df = readStream(startingVersion = Some(1))
Expand Down
Loading