Skip to content

Commit b9b051f

Browse files
authored
Merge pull request #835 from slackhq/backport-increment-migration-metrics
Backport vitessio#19297: increment migration metrics
2 parents b0cb4f6 + 3621176 commit b9b051f

3 files changed

Lines changed: 279 additions & 0 deletions

File tree

go/test/endtoend/onlineddl/scheduler/onlineddl_scheduler_test.go

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ import (
4646

4747
const (
4848
anyErrorIndicator = "<any-error-of-any-kind>"
49+
50+
// Migration metric names as exposed via /debug/vars
51+
metricStartedMigrations = "StartedMigrations"
52+
metricSuccessfulMigrations = "SuccessfulMigrations"
53+
metricFailedMigrations = "FailedMigrations"
4954
)
5055

5156
type testOnlineDDLStatementParams struct {
@@ -3049,3 +3054,164 @@ func runInTransaction(t *testing.T, ctx context.Context, tablet *cluster.Vttable
30493054
}
30503055
return err
30513056
}
3057+
3058+
// TestMigrationMetrics tests that migration metrics are correctly incremented
3059+
// when migrations are executed end-to-end. This verifies that the metrics
3060+
// tracking works correctly in a real cluster environment.
3061+
func TestMigrationMetrics(t *testing.T) {
3062+
throttler.EnableLagThrottlerAndWaitForStatus(t, clusterInstance)
3063+
3064+
shards = clusterInstance.Keyspaces[0].Shards
3065+
require.Equal(t, 1, len(shards))
3066+
3067+
// Helper function to get metric value from /debug/vars
3068+
getMetric := func(metricName string) int64 {
3069+
vars := primaryTablet.VttabletProcess.GetVars()
3070+
if val, ok := vars[metricName]; ok {
3071+
switch v := val.(type) {
3072+
case float64:
3073+
return int64(v)
3074+
case int64:
3075+
return v
3076+
case int:
3077+
return int64(v)
3078+
}
3079+
}
3080+
return 0
3081+
}
3082+
3083+
// Create the stress_test table for our tests
3084+
t.Run("setup table", func(t *testing.T) {
3085+
createStatement := `
3086+
CREATE TABLE IF NOT EXISTS stress_test (
3087+
id bigint(20) not null,
3088+
rand_val varchar(32) null default '',
3089+
hint_col varchar(64) not null default 'just-created',
3090+
created_timestamp timestamp not null default current_timestamp,
3091+
updates int unsigned not null default 0,
3092+
PRIMARY KEY (id),
3093+
key created_idx(created_timestamp),
3094+
key updates_idx(updates)
3095+
) ENGINE=InnoDB
3096+
`
3097+
uuid := testOnlineDDLStatement(t, &testOnlineDDLStatementParams{
3098+
ddlStatement: createStatement,
3099+
ddlStrategy: "direct",
3100+
executeStrategy: "vtgate",
3101+
})
3102+
// For direct strategy, uuid may be empty
3103+
_ = uuid
3104+
})
3105+
3106+
// Test 1: Successful migration increments metrics
3107+
t.Run("successful migration increments metrics", func(t *testing.T) {
3108+
// Get baseline metrics
3109+
startedBefore := getMetric(metricStartedMigrations)
3110+
successfulBefore := getMetric(metricSuccessfulMigrations)
3111+
3112+
// Execute a simple migration
3113+
uuid := testOnlineDDLStatement(t, &testOnlineDDLStatementParams{
3114+
ddlStatement: "ALTER TABLE stress_test ENGINE=InnoDB",
3115+
ddlStrategy: "vitess",
3116+
executeStrategy: "vtgate",
3117+
})
3118+
require.NotEmpty(t, uuid)
3119+
3120+
// Wait for migration to complete
3121+
onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid,
3122+
normalWaitTime, schema.OnlineDDLStatusComplete)
3123+
3124+
// Check metrics incremented correctly using assert.Eventually
3125+
assert.EventuallyWithT(t, func(c *assert.CollectT) {
3126+
startedAfter := getMetric(metricStartedMigrations)
3127+
successfulAfter := getMetric(metricSuccessfulMigrations)
3128+
3129+
assert.Equal(c, startedBefore+1, startedAfter,
3130+
"StartedMigrations should increment by 1")
3131+
assert.Equal(c, successfulBefore+1, successfulAfter,
3132+
"SuccessfulMigrations should increment by 1")
3133+
}, 10*time.Second, time.Second, "metrics did not increment correctly")
3134+
})
3135+
3136+
// Test 2: Failed migration increments metrics
3137+
t.Run("failed migration increments metrics", func(t *testing.T) {
3138+
failedBefore := getMetric(metricFailedMigrations)
3139+
startedBefore := getMetric(metricStartedMigrations)
3140+
3141+
// Execute a migration that will fail (invalid column)
3142+
uuid := testOnlineDDLStatement(t, &testOnlineDDLStatementParams{
3143+
ddlStatement: "ALTER TABLE stress_test DROP COLUMN nonexistent_column",
3144+
ddlStrategy: "vitess",
3145+
executeStrategy: "vtgate",
3146+
})
3147+
require.NotEmpty(t, uuid)
3148+
3149+
// Wait for migration to fail
3150+
onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid,
3151+
normalWaitTime, schema.OnlineDDLStatusFailed)
3152+
3153+
// Verify metrics
3154+
assert.EventuallyWithT(t, func(c *assert.CollectT) {
3155+
failedAfter := getMetric(metricFailedMigrations)
3156+
startedAfter := getMetric(metricStartedMigrations)
3157+
3158+
assert.Equal(c, failedBefore+1, failedAfter,
3159+
"FailedMigrations should increment by 1")
3160+
assert.Equal(c, startedBefore+1, startedAfter,
3161+
"StartedMigrations should increment by 1 even for failed migrations")
3162+
}, 10*time.Second, time.Second, "metrics did not increment correctly")
3163+
})
3164+
3165+
// Test 3: Multiple migrations increment metrics correctly
3166+
t.Run("multiple migrations increment metrics correctly", func(t *testing.T) {
3167+
startedBefore := getMetric(metricStartedMigrations)
3168+
successfulBefore := getMetric(metricSuccessfulMigrations)
3169+
3170+
// Run 3 migrations
3171+
uuids := make([]string, 3)
3172+
for i := range 3 {
3173+
uuids[i] = testOnlineDDLStatement(t, &testOnlineDDLStatementParams{
3174+
ddlStatement: "ALTER TABLE stress_test ENGINE=InnoDB",
3175+
ddlStrategy: "vitess",
3176+
executeStrategy: "vtgate",
3177+
})
3178+
require.NotEmpty(t, uuids[i])
3179+
}
3180+
3181+
// Wait for all to complete
3182+
for _, uuid := range uuids {
3183+
onlineddl.WaitForMigrationStatus(t, &vtParams, shards, uuid,
3184+
extendedWaitTime, schema.OnlineDDLStatusComplete)
3185+
}
3186+
3187+
// Verify metrics
3188+
assert.EventuallyWithT(t, func(c *assert.CollectT) {
3189+
startedAfter := getMetric(metricStartedMigrations)
3190+
successfulAfter := getMetric(metricSuccessfulMigrations)
3191+
3192+
assert.Equal(c, startedBefore+3, startedAfter,
3193+
"StartedMigrations should increment by 3")
3194+
assert.Equal(c, successfulBefore+3, successfulAfter,
3195+
"SuccessfulMigrations should increment by 3")
3196+
}, 15*time.Second, time.Second, "metrics did not increment correctly")
3197+
})
3198+
3199+
// Test 4: Verify metrics are accessible via /debug/vars
3200+
t.Run("metrics are exposed via debug vars", func(t *testing.T) {
3201+
vars := primaryTablet.VttabletProcess.GetVars()
3202+
require.NotNil(t, vars)
3203+
3204+
// Verify all three metrics exist
3205+
assert.Contains(t, vars, metricStartedMigrations,
3206+
"StartedMigrations metric should be exposed")
3207+
assert.Contains(t, vars, metricSuccessfulMigrations,
3208+
"SuccessfulMigrations metric should be exposed")
3209+
assert.Contains(t, vars, metricFailedMigrations,
3210+
"FailedMigrations metric should be exposed")
3211+
3212+
// Verify metrics are non-negative
3213+
assert.GreaterOrEqual(t, getMetric(metricStartedMigrations), int64(0))
3214+
assert.GreaterOrEqual(t, getMetric(metricSuccessfulMigrations), int64(0))
3215+
assert.GreaterOrEqual(t, getMetric(metricFailedMigrations), int64(0))
3216+
})
3217+
}

go/vt/vttablet/onlineddl/executor.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2228,6 +2228,7 @@ func (e *Executor) readFailedCancelledMigrationsInContextBeforeMigration(ctx con
22282228
func (e *Executor) failMigration(ctx context.Context, onlineDDL *schema.OnlineDDL, withError error) error {
22292229
defer e.triggerNextCheckInterval()
22302230
_ = e.updateMigrationStatusFailedOrCancelled(ctx, onlineDDL.UUID)
2231+
failedMigrations.Add(1)
22312232
if withError != nil {
22322233
_ = e.updateMigrationMessage(ctx, onlineDDL.UUID, withError.Error())
22332234
}
@@ -3261,6 +3262,7 @@ func (e *Executor) reviewStaleMigrations(ctx context.Context) error {
32613262
if err := e.updateMigrationStatus(ctx, onlineDDL.UUID, schema.OnlineDDLStatusFailed); err != nil {
32623263
return err
32633264
}
3265+
failedMigrations.Add(1)
32643266
defer e.triggerNextCheckInterval()
32653267
_ = e.updateMigrationStartedTimestamp(ctx, uuid)
32663268
// Because the migration is stale, it may not update completed_timestamp. It is essential to set completed_timestamp
@@ -4500,17 +4502,20 @@ func (e *Executor) onSchemaMigrationStatus(ctx context.Context,
45004502
{
45014503
_ = e.updateMigrationStartedTimestamp(ctx, uuid)
45024504
err = e.updateMigrationTimestamp(ctx, "liveness_timestamp", uuid)
4505+
startedMigrations.Add(1)
45034506
}
45044507
case schema.OnlineDDLStatusComplete:
45054508
{
45064509
progressPct = progressPctFull
45074510
_ = e.updateMigrationStartedTimestamp(ctx, uuid)
45084511
err = e.updateMigrationTimestamp(ctx, "completed_timestamp", uuid)
4512+
successfulMigrations.Add(1)
45094513
}
45104514
case schema.OnlineDDLStatusFailed:
45114515
{
45124516
_ = e.updateMigrationStartedTimestamp(ctx, uuid)
45134517
err = e.updateMigrationTimestamp(ctx, "completed_timestamp", uuid)
4518+
failedMigrations.Add(1)
45144519
}
45154520
}
45164521
if err != nil {

go/vt/vttablet/onlineddl/executor_test.go

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ import (
2626

2727
"github.com/stretchr/testify/assert"
2828
"github.com/stretchr/testify/require"
29+
30+
"vitess.io/vitess/go/vt/schema"
2931
)
3032

3133
func TestShouldCutOverAccordingToBackoff(t *testing.T) {
@@ -221,3 +223,109 @@ func TestSafeMigrationCutOverThreshold(t *testing.T) {
221223
})
222224
}
223225
}
226+
227+
func TestMigrationMetricsIncrement(t *testing.T) {
228+
tcases := []struct {
229+
name string
230+
testFunc func()
231+
verify func(before int64, after int64) bool
232+
}{
233+
{
234+
name: "startedMigrations increments correctly",
235+
testFunc: func() {
236+
startedMigrations.Add(1)
237+
},
238+
verify: func(before int64, after int64) bool {
239+
return after == before+1
240+
},
241+
},
242+
{
243+
name: "successfulMigrations increments correctly",
244+
testFunc: func() {
245+
successfulMigrations.Add(1)
246+
},
247+
verify: func(before int64, after int64) bool {
248+
return after == before+1
249+
},
250+
},
251+
{
252+
name: "failedMigrations increments correctly",
253+
testFunc: func() {
254+
failedMigrations.Add(1)
255+
},
256+
verify: func(before int64, after int64) bool {
257+
return after == before+1
258+
},
259+
},
260+
}
261+
262+
for _, tcase := range tcases {
263+
t.Run(tcase.name, func(t *testing.T) {
264+
var before, after int64
265+
266+
switch tcase.name {
267+
case "startedMigrations increments correctly":
268+
before = startedMigrations.Get()
269+
tcase.testFunc()
270+
after = startedMigrations.Get()
271+
case "successfulMigrations increments correctly":
272+
before = successfulMigrations.Get()
273+
tcase.testFunc()
274+
after = successfulMigrations.Get()
275+
case "failedMigrations increments correctly":
276+
before = failedMigrations.Get()
277+
tcase.testFunc()
278+
after = failedMigrations.Get()
279+
}
280+
281+
assert.True(t, tcase.verify(before, after), "metric should increment correctly: before=%d, after=%d", before, after)
282+
})
283+
}
284+
}
285+
286+
func TestMigrationStatusTransitionsUpdateMetrics(t *testing.T) {
287+
tcases := []struct {
288+
name string
289+
status schema.OnlineDDLStatus
290+
expectStarted int64
291+
expectSuccess int64
292+
expectFailed int64
293+
}{
294+
{
295+
name: "running status updates started metric",
296+
status: schema.OnlineDDLStatusRunning,
297+
expectStarted: 1,
298+
},
299+
{
300+
name: "complete status updates successful metric",
301+
status: schema.OnlineDDLStatusComplete,
302+
expectSuccess: 1,
303+
},
304+
{
305+
name: "failed status updates failed metric",
306+
status: schema.OnlineDDLStatusFailed,
307+
expectFailed: 1,
308+
},
309+
}
310+
311+
for _, tcase := range tcases {
312+
t.Run(tcase.name, func(t *testing.T) {
313+
startedBefore := startedMigrations.Get()
314+
successBefore := successfulMigrations.Get()
315+
failedBefore := failedMigrations.Get()
316+
317+
switch tcase.status {
318+
case schema.OnlineDDLStatusRunning:
319+
startedMigrations.Add(1)
320+
case schema.OnlineDDLStatusComplete:
321+
successfulMigrations.Add(1)
322+
case schema.OnlineDDLStatusFailed:
323+
failedMigrations.Add(1)
324+
}
325+
326+
assert.Equal(t, startedBefore+tcase.expectStarted, startedMigrations.Get(), "startedMigrations")
327+
assert.Equal(t, successBefore+tcase.expectSuccess, successfulMigrations.Get(), "successfulMigrations")
328+
assert.Equal(t, failedBefore+tcase.expectFailed, failedMigrations.Get(), "failedMigrations")
329+
})
330+
}
331+
}

0 commit comments

Comments
 (0)