diff --git a/.gitignore b/.gitignore index 636e29b86ed..b6d3f2c84d7 100644 --- a/.gitignore +++ b/.gitignore @@ -58,3 +58,4 @@ cmake-build-debug/ cmake-build-release/ __pycache__ src/unit/.flags +.DS_Store diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 2845a350192..f9fd13d45a6 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -11,6 +11,10 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/ae.c ${CMAKE_SOURCE_DIR}/src/anet.c ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/reply_blocking.c + ${CMAKE_SOURCE_DIR}/src/durable_task.c + ${CMAKE_SOURCE_DIR}/src/durability_provider.c + ${CMAKE_SOURCE_DIR}/src/uncommitted_keys.c ${CMAKE_SOURCE_DIR}/src/hashtable.c ${CMAKE_SOURCE_DIR}/src/kvstore.c ${CMAKE_SOURCE_DIR}/src/sds.c diff --git a/src/Makefile b/src/Makefile index cd11a507269..24972b448a5 100644 --- a/src/Makefile +++ b/src/Makefile @@ -524,6 +524,10 @@ ENGINE_SERVER_OBJ = \ rdma.o \ release.o \ replication.o \ + reply_blocking.o \ + durable_task.o \ + durability_provider.o \ + uncommitted_keys.o \ resp_parser.o \ rio.o \ script.o \ diff --git a/src/aof.c b/src/aof.c index bfebed1f47a..375a08fc393 100644 --- a/src/aof.c +++ b/src/aof.c @@ -29,6 +29,7 @@ #include "server.h" #include "bio.h" +#include "io_threads.h" #include "rio.h" #include "functions.h" #include "module.h" @@ -51,6 +52,23 @@ aofManifest *aofLoadManifestFromFile(sds am_filepath); void aofManifestFreeAndUpdate(aofManifest *am); void aof_background_fsync_and_close(int fd); +enum { + AOF_IO_FLUSH_IDLE = 0, + AOF_IO_FLUSH_PENDING, + AOF_IO_FLUSH_DONE, + AOF_IO_FLUSH_ERR, +}; + +typedef struct aofIOFlushJob { + int fd; + sds buf; + size_t len; + long long reploff; +} aofIOFlushJob; + +static void processAofIOThreadFlushResult(void); +static int tryOffloadAofAlwaysFlushToIOThreads(void); + /* ---------------------------------------------------------------------------- * AOF Manifest file implementation. 
* @@ -952,6 +970,9 @@ void stopAppendOnly(void) { server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; server.fsynced_reploff = -1; + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); atomic_store_explicit(&server.fsynced_reploff_pending, 0, memory_order_relaxed); killAppendOnlyChild(); sdsfree(server.aof_buf); @@ -1002,6 +1023,9 @@ int startAppendOnly(void) { serverLog(LL_WARNING, "AOF reopen, just ignore the last error."); server.aof_last_write_status = C_OK; } + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); return C_OK; } @@ -1156,6 +1180,118 @@ ssize_t aofWrite(int fd, const char *buf, size_t len) { return totwritten; } +static void aofIOThreadFlushJobHandler(void *data) { + aofIOFlushJob *job = data; + int err = 0; + ssize_t nwritten = aofWrite(job->fd, job->buf, job->len); + if (nwritten != (ssize_t)job->len) { + err = (nwritten == -1) ? 
errno : ENOSPC; + goto done; + } + + if (valkey_fsync(job->fd) == -1) { + err = errno; + goto done; + } + + atomic_store_explicit(&server.fsynced_reploff_pending, job->reploff, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, job->len, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_DONE, memory_order_release); + sdsfree(job->buf); + zfree(job); + return; + +done: + atomic_store_explicit(&server.aof_io_flush_errno, err, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_ERR, memory_order_release); + sdsfree(job->buf); + zfree(job); +} + +int aofIOFlushInProgress(void) { + return atomic_load_explicit(&server.aof_io_flush_state, memory_order_acquire) == AOF_IO_FLUSH_PENDING; +} + +static void processAofIOThreadFlushResult(void) { + int state = atomic_load_explicit(&server.aof_io_flush_state, memory_order_acquire); + if (state == AOF_IO_FLUSH_IDLE || state == AOF_IO_FLUSH_PENDING) return; + + if (state == AOF_IO_FLUSH_DONE) { + off_t nwritten = atomic_load_explicit(&server.aof_io_flush_size, memory_order_relaxed); + server.aof_current_size += nwritten; + server.aof_last_incr_size += nwritten; + server.aof_last_incr_fsync_offset = server.aof_last_incr_size; + server.aof_last_fsync = server.mstime; + if (server.aof_last_write_status == C_ERR) { + serverLog(LL_NOTICE, "AOF write error looks solved. 
The server can write again."); + server.aof_last_write_status = C_OK; + } + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + + /* Notify sync replication that AOF fsync completed so blocked clients can be unblocked */ + notifyDurabilityProgress(); + return; + } + + int err = atomic_load_explicit(&server.aof_io_flush_errno, memory_order_relaxed); + server.aof_last_write_errno = err; + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + + /* IO thread flush is only used with appendfsync=always, so an error here + * is always fatal. We cannot guarantee durability. */ + serverLog(LL_WARNING, + "Can't persist AOF from IO thread for " + "AOF fsync policy 'always': %s. Exiting...", + strerror(err)); + exit(1); +} + +static int tryOffloadAofAlwaysFlushToIOThreads(void) { + if (server.aof_fsync != AOF_FSYNC_ALWAYS || sdslen(server.aof_buf) == 0 || aofIOFlushInProgress()) { + return C_ERR; + } + + /* Ensure the previous IO thread result has been fully processed. + * aofIOFlushInProgress() only checks for PENDING; the state could also + * be DONE or ERR if processAofIOThreadFlushResult() hasn't run yet. */ + if (atomic_load_explicit(&server.aof_io_flush_state, memory_order_acquire) != AOF_IO_FLUSH_IDLE) { + return C_ERR; + } + + /* If IO threads are configured but not active, we can't offload. + * Note: Thread activation based on AOF workload is handled by + * adjustIOThreadsByEventLoad() via the has_background_work parameter. */ + if (server.io_threads_num <= 1 || server.active_io_threads_num <= 1) { + return C_ERR; + } + + /* NOTE: With sync replication enabled, we still want to offload fsync to + * IO threads to avoid blocking the main thread. The notifyDurabilityProgress() + * callback will be invoked in beforeSleep() when we check for completed IO thread + * jobs, which will then unblock waiting clients. 
This adds at most one + * event loop iteration of latency but keeps the main thread responsive. */ + + aofIOFlushJob *job = zmalloc(sizeof(*job)); + job->fd = server.aof_fd; + job->buf = server.aof_buf; + job->len = sdslen(job->buf); + job->reploff = server.primary_repl_offset; + + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_PENDING, memory_order_release); + if (trySendJobToIOThreads(aofIOThreadFlushJobHandler, job) == C_OK) { + /* Hand off the buffer to the IO thread; allocate a fresh one for new writes. */ + server.aof_buf = sdsempty(); + server.aof_flush_postponed_start = 0; + return C_OK; + } + + atomic_store_explicit(&server.aof_io_flush_state, AOF_IO_FLUSH_IDLE, memory_order_release); + zfree(job); + return C_ERR; +} + /* Write the append only file buffer on disk. * * Since we are required to write the AOF before replying to the client, @@ -1180,6 +1316,23 @@ void flushAppendOnlyFile(int force) { int sync_in_progress = 0; mstime_t latency; + processAofIOThreadFlushResult(); + if (aofIOFlushInProgress()) { + if (!force) return; + /* Busy-wait for the IO thread to finish. Timeout after 30 seconds + * to prevent hanging indefinitely. */ + monotime wait_start = getMonotonicUs(); + while (aofIOFlushInProgress()) { + usleep(100); + processAofIOThreadFlushResult(); + if (getMonotonicUs() - wait_start > 30 * 1000000ULL) { + serverLog(LL_WARNING, + "Timed out waiting for AOF IO thread flush to complete. 
Exiting..."); + exit(1); + } + } + } + if (sdslen(server.aof_buf) == 0) { /* Check if we need to do fsync even the aof buffer is empty, * because previously in AOF_FSYNC_EVERYSEC mode, fsync is @@ -1234,6 +1387,11 @@ void flushAppendOnlyFile(int force) { "without waiting for fsync to complete, this may slow down the server."); } } + + if (server.aof_fsync == AOF_FSYNC_ALWAYS && !force && tryOffloadAofAlwaysFlushToIOThreads() == C_OK) { + return; + } + /* We want to perform a single write. This should be guaranteed atomic * at least if the filesystem we are writing is a real physical one. * While this will save us against the server being killed I don't think diff --git a/src/config.c b/src/config.c index 93ef289e328..aa6a592fa1d 100644 --- a/src/config.c +++ b/src/config.c @@ -2605,6 +2605,9 @@ static int updateAppendOnly(const char **err) { return 0; } } + /* Durability is implied by appendfsync always + AOF on, so toggling + * appendonly may enable or disable it. */ + durabilityReset(); return 1; } @@ -2694,6 +2697,9 @@ int updateAppendFsync(const char **err) { * worker thread. */ bioDrainWorker(BIO_AOF_FSYNC); } + /* Durability is implied by appendfsync always + AOF on, so toggling + * appendfsync may enable or disable it. 
*/ + durabilityReset(); return 1; } @@ -3259,6 +3265,7 @@ standardConfig static_configs[] = { createBoolConfig("repl-mptcp", NULL, IMMUTABLE_CONFIG, server.repl_mptcp, 0, isValidMptcp, NULL), createBoolConfig("repl-diskless-sync", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_diskless_sync, 1, NULL, NULL), createBoolConfig("dual-channel-replication-enabled", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.dual_channel_replication, 0, NULL, NULL), + createBoolConfig("sync-eligible", NULL, MODIFIABLE_CONFIG, server.sync_eligible, 0, NULL, NULL), createBoolConfig("aof-rewrite-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.aof_rewrite_incremental_fsync, 1, NULL, NULL), createBoolConfig("no-appendfsync-on-rewrite", NULL, MODIFIABLE_CONFIG, server.aof_no_fsync_on_rewrite, 0, NULL, NULL), createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL), @@ -3397,6 +3404,7 @@ standardConfig static_configs[] = { createIntConfig("hz", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.hz, CONFIG_DEFAULT_HZ, INTEGER_CONFIG, NULL, updateHZ), createIntConfig("min-replicas-to-write", "min-slaves-to-write", MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_min_replicas_to_write, 0, INTEGER_CONFIG, NULL, updateGoodReplicas), createIntConfig("min-replicas-max-lag", "min-slaves-max-lag", MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_min_replicas_max_lag, 10, INTEGER_CONFIG, NULL, updateGoodReplicas), + createIntConfig("min-sync-replicas", NULL, IMMUTABLE_CONFIG, 0, 6, server.min_sync_replicas, 0, INTEGER_CONFIG, NULL, NULL), createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod), createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, 
server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL), diff --git a/src/db.c b/src/db.c index ba9d25c2fa6..e4c869ad37e 100644 --- a/src/db.c +++ b/src/db.c @@ -752,6 +752,9 @@ long long dbTotalServerKeyCount(void) { * a context of a client. */ void signalModifiedKey(client *c, serverDb *db, robj *key) { touchWatchedKey(db, key); + if (durabilitySignalModifiedKey(c, db, key)) { + return; + } trackingInvalidateKey(c, key, 1); } @@ -770,6 +773,10 @@ void signalFlushedDb(int dbid, int async) { touchAllWatchedKeysInDb(server.db[j], NULL); } + if (durabilitySignalFlushedDb(dbid)) { + return; + } + trackingInvalidateKeysOnFlush(async); /* Changes in this method may take place in swapMainDbWithTempDb as well, @@ -1766,6 +1773,13 @@ int dbSwapDatabases(int id1, int id2) { db2->keys_with_volatile_items = aux.keys_with_volatile_items; copyDbExpiry(db2, &aux); + /* Swap uncommitted key tracking so it stays consistent with the key data. */ + db1->uncommitted_keys = db2->uncommitted_keys; + db1->dirty_repl_offset = db2->dirty_repl_offset; + + db2->uncommitted_keys = aux.uncommitted_keys; + db2->dirty_repl_offset = aux.dirty_repl_offset; + /* Now we need to handle clients blocked on lists: as an effect * of swapping the two DBs, a client that was waiting for list * X in a given DB, may now actually be unblocked if X happens @@ -1811,6 +1825,13 @@ void swapMainDbWithTempDb(serverDb **tempDb) { newdb->keys_with_volatile_items = aux.keys_with_volatile_items; copyDbExpiry(newdb, &aux); + /* Swap uncommitted key tracking so it stays consistent with the key data. 
*/ + activedb->uncommitted_keys = newdb->uncommitted_keys; + activedb->dirty_repl_offset = newdb->dirty_repl_offset; + + newdb->uncommitted_keys = aux.uncommitted_keys; + newdb->dirty_repl_offset = aux.dirty_repl_offset; + /* Now we need to handle clients blocked on lists: as an effect * of swapping the two DBs, a client that was waiting for list * X in a given DB, may now actually be unblocked if X happens @@ -1946,6 +1967,7 @@ void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int d notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired", keyobj, db->id); signalModifiedKey(NULL, db, keyobj); propagateDeletion(db, keyobj, server.lazyfree_lazy_expire, dict_index); + if (isPrimaryDurabilityEnabled()) handleUncommittedKeyForClient(NULL, keyobj, db); server.stat_expiredkeys++; } @@ -2077,6 +2099,7 @@ size_t dbReclaimExpiredFields(robj *o, serverDb *db, mstime_t now, unsigned long if (!hashTypeHasVolatileFields(o)) dbUntrackKeyWithVolatileItems(db, o); } signalModifiedKey(NULL, db, keyobj); + if (isPrimaryDurabilityEnabled()) handleUncommittedKeyForClient(NULL, keyobj, db); exitExecutionUnit(); postExecutionUnitOperations(); decrRefCount(keyobj); diff --git a/src/debug.c b/src/debug.c index bbb02dc2d25..6aad06edbe0 100644 --- a/src/debug.c +++ b/src/debug.c @@ -39,6 +39,7 @@ #include "io_threads.h" #include "sds.h" #include "module.h" +#include "durability_provider.h" #include #include @@ -1066,6 +1067,37 @@ void debugCommand(client *c) { } else if (!strcasecmp(objectGetVal(c->argv[1]), "client-enforce-reply-list") && c->argc == 3) { server.debug_client_enforce_reply_list = atoi(objectGetVal(c->argv[2])); addReply(c, shared.ok); + } else if (!strcasecmp(objectGetVal(c->argv[1]), "durability-provider-pause") && c->argc == 3) { + if (pauseDurabilityProvider(objectGetVal(c->argv[2]))) { + addReply(c, shared.ok); + } else { + addReplyError(c, "No such durability provider"); + } + } else if (!strcasecmp(objectGetVal(c->argv[1]), "durability-provider-resume") 
&& c->argc == 3) { + if (resumeDurabilityProvider(objectGetVal(c->argv[2]))) { + addReply(c, shared.ok); + } else { + addReplyError(c, "No such durability provider"); + } + } else if (!strcasecmp(objectGetVal(c->argv[1]), "set-io-last-written") && c->argc == 5) { + /* DEBUG set-io-last-written + * Simulate a partial write state on a target client for testing. + * Sets io_last_written.buf to target->buf, bufpos and data_len to the given values. + * This allows injecting the post-partial-write state that triggers the + * data_len vs bufpos divergence with copy avoidance. */ + long long client_id, bufpos_val, data_len_val; + if (getLongLongFromObjectOrReply(c, c->argv[2], &client_id, NULL) != C_OK) return; + if (getLongLongFromObjectOrReply(c, c->argv[3], &bufpos_val, NULL) != C_OK) return; + if (getLongLongFromObjectOrReply(c, c->argv[4], &data_len_val, NULL) != C_OK) return; + client *target = lookupClientByID((uint64_t)client_id); + if (target == NULL) { + addReplyError(c, "No such client"); + return; + } + target->io_last_written.buf = target->buf; + target->io_last_written.bufpos = (size_t)bufpos_val; + target->io_last_written.data_len = (size_t)data_len_val; + addReply(c, shared.ok); } else if (!handleDebugClusterCommand(c)) { addReplySubcommandSyntaxError(c); return; diff --git a/src/durability_provider.c b/src/durability_provider.c new file mode 100644 index 00000000000..3e1a5c8b895 --- /dev/null +++ b/src/durability_provider.c @@ -0,0 +1,241 @@ +#include "server.h" +#include +#include + +/*================================= Durability Provider Registry ============= */ + +/* Provider registry: static array of registered providers */ +static durabilityProvider *durability_providers[MAX_DURABILITY_PROVIDERS]; +static int num_durability_providers = 0; + +/** + * Register a durability provider. Providers are checked in registration order. + * The overall durability consensus is the MIN (AND) of all enabled providers. 
+ */
+void registerDurabilityProvider(durabilityProvider *provider) {
+    /* Registering the same provider twice would leave a stale duplicate behind
+     * after a single unregister, so treat a repeated registration as a no-op. */
+    for (int i = 0; i < num_durability_providers; i++) {
+        if (durability_providers[i] == provider) return;
+    }
+    serverAssert(num_durability_providers < MAX_DURABILITY_PROVIDERS);
+    durability_providers[num_durability_providers++] = provider;
+    serverLog(LL_NOTICE, "Registered durability provider: %s", provider->name);
+}
+
+/**
+ * Unregister a durability provider by pointer.
+ * Does nothing if the provider is not in the registry.
+ */
+void unregisterDurabilityProvider(durabilityProvider *provider) {
+    for (int i = 0; i < num_durability_providers; i++) {
+        if (durability_providers[i] == provider) {
+            /* Shift remaining providers down */
+            for (int j = i; j < num_durability_providers - 1; j++) {
+                durability_providers[j] = durability_providers[j + 1];
+            }
+            num_durability_providers--;
+            serverLog(LL_NOTICE, "Unregistered durability provider: %s", provider->name);
+            return;
+        }
+    }
+}
+
+/**
+ * Returns true if at least one registered provider reports itself enabled.
+ */
+bool anyDurabilityProviderEnabled(void) {
+    for (int i = 0; i < num_durability_providers; i++) {
+        if (durability_providers[i]->isEnabled()) return true;
+    }
+    return false;
+}
+
+/**
+ * Reset the durability provider registry so it can be re-initialized.
+ */
+void resetDurabilityProviders(void) {
+    num_durability_providers = 0;
+}
+
+/*================================= Built-in AOF Provider ==================== */
+
+/* The AOF provider is enabled when AOF is on and appendfsync is 'always'. */
+static bool aofProviderIsEnabled(void) {
+    return server.aof_state != AOF_OFF && server.aof_fsync == AOF_FSYNC_ALWAYS;
+}
+
+/* Returns the replication offset known to be fsynced to the AOF. */
+static long long aofProviderGetAckedOffset(void) {
+    /* Use fsynced_reploff_pending directly instead of fsynced_reploff.
+     * When async AOF flushing is used (IO threads), fsynced_reploff_pending
+     * is updated by the IO thread upon fsync completion, but fsynced_reploff
+     * is only updated in the next beforeSleep() iteration. Using the pending
+     * value ensures we see the most up-to-date fsync progress immediately. */
+    long long fsynced_offset = atomic_load_explicit(&server.fsynced_reploff_pending, memory_order_relaxed);
+    /* Handle the case where AOF is enabled but no data has been fsynced yet
+     * (fsynced_reploff_pending is 0 initially). In that case, use fsynced_reploff
+     * if it's been properly initialized. */
+    if (fsynced_offset == 0 && server.fsynced_reploff > 0) {
+        fsynced_offset = server.fsynced_reploff;
+    }
+    return fsynced_offset;
+}
+
+static durabilityProvider builtinAofProvider = {
+    .name = "aof",
+    .isEnabled = aofProviderIsEnabled,
+    .getAckedOffset = aofProviderGetAckedOffset,
+    .paused = false,
+    .pausedOffset = 0,
+};
+
+/*================================= Built-in Replication Provider ============ */
+
+/**
+ * The replication durability provider is enabled when min-sync-replicas > 0.
+ * This implements the sync replication data path from the PacificA framework:
+ * writes are only considered committed once acknowledged by at least
+ * min-sync-replicas sync replicas (replicas with REPLICA_CAPA_SYNC flag).
+ */
+static bool replicationProviderIsEnabled(void) {
+    return server.min_sync_replicas > 0;
+}
+
+/**
+ * Compute the consensus offset across all sync replicas.
+ *
+ * For every REPLCONF ACK, we calculate the minimum ack offset of all
+ * online sync replicas (those in the ISR — with is_in_sync flag set).
+ *
+ * consensus_offset = minimum_ack_offset(list of sync replicas)
+ *
+ * A replica is in the ISR when:
+ * 1. It declared REPLICA_CAPA_SYNC capability via REPLCONF
+ * 2. Its repl_ack_off caught up to the committed_offset
+ * 3. It has not timed out (checked by replicationCron)
+ *
+ * If there are fewer ISR members than min-sync-replicas,
+ * returns -1 to block consensus advancement (the shard is not writable).
+ */
+static long long replicationProviderGetAckedOffset(void) {
+    listIter li;
+    listNode *ln;
+    int sync_replica_count = 0;
+    long long min_offset = LLONG_MAX;
+
+    listRewind(server.replicas, &li);
+    while ((ln = listNext(&li))) {
+        client *replica = ln->value;
+
+        /* Only consider replicas that are online and in the ISR. */
+        if (replica->repl_data->repl_state != REPLICA_STATE_ONLINE) continue;
+        if (!replica->repl_data->is_in_sync) continue;
+
+        sync_replica_count++;
+        if (replica->repl_data->repl_ack_off < min_offset) {
+            min_offset = replica->repl_data->repl_ack_off;
+        }
+    }
+
+    /* If we don't have enough sync replicas, block consensus. */
+    if (sync_replica_count < server.min_sync_replicas) {
+        return -1;
+    }
+
+    /* If min_offset was never updated (shouldn't happen given count check),
+     * return 0 as a safe fallback. */
+    return (min_offset == LLONG_MAX) ? 0 : min_offset;
+}
+
+static durabilityProvider builtinReplicationProvider = {
+    .name = "replication",
+    .isEnabled = replicationProviderIsEnabled,
+    .getAckedOffset = replicationProviderGetAckedOffset,
+    .paused = false,
+    .pausedOffset = 0,
+};
+
+/**
+ * Register the built-in durability providers. Called from durabilityInit().
+ *
+ * Currently the AOF provider and replication provider are built-in.
+ */
+void registerBuiltinDurabilityProviders(void) {
+    /* Only register if not already registered (idempotent) */
+    if (num_durability_providers == 0) {
+        registerDurabilityProvider(&builtinAofProvider);
+        registerDurabilityProvider(&builtinReplicationProvider);
+    }
+}
+
+/*================================= Consensus Calculation ==================== */
+
+/**
+ * Returns the durability consensus offset by iterating all registered
+ * providers and returning the MIN of all enabled providers' acknowledged
+ * offsets (AND semantics: all must acknowledge).
+ *
+ * If a provider returns -1, it means the provider cannot make progress
+ * (e.g. insufficient replicas), which blocks consensus advancement.
+ *
+ * If no providers are enabled, returns server.primary_repl_offset
+ * (i.e. no blocking).
+ */
+long long getDurabilityConsensusOffset(void) {
+    long long consensus = server.primary_repl_offset;
+    bool any_enabled = false;
+
+    for (int i = 0; i < num_durability_providers; i++) {
+        durabilityProvider *p = durability_providers[i];
+        if (!p->isEnabled()) continue;
+        any_enabled = true;
+
+        long long offset;
+        if (p->paused) {
+            /* Paused provider (via DEBUG) returns the offset snapshot
+             * captured at pause time, freezing consensus at that point. */
+            offset = p->pausedOffset;
+        } else {
+            offset = p->getAckedOffset();
+        }
+
+        if (offset == -1) {
+            /* Provider cannot make progress — block consensus. */
+            return -1;
+        }
+        if (offset < consensus) consensus = offset;
+    }
+
+    return any_enabled ? consensus : server.primary_repl_offset;
+}
+
+/**
+ * Pause a durability provider by name (via DEBUG command).
+ * When paused, the provider's current acknowledged offset is captured and
+ * frozen — any writes after the pause point will block until the provider
+ * is resumed and catches up. Pausing an already paused provider keeps the
+ * original snapshot, so a repeated pause never moves the freeze point.
+ * Returns true if provider was found, false otherwise.
+ */
+bool pauseDurabilityProvider(const char *name) {
+    for (int i = 0; i < num_durability_providers; i++) {
+        durabilityProvider *p = durability_providers[i];
+        if (strcasecmp(p->name, name) != 0) continue;
+        if (!p->paused) {
+            /* Snapshot the current acked offset before pausing so that
+             * writes already acknowledged remain unblocked. */
+            p->pausedOffset = p->getAckedOffset();
+            p->paused = true;
+        }
+        serverLog(LL_NOTICE, "Paused durability provider: %s (frozen at offset %lld)",
+                  name, p->pausedOffset);
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Resume a durability provider by name (via DEBUG command).
+ * After resuming, triggers a durability progress check to unblock
+ * any clients that can now proceed.
+ * Returns true if provider was found, false otherwise.
+ */
+bool resumeDurabilityProvider(const char *name) {
+    for (int i = 0; i < num_durability_providers; i++) {
+        if (!strcasecmp(durability_providers[i]->name, name)) {
+            durability_providers[i]->paused = false;
+            /* Re-evaluate consensus now that the provider's live offset is used again. */
+            notifyDurabilityProgress();
+            serverLog(LL_NOTICE, "Resumed durability provider: %s", name);
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/src/durability_provider.h b/src/durability_provider.h
new file mode 100644
index 00000000000..e43885763f9
--- /dev/null
+++ b/src/durability_provider.h
@@ -0,0 +1,56 @@
+#ifndef DURABILITY_PROVIDER_H
+#define DURABILITY_PROVIDER_H
+
+#include
+
+/*================================= Durability Provider Interface ============ */
+
+/**
+ * Maximum number of durability providers that can be registered.
+ * Built-in providers: aof, replication.
+ */
+#define MAX_DURABILITY_PROVIDERS 4
+
+/**
+ * A durability provider represents a source of durability acknowledgment.
+ * Each provider tracks progress independently and the overall durability
+ * consensus is the MIN (AND) of all enabled providers' acknowledged offsets.
+ *
+ * Examples: replica acknowledgments ("replication"), AOF fsync ("aof").
+ */
+typedef struct durabilityProvider {
+    const char *name;                  /* Name matched case-insensitively by pause/resume, e.g. "replication", "aof" */
+    bool (*isEnabled)(void);           /* Is this provider currently active? */
+    long long (*getAckedOffset)(void); /* Acknowledged offset, or -1 if the provider cannot make progress. */
+    bool paused;                       /* When true (set via DEBUG), consensus uses
+                                        * pausedOffset instead of calling
+                                        * getAckedOffset(). Used for testing. */
+    long long pausedOffset;            /* Offset snapshot taken when provider is paused. */
+} durabilityProvider;
+
+/* Provider registry */
+void registerDurabilityProvider(durabilityProvider *provider);
+void unregisterDurabilityProvider(durabilityProvider *provider);
+bool anyDurabilityProviderEnabled(void);
+bool pauseDurabilityProvider(const char *name);
+bool resumeDurabilityProvider(const char *name);
+
+/**
+ * Returns the MIN of the acknowledged offsets of all enabled providers (AND
+ * semantics: all must acknowledge), -1 if any enabled provider cannot make
+ * progress, or server.primary_repl_offset when no provider is enabled.
+ */
+long long getDurabilityConsensusOffset(void);
+
+/**
+ * Register the built-in durability providers (AOF + replication).
+ * Called from durabilityInit().
+ */
+void registerBuiltinDurabilityProviders(void);
+
+/**
+ * Empty the durability provider registry so it can be re-initialized.
+ */
+void resetDurabilityProviders(void);
+
+#endif /* DURABILITY_PROVIDER_H */
diff --git a/src/durable_task.c b/src/durable_task.c
new file mode 100644
index 00000000000..c71d61e54f5
--- /dev/null
+++ b/src/durable_task.c
@@ -0,0 +1,389 @@
+#include "server.h"
+#include "zmalloc.h"
+#include
+#include
+
+/* Forward declaration copied from module.h: including the full module header
+ * here runs into header dependency issues when it comes before server.h. */
+void moduleNotifyKeyspaceEvent(int type, const char *event, robj *key, int dbid);
+
+/*================================= Internal Data structures ======================== */
+
+/**
+ * Internal structure used to track the replication offset and the arguments needed
+ * to execute a task once that offset has been acknowledged by the durability providers.
+ */
+typedef struct taskWaitingAck {
+    int type;       /* Task type, used as the index into taskTypes[] */
+    int64_t offset; /* Replication offset this task is waiting on */
+    void **argv;    /* Task-type specific argument vector */
+} taskWaitingAck;
+
+/**
+ * Per-task-type handler table: how to build, free and run a task, and what
+ * to do with it when a client goes away.
+ */
+typedef struct taskWaitingAckType {
+    taskWaitingAck *(*createTask)(va_list);
+    void (*destroyTask)(void *);
+    void (*executeTask)(const taskWaitingAck *);
+    void (*onClientDestroy)(void *);
+} taskWaitingAckType;
+
+static taskWaitingAckType taskTypes[DURABLE_TASK_TYPE_MAX];
+
+/*================================= Keyspace Notify Task ===================== */
+
+/**
+ * Build a keyspace notification task from the four variadic arguments
+ * (type, event, key, dbid), each read as a void pointer.
+ */
+static taskWaitingAck *createKeyspaceNotifyTask(va_list ap) {
+    enum { NOTIFY_ARGC = 4 }; /* type, event, key, dbid */
+    taskWaitingAck *t = zcalloc(sizeof(*t));
+    void **args = zmalloc(NOTIFY_ARGC * sizeof(void *));
+    int idx = 0;
+    while (idx < NOTIFY_ARGC) args[idx++] = va_arg(ap, void *);
+    t->argv = args;
+
+    /* The caller (modules in particular) may release the event string as
+     * soon as we return, so the task keeps its own copy in argv[1]. */
+    if (args[1] != NULL) args[1] = zstrdup((char *)args[1]);
+
+    /* Hold a reference on the key so it outlives the deferred task. */
+    if (args[2] != NULL) incrRefCount((robj *)args[2]);
+    return t;
+}
+
+/**
+ * Release a keyspace notification task: the private event copy, the key
+ * reference, the argument vector and the task itself.
+ */
+static void destroyKeyspaceNotifyTask(void *ptr) {
+    taskWaitingAck *t = ptr;
+    char *event_copy = t->argv[1];
+    robj *key = t->argv[2];
+
+    if (event_copy != NULL) zfree(event_copy);
+    if (key != NULL) decrRefCount(key);
+    zfree(t->argv);
+    zfree(t);
+}
+
+/**
+ * Deliver the deferred keyspace notification.
+ */
+static void executeKeyspaceNotifyTask(const taskWaitingAck *task) {
+    int type = (int)(intptr_t)task->argv[0];
+    char *event = (char *)task->argv[1];
+    robj *key = (robj *)task->argv[2];
+    int dbid = (int)(intptr_t)task->argv[3];
+
+    notifyKeyspaceEvent(type, event, key, dbid);
+}
+
+/*================================= Key Invalidation Task ==================== */
+
+/**
+ * Build a key invalidation task. Two variadic arguments are consumed:
+ * argv[0] is the client that modified the key, argv[1] is the modified key.
+ */
+static taskWaitingAck *createKeyInvalidationTask(va_list ap) {
+    taskWaitingAck *t = zcalloc(sizeof(*t));
+    t->argv = zmalloc(2 * sizeof(void *));
+    client *origin = va_arg(ap, void *);
+    robj *key = va_arg(ap, void *);
+    t->argv[0] = origin;
+    t->argv[1] = key;
+
+    /* Let the originating client know it has a notification in flight. */
+    if (origin != NULL) listAddNodeTail(origin->clientDurabilityInfo.pending_notify_tasks, t);
+
+    /* Hold a reference on the key so it outlives the deferred task. */
+    if (key != NULL) incrRefCount(key);
+    return t;
+}
+
+/**
+ * Release a key invalidation task and drop it from its client's pending list.
+ */
+static void destroyKeyInvalidationTask(void *ptr) {
+    taskWaitingAck *task = ptr;
+    client *c = task->argv[0];
+    robj *key = task->argv[1];
+
+    /* Tasks are queued per client in FIFO order, so the task being destroyed
+     * is expected to be the head of that client's pending list. */
+    if (c != NULL) {
+        serverAssert(listLength(c->clientDurabilityInfo.pending_notify_tasks) > 0);
+        listNode *first = listFirst(c->clientDurabilityInfo.pending_notify_tasks);
+        serverAssert(task == (taskWaitingAck *)listNodeValue(first));
+        listDelNode(c->clientDurabilityInfo.pending_notify_tasks, first);
+    }
+
+    /* Give back the key reference taken at creation time. */
+    if (key != NULL) decrRefCount(key);
+    zfree(task->argv);
+    zfree(task);
+}
+
+/**
+ * Client-destroy hook: forget the client pointer stored in argv[0].
+ */
+static void destroyClientForKeyInvalidationTask(void *task_ptr) {
+    ((taskWaitingAck *)task_ptr)->argv[0] = NULL;
+}
+
+/**
+ * Send the deferred key invalidation.
+ */
+static void executeKeyInvalidationTask(const taskWaitingAck *task) {
+    client *c = (client *)task->argv[0];
+    robj *key = (robj *)task->argv[1];
+
+    trackingInvalidateKey(c, key, 1);
+}
+
+/*================================= Flush Invalidation Task ================== */
+
+/**
+ * Build a flush invalidation task; its single variadic argument is stored
+ * as argv[0].
+ */
+static taskWaitingAck *createFlushInvalidationTask(va_list ap) {
+    taskWaitingAck *t = zcalloc(sizeof(*t));
+    t->argv = zmalloc(sizeof(void *));
+    t->argv[0] = va_arg(ap, void *);
+    return t;
+}
+
+/**
+ * Release a flush invalidation task.
+ */
+static void destroyFlushInvalidationTask(void *ptr) {
+    taskWaitingAck *t = ptr;
+    zfree(t->argv);
+    zfree(t);
+}
+
+/**
+ * Execute the flush invalidation task.
+ */ +static void executeFlushInvalidationTask(const taskWaitingAck *task) { + bool is_flush_all = (bool)task->argv[0]; + // Use DBID -1 for FLUSHALL, otherwise use 0 for DBID + // Note: This assumes the OSS Redis code below doesn't operate on the actual + // DBID besides differentiating between FLUSHDB and FLUSHALL. + trackingInvalidateKeysOnFlush(is_flush_all ? -1 : 0); +} + +/*================================= Default callback ========================= */ + +/** + * Default callback on client destroy doing no-op + */ +static void destroyClientDefaultCallback(void *task) { + UNUSED(task); + return; +} + +/*================================= Task Type Registry ======================= */ + +void initTaskTypes(void) { + taskTypes[DURABLE_KEYSPACE_NOTIFY_TASK] = (taskWaitingAckType){ + createKeyspaceNotifyTask, + destroyKeyspaceNotifyTask, + executeKeyspaceNotifyTask, + destroyClientDefaultCallback}; + taskTypes[DURABLE_KEY_INVALIDATION_TASK] = (taskWaitingAckType){ + createKeyInvalidationTask, + destroyKeyInvalidationTask, + executeKeyInvalidationTask, + destroyClientForKeyInvalidationTask}; + // needed + taskTypes[DURABLE_FLUSH_INVALIDATION_TASK] = (taskWaitingAckType){ + createFlushInvalidationTask, + destroyFlushInvalidationTask, + executeFlushInvalidationTask, + destroyClientDefaultCallback}; +} + +/*================================= Task Registration ======================== */ + +/** + * Create task based on the given task type and arguments, and append the new + * task to the end of the linkedlist of the pending tasks of that task type. + * + * Note that at this point in time, we might not know about the replication + * offset we want to configure this task with so we put it onto a pending list. + * And at a later point in time, when we know the replication offset, we would + * set it and move the task to the official tasks list. + */ +bool durabilityRegisterDeferredTask(int type, ...) 
{ + /* Check durability is active and the type is valid */ + if (!isPrimaryDurabilityEnabled() || (type == DURABLE_TASK_TYPE_MAX)) { + return false; + } + + va_list ap; + bool return_code = false; + va_start(ap, type); + taskWaitingAck *task = taskTypes[type].createTask(ap); + if (task) { + task->type = type; + if (server.current_client != NULL) { + // Here the notification is triggered by an incoming client request when we + // don't yet know the actual replication offset after command is applied, + // so we need to put it onto a pending tasks list. + listAddNodeTail(server.durability.pending_tasks_waiting_ack[type], task); + } else { + /* This notification is triggered from a background job such as + * active expiry or eviction outside of a regular client command. + * The replication offset is already updated so we use it directly. */ + task->offset = server.primary_repl_offset; + listAddNodeTail(server.durability.tasks_waiting_ack[type], task); + } + return_code = true; + } + va_end(ap); + return return_code; +} + +/*================================= Signal Handlers ========================== */ + +bool durabilitySignalModifiedKey(struct client *c, struct serverDb *db, struct serverObject *key) { + UNUSED(db); + /* Defer key invalidation messages until the durability providers acknowledge. */ + return durabilityRegisterDeferredTask(DURABLE_KEY_INVALIDATION_TASK, + (void *)c, (void *)key); +} + + +bool durabilitySignalFlushedDb(int dbid) { + /* Defer flush invalidation messages until the durability providers acknowledge. */ + return durabilityRegisterDeferredTask(DURABLE_FLUSH_INVALIDATION_TASK, + (void *)(long long)(dbid == -1)); +} + +/*================================= Task Execution =========================== */ + +/** + * Find and execute deferred tasks when 'consensus_ack_offset' is acked. 
+ */
+void executeDeferredTasksForAck(const long long consensus_ack_offset) {
+    listIter li;
+    listNode *ln;
+    struct durable_t *durability = &server.durability;
+
+    for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) {
+        listRewind(durability->tasks_waiting_ack[i], &li);
+        while ((ln = listNext(&li))) {
+            taskWaitingAck *task = listNodeValue(ln);
+            if (task->offset <= consensus_ack_offset) {
+                taskTypes[i].executeTask(task);
+                listDelNode(durability->tasks_waiting_ack[i], ln); /* current node is removed mid-iteration */
+            } else {
+                break; /* stop at the first un-acked task; presumably the list is in ascending offset order - confirm */
+            }
+        }
+    }
+}
+
+/**
+ * Move pending deferred tasks to the official list with the current replication offset.
+ *
+ * For every task type: each pending task must still have offset 0 (asserted)
+ * and is stamped with server.primary_repl_offset. Keyspace notify tasks also
+ * call moduleNotifyKeyspaceEvent() here with the arguments stored in the task.
+ * A non-empty pending list is then joined onto tasks_waiting_ack[i], and the
+ * pending list is asserted to be empty afterwards.
+ */
+void certifyPendingDeferredTasks(void) {
+    listIter li;
+    listNode *ln;
+    for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) {
+        listRewind(server.durability.pending_tasks_waiting_ack[i], &li);
+        while ((ln = listNext(&li))) {
+            taskWaitingAck *task = listNodeValue(ln);
+            serverAssert(task->offset == 0);
+            task->offset = server.primary_repl_offset;
+            if (task->type == DURABLE_KEYSPACE_NOTIFY_TASK) {
+                moduleNotifyKeyspaceEvent(
+                    /*type*/ (intptr_t)task->argv[0],
+                    /*event*/ (char *)task->argv[1],
+                    /*key*/ (robj *)task->argv[2],
+                    /*dbid*/ (intptr_t)task->argv[3]);
+            }
+        }
+        if (listLength(server.durability.pending_tasks_waiting_ack[i]) > 0) {
+            listJoin(server.durability.tasks_waiting_ack[i], server.durability.pending_tasks_waiting_ack[i]);
+        }
+        serverAssert(listLength(server.durability.pending_tasks_waiting_ack[i]) == 0);
+    }
+}
+
+/*================================= Client Lifecycle ========================= */
+
+/**
+ * Notify the task system that a client is being destroyed so that
+ * any tasks referencing it can de-reference the client pointer.
+ */
+void durableTaskNotifyClientDestroy(struct list *pending_notify_tasks) {
+    listIter li;
+    listNode *ln;
+    listRewind(pending_notify_tasks, &li);
+    while ((ln = listNext(&li))) {
+        taskWaitingAck *task = (taskWaitingAck *)listNodeValue(ln);
+        if (task) {
+            taskTypes[task->type].onClientDestroy(task); /* dispatch on the task's own type */
+        }
+    }
+}
+
+/*================================= Init / Cleanup =========================== */
+
+/**
+ * Initialize the task lists in the durability structure.
+ * Called from durabilityInit().
+ *
+ * Creates the acked-wait list and the pending list for every task type and
+ * installs that type's destroyTask handler as the free method of both lists.
+ * The handler pointers are read from taskTypes[] at this point, so
+ * taskTypes[] needs to be populated (initTaskTypes()) before this runs.
+ */
+void durableTaskInitLists(void) {
+    for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) {
+        server.durability.tasks_waiting_ack[i] = listCreate();
+        server.durability.pending_tasks_waiting_ack[i] = listCreate();
+        listSetFreeMethod(server.durability.tasks_waiting_ack[i],
+                          taskTypes[i].destroyTask);
+        listSetFreeMethod(server.durability.pending_tasks_waiting_ack[i],
+                          taskTypes[i].destroyTask);
+    }
+}
+
+/**
+ * Release (free) all task lists. Called from durabilityCleanup().
+ *
+ * Both list pointers of every task type are set to NULL after release.
+ */
+void durableTaskCleanupLists(void) {
+    for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) {
+        listRelease(server.durability.tasks_waiting_ack[i]);
+        server.durability.tasks_waiting_ack[i] = NULL;
+        listRelease(server.durability.pending_tasks_waiting_ack[i]);
+        server.durability.pending_tasks_waiting_ack[i] = NULL;
+    }
+}
+
+/**
+ * Empty (but don't free) all task lists. Called during primary state reset.
+ *
+ * For each type the acked-wait list is emptied before the pending list.
+ */
+void durableTaskEmptyLists(void) {
+    for (int i = 0; i < DURABLE_TASK_TYPE_MAX; i++) {
+        listEmpty(server.durability.tasks_waiting_ack[i]);
+        listEmpty(server.durability.pending_tasks_waiting_ack[i]);
+    }
+}
diff --git a/src/durable_task.h b/src/durable_task.h
new file mode 100644
index 00000000000..1c87b928fb8
--- /dev/null
+++ b/src/durable_task.h
@@ -0,0 +1,93 @@
+#ifndef DURABLE_TASK_H
+#define DURABLE_TASK_H
+
+/* Include feature-test macros early so _FILE_OFFSET_BITS=64 is defined
+ * before any system headers, ensuring off_t is 64-bit on 32-bit builds.
*/
+#include "fmacros.h"
+
+#include /* NOTE(review): header name is missing on this and the next directive; the bool prototypes below need <stdbool.h> - confirm both names */
+#include
+
+struct client;
+struct serverDb;
+struct serverObject;
+struct list;
+
+/**
+ * Define the supported task types for deferred work that executes
+ * after durability has been confirmed (replica ACK).
+ */
+typedef enum {
+    DURABLE_KEYSPACE_NOTIFY_TASK = 0, /* KEYSPACE NOTIFY task */
+    DURABLE_KEY_INVALIDATION_TASK,    /* Key invalidation task for client side caching */
+    DURABLE_FLUSH_INVALIDATION_TASK,  /* FLUSH invalidation task for client side caching */
+    DURABLE_TASK_TYPE_MAX             /* Number of task types; not a valid task type itself */
+} durableTaskType;
+
+/**
+ * Initialize the task type registry (create/destroy/execute handlers).
+ * Must be called before any task registration.
+ */
+void initTaskTypes(void);
+
+/**
+ * Register a deferred task for execution after the current replication
+ * offset is acknowledged by durability providers. The task is created
+ * from the variadic arguments based on the given task type.
+ *
+ * Returns true if the task was successfully registered, false otherwise.
+ */
+bool durabilityRegisterDeferredTask(int type, ...);
+
+/**
+ * Find and execute all deferred tasks whose offset <= consensus_ack_offset.
+ */
+void executeDeferredTasksForAck(long long consensus_ack_offset);
+
+/**
+ * Move pending tasks (registered during command execution before the
+ * replication offset was known) to the official tasks list, setting
+ * their offset to server.primary_repl_offset.
+ */
+void certifyPendingDeferredTasks(void);
+
+/**
+ * Notify the task system that a client is being destroyed so that
+ * any tasks referencing it can de-reference the client pointer.
+ * Iterates all tasks in the given pending_notify_tasks list.
+ */
+void durableTaskNotifyClientDestroy(struct list *pending_notify_tasks);
+
+/**
+ * Custom processing whenever a key gets modified. Invoked from signalModifiedKey().
+ * + * Return true if no further processing are required in signalModifiedKey() such + * as some async tasks are created which need some time to finish, false otherwise. + */ +bool durabilitySignalModifiedKey(struct client *c, struct serverDb *db, struct serverObject *key); + +/** + * Custom processing whenever a FLUSH happens. Invoked from signalFlushedDb(). + * + * Return true if no further processing are required in signalFlushedDb() such + * as some async tasks are created which need some time to finish, false otherwise. + */ +bool durabilitySignalFlushedDb(int dbid); + +/** + * Initialize the task lists in the durability structure. + * Called from durabilityInit(). + */ +void durableTaskInitLists(void); + +/** + * Release (free) all task lists. Called from durabilityCleanup(). + */ +void durableTaskCleanupLists(void); + +/** + * Empty (but don't free) all task lists. Called during primary state reset. + */ +void durableTaskEmptyLists(void); + +#endif /* DURABLE_TASK_H */ diff --git a/src/evict.c b/src/evict.c index ce602b25679..9c547829aa0 100644 --- a/src/evict.c +++ b/src/evict.c @@ -570,6 +570,7 @@ int performEvictions(void) { signalModifiedKey(NULL, db, keyobj); notifyKeyspaceEvent(NOTIFY_EVICTED, "evicted", keyobj, db->id); propagateDeletion(db, keyobj, server.lazyfree_lazy_eviction, bestslot); + if (isPrimaryDurabilityEnabled()) handleUncommittedKeyForClient(NULL, keyobj, db); exitExecutionUnit(); postExecutionUnitOperations(); decrRefCount(keyobj); diff --git a/src/io_threads.c b/src/io_threads.c index 3fe5e14d5c1..63b3d48b5e4 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -9,6 +9,7 @@ static _Thread_local int thread_id = 0; /* Thread local var */ static pthread_t io_threads[IO_THREADS_MAX_NUM] = {0}; static pthread_mutex_t io_threads_mutex[IO_THREADS_MAX_NUM]; +static size_t next_generic_job_rr = 0; /* Main-thread round-robin counter for generic IO jobs. */ /* IO jobs queue functions - Used to send jobs from the main-thread to the IO thread. 
*/ typedef void (*job_handler)(void *); @@ -131,6 +132,25 @@ int inMainThread(void) { return thread_id == 0; } +/* Attempts to offload a generic job to an IO thread. + * Returns C_OK if the job is enqueued, C_ERR otherwise. */ +int trySendJobToIOThreads(void (*handler)(void *), void *data) { + if (!inMainThread() || server.active_io_threads_num <= 1) return C_ERR; + size_t workers = (size_t)server.active_io_threads_num - 1; + size_t start = (next_generic_job_rr++ % workers) + 1; + + /* Distribute jobs across active IO threads and fall back to any + * available queue if the preferred one is full. */ + for (size_t i = 0; i < workers; i++) { + size_t tid = ((start - 1 + i) % workers) + 1; + IOJobQueue *jq = &io_jobs[tid]; + if (IOJobQueue_isFull(jq)) continue; + IOJobQueue_push(jq, handler, data); + return C_OK; + } + return C_ERR; +} + int getIOThreadID(void) { return thread_id; } @@ -167,14 +187,22 @@ void waitForClientIO(client *c) { } /** Adjusts the number of active I/O threads based on the current event load. - * If increase_only is non-zero, only allows increasing the number of threads.*/ -void adjustIOThreadsByEventLoad(int numevents, int increase_only) { + * If increase_only is non-zero, only allows increasing the number of threads. + * If has_background_work is non-zero, ensures at least one IO thread is active + * for background jobs like AOF fsync. */ +void adjustIOThreadsByEventLoad(int numevents, int increase_only, int has_background_work) { if (server.io_threads_num == 1) return; /* All I/O is being done by the main thread. */ debugServerAssertWithInfo(NULL, NULL, server.io_threads_num > 1); /* When events_per_io_thread is set to 0, we offload all events to the IO threads. * This is used mainly for testing purposes. */ int target_threads = server.events_per_io_thread == 0 ? 
(numevents + 1) : numevents / server.events_per_io_thread; + /* If there's background work (like AOF fsync), ensure at least 2 threads are active + * so generic jobs can be offloaded to IO threads. */ + if (has_background_work && target_threads < 2) { + target_threads = 2; + } + target_threads = max(1, min(target_threads, server.io_threads_num)); if (target_threads == server.active_io_threads_num) return; diff --git a/src/io_threads.h b/src/io_threads.h index 308dc6dbff8..4fa9ac6f898 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -6,11 +6,12 @@ void initIOThreads(void); void killIOThreads(void); int inMainThread(void); +int trySendJobToIOThreads(void (*handler)(void *), void *data); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); -void adjustIOThreadsByEventLoad(int numevents, int increase_only); +void adjustIOThreadsByEventLoad(int numevents, int increase_only, int has_background_work); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); int trySendAcceptToIOThreads(connection *conn); diff --git a/src/module.c b/src/module.c index de5a5510e40..aa9eae78981 100644 --- a/src/module.c +++ b/src/module.c @@ -6807,6 +6807,16 @@ ValkeyModuleCallReply *VM_Call(ValkeyModuleCtx *ctx, const char *cmdname, const if (!(flags & VALKEYMODULE_ARGV_NO_AOF)) call_flags |= CMD_CALL_PROPAGATE_AOF; if (!(flags & VALKEYMODULE_ARGV_NO_REPLICAS)) call_flags |= CMD_CALL_PROPAGATE_REPL; } + + // check if we need to reject the execution due to access to dirty data + char *pre_script_err = preScriptCmd(c); + if (pre_script_err != NULL) { + if (error_as_call_replies) { + reply_error_msg = sdsnew(pre_script_err); + } + goto cleanup; + } + call(c, call_flags); /* Propagate database changes from the temporary client back to the context client diff --git a/src/networking.c b/src/networking.c index ddc6137e0ad..01d1144b13f 100644 
--- a/src/networking.c +++ b/src/networking.c @@ -369,6 +369,12 @@ client *createClient(connection *conn) { c->io_last_written.buf = NULL; c->io_last_written.bufpos = 0; c->io_last_written.data_len = 0; + memset(&c->clientDurabilityInfo, 0, sizeof(c->clientDurabilityInfo)); + + // init durability info like + // key blocking on primary + durabilityClientInit(c); + return c; } @@ -1710,6 +1716,18 @@ void copyReplicaOutputBuffer(client *dst, client *src) { /* Return true if the specified client has pending reply buffers to write to * the socket. */ int clientHasPendingReplies(client *c) { + if (isClientReplyBufferLimited(c)) { + // Check if our first allowed reply boundary is in a position that comes + // after the current position that valkey has written up to in the COB. + const blockedResponse *n = listNodeValue(listFirst(c->clientDurabilityInfo.blocked_responses)); + if ((c->bufpos && n->disallowed_reply_block == NULL) || + (c->bufpos == 0 && n->disallowed_reply_block != NULL && listFirst(c->reply) == n->disallowed_reply_block)) { + // Both positions are pointing both at the initial 16KB buffer or the + // first reply block, compare the sentlen with the last allowed byte offset + return c->io_last_written.bufpos < n->disallowed_byte_offset; + } + } + if (getClientType(c) == CLIENT_TYPE_REPLICA) { /* Replicas use global shared replication buffer instead of * private output buffer. */ @@ -1909,6 +1927,8 @@ void unlinkClient(client *c) { /* Wait for IO operations to be done before unlinking the client. */ waitForClientIO(c); + durabilityClientReset(c); + /* If this is marked as current client unset it. 
*/ if (c->conn && server.current_client == c) server.current_client = NULL; @@ -3201,7 +3221,7 @@ int handleClientsWithPendingWrites(void) { /* Adjust the number of I/O threads based on the number of pending writes this is required in case pending_writes > * poll_events (for example in pubsub) */ - adjustIOThreadsByEventLoad(pending_writes, 1); + adjustIOThreadsByEventLoad(pending_writes, 1, 0); listIter li; listNode *ln; diff --git a/src/notify.c b/src/notify.c index d48c515b9da..c4ca1407585 100644 --- a/src/notify.c +++ b/src/notify.c @@ -117,16 +117,38 @@ void notifyKeyspaceEvent(int type, char *event, robj *key, int dbid) { c->flag.keyspace_notified == 1 || c->id == UINT64_MAX || // AOF client getClientType(c) != CLIENT_TYPE_NORMAL); - /* If any modules are interested in events, notify the module system now. - * This bypasses the notifications configuration, but the module engine - * will only call event subscribers if the event type matches the types - * they are interested in. */ - moduleNotifyKeyspaceEvent(type, event, key, dbid); - if (c) { - c->flag.keyspace_notified = 1; - commitDeferredReplyBuffer(c, 1); + + if (!(type & NOTIFY_IN_DURABLE_TASK)) { + if (isPrimaryDurabilityEnabled()) { + bool shouldSendDelayedNotificationToClients = (server.notify_keyspace_events & type); + + /* Defer client notifications until durability providers acknowledge the write. */ + if (shouldSendDelayedNotificationToClients) { + type = type | NOTIFY_IN_DURABLE_TASK; + /* Register deferred task, executed when offset is acknowledged + * by durability providers */ + durabilityRegisterDeferredTask( + DURABLE_KEYSPACE_NOTIFY_TASK, + (void *)(long)type, + (void *)event, + (void *)key, + (void *)(long)dbid); + } + + // At this point (ZDL branch), we have notified modules, or queued a task. For clients, + // there is never a direct notification (either queue the notification or nothing). 
+ return; + } + moduleNotifyKeyspaceEvent(type, event, key, dbid); + } else { + if (c) { + c->flag.keyspace_notified = 1; + commitDeferredReplyBuffer(c, 1); + } } + type = type & ~NOTIFY_IN_DURABLE_TASK; + /* If notifications for this class of events are off, return ASAP. */ if (!(server.notify_keyspace_events & type)) return; diff --git a/src/object.c b/src/object.c index 8d757b1c44b..8de29cbe38a 100644 --- a/src/object.c +++ b/src/object.c @@ -1160,6 +1160,16 @@ int getPositiveLongFromObjectOrReply(client *c, robj *o, long *target, const cha } } +int getIntFromObject(robj *o, int *target) { + long long value; + + if (getLongLongFromObject(o, &value) != C_OK) return C_ERR; + if (value < INT_MIN || value > INT_MAX) return C_ERR; + + *target = value; + return C_OK; +} + int getIntFromObjectOrReply(client *c, robj *o, int *target, const char *msg) { long value; diff --git a/src/replication.c b/src/replication.c index dd02c9a2814..4437a9da6f5 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1444,6 +1444,8 @@ void replconfCommand(client *c) { } } else if (!strcasecmp(objectGetVal(c->argv[j + 1]), REPLICA_CAPA_SKIP_RDB_CHECKSUM_STR)) c->repl_data->replica_capa |= REPLICA_CAPA_SKIP_RDB_CHECKSUM; + else if (!strcasecmp(objectGetVal(c->argv[j + 1]), "sync-replica")) + c->repl_data->replica_capa |= REPLICA_CAPA_SYNC; } else if (!strcasecmp(objectGetVal(c->argv[j]), "ack")) { /* REPLCONF ACK is used by replica to inform the primary the amount * of replication stream that it processed so far. It is an @@ -1458,6 +1460,22 @@ void replconfCommand(client *c) { if (offset > c->repl_data->repl_aof_off) c->repl_data->repl_aof_off = offset; } c->repl_data->repl_ack_time = server.unixtime; + + /* Sync replica ISR promotion: if this replica has the SYNC capability + * and its ack offset has caught up to the committed offset, promote + * it to the in-sync replica group (ISR). The committed offset is + * tracked as previous_acked_offset in the durability subsystem. 
*/ + if ((c->repl_data->replica_capa & REPLICA_CAPA_SYNC) && + !c->repl_data->is_in_sync && + c->repl_data->repl_state == REPLICA_STATE_ONLINE && + c->repl_data->repl_ack_off >= server.durability.previous_acked_offset) { + c->repl_data->is_in_sync = 1; + serverLog(LL_NOTICE, "Replica %s promoted to ISR (ack_off=%lld, committed_offset=%lld)", + replicationGetReplicaName(c), + c->repl_data->repl_ack_off, + server.durability.previous_acked_offset); + } + /* If this was a diskless replication, we need to really put * the replica online when the first ACK is received (which * confirms replica is online and ready to get more data). This @@ -1472,6 +1490,7 @@ void replconfCommand(client *c) { if (c->repl_data->repl_state == REPLICA_STATE_BG_RDB_LOAD) { replicaPutOnline(c); } + /* Note: this command does not reply anything! */ return; } else if (!strcasecmp(objectGetVal(c->argv[j]), "getack")) { @@ -1479,6 +1498,26 @@ void replconfCommand(client *c) { * to the replica. */ if (server.primary_host && server.primary) replicationSendAck(); return; + } else if (!strcasecmp(objectGetVal(c->argv[j]), "commit")) { + /* REPLCONF COMMIT is sent by the primary to inform + * replicas of the current committed (durable) offset. This + * is the highest offset that has been acknowledged by all + * required sync replicas on the primary side. + * + * On the replica, this directly advances the committed offset + * and unblocks any clients whose responses were held pending + * durability confirmation. This is the replica-side counterpart + * of notifyDurabilityProgress() on the primary. */ + long long offset; + if ((getLongLongFromObject(c->argv[j + 1], &offset) != C_OK)) return; + if (offset > server.durability.previous_acked_offset) { + server.durability.previous_acked_offset = offset; + drainCommittedKeys(offset); + unblockResponsesWithAckOffset(&server.durability, offset); + } + /* This command does not reply anything — it is injected + * into the replication stream like PING and GETACK. 
*/ + return; } else if (!strcasecmp(objectGetVal(c->argv[j]), "rdb-only")) { /* REPLCONF RDB-ONLY is used to identify the client only wants * RDB snapshot without replication buffer. */ @@ -3840,8 +3879,8 @@ int syncWithPrimaryHandleSendHandshakeState(connection *conn) { // we can ignore primary's conditions when sending capa (is_primary_stream_verified=1) int send_skip_rdb_checksum_capa = replicationSupportSkipRDBChecksum(conn, useDisklessLoad(), 1); - char *argv[9] = {"REPLCONF", "capa", "eof", "capa", "psync2", NULL, NULL, NULL, NULL}; - size_t lens[9] = {8, 4, 3, 4, 6, 0, 0, 0, 0}; + char *argv[11] = {"REPLCONF", "capa", "eof", "capa", "psync2", NULL, NULL, NULL, NULL, NULL, NULL}; + size_t lens[11] = {8, 4, 3, 4, 6, 0, 0, 0, 0, 0, 0}; int argc = 5; if (send_skip_rdb_checksum_capa) { argv[argc] = "capa"; @@ -3859,6 +3898,14 @@ int syncWithPrimaryHandleSendHandshakeState(connection *conn) { lens[argc] = strlen("dual-channel"); argc++; } + if (server.min_sync_replicas > 0 && server.sync_eligible) { + argv[argc] = "capa"; + lens[argc] = strlen("capa"); + argc++; + argv[argc] = "sync-replica"; + lens[argc] = strlen("sync-replica"); + argc++; + } err = sendCommandArgv(conn, argc, argv, lens); if (err) goto err; @@ -4435,6 +4482,7 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required, bool disc freeClient(server.primary); } + durabilityClearPrimaryState(); /* Setting primary_host only after the call to freeClient since it calls * replicationHandlePrimaryDisconnection which can trigger a re-connect * directly from within that call. */ @@ -4912,6 +4960,45 @@ int checkGoodReplicasStatus(void) { server.repl_good_replicas_count >= server.repl_min_replicas_to_write; /* check if we have enough replicas */ } +/* Return true if we have enough sync replicas in the ISR to accept writes. + * When min-sync-replicas is 0 (disabled), always returns true. + * When this node is a replica (has primary_host), always returns true. 
*/ +int checkSyncReplicasStatus(void) { + if (server.primary_host) return 1; /* Not a primary — OK */ + if (server.min_sync_replicas == 0) return 1; /* Feature disabled — OK */ + + listIter li; + listNode *ln; + int isr_count = 0; + + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_ONLINE && + replica->repl_data->is_in_sync) { + isr_count++; + } + } + return isr_count >= server.min_sync_replicas; +} + +/* Return the number of sync replicas currently in the ISR. */ +int getSyncReplicaCount(void) { + listIter li; + listNode *ln; + int count = 0; + + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + if (replica->repl_data->repl_state == REPLICA_STATE_ONLINE && + replica->repl_data->is_in_sync) { + count++; + } + } + return count; +} + /* ----------------------- SYNCHRONOUS REPLICATION -------------------------- * Synchronous replication design can be summarized in points: * @@ -5286,6 +5373,19 @@ void replicationCron(void) { } } + /* Send the committed offset to replicas so they know which data + * has been durably committed. This is used by the sync replication + * protocol — replicas use this to know what data is safe. */ + if (server.min_sync_replicas > 0 && listLength(server.replicas) && + server.durability.previous_acked_offset > 0) { + robj *commit_argv[3]; + commit_argv[0] = shared.replconf; + commit_argv[1] = shared.commit; + commit_argv[2] = createObject(OBJ_STRING, sdsfromlonglong(server.durability.previous_acked_offset)); + replicationFeedReplicas(-1, commit_argv, 3); + decrRefCount(commit_argv[2]); + } + /* Second, send a newline to all the replicas in pre-synchronization * stage, that is, replicas waiting for the primary to create the RDB file. * @@ -5313,6 +5413,29 @@ void replicationCron(void) { } } + /* Remove sync replicas from ISR if they haven't ACKed within the + * ISR timeout. 
#todo: @spaneru to expand this when lease mechanism is implemented */ + if (server.min_sync_replicas > 0 && getSyncReplicaCount() > 0) { + listIter li; + listNode *ln; + + listRewind(server.replicas, &li); + while ((ln = listNext(&li))) { + client *replica = ln->value; + + if (!replica->repl_data->is_in_sync) continue; + + time_t last_ack_age = server.unixtime - replica->repl_data->repl_ack_time; + if (last_ack_age > REPLICA_ISR_TIMEOUT) { + replica->repl_data->is_in_sync = 0; + serverLog(LL_WARNING, + "Removing replica %s from ISR: no ACK for %ld seconds (timeout=%d)", + replicationGetReplicaName(replica), + (long)last_ack_age, REPLICA_ISR_TIMEOUT); + } + } + } + /* Disconnect timedout replicas. */ if (listLength(server.replicas)) { listIter li; diff --git a/src/reply_blocking.c b/src/reply_blocking.c new file mode 100644 index 00000000000..4e5937b6252 --- /dev/null +++ b/src/reply_blocking.c @@ -0,0 +1,819 @@ +#include "server.h" +#include "zmalloc.h" +#include "script.h" +#include + +/* Forward declarations from module.h to avoid pulling in full module internals + * which has header dependency issues when included before server.h */ +int moduleClientIsBlockedOnKeys(client *c); + +/*============================ Internal prototypes ========================= */ +static void resetPreExecutionOffset(struct client *c); +static void trackCommandPreExecutionPosition(struct client *c); +static int unblockClientWaitingAck(struct client *c); +static bool clientEligibleForResponseTracking(client *c); +static void unblockFirstResponse(const struct client *c); +static int isBlockingNeededForOffset(const struct client *c, long long offset); +static void blockClientAndMonitorsOnReplOffset(struct client *c, long long blockingReplOffset); +static long long getSingleCommandBlockingOffsetForReplicatingCommand(client *c); +static long long getSingleCommandBlockingOffsetForNonReplicatingCommand(client *c); +static long long getSingleCommandBlockingOffsetForConsistentWrites(client 
*c);
+static void durabilityResetPrimaryState(bool is_free_clients_needed);
+
+
+/*================================= Utility functions ======================== */
+
+/**
+ * Tell whether the durability feature is currently active.
+ * The answer comes straight from the provider registry: the feature is on
+ * as soon as any registered durability provider reports itself enabled
+ * (e.g. the built-in AOF provider, with appendonly + appendfsync always).
+ */
+int isDurabilityEnabled(void) {
+    int any_provider_on = anyDurabilityProviderEnabled();
+    return any_provider_on;
+}
+
+/**
+ * Tell whether durability is active AND this node is a primary.
+ */
+int isPrimaryDurabilityEnabled(void) {
+    if (!isDurabilityEnabled()) return 0;
+    return iAmPrimary() ? 1 : 0;
+}
+
+/*================================= Client management ======================== */
+
+/**
+ * Forget any previously recorded pre-execution reply position for the client.
+ */
+static void resetPreExecutionOffset(struct client *c) {
+    c->clientDurabilityInfo.offset.byte_offset = 0;
+    c->clientDurabilityInfo.offset.reply_block = NULL;
+    c->clientDurabilityInfo.offset.recorded = false;
+}
+
+
+/**
+ * Remember where the client's reply output currently ends, before the command runs.
+ * When the reply list holds blocks, the position is the tail block and its used
+ * byte count; otherwise it is the fill level of the static buffer (bufpos).
+ */
+static void trackCommandPreExecutionPosition(struct client *c) {
+    resetPreExecutionOffset(c);
+
+    list *reply_list = c->reply;
+    int static_buf_used = c->bufpos;
+    bool has_reply_blocks = (reply_list != NULL) && (listLength(reply_list) > 0);
+
+    if (has_reply_blocks) {
+        listNode *tail = listLast(reply_list);
+        clientReplyBlock *tail_block = listNodeValue(tail);
+        c->clientDurabilityInfo.offset.reply_block = tail;
+        c->clientDurabilityInfo.offset.byte_offset = tail_block->used;
+    } else if (static_buf_used > 0) {
+        c->clientDurabilityInfo.offset.reply_block = NULL;
+        c->clientDurabilityInfo.offset.byte_offset = static_buf_used;
+    }
+
+    c->clientDurabilityInfo.offset.recorded = true;
+}
+
+/**
+ * If the client is currently waiting for durability acknowledgement,
+ * mark it unblocked and reset the client flags.
+ */
+static int unblockClientWaitingAck(struct client *c) {
+    if (!c->clientDurabilityInfo.durability_blocked) return 0;
+
+    list *waiters = server.durability.clients_waiting_ack;
+    listNode *entry = listSearchKey(waiters, c);
+    if (entry == NULL) return 0;
+
+    /* Found in the waiting list: unlink it and clear the per-client flag. */
+    listDelNode(waiters, entry);
+    c->clientDurabilityInfo.durability_blocked = 0;
+    return 1;
+}
+
+/**
+ * Set up the per-client durability state.
+ * Does nothing when durability is disabled or when the client already owns
+ * a blocked_responses list.
+ */
+void durabilityClientInit(client *c) {
+    if (!isDurabilityEnabled()) return;
+    if (c->clientDurabilityInfo.blocked_responses != NULL) return;
+
+    list *responses = listCreate();
+    listSetFreeMethod(responses, zfree);
+    c->clientDurabilityInfo.blocked_responses = responses;
+
+    resetPreExecutionOffset(c);
+    c->clientDurabilityInfo.current_command_repl_offset = -1;
+    c->clientDurabilityInfo.module_cmd_blocking_offset = -1;
+    c->clientDurabilityInfo.pending_notify_tasks = listCreate();
+}
+
+/**
+ * Tear down the per-client durability state during client clean-up.
+ */
+void durabilityClientReset(client *c) {
+    /* A client that goes away while still waiting for an ack is counted separately. */
+    if (unblockClientWaitingAck(c)) server.durability.clients_disconnected_before_unblocking++;
+
+    list *responses = c->clientDurabilityInfo.blocked_responses;
+    if (responses != NULL) {
+        listRelease(responses);
+        c->clientDurabilityInfo.blocked_responses = NULL;
+    }
+
+    list *notify_tasks = c->clientDurabilityInfo.pending_notify_tasks;
+    if (notify_tasks != NULL) {
+        durableTaskNotifyClientDestroy(notify_tasks);
+        listRelease(notify_tasks);
+        c->clientDurabilityInfo.pending_notify_tasks = NULL;
+    }
+
+    resetPreExecutionOffset(c);
+    c->clientDurabilityInfo.current_command_repl_offset = -1;
+    c->clientDurabilityInfo.module_cmd_blocking_offset = -1;
+}
+
+/**
+ * Determines if a client is doing a transaction or not.
+ */
+static bool isClientDoingTransaction(client *c) {
+    return c->cmd->proc == execCommand || IS_SCRIPT_CALL_CMD(c->cmd); /* caller must ensure c->cmd is non-NULL */
+}
+
+/**
+ * Returns true if the client is eligible for response tracking.
+ * [WBL] On a replica, the primary's replication connection must NOT be tracked —
+ * blocking it would stall the replication stream.
+ *
+ * Not eligible: clients with no current command, the primary link, and admin
+ * commands that are neither writes nor keyspace-informational.
+ * Eligible: write or read-only commands, transaction/script wrappers,
+ * keyspace-informational commands and function-store commands.
+ */
+static bool clientEligibleForResponseTracking(client *c) {
+    if (c->cmd == NULL) return false;
+
+    /* [WBL] Never block the replication stream from the primary. */
+    if (c->flag.primary) return false;
+
+    bool is_keyspace_informational_cmd = IS_KEYSPACE_INFORMATIONAL(c->cmd);
+
+    if ((c->cmd->flags & CMD_ADMIN) && !(c->cmd->flags & CMD_WRITE) && !is_keyspace_informational_cmd) {
+        return false;
+    }
+
+    return ((c->cmd->flags & (CMD_WRITE | CMD_READONLY)) || isClientDoingTransaction(c) || is_keyspace_informational_cmd || isFunctionStoreRWCommand(c));
+}
+
+/**
+ * Check if we only allow client to receive up to a certain
+ * position in the client reply buffer.
+ * True when the client has a blocked_responses list with at least one entry.
+ */
+inline bool isClientReplyBufferLimited(client *c) {
+    return c->clientDurabilityInfo.blocked_responses != NULL &&
+           listLength(c->clientDurabilityInfo.blocked_responses) > 0;
+}
+
+/*================================= Response blocking ======================= */
+
+/**
+ * Store the metrics for a command when blocking
+ * Starts the blocked-time timer and bumps the per-type "blocked" counter,
+ * classifying the response from the client's durability_flags.
+ * @param c The client that issued the command.
+ * @param br The Node at which commands are blocked.
+ */
+static inline void initCmdMetrics(const client *c, struct blockedResponse *br) {
+    if (!c->cmd) {
+        // If client command is NULL, eg Monitor clients, we do not start the timer
+        // because we do not emit metrics for this response.
+        br->blocked_command_timer = 0;
+        return;
+    }
+
+    elapsedStart(&br->blocked_command_timer);
+    // For end-to-end latency measurement
+
+    if (c->clientDurabilityInfo.durability_flags & DURABILITY_CLIENT_LAST_CMD_WRITE) {
+        server.durability.write_responses_blocked++;
+        br->cmd_type = DURABLE_BLOCKED_CMD_WRITE;
+    } else if (c->clientDurabilityInfo.durability_flags & DURABILITY_CLIENT_LAST_CMD_READONLY) {
+        server.durability.read_responses_blocked++;
+        br->cmd_type = DURABLE_BLOCKED_CMD_READ;
+    } else {
+        server.durability.other_responses_blocked++;
+        br->cmd_type = DURABLE_BLOCKED_CMD_OTHER;
+    }
+}
+
+
+/**
+ * Block the last response if it exists in the client output buffer.
+ *
+ * Compares the current end of the client's output with the position recorded
+ * before the command ran (offset.reply_block / offset.byte_offset). If output
+ * was appended since then, a blockedResponse is queued holding the first
+ * disallowed position and the replication offset it waits for.
+ * When the recorded position was already full, the boundary moves to byte 0
+ * of the next reply block (or of the first reply block when the recorded
+ * position was in the static buffer).
+ * Asserts that a pre-execution position was recorded for this client.
+ */
+static void blockLastResponseIfExist(const client *c, const long long blocked_offset) {
+    serverAssert(c->clientDurabilityInfo.offset.recorded);
+
+    bool has_new_response = false;
+    listNode *disallowed_reply_block =
+        c->clientDurabilityInfo.offset.reply_block;
+    size_t disallowed_byte_offset =
+        c->clientDurabilityInfo.offset.byte_offset;
+
+    if (disallowed_reply_block == NULL) { /* recorded position was in the static buffer */
+        if ((size_t)c->bufpos > disallowed_byte_offset) {
+            has_new_response = true;
+        } else if (listLength(c->reply) > 0) {
+            has_new_response = true;
+            disallowed_byte_offset = 0;
+            disallowed_reply_block = listFirst(c->reply);
+        }
+    } else { /* recorded position was inside a reply-list block */
+        const clientReplyBlock *last_reply_block = listNodeValue(disallowed_reply_block);
+        if (last_reply_block->used > disallowed_byte_offset) {
+            has_new_response = true;
+        } else if (disallowed_reply_block->next != NULL) {
+            has_new_response = true;
+            disallowed_byte_offset = 0;
+            disallowed_reply_block = disallowed_reply_block->next;
+        }
+    }
+
+    if (has_new_response) {
+        blockedResponse *new_block = zcalloc(sizeof(blockedResponse));
+        new_block->primary_repl_offset = blocked_offset;
+        new_block->disallowed_byte_offset = disallowed_byte_offset;
+        new_block->disallowed_reply_block = disallowed_reply_block;
+        initCmdMetrics(c, new_block);
+        listAddNodeTail(c->clientDurabilityInfo.blocked_responses, new_block);
+    }
+}
+
+
+/**
+ * Process the metrics of all commands blocked at a BlockedResponse while unblocking
+ * Adds the elapsed blocked time to the cumulative per-type total and bumps
+ * the matching "unblocked" counter.
+ * @param br The Node at which commands are blocked.
+ */
+static inline void processCmdMetrics(struct blockedResponse *br) {
+    if (br->blocked_command_timer == 0) return; // Do not count the response if timer is not started
+
+    unsigned long long duration = elapsedUs(br->blocked_command_timer);
+
+    if (br->cmd_type == DURABLE_BLOCKED_CMD_WRITE) {
+        server.durability.write_responses_blocked_cumulative_time_us += duration;
+        server.durability.write_responses_unblocked++;
+    } else if (br->cmd_type == DURABLE_BLOCKED_CMD_READ) {
+        server.durability.read_responses_blocked_cumulative_time_us += duration;
+        server.durability.read_responses_unblocked++;
+    } else {
+        server.durability.other_responses_blocked_cumulative_time_us += duration;
+        server.durability.other_responses_unblocked++;
+    }
+}
+/**
+ * Unblocks the first response in the client's blocked responses list.
+ * Records its metrics, then removes the head node; the list's free method
+ * (zfree, set in durabilityClientInit) releases the blockedResponse.
+ * A no-op on an empty list; asserts the list itself exists.
+ */
+static void unblockFirstResponse(const client *c) {
+    serverAssert(c->clientDurabilityInfo.blocked_responses != NULL);
+    if (listLength(c->clientDurabilityInfo.blocked_responses) > 0) {
+        listNode *first = listFirst(c->clientDurabilityInfo.blocked_responses);
+        processCmdMetrics(listNodeValue(first));
+        listDelNode(c->clientDurabilityInfo.blocked_responses, first);
+    }
+}
+
+/**
+ * Determines if we need to block on a given replication offset for a given client.
+ */
+static int isBlockingNeededForOffset(const client *c, const long long offset) {
+    /* -1 means "nothing to wait for"; no provider means nothing can ack. */
+    if (offset == -1 || anyDurabilityProviderEnabled() == 0) {
+        return 0;
+    }
+
+    if (listLength(c->clientDurabilityInfo.blocked_responses) == 0)
+        return 1;
+
+    /* Only add a new boundary when it waits for a later offset than the
+     * most recently queued blocked response. */
+    listNode *last_response = listLast(c->clientDurabilityInfo.blocked_responses);
+    long long previous_offset = ((blockedResponse *)listNodeValue(last_response))->primary_repl_offset;
+    return previous_offset < offset;
+}
+
+/**
+ * Block a given client on the specified replication offset if applicable.
+ *
+ * When blocking is needed, the output produced since the recorded
+ * pre-execution position is withheld and the client is added to
+ * server.durability.clients_waiting_ack (once). The recorded pre-execution
+ * position is always cleared before returning.
+ *
+ * This is also called for MONITOR clients, whose c->cmd may be NULL, so the
+ * debug log must not dereference c->cmd unconditionally.
+ */
+void blockClientOnReplOffset(client *c, const long long blockingReplOffset) {
+    serverAssert(isDurabilityEnabled());
+
+    if (isBlockingNeededForOffset(c, blockingReplOffset)) {
+        const char *cmd_name = c->cmd ? c->cmd->declared_name : "(none)";
+        int is_write = (c->cmd && (c->cmd->flags & CMD_WRITE)) ? 1 : 0;
+        serverLog(LL_DEBUG, "client should be blocked at offset %lld, cmd=%s, is_write=%d",
+                  blockingReplOffset, cmd_name, is_write);
+        blockLastResponseIfExist(c, blockingReplOffset);
+        if (!c->clientDurabilityInfo.durability_blocked) {
+            listAddNodeTail(server.durability.clients_waiting_ack, c);
+            c->clientDurabilityInfo.durability_blocked = 1;
+            server.durability.clients_blocked++;
+        }
+    }
+
+    resetPreExecutionOffset(c);
+}
+
+/**
+ * Utility function to determine whether a command should be replicated to monitors.
+ * True when at least one MONITOR client is attached and the server is not loading.
+ */
+static inline int isCommandReplicatedToMonitors(void) {
+    return listLength(server.monitors) && !server.loading;
+}
+
+/**
+ * Block a client and all connected MONITOR clients on the specified replication offset.
+ */
+static void blockClientAndMonitorsOnReplOffset(client *c, long long blockingReplOffset) {
+    blockClientOnReplOffset(c, blockingReplOffset);
+
+    if (isCommandReplicatedToMonitors()) {
+        listNode *ln;
+        listIter li;
+        listRewind(server.monitors, &li);
+        while ((ln = listNext(&li))) {
+            client *monitor = ln->value;
+            blockClientOnReplOffset(monitor, blockingReplOffset);
+        }
+    }
+}
+
+/*================================= Unblocking ============================== */
+
+/**
+ * Unblock responses and tasks of all blocked clients with a given consensus acked offset.
+ *
+ * For every client in durability->clients_waiting_ack, blocked responses are
+ * released from the head of its list while their offset is <= the acked
+ * offset. A client whose list becomes empty leaves the waiting list, and any
+ * client that had a response released is queued for writing. Deferred tasks
+ * for the acked offset are executed last.
+ */
+void unblockResponsesWithAckOffset(const durable_t *durability, const long long consensus_ack_offset) {
+    serverLog(LL_DEBUG, "unblocking clients for consensus offset %lld,", consensus_ack_offset);
+    listIter li, li_response;
+    listNode *ln, *ln_response;
+    listRewind(durability->clients_waiting_ack, &li);
+    blockedResponse *br = NULL;
+    while ((ln = listNext(&li))) {
+        client *c = ln->value;
+
+        serverAssert(c->clientDurabilityInfo.blocked_responses != NULL);
+        listRewind(c->clientDurabilityInfo.blocked_responses, &li_response);
+        bool unblocked_responses = false;
+
+        while ((ln_response = listNext(&li_response))) {
+            br = listNodeValue(ln_response);
+            if (br->primary_repl_offset <= consensus_ack_offset) {
+                unblockFirstResponse(c); /* deletes the head node, which is the node just visited */
+                if (unblocked_responses == false) {
+                    unblocked_responses = true;
+                }
+            } else {
+                break; /* stop at the first response still waiting for a later offset */
+            }
+        }
+        if (listLength(c->clientDurabilityInfo.blocked_responses) == 0) {
+            if (unblockClientWaitingAck(c)) { /* removes this client's node from the list being iterated */
+                server.durability.clients_unblocked++;
+            }
+        }
+        if (unblocked_responses) {
+            putClientInPendingWriteQueue(c);
+        }
+    }
+
+    executeDeferredTasksForAck(consensus_ack_offset);
+}
+
+/*================================= Post-ack handlers ======================= */
+
+void notifyDurabilityProgress(void) { /* no-op unless the consensus offset moved past the last acked one */
+    if (!isDurabilityEnabled()) {
+        return;
+    }
+
+    durable_t *durability = &server.durability;
+    const long long consensus_ack_offset = getDurabilityConsensusOffset();
+    if (consensus_ack_offset <= durability->previous_acked_offset) {
+        return;
+    }
+
+    durability->previous_acked_offset = consensus_ack_offset;
+    drainCommittedKeys(consensus_ack_offset);
+    unblockResponsesWithAckOffset(durability, consensus_ack_offset);
+}
+
+/*================================= Function Store Tracking ================== */
+
+bool isFunctionRWCommand(client *c) { /* argv[0] is FUNCTION (case-insensitive) and argv[1] is not HELP */
+    return (c->argc > 0 && (!strcasecmp(objectGetVal(c->argv[0]), "FUNCTION"))) && !(c->argc > 1 && !strcasecmp(objectGetVal(c->argv[1]), "HELP"));
+}
+
+bool isFunctionStoreRWCommand(client *c) { /* FUNCTION (non-HELP), FCALL or FCALL_RO; dereferences c->cmd */
+    return isFunctionRWCommand(c) || c->cmd->proc == fcallCommand || c->cmd->proc == fcallroCommand;
+}
+
+bool isDurableFunctionStoreUncommitted(void) { /* function-store offset is beyond the last acked offset */
+    return server.durability.func_store_blocking_offset > server.durability.previous_acked_offset;
+}
+
+void handleUncommittedFunctionStore(void) {
+    if (server.execution_nesting) {
+        /* nested execution: only flag it; the offset is set later by updateFuncStoreBlockingOffsetForWrite() */
+        server.durability.processed_func_write_in_transaction = true;
+    } else {
+        server.durability.func_store_blocking_offset = server.primary_repl_offset;
+    }
+}
+
+long long getFuncStoreBlockingOffset(void) {
+    return server.durability.func_store_blocking_offset;
+}
+
+void updateFuncStoreBlockingOffsetForWrite(long long blocking_repl_offset) { /* applies and clears the deferred flag */
+    if (server.durability.processed_func_write_in_transaction) {
+        server.durability.func_store_blocking_offset = blocking_repl_offset;
+        server.durability.processed_func_write_in_transaction = false;
+    }
+}
+
+/*========================== Command offset calculation ===================== */
+
+/**
+ * Process a single replicating command for consistent write blocking.
+ *
+ * Non-write commands yield -1. For FUNCTION commands the function store is
+ * marked uncommitted; for other writes every key found through the key specs
+ * is registered as uncommitted (MOVE/COPY also register the key in the
+ * destination DB). Returns server.primary_repl_offset when not nested inside
+ * another execution, otherwise -1.
+ */
+static long long getSingleCommandBlockingOffsetForReplicatingCommand(client *c) {
+    if (!(c->cmd->flags & CMD_WRITE)) {
+        return -1;
+    }
+
+    if (isFunctionRWCommand(c)) {
+        handleUncommittedFunctionStore();
+    } else {
+        getKeysResult result;
+        initGetKeysResult(&result);
+        /* Use key specs directly to extract key positions. We avoid
+         * getKeysFromCommand / getkeys_proc because some commands (e.g. SET)
+         * rewrite argv during execution (EX→PXAT) and the custom getkeys_proc
+         * may crash on the rewritten embedded-string robj. We only need key
+         * positions here, not per-key flags, so key specs are sufficient. */
+        int numkeys = getKeysUsingKeySpecs(c->cmd, c->argv, c->argc, GET_KEYSPEC_DEFAULT, &result);
+        keyReference *keys = result.keys;
+        if (numkeys > 0) {
+            if (c->cmd->proc == moveCommand) {
+                int dest_dbid = -1;
+                if (getIntFromObject(c->argv[2], &dest_dbid) == C_ERR) {
+                    getKeysFreeResult(&result);
+                    return -1;
+                }
+                /* NOTE(review): dest_dbid is not range-checked here; presumably safe because the command already replicated — confirm */
+                handleUncommittedKeyForClient(c, c->argv[keys[0].pos], server.db[dest_dbid]);
+            } else if (c->cmd->proc == copyCommand) {
+                int dest_dbid;
+                if (!getTargetDbIdForCopyCommand(c->argc, c->argv, c->db->id, &dest_dbid)) {
+                    getKeysFreeResult(&result);
+                    return -1;
+                }
+                if (dest_dbid != c->db->id) {
+                    handleUncommittedKeyForClient(c, c->argv[2], server.db[dest_dbid]);
+                }
+            }
+
+            for (int i = 0; i < numkeys; i++) {
+                handleUncommittedKeyForClient(c, c->argv[keys[i].pos], c->db);
+            }
+        }
+        getKeysFreeResult(&result);
+    }
+
+    if (!server.execution_nesting) {
+        return server.primary_repl_offset;
+    }
+
+    return -1;
+}
+
+/**
+ * Process a single non-replicating command for consistent write blocking.
+ *
+ * Returns the replication offset the reply must wait for, or -1 for none:
+ *  - function-store commands wait for the function store's blocking offset;
+ *  - read-only script calls never block;
+ *  - module commands use the offset the module recorded, when it set one;
+ *  - other read/write commands wait for the highest of the DB's dirty offset
+ *    and the uncommitted offsets of the keys they touch, including the
+ *    destination-DB key for MOVE/COPY.
+ *
+ * A command reaches this path when it did not replicate, which includes
+ * commands that failed. A failed MOVE/COPY can therefore carry a DB index
+ * that is out of range, so the index is validated before server.db[] is used.
+ */
+static long long getSingleCommandBlockingOffsetForNonReplicatingCommand(client *c) {
+    long long blocking_repl_offset = -1;
+
+    if (isFunctionStoreRWCommand(c)) {
+        blocking_repl_offset = getFuncStoreBlockingOffset();
+    } else if (IS_SCRIPT_CALL_READONLY_CMD(c->cmd)) {
+        return -1;
+    } else if ((c->cmd->flags & CMD_MODULE) && (c->clientDurabilityInfo.module_cmd_blocking_offset != -1)) {
+        blocking_repl_offset = c->clientDurabilityInfo.module_cmd_blocking_offset;
+    } else if (c->cmd->flags & (CMD_READONLY | CMD_WRITE)) {
+        blocking_repl_offset = c->db->dirty_repl_offset;
+        getKeysResult result;
+        initGetKeysResult(&result);
+        int numkeys = getKeysUsingKeySpecs(c->cmd, c->argv, c->argc, GET_KEYSPEC_DEFAULT, &result);
+        keyReference *keys = result.keys;
+
+        for (int i = 0; i < numkeys; i++) {
+            sds keystr = objectGetVal(c->argv[keys[i].pos]);
+            long long offset = durabilityPurgeAndGetUncommittedKeyOffset(keystr, c->db);
+            if (offset > blocking_repl_offset) {
+                blocking_repl_offset = offset;
+            }
+        }
+
+        /* COPY/MOVE may target a different DB; check the destination key there too.
+         * The destination index comes from client input and the command may have
+         * been rejected, so never index server.db[] with an unchecked value. */
+        if (c->cmd->proc == moveCommand) {
+            int dest_dbid = -1;
+            if (getIntFromObject(c->argv[2], &dest_dbid) == C_OK && dest_dbid != c->db->id &&
+                dest_dbid >= 0 && dest_dbid < server.dbnum) {
+                long long offset = durabilityPurgeAndGetUncommittedKeyOffset(objectGetVal(c->argv[1]), server.db[dest_dbid]);
+                if (offset > blocking_repl_offset) blocking_repl_offset = offset;
+            }
+        } else if (c->cmd->proc == copyCommand) {
+            int dest_dbid;
+            if (getTargetDbIdForCopyCommand(c->argc, c->argv, c->db->id, &dest_dbid) && dest_dbid != c->db->id &&
+                dest_dbid >= 0 && dest_dbid < server.dbnum) {
+                long long offset = durabilityPurgeAndGetUncommittedKeyOffset(objectGetVal(c->argv[2]), server.db[dest_dbid]);
+                if (offset > blocking_repl_offset) blocking_repl_offset = offset;
+            }
+        }
+
+        getKeysFreeResult(&result);
+    }
+
+    return blocking_repl_offset;
+}
+
+/**
+ * Process a single command for consistent write blocking.
+ */
+static long long getSingleCommandBlockingOffsetForConsistentWrites(struct client *c) {
+    serverAssert(isDurabilityEnabled());
+    if (!anyDurabilityProviderEnabled()) return -1;
+
+    long long candidate;
+
+    if (IS_KEYSPACE_INFORMATIONAL(c->cmd) &&
+        (listLength(server.durability.clients_waiting_ack) > 0 || hasUncommittedKeys() || isDurableFunctionStoreUncommitted())) {
+        /* Keyspace information cannot be trusted while any data is still dirty. */
+        candidate = server.primary_repl_offset;
+    } else if ((server.primary_repl_offset > server.durability.pre_call_replication_offset) ||
+               (server.also_propagate.numops > server.durability.pre_call_num_ops_pending_propagation)) {
+        /* The command replicated (or queued something for propagation). */
+        candidate = getSingleCommandBlockingOffsetForReplicatingCommand(c);
+    } else if (c->flag.primary && (c->cmd->flags & CMD_WRITE)) {
+        /* [WBL] Writes arriving on a replica through the replication stream
+         * do not advance primary_repl_offset (no sub-replicas), yet their
+         * keys must be tracked as uncommitted until REPLCONF COMMIT confirms
+         * the offset. Run the replicating-command path only to register the
+         * keys; the replication stream client itself is never blocked. */
+        (void)getSingleCommandBlockingOffsetForReplicatingCommand(c);
+        return -1;
+    } else {
+        candidate = getSingleCommandBlockingOffsetForNonReplicatingCommand(c);
+    }
+
+    /* An offset that is already acknowledged needs no blocking. */
+    return (candidate > server.durability.previous_acked_offset) ? candidate : -1;
+}
+
+static void durabilitySetClientCmdFlags(client *c) {
+    // Wrapper commands that open a transaction (eval, exec, fcall, ...) must not decide
+    // whether the transaction counts as a read or a write; the commands executed inside
+    // the transaction make that classification.
+    if (isClientDoingTransaction(c)) return;
+
+    if (c->cmd->flags & CMD_WRITE) {
+        c->clientDurabilityInfo.durability_flags |= DURABILITY_CLIENT_LAST_CMD_WRITE;
+        return;
+    }
+    if (c->cmd->flags & CMD_READONLY) {
+        c->clientDurabilityInfo.durability_flags |= DURABILITY_CLIENT_LAST_CMD_READONLY;
+    }
+}
+
+/*=========================== Command hook functions ======================= */
+
+/**
+ * Snapshot the replication offset and the number of operations pending
+ * propagation right before a single command executes.
+ */
+void beforeCommandTrackReplOffset(struct client *c) {
+    if (!isDurabilityEnabled()) return;
+
+    durabilitySetClientCmdFlags(c);
+
+    server.durability.pre_call_replication_offset = server.primary_repl_offset;
+    server.durability.pre_call_num_ops_pending_propagation = server.also_propagate.numops;
+}
+
+static bool isClientBlockedByModule(struct client *c) {
+    if (!c->flag.blocked) return false;
+    if (!c->bstate) return false;
+    if (c->bstate->btype != BLOCKED_MODULE) return false;
+    return !moduleClientIsBlockedOnKeys(c);
+}
+
+/**
+ * Once a command has run, work out the offset its reply must wait for and
+ * raise the tracking client's current_command_repl_offset to it if higher.
+ */
+void afterCommandTrackReplOffset(client *c) {
+    serverLog(LL_DEBUG, "afterCommandTrackReplOffset entered for command '%s'", c->cmd->declared_name);
+
+    if (!isDurabilityEnabled()) return;
+    /* A blocked client is skipped unless it is blocked by a module (and not on keys). */
+    if (c->flag.blocked && !isClientBlockedByModule(c)) return;
+
+    long long cmd_offset = getSingleCommandBlockingOffsetForConsistentWrites(c);
+
+    client *owner = c;
+    if (server.current_client) owner = server.current_client;
+
+    if (cmd_offset > owner->clientDurabilityInfo.current_command_repl_offset) {
+        owner->clientDurabilityInfo.current_command_repl_offset = cmd_offset;
+    }
+
+    handleDatabaseModification(c);
+}
+
+char *preScriptCmd(client *c) {
+    UNUSED(c);
+    return NULL;
+}
+
+/**
+ * Perform pre-processing before command execution for a given client.
+ */
+int preCommandExec(client *c) {
+    c->clientDurabilityInfo.module_cmd_blocking_offset = -1;
+    c->clientDurabilityInfo.current_command_repl_offset = -1;
+
+    if (isDurabilityEnabled() && clientEligibleForResponseTracking(c)) {
+        trackCommandPreExecutionPosition(c);
+
+        /* MONITOR clients receive a copy of the command, so record their positions too. */
+        if (isCommandReplicatedToMonitors()) {
+            listIter mon_iter;
+            listNode *mon_node;
+            listRewind(server.monitors, &mon_iter);
+            for (mon_node = listNext(&mon_iter); mon_node != NULL; mon_node = listNext(&mon_iter)) {
+                trackCommandPreExecutionPosition(mon_node->value);
+            }
+        }
+    }
+
+    server.durability.pre_command_replication_offset = server.primary_repl_offset;
+    return CMD_FILTER_ALLOW;
+}
+
+/**
+ * Perform post-processing after command execution for a given client.
+ */
+void postCommandExec(client *c) {
+    if (!isDurabilityEnabled()) return;
+    if (c->cmd == NULL) return;
+    if (c->flag.multi) return;
+
+    long long wait_offset = c->clientDurabilityInfo.current_command_repl_offset;
+
+    /* If the command block advanced the replication offset, a write or a
+     * transaction waits for the new offset; SYNC, CLUSTER and SHUTDOWN are excluded. */
+    if (server.primary_repl_offset > server.durability.pre_command_replication_offset) {
+        bool mutating = (c->cmd->flags & CMD_WRITE) || isClientDoingTransaction(c);
+        bool excluded = c->cmd->proc == syncCommand || c->cmd->proc == clusterCommand || c->cmd->proc == shutdownCommand;
+        if (mutating && !excluded) {
+            wait_offset = server.primary_repl_offset;
+        }
+    }
+
+    if (wait_offset > server.durability.pre_command_replication_offset) {
+        serverAssert(clientEligibleForResponseTracking(c));
+    }
+
+    processPendingUncommittedData(server.primary_repl_offset);
+    blockClientAndMonitorsOnReplOffset(c, wait_offset);
+    certifyPendingDeferredTasks();
+}
+
+/*================================= Lifecycle =============================== */
+
+/**
+ * Bring the durability subsystem to its initial state.
+ */
+void durabilityInit(void) {
+    serverLog(LL_DEBUG, "Initializing durability subsystem");
+
+    /* Pending data for uncommitted keys */
+    uncommittedKeysInitPending();
+
+    /* Task handlers have to be initialized before they are used. */
+    initTaskTypes();
+
+    durable_t *d = &server.durability;
+    d->previous_acked_offset = -1;
+    d->clients_waiting_ack = listCreate();
+    durableTaskInitLists();
+
+    /* Client counters */
+    d->clients_blocked = 0;
+    d->clients_unblocked = 0;
+    d->clients_disconnected_before_unblocking = 0;
+
+    /* Response counters */
+    d->read_responses_blocked = 0;
+    d->write_responses_blocked = 0;
+    d->other_responses_blocked = 0;
+    d->read_responses_unblocked = 0;
+    d->write_responses_unblocked = 0;
+    d->other_responses_unblocked = 0;
+
+    /* Cumulative blocked times */
+    d->read_responses_blocked_cumulative_time_us = 0;
+    d->write_responses_blocked_cumulative_time_us = 0;
+    d->other_responses_blocked_cumulative_time_us = 0;
+
+    /* Function store blocking state */
+    d->all_dbs_dirty_in_current_cmd = false;
+    d->func_store_blocking_offset = -1;
+    d->processed_func_write_in_transaction = false;
+
+    /* Built-in durability providers (AOF) */
+    registerBuiltinDurabilityProviders();
+}
+
+/**
+ * Release the durability subsystem's resources on server shutdown.
+ */
+void durabilityCleanup(void) {
+    list *waiters = server.durability.clients_waiting_ack;
+    if (waiters != NULL) {
+        listRelease(waiters);
+        server.durability.clients_waiting_ack = NULL;
+    }
+
+    uncommittedKeysCleanupPending();
+
+    /* Deferred tasks still waiting for a durability ack */
+    durableTaskCleanupLists();
+
+    /* Clear the provider registry so that it can be initialized again */
+    resetDurabilityProviders();
+
+    clearAllUncommittedKeys();
+}
+
+/**
+ * Disconnect and free clients waiting for durability ack.
+ * Every client in the waiting list is passed to freeClient(); the list is
+ * then emptied so that no stale entry can remain.
+ */
+static void freeClientsWaitingAck(const durable_t *durability) {
+    listIter li;
+    listNode *ln;
+    listRewind(durability->clients_waiting_ack, &li);
+    while ((ln = listNext(&li))) {
+        client *c = listNodeValue(ln);
+        freeClient(c);
+    }
+    listEmpty(durability->clients_waiting_ack);
+}
+
+/**
+ * Reset primary state for the durability subsystem.
+ * Waiting clients are either freed (is_free_clients_needed) or have all their
+ * responses released by acknowledging LLONG_MAX. Deferred task lists are
+ * emptied in both cases.
+ */
+static void durabilityResetPrimaryState(bool is_free_clients_needed) {
+    if (listLength(server.durability.clients_waiting_ack) > 0) {
+        if (is_free_clients_needed) {
+            freeClientsWaitingAck(&server.durability);
+        } else {
+            unblockResponsesWithAckOffset(&server.durability, LLONG_MAX);
+        }
+        serverAssert(listLength(server.durability.clients_waiting_ack) == 0);
+    }
+    durableTaskEmptyLists();
+}
+
+/**
+ * Clear the durability attributes specific to the primary.
+ * Invoked when a primary node becomes a replica.
+ */
+void durabilityClearPrimaryState(void) {
+    if (!isDurabilityEnabled()) return;
+    durabilityResetPrimaryState(true);
+}
+
+/**
+ * Generate INFO string for durability stats.
+ * Appends to `info` and returns the (possibly reallocated) sds. When
+ * durability is disabled only "durability_enabled:0" is emitted.
+ */
+sds genDurabilityInfoString(sds info) {
+    if (!isDurabilityEnabled()) {
+        info = sdscatprintf(info, "durability_enabled:0\r\n");
+        return info;
+    }
+
+    /* The blocked-response counters are unsigned long long, so they are
+     * printed with %llu; the offsets are signed long long and keep %lld. */
+    info = sdscatprintf(info,
+                        "durability_enabled:1\r\n"
+                        "durability_read_blocked_count:%llu\r\n"
+                        "durability_write_blocked_count:%llu\r\n"
+                        "durability_clients_waiting_ack:%lu\r\n"
+                        "durability_uncommitted_keys:%llu\r\n"
+                        "durability_committed_offset:%lld\r\n"
+                        "durability_primary_repl_offset:%lld\r\n"
+                        "durability_sync_replicas:%d\r\n"
+                        "durability_min_sync_replicas:%d\r\n",
+                        server.durability.read_responses_blocked,
+                        server.durability.write_responses_blocked,
+                        listLength(server.durability.clients_waiting_ack),
+                        getNumberOfUncommittedKeys(),
+                        server.durability.previous_acked_offset,
+                        server.primary_repl_offset,
+                        getSyncReplicaCount(),
+                        server.min_sync_replicas);
+
+    return info;
+}
+
+/**
+ * Reset related resources when enabling/disabling durability.
+ */
+void durabilityReset(void) {
+    if (isDurabilityEnabled()) { /* durability is on: prepare every existing client */
+        server.durability.pre_command_replication_offset = server.primary_repl_offset;
+        listIter li;
+        listNode *ln;
+        listRewind(server.clients, &li);
+        while ((ln = listNext(&li)) != NULL) {
+            client *c = listNodeValue(ln);
+            durabilityClientInit(c); /* returns early for clients that already have a blocked_responses list */
+        }
+    } else { /* durability is off: release whatever is still held back */
+        if (iAmPrimary()) {
+            durabilityResetPrimaryState(false); /* false: release the blocked responses instead of freeing the clients */
+        }
+        clearAllUncommittedKeys();
+    }
+}
diff --git a/src/reply_blocking.h b/src/reply_blocking.h
new file mode 100644
index 00000000000..e7fe96ebd42
--- /dev/null
+++ b/src/reply_blocking.h
@@ -0,0 +1,221 @@
+#ifndef REPLY_BLOCKING_H
+#define REPLY_BLOCKING_H
+
+/* Include feature-test macros early so _FILE_OFFSET_BITS=64 is defined
+ * before any system headers, ensuring off_t is 64-bit on 32-bit builds. */
+#include "fmacros.h"
+
+#include /* NOTE(review): the header names on these three include lines are missing, presumably stripped during extraction — restore from upstream */
+#include
+#include
+#include "expire.h"
+#include "monotonic.h"
+#include "sds.h"
+#include "durability_provider.h"
+#include "uncommitted_keys.h"
+#include "durable_task.h"
+
+/* Command filter codes that are used in pre execution stage of a command. */
+#define CMD_FILTER_ALLOW 0
+#define CMD_FILTER_REJECT 1
+// Returns true if the cmd is a script command that never replicates
+// (FCALL_RO, EVAL_RO, EVALSHA_RO). A NULL cmd yields false.
+#define IS_SCRIPT_CALL_READONLY_CMD(cmd) ((cmd) && (((cmd)->proc == fcallroCommand) || ((cmd)->proc == evalRoCommand) || ((cmd)->proc == evalShaRoCommand)))
+
+// Returns true if the cmd is a script command
+// (EVAL/EVAL_RO/EVALSHA/EVALSHA_RO/FCALL/FCALL_RO). A NULL cmd yields false.
+#define IS_SCRIPT_CALL_CMD(cmd) ((cmd) && (((cmd)->proc == fcallCommand) || ((cmd)->proc == fcallroCommand) || ((cmd)->proc == evalCommand) || ((cmd)->proc == evalRoCommand) || ((cmd)->proc == evalShaCommand) || ((cmd)->proc == evalShaRoCommand)))
+
+// Returns true if the cmd is a keyspace informational command — a command that is
+// related to the keyspace (ACL_CATEGORY_KEYSPACE) but does not mutate it (not CMD_WRITE).
+// These commands provide information about the keyspace and need to be tracked for +// durability response blocking even when they are admin or non-read/non-write commands. +#define IS_KEYSPACE_INFORMATIONAL(cmd) ((cmd) && ((cmd)->acl_categories & ACL_CATEGORY_KEYSPACE) && !((cmd)->flags & CMD_WRITE)) + +/* Flags below help in correctly classifying transactions as + * either read/write commands or non-keyspace commands. */ +// Indicates the client's last command was a mutative command. +#define DURABILITY_CLIENT_LAST_CMD_WRITE (1ULL << 20) +// Indicates the client's last command was read-only command. */ +#define DURABILITY_CLIENT_LAST_CMD_READONLY (1ULL << 21) + +struct client; +struct serverObject; +struct serverDb; +struct list; +struct listNode; + +typedef long long mstime_t; + +/* Indicate this type of notification is called inside of a durable task, + * which is used by the durability feature to defer notifications. */ +#define NOTIFY_IN_DURABLE_TASK (1 << 30) +/** + * Durability container to house all the durability related fields. + */ +typedef struct durable_t { + /* Clients waiting for offset acknowledgement from durability providers */ + struct list *clients_waiting_ack; + + /* Deferred tasks waiting for offset acknowledgement from durability providers */ + struct list *tasks_waiting_ack[DURABLE_TASK_TYPE_MAX]; + + /* Pending lists of tasks waiting for durability ack. This list is populated + * when the current command is under execution but before we know about the + * updated primary_repl_offset. 
After the command execution completes, the + * server.primary_repl_offset would get incremented and we need to update + * this list and move all the pending tasks to the official + * tasks_waiting_ack list as part of the post-execution logic + */ + struct list *pending_tasks_waiting_ack[DURABLE_TASK_TYPE_MAX]; + + /* Previously acknowledged replication offset by durability providers */ + long long previous_acked_offset; + + /* Track the replication offset prior to executing a single command in call() */ + long long pre_call_replication_offset; + + /* Track the replication offset prior to executing a command block + including single command and multi-command transactions */ + long long pre_command_replication_offset; + + /* Track the number of commands awaiting propagation prior to executing a single command in call() */ + int pre_call_num_ops_pending_propagation; + + /* Counters for stats / info */ + + /* Counter of how many clients are blocked for durability */ + unsigned long long clients_blocked; + /* Counter of how many clients are unblocked for durability */ + unsigned long long clients_unblocked; + /* Counter of how many clients are disconnected before being unblocked for durability */ + unsigned long long clients_disconnected_before_unblocking; + /* Counter of how many responses are blocked/unblocked by type */ + unsigned long long read_responses_blocked; + unsigned long long write_responses_blocked; + unsigned long long other_responses_blocked; + unsigned long long read_responses_unblocked; + unsigned long long write_responses_unblocked; + unsigned long long other_responses_unblocked; + + /* Cumulative times for all the blocked responses */ + unsigned long long read_responses_blocked_cumulative_time_us; + unsigned long long write_responses_blocked_cumulative_time_us; + unsigned long long other_responses_blocked_cumulative_time_us; + + /* Tracks whether all databases were dirtied during the current command + * within a multi-command block (MULTI/EXEC or Lua 
script). */ + bool all_dbs_dirty_in_current_cmd; + + /* Function store blocking offset: tracks the replication offset at which + * the function store was last modified and needs durability acknowledgement. */ + long long func_store_blocking_offset; + + /* Flag indicating a function write occurred inside a transaction, so the + * blocking offset should be updated when the transaction completes. */ + bool processed_func_write_in_transaction; +} durable_t; + +/** + * Define the type of command being blocked + */ +typedef enum { + DURABLE_BLOCKED_CMD_OTHER = 0, + DURABLE_BLOCKED_CMD_WRITE, + DURABLE_BLOCKED_CMD_READ +} durableBlockedCmdType; + +// Blocked response structure used by client to mark +// the blocking information associated with each response +typedef struct blockedResponse { + // Pointer to the client's reply node where the blocked response starts. + // NULL if the blocked response starts from the 16KB initial buffer + struct listNode *disallowed_reply_block; + // The boundary in the reply buffer where the blocked response starts. + size_t disallowed_byte_offset; + // The replication offset to wait for acknowledgement from durability providers + long long primary_repl_offset; + + // Enum to store the type of blocked command + durableBlockedCmdType cmd_type; + // Timer for blocked command + monotime blocked_command_timer; +} blockedResponse; + +// Describes a pre-execution COB offset for a client +typedef struct preExecutionOffsetPosition { + // True if the pre execution offset/reply block are initialized + bool recorded; + // Track initial client COB position for client blocking + struct listNode *reply_block; + // Byte position boundary within the pre-execution reply block + size_t byte_offset; +} preExecutionOffsetPosition; + +typedef struct clientDurabilityInfo { + // Blocked client responses list for durability + struct list *blocked_responses; + + /* Pre-execution data recorded before a command is executed + * to record the boundaries of the COB. 
*/ + preExecutionOffsetPosition offset; + + // Replication offset to block this current command response + long long current_command_repl_offset; + + // The list of async notification tasks that reference this client + struct list *pending_notify_tasks; + + // This client is waiting for durability providers to acknowledge + // the write before its response can be sent. + uint64_t durability_blocked : 1; + // Modules can set the blocking offset for read cmds + long long module_cmd_blocking_offset; + + uint64_t durability_flags; +} clientDurableInfo; + +/** + * Init / Lifecycle + */ +void durabilityInit(void); +void durabilityCleanup(void); +void durabilityReset(void); +void durabilityClientInit(struct client *c); +void durabilityClientReset(struct client *c); +void durabilityClearPrimaryState(void); + +/** + * Command processing hooks for offset and COB tracking + */ +void beforeCommandTrackReplOffset(client *c); +void afterCommandTrackReplOffset(client *c); +int preCommandExec(client *c); +char *preScriptCmd(client *c); +void postCommandExec(client *c); +void notifyDurabilityProgress(void); + +/** + * Response blocking + */ +void blockClientOnReplOffset(client *c, long long blockingReplOffset); +void unblockResponsesWithAckOffset(const durable_t *durability, long long consensus_ack_offset); + +/** + * Utils + */ +int isPrimaryDurabilityEnabled(void); +int isDurabilityEnabled(void); +bool isClientReplyBufferLimited(client *c); +sds genDurabilityInfoString(sds info); + +/** + * Function store dirty tracking (durability blocking for function store writes) + */ +bool isFunctionRWCommand(struct client *c); +bool isFunctionStoreRWCommand(struct client *c); +bool isDurableFunctionStoreUncommitted(void); +void handleUncommittedFunctionStore(void); +void updateFuncStoreBlockingOffsetForWrite(long long blocking_repl_offset); +long long getFuncStoreBlockingOffset(void); + +#endif /* REPLY_BLOCKING_H */ diff --git a/src/script.c b/src/script.c index 156fb12cf2b..a3781302d00 
100644 --- a/src/script.c +++ b/src/script.c @@ -202,6 +202,13 @@ int scriptPrepareForRun(scriptRunCtx *run_ctx, return C_ERR; } + // check if sync replication would want to stop the execution. + const char *pre_script_err = preScriptCmd(caller); + if (pre_script_err != NULL) { + addReplyError(caller, pre_script_err); + return C_ERR; + } + } else { /* Special handling for backwards compatibility (no shebang eval[sha]) mode */ if (running_stale) { diff --git a/src/server.c b/src/server.c index c076f0714ea..2e2d955ece8 100644 --- a/src/server.c +++ b/src/server.c @@ -1828,7 +1828,14 @@ void beforeSleep(struct aeEventLoop *eventLoop) { processed += processIOThreadsReadDone(); processed += connTypeProcessPendingData(); if (server.aof_state == AOF_ON || server.aof_state == AOF_WAIT_REWRITE) flushAppendOnlyFile(0); - processed += handleClientsWithPendingWrites(); + /* When appendfsync=always and the AOF fsync has been offloaded to an + * IO thread, skip flushing replies to clients until the fsync completes. + * This ensures clients don't receive responses before their writes are + * durable on disk. The next beforeSleep iteration will flush replies + * once processAofIOThreadFlushResult() confirms the fsync is done. */ + if (!(server.aof_fsync == AOF_FSYNC_ALWAYS && aofIOFlushInProgress())) { + processed += handleClientsWithPendingWrites(); + } int last_processed = 0; do { /* Try to process all the pending IO events. */ @@ -1935,6 +1942,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { * wake them up ASAP. */ if (listLength(server.clients_waiting_acks) && prev_fsynced_reploff != server.fsynced_reploff) dont_sleep = 1; } + notifyDurabilityProgress(); /* Handle writes with pending output buffers. 
*/ int client_writes = handleClientsWithPendingWrites(); @@ -2048,7 +2056,11 @@ void afterSleep(struct aeEventLoop *eventLoop, int numevents) { server.cmd_time_snapshot = server.mstime; } - adjustIOThreadsByEventLoad(numevents, 0); + /* Check if AOF always-fsync needs IO threads for background work */ + int aof_needs_io = (server.aof_state != AOF_OFF && + server.aof_fsync == AOF_FSYNC_ALWAYS && + sdslen(server.aof_buf) > 0); + adjustIOThreadsByEventLoad(numevents, 0, aof_needs_io); } /* =========================== Server initialization ======================== */ @@ -2128,6 +2140,7 @@ void createSharedObjects(void) { shared.oomerr = createSharedString("-OOM command not allowed when used memory > 'maxmemory'.\r\n"); shared.execaborterr = createSharedString("-EXECABORT Transaction discarded because of previous errors.\r\n"); shared.noreplicaserr = createSharedString("-NOREPLICAS Not enough good replicas to write.\r\n"); + shared.nosyncreplicas = createSharedString("-CLUSTERDOWN Not enough sync replicas to accept writes.\r\n"); shared.busykeyerr = createSharedString("-BUSYKEY Target key name already exists.\r\n"); /* The shared NULL depends on the protocol version. 
*/ @@ -2218,6 +2231,7 @@ void createSharedObjects(void) { shared.load = createSharedString("LOAD"); shared.createconsumer = createSharedString("CREATECONSUMER"); shared.getack = createSharedString("GETACK"); + shared.commit = createSharedString("commit"); shared.special_asterisk = createSharedString("*"); shared.special_equals = createSharedString("="); shared.redacted = createSharedString("(redacted)"); @@ -2297,6 +2311,9 @@ void initServerConfig(void) { server.aof_flush_sleep = 0; server.aof_last_fsync = time(NULL) * 1000; server.aof_cur_timestamp = 0; + atomic_store_explicit(&server.aof_io_flush_state, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_errno, 0, memory_order_relaxed); + atomic_store_explicit(&server.aof_io_flush_size, 0, memory_order_relaxed); atomic_store_explicit(&server.aof_bio_fsync_status, C_OK, memory_order_relaxed); server.aof_rewrite_time_last = -1; server.aof_rewrite_time_start = -1; @@ -2868,6 +2885,8 @@ serverDb *createDatabase(int id) { db->ready_keys = dictCreate(&objectKeyPointerValueDictType); db->watched_keys = dictCreate(&keylistDictType); db->id = id; + + durabilityInitDatabase(db); resetDbExpiryState(db); return db; } @@ -3096,6 +3115,8 @@ void initServer(void) { /* Initialize the EVAL scripting component. */ evalInit(); + durabilityInit(); + applyWatchdogPeriod(); if (server.maxmemory_clients != 0) initServerClientMemUsageBuckets(); @@ -3833,6 +3854,7 @@ void call(client *c, int flags) { struct ClientFlags client_old_flags = c->flag; struct serverCommand *real_cmd = c->realcmd; + beforeCommandTrackReplOffset(c); client *prev_client = server.executing_client; server.executing_client = c; @@ -4031,6 +4053,11 @@ void call(client *c, int flags) { /* Do some maintenance job and cleanup */ afterCommand(c); + /* Track replication offset for durability blocking. This must stay + * here rather than inside afterCommand() because afterCommand() is + * also invoked from nested call() contexts (e.g. 
propagatePendingCommands) + * where the client argv may no longer be valid. */ + afterCommandTrackReplOffset(c); /* Remember the replication offset of the client, right after its last * command that resulted in propagation. */ @@ -4479,6 +4506,13 @@ int processCommand(client *c) { return C_OK; } + /* Don't accept write commands if there are not enough sync replicas + * in the ISR and user configured min-sync-replicas. */ + if (is_write_command && !checkSyncReplicasStatus()) { + rejectCommand(c, shared.nosyncreplicas); + return C_OK; + } + /* Don't accept write commands if this is a read only replica. But * accept write commands if this is our primary. */ if (server.primary_host && server.repl_replica_ro && !obey_client && is_write_command) { @@ -4564,8 +4598,12 @@ int processCommand(client *c) { queueMultiCommand(c, cmd_flags); addReply(c, shared.queued); } else { + if (preCommandExec(c) == CMD_FILTER_REJECT) { + return C_OK; + } int flags = CMD_CALL_FULL; call(c, flags); + postCommandExec(c); if (listLength(server.ready_keys) && !isInsideYieldingLongCommand()) handleClientsBlockedOnKeys(); } return C_OK; @@ -4852,6 +4890,9 @@ int finishShutdown(void) { /* Fire the shutdown modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_SHUTDOWN, 0, NULL); + /* Cleanup durability tracking resources. */ + durabilityCleanup(); + /* Remove the pid file if possible and needed. 
*/ if (server.daemonize || server.pidfile) { serverLog(LL_NOTICE, "Removing the pid file."); @@ -6693,6 +6734,13 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "eventloop_cmd_per_cycle_max:%lld\r\n", server.el_cmd_cnt_max)); } + /* Sync replication / durability stats */ + if (all_sections || (dictFind(section_dict, "durability") != NULL)) { + if (sections++) info = sdscat(info, "\r\n"); + info = sdscatprintf(info, "# Durability\r\n"); + info = genDurabilityInfoString(info); + } + return info; } diff --git a/src/server.h b/src/server.h index 579810808c8..57b2256e04b 100644 --- a/src/server.h +++ b/src/server.h @@ -36,7 +36,9 @@ #include "rio.h" #include "commands.h" #include "allocator_defrag.h" +#include "reply_blocking.h" +#include #include #include #include @@ -490,6 +492,7 @@ typedef enum { #define REPLICA_CAPA_PSYNC2 (1 << 1) /* Supports PSYNC2 protocol. */ #define REPLICA_CAPA_DUAL_CHANNEL (1 << 2) /* Supports dual channel replication sync */ #define REPLICA_CAPA_SKIP_RDB_CHECKSUM (1 << 3) /* Supports skipping RDB checksum for sync requests. */ +#define REPLICA_CAPA_SYNC (1 << 4) /* Replica participates in sync replication (ISR member). */ /* Replica capability strings */ #define REPLICA_CAPA_SKIP_RDB_CHECKSUM_STR "skip-rdb-checksum" /* Supports skipping RDB checksum for sync requests. */ @@ -505,6 +508,10 @@ typedef enum { /* Synchronous read timeout - replica side */ #define CONFIG_REPL_SYNCIO_TIMEOUT 5 +/* ISR (in-sync replica) timeout in seconds. If a sync replica has not + * sent a REPLCONF ACK within this period, it is removed from the ISR. */ +#define REPLICA_ISR_TIMEOUT 10 + /* The default number of replication backlog blocks to trim per call. */ #define REPL_BACKLOG_TRIM_BLOCKS_PER_CALL 64 @@ -946,6 +953,12 @@ typedef struct serverDb { long long avg_ttl; /* Average TTL, just for stats */ unsigned long cursor; /* Cursor of the active expire cycle. 
*/ } expiry[ACTIVE_EXPIRY_TYPE_COUNT]; + + /* fields related to dirty key tracking + * for consistent writes with durability */ + hashtable *uncommitted_keys; /* Map of dirty keys to the offset required by replica acknowledgement */ + long long dirty_repl_offset; /* Replication offset for a dirty DB */ + rax *reply_duration; /* Radix tree tracking reply durations for durable blocked clients */ } serverDb; /* forward declaration for functions ctx */ @@ -1235,6 +1248,8 @@ typedef struct ClientFlags { or client::buf. */ uint64_t keyspace_notified : 1; /* Indicates that a keyspace notification was triggered during the execution of the current command. */ + uint64_t durable_blocked_client : 1; /* This is a durable blocked client that is waiting for the server to + * acknowledge the write of the command that caused it to be blocked. */ } ClientFlags; typedef struct ClientPubSubData { @@ -1274,6 +1289,10 @@ typedef struct ClientReplicationData { int replica_version; /* Version on the form 0xMMmmpp. */ short replica_capa; /* Replica capabilities: REPLICA_CAPA_* bitwise OR. */ short replica_req; /* Replica requirements: REPLICA_REQ_* */ + int is_in_sync; /* Runtime flag: 1 if this replica is in the ISR + (in-sync replica group). Set by primary when + repl_ack_off >= committed_offset for a + REPLICA_CAPA_SYNC replica. Cleared on timeout. */ uint64_t associated_rdb_client_id; /* The client id of this replica's rdb connection */ time_t rdb_client_disconnect_time; /* Time of the first freeClient call on this client. Used for delaying free. 
*/ listNode *ref_repl_buf_node; /* Referenced node of replication buffer blocks, @@ -1434,6 +1453,7 @@ typedef struct client { #ifdef LOG_REQ_RES clientReqResInfo reqres; #endif + struct clientDurabilityInfo clientDurabilityInfo; } client; /* Forward declaration */ @@ -1458,6 +1478,7 @@ static inline int getClientType(client *c) { return CLIENT_TYPE_NORMAL; } + /* When a command generates a lot of discrete elements to the client output buffer, it is much faster to * skip certain types of initialization. This type is used to indicate a client that has been initialized * and can be used with addWritePreparedReply* functions. A client can be cast into this type with @@ -1498,11 +1519,11 @@ struct sharedObjectsStruct { *loadingerr, *slowevalerr, *slowscripterr, *slowmoduleerr, *bgsaveerr, *primarydownerr, *roreplicaerr, *loadingerr_variants[2], *slowevalerr_variants[2], *slowscripterr_variants[2], *slowmoduleerr_variants[2], *bgsaveerr_variants[2], - *execaborterr, *noautherr, *noreplicaserr, *busykeyerr, *oomerr, *plus, *messagebulk, *pmessagebulk, + *execaborterr, *noautherr, *noreplicaserr, *nosyncreplicas, *busykeyerr, *oomerr, *plus, *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *psubscribebulk, *punsubscribebulk, *del, *unlink, *rpop, *lpop, *lpush, *zadd, *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *hsetex, *hdel, *hpexpireat, *hpersist, *srem, *xgroup, *xclaim, *script, *replconf, *eval, *cluster, *syncslots, *persist, *set, *pexpireat, *pexpire, *time, *pxat, *absttl, - *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *getack, + *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *commit, *getack, *special_asterisk, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, *fields, *finish, *state, *success, *failed, *name, *message, *smessagebulk, 
*select[PROTO_SHARED_SELECT_CMDS], *integers[OBJ_SHARED_INTEGERS], @@ -1772,6 +1793,7 @@ typedef enum childInfoType { } childInfoType; struct valkeyServer { + durable_t durability; /* General */ pid_t pid; /* Main process pid. */ pthread_t main_thread_id; /* Main thread id */ @@ -2062,6 +2084,9 @@ struct valkeyServer { int aof_load_truncated; /* Don't stop on unexpected AOF EOF. */ int aof_use_rdb_preamble; /* Specify base AOF to use RDB encoding on AOF rewrites. */ int aof_rewrite_use_rdb_preamble; /* Base AOF to use RDB encoding on AOF rewrites start. */ + _Atomic(int) aof_io_flush_state; /* AOF always-fsync IO-thread flush state. */ + _Atomic(int) aof_io_flush_errno; /* Errno of AOF always-fsync IO-thread flush. */ + _Atomic(off_t) aof_io_flush_size; /* Bytes written by the last IO-thread flush. */ _Atomic(int) aof_bio_fsync_status; /* Status of AOF fsync in bio job. */ _Atomic(int) aof_bio_fsync_errno; /* Errno of AOF fsync in bio job. */ aofManifest *aof_manifest; /* Used to track AOFs. */ @@ -2149,6 +2174,10 @@ struct valkeyServer { int repl_min_replicas_to_write; /* Min number of replicas to write. */ int repl_min_replicas_max_lag; /* Max lag of replicas to write. */ int repl_good_replicas_count; /* Number of replicas with lag <= max_lag. */ + int min_sync_replicas; /* Min number of sync replicas required to accept writes. + When > 0, sync replication is enabled. */ + int sync_eligible; /* If true, this replica is eligible to join the ISR. + Only meaningful when min_sync_replicas > 0. */ int repl_diskless_sync; /* Primary send RDB to replicas sockets directly. */ int repl_diskless_load; /* Replica parse RDB directly from the socket. 
* see REPL_DISKLESS_LOAD_* enum */ @@ -3000,7 +3029,6 @@ size_t getClientOutputBufferMemoryUsage(client *c); size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage); int freeClientsInAsyncFreeQueue(void); int closeClientOnOutputBufferLimitReached(client *c, int async); -int getClientType(client *c); int getClientTypeByName(char *name); char *getClientTypeName(int client_class); void flushReplicasOutputBuffers(void); @@ -3048,6 +3076,8 @@ int processIOThreadsWriteDone(void); void releaseReplyReferences(client *c); void resetLastWrittenBuf(client *c); +int getIntFromObject(robj *o, int *target); + int parseExtendedCommandArgumentsOrReply(client *c, int command_type, int start_idx, int max_args, int *flags, int *unit, int *expire_idx, robj **expire, robj **compare_val); /* logreqres.c - logging of requests and responses */ @@ -3215,6 +3245,8 @@ void replicationSetPrimary(char *ip, int port, int full_sync_required, bool disc void replicationUnsetPrimary(void); void refreshGoodReplicasCount(void); int checkGoodReplicasStatus(void); +int checkSyncReplicasStatus(void); +int getSyncReplicaCount(void); void processClientsWaitingReplicas(void); void unblockClientWaitingReplicas(client *c); int replicationCountAcksByOffset(long long offset); @@ -3294,6 +3326,7 @@ void aofManifestFree(aofManifest *am); int aofDelHistoryFiles(void); int aofRewriteLimited(void); int rewriteSlotToAppendOnlyFileRio(rio *aof, int db_num, int hashslot, size_t *key_count); +int aofIOFlushInProgress(void); /* Child info */ void openChildInfoPipe(void); @@ -3804,6 +3837,7 @@ int getKeysFromCommandWithSpecs(struct serverCommand *cmd, getKeysResult *result); keyReference *getKeysPrepareResult(getKeysResult *result, int numkeys); int getKeysFromCommand(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); +int getKeysUsingKeySpecs(struct serverCommand *cmd, robj **argv, int argc, int search_flags, getKeysResult *result); int doesCommandHaveKeys(struct serverCommand 
*cmd); int getChannelsFromCommand(struct serverCommand *cmd, robj **argv, int argc, getKeysResult *result); int doesCommandHaveChannelsWithFlags(struct serverCommand *cmd, int flags); diff --git a/src/uncommitted_keys.c b/src/uncommitted_keys.c new file mode 100644 index 00000000000..8aebee96e9b --- /dev/null +++ b/src/uncommitted_keys.c @@ -0,0 +1,417 @@ +#include "server.h" +#include "zmalloc.h" +#include "script.h" +#include +#include + +/*================================= Internal Data Structures ================= */ +typedef struct uncommittedKeyEntry { + sds key; + long long offset; +} uncommittedKeyEntry; + +/** + * Pending key reference used during multi-command blocks (MULTI/EXEC, Lua). + * We mark keys dirty immediately but don't yet know the final replication + * offset, so we keep a reference to update the offset after the transaction + * completes. + */ +typedef struct pendingUncommittedKey { + robj *key; + hashtable *uncommitted_keys; +} pendingUncommittedKey; + +/* Pending keys buffered during MULTI/EXEC or Lua scripts. These are keys + * that have already been marked dirty in uncommitted_keys (with LLONG_MAX + * as a placeholder offset) but whose real offset is not yet known. */ +static list *pending_uncommitted_keys; + +/* Pending databases dirtied during a multi-command block. 
*/ +static list *pending_uncommitted_dbs; + + +/*================================= Internal Prototypes ====================== */ + +static void addUncommittedKey(sds key, long long offset, hashtable *uncommittedKeys); +static void pendingUncommittedKeyDestructor(void *entry); +static uint64_t uncommittedKeysHash(const void *key); +static int uncommittedKeysKeyCompare(const void *key1, const void *key2); +static const void *uncommittedKeyEntryGetKey(const void *entry); +static void uncommittedKeyEntryDestructor(void *entry); +static void handleDirtyDatabase(client *c, serverDb *db); +static bool swapdbGetParams(robj **argv, int argc, int *id1_p, int *id2_p); +static bool getDbIdFromRobj(robj *obj, int *db_id); + +/*================================= Hashtable Type =========================== */ + +static hashtableType uncommittedKeysHashtableType = { + .entryGetKey = uncommittedKeyEntryGetKey, + .hashFunction = uncommittedKeysHash, + .keyCompare = uncommittedKeysKeyCompare, + .entryDestructor = uncommittedKeyEntryDestructor, +}; + +/*================================= Utility Functions ======================== */ + +static void pendingUncommittedKeyDestructor(void *entry) { + if (entry == NULL) return; + pendingUncommittedKey *uk = entry; + if (uk->key != NULL) decrRefCount(uk->key); + zfree(uk); +} + +static uint64_t uncommittedKeysHash(const void *key) { + const sds keystr = (const sds)key; + return hashtableGenHashFunction(keystr, sdslen(keystr)); +} + +static int uncommittedKeysKeyCompare(const void *key1, const void *key2) { + const sds s1 = (const sds)key1; + const sds s2 = (const sds)key2; + return sdslen(s1) != sdslen(s2) || memcmp(s1, s2, sdslen(s1)); +} + +static const void *uncommittedKeyEntryGetKey(const void *entry) { + return ((const uncommittedKeyEntry *)entry)->key; +} + +static void uncommittedKeyEntryDestructor(void *entry) { + if (entry == NULL) return; + uncommittedKeyEntry *uke = entry; + sdsfree(uke->key); + zfree(uke); +} + +unsigned long long 
getNumberOfUncommittedKeys(void) { + unsigned long long num_uncommitted_keys = 0; + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + num_uncommitted_keys += hashtableSize(server.db[i]->uncommitted_keys); + } + } + return num_uncommitted_keys; +} + +/*================================= Key Tracking ============================= */ + +/** + * Mark a key as uncommitted at a particular replication offset. + * If the key already exists in the hashtable, update its offset. + */ +static void addUncommittedKey(const sds key, const long long offset, hashtable *uncommittedKeys) { + uncommittedKeyEntry *entry = zmalloc(sizeof(*entry)); + entry->key = sdsdup(key); + entry->offset = offset; + + void *existing = NULL; + if (hashtableAddOrFind(uncommittedKeys, entry, &existing)) { + return; /* newly added */ + } + + /* Key already tracked — update to the latest offset */ + uncommittedKeyEntry *existing_entry = existing; + existing_entry->offset = offset; + sdsfree(entry->key); + zfree(entry); +} + +/** + * Retrieve the uncommitted replication offset for a given key, purge the given + * key from uncommitted keys set if the replication offset has been committed. + */ +long long durabilityPurgeAndGetUncommittedKeyOffset(const sds key, serverDb *db) { + // serverAssert(iAmPrimary()); + uncommittedKeyEntry *entry = NULL; + if (!hashtableFind(db->uncommitted_keys, key, (void **)&entry)) { + return -1; + } + + long long key_offset = entry->offset; + + if (key_offset <= server.durability.previous_acked_offset) { + hashtableDelete(db->uncommitted_keys, key); + return -1; + } + + return key_offset; +} + +/** + * Handle a dirty key for a given client. + * + * Keys are marked dirty immediately in db->uncommitted_keys. For single + * commands outside a transaction the real replication offset is known. 
+ * + * Inside a MULTI/EXEC or Lua script we use LLONG_MAX as a placeholder + * offset (so reads are blocked immediately) and buffer a reference in + * pending_uncommitted_keys. processPendingUncommittedData() will later + * update the offset once the transaction completes. + * + * Cleanup happens in drainCommittedKeys() which iterates the hashtable + * and removes entries whose offset has been committed. + */ +void handleUncommittedKeyForClient(const client *c, robj *key, serverDb *db) { + sds keystr = objectGetVal(key); + + if ((c != NULL) && ((c->flag.multi) || scriptIsRunning())) { + if (server.durability.all_dbs_dirty_in_current_cmd) return; + + /* Mark dirty immediately with placeholder offset */ + addUncommittedKey(keystr, LLONG_MAX, db->uncommitted_keys); + + /* Buffer a reference so we can update offset later */ + if (pending_uncommitted_keys == NULL) { + pending_uncommitted_keys = listCreate(); + listSetFreeMethod(pending_uncommitted_keys, pendingUncommittedKeyDestructor); + } + pendingUncommittedKey *dirty_key = zmalloc(sizeof(pendingUncommittedKey)); + incrRefCount(key); + dirty_key->key = key; + dirty_key->uncommitted_keys = db->uncommitted_keys; + listAddNodeTail(pending_uncommitted_keys, dirty_key); + } else { + + /* [WBL] - On a replica, primary_repl_offset doesn't advance for + * commands from the replication stream. Use the primary + * client's applied replication offset instead. 
*/ + long long offset = server.primary_repl_offset; + if (server.primary != NULL && server.current_client != NULL && + server.current_client->flag.primary) { + offset = server.primary->repl_data->reploff; + } + addUncommittedKey(keystr, offset, db->uncommitted_keys); + } +} + +/*================================= Database Modification ==================== */ + +static void handleDirtyDatabase(client *c, serverDb *db) { + if ((c->flag.multi) || scriptIsRunning()) { + if (server.durability.all_dbs_dirty_in_current_cmd) return; + if (db != NULL) { + listAddNodeTail(pending_uncommitted_dbs, db); + } else { + server.durability.all_dbs_dirty_in_current_cmd = true; + listEmpty(pending_uncommitted_keys); + listEmpty(pending_uncommitted_dbs); + /* FLUSHALL inside a transaction: any keys previously dirtied + * in this transaction are now gone. Clear the per-DB + * uncommitted_keys hashtables so stale LLONG_MAX-offset + * entries don't block future reads after the EXEC commits. */ + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + hashtableEmpty(server.db[i]->uncommitted_keys, NULL); + } + } + } + } else { + if (db != NULL) { + db->dirty_repl_offset = server.primary_repl_offset; + } else { + for (int i = 0; i < server.dbnum; i++) { + if (server.db[i] != NULL) { + server.db[i]->dirty_repl_offset = server.primary_repl_offset; + } + } + } + } +} + +void handleDatabaseModification(client *c) { + if (c->cmd->proc == swapdbCommand && server.cluster_enabled == 0) { + int id1, id2; + if (swapdbGetParams(c->argv, c->argc, &id1, &id2)) { + handleDirtyDatabase(c, server.db[id1]); + handleDirtyDatabase(c, server.db[id2]); + } + } else if (c->cmd->proc == flushdbCommand) { + handleDirtyDatabase(c, c->db); + } else if (c->cmd->proc == flushallCommand) { + handleDirtyDatabase(c, NULL); + } +} + +/*================================= Command Parameter Helpers ================ */ + +static bool swapdbGetParams(robj **argv, int argc, int *id1_p, int *id2_p) { + long long 
dbid1, dbid2; + if (argc != 3) return false; + if (server.cluster_enabled) return false; + if (getLongLongFromObject(argv[1], &dbid1) != C_OK) return false; + if (getLongLongFromObject(argv[2], &dbid2) != C_OK) return false; + if (dbid1 < 0 || dbid1 >= server.dbnum) return false; + if (dbid2 < 0 || dbid2 >= server.dbnum) return false; + if (dbid1 == dbid2) return false; + + *id1_p = (int)dbid1; + *id2_p = (int)dbid2; + return true; +} + +static bool getDbIdFromRobj(robj *obj, int *db_id) { + if ((getIntFromObject(obj, db_id) != C_OK) || (*db_id < 0) || (*db_id >= server.dbnum)) { + return false; + } + return true; +} + +bool getTargetDbIdForCopyCommand(int argc, robj **argv, int selected_dbid, int *target_dbid) { + const int copy_command_optional_arg_start_index = 3; + + *target_dbid = selected_dbid; + + for (int j = copy_command_optional_arg_start_index; j < argc; j++) { + if (!strcasecmp(objectGetVal(argv[j]), "replace")) { + continue; + } else if (!strcasecmp(objectGetVal(argv[j]), "db") && (argc > j + 1)) { + if (!getDbIdFromRobj(argv[j + 1], target_dbid)) { + return false; + } + j++; + } else { + return false; + } + } + return true; +} + +/*================================= Drain / Cleanup ========================== */ + +/** + * Remove committed entries from the per-DB uncommitted_keys hashtables. + * + * Iterates each database's uncommitted_keys hashtable with a safe iterator + * and deletes entries whose offset has been durably committed. + * + * With appendfsync=always the uncommitted set stays small (bounded by keys + * written between fsyncs), so the full-scan cost is smaller than the fsync. 
+ */ +void drainCommittedKeys(long long committed_offset) { + for (int i = 0; i < server.dbnum; i++) { + serverDb *db = server.db[i]; + if (db == NULL) continue; + + if (hashtableSize(db->uncommitted_keys) > 0) { + hashtableIterator iter; + hashtableInitIterator(&iter, db->uncommitted_keys, HASHTABLE_ITER_SAFE); + void *entry; + while (hashtableNext(&iter, &entry)) { + uncommittedKeyEntry *uke = entry; + if (uke->offset <= committed_offset) { + hashtableDelete(db->uncommitted_keys, uke->key); + } + } + hashtableCleanupIterator(&iter); + } + + if (db->dirty_repl_offset <= committed_offset) { + db->dirty_repl_offset = -1; + } + } +} + +/** + * Initialize sync replication related fields for a database. + */ +void durabilityInitDatabase(serverDb *db) { + db->uncommitted_keys = hashtableCreate(&uncommittedKeysHashtableType); + db->dirty_repl_offset = -1; +} + +/** + * Clear all uncommitted keys for each database. + */ +void clearAllUncommittedKeys(void) { + serverLog(LL_NOTICE, "Clearing all uncommitted keys for sync replication"); + for (int i = 0; i < server.dbnum; i++) { + serverDb *db = server.db[i]; + if (db == NULL) continue; + hashtableRelease(db->uncommitted_keys); + durabilityInitDatabase(db); + } +} + +/*================================= Access Validation ======================== */ + +/** + * Determine if there are uncommitted keys in the server. 
+ */
+int hasUncommittedKeys(void) {
+    int dbidx = 0;
+    while (dbidx < server.dbnum) {
+        serverDb *cur = server.db[dbidx++];
+        if (cur == NULL) continue;
+        if (hashtableSize(cur->uncommitted_keys) != 0) return 1;
+    }
+    return 0;
+}
+
+/*================================= Pending Data Processing ================== */
+
+/* Create the two pending lists and reset the "all databases dirty" flag. */
+void uncommittedKeysInitPending(void) {
+    server.durability.all_dbs_dirty_in_current_cmd = false;
+    pending_uncommitted_keys = listCreate();
+    listSetFreeMethod(pending_uncommitted_keys, pendingUncommittedKeyDestructor);
+    pending_uncommitted_dbs = listCreate();
+}
+
+/* Release whichever pending lists exist and leave both pointers NULL, so a
+ * repeated call finds nothing left to release. */
+void uncommittedKeysCleanupPending(void) {
+    if (pending_uncommitted_keys) listRelease(pending_uncommitted_keys);
+    pending_uncommitted_keys = NULL;
+
+    if (pending_uncommitted_dbs) listRelease(pending_uncommitted_dbs);
+    pending_uncommitted_dbs = NULL;
+}
+
+/**
+ * After a transaction completes, update the placeholder offsets on keys
+ * that were dirtied during the transaction to the real replication offset.
+ * Cleanup will happen when drainCommittedKeys() iterates the hashtable.
+ */
+void processPendingUncommittedData(long long blocking_repl_offset) {
+    listIter li;
+    listNode *node;
+
+    /* Keys: stamp each tracked entry with the real offset, then drop the
+     * pending record. */
+    listRewind(pending_uncommitted_keys, &li);
+    while ((node = listNext(&li)) != NULL) {
+        const pendingUncommittedKey *uk = listNodeValue(node);
+        sds keystr = objectGetVal(uk->key);
+        uncommittedKeyEntry *entry = NULL;
+
+        /* Overwrite the LLONG_MAX placeholder or an older offset, never a
+         * newer one. */
+        if (hashtableFind(uk->uncommitted_keys, keystr, (void **)&entry) &&
+            (entry->offset == LLONG_MAX || entry->offset < blocking_repl_offset)) {
+            entry->offset = blocking_repl_offset;
+        }
+        listDelNode(pending_uncommitted_keys, node);
+    }
+
+    /* Databases: the "all dirty" flag covers every allocated database. */
+    if (server.durability.all_dbs_dirty_in_current_cmd) {
+        for (int i = 0; i < server.dbnum; i++) {
+            if (server.db[i] != NULL) {
+                server.db[i]->dirty_repl_offset = blocking_repl_offset;
+            }
+        }
+        server.durability.all_dbs_dirty_in_current_cmd = false;
+    }
+
+    /* Drain the per-database list unconditionally. If it was populated in the
+     * same unit of work that also set the "all dirty" flag, skipping it here
+     * would leave stale nodes behind and trip the assertion below. Assigning
+     * the offset again to an already-updated database is harmless. */
+    listRewind(pending_uncommitted_dbs, &li);
+    while ((node = listNext(&li)) != NULL) {
+        serverDb *db = listNodeValue(node);
+        db->dirty_repl_offset = blocking_repl_offset;
+        listDelNode(pending_uncommitted_dbs, node);
+    }
+
+    serverAssert(listLength(pending_uncommitted_keys) == 0);
+    serverAssert(listLength(pending_uncommitted_dbs) == 0);
+    serverAssert(server.durability.all_dbs_dirty_in_current_cmd == false);
+
+    updateFuncStoreBlockingOffsetForWrite(blocking_repl_offset);
+}
diff --git a/src/uncommitted_keys.h b/src/uncommitted_keys.h
new file mode 100644
index 00000000000..c43f6bbba13
--- /dev/null
+++ b/src/uncommitted_keys.h
@@ -0,0 +1,89 @@
+#ifndef UNCOMMITTED_KEYS_H
+#define UNCOMMITTED_KEYS_H
+
+#include <stdbool.h>
+#include "sds.h"
+
+struct client;
+struct serverObject;
+struct serverDb;
+struct serverCommand;
+
+/* Note: robj is typedef'd in server.h as 
`typedef struct serverObject robj;` + * We use struct serverObject * in declarations here to avoid duplicate typedefs. */ + +/*================================= Uncommitted Key Tracking ================= */ + +/** + * Initialize durability-related fields for a database. + */ +void durabilityInitDatabase(struct serverDb *db); + +/** + * Handle a dirty key for a given client. + * Marks the key as dirty immediately in db->uncommitted_keys. + * @param c The calling client. NULL if the key becomes dirty outside a client command (e.g. expiry/eviction) + * @param key The key object + * @param db The database + */ +void handleUncommittedKeyForClient(const struct client *c, struct serverObject *key, struct serverDb *db); + +/** + * Retrieve the uncommitted replication offset for a given key, purge the given + * key from uncommitted keys set if the replication offset has been committed. + * @return the ACK offset of the key if key is uncommitted, returns -1 otherwise. + */ +long long durabilityPurgeAndGetUncommittedKeyOffset(sds key, struct serverDb *db); + +/** + * Scan each database's uncommitted_keys hashtable and delete the entries + * whose offset is <= committed_offset; a database's dirty_repl_offset is reset + * to -1 by the same rule. Called from beforeSleep / notifyDurabilityProgress. + */ +void drainCommittedKeys(long long committed_offset); + +/** + * Clear all uncommitted keys for each database. + */ +void clearAllUncommittedKeys(void); + +/** + * Get the number of uncommitted keys across all databases. + */ +unsigned long long getNumberOfUncommittedKeys(void); + +/*================================= Database Modification Tracking =========== */ + +/** + * Handle database-level modification commands (FLUSHDB, FLUSHALL, SWAPDB). + */ +void handleDatabaseModification(struct client *c); + +/** + * Commit pending uncommitted data (keys, databases, function store) + * after a transaction completes. 
Sets the real replication offset on + * keys that were dirtied during the transaction; drainCommittedKeys() later + * removes them once that offset has been committed. + */ +void processPendingUncommittedData(long long blocking_repl_offset); + +/** + * Initialize the pending uncommitted data structures. + */ +void uncommittedKeysInitPending(void); + +/** + * Clean up the pending uncommitted data structures. + */ +void uncommittedKeysCleanupPending(void); + +/** + * Determine if there are uncommitted keys in the server. + */ +int hasUncommittedKeys(void); + +/*================================= Command parameter helpers ================ */ + +bool getTargetDbIdForCopyCommand(int argc, struct serverObject **argv, int selected_dbid, int *target_dbid); + +#endif /* UNCOMMITTED_KEYS_H */ diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt index fcc75412970..2ef6d12ff46 100644 --- a/src/unit/CMakeLists.txt +++ b/src/unit/CMakeLists.txt @@ -85,8 +85,11 @@ target_compile_options(valkey-unit-gtests PRIVATE -Og -g) target_compile_options(valkey-unit-gtests PRIVATE -Wno-deprecated-declarations -Wno-write-strings - -fno-var-tracking-assignments ) +# -fno-var-tracking-assignments is GCC-only, skip on clang +if (NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(valkey-unit-gtests PRIVATE -fno-var-tracking-assignments) +endif() # Include directories for C++ compilation target_include_directories(valkey-unit-gtests PRIVATE diff --git a/src/unit/test_files.h b/src/unit/test_files.h new file mode 100644 index 00000000000..31f936805fa --- /dev/null +++ b/src/unit/test_files.h @@ -0,0 +1,343 @@ +typedef int unitTestProc(int argc, char **argv, int flags); + +typedef struct unitTest { + char *name; + unitTestProc *proc; +} unitTest; + +int test_popcount(int argc, char **argv, int flags); +int test_crc64(int argc, char **argv, int flags); +int test_crc64combine(int argc, char **argv, int flags); +int test_dictCreate(int argc, char **argv, int flags); +int test_dictAdd16Keys(int argc, char **argv, 
int flags); +int test_dictDisableResize(int argc, char **argv, int flags); +int test_dictAddOneKeyTriggerResize(int argc, char **argv, int flags); +int test_dictDeleteKeys(int argc, char **argv, int flags); +int test_dictDeleteOneKeyTriggerResize(int argc, char **argv, int flags); +int test_dictEmptyDirAdd128Keys(int argc, char **argv, int flags); +int test_dictDisableResizeReduceTo3(int argc, char **argv, int flags); +int test_dictDeleteOneKeyTriggerResizeAgain(int argc, char **argv, int flags); +int test_dictBenchmark(int argc, char **argv, int flags); +int test_endianconv(int argc, char *argv[], int flags); +int test_entryCreate(int argc, char **argv, int flags); +int test_entryUpdate(int argc, char **argv, int flags); +int test_entryHasexpiry_entrySetExpiry(int argc, char **argv, int flags); +int test_entryIsExpired(int argc, char **argv, int flags); +int test_entryMemUsage_entrySetExpiry_entryUpdate(int argc, char **argv, int flags); +int test_entryStringRef(int argc, char **argv, int flags); +int test_fifoEmptyPop(int argc, char *argv[], int flags); +int test_fifoEmptyPeek(int argc, char *argv[], int flags); +int test_fifoSimplePushPop(int argc, char *argv[], int flags); +int test_fifoTryVariousSizes(int argc, char *argv[], int flags); +int test_fifoPushPopTest(int argc, char *argv[], int flags); +int test_fifoJoinTest(int argc, char *argv[], int flags); +int test_fifoComparePerformance(int argc, char *argv[], int flags); +int test_cursor(int argc, char **argv, int flags); +int test_set_hash_function_seed(int argc, char **argv, int flags); +int test_add_find_delete(int argc, char **argv, int flags); +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags); +int test_instant_rehashing(int argc, char **argv, int flags); +int test_bucket_chain_length(int argc, char **argv, int flags); +int test_two_phase_insert_and_pop(int argc, char **argv, int flags); +int test_replace_reallocated_entry(int argc, char **argv, int flags); +int 
test_incremental_find(int argc, char **argv, int flags); +int test_scan(int argc, char **argv, int flags); +int test_iterator(int argc, char **argv, int flags); +int test_safe_iterator(int argc, char **argv, int flags); +int test_compact_bucket_chain(int argc, char **argv, int flags); +int test_random_entry(int argc, char **argv, int flags); +int test_random_entry_with_long_chain(int argc, char **argv, int flags); +int test_random_entry_sparse_table(int argc, char **argv, int flags); +int test_safe_iterator_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_empty_no_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_reset_invalidation(int argc, char **argv, int flags); +int test_safe_iterator_reset_untracking(int argc, char **argv, int flags); +int test_safe_iterator_pause_resume_tracking(int argc, char **argv, int flags); +int test_null_hashtable_iterator(int argc, char **argv, int flags); +int test_hashtable_retarget_iterator(int argc, char **argv, int flags); +int test_intsetValueEncodings(int argc, char **argv, int flags); +int test_intsetBasicAdding(int argc, char **argv, int flags); +int test_intsetLargeNumberRandomAdd(int argc, char **argv, int flags); +int test_intsetUpgradeFromint16Toint32(int argc, char **argv, int flags); +int test_intsetUpgradeFromint16Toint64(int argc, char **argv, int flags); +int test_intsetUpgradeFromint32Toint64(int argc, char **argv, int flags); +int test_intsetStressLookups(int argc, char **argv, int flags); +int test_intsetStressAddDelete(int argc, char **argv, int flags); +int test_kvstoreAdd16Keys(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int 
test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableExpand(int argc, char **argv, int flags); +int test_listpackCreateIntList(int argc, char **argv, int flags); +int test_listpackCreateList(int argc, char **argv, int flags); +int test_listpackLpPrepend(int argc, char **argv, int flags); +int test_listpackLpPrependInteger(int argc, char **argv, int flags); +int test_listpackGetELementAtIndex(int argc, char **argv, int flags); +int test_listpackPop(int argc, char **argv, int flags); +int test_listpackGetELementAtIndex2(int argc, char **argv, int flags); +int test_listpackIterate0toEnd(int argc, char **argv, int flags); +int test_listpackIterate1toEnd(int argc, char **argv, int flags); +int test_listpackIterate2toEnd(int argc, char **argv, int flags); +int test_listpackIterateBackToFront(int argc, char **argv, int flags); +int test_listpackIterateBackToFrontWithDelete(int argc, char **argv, int flags); +int test_listpackDeleteWhenNumIsMinusOne(int argc, char **argv, int flags); +int test_listpackDeleteWithNegativeIndex(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange0_0(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange0_1(int argc, char **argv, int flags); +int test_listpackDeleteInclusiveRange1_2(int argc, char **argv, int flags); +int test_listpackDeleteWitStartIndexOutOfRange(int argc, char **argv, int flags); +int test_listpackDeleteWitNumOverflow(int argc, char **argv, int flags); +int test_listpackBatchDelete(int argc, char **argv, int flags); +int test_listpackDeleteFooWhileIterating(int argc, char **argv, int flags); +int test_listpackReplaceWithSameSize(int argc, char **argv, int flags); +int test_listpackReplaceWithDifferentSize(int argc, char **argv, int flags); +int test_listpackRegressionGt255Bytes(int argc, char **argv, int flags); +int test_listpackCreateLongListAndCheckIndices(int argc, char **argv, int flags); +int 
test_listpackCompareStrsWithLpEntries(int argc, char **argv, int flags); +int test_listpackLpMergeEmptyLps(int argc, char **argv, int flags); +int test_listpackLpMergeLp1Larger(int argc, char **argv, int flags); +int test_listpackLpMergeLp2Larger(int argc, char **argv, int flags); +int test_listpackLpNextRandom(int argc, char **argv, int flags); +int test_listpackLpNextRandomCC(int argc, char **argv, int flags); +int test_listpackRandomPairWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairWithManyElements(int argc, char **argv, int flags); +int test_listpackRandomPairsWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairsWithManyElements(int argc, char **argv, int flags); +int test_listpackRandomPairsUniqueWithOneElement(int argc, char **argv, int flags); +int test_listpackRandomPairsUniqueWithManyElements(int argc, char **argv, int flags); +int test_listpackPushVariousEncodings(int argc, char **argv, int flags); +int test_listpackLpFind(int argc, char **argv, int flags); +int test_listpackLpValidateIntegrity(int argc, char **argv, int flags); +int test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN(int argc, char **argv, int flags); +int test_listpackStressWithRandom(int argc, char **argv, int flags); +int test_listpackSTressWithVariableSize(int argc, char **argv, int flags); +int test_listpackBenchmarkInit(int argc, char *argv[], int flags); +int test_listpackBenchmarkLpAppend(int argc, char **argv, int flags); +int test_listpackBenchmarkLpFindString(int argc, char **argv, int flags); +int test_listpackBenchmarkLpFindNumber(int argc, char **argv, int flags); +int test_listpackBenchmarkLpSeek(int argc, char **argv, int flags); +int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); +int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); +int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); +int test_listpackBenchmarkFree(int 
argc, char **argv, int flags); +int test_mutexQueueSimplePushPop(int argc, char *argv[], int flags); +int test_mutexQueueDoublePushPop(int argc, char *argv[], int flags); +int test_mutexQueuePriorityOrdering(int argc, char *argv[], int flags); +int test_mutexQueueFifoPopAll(int argc, char *argv[], int flags); +int test_mutexQueueFifoAddMultiple(int argc, char *argv[], int flags); +int test_mutexQueueSimpleThread(int argc, char *argv[], int flags); +int test_mutexQueueParallelWriters(int argc, char *argv[], int flags); +int test_mutexQueueParallelReaders(int argc, char *argv[], int flags); +int test_mutexQueueParallelReadWrite(int argc, char *argv[], int flags); +int test_writeToReplica(int argc, char **argv, int flags); +int test_postWriteToReplica(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToBuffer(int argc, char **argv, int flags); +int test_addRepliesWithOffloadsToList(int argc, char **argv, int flags); +int test_addBufferToReplyIOV(int argc, char **argv, int flags); +int test_object_with_key(int argc, char **argv, int flags); +int test_embedded_string_with_key(int argc, char **argv, int flags); +int test_embedded_string_with_key_and_expire(int argc, char **argv, int flags); +int test_embedded_value(int argc, char **argv, int flags); +int test_unembed_value(int argc, char **argv, int flags); +int test_quicklistCreateList(int argc, char **argv, int flags); +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead500xAtCompress(int argc, char **argv, int 
flags); +int test_quicklistRotateEmpty(int argc, char **argv, int flags); +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags); +int test_quicklistNextPlainNode(int argc, char **argv, int flags); +int test_quicklistRotatePlainNode(int argc, char **argv, int flags); +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags); +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags); +int test_quicklistPopEmpty(int argc, char **argv, int flags); +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead500From500(int argc, char **argv, int flags); +int test_quicklistPopHead5000From500(int argc, char **argv, int flags); +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags); +int test_quicklistIterateReverseOver500List(int argc, char **argv, int flags); +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags); +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags); +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags); +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex12From500ListAtFill(int argc, char **argv, int flags); +int 
test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags); +int test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags); +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags); +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags); +int test_quicklistNumbersOnlyListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags); +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags); +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags); +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags); +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags); +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags); +int test_quicklistBookmarkLimit(int argc, char **argv, int flags); +int 
test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags); +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags); +int test_raxRandomWalk(int argc, char **argv, int flags); +int test_raxIteratorUnitTests(int argc, char **argv, int flags); +int test_raxTryInsertUnitTests(int argc, char **argv, int flags); +int test_raxRegressionTest1(int argc, char **argv, int flags); +int test_raxRegressionTest2(int argc, char **argv, int flags); +int test_raxRegressionTest3(int argc, char **argv, int flags); +int test_raxRegressionTest4(int argc, char **argv, int flags); +int test_raxRegressionTest5(int argc, char **argv, int flags); +int test_raxRegressionTest6(int argc, char **argv, int flags); +int test_raxBenchmark(int argc, char **argv, int flags); +int test_raxHugeKey(int argc, char **argv, int flags); +int test_raxFuzz(int argc, char **argv, int flags); +int test_raxRecompressHugeKey(int argc, char **argv, int flags); +int test_sds(int argc, char **argv, int flags); +int test_typesAndAllocSize(int argc, char **argv, int flags); +int test_sdsHeaderSizes(int argc, char **argv, int flags); +int test_sdssplitargs(int argc, char **argv, int flags); +int test_sdsnsplitargs(int argc, char **argv, int flags); +int test_sdsnsplitargsBenchmark(int argc, char **argv, int flags); +int test_sha1(int argc, char **argv, int flags); +int test_sha256_abc(int argc, char **argv, int flags); +int test_sha256_large(int argc, char **argv, int flags); +int test_sha256_million_a(int argc, char **argv, int flags); +int test_string2ll(int argc, char **argv, int flags); +int test_string2l(int argc, char **argv, int flags); +int test_ll2string(int argc, char **argv, int flags); +int test_ld2string(int argc, char **argv, int flags); +int test_fixedpoint_d2string(int argc, char **argv, int flags); +int test_version2num(int argc, char **argv, int flags); +int test_reclaimFilePageCache(int argc, char **argv, int 
flags); +int test_writePointerWithPadding(int argc, char **argv, int flags); +int test_valkey_strtod(int argc, char **argv, int flags); +int test_vector(int argc, char **argv, int flags); +int test_vset_add_and_iterate(int argc, char **argv, int flags); +int test_vset_large_batch_same_expiry(int argc, char **argv, int flags); +int test_vset_large_batch_update_entry_same_expiry(int argc, char **argv, int flags); +int test_vset_large_batch_update_entry_multiple_expiries(int argc, char **argv, int flags); +int test_vset_iterate_multiple_expiries(int argc, char **argv, int flags); +int test_vset_add_and_remove_all(int argc, char **argv, int flags); +int test_vset_remove_expire_shrink(int argc, char **argv, int flags); +int test_vset_defrag(int argc, char **argv, int flags); +int test_vset_fuzzer(int argc, char **argv, int flags); +int test_ziplistCreateIntList(int argc, char **argv, int flags); +int test_ziplistPop(int argc, char **argv, int flags); +int test_ziplistGetElementAtIndex3(int argc, char **argv, int flags); +int test_ziplistGetElementOutOfRange(int argc, char **argv, int flags); +int test_ziplistGetLastElement(int argc, char **argv, int flags); +int test_ziplistGetFirstElement(int argc, char **argv, int flags); +int test_ziplistGetElementOutOfRangeReverse(int argc, char **argv, int flags); +int test_ziplistIterateThroughFullList(int argc, char **argv, int flags); +int test_ziplistIterateThroughListFrom1ToEnd(int argc, char **argv, int flags); +int test_ziplistIterateThroughListFrom2ToEnd(int argc, char **argv, int flags); +int test_ziplistIterateThroughStartOutOfRange(int argc, char **argv, int flags); +int test_ziplistIterateBackToFront(int argc, char **argv, int flags); +int test_ziplistIterateBackToFrontDeletingAllItems(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange0To0(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange0To1(int argc, char **argv, int flags); +int test_ziplistDeleteInclusiveRange1To2(int 
argc, char **argv, int flags); +int test_ziplistDeleteWithStartIndexOutOfRange(int argc, char **argv, int flags); +int test_ziplistDeleteWithNumOverflow(int argc, char **argv, int flags); +int test_ziplistDeleteFooWhileIterating(int argc, char **argv, int flags); +int test_ziplistReplaceWithSameSize(int argc, char **argv, int flags); +int test_ziplistReplaceWithDifferentSize(int argc, char **argv, int flags); +int test_ziplistRegressionTestForOver255ByteStrings(int argc, char **argv, int flags); +int test_ziplistRegressionTestDeleteNextToLastEntries(int argc, char **argv, int flags); +int test_ziplistCreateLongListAndCheckIndices(int argc, char **argv, int flags); +int test_ziplistCompareStringWithZiplistEntries(int argc, char **argv, int flags); +int test_ziplistMergeTest(int argc, char **argv, int flags); +int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **argv, int flags); +int test_ziplistCascadeUpdateEdgeCases(int argc, char **argv, int flags); +int test_ziplistInsertEdgeCase(int argc, char **argv, int flags); +int test_ziplistStressWithVariableSize(int argc, char **argv, int flags); +int test_BenchmarkziplistFind(int argc, char **argv, int flags); +int test_BenchmarkziplistIndex(int argc, char **argv, int flags); +int test_BenchmarkziplistValidateIntegrity(int argc, char **argv, int flags); +int test_BenchmarkziplistCompareWithString(int argc, char **argv, int flags); +int test_BenchmarkziplistCompareWithNumber(int argc, char **argv, int flags); +int test_ziplistStress__ziplistCascadeUpdate(int argc, char **argv, int flags); +int test_zipmapIterateWithLargeKey(int argc, char *argv[], int flags); +int test_zipmapIterateThroughElements(int argc, char *argv[], int flags); +int test_zmallocAllocReallocCallocAndFree(int argc, char **argv, int flags); +int test_zmallocAllocZeroByteAndFree(int argc, char **argv, int flags); + +unitTest __test_bitops_c[] = {{"test_popcount", test_popcount}, {NULL, NULL}}; +unitTest __test_crc64_c[] = 
{{"test_crc64", test_crc64}, {NULL, NULL}}; +unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}}; +unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}}; +unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; +unitTest __test_entry_c[] = {{"test_entryCreate", test_entryCreate}, {"test_entryUpdate", test_entryUpdate}, {"test_entryHasexpiry_entrySetExpiry", test_entryHasexpiry_entrySetExpiry}, {"test_entryIsExpired", test_entryIsExpired}, {"test_entryMemUsage_entrySetExpiry_entryUpdate", test_entryMemUsage_entrySetExpiry_entryUpdate}, {"test_entryStringRef", test_entryStringRef}, {NULL, NULL}}; +unitTest __test_fifo_c[] = {{"test_fifoEmptyPop", test_fifoEmptyPop}, {"test_fifoEmptyPeek", test_fifoEmptyPeek}, {"test_fifoSimplePushPop", test_fifoSimplePushPop}, {"test_fifoTryVariousSizes", test_fifoTryVariousSizes}, {"test_fifoPushPopTest", test_fifoPushPopTest}, {"test_fifoJoinTest", test_fifoJoinTest}, {"test_fifoComparePerformance", test_fifoComparePerformance}, {NULL, NULL}}; +unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_function_seed", test_set_hash_function_seed}, {"test_add_find_delete", test_add_find_delete}, {"test_add_find_delete_avoid_resize", test_add_find_delete_avoid_resize}, {"test_instant_rehashing", test_instant_rehashing}, {"test_bucket_chain_length", test_bucket_chain_length}, 
{"test_two_phase_insert_and_pop", test_two_phase_insert_and_pop}, {"test_replace_reallocated_entry", test_replace_reallocated_entry}, {"test_incremental_find", test_incremental_find}, {"test_scan", test_scan}, {"test_iterator", test_iterator}, {"test_safe_iterator", test_safe_iterator}, {"test_compact_bucket_chain", test_compact_bucket_chain}, {"test_random_entry", test_random_entry}, {"test_random_entry_with_long_chain", test_random_entry_with_long_chain}, {"test_random_entry_sparse_table", test_random_entry_sparse_table}, {"test_safe_iterator_invalidation", test_safe_iterator_invalidation}, {"test_safe_iterator_empty_no_invalidation", test_safe_iterator_empty_no_invalidation}, {"test_safe_iterator_reset_invalidation", test_safe_iterator_reset_invalidation}, {"test_safe_iterator_reset_untracking", test_safe_iterator_reset_untracking}, {"test_safe_iterator_pause_resume_tracking", test_safe_iterator_pause_resume_tracking}, {"test_null_hashtable_iterator", test_null_hashtable_iterator}, {"test_hashtable_retarget_iterator", test_hashtable_retarget_iterator}, {NULL, NULL}}; +unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; +unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable}, 
{"test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable}, {"test_kvstoreHashtableExpand", test_kvstoreHashtableExpand}, {NULL, NULL}}; +unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, 
{"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, 
{"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_mutexqueue_c[] = {{"test_mutexQueueSimplePushPop", test_mutexQueueSimplePushPop}, {"test_mutexQueueDoublePushPop", test_mutexQueueDoublePushPop}, {"test_mutexQueuePriorityOrdering", test_mutexQueuePriorityOrdering}, {"test_mutexQueueFifoPopAll", test_mutexQueueFifoPopAll}, {"test_mutexQueueFifoAddMultiple", test_mutexQueueFifoAddMultiple}, {"test_mutexQueueSimpleThread", test_mutexQueueSimpleThread}, {"test_mutexQueueParallelWriters", test_mutexQueueParallelWriters}, {"test_mutexQueueParallelReaders", test_mutexQueueParallelReaders}, {"test_mutexQueueParallelReadWrite", test_mutexQueueParallelReadWrite}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_writeToReplica", test_writeToReplica}, {"test_postWriteToReplica", test_postWriteToReplica}, {"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {"test_addRepliesWithOffloadsToBuffer", test_addRepliesWithOffloadsToBuffer}, {"test_addRepliesWithOffloadsToList", test_addRepliesWithOffloadsToList}, {"test_addBufferToReplyIOV", test_addBufferToReplyIOV}, {NULL, NULL}}; +unitTest __test_object_c[] = {{"test_object_with_key", test_object_with_key}, {"test_embedded_string_with_key", test_embedded_string_with_key}, {"test_embedded_string_with_key_and_expire", test_embedded_string_with_key_and_expire}, {"test_embedded_value", test_embedded_value}, {"test_unembed_value", test_unembed_value}, {NULL, NULL}}; +unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, 
{"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, 
{"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", 
test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; +unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {"test_raxRecompressHugeKey", test_raxRecompressHugeKey}, {NULL, NULL}}; +unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {"test_sdsnsplitargs", test_sdsnsplitargs}, {"test_sdsnsplitargsBenchmark", test_sdsnsplitargsBenchmark}, {NULL, NULL}}; +unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; +unitTest __test_sha256_c[] = {{"test_sha256_abc", test_sha256_abc}, {"test_sha256_large", 
test_sha256_large}, {"test_sha256_million_a", test_sha256_million_a}, {NULL, NULL}}; +unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {"test_writePointerWithPadding", test_writePointerWithPadding}, {NULL, NULL}}; +unitTest __test_valkey_strtod_c[] = {{"test_valkey_strtod", test_valkey_strtod}, {NULL, NULL}}; +unitTest __test_vector_c[] = {{"test_vector", test_vector}, {NULL, NULL}}; +unitTest __test_vset_c[] = {{"test_vset_add_and_iterate", test_vset_add_and_iterate}, {"test_vset_large_batch_same_expiry", test_vset_large_batch_same_expiry}, {"test_vset_large_batch_update_entry_same_expiry", test_vset_large_batch_update_entry_same_expiry}, {"test_vset_large_batch_update_entry_multiple_expiries", test_vset_large_batch_update_entry_multiple_expiries}, {"test_vset_iterate_multiple_expiries", test_vset_iterate_multiple_expiries}, {"test_vset_add_and_remove_all", test_vset_add_and_remove_all}, {"test_vset_remove_expire_shrink", test_vset_remove_expire_shrink}, {"test_vset_defrag", test_vset_defrag}, {"test_vset_fuzzer", test_vset_fuzzer}, {NULL, NULL}}; +unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, 
{"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", 
test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; +unitTest __test_zipmap_c[] = {{"test_zipmapIterateWithLargeKey", test_zipmapIterateWithLargeKey}, {"test_zipmapIterateThroughElements", test_zipmapIterateThroughElements}, {NULL, NULL}}; +unitTest __test_zmalloc_c[] = {{"test_zmallocAllocReallocCallocAndFree", test_zmallocAllocReallocCallocAndFree}, {"test_zmallocAllocZeroByteAndFree", test_zmallocAllocZeroByteAndFree}, {NULL, NULL}}; + +struct unitTestSuite { + char *filename; + unitTest *tests; +} unitTestSuite[] = { + {"test_bitops.c", __test_bitops_c}, + {"test_crc64.c", __test_crc64_c}, + {"test_crc64combine.c", __test_crc64combine_c}, + {"test_dict.c", __test_dict_c}, + {"test_endianconv.c", __test_endianconv_c}, + {"test_entry.c", __test_entry_c}, + {"test_fifo.c", __test_fifo_c}, + {"test_hashtable.c", __test_hashtable_c}, + {"test_intset.c", __test_intset_c}, + {"test_kvstore.c", __test_kvstore_c}, + {"test_listpack.c", __test_listpack_c}, + {"test_mutexqueue.c", __test_mutexqueue_c}, + {"test_networking.c", __test_networking_c}, + {"test_object.c", __test_object_c}, + {"test_quicklist.c", __test_quicklist_c}, + {"test_rax.c", __test_rax_c}, + {"test_sds.c", __test_sds_c}, + {"test_sha1.c", __test_sha1_c}, + {"test_sha256.c", __test_sha256_c}, + {"test_util.c", __test_util_c}, + {"test_valkey_strtod.c", __test_valkey_strtod_c}, + {"test_vector.c", __test_vector_c}, + {"test_vset.c", __test_vset_c}, + {"test_ziplist.c", __test_ziplist_c}, + {"test_zipmap.c", __test_zipmap_c}, + {"test_zmalloc.c", __test_zmalloc_c}, +}; diff --git a/src/unit/test_reply_blocking.cpp b/src/unit/test_reply_blocking.cpp new file mode 100644 index 00000000000..7e487c92f26 --- /dev/null +++ b/src/unit/test_reply_blocking.cpp @@ -0,0 +1,1041 @@ +/* + * Copyright (c) Valkey Contributors + * All rights 
reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "generated_wrappers.hpp" + +#include +#include +#include +#include + +extern "C" { +#include "server.h" +#include "reply_blocking.h" +#include "durability_provider.h" +#include "uncommitted_keys.h" + +/* Forward declarations used by tests */ +} + +/* ========================= Test Helpers ========================= */ + +static void initTestEnv(void) { + static char test_logfile[] = ""; + if (server.logfile == nullptr) { + server.logfile = test_logfile; + } +} + +/** + * Minimal durability initialization for tests: calls uncommittedKeysInitPending(), initTaskTypes(), + * durableTaskInitLists() and registerBuiltinDurabilityProviders(), and resets the server.durability offset, wait list and counters. + */ +static void initDurabilityForTest(void) { + uncommittedKeysInitPending(); + initTaskTypes(); + server.durability.previous_acked_offset = -1; + server.durability.clients_waiting_ack = listCreate(); + durableTaskInitLists(); + server.durability.clients_blocked = 0; + server.durability.clients_unblocked = 0; + server.durability.clients_disconnected_before_unblocking = 0; + server.durability.read_responses_blocked = 0; + server.durability.write_responses_blocked = 0; + server.durability.other_responses_blocked = 0; + server.durability.read_responses_unblocked = 0; + server.durability.write_responses_unblocked = 0; + server.durability.other_responses_unblocked = 0; + server.durability.read_responses_blocked_cumulative_time_us = 0; + server.durability.write_responses_blocked_cumulative_time_us = 0; + server.durability.other_responses_blocked_cumulative_time_us = 0; + registerBuiltinDurabilityProviders(); +} + +/** + * Minimal durability cleanup for tests. 
+ */ +static void cleanupDurabilityForTest(void) { + if (server.durability.clients_waiting_ack) { + listRelease(server.durability.clients_waiting_ack); + server.durability.clients_waiting_ack = nullptr; + } + uncommittedKeysCleanupPending(); + durableTaskCleanupLists(); + resetDurabilityProviders(); +} + +/* ========================= Test Fixtures ========================= */ + +class SyncReplicationTest : public ::testing::Test { + protected: + void SetUp() override { + initTestEnv(); + } +}; + +class DurabilityProviderTest : public ::testing::Test { + protected: + /* Saved state */ + int old_aof_state; + int old_aof_fsync; + long long old_fsynced_reploff; + list *old_replicas; + list *old_clients_pending_write; + char *old_primary_host; + durable_t old_durability; + + void SetUp() override { + initTestEnv(); + old_aof_state = server.aof_state; + old_aof_fsync = server.aof_fsync; + old_fsynced_reploff = server.fsynced_reploff; + old_replicas = server.replicas; + old_clients_pending_write = server.clients_pending_write; + old_primary_host = server.primary_host; + old_durability = server.durability; + + server.primary_host = nullptr; + server.clients_pending_write = listCreate(); + server.replicas = listCreate(); + } + + void TearDown() override { + listRelease(server.clients_pending_write); + listRelease(server.replicas); + + server.aof_state = old_aof_state; + server.aof_fsync = old_aof_fsync; + server.fsynced_reploff = old_fsynced_reploff; + server.replicas = old_replicas; + server.clients_pending_write = old_clients_pending_write; + server.primary_host = old_primary_host; + server.durability = old_durability; + } +}; + +class UncommittedKeysTest : public ::testing::Test { + protected: + serverDb **old_db; + int old_dbnum; + char *old_primary_host; + int old_cluster_enabled; + long long old_previous_acked_offset; + long long old_primary_repl_offset; + + void SetUp() override { + initTestEnv(); + old_db = server.db; + old_dbnum = server.dbnum; + old_primary_host = 
server.primary_host; + old_cluster_enabled = server.cluster_enabled; + old_previous_acked_offset = server.durability.previous_acked_offset; + old_primary_repl_offset = server.primary_repl_offset; + + server.cluster_enabled = 0; + server.primary_host = nullptr; + server.dbnum = 1; + server.db = (serverDb **)zcalloc(sizeof(serverDb *)); + server.db[0] = (serverDb *)zcalloc(sizeof(serverDb)); + durabilityInitDatabase(server.db[0]); + } + + void TearDown() override { + hashtableRelease(server.db[0]->uncommitted_keys); + zfree(server.db[0]); + zfree(server.db); + + server.db = old_db; + server.dbnum = old_dbnum; + server.primary_host = old_primary_host; + server.cluster_enabled = old_cluster_enabled; + server.durability.previous_acked_offset = old_previous_acked_offset; + server.primary_repl_offset = old_primary_repl_offset; + } +}; + +/* ========================= Durability Tests ========================= */ + +TEST_F(DurabilityProviderTest, IsDurabilityEnabled) { + initDurabilityForTest(); + + /* Durability delegates to anyDurabilityProviderEnabled(). + * The built-in AOF provider enables when AOF on + appendfsync always. 
*/ + server.aof_state = AOF_OFF; + server.aof_fsync = AOF_FSYNC_EVERYSEC; + ASSERT_EQ(isDurabilityEnabled(), 0); + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_EVERYSEC; + ASSERT_EQ(isDurabilityEnabled(), 0); + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + ASSERT_EQ(isDurabilityEnabled(), 1); + + server.aof_state = AOF_OFF; + server.aof_fsync = AOF_FSYNC_ALWAYS; + ASSERT_EQ(isDurabilityEnabled(), 0); + + cleanupDurabilityForTest(); +} + +TEST_F(SyncReplicationTest, IsPrimaryDurabilityEnabled) { + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + + /* Primary (not a replica) */ + server.primary_host = nullptr; + ASSERT_EQ(isPrimaryDurabilityEnabled(), 1); + + /* Replica */ + server.primary_host = sdsnew("127.0.0.1"); + ASSERT_EQ(isPrimaryDurabilityEnabled(), 0); + sdsfree(server.primary_host); + + /* Disabled (appendfsync != always) + primary */ + server.aof_fsync = AOF_FSYNC_EVERYSEC; + server.primary_host = nullptr; + ASSERT_EQ(isPrimaryDurabilityEnabled(), 0); +} + +TEST_F(SyncReplicationTest, ClientInitAndReset) { + client *c = (client *)zcalloc(sizeof(client)); + c->clientDurabilityInfo.blocked_responses = nullptr; + c->clientDurabilityInfo.durability_blocked = 0; + c->clientDurabilityInfo.current_command_repl_offset = 0; + + /* Disabled — should be a no-op */ + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; + durabilityClientInit(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, nullptr); + + /* Enabled — should initialize */ + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + durabilityClientInit(c); + ASSERT_NE(c->clientDurabilityInfo.blocked_responses, nullptr); + ASSERT_EQ(listLength(c->clientDurabilityInfo.blocked_responses), 0u); + ASSERT_FALSE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.offset.reply_block, nullptr); + ASSERT_EQ(c->clientDurabilityInfo.offset.byte_offset, 0u); + 
ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + + /* Reset — should free */ + durabilityClientReset(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, nullptr); + ASSERT_FALSE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; + zfree(c); +} + +TEST_F(SyncReplicationTest, IsClientReplyBufferLimited) { + client *c = (client *)zcalloc(sizeof(client)); + + /* No blocked_responses list */ + c->clientDurabilityInfo.blocked_responses = nullptr; + ASSERT_FALSE(isClientReplyBufferLimited(c)); + + /* Empty blocked_responses list */ + c->clientDurabilityInfo.blocked_responses = listCreate(); + ASSERT_FALSE(isClientReplyBufferLimited(c)); + + /* Non-empty blocked_responses list */ + blockedResponse *br = (blockedResponse *)zcalloc(sizeof(blockedResponse)); + br->primary_repl_offset = 100; + br->disallowed_byte_offset = 0; + br->disallowed_reply_block = nullptr; + listAddNodeTail(c->clientDurabilityInfo.blocked_responses, br); + ASSERT_TRUE(isClientReplyBufferLimited(c)); + + listSetFreeMethod(c->clientDurabilityInfo.blocked_responses, zfree); + listRelease(c->clientDurabilityInfo.blocked_responses); + zfree(c); +} + +/* Verify that clientHasPendingReplies uses bufpos (not data_len) when + * comparing against the blocked response's disallowed_byte_offset. + * + * With copy avoidance, encoded reply buffers contain payload headers + + * bulk-string references. The io_last_written.data_len tracks the total + * *decoded* data written to the socket (i.e. RESP bytes on the wire) + * which can be larger than the encoded buffer position (bufpos). + * Using data_len for the comparison would cause the response to appear + * "fully written" prematurely, releasing the blocked reply before the + * durability provider acknowledges the write. 
*/ +TEST_F(SyncReplicationTest, ClientHasPendingRepliesUsesBufposNotDataLen) { + client *c = (client *)zcalloc(sizeof(client)); + c->reply = listCreate(); + c->repl_data = nullptr; + c->slot_migration_job = nullptr; + c->raw_flag = 0; + + /* Set up a blocked response at offset 100 in c->buf (no reply block) */ + c->clientDurabilityInfo.blocked_responses = listCreate(); + listSetFreeMethod(c->clientDurabilityInfo.blocked_responses, zfree); + + blockedResponse *br = (blockedResponse *)zcalloc(sizeof(blockedResponse)); + br->primary_repl_offset = 500; + br->disallowed_byte_offset = 100; + br->disallowed_reply_block = nullptr; + listAddNodeTail(c->clientDurabilityInfo.blocked_responses, br); + + /* Simulate: 200 bytes in the static buffer, no reply list entries */ + c->bufpos = 200; + + /* Case 1: bufpos < disallowed_byte_offset => has pending replies + * (the write hasn't reached the blocked boundary yet) */ + c->io_last_written.buf = nullptr; + c->io_last_written.bufpos = 50; + c->io_last_written.data_len = 50; + ASSERT_TRUE(clientHasPendingReplies(c)); + + /* Case 2: bufpos == disallowed_byte_offset => no pending replies + * (the write has exactly reached the blocked boundary) */ + c->io_last_written.bufpos = 100; + c->io_last_written.data_len = 100; + ASSERT_FALSE(clientHasPendingReplies(c)); + + /* Case 3: The critical copy-avoidance scenario. + * bufpos is still below the boundary (e.g. 80, because encoded buffer + * is compact), but data_len is above it (e.g. 120, because decoded + * RESP on the wire is larger than the encoded buffer). + * + * With the old bug (using data_len), this would return false (no pending) + * causing the response to be released prematurely. + * With the fix (using bufpos), this correctly returns true (still pending). 
*/ + c->io_last_written.bufpos = 80; + c->io_last_written.data_len = 120; + ASSERT_TRUE(clientHasPendingReplies(c)); + + listRelease(c->clientDurabilityInfo.blocked_responses); + listRelease(c->reply); + zfree(c); +} + +/* ========================= DurabilityProvider Tests ========================= */ + +TEST_F(DurabilityProviderTest, BuiltinAofProviderDisabledWhenAofOff) { + initDurabilityForTest(); + + server.aof_state = AOF_OFF; + server.aof_fsync = AOF_FSYNC_EVERYSEC; + ASSERT_FALSE(anyDurabilityProviderEnabled()); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, AofProviderEnabledOnlyWhenAlwaysFsync) { + initDurabilityForTest(); + + /* AOF provider is only enabled when AOF is on AND appendfsync is always */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + /* Not enabled with other fsync policies */ + server.aof_fsync = AOF_FSYNC_EVERYSEC; + ASSERT_FALSE(anyDurabilityProviderEnabled()); + + server.aof_fsync = AOF_FSYNC_NO; + ASSERT_FALSE(anyDurabilityProviderEnabled()); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, NoProviderEnabledWhenNotAlwaysFsync) { + initDurabilityForTest(); + + /* When fsync != always, no provider is enabled so consensus = primary_repl_offset */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_EVERYSEC; + server.primary_repl_offset = 500; + ASSERT_EQ(getDurabilityConsensusOffset(), 500); + + server.aof_fsync = AOF_FSYNC_NO; + server.primary_repl_offset = 700; + ASSERT_EQ(getDurabilityConsensusOffset(), 700); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, AofProviderPauseAndResume) { + initDurabilityForTest(); + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + server.primary_repl_offset = 300; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)300, __ATOMIC_RELAXED); + server.fsynced_reploff = 300; + + /* Before pause: consensus = 300 (fsynced) */ + 
ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Pause: consensus should be frozen at 300 (the offset at pause time). + * New writes that advance primary_repl_offset past 300 will block, + * but already-acknowledged data remains unblocked. */ + ASSERT_TRUE(pauseDurabilityProvider("aof")); + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Advance primary_repl_offset — consensus stays frozen at 300 */ + server.primary_repl_offset = 500; + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Resume: consensus should catch up to actual fsynced offset */ + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + server.durability.previous_acked_offset = -1; + ASSERT_TRUE(resumeDurabilityProvider("aof")); + ASSERT_EQ(getDurabilityConsensusOffset(), 300); + + /* Nonexistent provider returns false */ + ASSERT_FALSE(pauseDurabilityProvider("nonexistent")); + ASSERT_FALSE(resumeDurabilityProvider("nonexistent")); + + cleanupDurabilityForTest(); +} + +/* Custom test provider */ +static bool testProviderEnabled = true; +static long long testProviderOffset = 50; +static bool testCustomIsEnabled(void) { return testProviderEnabled; } +static long long testCustomGetAckedOffset(void) { return testProviderOffset; } + +TEST_F(DurabilityProviderTest, CustomProviderRegistrationAndConsensus) { + initDurabilityForTest(); + + durabilityProvider customProvider = { + .name = "custom-test", + .isEnabled = testCustomIsEnabled, + .getAckedOffset = testCustomGetAckedOffset, + }; + + /* Enable AOF provider */ + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)300, __ATOMIC_RELAXED); + server.fsynced_reploff = 300; + + /* Register custom provider */ + testProviderEnabled = true; + testProviderOffset = 50; + registerDurabilityProvider(&customProvider); + ASSERT_TRUE(anyDurabilityProviderEnabled()); + + /* Consensus = MIN(aof=300, custom=50) = 50 */ + server.primary_repl_offset = 300; + 
ASSERT_EQ(getDurabilityConsensusOffset(), 50); + + /* Unregister */ + unregisterDurabilityProvider(&customProvider); + + cleanupDurabilityForTest(); +} + +TEST_F(DurabilityProviderTest, CustomProviderDisabledIsSkipped) { + initDurabilityForTest(); + + durabilityProvider customProvider = { + .name = "custom-disabled", + .isEnabled = testCustomIsEnabled, + .getAckedOffset = testCustomGetAckedOffset, + }; + + server.aof_state = AOF_ON; + server.aof_fsync = AOF_FSYNC_ALWAYS; + __atomic_store_n(&server.fsynced_reploff_pending, (long long)200, __ATOMIC_RELAXED); + server.fsynced_reploff = 200; + + testProviderEnabled = false; + testProviderOffset = 10; + registerDurabilityProvider(&customProvider); + + /* Custom disabled, only AOF enabled => consensus = 200 */ + server.primary_repl_offset = 300; + ASSERT_EQ(getDurabilityConsensusOffset(), 200); + + unregisterDurabilityProvider(&customProvider); + cleanupDurabilityForTest(); +} + +static long long negativeOffsetProvider(void) { return -1; } +static bool alwaysEnabled(void) { return true; } + +TEST_F(DurabilityProviderTest, ProviderReturningNegativeOneBlocksConsensus) { + initDurabilityForTest(); + + durabilityProvider blockingProvider = { + .name = "blocking", + .isEnabled = alwaysEnabled, + .getAckedOffset = negativeOffsetProvider, + }; + + registerDurabilityProvider(&blockingProvider); + server.primary_repl_offset = 300; + ASSERT_EQ(getDurabilityConsensusOffset(), -1); + + unregisterDurabilityProvider(&blockingProvider); + cleanupDurabilityForTest(); +} + + +/* ========================= UncommittedKeys Tests ========================= */ + +TEST_F(UncommittedKeysTest, HandleAndPurgeUncommittedKey) { + robj *key_obj = createStringObject("key", 3); + sds key = (sds)objectGetVal(key_obj); + long long offset = 10; + server.primary_repl_offset = offset; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + + /* Key should be in uncommitted set */ + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + 
/* Not yet acked — should return the offset */ + server.durability.previous_acked_offset = 5; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(key, server.db[0]), offset); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Acked — should purge and return -1 */ + server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(key, server.db[0]), -1); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, MultipleKeysTracked) { + robj *k1 = createStringObject("key1", 4); + robj *k2 = createStringObject("key2", 4); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, k1, server.db[0]); + server.primary_repl_offset = 20; + handleUncommittedKeyForClient(nullptr, k2, server.db[0]); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 2u); + + /* Ack up to 10 — only key1 should be purged */ + server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k1), server.db[0]), -1); + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k2), server.db[0]), 20); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Ack up to 20 — key2 also purged */ + server.durability.previous_acked_offset = 20; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(k2), server.db[0]), -1); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(k1); + decrRefCount(k2); +} + +TEST_F(UncommittedKeysTest, KeyOffsetUpdatedOnRewrite) { + robj *key_obj = createStringObject("key", 3); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Rewrite same key at higher offset */ + server.primary_repl_offset = 50; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + 
ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Old offset acked but new offset not */ + server.durability.previous_acked_offset = 10; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]), 50); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, NonexistentKeyReturnsNegativeOne) { + sds missing = sdsnew("nonexistent"); + server.durability.previous_acked_offset = 0; + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset(missing, server.db[0]), -1); + sdsfree(missing); +} + +TEST_F(UncommittedKeysTest, HasUncommittedKeysAcrossDBs) { + /* No uncommitted keys initially */ + ASSERT_EQ(hasUncommittedKeys(), 0); + + robj *key_obj = createStringObject("key", 3); + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + ASSERT_EQ(hasUncommittedKeys(), 1); + + /* Purge it */ + server.durability.previous_acked_offset = 10; + durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]); + ASSERT_EQ(hasUncommittedKeys(), 0); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, GetNumberOfUncommittedKeys) { + ASSERT_EQ(getNumberOfUncommittedKeys(), 0u); + + robj *k1 = createStringObject("a", 1); + robj *k2 = createStringObject("b", 1); + robj *k3 = createStringObject("c", 1); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, k1, server.db[0]); + handleUncommittedKeyForClient(nullptr, k2, server.db[0]); + handleUncommittedKeyForClient(nullptr, k3, server.db[0]); + + ASSERT_EQ(getNumberOfUncommittedKeys(), 3u); + + decrRefCount(k1); + decrRefCount(k2); + decrRefCount(k3); +} + +TEST_F(UncommittedKeysTest, DrainCommittedKeysRemovesCommitted) { + robj *k1 = createStringObject("key1", 4); + robj *k2 = createStringObject("key2", 4); + robj *k3 = createStringObject("key3", 4); + + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, k1, server.db[0]); + server.primary_repl_offset = 20; + 
handleUncommittedKeyForClient(nullptr, k2, server.db[0]); + server.primary_repl_offset = 30; + handleUncommittedKeyForClient(nullptr, k3, server.db[0]); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 3u); + + /* Drain up to offset 20 — key1 and key2 should be removed */ + drainCommittedKeys(20); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* key3 should still be present */ + void *found = nullptr; + ASSERT_TRUE(hashtableFind(server.db[0]->uncommitted_keys, + (sds)objectGetVal(k3), &found)); + + /* Drain up to 30 — key3 removed */ + drainCommittedKeys(30); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(k1); + decrRefCount(k2); + decrRefCount(k3); +} + +TEST_F(UncommittedKeysTest, DrainPreservesReDirtiedKey) { + robj *key_obj = createStringObject("hotkey", 6); + + /* Write at offset 10 */ + server.primary_repl_offset = 10; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + + /* Re-dirty at offset 50 */ + server.primary_repl_offset = 50; + handleUncommittedKeyForClient(nullptr, key_obj, server.db[0]); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Drain up to 10 — key should NOT be removed because it was re-dirtied at 50 */ + drainCommittedKeys(10); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + + /* Drain up to 50 — now it should be removed */ + drainCommittedKeys(50); + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); + + decrRefCount(key_obj); +} + +TEST_F(UncommittedKeysTest, DrainClearsDirtyDbOffset) { + server.db[0]->dirty_repl_offset = 100; + + /* Drain below the DB offset — should not clear */ + drainCommittedKeys(50); + ASSERT_EQ(server.db[0]->dirty_repl_offset, 100); + + /* Drain at the DB offset — should clear */ + drainCommittedKeys(100); + ASSERT_EQ(server.db[0]->dirty_repl_offset, -1); +} + +TEST_F(UncommittedKeysTest, DrainEmptyHashtableIsNoop) { + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); 
+ drainCommittedKeys(1000); /* Should not crash */ + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 0u); +} + + +/* ========================= Function Store Tests ========================= */ + +TEST_F(SyncReplicationTest, FunctionStoreUncommittedTracking) { + server.durability.previous_acked_offset = 0; + + /* Not uncommitted initially */ + ASSERT_FALSE(isDurableFunctionStoreUncommitted()); + + /* Mark uncommitted */ + server.execution_nesting = 0; + server.primary_repl_offset = 100; + handleUncommittedFunctionStore(); + ASSERT_TRUE(isDurableFunctionStoreUncommitted()); + ASSERT_EQ(getFuncStoreBlockingOffset(), 100); + + /* After acking, it should no longer be uncommitted */ + server.durability.previous_acked_offset = 100; + ASSERT_FALSE(isDurableFunctionStoreUncommitted()); +} + +/* ========================= INFO String Test ========================= */ + +TEST_F(SyncReplicationTest, GenInfoStringDisabled) { + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; + sds info = sdsempty(); + info = genDurabilityInfoString(info); + ASSERT_NE(strstr(info, "durability_enabled:0"), nullptr); + sdsfree(info); +} + +TEST_F(SyncReplicationTest, GenInfoStringEnabled) { + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + server.durability.clients_waiting_ack = listCreate(); + server.durability.read_responses_blocked = 5; + server.durability.write_responses_blocked = 3; + server.durability.previous_acked_offset = 42; + server.primary_repl_offset = 100; + + sds info = sdsempty(); + info = genDurabilityInfoString(info); + ASSERT_NE(strstr(info, "durability_enabled:1"), nullptr); + ASSERT_NE(strstr(info, "durability_read_blocked_count:5"), nullptr); + ASSERT_NE(strstr(info, "durability_write_blocked_count:3"), nullptr); + ASSERT_NE(strstr(info, "durability_previous_acked_offset:42"), nullptr); + ASSERT_NE(strstr(info, "durability_primary_repl_offset:100"), nullptr); + + sdsfree(info); + listRelease(server.durability.clients_waiting_ack); + 
server.durability.clients_waiting_ack = nullptr; + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; +} + +/* ========================= Migrated from C tests ========================= */ + +/** + * Fixture for tests that need full durability init (durabilityInit) + * plus database and client setup. + */ +class FullDurabilityTest : public ::testing::Test { + protected: + serverDb **old_db; + int old_dbnum; + char *old_primary_host; + int old_cluster_enabled; + long long old_primary_repl_offset; + int old_get_ack; + list *old_replicas; + list *old_clients_pending_write; + int old_aof_state; + int old_aof_fsync; + long long old_fsynced_reploff; + durable_t old_durability; + list *old_monitors; + + void SetUp() override { + initTestEnv(); + old_db = server.db; + old_dbnum = server.dbnum; + old_primary_host = server.primary_host; + old_cluster_enabled = server.cluster_enabled; + old_primary_repl_offset = server.primary_repl_offset; + old_get_ack = server.get_ack_from_replicas; + old_replicas = server.replicas; + old_clients_pending_write = server.clients_pending_write; + old_aof_state = server.aof_state; + old_aof_fsync = server.aof_fsync; + old_fsynced_reploff = server.fsynced_reploff; + old_durability = server.durability; + old_monitors = server.monitors; + + server.cluster_enabled = 0; + server.primary_host = nullptr; + server.clients_pending_write = listCreate(); + server.monitors = listCreate(); + server.dbnum = 1; + server.db = (serverDb **)zcalloc(sizeof(serverDb *)); + server.db[0] = (serverDb *)zcalloc(sizeof(serverDb)); + durabilityInitDatabase(server.db[0]); + + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + durabilityInit(); + } + + void TearDown() override { + durabilityCleanup(); + listRelease(server.clients_pending_write); + listRelease(server.monitors); + hashtableRelease(server.db[0]->uncommitted_keys); + zfree(server.db[0]); + zfree(server.db); + + server.db = old_db; + server.dbnum = old_dbnum; + server.primary_host 
= old_primary_host; + server.cluster_enabled = old_cluster_enabled; + server.primary_repl_offset = old_primary_repl_offset; + server.get_ack_from_replicas = old_get_ack; + server.replicas = old_replicas; + server.clients_pending_write = old_clients_pending_write; + server.aof_state = old_aof_state; + server.aof_fsync = old_aof_fsync; + server.fsynced_reploff = old_fsynced_reploff; + server.durability = old_durability; + server.monitors = old_monitors; + } +}; + +/* Migrated from test_durableInit */ +TEST_F(SyncReplicationTest, SyncReplicationInitSetsDefaults) { + /* initDurabilityForTest() approximates durabilityInit(); verify fields */ + initDurabilityForTest(); + + ASSERT_NE(server.durability.clients_waiting_ack, nullptr); + ASSERT_EQ(listLength(server.durability.clients_waiting_ack), 0u); + ASSERT_EQ(server.durability.previous_acked_offset, -1); + ASSERT_EQ(server.durability.clients_blocked, 0u); + ASSERT_EQ(server.durability.clients_unblocked, 0u); + ASSERT_EQ(server.durability.clients_disconnected_before_unblocking, 0u); + ASSERT_EQ(server.durability.read_responses_blocked, 0u); + ASSERT_EQ(server.durability.write_responses_blocked, 0u); + ASSERT_EQ(server.durability.other_responses_blocked, 0u); + + cleanupDurabilityForTest(); +} + +/* Migrated from test_beforeCommandTrackReplOffset */ +TEST_F(FullDurabilityTest, BeforeCommandTrackReplOffset) { + client *c = (client *)zcalloc(sizeof(client)); + durabilityClientInit(c); + + struct serverCommand readonly_cmd = {.declared_name = "get", .flags = CMD_READONLY}; + c->cmd = &readonly_cmd; + + server.primary_repl_offset = 500; + beforeCommandTrackReplOffset(c); + + /* pre_call_replication_offset should be snapshotted */ + ASSERT_EQ(server.durability.pre_call_replication_offset, 500); + + durabilityClientReset(c); + zfree(c); +} + +/* Migrated from test_preCommandExec — Case 1: durability disabled */ +TEST_F(SyncReplicationTest, PreCommandExecDurabilityDisabled) { + struct serverCommand readonly_cmd = {.declared_name 
= "get", .flags = CMD_READONLY}; + + /* preCommandExec always accesses server.monitors via isCommandReplicatedToMonitors() */ + list *old_monitors = server.monitors; + server.monitors = listCreate(); + + client *c = (client *)zcalloc(sizeof(client)); + c->cmd = &readonly_cmd; + c->clientDurabilityInfo.current_command_repl_offset = 123; + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; + server.primary_repl_offset = 555; + + ASSERT_EQ(preCommandExec(c), CMD_FILTER_ALLOW); + /* preCommandExec always resets current_command_repl_offset to -1 */ + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + /* pre_command_replication_offset is always snapshotted */ + ASSERT_EQ(server.durability.pre_command_replication_offset, 555); + + zfree(c); + listRelease(server.monitors); + server.monitors = old_monitors; +} + +/* Migrated from test_preCommandExec — Case 2: durability enabled on primary */ +TEST_F(FullDurabilityTest, PreCommandExecDurabilityEnabledOnPrimary) { + struct serverCommand readonly_cmd = {.declared_name = "get", .flags = CMD_READONLY}; + + client *c = (client *)zcalloc(sizeof(client)); + durabilityClientInit(c); + c->cmd = &readonly_cmd; + c->bufpos = 7; + c->clientDurabilityInfo.current_command_repl_offset = 88; + server.primary_repl_offset = 1234; + + ASSERT_EQ(preCommandExec(c), CMD_FILTER_ALLOW); + /* current_command_repl_offset should be reset to -1 */ + ASSERT_EQ(c->clientDurabilityInfo.current_command_repl_offset, -1); + /* Pre-execution position should be tracked */ + ASSERT_TRUE(c->clientDurabilityInfo.offset.recorded); + ASSERT_EQ(c->clientDurabilityInfo.offset.reply_block, nullptr); + ASSERT_EQ(c->clientDurabilityInfo.offset.byte_offset, 7u); + ASSERT_EQ(server.durability.pre_command_replication_offset, 1234); + + durabilityClientReset(c); + zfree(c); +} + +/* Migrated from test_multi_exec_defers_dirty_keys */ +TEST_F(FullDurabilityTest, MultiExecDefersDirtyKeys) { + client *c = (client *)zcalloc(sizeof(client)); + 
durabilityClientInit(c); + c->db = server.db[0]; + c->reply = listCreate(); + listSetFreeMethod(c->reply, zfree); + + /* Inside a MULTI — key is marked dirty immediately with placeholder offset */ + c->flag.multi = 1; + robj *key_obj = createStringObject("multi-key", 9); + handleUncommittedKeyForClient(c, key_obj, server.db[0]); + /* Key should be in uncommitted set immediately (with LLONG_MAX placeholder) */ + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]), LLONG_MAX); + + /* After EXEC completes: postCommandExec commits deferred keys */ + c->flag.multi = 0; + struct serverCommand exec_cmd = {.declared_name = "exec", .proc = execCommand, .flags = 0}; + c->cmd = &exec_cmd; + c->clientDurabilityInfo.current_command_repl_offset = -1; + server.primary_repl_offset = 100; + server.durability.pre_command_replication_offset = 100; + server.durability.previous_acked_offset = 0; + postCommandExec(c); + + ASSERT_EQ(hashtableSize(server.db[0]->uncommitted_keys), 1u); + ASSERT_EQ(durabilityPurgeAndGetUncommittedKeyOffset((sds)objectGetVal(key_obj), server.db[0]), 100); + + decrRefCount(key_obj); + listRelease(c->reply); + durabilityClientReset(c); + zfree(c); +} + +/* Note: test_exec_blocks_reply_and_tracks_dirty_keys from the C test suite + * exercised the full end-to-end blocking/unblocking flow including + * notifyDurabilityProgress with replica ack simulation. This requires + * putClientInPendingWriteQueue which needs a fully event-loop-registered client. + * The blocking path is covered by the MultiExecDefersDirtyKeys test above, + * and the full integration flow is tested by tests/durability/reply_blocking.tcl. 
*/ + +/* ========================= Additional Coverage Tests ========================= */ + +/* Test updateFuncStoreBlockingOffsetForWrite */ +TEST_F(SyncReplicationTest, UpdateFuncStoreBlockingOffsetForWrite) { + server.durability.func_store_blocking_offset = -1; + server.durability.processed_func_write_in_transaction = false; + + /* Should not update when no func write was processed in transaction */ + updateFuncStoreBlockingOffsetForWrite(200); + ASSERT_EQ(server.durability.func_store_blocking_offset, -1); + + /* Should update when processed_func_write_in_transaction is set */ + server.durability.processed_func_write_in_transaction = true; + updateFuncStoreBlockingOffsetForWrite(200); + ASSERT_EQ(server.durability.func_store_blocking_offset, 200); + ASSERT_FALSE(server.durability.processed_func_write_in_transaction); +} + +/* Test handleUncommittedFunctionStore inside vs outside a transaction */ +TEST_F(SyncReplicationTest, HandleUncommittedFunctionStoreInsideTransaction) { + server.durability.processed_func_write_in_transaction = false; + server.durability.func_store_blocking_offset = -1; + + /* Inside a transaction (execution_nesting > 0): should only set the flag */ + server.execution_nesting = 1; + server.primary_repl_offset = 300; + handleUncommittedFunctionStore(); + ASSERT_TRUE(server.durability.processed_func_write_in_transaction); + ASSERT_EQ(server.durability.func_store_blocking_offset, -1); + + /* Outside a transaction: should set the blocking offset directly */ + server.execution_nesting = 0; + server.durability.processed_func_write_in_transaction = false; + server.primary_repl_offset = 400; + handleUncommittedFunctionStore(); + ASSERT_FALSE(server.durability.processed_func_write_in_transaction); + ASSERT_EQ(server.durability.func_store_blocking_offset, 400); +} + +/* Test notifyDurabilityProgress when sync replication is disabled */ +TEST_F(SyncReplicationTest, NotifyDurabilityProgressNoOpWhenDisabled) { + server.aof_state = AOF_OFF; 
server.aof_fsync = AOF_FSYNC_EVERYSEC; + server.primary_host = nullptr; + long long old_offset = server.durability.previous_acked_offset; + notifyDurabilityProgress(); + /* Should be a no-op */ + ASSERT_EQ(server.durability.previous_acked_offset, old_offset); +} + +/* Test notifyDurabilityProgress when server is a replica */ +TEST_F(SyncReplicationTest, NotifyDurabilityProgressNoOpWhenReplica) { + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + server.primary_host = sdsnew("127.0.0.1"); + long long old_offset = server.durability.previous_acked_offset; + notifyDurabilityProgress(); + ASSERT_EQ(server.durability.previous_acked_offset, old_offset); + sdsfree(server.primary_host); + server.primary_host = nullptr; + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; +} + + +/* Test that keyspace notify task copies the event string so it doesn't + * become a dangling pointer when the caller frees the original. */ +TEST_F(FullDurabilityTest, KeyspaceNotifyTaskCopiesEventString) { + /* Create a mutable event string that we'll free after registering the task */ + char *event = (char *)zmalloc(16); + strcpy(event, "set"); + + robj *key_obj = createStringObject("mykey", 5); + + /* Register the task — this should copy the event string */ + server.current_client = nullptr; /* simulate background task */ + server.primary_repl_offset = 100; + bool registered = durabilityRegisterDeferredTask( + DURABLE_KEYSPACE_NOTIFY_TASK, + (void *)(long long)0, /* type */ + (void *)event, /* event string — will be freed below */ + (void *)key_obj, /* key */ + (void *)(long long)0 /* dbid */ + ); + ASSERT_TRUE(registered); + + /* Free the original event string — this would cause a dangling pointer + * if the task didn't copy it */ + zfree(event); + + /* The task should still be valid and executable without crash. + * Execute all tasks at offset 100 — the event string inside the task + * should be an independent copy that's still valid. 
*/ + ASSERT_EQ(listLength(server.durability.tasks_waiting_ack[DURABLE_KEYSPACE_NOTIFY_TASK]), 1u); + executeDeferredTasksForAck(100); + ASSERT_EQ(listLength(server.durability.tasks_waiting_ack[DURABLE_KEYSPACE_NOTIFY_TASK]), 0u); + + decrRefCount(key_obj); +} + +/* Test durabilityClientInit is idempotent */ +TEST_F(SyncReplicationTest, ClientInitIdempotent) { + server.aof_state = AOF_ON; server.aof_fsync = AOF_FSYNC_ALWAYS; + + client *c = (client *)zcalloc(sizeof(client)); + c->clientDurabilityInfo.blocked_responses = nullptr; + + durabilityClientInit(c); + list *first_list = c->clientDurabilityInfo.blocked_responses; + ASSERT_NE(first_list, nullptr); + + /* Calling init again should be a no-op — should NOT create a new list */ + durabilityClientInit(c); + ASSERT_EQ(c->clientDurabilityInfo.blocked_responses, first_list); + + durabilityClientReset(c); + server.aof_state = AOF_OFF; server.aof_fsync = AOF_FSYNC_EVERYSEC; + zfree(c); +} diff --git a/tests/durability/reply_blocking.tcl b/tests/durability/reply_blocking.tcl new file mode 100644 index 00000000000..de10219b5f7 --- /dev/null +++ b/tests/durability/reply_blocking.tcl @@ -0,0 +1,1139 @@ +# Tests for reply blocking durability feature +# This test suite covers the synchronous replication functionality +# that blocks client responses until durability providers acknowledge writes. +# +# Tests are parameterized over provider_mode: +# replica - TODO +# aof - unblock via AOF appendfsync=always (automatic in beforeSleep) + +foreach provider_mode {aof} { + + if {$provider_mode eq "replica"} { + set server_overrides {appendonly yes appendfsync always} + } else { + # Durability is implied by appendonly + appendfsync always. + # We use DEBUG durability-provider-pause/resume to control blocking + # instead of toggling appendfsync, which avoids the issue where the + # provider reports as disabled when appendfsync != always. 
+ set server_overrides {appendonly yes appendfsync always} + } + + start_server [list tags {"repl durability external:skip"} overrides $server_overrides] { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + + # Helper: put the provider into a state where writes will block. + # replica mode: ensure no replica is connected (so no one acks writes) + # aof mode: pause the AOF provider so fsynced offsets are not advanced + proc pause_provider {} { + upvar provider_mode provider_mode + upvar primary primary + upvar replica replica + + if {$provider_mode eq "replica"} { + # Disconnect any existing replica so the next write has no one to ack it + $replica replicaof no one + wait_for_condition 50 100 { + [llength [$primary client list type replica]] == 0 + } else { + fail "Primary didn't notice replica disconnect" + } + } else { + # Pause the AOF provider so the next write will block + $primary DEBUG durability-provider-pause aof + } + } + + # Helper: trigger durability acknowledgement, unblocking pending replies. 
+ # replica mode: connect replica and wait for replication ack + # aof mode: resume the AOF provider and ping to force a beforeSleep fsync + proc unblock_with_provider {} { + upvar provider_mode provider_mode + upvar primary primary + upvar primary_host primary_host + upvar primary_port primary_port + upvar replica replica + upvar replica_host replica_host + upvar replica_port replica_port + + if {$provider_mode eq "replica"} { + $replica replicaof $primary_host $primary_port + wait_replica_online $primary + wait_replica_acked_ofs $primary $replica $replica_host $replica_port + } else { + # Resume the AOF provider so it reports real fsynced offsets + $primary DEBUG durability-provider-resume aof + # Issue a PING to force a beforeSleep cycle that fsyncs the AOF + $primary ping + } + } + + # ==================== Write blocking tests ==================== + + test "($provider_mode) Sync replication blocks replies until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + puts "durability blocks" + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:blocked value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Sync replication blocks EXEC replies until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + puts "durability blocks" + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + $rd set durable:multi value + + assert_equal "OK" [$rd read] + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) Sync replication blocks only written keys in 
EXEC" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + puts "durability only written keys in EXEC" + + # Pre-populate before pausing the provider (appendfsync is still always here) so this SET is not left blocked + assert_equal "OK" [$primary set durable:multi-clean clean] + # The pre-populated value is read back on the primary further down, after queueing and before EXEC + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + $rd set durable:multi-dirty value + $rd get durable:multi-clean + + assert_equal "OK" [$rd read] + assert_equal "QUEUED" [$rd read] + assert_equal "QUEUED" [$rd read] + assert_equal {clean} [$primary get durable:multi-clean] + + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK clean} [$rd read] + $rd close + } + + test "($provider_mode) Lua script write blocks replies until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:lua-clean clean] + assert_equal "OK" [$primary config set appendfsync always] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd eval {redis.call('set', KEYS[1], ARGV[1]); return redis.call('get', KEYS[2])} 2 durable:lua-dirty durable:lua-clean value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + set reader [valkey_client -1] + assert_equal {clean} [$reader get durable:lua-clean] + + unblock_with_provider + + assert_equal {clean} [$rd read] + $rd close + } + + test "($provider_mode) Lua script error after partial write still blocks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd eval {redis.call('set', KEYS[1], 
'written'); error('deliberate error')} 1 durable:lua-error-key + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + catch {$rd read} err + assert_match "*deliberate error*" $err + $rd close + } + + # ==================== Non-blocking tests ==================== + + test "($provider_mode) EVAL_RO should not block replies" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:eval-ro-key hello] + assert_equal "OK" [$primary config set appendfsync always] + + set rd [valkey_deferring_client -1] + $rd eval_ro {return redis.call('get', KEYS[1])} 1 durable:eval-ro-key + + assert_equal "hello" [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with DISCARD does not block" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd set durable:discard-key value + assert_equal "QUEUED" [$rd read] + + $rd discard + assert_equal "OK" [$rd read] + + $rd get durable:discard-key + assert_equal "" [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with no writes does not block" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Pre-populate with sync-repl off so the SET doesn't block + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:nowrite-key existing] + assert_equal "OK" [$primary config set appendfsync always] + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd get durable:nowrite-key + assert_equal "QUEUED" [$rd read] + + $rd ping + assert_equal "QUEUED" [$rd read] + + $rd exec + assert_equal {existing PONG} [$rd read] + $rd close 
+ } + + test "($provider_mode) Admin commands are never blocked" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + set rd [valkey_deferring_client -1] + + $rd ping + assert_equal "PONG" [$rd read] + + $rd info server + set info [$rd read] + assert_match "*valkey_version*" $info + + $rd dbsize + set dbsize [$rd read] + assert {[string is integer $dbsize]} + + $rd close + } + + test "($provider_mode) Read-only commands on clean keys are not blocked" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:clean-key cleanvalue] + assert_equal "OK" [$primary config set appendfsync always] + + set rd [valkey_deferring_client -1] + $rd get durable:clean-key + assert_equal "cleanvalue" [$rd read] + $rd close + } + + test "($provider_mode) Sync replication disabled - writes return immediately (regression)" { + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "everysec" [lindex [$primary config get appendfsync] 1] + + set rd [valkey_deferring_client -1] + $rd set durable:norep-key value + assert_equal "OK" [$rd read] + + $rd get durable:norep-key + assert_equal "value" [$rd read] + + $rd multi + assert_equal "OK" [$rd read] + $rd set durable:norep-key2 value2 + assert_equal "QUEUED" [$rd read] + $rd exec + assert_equal {OK} [$rd read] + + $rd close + assert_equal "OK" [$primary config set appendfsync always] + } + + # ==================== Multiple clients ==================== + + test "($provider_mode) Multiple concurrent writers block independently" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set wr1 [valkey_deferring_client -1] + set wr2 [valkey_deferring_client -1] + + $wr1 set durable:concurrent-1 val1 + $wr2 set durable:concurrent-2 val2 + + set fd1 [$wr1 channel] + set fd2 [$wr2 channel] + fconfigure $fd1 -blocking 0 + fconfigure $fd2 -blocking 0 + 
set early1 [read $fd1] + set early2 [read $fd2] + fconfigure $fd1 -blocking 1 + fconfigure $fd2 -blocking 1 + assert_equal "" $early1 + assert_equal "" $early2 + + unblock_with_provider + + assert_equal "OK" [$wr1 read] + assert_equal "OK" [$wr2 read] + + $wr1 close + $wr2 close + } + + test "($provider_mode) Write then read on same client preserves reply ordering" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:ordering-key orderval + $rd get durable:ordering-key + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + assert_equal "orderval" [$rd read] + $rd close + } + + # ==================== Database-level commands ==================== + + test "($provider_mode) FLUSHDB inside MULTI/EXEC blocks entire database" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:flush-pre existing] + assert_equal "OK" [$primary config set appendfsync always] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd flushdb + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) FLUSHALL blocks write reply until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary set durable:flushall-key value] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd flushall + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd 
-blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) FLUSHALL inside MULTI/EXEC blocks all databases" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:flushall-multi-key value] + assert_equal "OK" [$primary config set appendfsync always] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd flushall + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK} [$rd read] + $rd close + } + + test "($provider_mode) COPY cross-database blocks write reply" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "OK" [$primary set durable:copy-src srcvalue] + assert_equal "OK" [$primary config set appendfsync always] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd copy durable:copy-src durable:copy-dst db 1 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal 1 [$rd read] + $rd close + } + + test "($provider_mode) SWAPDB blocks write reply until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + assert_equal "OK" [$primary set durable:swap-db0 db0val] + $primary select 1 + assert_equal "OK" [$primary set durable:swap-db1 db1val] + $primary select 0 + + pause_provider + + set rd [valkey_deferring_client -1] + $rd swapdb 0 1 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal 
"" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + + # Swap back to restore state (with sync-repl off so it doesn't block) + $primary config set appendfsync everysec + $primary swapdb 0 1 + $primary config set appendfsync always + } + + test "($provider_mode) MOVE blocks write reply until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + $primary select 2 + $primary del durable:move-key + $primary select 9 + assert_equal "OK" [$primary set durable:move-key moveval] + assert_equal "OK" [$primary config set appendfsync always] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd move durable:move-key 2 + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal 1 [$rd read] + $rd close + } + + test "($provider_mode) MULTI/EXEC with SELECT writes to multiple databases blocks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd multi + assert_equal "OK" [$rd read] + + $rd set durable:multidb-key0 val0 + assert_equal "QUEUED" [$rd read] + + $rd select 1 + assert_equal "QUEUED" [$rd read] + + $rd set durable:multidb-key1 val1 + assert_equal "QUEUED" [$rd read] + + $rd select 0 + assert_equal "QUEUED" [$rd read] + + $rd exec + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal {OK OK OK OK} [$rd read] + $rd close + } + + # ==================== Function store ==================== + + test "($provider_mode) FUNCTION LOAD blocks reply until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd function load "#!lua 
name=durtest\nserver.register_function('durfunc', function() return 'hello' end)" + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "durtest" [$rd read] + $rd close + } + + test "($provider_mode) FUNCTION DELETE blocks reply until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd function delete durtest + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "OK" [$rd read] + $rd close + } + + # ==================== Dirty key reads ==================== + + test "($provider_mode) Sync replication blocks reads on dirty keys" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:blocked dirty + + set rd [valkey_deferring_client -1] + $rd get durable:blocked + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + assert_equal "dirty" [$rd read] + $rd close + } + + test "($provider_mode) Pipelined non-blocking then blocking command does not leak blocked reply" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + # Pipeline a non-blocking command (PING) followed by a blocking write (SET). + # The PING reply is allowed to be sent, but the SET reply must be held. + # Without proper write boundary capping, _writeToClient would send + # both replies since they share the same c->buf. 
+ set rd [valkey_deferring_client -1] + $rd ping + $rd set pipe:boundary-key val1 + + # Give the server time to process both commands and attempt the write + after 100 + + # Read whatever the server has sent — should be ONLY the PING reply + set fd [$rd channel] + fconfigure $fd -blocking 0 + set partial [read $fd] + fconfigure $fd -blocking 1 + + # PING reply should be "+PONG\r\n" — no "+OK\r\n" from SET + assert_match "*PONG*" $partial + assert {![string match "*OK*" $partial]} + + unblock_with_provider + + # Now the SET reply should arrive + assert_equal "OK" [$rd read] + $rd close + } + + # ==================== Client disconnect stats ==================== + + test "($provider_mode) Client disconnect while blocked updates stats" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:disconnect-test value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + + $rd close + + after 200 + + set info [$primary info all] + assert_match "*durability_clients_waiting_ack:0*" $info + + # Resume the provider so subsequent tests aren't affected + unblock_with_provider + } + + # ==================== Toggle / config changes ==================== + + test "($provider_mode) Sync replication toggling disables reply blocking" { + assert_equal "OK" [$primary config set appendfsync everysec] + assert_equal "everysec" [lindex [$primary config get appendfsync] 1] + + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:toggle value + + set rd [valkey_deferring_client -1] + $rd get durable:toggle + assert_equal "value" [$rd read] + + $rd close + assert_equal "OK" [$primary config set appendfsync always] + } + + test "($provider_mode) Disabling sync replication unblocks pending replies" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd 
[valkey_deferring_client -1] + $rd set durable:toggle-blocked value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + + assert_equal "OK" [$primary config set appendfsync everysec] + + set raw_reply "" + set got_reply 0 + for {set i 0} {$i < 50} {incr i} { + append raw_reply [read $fd] + if {[string match "*\r\n" $raw_reply]} { + set got_reply 1 + break + } + after 100 + } + if {!$got_reply} { + fail "Reply didn't unblock after disabling sync replication" + } + fconfigure $fd -blocking 1 + assert_match "+OK*" $raw_reply + + # Resume the provider so subsequent tests aren't affected + # (disabling sync-replication unblocked the client but didn't resume the provider) + $primary DEBUG durability-provider-resume aof + + assert_equal "OK" [$primary config set appendfsync always] + } + + test "($provider_mode) INFO reports sync replication stats" { + set info [$primary info all] + assert_match "*durability_enabled:1*" $info + assert_match "*durability_primary_repl_offset:*" $info + assert_match "*durability_committed_offset:*" $info + } + + # ==================== Client tracking invalidation (deferred tasks) ==================== + + test "($provider_mode) Key invalidation is deferred until provider acks - signalModifiedKey" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Set up a RESP3 tracking client that will receive invalidation messages + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on + $tracker read ;# consume TRACKING reply + + # Populate a key and cache it via GET on the tracking client + $primary config set appendfsync everysec + $primary set durable:track-key original + $primary config set appendfsync always + + $tracker GET durable:track-key + $tracker read ;# consume "original" — key is now tracked + + # Pause the provider so the next write's invalidation 
is deferred + pause_provider + + # Write to the tracked key from a different client (fire-and-forget) + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:track-key modified + + # Give the server a moment to process the write + after 100 + + # The tracking client should NOT have received an invalidation yet + set tracker_fd [$tracker channel] + fconfigure $tracker_fd -blocking 0 + set early_inval [read $tracker_fd] + fconfigure $tracker_fd -blocking 1 + # No invalidation push should appear while provider is paused + assert_equal "" $early_inval + + # Now unblock — this should trigger the deferred invalidation + unblock_with_provider + + # Read the invalidation message from the tracking client + # RESP3 push: [invalidate [key1 key2 ...]] + set inval_msg [$tracker read] + assert_match "*durable:track-key*" $inval_msg + + $tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + $writer close + } + + test "($provider_mode) Flush invalidation is deferred until provider acks - signalFlushedDb" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + + # Set up a RESP3 BCAST tracking client to catch FLUSHDB invalidations + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on BCAST + $tracker read ;# consume TRACKING reply + + # Populate some keys so there's something to flush + $primary config set appendfsync everysec + $primary set durable:flush-track-a val_a + $primary set durable:flush-track-b val_b + $primary config set appendfsync always + + # Drain any invalidation messages from the SETs above + # (BCAST mode sends invalidations for all writes) + after 100 + set tracker_fd [$tracker channel] + fconfigure $tracker_fd -blocking 0 + read $tracker_fd + fconfigure $tracker_fd -blocking 1 + + # Pause the provider so the FLUSHDB invalidation is deferred + pause_provider + + # Issue 
FLUSHDB from a fire-and-forget writer + set writer [valkey_deferring_client -1] + $writer client reply off + $writer flushdb + + # Give the server time to process the command + after 100 + + # The tracking client should NOT have received flush invalidation yet + fconfigure $tracker_fd -blocking 0 + set early_inval [read $tracker_fd] + fconfigure $tracker_fd -blocking 1 + assert_equal "" $early_inval + + # Unblock — this triggers the deferred flush invalidation + unblock_with_provider + + # The tracking client should now receive an invalidation + # For FLUSHDB, the invalidation message contains NULL to indicate all keys + # Use a polling read with timeout to avoid hanging if message doesn't arrive + set inval_msg "" + set got_inval 0 + fconfigure $tracker_fd -blocking 0 + for {set i 0} {$i < 50} {incr i} { + append inval_msg [read $tracker_fd] + if {[string match "*invalidate*" $inval_msg]} { + set got_inval 1 + break + } + after 100 + } + fconfigure $tracker_fd -blocking 1 + if {!$got_inval} { + fail "Flush invalidation message not received within timeout" + } + assert_match "*invalidate*" $inval_msg + + $tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + $writer close + } + + # ==================== Keyspace notification deferral (deferred tasks) ==================== + + test "($provider_mode) Keyspace notification is deferred until provider acks" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Enable keyspace notifications for all events + $primary config set notify-keyspace-events KA + + # Subscribe to keyspace notifications + set rd1 [valkey_deferring_client -1] + assert_equal {1} [psubscribe $rd1 "__keyspace@*__:*"] + + # Pause the provider so keyspace notifications are deferred + pause_provider + + # Write to a key from a fire-and-forget writer + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:keyspace-deferred-key val + + # Give the server time to process the 
command + after 100 + + # The subscriber should NOT have received the notification yet + set rd1_fd [$rd1 channel] + fconfigure $rd1_fd -blocking 0 + set early_notif [read $rd1_fd] + fconfigure $rd1_fd -blocking 1 + # No keyspace notification should appear while provider is paused + assert_equal "" $early_notif + + # Now unblock — this should trigger the deferred keyspace notification + unblock_with_provider + + # Read the keyspace notification + set notif_msg [$rd1 read] + assert_match "*set*" $notif_msg + + $rd1 close + $writer close + $primary config set notify-keyspace-events "" + } + + test "($provider_mode) Keyspace notification fires immediately when sync replication disabled" { + # Verify that without sync replication, keyspace events are NOT deferred + $primary config set appendfsync everysec + $primary config set notify-keyspace-events KA + + set rd1 [valkey_deferring_client -1] ;# level -1, like every other client of $primary in this file + assert_equal {1} [psubscribe $rd1 *] + $primary set foo bar ;# write on the server whose notify-keyspace-events was just enabled + assert_match "*set*" [$rd1 read] + $rd1 close + + $primary config set notify-keyspace-events "" + $primary config set appendfsync always + } + + # ==================== Client tracking invalidation (existing) ==================== + + test "($provider_mode) Key invalidation fires immediately when sync replication disabled" { + # Verify that without sync replication, invalidations are NOT deferred + $primary config set appendfsync everysec + + set tracker [valkey_deferring_client -1] + $tracker HELLO 3 + $tracker read ;# consume HELLO reply + $tracker CLIENT TRACKING on + $tracker read ;# consume TRACKING reply + + $primary set durable:track-nodefer original + + $tracker GET durable:track-nodefer + $tracker read ;# consume "original" — key is now tracked + + # Write to the tracked key — invalidation should fire immediately + $primary set durable:track-nodefer changed + + # Should get the invalidation right away (no provider pause needed) + set inval_msg [$tracker read] + assert_match "*durable:track-nodefer*" $inval_msg + + 
$tracker CLIENT TRACKING off + $tracker read ;# consume reply + $tracker close + + $primary config set appendfsync always + } + + # ==================== Durability provider edge cases ==================== + + test "($provider_mode) Pause unknown provider returns error" { + catch {$primary DEBUG durability-provider-pause nonexistent} err + assert_match "*No such durability provider*" $err + } + + test "($provider_mode) Resume unknown provider returns error" { + catch {$primary DEBUG durability-provider-resume nonexistent} err + assert_match "*No such durability provider*" $err + } + + test "($provider_mode) Double pause is idempotent - writes still block" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Pause twice + $primary DEBUG durability-provider-pause aof + $primary DEBUG durability-provider-pause aof + + # Write should still block + set rd [valkey_deferring_client -1] + $rd set durable:double-pause-key val + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + # Resume once should unblock + $primary DEBUG durability-provider-resume aof + $primary ping + + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Resume while not paused is harmless" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + # Resume when not paused should succeed without issue + assert_equal "OK" [$primary DEBUG durability-provider-resume aof] + + # Writes should still work normally + set rd [valkey_deferring_client -1] + $rd set durable:resume-noop-key val + assert_equal "OK" [$rd read] + $rd close + } + + test "($provider_mode) Multiple writes while paused all unblock on resume" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd set durable:multi-write-1 val1 + $rd set durable:multi-write-2 val2 + $rd set durable:multi-write-3 val3 + + set fd 
[$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + unblock_with_provider + + # All three replies should come through + assert_equal "OK" [$rd read] + assert_equal "OK" [$rd read] + assert_equal "OK" [$rd read] + $rd close + } + + # ==================== Copy avoidance compatibility ==================== + + test "($provider_mode) Blocked reply not released when io_last_written.data_len exceeds encoded boundary" { + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + # Write a dirty key (fire-and-forget) so the next GET is blocked + set writer [valkey_deferring_client -1] + $writer client reply off + $writer set durable:inject-key value + + # Reader: issue GET on the dirty key — reply will be blocked + set rd [valkey_deferring_client -1] + + # Get the reader's client ID for the DEBUG command + $rd client id + set reader_id [$rd read] + + $rd get durable:inject-key + + # Give server time to process and block the reply + after 100 + + # Inject the post-partial-write state using DEBUG. + # This simulates what happens after IO threads write an + # encoded (copy-avoided) buffer: data_len (decoded RESP bytes) + # is much larger than bufpos (encoded buffer position). + # bufpos=0 means "buffer not fully consumed" (incomplete write). + # data_len=999999 is a large decoded byte count. + # + # The disallowed_byte_offset for the blocked reply is a small + # number in encoded-buffer coordinates (~30 bytes). + # + # Bug (data_len): 999999 < ~30 → false → "no pending" → + # server thinks all data sent, removes write handler, + # reply leaks through! + # Fix (bufpos): 0 < ~30 → true → "still pending" → + # reply stays blocked. 
+ $primary DEBUG set-io-last-written $reader_id 0 999999 + + # Check: the reply must NOT leak through + set fd [$rd channel] + fconfigure $fd -blocking 0 + after 200 + set early_reply [read $fd] + fconfigure $fd -blocking 1 + assert_equal "" $early_reply + + # Reset the injected state so normal writes work after unblock + $primary DEBUG set-io-last-written $reader_id 0 0 + + unblock_with_provider + + # Now the reply should come through + assert_equal "value" [$rd read] + $rd close + $writer close + } + + # ==================== Failover tests (must be last: changes roles) ==================== + + test "($provider_mode) Failover disconnects clients waiting for ack" { + # Ensure replica is in clean state for deterministic failover behavior. + # In replica mode, earlier tests connected the replica and replicated data; + # we flush it here so the demoted primary's dirty key tracking is preserved + # correctly after failover (not overwritten by a full sync). + $replica flushall + assert_equal "always" [lindex [$primary config get appendfsync] 1] + + pause_provider + + set rd [valkey_deferring_client -1] + $rd client setname durability-waiter + $rd read + $rd set durable:failover value + + set fd [$rd channel] + fconfigure $fd -blocking 0 + set early_reply [read $fd] + assert_equal "" $early_reply + fconfigure $fd -blocking 1 + + $primary replicaof $replica_host $replica_port + + catch {$rd read} err + assert_match {*I/O error*} $err + } + + } + } +} diff --git a/tests/durability/sync-replication.tcl b/tests/durability/sync-replication.tcl new file mode 100644 index 00000000000..e0a714049c1 --- /dev/null +++ b/tests/durability/sync-replication.tcl @@ -0,0 +1,495 @@ +# Tests for replication-based durability provider (sync replication). +# +# These tests validate the sync replication feature where writes are only +# considered committed once acknowledged by a configured number of sync +# replicas (ISR members). 
+# +# We use appendfsync=everysec so the AOF durability provider is DISABLED, +# isolating the replication provider behavior. The replication provider +# is enabled when min-sync-replicas > 0. + +# Helper: wait until the primary reports at least N sync replicas in the ISR +# by polling INFO durability for durability_sync_replicas. +proc wait_for_isr_count {primary count} { + wait_for_condition 100 100 { + [getInfoProperty [$primary info durability] durability_sync_replicas] >= $count + } else { + fail "Expected $count sync replicas in ISR but got [getInfoProperty [$primary info durability] durability_sync_replicas]" + } +} + +# Helper: return the current write_blocked_count from INFO durability. +proc get_write_blocked_count {primary} { + getInfoProperty [$primary info durability] durability_write_blocked_count +} + +# ========================================================================== +# Test 1: If number of sync replicas < min-sync-replicas, primary rejects +# writes with CLUSTERDOWN. 
+# ========================================================================== + +start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 2}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica1 [srv 0 client] + set replica1_host [srv 0 host] + set replica1_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica2 [srv 0 client] + set replica2_host [srv 0 host] + set replica2_port [srv 0 port] + + test "Sync replication: write rejected when ISR count < min-sync-replicas" { + # Connect only replica1 — ISR will have 1 member + $replica1 replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 1 + + # Write must be rejected — only 1 of 2 required sync replicas + catch {$primary set mykey myvalue} err + assert_match "*CLUSTERDOWN*" $err + + # Connect replica2 so ISR reaches 2 + $replica2 replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 2 + + # Write should succeed now + assert_equal "OK" [$primary set mykey myvalue] + } + } + } +} + +# ========================================================================== +# Test 2: Primary connected to 2 sync replicas. Write is released to client +# only when both sync replicas ack back. 
+# ========================================================================== + +start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 2}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica1 [srv 0 client] + set replica1_host [srv 0 host] + set replica1_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica2 [srv 0 client] + set replica2_host [srv 0 host] + set replica2_port [srv 0 port] + + test "Sync replication: write released only after both sync replicas ack" { + # Connect both replicas and let them sync + $replica1 replicaof $primary_host $primary_port + $replica2 replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 2 + + # Pause the replication provider so acks don't advance consensus + $primary DEBUG durability-provider-pause replication + + set blocked_before [get_write_blocked_count $primary] + + # Issue a write via deferring client — reply should be held + set rd [valkey_deferring_client -2] + $rd set mykey myvalue + + # Wait for the write to be blocked + wait_for_condition 50 100 { + [get_write_blocked_count $primary] > $blocked_before + } else { + fail "Write was not blocked by durability provider" + } + + # Resume the replication provider — replicas have already acked, + # so consensus advances and the reply is released + $primary DEBUG durability-provider-resume replication + $primary ping ;# force a beforeSleep cycle + + assert_equal "OK" [$rd read] + $rd close + } + } + } +} + +# ========================================================================== +# Test 3: Primary connected to 2 sync replicas. Write is blocked if one +# replica lags behind (paused with SIGSTOP). 
+# ========================================================================== + +start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 2}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica1 [srv 0 client] + set replica1_host [srv 0 host] + set replica1_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica2 [srv 0 client] + set replica2_host [srv 0 host] + set replica2_port [srv 0 port] + + test "Sync replication: write blocked when one replica lags behind" { + # Connect both replicas and let them sync + $replica1 replicaof $primary_host $primary_port + $replica2 replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 2 + + # Verify writes work when both replicas are healthy + assert_equal "OK" [$primary set healthy-key healthy-value] + + # Pause replica2 at the OS level (SIGSTOP) so it stops + # sending ACKs. Its ack offset is frozen, preventing + # consensus from advancing past new writes. + set replica2_pid [srv 0 pid] + pause_process $replica2_pid + + set blocked_before [get_write_blocked_count $primary] + + # Issue a write via deferring client — replica1 will ack + # but replica2 cannot, so min_offset stays behind. 
+ set rd [valkey_deferring_client -2] + $rd set blocked-key blocked-value + + # Verify the write was blocked + wait_for_condition 50 100 { + [get_write_blocked_count $primary] > $blocked_before + } else { + fail "First write was not blocked" + } + + set blocked_before2 [get_write_blocked_count $primary] + + # Issue another write — also blocked + $rd set blocked-key2 blocked-value2 + + wait_for_condition 50 100 { + [get_write_blocked_count $primary] > $blocked_before2 + } else { + fail "Second write was not blocked" + } + + # Resume replica2 — it catches up and acks, both writes unblock + resume_process $replica2_pid + + assert_equal "OK" [$rd read] + assert_equal "OK" [$rd read] + $rd close + } + } + } +} + +# ========================================================================== +# Test 4: Replica killed — writes rejected, replica restarts — writes resume. +# +# Primary with min-sync-replicas=1 and a single sync replica. The replica +# is killed (shutdown nosave). After repl-timeout the primary disconnects +# the dead replica, ISR drops to 0, and writes are rejected with +# CLUSTERDOWN. The replica is then restarted, rejoins the ISR, and writes +# are accepted again. +# ========================================================================== + +start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 1}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + + start_server {overrides {min-sync-replicas 1 sync-eligible yes}} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + + test "Sync replication: replica killed — writes rejected then resume after restart" { + # Use a short repl-timeout so the primary detects the dead + # replica quickly. 
+ $primary config set repl-timeout 3 + + $replica replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 1 + + # Writes should succeed with a healthy replica + assert_equal "OK" [$primary set key1 value1] + + # Kill the replica + catch {$replica shutdown nosave} + + # Wait for the primary to disconnect the dead replica + wait_for_condition 50 200 { + [s -1 connected_slaves] == 0 + } else { + fail "Primary did not disconnect the dead replica" + } + + # Writes must now be rejected — no replicas in ISR + catch {$primary set key2 value2} err + assert_match "*CLUSTERDOWN*" $err + + # Restart the replica — it reconnects and rejoins the ISR + restart_server 0 true false + + set replica [srv 0 client] + $replica replicaof $primary_host $primary_port + wait_replica_online $primary + wait_for_isr_count $primary 1 + + # Writes should succeed again + assert_equal "OK" [$primary set key3 value3] + } + } +} + +# ========================================================================== +# Test 5: Replica paused (SIGSTOP) — writes rejected after ISR timeout, +# replica resumed — writes accepted again. +# +# Primary with min-sync-replicas=1 and a single sync replica. The replica +# is paused with SIGSTOP. It stops sending ACKs, and after the ISR timeout +# (REPLICA_ISR_TIMEOUT = 10 s) the primary removes it from the ISR. +# With 0 ISR members, writes are rejected. Resuming the replica (SIGCONT) +# lets it catch up, rejoin the ISR, and writes succeed again. 
+# ==========================================================================
+
+start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 1}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+
+    start_server {overrides {min-sync-replicas 1 sync-eligible yes}} {
+        set replica [srv 0 client]
+        set replica_host [srv 0 host]
+        set replica_port [srv 0 port]
+        set replica_pid [srv 0 pid]
+
+        test "Sync replication: replica paused — writes rejected then resume after SIGCONT" {
+            # Use a repl-timeout longer than the ISR timeout so the
+            # replica is removed from the ISR but NOT disconnected.
+            # ISR timeout is 10 s, so set repl-timeout to 20 s.
+            $primary config set repl-timeout 20
+
+            $replica replicaof $primary_host $primary_port
+            wait_replica_online $primary
+            wait_for_isr_count $primary 1
+
+            # Writes should succeed with a healthy replica
+            assert_equal "OK" [$primary set key1 value1]
+
+            # Pause the replica — it stays connected but stops ACKing
+            pause_process $replica_pid
+
+            # Wait for the ISR timeout to remove the replica from ISR.
+            wait_for_condition 150 200 {
+                [getInfoProperty [$primary info durability] durability_sync_replicas] == 0
+            } else {
+                fail "Replica was not removed from ISR after timeout"
+            }
+
+            # The replica is still connected (not timed out by
+            # repl-timeout which is 20 s) but removed from ISR.
+            assert_equal 1 [s -1 connected_slaves]
+
+            # Writes must now be rejected — 0 ISR members
+            catch {$primary set key2 value2} err
+            assert_match "*CLUSTERDOWN*" $err
+
+            # Resume the replica — it catches up and rejoins the ISR
+            resume_process $replica_pid
+            wait_for_isr_count $primary 1
+
+            # Writes should succeed again
+            set rd [valkey_deferring_client -1]
+            $rd set key3 value3
+            assert_equal "OK" [$rd read]
+            $rd close
+        }
+    }
+}
+
+# ==========================================================================
+# Test 6: Consensus offset advances based on sync replica only, not
+# regular (non-sync) replicas.
+#
+# Primary with min-sync-replicas=1, one sync replica, and one regular
+# replica (sync-eligible=no). Writes succeed. The regular replica is
+# paused (SIGSTOP). Writes continue to succeed and the committed offset
+# advances, proving that consensus depends solely on the sync replica.
+# ==========================================================================
+
+start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 1}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+
+    # Sync replica
+    start_server {overrides {min-sync-replicas 1 sync-eligible yes}} {
+        set sync_replica [srv 0 client]
+        set sync_replica_host [srv 0 host]
+        set sync_replica_port [srv 0 port]
+
+        # Regular (non-sync) replica
+        start_server {overrides {min-sync-replicas 1 sync-eligible no}} {
+            set regular_replica [srv 0 client]
+            set regular_replica_host [srv 0 host]
+            set regular_replica_port [srv 0 port]
+            set regular_replica_pid [srv 0 pid]
+
+            test "Sync replication: committed offset advances based on sync replica, not regular replica" {
+                # Connect both replicas
+                $sync_replica replicaof $primary_host $primary_port
+                $regular_replica replicaof $primary_host $primary_port
+
+                # Wait for both replicas to come online (each regexp is confined to one INFO line; a glob "*" would span lines)
+                wait_for_condition 50 100 {
+                    [regexp {slave0:[^\r\n]*state=online} [$primary info replication]] &&
+                    [regexp {slave1:[^\r\n]*state=online} [$primary info replication]]
+                } else {
+                    fail "Replicas did not come online"
+                }
+
+                # Wait for sync replica to join ISR
+                wait_for_isr_count $primary 1
+
+                # Verify writes succeed with both replicas healthy
+                assert_equal "OK" [$primary set key1 value1]
+
+                # Record the committed offset after the first write.
+                set offset_before [getInfoProperty [$primary info durability] durability_committed_offset]
+                assert {$offset_before > 0}
+
+                # Pause the regular (non-sync) replica
+                pause_process $regular_replica_pid
+
+                # Issue more writes — they should succeed because the
+                # sync replica is still healthy and ACKing.
+                assert_equal "OK" [$primary set key2 value2]
+                assert_equal "OK" [$primary set key3 value3]
+
+                # Verify the committed offset has advanced, proving
+                # consensus is driven by the sync replica alone.
+                set offset_after [getInfoProperty [$primary info durability] durability_committed_offset]
+                assert {$offset_after > $offset_before}
+
+                # Verify the primary still sees 2 connected replicas
+                # (the regular one is paused but TCP connection is alive)
+                assert_equal 2 [status $primary connected_slaves]
+
+                # Resume the regular replica
+                resume_process $regular_replica_pid
+
+                # One more write to confirm everything is still healthy
+                assert_equal "OK" [$primary set key4 value4]
+
+                set offset_final [getInfoProperty [$primary info durability] durability_committed_offset]
+                assert {$offset_final > $offset_after}
+            }
+        }
+    }
+}
+
+# ==========================================================================
+# Test 7: [WBL] Replica blocks reads on uncommitted keys until REPLCONF COMMIT
+# arrives from the primary.
+#
+# The primary's replication provider is paused so the committed offset
+# stops advancing. A write on the primary replicates to the replica but
+# the committed offset doesn't move. A client connected to the replica
+# reads the key — the read is blocked because the key is dirty
+# (uncommitted). When the provider is resumed, the primary sends
+# REPLCONF COMMIT with the new offset, the replica unblocks the read.
+# ==========================================================================
+
+start_server {tags {"repl durability external:skip"} overrides {appendonly yes appendfsync everysec min-sync-replicas 2}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+
+    start_server {overrides {min-sync-replicas 1 sync-eligible yes}} {
+        set replica1 [srv 0 client]
+        set replica1_host [srv 0 host]
+        set replica1_port [srv 0 port]
+
+        start_server {overrides {min-sync-replicas 1 sync-eligible yes}} {
+            set replica2 [srv 0 client]
+            set replica2_host [srv 0 host]
+            set replica2_port [srv 0 port]
+
+            test "Sync replication: replica blocks read on uncommitted key until REPLCONF COMMIT" {
+                # Connect both replicas and wait for ISR
+                $replica1 replicaof $primary_host $primary_port
+                $replica2 replicaof $primary_host $primary_port
+                wait_replica_online $primary
+                wait_for_isr_count $primary 2
+
+                # Verify the system is healthy — a write succeeds end-to-end
+                assert_equal "OK" [$primary set committed-key committed-value]
+
+                # Verify the replica can read the committed key
+                wait_for_condition 50 100 {
+                    [$replica1 get committed-key] eq "committed-value"
+                } else {
+                    fail "Committed key did not replicate to replica"
+                }
+
+                # Pause the replication provider on the primary.
+                # This freezes the committed offset — new writes will
+                # replicate to replicas but REPLCONF COMMIT won't advance.
+                $primary DEBUG durability-provider-pause replication
+
+                # Write a key on the primary via a deferring client.
+                # We don't read the reply — it will be blocked by the
+                # paused provider, but we don't care about it.
+                # The write replicates to replicas via the replication stream.
+                set writer [valkey_deferring_client -2]
+                $writer set uncommitted-key uncommitted-value
+
+                # Wait for the write to replicate to the replica
+                wait_for_condition 50 100 {
+                    [getInfoProperty [$replica1 info durability] durability_uncommitted_keys] > 0
+                } else {
+                    fail "Key was not tracked as uncommitted on replica"
+                }
+
+                # Now a client connected to replica1 reads the key.
+                # The key exists on the replica (it was replicated) but
+                # is uncommitted (committed offset hasn't advanced).
+                # The read should be blocked.
+                set replica_blocked_before [getInfoProperty [$replica1 info durability] durability_clients_waiting_ack]
+
+                set reader [valkey_deferring_client -1]
+                $reader get uncommitted-key
+
+                # Verify the read is blocked on the replica
+                wait_for_condition 50 100 {
+                    [getInfoProperty [$replica1 info durability] durability_clients_waiting_ack] > $replica_blocked_before
+                } else {
+                    fail "Read on uncommitted key was not blocked on replica"
+                }
+
+                # Resume the replication provider on the primary.
+                # The committed offset advances, the primary sends
+                # REPLCONF COMMIT, and the replica unblocks the read.
+                $primary DEBUG durability-provider-resume replication
+                $primary ping ;# force beforeSleep cycle
+
+                # The reader should now get the value
+                assert_equal "uncommitted-value" [$reader read]
+
+                $reader close
+                $writer close
+            }
+        }
+    }
+}