From 0d1c5099e55721d798283ddc162949ccc6c4e35c Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Fri, 5 Jun 2026 03:47:26 +0100 Subject: [PATCH 1/4] Lock sliding sync connections when inserting lazy members to prevent deadlocks --- .../storage/databases/main/sliding_sync.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/synapse/storage/databases/main/sliding_sync.py b/synapse/storage/databases/main/sliding_sync.py index 9a09c0f9b54..55c360cbf44 100644 --- a/synapse/storage/databases/main/sliding_sync.py +++ b/synapse/storage/databases/main/sliding_sync.py @@ -673,6 +673,29 @@ def _persist_sliding_sync_connection_lazy_members_txn( now = self.clock.time_msec() + if isinstance(self.database_engine, PostgresEngine): + # If the update is large and there are multiple overlapping + # updates going on, + # multiple transactions can try to INSERT INTO sliding_sync_connection_lazy_members + # with conflicting tuples on + # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree + # (connection_key, room_id, user_id) + # This causes a deadlock and eventually a DeadlockDetected error. + # The transactions get tried again and again and do not make progress. + # To avoid this, take a lock on the connection upfront. + # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS + # (We could also consider sorting our insertions, but not clear if Postgres + # guarantees to preservee the insertion order) + txn.execute( + """ + SELECT 1 + FROM sliding_sync_connections + WHERE connection_key = ? + FOR NO KEY UPDATE + """, + (connection_key,), + ) + # Figure out which cache entries to add or update. # # These are either a) new entries we've never sent before (i.e. with a From b1c313e573101379960bdcdbbccb12d4f407a232 Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Fri, 5 Jun 2026 03:56:14 +0100 Subject: [PATCH 2/4] Newsfile Signed-off-by: Olivier 'reivilibre --- changelog.d/19826.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/19826.bugfix diff --git a/changelog.d/19826.bugfix b/changelog.d/19826.bugfix new file mode 100644 index 00000000000..771b197b205 --- /dev/null +++ b/changelog.d/19826.bugfix @@ -0,0 +1 @@ +Lock Sliding Sync connections when inserting lazy members, to prevent repeated deadlocks. \ No newline at end of file From 57a4d7cc81bec94af7589af6477aea1f23b4cd0e Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Fri, 5 Jun 2026 14:10:57 +0100 Subject: [PATCH 3/4] Hoist lock to entrypoint of outer txn --- .../storage/databases/main/sliding_sync.py | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/synapse/storage/databases/main/sliding_sync.py b/synapse/storage/databases/main/sliding_sync.py index 55c360cbf44..b959169b9ed 100644 --- a/synapse/storage/databases/main/sliding_sync.py +++ b/synapse/storage/databases/main/sliding_sync.py @@ -204,6 +204,32 @@ def persist_per_connection_state_txn( raise SlidingSyncUnknownPosition() (connection_key,) = row + + if isinstance(self.database_engine, PostgresEngine): + # Lock the sliding sync connection row for update upfront, + # to prevent deadlocks between concurrent transactions + # (which can retry again and again without making progress). + # + # (We don't need to explicitly lock in the other branch, + # where we re-create the connection, as that implies a lock + # anyway) + # + # Specifically, the statements seen to deadlock against + # each other were + # `INSERT INTO sliding_sync_connection_lazy_members` + # with conflicting tuples on + # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree + # (connection_key, room_id, user_id) + # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS + txn.execute( + """ + SELECT 1 + FROM sliding_sync_connections + WHERE connection_key = ? + FOR NO KEY UPDATE + """, + (connection_key,), + ) else: # We're restarting the connection, so we clear the previous existing data we # used to track it. We do this here to ensure that if we get lots of @@ -673,29 +699,6 @@ def _persist_sliding_sync_connection_lazy_members_txn( now = self.clock.time_msec() - if isinstance(self.database_engine, PostgresEngine): - # If the update is large and there are multiple overlapping - # updates going on, - # multiple transactions can try to INSERT INTO sliding_sync_connection_lazy_members - # with conflicting tuples on - # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree - # (connection_key, room_id, user_id) - # This causes a deadlock and eventually a DeadlockDetected error. - # The transactions get tried again and again and do not make progress. - # To avoid this, take a lock on the connection upfront. - # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS - # (We could also consider sorting our insertions, but not clear if Postgres - # guarantees to preservee the insertion order) - txn.execute( - """ - SELECT 1 - FROM sliding_sync_connections - WHERE connection_key = ? - FOR NO KEY UPDATE - """, - (connection_key,), - ) - # Figure out which cache entries to add or update. # # These are either a) new entries we've never sent before (i.e. with a From 4374b7b9a63b8f32771670b961c30fba05f9203c Mon Sep 17 00:00:00 2001 From: Olivier 'reivilibre Date: Mon, 15 Jun 2026 13:49:03 +0100 Subject: [PATCH 4/4] Put lock clause in initial query --- .../storage/databases/main/sliding_sync.py | 48 ++++++++----------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/synapse/storage/databases/main/sliding_sync.py b/synapse/storage/databases/main/sliding_sync.py index b959169b9ed..d0943002b6d 100644 --- a/synapse/storage/databases/main/sliding_sync.py +++ b/synapse/storage/databases/main/sliding_sync.py @@ -186,15 +186,35 @@ def persist_per_connection_state_txn( # First we fetch (or create) the connection key associated with the # previous connection position. if previous_connection_position is not None: + lock_clause = "" + if isinstance(self.database_engine, PostgresEngine): + # Lock the sliding sync connection row for update upfront, + # to prevent deadlocks between concurrent transactions + # (which can retry again and again without making progress). + # + # (We don't need to explicitly lock in the other branch, + # where we re-create the connection, as that implies a lock + # anyway) + # + # Specifically, the statements seen to deadlock against + # each other were + # `INSERT INTO sliding_sync_connection_lazy_members` + # with conflicting tuples on + # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree + # (connection_key, room_id, user_id) + # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS + lock_clause = "FOR NO KEY UPDATE OF sliding_sync_connections" + # The `previous_connection_position` is a user-supplied value, so we # need to make sure that the one they supplied is actually theirs. - sql = """ + sql = f""" SELECT connection_key FROM sliding_sync_connection_positions INNER JOIN sliding_sync_connections USING (connection_key) WHERE connection_position = ? AND user_id = ? AND effective_device_id = ? AND conn_id = ? + {lock_clause} """ txn.execute( sql, (previous_connection_position, user_id, device_id, conn_id) @@ -204,32 +224,6 @@ def persist_per_connection_state_txn( raise SlidingSyncUnknownPosition() (connection_key,) = row - - if isinstance(self.database_engine, PostgresEngine): - # Lock the sliding sync connection row for update upfront, - # to prevent deadlocks between concurrent transactions - # (which can retry again and again without making progress). - # - # (We don't need to explicitly lock in the other branch, - # where we re-create the connection, as that implies a lock - # anyway) - # - # Specifically, the statements seen to deadlock against - # each other were - # `INSERT INTO sliding_sync_connection_lazy_members` - # with conflicting tuples on - # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree - # (connection_key, room_id, user_id) - # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS - txn.execute( - """ - SELECT 1 - FROM sliding_sync_connections - WHERE connection_key = ? - FOR NO KEY UPDATE - """, - (connection_key,), - ) else: # We're restarting the connection, so we clear the previous existing data we # used to track it. We do this here to ensure that if we get lots of