diff --git a/changelog.d/19826.bugfix b/changelog.d/19826.bugfix new file mode 100644 index 00000000000..771b197b205 --- /dev/null +++ b/changelog.d/19826.bugfix @@ -0,0 +1 @@ +Lock Sliding Sync connections when inserting lazy members, to prevent repeated deadlocks. \ No newline at end of file diff --git a/synapse/storage/databases/main/sliding_sync.py b/synapse/storage/databases/main/sliding_sync.py index 9a09c0f9b54..d0943002b6d 100644 --- a/synapse/storage/databases/main/sliding_sync.py +++ b/synapse/storage/databases/main/sliding_sync.py @@ -186,15 +186,35 @@ def persist_per_connection_state_txn( # First we fetch (or create) the connection key associated with the # previous connection position. if previous_connection_position is not None: + lock_clause = "" + if isinstance(self.database_engine, PostgresEngine): + # Lock the sliding sync connection row for update upfront, + # to prevent deadlocks between concurrent transactions + # (which can retry again and again without making progress). + # + # (We don't need to explicitly lock in the other branch, + # where we re-create the connection, as that implies a lock + # anyway) + # + # Specifically, the statements seen to deadlock against + # each other were + # `INSERT INTO sliding_sync_connection_lazy_members` + # with conflicting tuples on + # "sliding_sync_connection_lazy_members_idx" UNIQUE, btree + # (connection_key, room_id, user_id) + # https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-ROWS + lock_clause = "FOR NO KEY UPDATE OF sliding_sync_connections" + # The `previous_connection_position` is a user-supplied value, so we # need to make sure that the one they supplied is actually theirs. - sql = """ + sql = f""" SELECT connection_key FROM sliding_sync_connection_positions INNER JOIN sliding_sync_connections USING (connection_key) WHERE connection_position = ? AND user_id = ? AND effective_device_id = ? AND conn_id = ? + {lock_clause} """ txn.execute( sql, (previous_connection_position, user_id, device_id, conn_id)