From 884df7ab6b5b5b1bc2177f9068de01694ed4670e Mon Sep 17 00:00:00 2001
From: gality369 <gality369@example.com>
Date: Thu, 7 May 2026 14:54:31 +0800
Subject: [PATCH] vdev_disk: avoid vd_lock lockdep cycle during reopen

On Linux, fault-driven reads can reach vdev_disk_io_start() while
holding mapping.invalidate_lock and then acquire vd_lock.

At the same time, vdev_disk_open() held vd_lock across
bdev_file_open_by_path()/bdev_open_by_path(). That pulls vd_lock
into the block open path and into the mmap_lock ->
mapping.invalidate_lock dependency chain.

This is a real lock ordering problem, not just a lockdep false
positive. A reopen can hold vd_lock while the page fault path
already holds mapping.invalidate_lock and waits for vd_lock.

Fix this by limiting vd_lock to short vd_bdh state transitions and
moving the slow block-device open/release work outside the lock.
Add a small reopen wait state so I/O waits for a transient reopen
window instead of treating vd_bdh == NULL as a permanent failure.
Also use the submission-time block device for EIO revalidation so
reopen does not race with media checks.

Signed-off-by: ZhengYuan Huang <gality369@gmail.com>
---
 module/os/linux/zfs/vdev_disk.c | 177 +++++++++++++++++++++++++-------
 1 file changed, 139 insertions(+), 38 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 66e10584ab5e..7993631bc2c2 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -74,6 +74,9 @@ typedef void zfs_bdev_handle_t;
 typedef struct vdev_disk {
 	zfs_bdev_handle_t		*vd_bdh;
 	krwlock_t			vd_lock;
+	kmutex_t			vd_open_lock;
+	kcondvar_t			vd_open_cv;
+	boolean_t			vd_opening;
 } vdev_disk_t;
 
 /*
@@ -238,6 +241,21 @@ vdev_disk_error(zio_t *zio)
 	    zio->io_flags);
 }
 
+static void
+vdev_disk_handle_eio(zio_t *zio, struct block_device *bdev)
+{
+	vdev_t *v = zio->io_vd;
+
+	if (zio->io_error != EIO || bdev == NULL)
+		return;
+
+	if (!zfs_check_disk_status(bdev)) {
+		invalidate_bdev(bdev);
+		v->vdev_remove_wanted = B_TRUE;
+		spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+	}
+}
+
 static void
 vdev_disk_kobj_evt_post(vdev_t *v)
 {
@@ -280,6 +298,36 @@ vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
 #endif
 }
 
+static void
+vdev_disk_set_opening(vdev_disk_t *vd, boolean_t opening)
+{
+	mutex_enter(&vd->vd_open_lock);
+	vd->vd_opening = opening;
+	if (!opening)
+		cv_broadcast(&vd->vd_open_cv);
+	mutex_exit(&vd->vd_open_lock);
+}
+
+static boolean_t
+vdev_disk_wait_open_done(vdev_disk_t *vd)
+{
+	boolean_t waited = B_FALSE;
+
+	/*
+	 * Return B_TRUE when a reopen was observed and waited for, so the
+	 * caller can retry vd_bdh.  B_FALSE means vd_bdh was already NULL
+	 * without an active reopen, which is a terminal closed-device state.
+	 */
+	mutex_enter(&vd->vd_open_lock);
+	while (vd->vd_opening) {
+		waited = B_TRUE;
+		cv_wait(&vd->vd_open_cv, &vd->vd_open_lock);
+	}
+	mutex_exit(&vd->vd_open_lock);
+
+	return (waited);
+}
+
 static int
 vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
     uint64_t *logical_ashift, uint64_t *physical_ashift)
@@ -288,6 +336,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	spa_mode_t smode = spa_mode(v->vdev_spa);
 	hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
 	vdev_disk_t *vd;
+	boolean_t reopening;
 
 	/* Must have a pathname and it must be absolute. */
 	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
@@ -305,13 +354,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 	 * open retry timeout before reporting the device as unavailable.
 	 */
 	vd = v->vdev_tsd;
-	if (vd) {
+	reopening = (vd != NULL);
+	if (reopening) {
 		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
 		boolean_t reread_part = B_FALSE;
 
 		rw_enter(&vd->vd_lock, RW_WRITER);
+		vdev_disk_set_opening(vd, B_TRUE);
 		bdh = vd->vd_bdh;
 		vd->vd_bdh = NULL;
+		rw_exit(&vd->vd_lock);
 
 		if (bdh) {
 			struct block_device *bdev = BDH_BDEV(bdh);
@@ -356,7 +408,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
 
 		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
-		rw_enter(&vd->vd_lock, RW_WRITER);
+		mutex_init(&vd->vd_open_lock, NULL, MUTEX_DEFAULT, NULL);
+		cv_init(&vd->vd_open_cv, NULL, CV_DEFAULT, NULL);
 	}
 
 	/*
@@ -414,17 +467,23 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
 		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
 		    (u_longlong_t)(gethrtime() - start),
 		    (u_longlong_t)timeout);
+		rw_enter(&vd->vd_lock, RW_WRITER);
 		vd->vd_bdh = NULL;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
+		if (reopening)
+			vdev_disk_set_opening(vd, B_FALSE);
 		return (SET_ERROR(error));
 	} else {
+		rw_enter(&vd->vd_lock, RW_WRITER);
 		vd->vd_bdh = bdh;
 		v->vdev_tsd = vd;
 		rw_exit(&vd->vd_lock);
+		if (reopening)
+			vdev_disk_set_opening(vd, B_FALSE);
 	}
 
-	struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
+	struct block_device *bdev = BDH_BDEV(bdh);
 
 	/*  Determine the physical block size */
 	int physical_block_size = bdev_physical_block_size(bdev);
@@ -487,6 +546,8 @@ vdev_disk_close(vdev_t *v)
 	v->vdev_tsd = NULL;
 
 	rw_exit(&vd->vd_lock);
+	cv_destroy(&vd->vd_open_cv);
+	mutex_destroy(&vd->vd_open_lock);
 	rw_destroy(&vd->vd_lock);
 	kmem_free(vd, sizeof (vdev_disk_t));
 }
@@ -680,6 +741,36 @@ typedef struct {
 	int		vbio_flags;	/* bio flags */
 } vbio_t;
 
+typedef struct {
+	struct block_device	*vdb_bdev;
+} vdev_disk_io_bdev_t;
+
+static inline void
+vdev_disk_io_set_bdev(zio_t *zio, struct block_device *bdev)
+{
+	vdev_disk_io_bdev_t *io_bdev;
+
+	ASSERT3P(zio->io_bio, ==, NULL);
+
+	io_bdev = kmem_zalloc(sizeof (*io_bdev), KM_SLEEP);
+	io_bdev->vdb_bdev = bdev;
+	zio->io_bio = io_bdev;
+}
+
+static inline struct block_device *
+vdev_disk_io_clear_bdev(zio_t *zio)
+{
+	vdev_disk_io_bdev_t *io_bdev = zio->io_bio;
+	struct block_device *bdev;
+
+	ASSERT3P(io_bdev, !=, NULL);
+
+	bdev = io_bdev->vdb_bdev;
+	zio->io_bio = NULL;
+	kmem_free(io_bdev, sizeof (*io_bdev));
+	return (bdev);
+}
+
 static vbio_t *
 vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
 {
@@ -1006,6 +1097,7 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
 	if (unlikely(bio == NULL))
 		return (SET_ERROR(ENOMEM));
 
+	vdev_disk_io_set_bdev(zio, bdev);
 	bio->bi_end_io = vdev_disk_io_flush_completion;
 	bio->bi_private = zio;
 	bio_set_flush(bio);
@@ -1092,9 +1184,12 @@ vdev_disk_io_trim(zio_t *zio)
 	struct bio *bio;
 
 	zfs_bdev_handle_t *bdh = ((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh;
+	struct block_device *bdev = BDH_BDEV(bdh);
 	sector_t sector = zio->io_offset >> 9;
 	sector_t nsects = zio->io_size >> 9;
 
+	vdev_disk_io_set_bdev(zio, bdev);
+
 	if (zio->io_trim_flags & ZIO_TRIM_SECURE)
 		error = vdev_bdev_issue_secure_erase(bdh, sector, nsects, &bio);
 	else
@@ -1134,22 +1229,27 @@ vdev_disk_io_start(zio_t *zio)
 	 * Nothing to be done here but return failure.
 	 */
 	if (vd == NULL) {
-		zio->io_error = ENXIO;
+		zio->io_error = SET_ERROR(ENXIO);
 		zio_interrupt(zio);
 		return;
 	}
 
-	rw_enter(&vd->vd_lock, RW_READER);
-
-	/*
-	 * If the vdev is closed, it's likely due to a failed reopen and is
-	 * in the UNAVAIL state.  Nothing to be done here but return failure.
-	 */
-	if (vd->vd_bdh == NULL) {
+	for (;;) {
+		rw_enter(&vd->vd_lock, RW_READER);
+		if (vd->vd_bdh != NULL)
+			break;
 		rw_exit(&vd->vd_lock);
-		zio->io_error = ENXIO;
-		zio_interrupt(zio);
-		return;
+
+		/*
+		 * A reopen temporarily clears vd_bdh while the block layer open
+		 * runs outside vd_lock to avoid exporting vd_lock ordering into
+		 * unrelated kernel locks.
+		 */
+		if (!vdev_disk_wait_open_done(vd)) {
+			zio->io_error = SET_ERROR(ENXIO);
+			zio_interrupt(zio);
+			return;
+		}
 	}
 
 	switch (zio->io_type) {
@@ -1224,26 +1324,34 @@ vdev_disk_io_start(zio_t *zio)
 static void
 vdev_disk_io_done(zio_t *zio)
 {
-	/* If this was a read or write, we need to clean up the vbio */
+	/* Clean up any backend state attached while issuing the IO. */
+	struct block_device *io_bdev = NULL;
 	if (zio->io_bio != NULL) {
-		vbio_t *vbio = zio->io_bio;
-		zio->io_bio = NULL;
+		if (zio->io_type == ZIO_TYPE_READ ||
+		    zio->io_type == ZIO_TYPE_WRITE) {
+			vbio_t *vbio = zio->io_bio;
+			zio->io_bio = NULL;
+			io_bdev = vbio->vbio_bdev;
 
-		/*
-		 * If we copied the ABD before issuing it, clean up and return
-		 * the copy to the ADB, with changes if appropriate.
-		 */
-		if (vbio->vbio_abd != NULL) {
-			if (zio->io_type == ZIO_TYPE_READ)
-				abd_copy(zio->io_abd, vbio->vbio_abd,
-				    zio->io_size);
+			/*
+			 * If we copied the ABD before issuing it, clean up and
+			 * return the copy to the ADB, with changes if
+			 * appropriate.
+			 */
+			if (vbio->vbio_abd != NULL) {
+				if (zio->io_type == ZIO_TYPE_READ)
+					abd_copy(zio->io_abd, vbio->vbio_abd,
+					    zio->io_size);
 
-			abd_free(vbio->vbio_abd);
-			vbio->vbio_abd = NULL;
-		}
+				abd_free(vbio->vbio_abd);
+				vbio->vbio_abd = NULL;
+			}
 
-		/* Final cleanup */
-		kmem_free(vbio, sizeof (vbio_t));
+			/* Final cleanup */
+			kmem_free(vbio, sizeof (vbio_t));
+		} else {
+			io_bdev = vdev_disk_io_clear_bdev(zio);
+		}
 	}
 
 	/*
@@ -1252,14 +1360,7 @@ vdev_disk_io_done(zio_t *zio)
 	 * removal of the device from the configuration.
 	 */
 	if (zio->io_error == EIO) {
-		vdev_t *v = zio->io_vd;
-		vdev_disk_t *vd = v->vdev_tsd;
-
-		if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) {
-			invalidate_bdev(BDH_BDEV(vd->vd_bdh));
-			v->vdev_remove_wanted = B_TRUE;
-			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
-		}
+		vdev_disk_handle_eio(zio, io_bdev);
 	}
 }