From 73b973b396e7c15424e093b8ee2c32339c300bd9 Mon Sep 17 00:00:00 2001 From: Migel Imeri Date: Tue, 7 Apr 2026 08:19:02 -0600 Subject: [PATCH] Calling thread IO Adds a module parameter that allows the submitting thread to wait for bios to complete, along with a flag that tracks whether a zio has bypassed the vdev queue. The motivation behind this change is performance: the intention is to reduce the overhead of switching between threads between the time a bio is submitted and the time its completion callback executes. Currently, only zios that have bypassed the queue are allowed to wait for bio completion. This is mainly because any performance uplift from staying in the same thread is overshadowed by the vdev queue lock. Signed-off-by: Migel Imeri --- include/sys/zio.h | 29 +++++++++--------- man/man4/zfs.4 | 8 +++++ man/man8/zpool-events.8 | 29 +++++++++--------- module/os/linux/zfs/vdev_disk.c | 52 +++++++++++++++++++++++++++++---- module/zfs/vdev_queue.c | 1 + 5 files changed, 87 insertions(+), 32 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index c3a199ce813c..4e7f81fa35cc 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -225,25 +225,26 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_DIO_READ (1ULL << 19) +#define ZIO_FLAG_BYPASSED_QUEUE (1ULL << 20) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children.
*/ -#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ -#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) -#define ZIO_FLAG_IO_BYPASS (1ULL << 22) -#define ZIO_FLAG_IO_REWRITE (1ULL << 23) -#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) -#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) -#define ZIO_FLAG_GANG_CHILD (1ULL << 26) -#define ZIO_FLAG_DDT_CHILD (1ULL << 27) -#define ZIO_FLAG_GODFATHER (1ULL << 28) -#define ZIO_FLAG_NOPWRITE (1ULL << 29) -#define ZIO_FLAG_REEXECUTED (1ULL << 30) -#define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_PREALLOCATED (1ULL << 32) -#define ZIO_FLAG_POSTREAD (1ULL << 33) +#define ZIO_FLAG_DONT_QUEUE (1ULL << 21) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 22) +#define ZIO_FLAG_IO_BYPASS (1ULL << 23) +#define ZIO_FLAG_IO_REWRITE (1ULL << 24) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 25) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 26) +#define ZIO_FLAG_GANG_CHILD (1ULL << 27) +#define ZIO_FLAG_DDT_CHILD (1ULL << 28) +#define ZIO_FLAG_GODFATHER (1ULL << 29) +#define ZIO_FLAG_NOPWRITE (1ULL << 30) +#define ZIO_FLAG_REEXECUTED (1ULL << 31) +#define ZIO_FLAG_DELEGATED (1ULL << 32) +#define ZIO_FLAG_PREALLOCATED (1ULL << 33) +#define ZIO_FLAG_POSTREAD (1ULL << 34) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 11b6c622f8ec..c9cca83d01b6 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1701,6 +1701,14 @@ itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. . +.It Sy zfs_vdev_disk_calling_thread_io Ns = Ns Sy 1 Ns | Ns 0 Pq uint +Enables calling-thread I/O, in which the thread that submits a bio waits +for it to complete. +A zio only waits if it also bypassed the vdev queue; this parameter +merely enables that capability. +May improve performance if the backing vdev devices are fast and low latency.
+This parameter currently only applies on Linux. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 12a110580729..d9cc1634a555 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -517,21 +517,24 @@ ZIO_FLAG_IO_RETRY:0x00008000 ZIO_FLAG_PROBE:0x00010000 ZIO_FLAG_TRYHARD:0x00020000 ZIO_FLAG_OPTIONAL:0x00040000 +ZIO_FLAG_DIO_READ:0x00080000 +ZIO_FLAG_BYPASSED_QUEUE:0x00100000 -ZIO_FLAG_DONT_QUEUE:0x00080000 -ZIO_FLAG_DONT_PROPAGATE:0x00100000 -ZIO_FLAG_IO_BYPASS:0x00200000 -ZIO_FLAG_IO_REWRITE:0x00400000 -ZIO_FLAG_RAW_COMPRESS:0x00800000 -ZIO_FLAG_RAW_ENCRYPT:0x01000000 +ZIO_FLAG_DONT_QUEUE:0x00200000 +ZIO_FLAG_DONT_PROPAGATE:0x00400000 +ZIO_FLAG_IO_BYPASS:0x00800000 +ZIO_FLAG_IO_REWRITE:0x01000000 +ZIO_FLAG_RAW_COMPRESS:0x02000000 +ZIO_FLAG_RAW_ENCRYPT:0x04000000 -ZIO_FLAG_GANG_CHILD:0x02000000 -ZIO_FLAG_DDT_CHILD:0x04000000 -ZIO_FLAG_GODFATHER:0x08000000 -ZIO_FLAG_NOPWRITE:0x10000000 -ZIO_FLAG_REEXECUTED:0x20000000 -ZIO_FLAG_DELEGATED:0x40000000 -ZIO_FLAG_FASTWRITE:0x80000000 +ZIO_FLAG_GANG_CHILD:0x08000000 +ZIO_FLAG_DDT_CHILD:0x10000000 +ZIO_FLAG_GODFATHER:0x20000000 +ZIO_FLAG_NOPWRITE:0x40000000 +ZIO_FLAG_REEXECUTED:0x80000000 +ZIO_FLAG_DELEGATED:0x100000000 +ZIO_FLAG_PREALLOCATED:0x200000000 +ZIO_FLAG_POSTREAD:0x400000000 .TE . .Sh I/O TYPES diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 7cc19fe5afb7..130983ea02b7 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -106,6 +106,14 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; static unsigned int zfs_vdev_failfast_mask = 1; +/* + * Whether we wait for bio to complete. Also requires that + * zio has bypassed the vdev queue. May lead to performance + * improvements when backing vdev devices are fast and low + * latency. 
+ */ +static unsigned int zfs_vdev_disk_calling_thread_io = 1; + /* * Convert SPA mode flags into bdev open mode flags. */ @@ -604,6 +612,15 @@ vdev_submit_bio(struct bio *bio) current->bio_list = bio_list; } +static inline void +vdev_submit_bio_wait(struct bio *bio) +{ + struct bio_list *bio_list = current->bio_list; + current->bio_list = NULL; + (void) submit_bio_wait(bio); + current->bio_list = bio_list; +} + static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) @@ -678,6 +695,7 @@ typedef struct { struct bio *vbio_bio; /* pointer to the current bio */ int vbio_flags; /* bio flags */ + boolean_t vbio_wait; /* wait for completion */ } vbio_t; static vbio_t * @@ -694,6 +712,7 @@ vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) vbio->vbio_offset = zio->io_offset; vbio->vbio_bio = NULL; vbio->vbio_flags = flags; + vbio->vbio_wait = B_FALSE; return (vbio); } @@ -779,16 +798,20 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); ASSERT(vbio->vbio_bio); - vbio->vbio_bio->bi_end_io = vbio_completion; - vbio->vbio_bio->bi_private = vbio; - /* * Once submitted, vbio_bio now owns vbio (through bi_private) and we * can't touch it again. The bio may complete and vbio_completion() be * called and free the vbio before this task is run again, so we must * consider it invalid from this point. */ - vdev_submit_bio(vbio->vbio_bio); + + if (vbio->vbio_wait) { + vdev_submit_bio_wait(vbio->vbio_bio); + } else { + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; + vdev_submit_bio(vbio->vbio_bio); + } blk_finish_plug(&plug); } @@ -820,7 +843,12 @@ vbio_completion(struct bio *bio) ASSERT0P(zio->io_bio); zio->io_bio = vbio; - zio_delay_interrupt(zio); + /* Using calling thread io, don't dispatch zio. 
*/ + if (vbio->vbio_wait) + zio_execute(zio); + else + zio_delay_interrupt(zio); + } /* @@ -978,8 +1006,19 @@ vdev_disk_io_rw(zio_t *zio) if (abd != zio->io_abd) vbio->vbio_abd = abd; + boolean_t bio_wait = B_FALSE; + if (zfs_vdev_disk_calling_thread_io && + (zio->io_flags & ZIO_FLAG_BYPASSED_QUEUE)) { + vbio->vbio_wait = bio_wait = B_TRUE; + } /* Fill it with data pages and submit it to the kernel */ vbio_submit(vbio, abd, zio->io_size); + + if (bio_wait) { + vbio->vbio_bio->bi_private = vbio; + vbio_completion(vbio->vbio_bio); + } + return (0); } @@ -1370,3 +1409,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, calling_thread_io, UINT, + ZMOD_RW, "Enable calling thread io"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 43e5f15934ac..77e82d0ce449 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -956,6 +956,7 @@ vdev_queue_io(zio_t *zio) if (!vdev_should_queue_io(zio)) { zio->io_queue_state = ZIO_QS_NONE; + zio->io_flags |= ZIO_FLAG_BYPASSED_QUEUE; return (zio); }