From 24c546debd24252fef84e988cb229ce41eebbdc3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 11 Feb 2026 11:20:04 -0500 Subject: [PATCH 1/3] NTB: ntb_transport: Preallocate memory windows Original Linux code allocated memory on every link up and freed on every link down. It may be a problem to allocate several MBs of physically contiguous memory on a running system. To work around that, speculatively pre-allocate it on boot and then reallocate only if the remote host requests different parameters, which should be very rare. (cherry picked from commit 0852db2813974ea5fd8485907f601bd061f32964) --- drivers/ntb/ntb_transport.c | 50 +++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index c4aeeefb3305..f7a2d6bd1c14 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -73,7 +73,7 @@ MODULE_VERSION(NTB_TRANSPORT_VER); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Intel Corporation"); -static unsigned long max_mw_size; +static unsigned long max_mw_size = 256 * 1024 * 1024; module_param(max_mw_size, ulong, 0644); MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows"); @@ -927,8 +927,8 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw, xlat_size = round_up(size, xlat_align_size); buff_size = round_up(size, xlat_align); - /* No need to re-setup */ - if (mw->xlat_size == xlat_size) + /* No need to re-setup if size already matches */ + if (mw->xlat_size == xlat_size && mw->buff_size == buff_size) return 0; if (mw->buff_size) @@ -1048,8 +1048,11 @@ static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt) if (!nt->link_is_up) cancel_delayed_work_sync(&nt->link_work); - for (i = 0; i < nt->mw_count; i++) - ntb_free_mw(nt, i); + /* + * Do NOT free MW memory on link down. Memory is retained across + * link cycles to avoid fragmentation from repeated allocation. 
+ * Memory is only freed on device removal in ntb_transport_free(). + */ /* The scratchpad registers keep the values if the remote side * goes down, blast them now to give them a sane value the next @@ -1332,6 +1335,34 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt, return 0; } +/* + * Speculatively pre-allocate memory assuming symmetric config. + * This grabs contiguous memory early before fragmentation. + * If peer has different size, we'll reallocate on link-up. + */ +static void ntb_preallocate_mws(struct ntb_transport_ctx *nt) +{ + struct ntb_transport_mw *mw; + resource_size_t size; + int i, rc; + + for (i = 0; i < nt->mw_count; i++) { + mw = &nt->mw_vec[i]; + size = mw->phys_size; + + if (max_mw_size && size > max_mw_size) + size = max_mw_size; + + rc = ntb_set_mw(nt, i, size); + if (rc) { + dev_info(&nt->ndev->pdev->dev, + "Failed to preallocate MW%d (size %llx): %d\n", + i, (unsigned long long)size, rc); + /* Continue - link-up will retry */ + } + } +} + static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev) { struct ntb_transport_ctx *nt; @@ -1476,6 +1507,9 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev) INIT_WORK(&nt->link_cleanup, ntb_transport_link_cleanup_work); nt->link_is_up = false; + /* Speculatively pre-allocate MW buffers to avoid fragmentation */ + ntb_preallocate_mws(nt); + rc = ntb_set_ctx(ndev, nt, &ntb_transport_ops); if (rc) goto err2; @@ -1495,9 +1529,11 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev) err2: kfree(nt->qp_vec); err1: - while (i--) { + for (i = 0; i < mw_count; i++) { mw = &nt->mw_vec[i]; - iounmap(mw->vbase); + ntb_free_mw(nt, i); + if (mw->vbase) + iounmap(mw->vbase); } kfree(nt->mw_vec); err: From 1ae07226b6d64b7b5fa7f81baf0f3d69b1c645ee Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 2 Apr 2026 23:58:40 +0500 Subject: [PATCH 2/3] NFSD: return ESTALE for snapdir entries when export lookup fails When 
nfsd_cross_mnt() crosses into a mounted ZFS snapshot but rqst_exp_get_by_name() fails to resolve the sub-export (-ENOENT), the error is silently converted to success and the automount stub dentry is returned to the caller. The stub has simple_dir_operations and its file handle is encoded with gen=1 (snapshot is mounted). This 44-byte gen=1 handle becomes a permanent trap: zfsctl_snapdir_vget() sees gen=1 matching d_mountpoint=true, returns the stub inode, and READDIR returns NFS4_OK with zero entries. The client caches this empty result indefinitely since there is no error signal to trigger re-resolution. The empty directory persists until change_info4 updates (e.g., manual snapshot creation on the server). For zfs_snapdir exports, return -ESTALE instead of silently falling back to the automount stub. This causes the client to re-resolve via LOOKUP. Signed-off-by: Ameer Hamza (cherry picked from commit 3ab428ea16afdc36ab80125cb7aca6e897b9c688) --- fs/nfsd/vfs.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4e52820810fa..77c0ac89fe44 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -201,8 +201,23 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, * allowed without an explicit export of the new * directory. */ - if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) - err = 0; + if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) { +#ifdef CONFIG_TRUENAS + /* + * For ZFS snapshot entries under a zfs_snapdir + * export, the fallback dentry is an automount + * stub with simple_dir_operations that returns + * empty READDIR (NFS4_OK, zero entries). The + * client caches this silently with no error + * signal to trigger re-resolution. Return ESTALE + * so the client retries via LOOKUP. 
+ */ + if (is_snapdir) + err = -ESTALE; + else +#endif /* CONFIG_TRUENAS */ + err = 0; + } path_put(&path); goto out; } From b7e2338b679c56a78cd8f0ce685bd68abc68d63e Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 8 Apr 2026 15:12:06 +0500 Subject: [PATCH 3/3] NFSD: return ESTALE for snapdir entries when automount fails When nfsd_cross_mnt() sets LOOKUP_AUTOMOUNT for a snapdir entry and follow_down() returns with the path unchanged, the automount was attempted and failed (EISDIR from zfsctl_snapshot_mount). The existing code treats this as "mountpoint in some other namespace" and returns success with the ctldir stub dentry. This stub has simple_dir_operations and produces a 44-byte file handle that returns empty READDIR (NFS4_OK, zero entries) with no error signal for the client to trigger re-resolution. This can happen transiently when zfs_suspend_fs races with the mount helper after the z_teardown_lock deadlock fix (https://github.com/openzfs/zfs/pull/18415). Return ESTALE for snapdir entries so the client retries via LOOKUP, which re-triggers the automount. Signed-off-by: Ameer Hamza (cherry picked from commit c026341d1fb9cda59c476a509eef18b99afb9dd4) --- fs/nfsd/vfs.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 77c0ac89fe44..371103667ccd 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -188,6 +188,20 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, nfsd_mountpoint(dentry, exp) == 2) { /* This is only a mountpoint in some other namespace */ path_put(&path); +#ifdef CONFIG_TRUENAS + /* + * For snapdir entries we set LOOKUP_AUTOMOUNT above, so + * if the path is unchanged the automount was attempted + * and failed (EISDIR from zfsctl_snapshot_mount). This + * can happen transiently when zfs_suspend_fs races with + * the mount helper after the z_teardown_lock deadlock + * fix (see https://github.com/openzfs/zfs/pull/18415). 
+ * Return ESTALE so the client retries via LOOKUP rather + * than caching the ctldir stub as an empty directory. + */ + if (is_snapdir) + err = -ESTALE; +#endif /* CONFIG_TRUENAS */ goto out; }