From 24c546debd24252fef84e988cb229ce41eebbdc3 Mon Sep 17 00:00:00 2001
From: Alexander Motin <alexander.motin@TrueNAS.com>
Date: Wed, 11 Feb 2026 11:20:04 -0500
Subject: [PATCH 1/3] NTB: ntb_transport: Preallocate memory windows

The original Linux code allocated memory on every link up and freed it
on every link down.  It may be a problem to allocate several MBs of
physically contiguous memory on a running system.  To work around
that, speculatively pre-allocate it on boot and then reallocate
only if the remote host requests different parameters, which should
be very rare.

This also changes the default max_mw_size from 0 (no limit) to 256 MiB,
which caps the size of the speculative allocation.

(cherry picked from commit 0852db2813974ea5fd8485907f601bd061f32964)
---
 drivers/ntb/ntb_transport.c | 50 +++++++++++++++++++++++++++++++------
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index c4aeeefb3305..f7a2d6bd1c14 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -73,7 +73,7 @@ MODULE_VERSION(NTB_TRANSPORT_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
-static unsigned long max_mw_size;
+static unsigned long max_mw_size = 256 * 1024 * 1024;
 module_param(max_mw_size, ulong, 0644);
 MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows");
 
@@ -927,8 +927,8 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
 	xlat_size = round_up(size, xlat_align_size);
 	buff_size = round_up(size, xlat_align);
 
-	/* No need to re-setup */
-	if (mw->xlat_size == xlat_size)
+	/* No need to re-setup if size already matches */
+	if (mw->xlat_size == xlat_size && mw->buff_size == buff_size)
 		return 0;
 
 	if (mw->buff_size)
@@ -1048,8 +1048,11 @@ static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
 	if (!nt->link_is_up)
 		cancel_delayed_work_sync(&nt->link_work);
 
-	for (i = 0; i < nt->mw_count; i++)
-		ntb_free_mw(nt, i);
+	/*
+	 * Do NOT free MW memory on link down. Memory is retained across
+	 * link cycles to avoid fragmentation from repeated allocation.
+	 * Memory is only freed on device removal in ntb_transport_free().
+	 */
 
 	/* The scratchpad registers keep the values if the remote side
 	 * goes down, blast them now to give them a sane value the next
@@ -1332,6 +1335,34 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 	return 0;
 }
 
+/*
+ * Speculatively pre-allocate each MW (phys_size, capped by max_mw_size if
+ * set) to grab contiguous memory before it fragments.  Failures are only
+ * logged; if the peer wants another size, link-up reallocates.
+ */
+static void ntb_preallocate_mws(struct ntb_transport_ctx *nt)
+{
+	struct ntb_transport_mw *mw;
+	resource_size_t size;
+	int i, rc;
+
+	for (i = 0; i < nt->mw_count; i++) {
+		mw = &nt->mw_vec[i];
+		size = mw->phys_size;
+
+		if (max_mw_size && size > max_mw_size)
+			size = max_mw_size;
+
+		rc = ntb_set_mw(nt, i, size);
+		if (rc) {
+			dev_info(&nt->ndev->pdev->dev,
+				 "Failed to preallocate MW%d (size %llx): %d\n",
+				 i, (unsigned long long)size, rc);
+			/* Non-fatal: keep going, link-up will retry */
+		}
+	}
+}
+
 static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 {
 	struct ntb_transport_ctx *nt;
@@ -1476,6 +1507,9 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 	INIT_WORK(&nt->link_cleanup, ntb_transport_link_cleanup_work);
 	nt->link_is_up = false;
 
+	/* Speculatively pre-allocate MW buffers to avoid fragmentation */
+	ntb_preallocate_mws(nt);
+
 	rc = ntb_set_ctx(ndev, nt, &ntb_transport_ops);
 	if (rc)
 		goto err2;
@@ -1495,9 +1529,11 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 err2:
 	kfree(nt->qp_vec);
 err1:
-	while (i--) {
+	for (i = 0; i < mw_count; i++) {
 		mw = &nt->mw_vec[i];
-		iounmap(mw->vbase);
+		ntb_free_mw(nt, i);
+		if (mw->vbase)
+			iounmap(mw->vbase);
 	}
 	kfree(nt->mw_vec);
 err:

From 1ae07226b6d64b7b5fa7f81baf0f3d69b1c645ee Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Thu, 2 Apr 2026 23:58:40 +0500
Subject: [PATCH 2/3] NFSD: return ESTALE for snapdir entries when export
 lookup fails

When nfsd_cross_mnt() crosses into a mounted ZFS snapshot but
rqst_exp_get_by_name() fails to resolve the sub-export (-ENOENT), the
error is silently converted to success and the automount stub dentry is
returned to the caller. The stub has simple_dir_operations and its file
handle is encoded with gen=1 (snapshot is mounted).

This 44-byte gen=1 handle becomes a permanent trap:
zfsctl_snapdir_vget() sees gen=1 matching d_mountpoint=true, returns the
stub inode, and READDIR returns NFS4_OK with zero entries. The client
caches this empty result indefinitely since there is no error signal to
trigger re-resolution. The empty directory persists until change_info4
updates (e.g., manual snapshot creation on the server).

For zfs_snapdir exports, return -ESTALE instead of silently falling back
to the automount stub. This causes the client to re-resolve via LOOKUP.

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
(cherry picked from commit 3ab428ea16afdc36ab80125cb7aca6e897b9c688)
---
 fs/nfsd/vfs.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4e52820810fa..77c0ac89fe44 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -201,8 +201,23 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		 * allowed without an explicit export of the new
 		 * directory.
 		 */
-		if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
-			err = 0;
+		if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT)) {
+#ifdef CONFIG_TRUENAS
+			/*
+			 * For ZFS snapshot entries under a zfs_snapdir
+			 * export, the fallback dentry is an automount
+			 * stub with simple_dir_operations that returns
+			 * empty READDIR (NFS4_OK, zero entries). The
+			 * client caches this silently with no error
+			 * signal to trigger re-resolution. Return ESTALE
+			 * so the client retries via LOOKUP.
+			 */
+			if (is_snapdir)
+				err = -ESTALE;
+			else
+#endif /* CONFIG_TRUENAS */
+				err = 0;
+		}
 		path_put(&path);
 		goto out;
 	}

From b7e2338b679c56a78cd8f0ce685bd68abc68d63e Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Wed, 8 Apr 2026 15:12:06 +0500
Subject: [PATCH 3/3] NFSD: return ESTALE for snapdir entries when automount
 fails

When nfsd_cross_mnt() sets LOOKUP_AUTOMOUNT for a snapdir entry and
follow_down() returns with the path unchanged, the automount was
attempted and failed (EISDIR from zfsctl_snapshot_mount). The existing
code treats this as "mountpoint in some other namespace" and returns
success with the ctldir stub dentry.  This stub has
simple_dir_operations and produces a 44-byte file handle that returns
empty READDIR (NFS4_OK, zero entries) with no error signal for the
client to trigger re-resolution. This can happen transiently when
zfs_suspend_fs races with the mount helper after the z_teardown_lock
deadlock fix (https://github.com/openzfs/zfs/pull/18415).

Return ESTALE for snapdir entries so the client retries via LOOKUP,
which re-triggers the automount.

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
(cherry picked from commit c026341d1fb9cda59c476a509eef18b99afb9dd4)
---
 fs/nfsd/vfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 77c0ac89fe44..371103667ccd 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -188,6 +188,20 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	    nfsd_mountpoint(dentry, exp) == 2) {
 		/* This is only a mountpoint in some other namespace */
 		path_put(&path);
+#ifdef CONFIG_TRUENAS
+		/*
+		 * For snapdir entries we set LOOKUP_AUTOMOUNT above, so
+		 * if the path is unchanged the automount was attempted
+		 * and failed (EISDIR from zfsctl_snapshot_mount).  This
+		 * can happen transiently when zfs_suspend_fs races with
+		 * the mount helper after the z_teardown_lock deadlock
+		 * fix (see https://github.com/openzfs/zfs/pull/18415).
+		 * Return ESTALE so the client retries via LOOKUP rather
+		 * than caching the ctldir stub as an empty directory.
+		 */
+		if (is_snapdir)
+			err = -ESTALE;
+#endif /* CONFIG_TRUENAS */
 		goto out;
 	}