From 017f6a93f67f6c14967568dece573e905ef08e99 Mon Sep 17 00:00:00 2001
From: Jyotiprakash Mishra <mail@jyotiprakash.org>
Date: Fri, 8 May 2026 17:35:52 +0530
Subject: [PATCH] DivSqrtRecFN_small: add optional constant-time mode (closes
 skipCycle2 timing channel)

Summary
-------
Adds a new option bit `divSqrtOpt_constTime = 32` to common.scala.  When
set, DivSqrtRecFN_small pads every divide / sqrt to a fixed worst-case
latency of `sigWidth + 5` cycles, eliminating the operand-dependent
skipCycle2 fast path.  The option is OFF by default; existing
instantiations remain bit-exact and cycle-exact.

Motivation
----------
The iterative SRT divider in DivSqrtRecFN_small contains a documented
performance optimisation, `skipCycle2`, that shortens the iteration
count by one cycle when the partial significand at iter 3 satisfies a
specific bit pattern (most commonly when the divisor's mantissa is
exactly zero, i.e. a power of two).  This is a documented performance
feature, but it is also a well-defined operand-dependent timing channel:
an attacker who controls the divisor can deterministically trigger or
avoid the fast path, and the resulting per-divide cycle differential
survives synthesis and place-and-route and remains observable on real
FPGA fabric (validated on a Z-7020 ZedBoard build of this exact module
via the open xc7 toolchain in the VecLeak paper [1]).

For the avoidance of doubt this is not a "hidden bug": the bit is named
`skipCycle2` in the source.  The contribution of this patch is to
provide a constant-time mode that closes the channel, suitable for
downstream users who route this divider through a security-sensitive FP
path (notably FALCON post-quantum signing implementations that dispatch
their inner-loop reciprocals through hardware fdiv.d).

Mechanism
---------
A small (~10-FF, ~200-LUT) counter wrapper inside the same module pads
every divide / sqrt to a fixed worst-case latency of `sigWidth + 5`
cycles (= 58 cycles for FP64, where sigWidth = 53).  The inner
divider's rawOutValid_* pulses are latched into shadow ct_pending_*
flags as they fire and re-emitted when the counter reaches 1; inReady
is held low for the duration so the upstream consumer cannot dispatch a
new operation and overwrite the result still held in the inner module's
registers.  The arithmetic result is unchanged.

The new behaviour is gated on `(options & divSqrtOpt_constTime) != 0`.
When the option is OFF, ctEnable elaborates to the constant false.B, the
muxes select the inner pulses directly, and synthesis
dead-code-eliminates the unused shadow counter / pending flags.
Existing instantiations therefore observe no change in area, fmax, or
per-cycle behaviour.

Synthesis cost (constant-time mode ON; Yosys + sky130hd + OpenSTA;
FP64 variant DivSqrtRecFN_small_e11_s53):

  vanilla:    16,357 um^2 / 67.6 MHz fmax
  patched:    16,765 um^2 / 62.6 MHz fmax
  delta:      +2.49% area / -7.4% fmax

Reference
---------
[1] VecLeak: A Cycle-Exact Operand-Dependent Timing Channel in
    Open-Source RISC-V Vector Floating-Point Hardware (forthcoming,
    MDPI Chips 2026).

Signed-off-by: Jyotiprakash Mishra <mail@jyotiprakash.org>
---
 .../src/main/scala/DivSqrtRecFN_small.scala   | 41 ++++++++++++++++---
 hardfloat/src/main/scala/common.scala         |  7 ++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/hardfloat/src/main/scala/DivSqrtRecFN_small.scala b/hardfloat/src/main/scala/DivSqrtRecFN_small.scala
index 2d79a18..7625607 100644
--- a/hardfloat/src/main/scala/DivSqrtRecFN_small.scala
+++ b/hardfloat/src/main/scala/DivSqrtRecFN_small.scala
@@ -492,17 +492,46 @@ class
     val divSqrtRecFNToRaw =
         Module(new DivSqrtRecFNToRaw_small(expWidth, sigWidth, options))
 
-    io.inReady := divSqrtRecFNToRaw.io.inReady
-    divSqrtRecFNToRaw.io.inValid      := io.inValid
+    //------------------------------------------------------------------------
+    // Optional constant-time padding (gated on divSqrtOpt_constTime):
+    // when enabled, every divide / sqrt completes at a fixed worst-case
+    // latency of sigWidth + 5 cycles, regardless of operand class.  This
+    // closes the operand-dependent +1-cycle skipCycle2 timing channel.
+    // When the option is OFF (default), behaviour is identical to before:
+    // io.outValid_* is driven directly from the inner module's pulses,
+    // and io.inReady mirrors the inner inReady, preserving cycle-exact
+    // backward compatibility.
+    //------------------------------------------------------------------------
+    val ctEnable = ((options & divSqrtOpt_constTime) != 0).B
+    val ctTarget = (sigWidth + 5).U
+    val ct_counter      = RegInit(0.U(8.W))
+    val ct_pending_div  = RegInit(false.B)
+    val ct_pending_sqrt = RegInit(false.B)
+
+    val ct_busy = ct_counter =/= 0.U || ct_pending_div || ct_pending_sqrt
+    io.inReady := divSqrtRecFNToRaw.io.inReady && !(ctEnable && ct_busy)
+    divSqrtRecFNToRaw.io.inValid      := io.inValid && !(ctEnable && ct_busy)
     divSqrtRecFNToRaw.io.sqrtOp       := io.sqrtOp
     divSqrtRecFNToRaw.io.a            := io.a
     divSqrtRecFNToRaw.io.b            := io.b
     divSqrtRecFNToRaw.io.roundingMode := io.roundingMode
 
-    //------------------------------------------------------------------------
-    //------------------------------------------------------------------------
-    io.outValid_div  := divSqrtRecFNToRaw.io.rawOutValid_div
-    io.outValid_sqrt := divSqrtRecFNToRaw.io.rawOutValid_sqrt
+    when (ctEnable && io.inValid && io.inReady) {
+        ct_counter := ctTarget
+    } .elsewhen (ctEnable && ct_counter =/= 0.U) {
+        ct_counter := ct_counter - 1.U
+    }
+
+    when (ctEnable && divSqrtRecFNToRaw.io.rawOutValid_div)  { ct_pending_div  := true.B }
+    when (ctEnable && divSqrtRecFNToRaw.io.rawOutValid_sqrt) { ct_pending_sqrt := true.B }
+
+    val ct_emit = ctEnable && ct_counter === 1.U
+    io.outValid_div  := Mux(ctEnable, ct_emit && ct_pending_div,  divSqrtRecFNToRaw.io.rawOutValid_div)
+    io.outValid_sqrt := Mux(ctEnable, ct_emit && ct_pending_sqrt, divSqrtRecFNToRaw.io.rawOutValid_sqrt)
+    when (ct_emit) {
+        ct_pending_div  := false.B
+        ct_pending_sqrt := false.B
+    }
 
     val roundRawFNToRecFN =
         Module(new RoundRawFNToRecFN(expWidth, sigWidth, 0))
diff --git a/hardfloat/src/main/scala/common.scala b/hardfloat/src/main/scala/common.scala
index eeddc6e..5e98912 100644
--- a/hardfloat/src/main/scala/common.scala
+++ b/hardfloat/src/main/scala/common.scala
@@ -63,6 +63,13 @@ object consts {
     /*------------------------------------------------------------------------
     *------------------------------------------------------------------------*/
     def divSqrtOpt_twoBitsPerCycle     = 16
+    /*------------------------------------------------------------------------
+    | When set, DivSqrtRecFN_small pads every divide / sqrt to a fixed
+    | worst-case latency of `sigWidth + 5` cycles, eliminating the
+    | operand-dependent timing channel created by skipCycle2.  Off by
+    | default; enable in security-sensitive deployments.
+    *------------------------------------------------------------------------*/
+    def divSqrtOpt_constTime           = 32
 }
 
 class RawFloat(val expWidth: Int, val sigWidth: Int) extends Bundle