From 017f6a93f67f6c14967568dece573e905ef08e99 Mon Sep 17 00:00:00 2001 From: Jyotiprakash Mishra Date: Fri, 8 May 2026 17:35:52 +0530 Subject: [PATCH] DivSqrtRecFN_small: add optional constant-time mode (closes skipCycle2 timing channel) Summary ------- Adds a new option bit `divSqrtOpt_constTime = 32` to common.scala. When set, DivSqrtRecFN_small pads every divide / sqrt to a fixed worst-case latency of `sigWidth + 5` cycles, eliminating the operand-dependent skipCycle2 fast path. The option is OFF by default; existing instantiations are bit-exact and cycle-exact unchanged. Motivation ---------- The iterative SRT divider in DivSqrtRecFN_small contains a documented performance optimisation, `skipCycle2`, that shortens the iteration count by one cycle when the partial significand at iter 3 satisfies a specific bit pattern (most commonly when the divisor's mantissa is exactly zero, i.e. a power of two). This is a documented performance feature, but it is also a well-defined operand-dependent timing channel: an attacker who controls the divisor can deterministically trigger or avoid the fast path, and the resulting per-divide cycle differential survives synthesis, place-and-route and silicon fabric (validated on a Z-7020 ZedBoard build of this exact module via the open xc7 toolchain in the VecLeak paper [1]). For the avoidance of doubt this is not a "hidden bug": the bit is named `skipCycle2` in the source. The contribution of this patch is to provide a constant-time mode that closes the channel, suitable for downstream users who route this divider through a security-sensitive FP path (notably FALCON post-quantum signing implementations that dispatch their inner-loop reciprocals through hardware fdiv.d). Mechanism --------- A small (~10-FF, ~200-LUT) counter wrapper inside the same module pads every divide / sqrt to a fixed worst-case latency of `sigWidth + 5` cycles (= 58 cyc on FP64). 
The raw inner divider's outValid pulses are latched into shadow ct_pending_* flags as they fire and re-emitted when the counter reaches 1; inReady is held low for the duration so the upstream consumer cannot dispatch a new operation and overwrite the latched result. The arithmetic result is unchanged. The new behaviour is gated on `(options & divSqrtOpt_constTime) != 0`. When the option is OFF, ctEnable evaluates to 0.B, the muxes select the inner pulses directly, and synthesis dead-code-eliminates the unused shadow counter / pending flags. Existing instantiations therefore observe no change in area, fmax, or per-cycle behaviour. Synthesis cost (constant-time mode ON; Yosys + sky130hd + OpenSTA; FP64 variant DivSqrtRecFN_small_e11_s53): vanilla: 16,357 um^2 / 67.6 MHz fmax patched: 16,765 um^2 / 62.6 MHz fmax delta: +2.49% area / -7.4% fmax Reference --------- [1] VecLeak: A Cycle-Exact Operand-Dependent Timing Channel in Open-Source RISC-V Vector Floating-Point Hardware (forthcoming, MDPI Chips 2026). Signed-off-by: Jyotiprakash Mishra --- .../src/main/scala/DivSqrtRecFN_small.scala | 41 ++++++++++++++++--- hardfloat/src/main/scala/common.scala | 7 ++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/hardfloat/src/main/scala/DivSqrtRecFN_small.scala b/hardfloat/src/main/scala/DivSqrtRecFN_small.scala index 2d79a18..7625607 100644 --- a/hardfloat/src/main/scala/DivSqrtRecFN_small.scala +++ b/hardfloat/src/main/scala/DivSqrtRecFN_small.scala @@ -492,17 +492,46 @@ class val divSqrtRecFNToRaw = Module(new DivSqrtRecFNToRaw_small(expWidth, sigWidth, options)) - io.inReady := divSqrtRecFNToRaw.io.inReady - divSqrtRecFNToRaw.io.inValid := io.inValid + //------------------------------------------------------------------------ + // Optional constant-time padding (gated on divSqrtOpt_constTime): + // when enabled, every divide / sqrt completes at a fixed worst-case + // latency of sigWidth + 5 cycles, regardless of operand class. 
This + // closes the operand-dependent +1-cycle skipCycle2 timing channel. + // When the option is OFF (default), behaviour is identical to before: + // io.outValid_* is driven directly from the inner module's pulses, + // and io.inReady mirrors the inner inReady, preserving cycle-exact + // backward compatibility. + //------------------------------------------------------------------------ + val ctEnable = ((options & divSqrtOpt_constTime) != 0).B + val ctTarget = (sigWidth + 5).U + val ct_counter = RegInit(0.U(8.W)) + val ct_pending_div = RegInit(false.B) + val ct_pending_sqrt = RegInit(false.B) + + val ct_busy = ct_counter =/= 0.U || ct_pending_div || ct_pending_sqrt + io.inReady := divSqrtRecFNToRaw.io.inReady && !(ctEnable && ct_busy) + divSqrtRecFNToRaw.io.inValid := io.inValid && !(ctEnable && ct_busy) divSqrtRecFNToRaw.io.sqrtOp := io.sqrtOp divSqrtRecFNToRaw.io.a := io.a divSqrtRecFNToRaw.io.b := io.b divSqrtRecFNToRaw.io.roundingMode := io.roundingMode - //------------------------------------------------------------------------ - //------------------------------------------------------------------------ - io.outValid_div := divSqrtRecFNToRaw.io.rawOutValid_div - io.outValid_sqrt := divSqrtRecFNToRaw.io.rawOutValid_sqrt + when (ctEnable && io.inValid && io.inReady) { + ct_counter := ctTarget + } .elsewhen (ctEnable && ct_counter =/= 0.U) { + ct_counter := ct_counter - 1.U + } + + when (ctEnable && divSqrtRecFNToRaw.io.rawOutValid_div) { ct_pending_div := true.B } + when (ctEnable && divSqrtRecFNToRaw.io.rawOutValid_sqrt) { ct_pending_sqrt := true.B } + + val ct_emit = ctEnable && ct_counter === 1.U + io.outValid_div := Mux(ctEnable, ct_emit && ct_pending_div, divSqrtRecFNToRaw.io.rawOutValid_div) + io.outValid_sqrt := Mux(ctEnable, ct_emit && ct_pending_sqrt, divSqrtRecFNToRaw.io.rawOutValid_sqrt) + when (ct_emit) { + ct_pending_div := false.B + ct_pending_sqrt := false.B + } val roundRawFNToRecFN = Module(new RoundRawFNToRecFN(expWidth, sigWidth, 0)) diff 
--git a/hardfloat/src/main/scala/common.scala b/hardfloat/src/main/scala/common.scala index eeddc6e..5e98912 100644 --- a/hardfloat/src/main/scala/common.scala +++ b/hardfloat/src/main/scala/common.scala @@ -63,6 +63,13 @@ object consts { /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ def divSqrtOpt_twoBitsPerCycle = 16 + /*------------------------------------------------------------------------ + | When set, DivSqrtRecFN_small pads every divide / sqrt to a fixed + | worst-case latency of `sigWidth + 5` cycles, eliminating the + | operand-dependent timing channel created by skipCycle2. Off by + | default; enable in security-sensitive deployments. + *------------------------------------------------------------------------*/ + def divSqrtOpt_constTime = 32 } class RawFloat(val expWidth: Int, val sigWidth: Int) extends Bundle