Fused Transpose + ReLU (#2)

WilliamZhang20 · web-flow · commit 30ea560932e7 · 2025-07-20T10:54:17.000-04:00
* relu and transpose

* widen accumulation to 12 bits, which improves precision (e.g. if an intermediate sum overflows the 8-bit range but still fits in 12 bits, and a negative term then brings it back into the 8-bit range, the final value is still correct)
diff --git a/info.yaml b/info.yaml
@@ -49,8 +49,8 @@ pinout:
 
   # Bidirectional pins
   uio[0]: "LOAD_EN (input)"
-  uio[1]: "Unused"
-  uio[2]: "Unused"
+  uio[1]: "TRANSPOSE (input)"
+  uio[2]: "ACTIVATION (input)"
   uio[3]: "Unused"
   uio[4]: "Unused"
   uio[5]: "Unused"
diff --git a/src/PE.v b/src/PE.v
@@ -3,30 +3,31 @@ module PE #(
 )(
     input wire clk,
     input wire rst,
-    input wire clear, // clears accumulators between computations...
-
+    input wire clear,
     input wire signed [WIDTH-1:0] a_in,
     input wire signed [WIDTH-1:0] b_in,
 
     output reg signed [WIDTH-1:0] a_out,
     output reg signed [WIDTH-1:0] b_out,
 
-    output reg signed [WIDTH-1:0] c_out
+    output reg signed [WIDTH+3:0] c_out
 );
 
     always @(posedge clk or posedge rst) begin
         if (rst) begin
-            c_out     <= 0;
-            a_out     <= 0;
-            b_out     <= 0;
+            a_out <= 0;
+            b_out <= 0;
+            c_out <= 0;
         end else if (clear) begin
-            c_out     <= 0;
-            a_out     <= 0;
-            b_out     <= 0;
+            a_out <= 0;
+            b_out <= 0;
+            c_out <= 0;
         end else begin
-            c_out     <= c_out + a_in * b_in;
-            a_out     <= a_in;
-            b_out     <= b_in;
+            a_out <= a_in;
+            b_out <= b_in;
+
+            c_out <= c_out + (a_in * b_in);
         end
     end
+
 endmodule
diff --git a/src/mmu_feeder.v b/src/mmu_feeder.v
@@ -6,12 +6,14 @@ module mmu_feeder (
     input wire en,
     input wire [2:0] mmu_cycle,
 
+    input wire transpose,
+
     /* Memory module interface */
     input wire [7:0] weight0, weight1, weight2, weight3,
     input wire [7:0] input0, input1, input2, input3,
 
     /* systolic array -> feeder */
-    input wire [7:0] c00, c01, c10, c11,
+    input wire signed [11:0] c00, c01, c10, c11,
 
     /* feeder -> mmu */
     output reg clear,
@@ -31,6 +33,18 @@ module mmu_feeder (
     // Output counter for selecting c_out
     reg [1:0] output_count;
 
+    function [7:0] saturate_to_s8;
+        input signed [11:0] val;
+        begin
+            if (val > 127)
+                saturate_to_s8 = 8'sd127;
+            else if (val < -128)
+                saturate_to_s8 = -8'sd128;
+            else
+                saturate_to_s8 = val[7:0];
+        end
+    endfunction
+
     // Sequential logic for control and data outputs
     always @(posedge clk or posedge rst) begin
         if (rst) begin
@@ -55,8 +69,6 @@ module mmu_feeder (
                 end else begin
                     output_count <= 0;
                 end
-
-                // Input assignments based on mmu_cycle
                 case (mmu_cycle)
                     3'b000: begin
                         a_data0 <= weight0;
@@ -65,8 +77,13 @@ module mmu_feeder (
                     3'b001: begin
                         a_data0 <= weight1;
                         a_data1 <= weight2;
-                        b_data0 <= input2;
-                        b_data1 <= input1;
+                        if (transpose) begin
+                            b_data0 <= input1;
+                            b_data1 <= input2;
+                        end else begin
+                            b_data0 <= input2;
+                            b_data1 <= input1;
+                        end
                     end
                     3'b010: begin
                         a_data1 <= weight3;
@@ -86,10 +103,10 @@ module mmu_feeder (
         host_outdata = 8'b0; // Default to avoid latch
         if (en) begin
             case (output_count)
-                2'b00: host_outdata = c00;
-                2'b01: host_outdata = c01;
-                2'b10: host_outdata = c10;
-                2'b11: host_outdata = c11;
+                2'b00: host_outdata = saturate_to_s8(c00);
+                2'b01: host_outdata = saturate_to_s8(c01);
+                2'b10: host_outdata = saturate_to_s8(c10);
+                2'b11: host_outdata = saturate_to_s8(c11);
                 default: host_outdata = 8'b0;
             endcase
         end
diff --git a/src/systolic_array_2x2.v b/src/systolic_array_2x2.v
@@ -5,21 +5,23 @@ module systolic_array_2x2 #(
     input wire rst,
     input wire clear,
 
+    input wire activation,
+
     input wire [WIDTH-1:0] a_data0,
     input wire [WIDTH-1:0] a_data1,
     input wire [WIDTH-1:0] b_data0,
     input wire [WIDTH-1:0] b_data1,
 
-    output wire [WIDTH-1:0] c00,
-    output wire [WIDTH-1:0] c01,
-    output wire [WIDTH-1:0] c10,
-    output wire [WIDTH-1:0] c11
+    output wire [WIDTH+3:0] c00,
+    output wire [WIDTH+3:0] c01,
+    output wire [WIDTH+3:0] c10,
+    output wire [WIDTH+3:0] c11
 );
 
     // Internal signals between PEs
     wire [WIDTH-1:0] a_wire [0:1][0:2];
     wire [WIDTH-1:0] b_wire [0:2][0:1];
-    wire [7:0] c_array [0:1][0:1];
+    wire signed [WIDTH+3:0] c_array [0:1][0:1];
 
     // Input loading at top-left
     assign a_wire[0][0] = a_data0;
@@ -31,7 +33,7 @@ module systolic_array_2x2 #(
     generate
         for (i = 0; i < 2; i = i + 1) begin : row
             for (j = 0; j < 2; j = j + 1) begin : col
-                PE #(.WIDTH(8)) pe_inst (
+                PE #(.WIDTH(WIDTH)) pe_inst (
                     .clk(clk),
                     .rst(rst),
                     .clear(clear),
@@ -45,8 +47,10 @@ module systolic_array_2x2 #(
         end
     endgenerate
 
-    assign c00 = c_array[0][0];
-    assign c01 = c_array[0][1];
-    assign c10 = c_array[1][0];
-    assign c11 = c_array[1][1];
-endmodule
+    // Combinational logic for output with optional ReLU
+    assign c00 = activation ? (c_array[0][0] < 0 ? 0 : c_array[0][0]) : c_array[0][0];
+    assign c01 = activation ? (c_array[0][1] < 0 ? 0 : c_array[0][1]) : c_array[0][1];
+    assign c10 = activation ? (c_array[1][0] < 0 ? 0 : c_array[1][0]) : c_array[1][0];
+    assign c11 = activation ? (c_array[1][1] < 0 ? 0 : c_array[1][1]) : c_array[1][1];
+
+endmodule
diff --git a/src/tpu.v b/src/tpu.v
@@ -17,6 +17,8 @@ module tt_um_tpu (
 );
 
     wire instruction = uio_in[0];
+    wire transpose = uio_in[1];
+    wire activation = uio_in[2];
 
     wire compute_en; // internal signal
     reg clear; // reset of PEs only
@@ -28,7 +30,7 @@ module tt_um_tpu (
     wire [7:0] weight0, weight1, weight2, weight3;
     wire [7:0] input0, input1, input2, input3;
 
-    wire [7:0] outputs [0:3]; // raw accumulations (16-bit)
+    wire [11:0] outputs [0:3]; // raw accumulations (12-bit)
     wire [7:0] out_data; // sent to CPU
     // Ports of the systolic Array
     wire [7:0] a_data0, b_data0, a_data1, b_data1;
@@ -60,6 +62,7 @@ module tt_um_tpu (
         .clk(clk),
         .rst(~rst_n),
         .clear(clear),
+        .activation(activation),
         .a_data0(a_data0),
         .a_data1(a_data1),
         .b_data0(b_data0),
@@ -75,6 +78,7 @@ module tt_um_tpu (
         .rst(~rst_n),
         .en(compute_en),
         .mmu_cycle(mmu_cycle),
+        .transpose(transpose),
         .weight0(weight0), .weight1(weight1), .weight2(weight2), .weight3(weight3),
         .input0(input0), .input1(input1), .input2(input2), .input3(input3),
         .c00(outputs[0]), 
@@ -94,6 +98,6 @@ module tt_um_tpu (
     assign uio_out = {done, 7'b0};
     assign uio_oe = 8'b10000000;
 
-    wire _unused = &{ena, uio_in[7:1]};
+    wire _unused = &{ena, uio_in[7:3]};
 
 endmodule
diff --git a/test/test.py b/test/test.py
@@ -3,11 +3,19 @@
 from cocotb.triggers import ClockCycles, RisingEdge
 import numpy as np
 
-def get_expected_matmul(A, B):
-    """
-    Args: lists A, B as flattened row-major matrices
-    """
-    return (np.array(A).reshape(2, 2) @ np.array(B).reshape(2, 2)).flatten().tolist()
+def saturate_to_s8(x):
+    """Clamp value to 8-bit signed range [-128, 127]."""
+    return max(-128, min(127, int(x)))
+
+def get_expected_matmul(A, B, transpose=False, relu=False):
+    A_mat = np.array(A).reshape(2, 2)
+    B_mat = np.array(B).reshape(2, 2)
+    if transpose:
+        B_mat = B_mat.T
+    result = A_mat @ B_mat
+    if relu:
+        result = np.maximum(result, 0)
+    return [saturate_to_s8(val) for val in result.flatten().tolist()]
 
 async def load_matrix(dut, matrix, sel):
     """
@@ -25,19 +33,65 @@ async def load_matrix(dut, matrix, sel):
         dut.uio_in.value = 0
         await RisingEdge(dut.clk)
 
-async def read_signed_output(dut):
-    # Wait for first outputs to propagate
-    await ClockCycles(dut.clk, 3)
+async def read_signed_output(dut, transpose=0, relu=0):
+    # Apply instruction signal just before reading
+    for i in range(3):
+        dut.uio_in.value = (transpose << 1) | (relu << 2)
+        await ClockCycles(dut.clk, 1)
+
     results = []
     for i in range(4):
-        dut.uio_in.value = 0
+        dut.uio_in.value = (transpose << 1) | (relu << 2)
         await ClockCycles(dut.clk, 1)
         val_unsigned = dut.uo_out.value.integer
         val_signed = val_unsigned if val_unsigned < 128 else val_unsigned - 256
         results.append(val_signed)
         dut._log.info(f"Read C[{i//2}][{i%2}] = {val_signed}")
     return results
 
+@cocotb.test()
+async def test_relu_transpose(dut):
+    dut._log.info("Start")
+    clock = Clock(dut.clk, 10, units="us")
+    cocotb.start_soon(clock.start())
+
+    # Reset
+    dut.ena.value = 1
+    dut.ui_in.value = 0
+    dut.uio_in.value = 0
+    dut.rst_n.value = 0
+    await ClockCycles(dut.clk, 5)
+    dut.rst_n.value = 1
+    await ClockCycles(dut.clk, 5)
+
+    A = [5, -6, 7, 8]  # row-major
+    B = [8, 9, 6, 8]  # row-major: [B00, B01, B10, B11]
+
+    await load_matrix(dut, A, sel=0)
+    await load_matrix(dut, B, sel=1)
+
+    expected = get_expected_matmul(A, B, transpose=False, relu=True)
+    results = await read_signed_output(dut, transpose=0, relu=1)
+
+    for i in range(4):
+        assert results[i] == expected[i], f"C[{i//2}][{i%2}] = {results[i]} != expected {expected[i]}"
+
+    dut._log.info("First part passed")
+
+    A = [1, 2, 3, 4]
+    B = [5, 6, 7, 8]
+
+    await load_matrix(dut, A, sel=0)
+    await load_matrix(dut, B, sel=1)
+
+    expected = get_expected_matmul(A, B, transpose=True, relu=True)
+    results = await read_signed_output(dut, transpose=1, relu=1)
+
+    for i in range(4):
+        assert results[i] == expected[i], f"C[{i//2}][{i%2}] = {results[i]} != expected {expected[i]}"
+
+    dut._log.info("ReLU + Transpose test passed!")
+
 @cocotb.test()
 async def test_numeric_limits(dut):
     dut._log.info("Start")
@@ -54,7 +108,25 @@ async def test_numeric_limits(dut):
     await ClockCycles(dut.clk, 5)
 
     A = [5, -6, 7, 8]  # row-major
-    B = [8, 9, 6, -7]  # row-major: [B00, B01, B10, B11]
+    B = [8, 12, 9, -7]  # row-major: [B00, B01, B10, B11]
+
+    await load_matrix(dut, A, sel=0)
+    await load_matrix(dut, B, sel=1)
+
+    expected = get_expected_matmul(A, B)
+    results = []
+
+    # Wait for systolic array to compute
+    
+    results = await read_signed_output(dut)
+
+    for i in range(4):
+        assert results[i] == expected[i], f"C[{i//2}][{i%2}] = {results[i]} != expected {expected[i]}"
+
+    dut._log.info("Passed large positive values")
+
+    A = [5, -6, 7, 8]  # row-major
+    B = [8, -12, 9, -7]  # row-major: [B00, B01, B10, B11]
 
     await load_matrix(dut, A, sel=0)
     await load_matrix(dut, B, sel=1)