diff --git a/crates/core/src/aes.rs b/crates/core/src/aes.rs
index ce6f0397..cb20ea81 100644
--- a/crates/core/src/aes.rs
+++ b/crates/core/src/aes.rs
@@ -1,5 +1,7 @@
 //! Fixed-key AES cipher
 
+use std::sync::OnceLock;
+
 use aes::Aes128Enc;
 use cipher::{BlockCipherEncrypt, KeyInit};
 use once_cell::sync::Lazy;
@@ -12,13 +14,19 @@ pub const FIXED_KEY: [u8; 16] = [
 ];
 
 /// Fixed-key AES cipher
-pub static FIXED_KEY_AES: Lazy<FixedKeyAes> = Lazy::new(|| FixedKeyAes {
-    aes: Aes128Enc::new_from_slice(&FIXED_KEY).unwrap(),
-});
+pub static FIXED_KEY_AES: Lazy<FixedKeyAes> = Lazy::new(|| FixedKeyAes::new(FIXED_KEY));
 
-/// Fixed-key AES cipher
+/// Fixed-key AES cipher.
+///
+/// Provides correlation-robust hash functions (CR, CCR, TCCR) and the
+/// RTCCR hash from "Three Halves Make a Whole" (Rosulek & Roy, 2021).
+///
+/// The RTCCR universal hash coefficient is lazily initialized on first use.
 pub struct FixedKeyAes {
     aes: Aes128Enc,
+    /// Lazily computed universal hash coefficient for RTCCR.
+    /// Derived by encrypting zero: u = AES_k(0).
+    u: OnceLock<Block>,
 }
 
 impl FixedKeyAes {
@@ -26,6 +34,116 @@ impl FixedKeyAes {
     pub fn new(key: [u8; 16]) -> Self {
         Self {
             aes: Aes128Enc::new(&key.into()),
+            u: OnceLock::new(),
+        }
+    }
+
+    /// Get or compute the universal hash coefficient for RTCCR.
+    ///
+    /// Lazily derives u = AES_k(0) on first call.
+    ///
+    /// # Universal Hash Implementation Note
+    ///
+    /// The paper (Section 5, Page 9) specifies U(τ) = (u₁·τ_L) ‖ (u₂·τ_R) using
+    /// two independent GF(2⁶⁴) multiplications. We instead use a single
+    /// GF(2¹²⁸) multiplication: U(τ) = u · τ in GF(2¹²⁸).
+    ///
+    /// Rationale:
+    /// - GF(2¹²⁸) multiplication uses hardware CLMUL instructions (~10-20x
+    ///   faster)
+    /// - A single field multiplication is still a valid universal hash function
+    /// - GF(2¹²⁸) provides better mixing than two independent GF(2⁶⁴)
+    ///   operations
+    /// - The security proof only requires U to be universal, not the specific
+    ///   construction
+    #[inline]
+    fn u(&self) -> Block {
+        *self.u.get_or_init(|| {
+            let mut u = Block::new([0u8; 16]);
+            self.aes.encrypt_block(u.as_array_mut());
+            u
+        })
+    }
+
+    /// Compute universal hash U(τ) = u · τ in GF(2¹²⁸)
+    #[inline]
+    fn universal_hash(&self, tweak: Block) -> Block {
+        self.u().gfmul(tweak)
+    }
+
+    /// Randomized tweakable circular correlation-robust hash function (RTCCR).
+    ///
+    /// From "Three Halves Make a Whole" (Rosulek & Roy, 2021):
+    /// <https://eprint.iacr.org/2021/749>
+    ///
+    /// `H(X, τ) = AES_k(X ⊕ U(τ)) ⊕ σ(X ⊕ U(τ))`
+    ///
+    /// Where U(τ) is a universal hash function.
+    #[inline]
+    pub fn rtccr(&self, tweak: Block, block: Block) -> Block {
+        // σ(X) = α·X in GF(2^128), where α = 0x87.
+        // See rtccr_many for detailed documentation on the choice of α.
+        const ALPHA: Block = Block::new([0x87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        #[inline]
+        fn sigma(x: Block) -> Block {
+            x.gfmul(ALPHA)
+        }
+
+        let u_tweak = self.universal_hash(tweak);
+        let tweaked = block ^ u_tweak;
+        let mut encrypted = tweaked;
+        self.aes.encrypt_block(encrypted.as_array_mut());
+        encrypted ^ sigma(tweaked)
+    }
+
+    /// Randomized tweakable circular correlation-robust hash function (RTCCR) -
+    /// batch version.
+    ///
+    /// From "Three Halves Make a Whole" (Rosulek & Roy, 2021):
+    /// <https://eprint.iacr.org/2021/749>
+    ///
+    /// `H(X, τ) = AES_k(X ⊕ U(τ)) ⊕ σ(X ⊕ U(τ))`
+    ///
+    /// Where U(τ) = u · τ in GF(2¹²⁸) is a universal hash function.
+    ///
+    /// # Arguments
+    ///
+    /// * `tweaks` - The tweaks to use for each block.
+    /// * `blocks` - The blocks to hash in-place.
+    #[inline]
+    pub fn rtccr_many<const N: usize>(&self, tweaks: &[Block; N], blocks: &mut [Block; N]) {
+        // The paper (Section 5) requires α ∈ GF(2^64) \ GF(2²), meaning α must
+        // not be in the subfield GF(4) = {0, 1, β, β+1} where β² + β + 1 = 0.
+        // Elements in GF(4) have multiplicative order dividing 3, which would
+        // make σ³ = identity and break circular correlation robustness.
+        //
+        // We use α = 0x87 (the GCM polynomial constant) in GF(2^128):
+        // - Much higher multiplicative order than minimal choices
+        // - Better security margin against attacks not covered by the proof
+        // - Uses hardware-accelerated CLMUL instructions
+        // - 0x87 is well-studied from GCM/GHASH
+        const ALPHA: Block = Block::new([0x87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+
+        #[inline]
+        fn sigma(x: Block) -> Block {
+            // RTCCR sigma function: σ(X) = α·X in GF(2^128)
+            x.gfmul(ALPHA)
+        }
+
+        // Compute X ⊕ U(τ) for all blocks
+        for (block, tweak) in blocks.iter_mut().zip(tweaks.iter()) {
+            *block ^= self.universal_hash(*tweak);
+        }
+
+        // Store σ(X ⊕ U(τ)) in buf before encryption overwrites blocks
+        let sigma_buf: [Block; N] = std::array::from_fn(|i| sigma(blocks[i]));
+
+        // Encrypt all tweaked blocks: AES_k(X ⊕ U(τ))
+        self.aes.encrypt_blocks(Block::as_array_mut_slice(blocks));
+
+        // XOR with sigma: AES_k(X ⊕ U(τ)) ⊕ σ(X ⊕ U(τ))
+        for (block, sigma) in blocks.iter_mut().zip(sigma_buf.iter()) {
+            *block ^= *sigma;
         }
     }
 
@@ -220,3 +338,179 @@ fn aes_test() {
         ]
     );
 }
+
+#[cfg(test)]
+mod rtccr_tests {
+    use super::*;
+
+    /// Test that rtccr and rtccr_many produce identical results
+    #[test]
+    fn rtccr_single_vs_batch() {
+        let key = [1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let aes = FixedKeyAes::new(key);
+
+        let tweak = Block::new([0xAB; 16]);
+        let block = Block::new([0xCD; 16]);
+
+        // Single call
+        let single_result = aes.rtccr(tweak, block);
+
+        // Batch call with 1 element
+        let mut blocks = [block];
+        aes.rtccr_many(&[tweak], &mut blocks);
+
+        assert_eq!(
+            single_result, blocks[0],
+            "Single and batch RTCCR should match"
+        );
+    }
+
+    /// Test that rtccr_many processes multiple blocks correctly
+    #[test]
+    fn rtccr_many_multiple_blocks() {
+        let key = [42u8; 16];
+        let aes = FixedKeyAes::new(key);
+
+        let tweaks = [
+            Block::new([1u8; 16]),
+            Block::new([2u8; 16]),
+            Block::new([3u8; 16]),
+            Block::new([4u8; 16]),
+        ];
+        let blocks_original = [
+            Block::new([0x10; 16]),
+            Block::new([0x20; 16]),
+            Block::new([0x30; 16]),
+            Block::new([0x40; 16]),
+        ];
+
+        // Compute individually
+        let expected: [Block; 4] =
+            std::array::from_fn(|i| aes.rtccr(tweaks[i], blocks_original[i]));
+
+        // Compute in batch
+        let mut blocks = blocks_original;
+        aes.rtccr_many(&tweaks, &mut blocks);
+
+        assert_eq!(blocks, expected, "Batch should match individual calls");
+    }
+
+    /// Test that universal hash produces different outputs for different tweaks
+    #[test]
+    fn universal_hash_different_tweaks() {
+        let key = [0x55u8; 16];
+        let aes = FixedKeyAes::new(key);
+
+        let block = Block::new([0xAA; 16]);
+        let tweak1 = Block::new([1u8; 16]);
+        let tweak2 = Block::new([2u8; 16]);
+
+        let result1 = aes.rtccr(tweak1, block);
+        let result2 = aes.rtccr(tweak2, block);
+
+        assert_ne!(
+            result1, result2,
+            "Different tweaks should produce different outputs"
+        );
+    }
+
+    /// Test that RTCCR is deterministic
+    #[test]
+    fn rtccr_deterministic() {
+        let key = [0x77u8; 16];
+        let aes = FixedKeyAes::new(key);
+
+        let tweak = Block::new([0x11; 16]);
+        let block = Block::new([0x22; 16]);
+
+        let result1 = aes.rtccr(tweak, block);
+        let result2 = aes.rtccr(tweak, block);
+
+        assert_eq!(result1, result2, "RTCCR should be deterministic");
+    }
+
+    /// Test that the hash coefficient u is derived consistently from the same key
+    #[test]
+    fn universal_hash_key_derivation() {
+        let key = [0x99u8; 16];
+
+        let aes1 = FixedKeyAes::new(key);
+        let aes2 = FixedKeyAes::new(key);
+
+        // Both should produce same results
+        let tweak = Block::new([0xBB; 16]);
+        let block = Block::new([0xCC; 16]);
+
+        assert_eq!(
+            aes1.rtccr(tweak, block),
+            aes2.rtccr(tweak, block),
+            "Same key should produce an identical coefficient u"
+        );
+    }
+
+    /// Test that different keys produce different RTCCR outputs
+    #[test]
+    fn universal_hash_different_keys() {
+        let key1 = [0x11u8; 16];
+        let key2 = [0x22u8; 16];
+
+        let aes1 = FixedKeyAes::new(key1);
+        let aes2 = FixedKeyAes::new(key2);
+
+        let tweak = Block::new([0xDD; 16]);
+        let block = Block::new([0xEE; 16]);
+
+        assert_ne!(
+            aes1.rtccr(tweak, block),
+            aes2.rtccr(tweak, block),
+            "Different keys should produce different RTCCR outputs"
+        );
+    }
+
+    /// Test that sigma uses α = 0x87 correctly
+    #[test]
+    fn sigma_alpha_properties() {
+        // Local sigma for testing (matches the one inside rtccr/rtccr_many)
+        const ALPHA: Block = Block::new([0x87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        fn sigma(x: Block) -> Block {
+            x.gfmul(ALPHA)
+        }
+
+        // σ(0) = 0
+        assert_eq!(sigma(Block::ZERO), Block::ZERO);
+
+        // σ(1) = 0x87
+        let one = Block::new([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let expected = Block::new([0x87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        assert_eq!(sigma(one), expected);
+
+        // Verify α = 0x87 is not in GF(4) by checking σ³(1) = α³ ≠ 1
+        // (nonzero elements of GF(4) satisfy x³ = 1, equivalently x⁴ = x)
+        let sigma_one = sigma(one);
+        let sigma_cubed = sigma(sigma(sigma_one));
+        assert_ne!(sigma_cubed, one, "α = 0x87 should not be in GF(4)");
+    }
+
+    /// Test that sigma is linear: σ(A ⊕ B) = σ(A) ⊕ σ(B)
+    #[test]
+    fn sigma_linearity() {
+        const ALPHA: Block = Block::new([0x87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        fn sigma(x: Block) -> Block {
+            x.gfmul(ALPHA)
+        }
+
+        let a = Block::new([
+            0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66,
+            0x77, 0x88,
+        ]);
+        let b = Block::new([
+            0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF,
+            0x00, 0x11,
+        ]);
+
+        let sigma_xor = sigma(a ^ b);
+        let xor_sigma = sigma(a) ^ sigma(b);
+
+        assert_eq!(sigma_xor, xor_sigma, "σ should be linear");
+    }
+}
diff --git a/crates/garble-core/Cargo.toml b/crates/garble-core/Cargo.toml
index ea73bb9f..c1327006 100644
--- a/crates/garble-core/Cargo.toml
+++ b/crates/garble-core/Cargo.toml
@@ -30,6 +30,7 @@ mpz-vm-core = { workspace = true }
 aes = { workspace = true, features = [] }
 bitvec = { workspace = true, features = ["serde"] }
 blake3 = { workspace = true, features = ["serde"] }
+bytemuck = { workspace = true }
 cfg-if = { workspace = true }
 cipher = { workspace = true }
 derive_builder = { workspace = true }
@@ -56,9 +57,17 @@ criterion = { workspace = true }
 pretty_assertions = { workspace = true }
 
 [[bench]]
-name = "garble"
+name = "garbler_hg"
 harness = false
 
 [[bench]]
-name = "evaluate"
+name = "garbler_th"
+harness = false
+
+[[bench]]
+name = "evaluator_hg"
+harness = false
+
+[[bench]]
+name = "evaluator_th"
 harness = false
diff --git a/crates/garble-core/benches/evaluate.rs b/crates/garble-core/benches/evaluator_hg.rs
similarity index 98%
rename from crates/garble-core/benches/evaluate.rs
rename to crates/garble-core/benches/evaluator_hg.rs
index 4b28a03b..7f5dd4cd 100644
--- a/crates/garble-core/benches/evaluate.rs
+++ b/crates/garble-core/benches/evaluator_hg.rs
@@ -1,6 +1,6 @@
 //! Benchmarks for half-gates evaluation.
 //!
-//! Run with: `cargo bench -p mpz-garble-core --bench evaluate`
+//! Run with: `cargo bench -p mpz-garble-core --bench evaluator_hg`
 
 use std::sync::Arc;
 
diff --git a/crates/garble-core/benches/evaluator_th.rs b/crates/garble-core/benches/evaluator_th.rs
new file mode 100644
index 00000000..85ad2582
--- /dev/null
+++ b/crates/garble-core/benches/evaluator_th.rs
@@ -0,0 +1,183 @@
+//! Benchmarks for three-halves evaluation.
+//!
+//! Run with: `cargo bench -p mpz-garble-core --bench evaluator_th`
+
+use std::sync::Arc;
+
+use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
+use mpz_circuits::{AES128, Circuit};
+use mpz_core::Block;
+use mpz_garble_core::three_halves::{
+    Evaluator, GarbledCircuit, Garbler, evaluate_garbled_circuits,
+};
+use mpz_garble_core::Key;
+use mpz_memory_core::correlated::{Delta, Mac};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+// Gate count thresholds
+const THRESHOLDS: &[(u64, &str)] = &[(100_000, "100K"), (1_000_000, "1M"), (10_000_000, "10M")];
+
+fn bench_evaluate(c: &mut Criterion) {
+    let mut group = c.benchmark_group("evaluate");
+    group.sample_size(10);
+    let circuit = &*AES128;
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+
+    // Prepare inputs (arbitrary Keys)
+    let inputs: Vec<Key> = (0..256)
+        .map(|_| {
+            let block: Block = rng.random();
+            block.into()
+        })
+        .collect();
+
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+
+    // Compute evaluator MACs from Keys
+    let delta_block = *delta.as_block();
+    let eval_inputs: Vec<Mac> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(key, &choice)| {
+            let key_block = *key.as_block();
+            if choice {
+                (key_block ^ delta_block).into()
+            } else {
+                key_block.into()
+            }
+        })
+        .collect();
+
+    let gates_per_circuit = circuit.and_count() as u64;
+
+    for &(threshold, name) in THRESHOLDS {
+        let iterations = threshold.div_ceil(gates_per_circuit) as usize;
+        let actual_gates = iterations as u64 * gates_per_circuit;
+
+        // Pre-generate garbled circuits (single gates)
+        let mut gb = Garbler::default();
+        let all_gates: Vec<Vec<_>> = (0..iterations)
+            .map(|_| {
+                let mut garble_rng = StdRng::seed_from_u64(42);
+                let mut iter = gb.generate(circuit, delta, &inputs, &mut garble_rng).unwrap();
+                let gates: Vec<_> = iter.by_ref().collect();
+                let _ = iter.finish().unwrap();
+                gates
+            })
+            .collect();
+
+        group.throughput(Throughput::Elements(actual_gates));
+
+        // Iterator-based (one gate at a time)
+        group.bench_function(BenchmarkId::new("iter", name), |b| {
+            let mut ev = Evaluator::default();
+            b.iter(|| {
+                for gates in &all_gates {
+                    let mut consumer = ev.evaluate(circuit, &eval_inputs).unwrap();
+                    for gate in gates {
+                        consumer.next(*gate);
+                    }
+                    black_box(consumer.finish().unwrap());
+                }
+            })
+        });
+
+        // Batched (multiple gates at a time)
+        // Note: EncryptedGateBatch doesn't implement Clone, so we regenerate per
+        // iteration
+        group.bench_function(BenchmarkId::new("batched", name), |b| {
+            let mut ev = Evaluator::default();
+            let mut gb = Garbler::default();
+            b.iter(|| {
+                for _ in 0..iterations {
+                    // Regenerate batches (not timed separately, but included in measurement)
+                    let mut garble_rng = StdRng::seed_from_u64(42);
+                    let mut iter = gb.generate_batched(circuit, delta, &inputs, &mut garble_rng).unwrap();
+                    let batches: Vec<_> = iter.by_ref().collect();
+                    let _ = iter.finish().unwrap();
+
+                    let mut consumer = ev.evaluate_batched(circuit, &eval_inputs).unwrap();
+                    for batch in batches {
+                        consumer.next(batch);
+                    }
+                    black_box(consumer.finish().unwrap());
+                }
+            })
+        });
+    }
+
+    group.finish();
+}
+
+fn bench_evaluate_parallel(c: &mut Criterion) {
+    let mut group = c.benchmark_group("evaluate_parallel");
+    group.sample_size(10);
+    let circuit: Arc<Circuit> = AES128.clone();
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+
+    // Prepare inputs (arbitrary Keys)
+    let inputs: Vec<Key> = (0..256)
+        .map(|_| {
+            let block: Block = rng.random();
+            block.into()
+        })
+        .collect();
+
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+
+    // Compute evaluator MACs from Keys
+    let delta_block = *delta.as_block();
+    let eval_inputs: Vec<Mac> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(key, &choice)| {
+            let key_block = *key.as_block();
+            if choice {
+                (key_block ^ delta_block).into()
+            } else {
+                key_block.into()
+            }
+        })
+        .collect();
+
+    let gates_per_circuit = circuit.and_count() as u64;
+
+    for &(threshold, name) in THRESHOLDS {
+        let circuit_count = threshold.div_ceil(gates_per_circuit) as usize;
+        let actual_gates = circuit_count as u64 * gates_per_circuit;
+
+        // Pre-garble circuits
+        let mut gb = Garbler::default();
+        let garbled_circuits: Vec<GarbledCircuit> = (0..circuit_count)
+            .map(|_| {
+                let mut garble_rng = StdRng::seed_from_u64(42);
+                let mut iter = gb.generate(&circuit, delta, &inputs, &mut garble_rng).unwrap();
+                let gates: Vec<_> = iter.by_ref().collect();
+                let _ = iter.finish().unwrap();
+                GarbledCircuit { gates }
+            })
+            .collect();
+
+        group.throughput(Throughput::Elements(actual_gates));
+
+        // Parallel evaluation using evaluate_garbled_circuits (uses rayon par_iter)
+        group.bench_function(BenchmarkId::new("rayon", name), |b| {
+            b.iter(|| {
+                let circs: Vec<_> = garbled_circuits
+                    .iter()
+                    .map(|gc| (circuit.clone(), eval_inputs.clone(), gc.clone()))
+                    .collect();
+                black_box(evaluate_garbled_circuits(circs).unwrap())
+            })
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_evaluate, bench_evaluate_parallel);
+criterion_main!(benches);
diff --git a/crates/garble-core/benches/garble.rs b/crates/garble-core/benches/garble.rs
deleted file mode 100644
index 37949ffc..00000000
--- a/crates/garble-core/benches/garble.rs
+++ /dev/null
@@ -1,95 +0,0 @@
-//! Benchmarks for half-gates garbling.
-//!
-//! Run with: `cargo bench -p mpz-garble-core --bench garble`
-
-use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
-use mpz_circuits::AES128;
-use mpz_garble_core::{Evaluator, Garbler, Key};
-use mpz_memory_core::correlated::Delta;
-use rand::{Rng, SeedableRng, rngs::StdRng};
-
-// Gate count thresholds
-const THRESHOLDS: &[(u64, &str)] = &[(100_000, "100K"), (1_000_000, "1M"), (10_000_000, "10M")];
-
-fn bench_garble(c: &mut Criterion) {
-    let mut group = c.benchmark_group("garble");
-    group.sample_size(10);
-    let circuit = &*AES128;
-
-    let mut rng = StdRng::seed_from_u64(0);
-    let delta = Delta::random(&mut rng);
-    let inputs: Vec<Key> = (0..256).map(|_| rng.random()).collect();
-    let seed: [u8; 16] = rng.random();
-
-    let gates_per_circuit = circuit.and_count() as u64;
-
-    for &(threshold, name) in THRESHOLDS {
-        let iterations = threshold.div_ceil(gates_per_circuit) as usize;
-        let actual_gates = iterations as u64 * gates_per_circuit;
-
-        group.throughput(Throughput::Elements(actual_gates));
-
-        // Iterator-based (one gate at a time)
-        group.bench_function(BenchmarkId::new("iter", name), |b| {
-            b.iter(|| {
-                for _ in 0..iterations {
-                    let mut gb = Garbler::new(seed, delta);
-                    let _ = gb.setup().unwrap();
-                    let mut iter = gb.generate(circuit, &inputs).unwrap();
-                    let _: Vec<_> = iter.by_ref().collect();
-                    black_box(iter.finish().unwrap());
-                }
-            })
-        });
-
-        // Batched (multiple gates at a time)
-        group.bench_function(BenchmarkId::new("batched", name), |b| {
-            b.iter(|| {
-                for _ in 0..iterations {
-                    let mut gb = Garbler::new(seed, delta);
-                    let _ = gb.setup().unwrap();
-                    let mut iter = gb.generate_batched(circuit, &inputs).unwrap();
-                    let _: Vec<_> = iter.by_ref().collect();
-                    black_box(iter.finish().unwrap());
-                }
-            })
-        });
-    }
-
-    group.finish();
-
-    // Evaluator benchmarks
-    let mut ev_group = c.benchmark_group("evaluate");
-
-    ev_group.bench_function("aes128", |b| {
-        let mut gb = Garbler::new(seed, delta);
-        let setup = gb.setup().unwrap();
-        let mut gb_iter = gb.generate(&AES128, &inputs).unwrap();
-        let gates: Vec<_> = gb_iter.by_ref().collect();
-
-        let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
-        let inputs: Vec<_> = inputs
-            .iter()
-            .zip(choices)
-            .map(|(input, choice)| input.auth(choice, &delta))
-            .collect();
-
-        b.iter(|| {
-            let setup = setup.clone();
-            let mut ev = Evaluator::default();
-            ev.setup(setup).unwrap();
-            let mut ev_consumer = ev.evaluate(&AES128, &inputs).unwrap();
-
-            for gate in &gates {
-                ev_consumer.next(*gate);
-            }
-
-            black_box(ev_consumer.finish().unwrap());
-        })
-    });
-
-    ev_group.finish();
-}
-
-criterion_group!(benches, bench_garble);
-criterion_main!(benches);
diff --git a/crates/garble-core/benches/garbler_hg.rs b/crates/garble-core/benches/garbler_hg.rs
new file mode 100644
index 00000000..9852a22d
--- /dev/null
+++ b/crates/garble-core/benches/garbler_hg.rs
@@ -0,0 +1,171 @@
+//! Benchmarks for half-gates garbling scheme.
+//!
+//! Run with: `cargo bench -p mpz-garble-core --bench garbler_hg`
+
+use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main};
+use mpz_circuits::AES128;
+use mpz_garble_core::{Key, half_gates};
+use mpz_memory_core::correlated::Delta;
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+/// Benchmark single AES circuit garbling
+fn bench_garble_aes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("garble_aes128");
+    group.throughput(Throughput::Elements(AES128.and_count() as u64));
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+
+    let inputs: Vec<Key> = (0..256).map(|_| rng.random()).collect();
+
+    group.bench_function("half_gates", |b| {
+        b.iter(|| {
+            let mut gb = half_gates::Garbler::new(seed, delta);
+            let _ = gb.setup().unwrap();
+            let mut iter = gb.generate(&AES128, &inputs).unwrap();
+            let _: Vec<_> = iter.by_ref().collect();
+            black_box(iter.finish().unwrap())
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmark single AES circuit evaluation
+fn bench_evaluate_aes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("evaluate_aes128");
+    group.throughput(Throughput::Elements(AES128.and_count() as u64));
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+
+    let inputs: Vec<Key> = (0..256).map(|_| rng.random()).collect();
+    let mut gb = half_gates::Garbler::new(seed, delta);
+    let setup = gb.setup().unwrap();
+    let mut iter = gb.generate(&AES128, &inputs).unwrap();
+    let gates: Vec<_> = iter.by_ref().collect();
+    let _ = iter.finish().unwrap();
+
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+    let eval_inputs: Vec<_> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(k, &c)| k.auth(c, &delta))
+        .collect();
+
+    group.bench_function("half_gates", |b| {
+        b.iter(|| {
+            let mut ev = half_gates::Evaluator::default();
+            ev.setup(setup.clone()).unwrap();
+            let mut consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+            for gate in &gates {
+                consumer.next(*gate);
+            }
+            black_box(consumer.finish().unwrap())
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmark 100 AES circuits (throughput test)
+fn bench_100_aes(c: &mut Criterion) {
+    const N: usize = 100;
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+
+    let inputs: Vec<Key> = (0..256).map(|_| rng.random()).collect();
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+    let eval_inputs: Vec<_> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(k, &c)| k.auth(c, &delta))
+        .collect();
+
+    // Pre-generate gates for evaluation benchmarks
+    let mut gb = half_gates::Garbler::new(seed, delta);
+    let setup = gb.setup().unwrap();
+    let all_gates: Vec<Vec<_>> = (0..N)
+        .map(|_| {
+            let mut iter = gb.generate(&AES128, &inputs).unwrap();
+            let gates: Vec<_> = iter.by_ref().collect();
+            let _ = iter.finish().unwrap();
+            gates
+        })
+        .collect();
+
+    // === Garble 100x ===
+    {
+        let mut group = c.benchmark_group("garble_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("half_gates", |b| {
+            b.iter(|| {
+                for _ in 0..N {
+                    let mut gb = half_gates::Garbler::new(seed, delta);
+                    let _ = gb.setup().unwrap();
+                    let mut iter = gb.generate(&AES128, &inputs).unwrap();
+                    let _: Vec<_> = iter.by_ref().collect();
+                    black_box(iter.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+
+    // === Evaluate 100x ===
+    {
+        let mut group = c.benchmark_group("evaluate_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("half_gates", |b| {
+            b.iter(|| {
+                for gates in &all_gates {
+                    let mut ev = half_gates::Evaluator::default();
+                    ev.setup(setup.clone()).unwrap();
+                    let mut consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+                    for gate in gates {
+                        consumer.next(*gate);
+                    }
+                    black_box(consumer.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+
+    // === Garble+Evaluate 100x ===
+    {
+        let mut group = c.benchmark_group("garble_and_evaluate_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("half_gates", |b| {
+            b.iter(|| {
+                for _ in 0..N {
+                    let mut gb = half_gates::Garbler::new(seed, delta);
+                    let setup = gb.setup().unwrap();
+                    let mut ev = half_gates::Evaluator::default();
+                    ev.setup(setup).unwrap();
+                    let mut gb_iter = gb.generate(&AES128, &inputs).unwrap();
+                    let mut ev_consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+                    for gate in gb_iter.by_ref() {
+                        ev_consumer.next(gate);
+                    }
+                    black_box(gb_iter.finish().unwrap());
+                    black_box(ev_consumer.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, bench_garble_aes, bench_evaluate_aes, bench_100_aes);
+criterion_main!(benches);
diff --git a/crates/garble-core/benches/garbler_th.rs b/crates/garble-core/benches/garbler_th.rs
new file mode 100644
index 00000000..612bcb8a
--- /dev/null
+++ b/crates/garble-core/benches/garbler_th.rs
@@ -0,0 +1,214 @@
+//! Benchmarks for three-halves garbling scheme.
+//!
+//! Run with: `cargo bench -p mpz-garble-core --bench garbler_th`
+
+use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main};
+use mpz_circuits::AES128;
+use mpz_core::Block;
+use mpz_garble_core::{Key, three_halves};
+use mpz_memory_core::correlated::{Delta, Mac};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+/// Benchmark single AES circuit garbling
+fn bench_garble_aes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("garble_aes128");
+    group.throughput(Throughput::Elements(AES128.and_count() as u64));
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+
+    // Three-halves inputs (arbitrary Keys)
+    let inputs: Vec<Key> = (0..256)
+        .map(|_| {
+            let block: Block = rng.random();
+            block.into()
+        })
+        .collect();
+
+    group.bench_function("three_halves", |b| {
+        let mut gb = three_halves::Garbler::default();
+        b.iter(|| {
+            let mut bench_rng = StdRng::seed_from_u64(42);
+            let mut iter = gb
+                .generate(&AES128, delta, &inputs, &mut bench_rng)
+                .unwrap();
+            let _: Vec<_> = iter.by_ref().collect();
+            black_box(iter.finish().unwrap())
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmark single AES circuit evaluation
+fn bench_evaluate_aes(c: &mut Criterion) {
+    let mut group = c.benchmark_group("evaluate_aes128");
+    group.throughput(Throughput::Elements(AES128.and_count() as u64));
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+
+    // Three-halves setup
+    let inputs: Vec<Key> = (0..256)
+        .map(|_| {
+            let block: Block = rng.random();
+            block.into()
+        })
+        .collect();
+
+    let mut gb = three_halves::Garbler::default();
+    let mut setup_rng = StdRng::seed_from_u64(42);
+    let mut iter = gb
+        .generate(&AES128, delta, &inputs, &mut setup_rng)
+        .unwrap();
+    let gates: Vec<_> = iter.by_ref().collect();
+    let _ = iter.finish().unwrap();
+
+    // Compute evaluator inputs from keys
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+    let delta_block = *delta.as_block();
+    let eval_inputs: Vec<Mac> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(key, &choice)| {
+            let key_block = *key.as_block();
+            if choice {
+                (key_block ^ delta_block).into()
+            } else {
+                key_block.into()
+            }
+        })
+        .collect();
+
+    group.bench_function("three_halves", |b| {
+        let mut ev = three_halves::Evaluator::default();
+        b.iter(|| {
+            let mut consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+            for gate in &gates {
+                consumer.next(gate.clone());
+            }
+            black_box(consumer.finish().unwrap())
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmark 100 AES circuits (throughput test)
+fn bench_100_aes(c: &mut Criterion) {
+    const N: usize = 100;
+
+    let mut rng = StdRng::seed_from_u64(0);
+    let delta = Delta::random(&mut rng);
+
+    // Three-halves inputs
+    let inputs: Vec<Key> = (0..256)
+        .map(|_| {
+            let block: Block = rng.random();
+            block.into()
+        })
+        .collect();
+
+    // Compute evaluator inputs from keys
+    let choices: Vec<bool> = (0..256).map(|_| rng.random()).collect();
+    let delta_block = *delta.as_block();
+    let eval_inputs: Vec<Mac> = inputs
+        .iter()
+        .zip(&choices)
+        .map(|(key, &choice)| {
+            let key_block = *key.as_block();
+            if choice {
+                (key_block ^ delta_block).into()
+            } else {
+                key_block.into()
+            }
+        })
+        .collect();
+
+    // Pre-generate gates for evaluation benchmarks
+    let mut gb = three_halves::Garbler::default();
+    let all_gates: Vec<Vec<_>> = (0..N)
+        .map(|_| {
+            let mut bench_rng = StdRng::seed_from_u64(42);
+            let mut iter = gb
+                .generate(&AES128, delta, &inputs, &mut bench_rng)
+                .unwrap();
+            let gates: Vec<_> = iter.by_ref().collect();
+            let _ = iter.finish().unwrap();
+            gates
+        })
+        .collect();
+
+    // === Garble 100x ===
+    {
+        let mut group = c.benchmark_group("garble_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("three_halves", |b| {
+            let mut gb = three_halves::Garbler::default();
+            b.iter(|| {
+                for _ in 0..N {
+                    let mut bench_rng = StdRng::seed_from_u64(42);
+                    let mut iter = gb
+                        .generate(&AES128, delta, &inputs, &mut bench_rng)
+                        .unwrap();
+                    let _: Vec<_> = iter.by_ref().collect();
+                    black_box(iter.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+
+    // === Evaluate 100x ===
+    {
+        let mut group = c.benchmark_group("evaluate_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("three_halves", |b| {
+            let mut ev = three_halves::Evaluator::default();
+            b.iter(|| {
+                for gates in &all_gates {
+                    let mut consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+                    for gate in gates {
+                        consumer.next(gate.clone());
+                    }
+                    black_box(consumer.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+
+    // === Garble+Evaluate 100x ===
+    {
+        let mut group = c.benchmark_group("garble_and_evaluate_100x_aes128");
+        group.throughput(Throughput::Elements((N * AES128.and_count()) as u64));
+
+        group.bench_function("three_halves", |b| {
+            let mut gb = three_halves::Garbler::default();
+            let mut ev = three_halves::Evaluator::default();
+            b.iter(|| {
+                for _ in 0..N {
+                    let mut bench_rng = StdRng::seed_from_u64(42);
+                    let mut gb_iter = gb
+                        .generate(&AES128, delta, &inputs, &mut bench_rng)
+                        .unwrap();
+                    let mut ev_consumer = ev.evaluate(&AES128, &eval_inputs).unwrap();
+                    for gate in gb_iter.by_ref() {
+                        ev_consumer.next(gate);
+                    }
+                    black_box(gb_iter.finish().unwrap());
+                    black_box(ev_consumer.finish().unwrap());
+                }
+            })
+        });
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, bench_garble_aes, bench_evaluate_aes, bench_100_aes);
+criterion_main!(benches);
diff --git a/crates/garble-core/src/circuit.rs b/crates/garble-core/src/half_gates/circuit.rs
similarity index 100%
rename from crates/garble-core/src/circuit.rs
rename to crates/garble-core/src/half_gates/circuit.rs
diff --git a/crates/garble-core/src/evaluator.rs b/crates/garble-core/src/half_gates/evaluator.rs
similarity index 99%
rename from crates/garble-core/src/evaluator.rs
rename to crates/garble-core/src/half_gates/evaluator.rs
index 7478a141..a01f8cfb 100644
--- a/crates/garble-core/src/evaluator.rs
+++ b/crates/garble-core/src/half_gates/evaluator.rs
@@ -4,9 +4,9 @@ use std::{marker::PhantomData, ops::Range, sync::Arc};
 use cfg_if::cfg_if;
 use mpz_memory_core::correlated::Mac;
 
-use crate::{
-    DEFAULT_BATCH_SIZE, EncryptedGateBatch, GarbledCircuit, SetupMsg, circuit::EncryptedGate,
-};
+use crate::{DEFAULT_BATCH_SIZE, SetupMsg};
+
+use super::circuit::{EncryptedGate, EncryptedGateBatch, GarbledCircuit};
 use mpz_circuits::{Circuit, Gate};
 use mpz_core::{Block, aes::FixedKeyAes};
 
diff --git a/crates/garble-core/src/garbler.rs b/crates/garble-core/src/half_gates/garbler.rs
similarity index 99%
rename from crates/garble-core/src/garbler.rs
rename to crates/garble-core/src/half_gates/garbler.rs
index e66e850a..9f9a487c 100644
--- a/crates/garble-core/src/garbler.rs
+++ b/crates/garble-core/src/half_gates/garbler.rs
@@ -1,7 +1,9 @@
 use core::fmt;
 use std::{marker::PhantomData, ops::Range};
 
-use crate::{DEFAULT_BATCH_SIZE, EncryptedGateBatch, SetupMsg, circuit::EncryptedGate};
+use crate::{DEFAULT_BATCH_SIZE, SetupMsg};
+
+use super::circuit::{EncryptedGate, EncryptedGateBatch};
 use mpz_circuits::{Circuit, Gate};
 use mpz_core::{Block, aes::FixedKeyAes};
 use mpz_memory_core::correlated::{Delta, Key};
diff --git a/crates/garble-core/src/half_gates/mod.rs b/crates/garble-core/src/half_gates/mod.rs
new file mode 100644
index 00000000..2d5b260c
--- /dev/null
+++ b/crates/garble-core/src/half_gates/mod.rs
@@ -0,0 +1,37 @@
+//! Half-Gates garbling scheme implementation
+//!
+//! This module implements the "Two Halves Make a Whole" garbling scheme from
+//! [Zahur, Rosulek, Evans 2015](https://eprint.iacr.org/2014/756).
+//!
+//! ## Overview
+//!
+//! The half-gates scheme reduces the size of garbled AND gates from 4
+//! ciphertexts (classical garbled circuits) to **2 ciphertexts** (2κ bits,
+//! where κ=128).
+//!
+//! ## Key Features
+//!
+//! - **AND gate size**: 2κ bits (32 bytes) per gate
+//! - **Free XOR**: XOR gates have zero cost (no ciphertext)
+//! - **Point-and-permute**: Fast evaluation using LSB color bits
+//!
+//! ## Architecture
+//!
+//! - [`Garbler`] - Generates garbled circuits and wire labels
+//! - [`Evaluator`] - Evaluates garbled circuits using active wire labels
+
+pub(crate) mod circuit;
+pub(crate) mod evaluator;
+pub(crate) mod garbler;
+
+#[cfg(test)]
+mod tests;
+
+pub use circuit::{EncryptedGate, EncryptedGateBatch, GarbledCircuit};
+pub use evaluator::{
+    EncryptedGateBatchConsumer, EncryptedGateConsumer, Evaluator, EvaluatorError, EvaluatorOutput,
+    EvaluatorWorker, evaluate_garbled_circuits,
+};
+pub use garbler::{
+    EncryptedGateBatchIter, EncryptedGateIter, Garbler, GarblerError, GarblerOutput, GarblerWorker,
+};
diff --git a/crates/garble-core/src/half_gates/tests.rs b/crates/garble-core/src/half_gates/tests.rs
new file mode 100644
index 00000000..6953b617
--- /dev/null
+++ b/crates/garble-core/src/half_gates/tests.rs
@@ -0,0 +1,272 @@
+//! End-to-end tests for the half-gates garbling scheme
+//!
+//! These tests verify that the complete half-gates implementation works
+//! correctly on AES128 and XOR-only circuits, covering both streaming
+//! evaluation and evaluation of preprocessed garbled circuits.
+
+use aes::{
+    Aes128,
+    cipher::{BlockCipherEncrypt, KeyInit},
+};
+use itybity::{FromBitIterator, IntoBitIterator, ToBits};
+use mpz_circuits::{AES128, circuits::xor};
+use mpz_core::{Block, aes::FIXED_KEY_AES};
+use mpz_memory_core::correlated::{Delta, Key};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+use rand_chacha::ChaCha12Rng;
+
+use super::{
+    Evaluator, EvaluatorOutput, Garbler, GarblerOutput, evaluate_garbled_circuits, evaluator as ev,
+    garbler as gb,
+};
+use crate::GarbledCircuit;
+
+/// Test a single AND gate with all 4 input combinations
+#[test]
+fn test_and_gate() {
+    let mut rng = ChaCha12Rng::seed_from_u64(0);
+    let cipher = &(*FIXED_KEY_AES);
+
+    let delta = Delta::random(&mut rng);
+    let x_0 = Block::random(&mut rng);
+    let x_1 = x_0 ^ delta.as_block();
+    let y_0 = Block::random(&mut rng);
+    let y_1 = y_0 ^ delta.as_block();
+    let gid: u128 = 1;
+
+    let (z_0, encrypted_gate) = gb::and_gate(cipher, &x_0, &y_0, &delta, gid);
+    let z_1 = z_0 ^ delta.as_block();
+
+    assert_eq!(ev::and_gate(cipher, &x_0, &y_0, &encrypted_gate, gid), z_0);
+    assert_eq!(ev::and_gate(cipher, &x_0, &y_1, &encrypted_gate, gid), z_0);
+    assert_eq!(ev::and_gate(cipher, &x_1, &y_0, &encrypted_gate, gid), z_0);
+    assert_eq!(ev::and_gate(cipher, &x_1, &y_1, &encrypted_gate, gid), z_1);
+}
+
+/// E2E test: Garble and evaluate AES128 circuit
+///
+/// This tests the complete half-gates scheme on a real circuit with 6800 AND
+/// gates. The garbler and evaluator communicate via streaming batches.
+#[test]
+fn test_garble() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let key = [69u8; 16];
+    let msg = [42u8; 16];
+
+    let expected: [u8; 16] = {
+        let cipher = Aes128::new_from_slice(&key).unwrap();
+        let mut out = msg.into();
+        cipher.encrypt_block(&mut out);
+        out.into()
+    };
+
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+    let input_keys = (0..AES128.inputs().len())
+        .map(|_| rng.random())
+        .collect::<Vec<Key>>();
+
+    let input_macs = input_keys
+        .iter()
+        .zip(key.iter().copied().chain(msg).into_iter_lsb0())
+        .map(|(key, bit)| key.auth(bit, &delta))
+        .collect::<Vec<_>>();
+
+    let mut gb = Garbler::new(seed, delta);
+    let setup = gb.setup().unwrap();
+    let mut ev = Evaluator::default();
+    ev.setup(setup).unwrap();
+
+    let mut gb_iter = gb.generate_batched(&AES128, &input_keys).unwrap();
+    let mut ev_consumer = ev.evaluate_batched(&AES128, &input_macs).unwrap();
+
+    for batch in gb_iter.by_ref() {
+        ev_consumer.next(batch);
+    }
+
+    let GarblerOutput {
+        outputs: output_keys,
+    } = gb_iter.finish().unwrap();
+    let EvaluatorOutput {
+        outputs: output_macs,
+    } = ev_consumer.finish().unwrap();
+
+    assert!(
+        output_keys
+            .iter()
+            .zip(&output_macs)
+            .zip(expected.iter_lsb0())
+            .all(|((key, mac), bit)| &key.auth(bit, &delta) == mac)
+    );
+
+    let output: Vec<u8> = Vec::from_lsb0_iter(
+        output_macs
+            .into_iter()
+            .zip(output_keys)
+            .map(|(mac, key)| mac.pointer() ^ key.pointer()),
+    );
+
+    assert_eq!(output, expected);
+}
+
+/// E2E test: Preprocessed circuit evaluation
+///
+/// Tests evaluating circuits that were garbled ahead of time by separate
+/// workers and stored, so that garbling is done before evaluation starts.
+#[test]
+fn test_garble_preprocessed() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let key = [69u8; 16];
+    let msg = [42u8; 16];
+
+    let expected: [u8; 16] = {
+        let cipher = Aes128::new_from_slice(&key).unwrap();
+        let mut out = msg.into();
+        cipher.encrypt_block(&mut out);
+        out.into()
+    };
+
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+    let input_keys = (0..AES128.inputs().len())
+        .map(|_| rng.random())
+        .collect::<Vec<Key>>();
+
+    let input_macs = input_keys
+        .iter()
+        .zip(key.iter().copied().chain(msg).into_iter_lsb0())
+        .map(|(key, bit)| key.auth(bit, &delta))
+        .collect::<Vec<_>>();
+
+    let mut gb = Garbler::new(seed, delta);
+    let setup = gb.setup().unwrap();
+    let mut ev = Evaluator::default();
+    ev.setup(setup).unwrap();
+
+    // Allocate 2 workers from garbler
+    let gb_worker1 = gb.alloc_worker(AES128.and_count()).unwrap();
+    let gb_worker2 = gb.alloc_worker(AES128.and_count()).unwrap();
+
+    // Garble circuit 1 with worker1
+    let mut gb_iter1 = gb_worker1.generate_batched(&AES128, &input_keys).unwrap();
+    let mut gates1 = Vec::new();
+    for batch in gb_iter1.by_ref() {
+        gates1.extend(batch.into_array());
+    }
+    let garbled_circuit1 = GarbledCircuit { gates: gates1 };
+    let GarblerOutput {
+        outputs: output_keys1,
+    } = gb_iter1.finish().unwrap();
+
+    // Garble circuit 2 with worker2
+    let mut gb_iter2 = gb_worker2.generate_batched(&AES128, &input_keys).unwrap();
+    let mut gates2 = Vec::new();
+    for batch in gb_iter2.by_ref() {
+        gates2.extend(batch.into_array());
+    }
+    let garbled_circuit2 = GarbledCircuit { gates: gates2 };
+    let GarblerOutput {
+        outputs: output_keys2,
+    } = gb_iter2.finish().unwrap();
+
+    // Allocate 2 workers from evaluator
+    let outputs = evaluate_garbled_circuits(
+        vec![
+            (AES128.clone(), input_macs.clone(), garbled_circuit1),
+            (AES128.clone(), input_macs.clone(), garbled_circuit2),
+        ],
+        vec![
+            ev.alloc_worker(AES128.and_count()).unwrap(),
+            ev.alloc_worker(AES128.and_count()).unwrap(),
+        ],
+    )
+    .unwrap();
+
+    for (output, output_keys) in outputs.into_iter().zip([output_keys1, output_keys2]) {
+        let EvaluatorOutput {
+            outputs: output_macs,
+        } = output;
+
+        assert!(
+            output_keys
+                .iter()
+                .zip(&output_macs)
+                .zip(expected.iter_lsb0())
+                .all(|((key, mac), bit)| &key.auth(bit, &delta) == mac)
+        );
+
+        let output: Vec<u8> = Vec::from_lsb0_iter(
+            output_macs
+                .into_iter()
+                .zip(&output_keys)
+                .map(|(mac, key)| mac.pointer() ^ key.pointer()),
+        );
+
+        assert_eq!(output, expected);
+    }
+}
+
+/// E2E test: Circuit with no AND gates (XOR-only)
+///
+/// Tests that free XOR works correctly when there are no AND gates to garble.
+#[test]
+fn test_garble_no_and() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let circ = xor(8);
+    assert_eq!(circ.and_count(), 0);
+
+    let a = 1u8;
+    let b = 2u8;
+    let expected = a ^ b;
+
+    let delta = Delta::random(&mut rng);
+    let seed: [u8; 16] = rng.random();
+    let input_keys = (0..circ.inputs().len())
+        .map(|_| rng.random())
+        .collect::<Vec<Key>>();
+
+    let input_macs = input_keys
+        .iter()
+        .zip(a.iter_lsb0().chain(b.iter_lsb0()))
+        .map(|(key, bit)| key.auth(bit, &delta))
+        .collect::<Vec<_>>();
+
+    let mut gb = Garbler::new(seed, delta);
+    let setup = gb.setup().unwrap();
+    let mut ev = Evaluator::default();
+    ev.setup(setup).unwrap();
+
+    let mut gb_iter = gb.generate_batched(&circ, &input_keys).unwrap();
+    let mut ev_consumer = ev.evaluate_batched(&circ, &input_macs).unwrap();
+
+    for batch in gb_iter.by_ref() {
+        ev_consumer.next(batch);
+    }
+
+    let GarblerOutput {
+        outputs: output_keys,
+    } = gb_iter.finish().unwrap();
+    let EvaluatorOutput {
+        outputs: output_macs,
+    } = ev_consumer.finish().unwrap();
+
+    assert!(
+        output_keys
+            .iter()
+            .zip(&output_macs)
+            .zip(expected.iter_lsb0())
+            .all(|((key, mac), bit)| &key.auth(bit, &delta) == mac)
+    );
+
+    let output: u8 = u8::from_lsb0_iter(
+        output_macs
+            .into_iter()
+            .zip(output_keys)
+            .map(|(mac, key)| mac.pointer() ^ key.pointer()),
+    );
+
+    assert_eq!(output, expected);
+}
diff --git a/crates/garble-core/src/lib.rs b/crates/garble-core/src/lib.rs
index 48463b3c..4cf366c4 100644
--- a/crates/garble-core/src/lib.rs
+++ b/crates/garble-core/src/lib.rs
@@ -1,23 +1,32 @@
 //! Core components used to implement garbled circuit protocols
 //!
-//! This crate implements "half-gate" garbled circuits from the [Two Halves Make a Whole \[ZRE15\]](https://eprint.iacr.org/2014/756) paper.
+//! This crate provides two garbling schemes:
+//!
+//! ## Half-Gates (default)
+//!
+//! The [`half_gates`] module implements "half-gate" garbled circuits from the
+//! [Two Halves Make a Whole \[ZRE15\]](https://eprint.iacr.org/2014/756) paper.
+//! AND gates require **2κ bits** (two ciphertexts).
+//!
+//! ## Three-Halves
+//!
+//! The [`three_halves`] module implements the "Three Halves Make a Whole"
+//! scheme from [Rosulek & Roy 2021](https://eprint.iacr.org/2021/749) which reduces AND gate size
+//! from 2κ bits to **1.5κ + 5 bits**.
 
 #![deny(missing_docs, unreachable_pub, unused_must_use)]
 #![deny(clippy::all)]
 
-pub(crate) mod circuit;
-mod evaluator;
-mod garbler;
+pub mod half_gates;
 pub mod store;
+pub mod three_halves;
 pub(crate) mod view;
 
-pub use circuit::{EncryptedGate, EncryptedGateBatch, GarbledCircuit};
-pub use evaluator::{
-    EncryptedGateBatchConsumer, EncryptedGateConsumer, Evaluator, EvaluatorError, EvaluatorOutput,
-    EvaluatorWorker, evaluate_garbled_circuits,
-};
-pub use garbler::{
-    EncryptedGateBatchIter, EncryptedGateIter, Garbler, GarblerError, GarblerOutput, GarblerWorker,
+pub use half_gates::{
+    EncryptedGate, EncryptedGateBatch, EncryptedGateBatchConsumer, EncryptedGateBatchIter,
+    EncryptedGateConsumer, EncryptedGateIter, Evaluator, EvaluatorError, EvaluatorOutput,
+    EvaluatorWorker, GarbledCircuit, Garbler, GarblerError, GarblerOutput, GarblerWorker,
+    evaluate_garbled_circuits,
 };
 pub use mpz_memory_core::correlated::{Delta, Key, Mac};
 use serde::{Deserialize, Serialize};
@@ -55,19 +64,16 @@ mod tests {
     };
     use itybity::{FromBitIterator, IntoBitIterator, ToBits};
     use mpz_circuits::{AES128, circuits::xor};
-    use mpz_core::{Block, aes::FIXED_KEY_AES};
     use rand::{Rng, SeedableRng, rngs::StdRng};
-    use rand_chacha::ChaCha12Rng;
-
-    use crate::evaluator::evaluate_garbled_circuits;
 
     use super::*;
 
     #[test]
     fn test_and_gate() {
-        use crate::{evaluator as ev, garbler as gb};
+        use crate::half_gates::{evaluator as ev, garbler as gb};
+        use mpz_core::{Block, aes::FIXED_KEY_AES};
 
-        let mut rng = ChaCha12Rng::seed_from_u64(0);
+        let mut rng = StdRng::seed_from_u64(0);
         let cipher = &(*FIXED_KEY_AES);
 
         let delta = Delta::random(&mut rng);
diff --git a/crates/garble-core/src/store/garbler.rs b/crates/garble-core/src/store/garbler.rs
index 6c35b1f7..5bebd39b 100644
--- a/crates/garble-core/src/store/garbler.rs
+++ b/crates/garble-core/src/store/garbler.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use rand::Rng;
+use rand::{Rng, RngCore};
 use tokio::sync::{Mutex, OwnedMutexGuard};
 
 use mpz_core::{Block, bitvec::BitVec, prg::Prg};
@@ -112,6 +112,13 @@ impl<COT> GarblerStore<COT> {
         self.view.set_output(slice.to_range()).map_err(Error::from)
     }
 
+    /// Generates a random seed from the PRG.
+    pub fn random_seed(&mut self) -> [u8; 32] {
+        let mut seed = [0u8; 32];
+        self.prg.fill_bytes(&mut seed);
+        seed
+    }
+
     /// Returns `true` if the store wants to flush.
     pub fn wants_flush(&self) -> bool {
         self.view.wants_flush()
diff --git a/crates/garble-core/src/three_halves/circuit.rs b/crates/garble-core/src/three_halves/circuit.rs
new file mode 100644
index 00000000..d4294945
--- /dev/null
+++ b/crates/garble-core/src/three_halves/circuit.rs
@@ -0,0 +1,55 @@
+//! Circuit types for Three-Halves garbling scheme
+
+use serde::{Deserialize, Serialize};
+
+use crate::DEFAULT_BATCH_SIZE;
+
+use super::{ControlBits, ThreeHalvesGate};
+
+/// A garbled circuit using the Three-Halves scheme.
+#[derive(Debug, Clone)]
+pub struct GarbledCircuit {
+    /// Encrypted gates.
+    pub gates: Vec<EncryptedGate>,
+}
+
+/// Encrypted gate for Three Halves scheme.
+///
+/// Contains both the gate ciphertexts (1.5κ bits) and control bits (1 byte)
+/// needed for evaluation.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+pub struct EncryptedGate {
+    /// Gate ciphertexts (1.5κ bits = 24 bytes)
+    pub gate: ThreeHalvesGate,
+    /// Control bits for evaluator (1 byte)
+    pub control_bits: ControlBits,
+}
+
+impl EncryptedGate {
+    /// Create a new encrypted gate.
+    pub fn new(gate: ThreeHalvesGate, control_bits: ControlBits) -> Self {
+        Self { gate, control_bits }
+    }
+}
+
+/// A batch of encrypted gates.
+///
+/// # Parameters
+///
+/// - `N`: The size of a batch.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct EncryptedGateBatch<const N: usize = DEFAULT_BATCH_SIZE>(
+    #[serde(with = "serde_arrays")] [EncryptedGate; N],
+);
+
+impl<const N: usize> EncryptedGateBatch<N> {
+    /// Creates a new batch of encrypted gates.
+    pub fn new(batch: [EncryptedGate; N]) -> Self {
+        Self(batch)
+    }
+
+    /// Returns the inner array.
+    pub fn into_array(self) -> [EncryptedGate; N] {
+        self.0
+    }
+}
diff --git a/crates/garble-core/src/three_halves/control.rs b/crates/garble-core/src/three_halves/control.rs
new file mode 100644
index 00000000..061fe73a
--- /dev/null
+++ b/crates/garble-core/src/three_halves/control.rs
@@ -0,0 +1,670 @@
+//! # Control Matrix System for Three-Halves Garbling
+//!
+//! This module implements the "dicing" technique from the paper.
+//! The control matrix R determines which linear combinations of input label
+//! pieces the evaluator uses to compute output label halves.
+//!
+//! ## Overview
+//!
+//! The control matrix R is an 8×6 matrix that specifies, for each of the 8
+//! evaluation equations (4 input combinations × 2 halves), which pieces of
+//! the input labels [A₀; B₀; Δ] to include.
+//!
+//! ### The Problem (Paper Section 4.3)
+//!
+//! The matrix R must satisfy the constraint:
+//! ```text
+//! KR = K[0 0 t]
+//! ```
+//! where t is the truth table. But this constraint depends on t, which must
+//! be hidden from the evaluator!
+//!
+//! ### The Solution: Randomization
+//!
+//! R is sampled from a distribution R(t) such that:
+//! 1. KR = K[0 0 t] always holds (correctness)
+//! 2. Each marginal view R_ij is uniform and independent of t (security)
+//!
+//! ### Marginal Views
+//!
+//! When the evaluator has input (A_i, B_j), they only see/need the 2×4
+//! submatrix:
+//! R_ij = [R_ijA  R_ijB]  (rows 2(2i+j), 2(2i+j)+1; columns for A, B parts)
+//!
+//! The full R is never revealed - only one marginal view per evaluation.
+//!
+//! ## Compression with Basis {S₁, S₂}
+//!
+//! Instead of encrypting 8-bit marginal views, we express them in a 2D basis:
+//!
+//! R_ij = c₁·S₁ ⊕ c₂·S₂
+//!
+//! This reduces overhead to 2 bits per marginal view × 4 views = 8 bits,
+//! but we encode it as 5 bits total (see paper Section 5.2).
+
+// ============================================================================
+// Basis Matrices for Marginal View Compression
+// ============================================================================
+
+/// Basis matrix S₁ for expressing marginal views
+///
+/// From Paper Figure 3, Page 14:
+/// ```text
+/// S₁ = [ 1 1 | 1 0 ]
+///      [ 1 0 | 0 1 ]
+/// ```
+///
+/// **Interpretation**: A 2×4 matrix where:
+/// - Row 0 is coefficients for the left half computation
+/// - Row 1 is coefficients for the right half computation
+/// - Columns are [A_L, A_R, B_L, B_R] (the four input label halves)
+///
+/// The vertical bar separates the A-part from the B-part.
+pub const S1: [[u8; 4]; 2] = [
+    //  A_L  A_R  B_L  B_R
+    [1, 1, 1, 0], // Left half computation
+    [1, 0, 0, 1], // Right half computation
+];
+
+/// Basis matrix S₂ for expressing marginal views
+///
+/// From Paper Figure 3, Page 14:
+/// ```text
+/// S₂ = [ 1 0 | 0 1 ]
+///      [ 0 1 | 1 1 ]
+/// ```
+pub const S2: [[u8; 4]; 2] = [
+    //  A_L  A_R  B_L  B_R
+    [1, 0, 0, 1], // Left half computation
+    [0, 1, 1, 1], // Right half computation
+];
+
+// ============================================================================
+// Fixed Control Matrices for ODD-Parity Gates (AND, OR, NAND, NOR)
+// ============================================================================
+
+/// Matrix R_p: Added for odd-parity gates
+///
+/// From Paper Figure 4, Page 14:
+/// ```text
+/// R_p = [ 0 0 | 1 0 | 0 0 ]
+///       [ 0 1 | 0 0 | 0 0 ]
+///       [ 0 0 | 1 0 | 1 0 ]
+///       [ 0 0 | 0 0 | 0 0 ]
+///       [ 0 0 | 0 0 | 0 0 ]
+///       [ 0 1 | 0 0 | 0 1 ]
+///       [ 0 0 | 0 0 | 0 0 ]
+///       [ 0 0 | 0 0 | 0 0 ]
+/// ```
+///
+/// **Purpose**: When garbling an odd-parity gate (like AND), we add R_p to
+/// the sampled R. The evaluator knows to add the corresponding (R_p)_ij to
+/// their marginal view since parity is public in ODD mode.
+///
+/// **Constraint** (Paper Equation 7):
+/// ```text
+/// K·R_p = [ 0 0 0 0 | 1 0 ]
+///         [ 0 0 0 0 | 0 1 ]
+///         [ 0 0 0 0 | 0 0 ]
+/// ```
+/// This contributes the "p" (parity) term to the K·R constraint.
+///
+/// **Column layout**: [A₀_L, A₀_R, B₀_L, B₀_R, Δ_L, Δ_R]
+pub const R_P: [[u8; 6]; 8] = [
+    // (0,0) left
+    [0, 0, 1, 0, 0, 0],
+    // (0,0) right
+    [0, 1, 0, 0, 0, 0],
+    // (0,1) left
+    [0, 0, 1, 0, 1, 0],
+    // (0,1) right
+    [0, 0, 0, 0, 0, 0],
+    // (1,0) left
+    [0, 0, 0, 0, 0, 0],
+    // (1,0) right
+    [0, 1, 0, 0, 0, 1],
+    // (1,1) left
+    [0, 0, 0, 0, 0, 0],
+    // (1,1) right
+    [0, 0, 0, 0, 0, 0],
+];
+
+// ============================================================================
+// Compressed Representation (R̄)
+// ============================================================================
+
+/// Compressed representation of R_a in terms of basis {S₁, S₂}
+///
+/// From Paper Figure 3, Page 14:
+/// ```text
+/// R̄_a = [ 0 0 ]   <- (0,0): 0·S₁ ⊕ 0·S₂
+///       [ 1 1 ]   <- (0,1): 1·S₁ ⊕ 1·S₂
+///       [ 0 1 ]   <- (1,0): 0·S₁ ⊕ 1·S₂
+///       [ 1 0 ]   <- (1,1): 1·S₁ ⊕ 0·S₂
+/// ```
+///
+/// Each row gives the coefficients [c₁, c₂] such that R_ij = c₁·S₁ ⊕ c₂·S₂
+pub const R_BAR_A: [[u8; 2]; 4] = [
+    [0, 0], // (0,0)
+    [1, 1], // (0,1)
+    [0, 1], // (1,0)
+    [1, 0], // (1,1)
+];
+
+/// Compressed representation of R_b in terms of basis {S₁, S₂}
+///
+/// From Paper Figure 3, Page 14:
+/// ```text
+/// R̄_b = [ 0 0 ]
+///       [ 1 0 ]
+///       [ 1 1 ]
+///       [ 0 1 ]
+/// ```
+pub const R_BAR_B: [[u8; 2]; 4] = [
+    [0, 0], // (0,0)
+    [1, 0], // (0,1)
+    [1, 1], // (1,0)
+    [0, 1], // (1,1)
+];
+
+/// Compressed representation of R$ basis vectors
+///
+/// From Paper Figure 3, Page 14:
+/// ```text
+/// R̄$ ← span { [ 1 0 ]   [ 0 1 ] }
+///             [ 1 0 ] , [ 0 1 ]
+///             [ 1 0 ]   [ 0 1 ]
+///             [ 1 0 ]   [ 0 1 ]
+/// ```
+///
+/// **Key insight**: Both basis vectors have the same value in every row!
+/// This means sampling random R$ in compressed form just picks a random
+/// pair (c₁, c₂) and uses it for ALL four marginal views.
+///
+/// This is what makes each marginal view individually uniform while
+/// maintaining the correlation needed for KR$ = 0.
+pub const R_BAR_DOLLAR_BASIS_0: [[u8; 2]; 4] = [
+    [1, 0], // (0,0)
+    [1, 0], // (0,1)
+    [1, 0], // (1,0)
+    [1, 0], // (1,1)
+];
+
+/// Second basis vector for compressed R$ randomization
+///
+/// From Paper Figure 3, Page 14.
+/// Same pattern as R_BAR_DOLLAR_BASIS_0 but with [0,1] instead of [1,0].
+pub const R_BAR_DOLLAR_BASIS_1: [[u8; 2]; 4] = [
+    [0, 1], // (0,0)
+    [0, 1], // (0,1)
+    [0, 1], // (1,0)
+    [0, 1], // (1,1)
+];
+
+// ============================================================================
+// Control Matrix Operations
+// ============================================================================
+
+/// Precomputed lookup table for compressed control matrices R_bar.
+///
+/// The index packs four bits, most significant first: `pi_a`, `pi_b`, `r0`,
+/// `r1`, i.e. `pi_a << 3 | pi_b << 2 | r0 << 1 | r1` (see `sample_r_odd`).
+///
+/// Each entry holds one `[c₁, c₂]` pair per marginal view (0,0)..(1,1), with:
+/// - `a = !pi_a` (true position bit for input A in AND gate)
+/// - `b = !pi_b` (true position bit for input B in AND gate)
+/// - `p = true` (AND gate has odd parity, exactly one true output)
+/// - `R_bar = r0·R̄$_BASIS_0 ⊕ r1·R̄$_BASIS_1 ⊕ a·R̄_A ⊕ b·R̄_B`
+///
+/// Note: R_P is NOT included in R_bar because it's not in the span of {S₁, S₂}.
+/// The evaluator adds R_P separately since parity is public.
+const SAMPLE_R_BAR_TABLE: [[[u8; 2]; 4]; 16] = [
+    [[0, 0], [0, 1], [1, 0], [1, 1]], // Index 0:  pi_a=0 pi_b=0 r0=0 r1=0
+    [[0, 1], [0, 0], [1, 1], [1, 0]], // Index 1:  pi_a=0 pi_b=0 r0=0 r1=1
+    [[1, 0], [1, 1], [0, 0], [0, 1]], // Index 2:  pi_a=0 pi_b=0 r0=1 r1=0
+    [[1, 1], [1, 0], [0, 1], [0, 0]], // Index 3:  pi_a=0 pi_b=0 r0=1 r1=1
+    [[0, 0], [1, 1], [0, 1], [1, 0]], // Index 4:  pi_a=0 pi_b=1 r0=0 r1=0
+    [[0, 1], [1, 0], [0, 0], [1, 1]], // Index 5:  pi_a=0 pi_b=1 r0=0 r1=1
+    [[1, 0], [0, 1], [1, 1], [0, 0]], // Index 6:  pi_a=0 pi_b=1 r0=1 r1=0
+    [[1, 1], [0, 0], [1, 0], [0, 1]], // Index 7:  pi_a=0 pi_b=1 r0=1 r1=1
+    [[0, 0], [1, 0], [1, 1], [0, 1]], // Index 8:  pi_a=1 pi_b=0 r0=0 r1=0
+    [[0, 1], [1, 1], [1, 0], [0, 0]], // Index 9:  pi_a=1 pi_b=0 r0=0 r1=1
+    [[1, 0], [0, 0], [0, 1], [1, 1]], // Index 10: pi_a=1 pi_b=0 r0=1 r1=0
+    [[1, 1], [0, 1], [0, 0], [1, 0]], // Index 11: pi_a=1 pi_b=0 r0=1 r1=1
+    [[0, 0], [0, 0], [0, 0], [0, 0]], // Index 12: pi_a=1 pi_b=1 r0=0 r1=0
+    [[0, 1], [0, 1], [0, 1], [0, 1]], // Index 13: pi_a=1 pi_b=1 r0=0 r1=1
+    [[1, 0], [1, 0], [1, 0], [1, 0]], // Index 14: pi_a=1 pi_b=1 r0=1 r1=0
+    [[1, 1], [1, 1], [1, 1], [1, 1]], // Index 15: pi_a=1 pi_b=1 r0=1 r1=1
+];
+
+/// Sample a control matrix R for an AND gate (ODD mode).
+///
+/// All 16 combinations of the four input bits are precomputed, so sampling
+/// is a single lookup in `SAMPLE_R_BAR_TABLE`.
+///
+/// Paper Section 5.1, Algorithm:
+/// ```text
+/// R = p·R_p ⊕ a·R_a ⊕ b·R_b ⊕ R$
+/// ```
+///
+/// where:
+/// - `a = !pi_a` (position bit derived from permute bit)
+/// - `b = !pi_b` (position bit derived from permute bit)
+/// - `p = true` (AND gates have odd parity)
+/// - `R$` is sampled uniformly from span{R$_BASIS_0, R$_BASIS_1}
+///
+/// # Arguments
+/// * `pi_a` - Permute bit for input A
+/// * `pi_b` - Permute bit for input B
+/// * `rand_bits` - Two random bits [r₀, r₁] for sampling R$
+///
+/// # Returns
+/// * `r_bar` - The 4×2 compressed representation for encryption (u8 values 0 or
+///   1)
+#[inline]
+pub fn sample_r_odd(pi_a: bool, pi_b: bool, rand_bits: [bool; 2]) -> [[u8; 2]; 4] {
+    // Fold the bits MSB-first (pi_a, pi_b, r₀, r₁) into the 4-bit table index.
+    let [r0, r1] = rand_bits;
+    let slot = [pi_a, pi_b, r0, r1]
+        .iter()
+        .fold(0usize, |acc, &bit| (acc << 1) | usize::from(bit));
+    SAMPLE_R_BAR_TABLE[slot]
+}
+
+/// Precomputed lookup table for all possible marginal view expansions.
+///
+/// Since [c₁, c₂] are binary (0 or 1), there are only 4 possible combinations.
+/// Entry `(c₁ << 1) | c₂` stores R_ij = c₁·S₁ ⊕ c₂·S₂ as a 2×4 matrix:
+///
+/// - Index 0 ([0,0]): 0·S₁ ⊕ 0·S₂ = zero matrix
+/// - Index 1 ([0,1]): 0·S₁ ⊕ 1·S₂ = S₂
+/// - Index 2 ([1,0]): 1·S₁ ⊕ 0·S₂ = S₁
+/// - Index 3 ([1,1]): 1·S₁ ⊕ 1·S₂ = S₁ ⊕ S₂
+///
+/// `expand_marginal` reads this table instead of forming the linear
+/// combination at runtime.
+const EXPANDED_MARGINALS: [[[u8; 4]; 2]; 4] = [
+    // [c₁=0, c₂=0]: zero matrix
+    [[0, 0, 0, 0], [0, 0, 0, 0]],
+    // [c₁=0, c₂=1]: S₂
+    [[1, 0, 0, 1], [0, 1, 1, 1]],
+    // [c₁=1, c₂=0]: S₁
+    [[1, 1, 1, 0], [1, 0, 0, 1]],
+    // [c₁=1, c₂=1]: S₁ ⊕ S₂
+    [[0, 1, 1, 1], [1, 1, 1, 0]],
+];
+
+/// Expand compressed marginal view R̄_ij to full R_ij
+///
+/// Given coefficients [c₁, c₂], returns R_ij = c₁·S₁ ⊕ c₂·S₂ by looking up
+/// entry `(c₁ << 1) | c₂` of `EXPANDED_MARGINALS`.
+///
+/// # Arguments
+/// * `r_bar_ij` - The 2-element compressed representation [c₁, c₂]; each
+///   coefficient must be 0 or 1
+///
+/// # Returns
+/// The 2×4 marginal view matrix
+///
+/// # Panics
+/// In debug builds, panics if a coefficient is not 0 or 1. In release builds
+/// such input is not checked: it either selects an unrelated table entry or
+/// panics on the out-of-bounds table access.
+pub fn expand_marginal(r_bar_ij: &[u8; 2]) -> [[u8; 4]; 2] {
+    let [c1, c2] = *r_bar_ij;
+    // A non-binary coefficient would alias another entry (e.g. [0, 2] selects
+    // S₁) or run past the table, so make the 0/1 contract explicit.
+    debug_assert!(c1 <= 1 && c2 <= 1, "coefficients must be 0 or 1");
+    EXPANDED_MARGINALS[usize::from((c1 << 1) | c2)]
+}
+
+/// Precomputed R_P marginals for all 4 input combinations
+///
+/// Since (i, j) are color bits (0 or 1), there are only 4 combinations.
+/// Entry `(i << 1) | j` holds the 2×4 marginal view taken from R_P:
+///
+/// - Index 0 (i=0, j=0): Extract rows 0,1 columns 0-3 from R_P
+/// - Index 1 (i=0, j=1): Extract rows 2,3 columns 0-3 from R_P
+/// - Index 2 (i=1, j=0): Extract rows 4,5 columns 0-3 from R_P
+/// - Index 3 (i=1, j=1): Extract rows 6,7 columns 0-3 from R_P
+const R_P_MARGINALS: [[[u8; 4]; 2]; 4] = [
+    // (i=0, j=0): rows 0,1, columns [A₀_L, A₀_R, B₀_L, B₀_R]
+    [[0, 0, 1, 0], [0, 1, 0, 0]],
+    // (i=0, j=1): rows 2,3
+    [[0, 0, 1, 0], [0, 0, 0, 0]],
+    // (i=1, j=0): rows 4,5
+    [[0, 0, 0, 0], [0, 1, 0, 0]],
+    // (i=1, j=1): rows 6,7
+    [[0, 0, 0, 0], [0, 0, 0, 0]],
+];
+
+/// Extract R_P's marginal view for input position (i,j)
+///
+/// In ODD mode, the evaluator knows parity is odd and must add R_P's
+/// contribution to their marginal view. This function returns entry
+/// `(i << 1) | j` of the precomputed `R_P_MARGINALS` table.
+///
+/// # Arguments
+/// * `i` - First input's color bit (0 or 1)
+/// * `j` - Second input's color bit (0 or 1)
+///
+/// # Returns
+/// The 2×4 marginal view from R_P for position (i,j)
+///
+/// # Panics
+/// In debug builds, panics if `i` or `j` is not 0 or 1. In release builds
+/// such input is not checked: it either selects an unrelated table entry or
+/// panics on the out-of-bounds table access.
+pub fn extract_r_p_marginal(i: usize, j: usize) -> [[u8; 4]; 2] {
+    // Without this check a stray value would OR into the wrong slot, e.g.
+    // (i, j) = (0, 2) would read the entry that belongs to (1, 0).
+    debug_assert!(i <= 1 && j <= 1, "color bits must be 0 or 1");
+    R_P_MARGINALS[(i << 1) | j]
+}
+
+#[cfg(test)]
+pub(in crate::three_halves) mod tests {
+    use super::{super::matrices::K, *};
+
+    /// Checks that `expand_marginal` selects entry `(c₁ << 1) | c₂` of
+    /// `EXPANDED_MARGINALS` for every binary coefficient pair.
+    #[test]
+    fn test_expand_marginal_indexing() {
+        // Ordered so that a pair's position equals (c₁ << 1) | c₂.
+        let pairs: [[u8; 2]; 4] = [[0, 0], [0, 1], [1, 0], [1, 1]];
+        for (slot, pair) in pairs.iter().enumerate() {
+            assert_eq!(expand_marginal(pair), EXPANDED_MARGINALS[slot]);
+        }
+    }
+
+    /// Verification tests for precomputed constants and lookup tables.
+    ///
+    /// These verify that hardcoded constants/LUTs match their specifications:
+    /// - Source matrices (R_P, R_A, R_B, R_DOLLAR_BASIS_*) satisfy K×R
+    ///   constraints
+    /// - Derived LUTs (SAMPLE_R_BAR_TABLE, EXPANDED_MARGINALS, R_P_MARGINALS)
+    ///   are correctly constructed
+    ///
+    /// These tests only need to run once to verify correctness after changes.
+    ///
+    /// Run with: `cargo test control::tests::table_verification -- --ignored`
+    pub(in crate::three_halves) mod table_verification {
+        #![allow(unused_imports)]
+        use super::*;
+
+        /// Matrix R_a: encodes the 'a' bit of the truth table position.
+        ///
+        /// 8×6 layout: two rows (left half, right half) for each input
+        /// combination (i,j), in the order (0,0), (0,1), (1,0), (1,1).
+        ///
+        /// From Paper Figure 3, Page 14. See control module docs for details.
+        pub(in crate::three_halves) const R_A: [[u8; 6]; 8] = [
+            [0, 0, 0, 0, 0, 0], // (0,0) left
+            [0, 0, 0, 0, 0, 0], // (0,0) right
+            [0, 1, 1, 1, 1, 1], // (0,1) left
+            [1, 1, 1, 0, 1, 0], // (0,1) right
+            [1, 0, 0, 1, 1, 0], // (1,0) left
+            [0, 1, 1, 1, 0, 1], // (1,0) right
+            [1, 1, 1, 0, 0, 1], // (1,1) left
+            [1, 0, 0, 1, 1, 1], // (1,1) right
+        ];
+
+        /// Matrix R_b: encodes the 'b' bit of the truth table position.
+        ///
+        /// 8×6 layout: two rows (left half, right half) for each input
+        /// combination (i,j), in the order (0,0), (0,1), (1,0), (1,1).
+        ///
+        /// From Paper Figure 3, Page 14. See control module docs for details.
+        pub(in crate::three_halves) const R_B: [[u8; 6]; 8] = [
+            [0, 0, 0, 0, 0, 0], // (0,0) left
+            [0, 0, 0, 0, 0, 0], // (0,0) right
+            [1, 1, 1, 0, 1, 0], // (0,1) left
+            [1, 0, 0, 1, 0, 1], // (0,1) right
+            [0, 1, 1, 1, 0, 1], // (1,0) left
+            [1, 1, 1, 0, 1, 1], // (1,0) right
+            [1, 0, 0, 1, 1, 1], // (1,1) left
+            [0, 1, 1, 1, 1, 0], // (1,1) right
+        ];
+
+        /// First basis matrix for R$ randomization
+        ///
+        /// Columns 0-3 of every (i,j) row pair equal the S₁ entry of
+        /// `EXPANDED_MARGINALS`, which corresponds to the `[1, 0]` pair that
+        /// `R_BAR_DOLLAR_BASIS_0` stores for each marginal view.
+        ///
+        /// From Paper Figure 3, Page 14. See control module docs for details.
+        pub(in crate::three_halves) const R_DOLLAR_BASIS_0: [[u8; 6]; 8] = [
+            [1, 1, 1, 0, 0, 0], // (0,0) left
+            [1, 0, 0, 1, 0, 0], // (0,0) right
+            [1, 1, 1, 0, 1, 0], // (0,1) left
+            [1, 0, 0, 1, 0, 1], // (0,1) right
+            [1, 1, 1, 0, 1, 1], // (1,0) left
+            [1, 0, 0, 1, 1, 0], // (1,0) right
+            [1, 1, 1, 0, 0, 1], // (1,1) left
+            [1, 0, 0, 1, 1, 1], // (1,1) right
+        ];
+
+        /// Second basis matrix for R$ randomization
+        ///
+        /// Columns 0-3 of every (i,j) row pair equal the S₂ entry of
+        /// `EXPANDED_MARGINALS`, which corresponds to the `[0, 1]` pair that
+        /// `R_BAR_DOLLAR_BASIS_1` stores for each marginal view.
+        ///
+        /// From Paper Figure 3, Page 14. See control module docs for details.
+        pub(in crate::three_halves) const R_DOLLAR_BASIS_1: [[u8; 6]; 8] = [
+            [1, 0, 0, 1, 0, 0], // (0,0) left
+            [0, 1, 1, 1, 0, 0], // (0,0) right
+            [1, 0, 0, 1, 0, 1], // (0,1) left
+            [0, 1, 1, 1, 1, 1], // (0,1) right
+            [1, 0, 0, 1, 1, 0], // (1,0) left
+            [0, 1, 1, 1, 0, 1], // (1,0) right
+            [1, 0, 0, 1, 1, 1], // (1,1) left
+            [0, 1, 1, 1, 1, 0], // (1,1) right
+        ];
+
+        // ========================================================================
+        // Helper Functions (used only for verification)
+        // ========================================================================
+
+        /// GF(2) matrix product A × B.
+        ///
+        /// Entries are combined with bitwise AND (multiplication) and XOR
+        /// (addition), which is arithmetic mod 2 for 0/1 inputs.
+        fn matmul_gf2<const RA: usize, const CA: usize, const CB: usize>(
+            a: &[[u8; CA]; RA],
+            b: &[[u8; CB]; CA],
+        ) -> [[u8; CB]; RA] {
+            let mut product = [[0u8; CB]; RA];
+
+            for (out_row, a_row) in product.iter_mut().zip(a.iter()) {
+                for (col, cell) in out_row.iter_mut().enumerate() {
+                    // Dot product of `a_row` with column `col` of `b`.
+                    *cell = a_row
+                        .iter()
+                        .zip(b.iter())
+                        .fold(0u8, |acc, (&lhs, b_row)| acc ^ (lhs & b_row[col]));
+                }
+            }
+
+            product
+        }
+
+        /// Returns `true` when every entry of `m` is zero.
+        ///
+        /// An empty matrix (`R == 0` or `C == 0`) is vacuously zero.
+        fn is_zero_matrix<const R: usize, const C: usize>(m: &[[u8; C]; R]) -> bool {
+            // Flatten the rows so a single `all` stops at the first non-zero
+            // entry.
+            m.iter().flatten().all(|&entry| entry == 0)
+        }
+
+        /// Returns `true` when K × R$ = 0 for both R$ basis matrices.
+        fn verify_k_r_dollar_is_zero_impl() -> bool {
+            [&R_DOLLAR_BASIS_0, &R_DOLLAR_BASIS_1]
+                .iter()
+                .all(|&basis| is_zero_matrix(&matmul_gf2(&K, basis)))
+        }
+
+        /// Returns `true` when K × R_a matches Paper Equation 7: all zeros
+        /// except a single 1 at row 2, column 4.
+        fn verify_k_r_a_impl() -> bool {
+            let mut target = [[0u8; 6]; 3];
+            target[2][4] = 1;
+            matmul_gf2(&K, &R_A) == target
+        }
+
+        /// Returns `true` when K × R_b matches Paper Equation 7: all zeros
+        /// except a single 1 at row 2, column 5.
+        fn verify_k_r_b_impl() -> bool {
+            let mut target = [[0u8; 6]; 3];
+            target[2][5] = 1;
+            matmul_gf2(&K, &R_B) == target
+        }
+
+        /// Returns `true` when K × R_p matches Paper Equation 7: all zeros
+        /// except 1s at (row 0, column 4) and (row 1, column 5).
+        fn verify_k_r_p_impl() -> bool {
+            let mut target = [[0u8; 6]; 3];
+            target[0][4] = 1;
+            target[1][5] = 1;
+            matmul_gf2(&K, &R_P) == target
+        }
+
+        /// Cut the marginal view R_ij out of a full control matrix R.
+        ///
+        /// Paper Section 5.1: "When the evaluator holds input labels A_i, B_j,
+        /// the submatrix R_ij = [R_ijA R_ijB] is enough to completely determine
+        /// which linear combination should be applied."
+        ///
+        /// # Arguments
+        /// * `r` - The full 8×6 control matrix
+        /// * `i` - First input's color bit (0 or 1)
+        /// * `j` - Second input's color bit (0 or 1)
+        ///
+        /// # Returns
+        /// The 2×4 marginal view [R_ijA R_ijB] where:
+        /// - Columns 0-1 are the A-part (coefficients for A_L, A_R)
+        /// - Columns 2-3 are the B-part (coefficients for B_L, B_R)
+        fn extract_marginal(r: &[[u8; 6]; 8], i: usize, j: usize) -> [[u8; 4]; 2] {
+            // Combination (i, j) owns rows `first` and `first + 1` of R
+            // (`first` is 0, 2, 4, or 6).
+            let first = 2 * (2 * i + j);
+
+            // Keep columns 0-3 of a full row and drop the last two.
+            let head = |row: &[u8; 6]| [row[0], row[1], row[2], row[3]];
+
+            [head(&r[first]), head(&r[first + 1])]
+        }
+
+        // ========================================================================
+        // Verification Tests
+        // ========================================================================
+
+        /// Both R$ basis matrices must be annihilated by K.
+        ///
+        /// Paper Figure 3: The R$ distribution must satisfy KR$ = 0
+        #[test]
+        #[ignore]
+        fn verify_k_r_dollar_is_zero() {
+            let annihilated = verify_k_r_dollar_is_zero_impl();
+            assert!(annihilated, "K × R$_BASIS should be zero");
+        }
+
+        /// K × R_p must reproduce the matrix required by Equation 7.
+        #[test]
+        #[ignore]
+        fn verify_k_r_p_constraint() {
+            let holds = verify_k_r_p_impl();
+            assert!(holds, "K × R_p doesn't match Equation 7");
+        }
+
+        /// K × R_a must reproduce the matrix required by Equation 7.
+        #[test]
+        #[ignore]
+        fn verify_k_r_a_constraint() {
+            let holds = verify_k_r_a_impl();
+            assert!(holds, "K × R_a doesn't match Equation 7");
+        }
+
+        /// K × R_b must reproduce the matrix required by Equation 7.
+        #[test]
+        #[ignore]
+        fn verify_k_r_b_constraint() {
+            let holds = verify_k_r_b_impl();
+            assert!(holds, "K × R_b doesn't match Equation 7");
+        }
+
+        /// Rebuilds every SAMPLE_R_BAR_TABLE entry from the compressed basis
+        /// matrices and compares it with the hard-coded value.
+        #[test]
+        #[ignore]
+        fn verify_sample_r_bar_table() {
+            for index in 0..16usize {
+                // Unpack the index bits, MSB first: pi_a, pi_b, r0, r1.
+                let bit = |shift: usize| (index >> shift) & 1 == 1;
+                let (pi_a, pi_b, r0, r1) = (bit(3), bit(2), bit(1), bit(0));
+
+                // Position bits are the complements of the permute bits.
+                let (a, b) = (!pi_a, !pi_b);
+
+                // r_bar = r0·R̄$_BASIS_0 ⊕ r1·R̄$_BASIS_1 ⊕ a·R̄_A ⊕ b·R̄_B
+                let mut expected = [[0u8; 2]; 4];
+                for (ij, pair) in expected.iter_mut().enumerate() {
+                    for (k, cell) in pair.iter_mut().enumerate() {
+                        *cell = (r0 as u8 * R_BAR_DOLLAR_BASIS_0[ij][k])
+                            ^ (r1 as u8 * R_BAR_DOLLAR_BASIS_1[ij][k])
+                            ^ (a as u8 * R_BAR_A[ij][k])
+                            ^ (b as u8 * R_BAR_B[ij][k]);
+                    }
+                }
+
+                assert_eq!(
+                    super::super::SAMPLE_R_BAR_TABLE[index],
+                    expected,
+                    "SAMPLE_R_BAR_TABLE[{}] incorrect for pi_a={}, pi_b={}, r0={}, r1={}",
+                    index,
+                    pi_a,
+                    pi_b,
+                    r0,
+                    r1
+                );
+            }
+        }
+
+        /// Rebuilds every EXPANDED_MARGINALS entry as c₁·S₁ ⊕ c₂·S₂ and
+        /// compares it with the hard-coded value.
+        #[test]
+        #[ignore]
+        fn verify_expanded_marginals() {
+            for index in 0u8..4 {
+                // The table index is (c₁ << 1) | c₂.
+                let (c1, c2) = (index >> 1, index & 1);
+
+                let mut expected = [[0u8; 4]; 2];
+                for (row, cells) in expected.iter_mut().enumerate() {
+                    for (col, cell) in cells.iter_mut().enumerate() {
+                        *cell = (c1 * S1[row][col]) ^ (c2 * S2[row][col]);
+                    }
+                }
+
+                assert_eq!(
+                    super::super::EXPANDED_MARGINALS[index as usize],
+                    expected,
+                    "EXPANDED_MARGINALS[{}] incorrect for c1={}, c2={}",
+                    index,
+                    c1,
+                    c2
+                );
+            }
+        }
+
+        /// Compares every R_P_MARGINALS entry with the marginal cut directly
+        /// out of the full R_P matrix.
+        #[test]
+        #[ignore]
+        fn verify_r_p_marginals() {
+            for index in 0..4usize {
+                // The table index is (i << 1) | j.
+                let (i, j) = (index >> 1, index & 1);
+                let expected = extract_marginal(&R_P, i, j);
+
+                assert_eq!(
+                    super::super::R_P_MARGINALS[index],
+                    expected,
+                    "R_P_MARGINALS[{}] incorrect for i={}, j={}",
+                    index,
+                    i,
+                    j
+                );
+            }
+        }
+    }
+}
diff --git a/crates/garble-core/src/three_halves/evaluator.rs b/crates/garble-core/src/three_halves/evaluator.rs
new file mode 100644
index 00000000..709aca54
--- /dev/null
+++ b/crates/garble-core/src/three_halves/evaluator.rs
@@ -0,0 +1,638 @@
+//! Evaluator for Three Halves Scheme
+//!
+//! This module implements the evaluation function for circuits using the
+//! Three Halves technique from Rosulek & Roy 2021.
+
+use core::fmt;
+use std::{ops::Range, sync::Arc};
+
+use mpz_circuits::{Circuit, Gate};
+use mpz_core::{
+    Block,
+    aes::{FIXED_KEY_AES, FixedKeyAes},
+};
+use mpz_memory_core::correlated::Mac;
+
+use super::{
+    ControlBits, EncryptedGate, EncryptedGateBatch, ThreeHalvesGate,
+    control::{expand_marginal, extract_r_p_marginal},
+    garbler::xor_assign_8,
+    slicing::SlicedLabel,
+};
+
+use crate::DEFAULT_BATCH_SIZE;
+
+/// Per-row selection of gate ciphertexts, one bitmask per row of matrix V.
+///
+/// Taken from columns 2, 3, 4 of V (the G₀, G₁, G₂ columns). Within a mask:
+/// - bit 0 (LSB): include G₀ if set
+/// - bit 1:       include G₁ if set
+/// - bit 2:       include G₂ if set
+///
+/// Row-by-row breakdown from V matrix ([G₀,G₁,G₂] → mask):
+/// - Row 0 (0,0)L: [0,0,0] → 0b000 (no gate components)
+/// - Row 1 (0,0)R: [0,0,0] → 0b000 (no gate components)
+/// - Row 2 (0,1)L: [0,0,1] → 0b100 (only G₂)
+/// - Row 3 (0,1)R: [0,1,1] → 0b110 (G₁ ⊕ G₂)
+/// - Row 4 (1,0)L: [1,0,1] → 0b101 (G₀ ⊕ G₂)
+/// - Row 5 (1,0)R: [0,0,1] → 0b100 (only G₂)
+/// - Row 6 (1,1)L: [1,0,0] → 0b001 (only G₀)
+/// - Row 7 (1,1)R: [0,1,0] → 0b010 (only G₁)
+const GATE_CONTRIBUTION_MASKS: [u8; 8] =
+    [0b000, 0b000, 0b100, 0b110, 0b101, 0b100, 0b001, 0b010];
+
+/// Per-row selection of hash outputs, one bitmask per row of the M matrix.
+///
+/// Each row XORs specific hash outputs for the input combination (i, j):
+/// - bit 0 (LSB): include h_a (H(A_i)) if set
+/// - bit 1:       include h_b (H(B_j)) if set
+/// - bit 2:       include h_ab (H(A_i ⊕ B_j)) if set
+///
+/// Pattern from M matrix:
+/// - Even rows (left halves):  Always H(A_i) ⊕ H(A_i⊕B_j) → 0b101
+/// - Odd rows (right halves):  Always H(B_j) ⊕ H(A_i⊕B_j) → 0b110
+///
+/// Row-by-row breakdown:
+/// - Row 0 (0,0)L: H(A₀) ⊕ H(A₀⊕B₀) → h_a ⊕ h_ab = 0b101
+/// - Row 1 (0,0)R: H(B₀) ⊕ H(A₀⊕B₀) → h_b ⊕ h_ab = 0b110
+/// - Row 2 (0,1)L: H(A₀) ⊕ H(A₀⊕B₁) → h_a ⊕ h_ab = 0b101
+/// - Row 3 (0,1)R: H(B₁) ⊕ H(A₀⊕B₁) → h_b ⊕ h_ab = 0b110
+/// - Row 4 (1,0)L: H(A₁) ⊕ H(A₀⊕B₁) → h_a ⊕ h_ab = 0b101
+/// - Row 5 (1,0)R: H(B₀) ⊕ H(A₀⊕B₁) → h_b ⊕ h_ab = 0b110
+/// - Row 6 (1,1)L: H(A₁) ⊕ H(A₀⊕B₀) → h_a ⊕ h_ab = 0b101
+/// - Row 7 (1,1)R: H(B₁) ⊕ H(A₀⊕B₀) → h_b ⊕ h_ab = 0b110
+const HASH_CONTRIBUTION_MASKS: [u8; 8] =
+    [0b101, 0b110, 0b101, 0b110, 0b101, 0b110, 0b101, 0b110];
+
+/// Evaluator for Three Halves scheme.
+///
+/// Owns a label buffer that `evaluate` grows to the circuit's feed count when
+/// needed (it is never shrunk), so one evaluator can be reused across
+/// circuits.
+#[derive(Debug, Default)]
+pub struct Evaluator {
+    /// Buffer for the active labels.
+    buffer: Vec<Block>,
+}
+
+impl Evaluator {
+    /// Creates a new evaluator whose label buffer is preallocated for
+    /// `capacity` labels.
+    pub fn with_capacity(capacity: usize) -> Self {
+        let buffer = Vec::with_capacity(capacity);
+        Self { buffer }
+    }
+
+    /// Returns a consumer over the encrypted gates of a circuit.
+    ///
+    /// The input labels are copied to the front of the label buffer, which is
+    /// first grown to the circuit's feed count if it is too small.
+    ///
+    /// # Arguments
+    ///
+    /// * `circ` - The circuit to evaluate.
+    /// * `inputs` - The input labels to the circuit.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`EvaluatorError::InputLength`] when the number of input labels
+    /// differs from the circuit's input count.
+    pub fn evaluate<'a>(
+        &'a mut self,
+        circ: &'a Circuit,
+        inputs: &[Mac],
+    ) -> Result<EncryptedGateConsumer<'a, std::slice::Iter<'a, Gate>>, EvaluatorError> {
+        let expected = circ.inputs().len();
+        let actual = inputs.len();
+        if actual != expected {
+            return Err(EvaluatorError::InputLength { expected, actual });
+        }
+
+        // Grow (never shrink) the buffer so every feed of the circuit fits.
+        let feed_count = circ.feed_count();
+        if self.buffer.len() < feed_count {
+            self.buffer.resize(feed_count, Block::default());
+        }
+
+        self.buffer[..actual].copy_from_slice(Mac::as_blocks(inputs));
+
+        let consumer = EncryptedGateConsumer::new(
+            circ.gates().iter(),
+            &mut self.buffer,
+            circ.and_count(),
+            circ.outputs(),
+        );
+        Ok(consumer)
+    }
+
+    /// Returns a consumer over batched encrypted gates of a circuit.
+    ///
+    /// Wraps the consumer produced by [`Evaluator::evaluate`], so the same
+    /// input-length check applies.
+    ///
+    /// # Arguments
+    ///
+    /// * `circ` - The circuit to evaluate.
+    /// * `inputs` - The input labels to the circuit.
+    pub fn evaluate_batched<'a>(
+        &'a mut self,
+        circ: &'a Circuit,
+        inputs: &[Mac],
+    ) -> Result<EncryptedGateBatchConsumer<'a, std::slice::Iter<'a, Gate>>, EvaluatorError> {
+        let consumer = self.evaluate(circ, inputs)?;
+        Ok(EncryptedGateBatchConsumer(consumer))
+    }
+}
+
+/// Consumer over the encrypted gates of a circuit.
+///
+/// Each call to `next` supplies one encrypted gate; the consumer walks the
+/// circuit's gates, evaluating one AND gate with it and any other gates it
+/// passes on the way.
+pub struct EncryptedGateConsumer<'a, I: Iterator> {
+    /// Cipher to use to evaluate the gates.
+    cipher: &'static FixedKeyAes,
+    /// Buffer for the active labels.
+    labels: &'a mut [Block],
+    /// Iterator over the gates.
+    gates: I,
+    /// Current gate id. Starts at 1 and is incremented after each AND gate;
+    /// it is passed to the AND-gate evaluation, which uses it as hash tweak.
+    gid: usize,
+    /// Number of AND gates evaluated.
+    counter: usize,
+    /// Total number of AND gates in the circuit.
+    and_count: usize,
+    /// Range of the outputs in the buffer.
+    outputs: Range<usize>,
+    /// Whether the entire circuit has been evaluated, i.e. the gate iterator
+    /// has been exhausted.
+    complete: bool,
+}
+
+// Opaque `Debug` output: the fields are deliberately not printed.
+impl<I: Iterator> fmt::Debug for EncryptedGateConsumer<'_, I> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("EncryptedGateConsumer { .. }")
+    }
+}
+
+impl<'a, I> EncryptedGateConsumer<'a, I>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    /// Builds a consumer over `gates` that reads and writes wire labels in
+    /// `labels`. Gate ids start at 1 and no AND gate has been evaluated yet.
+    fn new(gates: I, labels: &'a mut [Block], and_count: usize, outputs: Range<usize>) -> Self {
+        Self {
+            cipher: &*FIXED_KEY_AES,
+            labels,
+            gates,
+            gid: 1,
+            counter: 0,
+            and_count,
+            outputs,
+            complete: false,
+        }
+    }
+
+    /// Returns `true` while fewer AND gates have been evaluated than the
+    /// circuit contains, i.e. the evaluator wants more encrypted gates.
+    #[inline]
+    pub fn wants_gates(&self) -> bool {
+        self.and_count != self.counter
+    }
+
+    /// Evaluates the next encrypted gate in the circuit.
+    ///
+    /// Walks the gate list, evaluating XOR/INV/ID gates directly, until one
+    /// AND gate has been evaluated with `encrypted_gate`. If further AND gates
+    /// remain the call returns there; otherwise it keeps going to the end of
+    /// the gate list and marks the circuit as complete.
+    #[inline]
+    pub fn next(&mut self, encrypted_gate: EncryptedGate) {
+        for gate in self.gates.by_ref() {
+            match gate {
+                Gate::Xor { x, y, z } => {
+                    self.labels[z.id()] = self.labels[x.id()] ^ self.labels[y.id()];
+                }
+                Gate::And { x, y, z } => {
+                    let (a, b) = (self.labels[x.id()], self.labels[y.id()]);
+                    self.labels[z.id()] = and_gate(
+                        self.cipher,
+                        &a,
+                        &b,
+                        &encrypted_gate.gate,
+                        &encrypted_gate.control_bits,
+                        self.gid,
+                    );
+
+                    self.gid += 1;
+                    self.counter += 1;
+
+                    // More AND gates remain: stop and wait for the next
+                    // encrypted gate.
+                    if self.counter != self.and_count {
+                        return;
+                    }
+                }
+                // The active label is copied through unchanged.
+                Gate::Inv { x, z } => {
+                    self.labels[z.id()] = self.labels[x.id()];
+                }
+                Gate::Id { x, z } => {
+                    self.labels[z.id()] = self.labels[x.id()];
+                }
+            }
+        }
+
+        self.complete = true;
+    }
+
+    /// Returns the encoded outputs of the circuit.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`EvaluatorError::NotFinished`] if AND gates are still waiting
+    /// for their encrypted gates.
+    pub fn finish(mut self) -> Result<EvaluatorOutput, EvaluatorError> {
+        if self.wants_gates() {
+            return Err(EvaluatorError::NotFinished);
+        }
+
+        // With 0 AND gates `next` was never called, so the "free" gates are
+        // still pending; the default encrypted gate is never used by them.
+        if !self.complete {
+            self.next(EncryptedGate::default());
+        }
+
+        let range = self.outputs.clone();
+        let outputs = Mac::from_blocks(self.labels[range].to_vec());
+        Ok(EvaluatorOutput { outputs })
+    }
+}
+
+/// Consumer returned by [`Evaluator::evaluate_batched`].
+///
+/// Thin wrapper around [`EncryptedGateConsumer`] that accepts encrypted gates
+/// in batches of `N` (defaulting to `DEFAULT_BATCH_SIZE`).
+#[derive(Debug)]
+pub struct EncryptedGateBatchConsumer<'a, I: Iterator, const N: usize = DEFAULT_BATCH_SIZE>(
+    EncryptedGateConsumer<'a, I>,
+);
+
+impl<'a, I, const N: usize> EncryptedGateBatchConsumer<'a, I, N>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    /// Returns `true` if the evaluator wants more encrypted gates.
+    pub fn wants_gates(&self) -> bool {
+        self.0.wants_gates()
+    }
+
+    /// Evaluates the next batch of gates in the circuit.
+    ///
+    /// Gates are fed to the inner consumer one at a time; once it stops
+    /// wanting gates, the rest of the batch is ignored.
+    #[inline]
+    pub fn next(&mut self, batch: EncryptedGateBatch<N>) {
+        let inner = &mut self.0;
+        for encrypted_gate in batch.into_array() {
+            inner.next(encrypted_gate);
+            if !inner.wants_gates() {
+                break;
+            }
+        }
+    }
+
+    /// Returns the encoded outputs of the circuit.
+    pub fn finish(self) -> Result<EvaluatorOutput, EvaluatorError> {
+        let Self(inner) = self;
+        inner.finish()
+    }
+}
+
+// ============================================================================
+// Single gate evaluation (internal)
+// ============================================================================
+
+/// Evaluate a single AND gate using the Three Halves scheme.
+///
+/// The output label is assembled half by half. Each half is the XOR of a
+/// hash contribution, an input-label contribution selected by the expanded
+/// control bits, and a gate-ciphertext contribution.
+///
+/// # Arguments
+/// * `cipher` - The fixed-key AES cipher
+/// * `a` - The active label for input A
+/// * `b` - The active label for input B
+/// * `gate` - The gate ciphertexts (G₀, G₁, G₂)
+/// * `control_bits` - The compressed control bits from the garbler
+/// * `gid` - The gate ID
+///
+/// # Returns
+/// The output label C
+#[inline]
+fn and_gate(
+    cipher: &FixedKeyAes,
+    a: &Block,
+    b: &Block,
+    gate: &ThreeHalvesGate,
+    control_bits: &ControlBits,
+    gid: usize,
+) -> Block {
+    // The color bits (i, j) pick which pair of rows applies to these labels.
+    let (i, j) = (a.lsb() as usize, b.lsb() as usize);
+    let ij = (i << 1) | j;
+
+    // Hash A, B and A ⊕ B, all under the same gate-id tweak.
+    let tweak = Block::new((gid as u128).to_be_bytes());
+    let mut hashes = [*a, *b, *a ^ *b];
+    cipher.rtccr_many(&[tweak; 3], &mut hashes);
+    let [h_a, h_b, h_ab] = hashes.map(SlicedLabel::from_block);
+
+    // Slice the input labels
+    let a_sliced = SlicedLabel::from_block(*a);
+    let b_sliced = SlicedLabel::from_block(*b);
+
+    // Expand compressed r_bar to full marginal (includes R_P for ODD mode)
+    let marginal = expand_evaluator_marginal(control_bits, i, j);
+
+    // One half of the output: `row` indexes the mask tables, `half` selects
+    // the marginal row (0 = left, 1 = right).
+    let output_half = |row: usize, half: usize| {
+        let mut acc = compute_hash_contribution(row, &h_a, &h_b, &h_ab);
+        let inputs = compute_input_contribution(&marginal, half, &a_sliced, &b_sliced);
+        xor_assign_8(&mut acc, &inputs);
+        xor_assign_8(&mut acc, &compute_gate_contribution(row, gate));
+        acc
+    };
+
+    let c_l = output_half(2 * ij, 0);
+    let c_r = output_half(2 * ij + 1, 1);
+
+    SlicedLabel::new(c_l, c_r).to_block()
+}
+
+/// Expand evaluator's marginal from compressed control bits.
+///
+/// Looks up the compressed pair for view `(i << 1) | j`, expands it with
+/// `expand_marginal`, and XORs in R_P's marginal for the same view.
+fn expand_evaluator_marginal(control_bits: &ControlBits, i: usize, j: usize) -> [[u8; 4]; 2] {
+    let r_bar_ij = control_bits.get((i << 1) | j);
+    let mut marginal = expand_marginal(&r_bar_ij);
+
+    // Add R_P's marginal (ODD mode)
+    let r_p_marginal = extract_r_p_marginal(i, j);
+    for (row, r_p_row) in marginal.iter_mut().zip(r_p_marginal.iter()) {
+        for (cell, r_p_cell) in row.iter_mut().zip(r_p_row.iter()) {
+            *cell ^= *r_p_cell;
+        }
+    }
+
+    marginal
+}
+
+/// Compute hash contribution using precomputed bitmask.
+///
+/// `HASH_CONTRIBUTION_MASKS[row]` says which hash outputs take part: bit 0
+/// selects `h_a`, bit 1 `h_b`, bit 2 `h_ab`. The `left` half of every selected
+/// hash is XORed into the result.
+fn compute_hash_contribution(
+    row: usize,
+    h_a: &SlicedLabel,
+    h_b: &SlicedLabel,
+    h_ab: &SlicedLabel,
+) -> [u8; 8] {
+    let mask = HASH_CONTRIBUTION_MASKS[row];
+    let mut acc = [0u8; 8];
+
+    // Position in the array == bit position in the mask.
+    for (bit, hash) in [h_a, h_b, h_ab].iter().enumerate() {
+        if (mask >> bit) & 1 == 1 {
+            xor_assign_8(&mut acc, &hash.left);
+        }
+    }
+
+    acc
+}
+
+/// Compute input contribution from expanded marginal.
+///
+/// Row `half` of the marginal holds four coefficients, for A's left and
+/// right halves followed by B's left and right halves; every half whose
+/// coefficient equals 1 is XORed into the result.
+fn compute_input_contribution(
+    r_bar_expanded: &[[u8; 4]; 2],
+    half: usize,
+    a: &SlicedLabel,
+    b: &SlicedLabel,
+) -> [u8; 8] {
+    let [a_l, a_r, b_l, b_r] = r_bar_expanded[half];
+    let terms = [
+        (a_l, &a.left),
+        (a_r, &a.right),
+        (b_l, &b.left),
+        (b_r, &b.right),
+    ];
+
+    let mut acc = [0u8; 8];
+    for (coeff, label_half) in terms {
+        if coeff == 1 {
+            xor_assign_8(&mut acc, label_half);
+        }
+    }
+
+    acc
+}
+
+/// Compute gate ciphertext contribution using precomputed bitmask.
+///
+/// `GATE_CONTRIBUTION_MASKS[row]` says which ciphertexts take part: bit 0
+/// selects `g0`, bit 1 `g1`, bit 2 `g2`. The selected ciphertexts are XORed
+/// together for this row's evaluation equation.
+fn compute_gate_contribution(row: usize, gate: &ThreeHalvesGate) -> [u8; 8] {
+    let mask = GATE_CONTRIBUTION_MASKS[row];
+    let mut acc = [0u8; 8];
+
+    // Position in the array == bit position in the mask.
+    for (bit, part) in [&gate.g0, &gate.g1, &gate.g2].iter().enumerate() {
+        if mask & (1 << bit) != 0 {
+            xor_assign_8(&mut acc, *part);
+        }
+    }
+
+    acc
+}
+
+/// Errors that can occur during garbled circuit evaluation.
+#[derive(Debug, thiserror::Error)]
+#[allow(missing_docs)]
+pub enum EvaluatorError {
+    /// The number of supplied inputs (`actual`) differs from the number
+    /// that was required (`expected`).
+    #[error("input length mismatch: expected {expected}, got {actual}")]
+    InputLength { expected: usize, actual: usize },
+    /// NOTE(review): presumably reported when the evaluator is finished
+    /// before every encrypted gate has been consumed — confirm against the
+    /// evaluator implementation.
+    #[error("evaluator not finished")]
+    NotFinished,
+}
+
+/// Output of the evaluator.
+///
+/// [`evaluate_garbled_circuits`] returns one of these per evaluated circuit.
+#[derive(Debug)]
+pub struct EvaluatorOutput {
+    /// Output MACs of the circuit.
+    pub outputs: Vec<Mac>,
+}
+
+/// Evaluates a batch of garbled circuits.
+///
+/// With the `rayon` feature enabled the circuits are evaluated in parallel,
+/// each on its own [`Evaluator`] created with a capacity equal to the
+/// circuit's feed count. Without the feature they are evaluated one after
+/// another on a single shared [`Evaluator`].
+///
+/// # Arguments
+///
+/// * `circs` - Vector of (circuit, input MACs, garbled circuit) tuples
+///
+/// # Returns
+///
+/// Vector of evaluation outputs (one per circuit)
+///
+/// # Errors
+///
+/// Propagates any [`EvaluatorError`] raised while starting or finishing the
+/// evaluation of a circuit.
+pub fn evaluate_garbled_circuits(
+    circs: Vec<(Arc<Circuit>, Vec<Mac>, super::GarbledCircuit)>,
+) -> Result<Vec<EvaluatorOutput>, EvaluatorError> {
+    cfg_if::cfg_if! {
+        if #[cfg(feature = "rayon")] {
+            use rayon::prelude::*;
+
+            circs
+                .into_par_iter()
+                .map(|(circuit, macs, garbled)| {
+                    let mut evaluator = Evaluator::with_capacity(circuit.feed_count());
+                    let mut consumer = evaluator.evaluate(&circuit, &macs)?;
+                    // Feed every encrypted gate, in order, before finishing.
+                    garbled.gates.into_iter().for_each(|gate| {
+                        consumer.next(gate);
+                    });
+                    consumer.finish()
+                })
+                .collect::<Result<Vec<_>, _>>()
+        } else {
+            // A single evaluator is reused for every circuit in the batch.
+            let mut evaluator = Evaluator::default();
+            let mut results = Vec::with_capacity(circs.len());
+            for (circuit, macs, garbled) in circs {
+                let mut consumer = evaluator.evaluate(&circuit, &macs)?;
+                garbled.gates.into_iter().for_each(|gate| {
+                    consumer.next(gate);
+                });
+                results.push(consumer.finish()?);
+            }
+
+            Ok(results)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use mpz_circuits::CircuitBuilder;
+    use mpz_core::Block;
+    use mpz_memory_core::correlated::{Delta, Key};
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha12Rng;
+
+    use crate::three_halves::{Garbler, GarblerOutput};
+
+    /// Test fixture containing pre-generated encrypted gates for a single AND
+    /// gate.
+    ///
+    /// The garbler runs exactly once, inside [`AndGateFixture::generate`];
+    /// each test case then evaluates the stored gate without touching a
+    /// garbler again.
+    struct AndGateFixture {
+        /// The encrypted gate (3 blocks: G₀, G₁, G₂)
+        encrypted_gate: EncryptedGate,
+        /// Input keys (0-labels)
+        input_keys: Vec<Key>,
+        /// Output 0-label
+        output_label: Key,
+        /// Delta for computing 1-label
+        delta: Delta,
+        /// The circuit being garbled
+        circuit: Circuit,
+    }
+
+    impl AndGateFixture {
+        /// Generate a fixture for a single AND gate using a fixed seed.
+        ///
+        /// This uses seed=12345 for reproducible test fixtures. The same RNG
+        /// supplies, in this order, delta, the two input keys and the
+        /// garbler's randomness, so reordering those calls changes the
+        /// fixture.
+        fn generate() -> Self {
+            let mut rng = ChaCha12Rng::seed_from_u64(12345);
+            let delta = Delta::random(&mut rng);
+
+            // Create single AND gate circuit
+            let mut builder = CircuitBuilder::new();
+            let a = builder.add_input();
+            let b = builder.add_input();
+            let out = builder.add_and_gate(a, b);
+            builder.add_output(out);
+            let circuit = builder.build().unwrap();
+
+            // One random key per circuit input.
+            let input_keys: Vec<Key> = (0..2)
+                .map(|_| {
+                    let block: Block = Block::random(&mut rng);
+                    block.into()
+                })
+                .collect();
+
+            // Garble the circuit
+            let mut gb = Garbler::default();
+            let mut gb_iter = gb.generate(&circuit, delta, &input_keys, &mut rng).unwrap();
+
+            // Extract the single encrypted gate
+            let encrypted_gate = gb_iter.next().expect("should have one gate");
+            assert!(gb_iter.next().is_none(), "should only have one gate");
+
+            // The garbler reports only the 0-label of each output wire.
+            let GarblerOutput {
+                outputs: output_labels,
+            } = gb_iter.finish().unwrap();
+
+            Self {
+                encrypted_gate,
+                input_keys,
+                output_label: output_labels[0],
+                delta,
+                circuit,
+            }
+        }
+    }
+
+    /// Unit test: Evaluator with pre-generated fixture for all AND gate inputs
+    ///
+    /// Tests all 4 input combinations: (0,0), (0,1), (1,0), (1,1)
+    #[test]
+    fn test_evaluate_and_gate_all_inputs() {
+        let fixture = AndGateFixture::generate();
+
+        // Test cases: (a_bit, b_bit, expected_output_bit)
+        let test_cases = [
+            (false, false, false), // AND(0,0) = 0
+            (false, true, false),  // AND(0,1) = 0
+            (true, false, false),  // AND(1,0) = 0
+            (true, true, true),    // AND(1,1) = 1
+        ];
+
+        for (a_bit, b_bit, expected_output) in test_cases {
+            // Select input MACs based on input bits
+            // Key is 0-label, Key ⊕ Δ is 1-label
+            let delta_block = *fixture.delta.as_block();
+            let input_macs = vec![
+                if a_bit {
+                    Mac::from(*fixture.input_keys[0].as_block() ^ delta_block)
+                } else {
+                    Mac::from(*fixture.input_keys[0].as_block())
+                },
+                if b_bit {
+                    Mac::from(*fixture.input_keys[1].as_block() ^ delta_block)
+                } else {
+                    Mac::from(*fixture.input_keys[1].as_block())
+                },
+            ];
+
+            // Evaluate with a fresh evaluator per case, feeding the one stored gate.
+            let mut ev = Evaluator::default();
+            let mut ev_consumer = ev.evaluate(&fixture.circuit, &input_macs).unwrap();
+            ev_consumer.next(fixture.encrypted_gate);
+
+            let EvaluatorOutput { outputs } = ev_consumer.finish().unwrap();
+
+            // Verify output: compute 1-label as 0-label XOR delta
+            let false_mac = Mac::from(*fixture.output_label.as_block());
+            let true_mac = Mac::from(*fixture.output_label.as_block() ^ *fixture.delta.as_block());
+            let expected_mac = if expected_output { true_mac } else { false_mac };
+
+            assert_eq!(
+                outputs[0], expected_mac,
+                "AND({},{}) should be {}",
+                a_bit as u8, b_bit as u8, expected_output as u8
+            );
+        }
+    }
+}
diff --git a/crates/garble-core/src/three_halves/garbler.rs b/crates/garble-core/src/three_halves/garbler.rs
new file mode 100644
index 00000000..bd4e43ac
--- /dev/null
+++ b/crates/garble-core/src/three_halves/garbler.rs
@@ -0,0 +1,687 @@
+//! Garbler for Three Halves Scheme
+//!
+//! This module implements the garbling function for circuits using the
+//! Three Halves technique from Rosulek & Roy 2021.
+
+use core::fmt;
+use std::ops::Range;
+
+use mpz_circuits::{Circuit, Gate};
+use mpz_core::{
+    Block,
+    aes::{FIXED_KEY_AES, FixedKeyAes},
+};
+use mpz_memory_core::correlated::{Delta, Key};
+use rand::{CryptoRng, Rng};
+
+use super::{
+    ControlBits, EncryptedGate, EncryptedGateBatch, ThreeHalvesGate,
+    control::sample_r_odd,
+    garbler_tables::{M_COLUMN_MASKS, R_COLUMN_MASKS},
+    random_bits::RandomBitSource,
+    slicing::SlicedLabel,
+};
+
+use crate::DEFAULT_BATCH_SIZE;
+
+/// Garbler for Three Halves scheme.
+///
+/// Owns the wire-label buffer that [`Garbler::generate`] grows on demand and
+/// lends to the returned gate iterator, so the allocation is kept between
+/// calls.
+#[derive(Debug, Default)]
+pub struct Garbler {
+    /// Wire labels W_k (color bit 0) for each wire.
+    buffer: Vec<Block>,
+}
+
+impl Garbler {
+    /// Starts garbling `circ` and returns an iterator over its encrypted
+    /// gates.
+    ///
+    /// Input wire `i` is seeded from `inputs[i]`: its permute bit π is the
+    /// key's LSB and the stored label is `Key ⊕ (π·Δ)`. A [`RandomBitSource`]
+    /// holding two bits per AND gate is built from `rng` before the iterator
+    /// is returned.
+    ///
+    /// # Arguments
+    ///
+    /// * `circ` - The circuit to garble.
+    /// * `delta` - The delta value to use for garbling.
+    /// * `inputs` - The input labels to the circuit.
+    /// * `rng` - Random number generator for control matrix randomization.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`GarblerError::InputLength`] when `inputs.len()` differs from
+    /// the number of circuit inputs.
+    pub fn generate<'a, R: Rng + CryptoRng>(
+        &'a mut self,
+        circ: &'a Circuit,
+        delta: Delta,
+        inputs: &[Key],
+        rng: &mut R,
+    ) -> Result<EncryptedGateIter<'a, std::slice::Iter<'a, Gate>>, GarblerError> {
+        let expected = circ.inputs().len();
+        let actual = inputs.len();
+        if actual != expected {
+            return Err(GarblerError::InputLength { expected, actual });
+        }
+
+        // Grow (never shrink) the reusable label buffer to cover every feed.
+        let feed_count = circ.feed_count();
+        if self.buffer.len() < feed_count {
+            self.buffer.resize(feed_count, Block::default());
+        }
+
+        // Every wire starts with permute bit 0; input wires are fixed up below.
+        let mut permute_bits = vec![false; feed_count];
+
+        // Control matrix randomization uses two random bits per AND gate.
+        let and_count = circ.and_count();
+        let random_bits = RandomBitSource::new(2 * and_count, rng);
+
+        let delta_block = *delta.as_block();
+
+        // For input wires: π = Key.lsb(), W_k = Key ⊕ (π·Δ)
+        for (wire, key) in inputs.iter().enumerate() {
+            let key_block = *key.as_block();
+            let pi = key_block.lsb();
+            self.buffer[wire] = if pi {
+                key_block ^ delta_block
+            } else {
+                key_block
+            };
+            permute_bits[wire] = pi;
+        }
+
+        let iter = EncryptedGateIter::new(
+            delta,
+            circ.gates().iter(),
+            &mut self.buffer,
+            permute_bits,
+            and_count,
+            circ.outputs(),
+            random_bits,
+        );
+
+        Ok(iter)
+    }
+
+    /// Same as [`Garbler::generate`], but the returned iterator yields the
+    /// encrypted gates grouped into batches.
+    ///
+    /// # Arguments
+    ///
+    /// * `circ` - The circuit to garble.
+    /// * `delta` - The delta value to use for garbling.
+    /// * `inputs` - The input labels to the circuit.
+    /// * `rng` - Random number generator for control matrix randomization.
+    ///
+    /// # Errors
+    ///
+    /// Fails exactly when [`Garbler::generate`] fails.
+    pub fn generate_batched<'a, R: Rng + CryptoRng>(
+        &'a mut self,
+        circ: &'a Circuit,
+        delta: Delta,
+        inputs: &[Key],
+        rng: &mut R,
+    ) -> Result<EncryptedGateBatchIter<'a, std::slice::Iter<'a, Gate>>, GarblerError> {
+        let gates = self.generate(circ, delta, inputs, rng)?;
+        Ok(EncryptedGateBatchIter(gates))
+    }
+}
+
+/// Errors that can occur during garbled circuit generation.
+#[derive(Debug, thiserror::Error)]
+#[allow(missing_docs)]
+pub enum GarblerError {
+    /// The number of input labels passed to the garbler (`actual`) differs
+    /// from the number of circuit inputs (`expected`).
+    #[error("input length mismatch: expected {expected}, got {actual}")]
+    InputLength { expected: usize, actual: usize },
+    /// `finish` was called while AND gates were still waiting to be
+    /// generated.
+    #[error("garbler not finished")]
+    NotFinished,
+}
+
+/// Output of the garbler.
+///
+/// Produced by `finish` on the gate iterators once every AND gate has been
+/// generated.
+#[derive(Debug)]
+pub struct GarblerOutput {
+    /// Output labels for each output wire (0-label only).
+    pub outputs: Vec<Key>,
+}
+
+/// Iterator over encrypted gates of a garbled circuit.
+///
+/// Yields one [`EncryptedGate`] per AND gate. XOR, INV and ID gates are
+/// processed on the way without yielding anything. Call `finish` after the
+/// last AND gate to obtain the output keys.
+pub struct EncryptedGateIter<'a, I> {
+    /// Cipher to use to encrypt the gates.
+    cipher: &'static FixedKeyAes,
+    /// Global offset.
+    delta: Delta,
+    /// Wire labels W_k (color bit 0) for each wire.
+    labels: &'a mut [Block],
+    /// Buffer for the point-and-permute bits (tracked separately from labels).
+    permute_bits: Vec<bool>,
+    /// Iterator over the gates.
+    gates: I,
+    /// Gate id handed to the next AND gate; starts at 1 and advances once
+    /// per AND gate.
+    gid: usize,
+    /// Number of AND gates generated.
+    counter: usize,
+    /// Number of AND gates in the circuit.
+    and_count: usize,
+    /// Range of the outputs in the buffer.
+    outputs: Range<usize>,
+    /// Whether the entire circuit has been garbled.
+    complete: bool,
+    /// Pre-generated random bits for control matrix randomization.
+    random_bits: RandomBitSource,
+}
+
+/// Opaque `Debug` output: only the type name is printed, none of the fields.
+impl<I> fmt::Debug for EncryptedGateIter<'_, I> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("EncryptedGateIter { .. }")
+    }
+}
+
+impl<'a, I> EncryptedGateIter<'a, I>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    /// Builds an iterator in its initial state: gate id 1, no AND gate
+    /// generated yet, not complete, using the global fixed-key AES cipher.
+    fn new(
+        delta: Delta,
+        gates: I,
+        labels: &'a mut [Block],
+        permute_bits: Vec<bool>,
+        and_count: usize,
+        outputs: Range<usize>,
+        random_bits: RandomBitSource,
+    ) -> Self {
+        let cipher: &'static FixedKeyAes = &*FIXED_KEY_AES;
+
+        Self {
+            cipher,
+            delta,
+            labels,
+            permute_bits,
+            gates,
+            gid: 1,
+            counter: 0,
+            and_count,
+            outputs,
+            complete: false,
+            random_bits,
+        }
+    }
+
+    /// Returns `true` if the garbler has more encrypted gates to generate.
+    #[inline]
+    pub fn has_gates(&self) -> bool {
+        self.and_count != self.counter
+    }
+
+    /// Returns the encoded outputs of the circuit.
+    ///
+    /// Any gates still pending after the last AND gate are processed first.
+    /// Each output key is the wire's 0-label, `W_k ⊕ (π_k · Δ)`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`GarblerError::NotFinished`] if AND gates remain to be
+    /// generated.
+    pub fn finish(mut self) -> Result<GarblerOutput, GarblerError> {
+        if self.has_gates() {
+            return Err(GarblerError::NotFinished);
+        }
+
+        // Finish computing any "free" gates.
+        if !self.complete {
+            assert_eq!(self.next(), None);
+        }
+
+        let delta_block = *self.delta.as_block();
+
+        let mut outputs: Vec<Key> = Vec::with_capacity(self.outputs.len());
+        for wire in self.outputs.clone() {
+            let w_k = self.labels[wire];
+            // 0-label = W_k ⊕ (π_k · Δ)
+            let zero_label = if self.permute_bits[wire] {
+                w_k ^ delta_block
+            } else {
+                w_k
+            };
+            outputs.push(zero_label.into());
+        }
+
+        Ok(GarblerOutput { outputs })
+    }
+}
+
+impl<'a, I> Iterator for EncryptedGateIter<'a, I>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    type Item = EncryptedGate;
+
+    /// Advances through the circuit until the next AND gate and returns its
+    /// encrypted gate.
+    ///
+    /// XOR, INV and ID gates met on the way only update the label and
+    /// permute-bit buffers. After the final AND gate the remaining gates are
+    /// processed immediately and the iterator is marked complete. Returns
+    /// `None` once the gate stream is exhausted.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        while let Some(gate) = self.gates.next() {
+            match gate {
+                Gate::Xor { x, y, z } => {
+                    let (x, y, z) = (x.id(), y.id(), z.id());
+                    // Free XOR: both the label and the permute bit of the
+                    // output are the XOR of the inputs'.
+                    self.labels[z] = self.labels[x] ^ self.labels[y];
+                    self.permute_bits[z] = self.permute_bits[x] ^ self.permute_bits[y];
+                }
+                Gate::And { x, y, z } => {
+                    let (x, y, z) = (x.id(), y.id(), z.id());
+                    let (w_a, w_b) = (self.labels[x], self.labels[y]);
+                    let (pi_a, pi_b) = (self.permute_bits[x], self.permute_bits[y]);
+
+                    // Pre-generated random bits for this AND gate.
+                    let rand_bits = self.random_bits.next_two_bits();
+
+                    let (w_c, pi_c, encrypted_gate) = and_gate(
+                        self.cipher,
+                        &w_a,
+                        &w_b,
+                        pi_a,
+                        pi_b,
+                        &self.delta,
+                        self.gid,
+                        rand_bits,
+                    );
+                    // `and_gate` already returns the adjusted label W_c.
+                    self.labels[z] = w_c;
+                    self.permute_bits[z] = pi_c;
+
+                    self.gid += 1;
+                    self.counter += 1;
+
+                    // If we have generated all AND gates, compute remaining free gates.
+                    if !self.has_gates() {
+                        assert!(self.next().is_none());
+                        self.complete = true;
+                    }
+
+                    return Some(encrypted_gate);
+                }
+                Gate::Inv { x, z } => {
+                    let (x, z) = (x.id(), z.id());
+                    // INV: W_k stays the same, but the permute bit flips.
+                    self.labels[z] = self.labels[x];
+                    self.permute_bits[z] = !self.permute_bits[x];
+                }
+                Gate::Id { x, z } => {
+                    let (x, z) = (x.id(), z.id());
+                    // ID: label and permute bit are copied unchanged.
+                    self.labels[z] = self.labels[x];
+                    self.permute_bits[z] = self.permute_bits[x];
+                }
+            }
+        }
+
+        None
+    }
+}
+
+/// Iterator returned by [`Garbler::generate_batched`].
+///
+/// Wraps an [`EncryptedGateIter`] and groups its encrypted gates into batches
+/// of `N` gates (`DEFAULT_BATCH_SIZE` unless specified).
+#[derive(Debug)]
+pub struct EncryptedGateBatchIter<'a, I: Iterator, const N: usize = DEFAULT_BATCH_SIZE>(
+    EncryptedGateIter<'a, I>,
+);
+
+impl<'a, I, const N: usize> EncryptedGateBatchIter<'a, I, N>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    /// Returns `true` if the garbler has more encrypted gates to generate.
+    pub fn has_gates(&self) -> bool {
+        let Self(inner) = self;
+        inner.has_gates()
+    }
+
+    /// Returns the encoded outputs of the circuit.
+    ///
+    /// Delegates to the wrapped iterator's `finish`, including its error
+    /// when AND gates remain to be generated.
+    pub fn finish(self) -> Result<GarblerOutput, GarblerError> {
+        let Self(inner) = self;
+        inner.finish()
+    }
+}
+
+impl<'a, I, const N: usize> Iterator for EncryptedGateBatchIter<'a, I, N>
+where
+    I: Iterator<Item = &'a Gate>,
+{
+    type Item = EncryptedGateBatch<N>;
+
+    /// Returns the next batch of up to `N` encrypted gates, or `None` once
+    /// every AND gate has been generated.
+    ///
+    /// A final batch with fewer than `N` gates is padded with
+    /// `EncryptedGate::default()`.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if !self.has_gates() {
+            return None;
+        }
+
+        let mut batch = [EncryptedGate::default(); N];
+        for (slot, gate) in self.0.by_ref().enumerate() {
+            batch[slot] = gate;
+
+            // Stop pulling as soon as the batch is full so no gate is lost.
+            if slot + 1 == N {
+                break;
+            }
+        }
+
+        Some(EncryptedGateBatch::new(batch))
+    }
+}
+
+// ============================================================================
+// Single gate garbling (internal)
+// ============================================================================
+
+/// Garble a single AND gate using the Three Halves scheme.
+///
+/// The gate is obtained by building `RHS = M·H ⊕ R·[A₀; B₀; Δ]` and solving
+/// it for the output label and the three gate ciphertexts.
+///
+/// # Arguments
+/// * `cipher` - The fixed-key AES cipher
+/// * `w_a` - Wire label W_a (color bit 0)
+/// * `w_b` - Wire label W_b (color bit 0)
+/// * `pi_a` - The permute bit for input A
+/// * `pi_b` - The permute bit for input B
+/// * `delta` - The global offset
+/// * `gid` - The gate ID
+/// * `rand_bits` - Pre-generated random bits for control matrix
+///
+/// # Returns
+/// * `(W_c, pi_c, gate)` where `pi_c = lsb(C)` and `W_c = C ⊕ pi_c·Δ`, so W_c
+///   has LSB = 0 (assuming Δ has LSB = 1)
+fn and_gate(
+    cipher: &FixedKeyAes,
+    w_a: &Block,
+    w_b: &Block,
+    pi_a: bool,
+    pi_b: bool,
+    delta: &Delta,
+    gid: usize,
+    rand_bits: [bool; 2],
+) -> (Block, bool, EncryptedGate) {
+    let delta_block = *delta.as_block();
+    let (a0, b0) = (*w_a, *w_b);
+
+    // Six hash values derived from W_a, W_b and Δ for this gate id.
+    let hashes = compute_hashes(cipher, a0, b0, delta_block, gid);
+
+    let a0_sliced = SlicedLabel::from_block(a0);
+    let b0_sliced = SlicedLabel::from_block(b0);
+    let delta_sliced = SlicedLabel::from_block(delta_block);
+
+    // Randomized control matrix R for AND gates (ODD mode). The table index
+    // packs (π_a, π_b, rand₀, rand₁) from most to least significant bit.
+    //
+    // The permute bits relate color bits to logical values:
+    // - color bit i of input A stands for logical value (i ⊕ π_a)
+    // - color bit j of input B stands for logical value (j ⊕ π_b)
+    let [rand_hi, rand_lo] = rand_bits;
+    let r_index = (usize::from(pi_a) << 3)
+        | (usize::from(pi_b) << 2)
+        | (usize::from(rand_hi) << 1)
+        | usize::from(rand_lo);
+    let r_bar = sample_r_odd(pi_a, pi_b, rand_bits);
+
+    // RHS = M·H ⊕ R·[A₀; B₀; Δ], accumulated row by row.
+    let mut rhs = apply_m_to_hashes(&hashes);
+    let r_times_input = apply_r_to_inputs(r_index, &a0_sliced, &b0_sliced, &delta_sliced);
+    for (row, term) in rhs.iter_mut().zip(r_times_input.iter()) {
+        xor_assign_8(row, term);
+    }
+
+    // Solve for [C; G⃗].
+    let [c_left, c_right, g0, g1, g2] = solve_for_output(&rhs, &delta_sliced, pi_a, pi_b);
+
+    // Per the paper (Section 5.3): π_c := lsb(C), W_c := C ⊕ π_c·Δ.
+    let c = SlicedLabel::new(c_left, c_right).to_block();
+    let pi_c = c.lsb();
+    let w_c = if pi_c { c ^ delta_block } else { c };
+
+    let encrypted = EncryptedGate::new(ThreeHalvesGate::new(g0, g1, g2), ControlBits::new(r_bar));
+
+    (w_c, pi_c, encrypted)
+}
+
+/// Compute the 6 hash values needed for garbling.
+///
+/// The hashed blocks are, in order, `A₀`, `A₁`, `B₀`, `B₁`, `A₀⊕B₀` and
+/// `A₀⊕B₁`, where `A₁ = A₀⊕Δ` and `B₁ = B₀⊕Δ`. All six are hashed with
+/// `rtccr_many` under the same tweak, the big-endian encoding of `gid`, and
+/// each result is returned as a [`SlicedLabel`].
+fn compute_hashes(
+    cipher: &FixedKeyAes,
+    a0: Block,
+    b0: Block,
+    delta: Block,
+    gid: usize,
+) -> [SlicedLabel; 6] {
+    let (a1, b1) = (a0 ^ delta, b0 ^ delta);
+
+    // One copy of the gate tweak per hashed block.
+    let tweaks = [Block::new((gid as u128).to_be_bytes()); 6];
+    let mut blocks = [a0, a1, b0, b1, a0 ^ b0, a0 ^ b1];
+    cipher.rtccr_many(&tweaks, &mut blocks);
+
+    blocks.map(SlicedLabel::from_block)
+}
+
+/// Apply matrix M to the hash vector without data-dependent branches.
+///
+/// Bit `c` of `M_COLUMN_MASKS[row]` says whether the left half of
+/// `hashes[c]` is XORed into output row `row`. Only the `left` halves of
+/// the hashes are used.
+///
+/// Returns M × H⃗ as 8×8 byte array, where each row is a 64-bit value.
+#[inline]
+fn apply_m_to_hashes(hashes: &[SlicedLabel; 6]) -> [[u8; 8]; 8] {
+    // Treat each left half as one u64 lane so a row is six AND/XOR steps.
+    let mut lanes = [0u64; 6];
+    for (lane, hash) in lanes.iter_mut().zip(hashes) {
+        *lane = u64::from_le_bytes(hash.left);
+    }
+
+    let mut result = [[0u8; 8]; 8];
+    for (row, out) in result.iter_mut().enumerate() {
+        let mask = M_COLUMN_MASKS[row] as u64;
+
+        let mut acc = 0u64;
+        for (col, lane) in lanes.iter().enumerate() {
+            // `wrapping_neg` stretches the selected bit: 0 → 0, 1 → all ones.
+            acc ^= lane & ((mask >> col) & 1).wrapping_neg();
+        }
+
+        *out = acc.to_le_bytes();
+    }
+
+    result
+}
+
+/// Apply control matrix R to the input labels without data-dependent
+/// branches.
+///
+/// `R_COLUMN_MASKS[r_index]` supplies one bitmask per output row. Bits 0..=5
+/// of a mask select, in order, `a0.left`, `a0.right`, `b0.left`,
+/// `b0.right`, `delta.left` and `delta.right`.
+///
+/// Returns R × [A₀; B₀; Δ] as 8×8 byte array, where each row is a 64-bit value.
+#[inline]
+fn apply_r_to_inputs(
+    r_index: usize,
+    a0: &SlicedLabel,
+    b0: &SlicedLabel,
+    delta: &SlicedLabel,
+) -> [[u8; 8]; 8] {
+    // One u64 lane per column, so each row is six AND/XOR steps.
+    let lanes: [u64; 6] = [
+        a0.left,
+        a0.right,
+        b0.left,
+        b0.right,
+        delta.left,
+        delta.right,
+    ]
+    .map(u64::from_le_bytes);
+
+    let masks = &R_COLUMN_MASKS[r_index];
+    let mut result = [[0u8; 8]; 8];
+
+    for (row, out) in result.iter_mut().enumerate() {
+        let mask = masks[row] as u64;
+
+        // `wrapping_neg` stretches the selected bit: 0 → 0, 1 → all ones.
+        let combined = lanes.iter().enumerate().fold(0u64, |acc, (col, lane)| {
+            acc ^ (lane & ((mask >> col) & 1).wrapping_neg())
+        });
+
+        *out = combined.to_le_bytes();
+    }
+
+    result
+}
+
+/// Solve for [C_L, C_R, G₀, G₁, G₂] from RHS.
+///
+/// Δ is first XORed into the two rows (left and right) that belong to the
+/// input combination `(!pi_a, !pi_b)`, where the AND gate outputs true. The
+/// outputs are then fixed XOR combinations of the adjusted rows:
+///
+/// * `C_L = RHS[0]`
+/// * `C_R = RHS[1]`
+/// * `G₀ = RHS[0] ⊕ RHS[1] ⊕ RHS[4] ⊕ RHS[5]`
+/// * `G₁ = RHS[0] ⊕ RHS[1] ⊕ RHS[2] ⊕ RHS[3]`
+/// * `G₂ = RHS[4] ⊕ RHS[6]`
+fn solve_for_output(
+    rhs: &[[u8; 8]; 8],
+    delta: &SlicedLabel,
+    pi_a: bool,
+    pi_b: bool,
+) -> [[u8; 8]; 5] {
+    let mut adjusted = *rhs;
+
+    // Position of the "true" output among the four (i, j) combinations; each
+    // combination owns two consecutive rows (left half, right half).
+    let true_slot = (usize::from(!pi_a) << 1) | usize::from(!pi_b);
+    xor_assign_8(&mut adjusted[2 * true_slot], &delta.left);
+    xor_assign_8(&mut adjusted[2 * true_slot + 1], &delta.right);
+
+    // XOR of the listed adjusted rows.
+    let xor_rows = |rows: &[usize]| {
+        rows.iter().fold([0u8; 8], |mut acc, &row| {
+            xor_assign_8(&mut acc, &adjusted[row]);
+            acc
+        })
+    };
+
+    [
+        adjusted[0],
+        adjusted[1],
+        xor_rows(&[0, 1, 4, 5]),
+        xor_rows(&[0, 1, 2, 3]),
+        xor_rows(&[4, 6]),
+    ]
+}
+
+/// XORs `b` into `a` byte by byte, leaving `b` untouched.
+#[inline]
+pub(crate) fn xor_assign_8(a: &mut [u8; 8], b: &[u8; 8]) {
+    for (dst, src) in a.iter_mut().zip(b) {
+        *dst ^= *src;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use mpz_circuits::circuits::xor;
+    use mpz_core::aes::FIXED_KEY_AES;
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha12Rng;
+
+    /// Garbling the same AND gate twice with identical labels, delta, gate
+    /// id and random bits must give the same label, permute bit and
+    /// encrypted gate.
+    #[test]
+    fn test_garbling_deterministic() {
+        let cipher = &(*FIXED_KEY_AES);
+        let mut rng = ChaCha12Rng::seed_from_u64(42);
+
+        // Labels get LSB = 0 and delta gets LSB = 1.
+        let mut a0 = Block::random(&mut rng);
+        let mut b0 = Block::random(&mut rng);
+        let mut delta = Block::random(&mut rng);
+        a0.set_lsb(false);
+        b0.set_lsb(false);
+        delta.set_lsb(true);
+
+        // Use the same random bits for both calls
+        let rand_bits = [true, false];
+
+        // Test with permute bits both false (since labels have LSB=0)
+        let (z1, pi_z1, gate1) = and_gate(
+            cipher,
+            &a0,
+            &b0,
+            false,
+            false,
+            &Delta::new(delta),
+            1,
+            rand_bits,
+        );
+        let (z2, pi_z2, gate2) = and_gate(
+            cipher,
+            &a0,
+            &b0,
+            false,
+            false,
+            &Delta::new(delta),
+            1,
+            rand_bits,
+        );
+
+        assert_eq!(z1, z2);
+        assert_eq!(pi_z1, pi_z2);
+        assert_eq!(gate1, gate2);
+    }
+
+    /// A circuit without AND gates yields no encrypted gates, yet `finish`
+    /// still returns one key per output wire.
+    #[test]
+    fn test_garble_xor_circuit() {
+        let mut rng = ChaCha12Rng::seed_from_u64(42);
+        let circ = xor(8);
+
+        let mut delta = Block::random(&mut rng);
+        delta.set_lsb(true);
+        let delta = Delta::new(delta);
+
+        // One random key per circuit input.
+        let input_keys: Vec<Key> = (0..circ.inputs().len())
+            .map(|_| {
+                let block: Block = rng.random();
+                block.into()
+            })
+            .collect();
+
+        let mut gb = Garbler::default();
+        let iter = gb.generate(&circ, delta, &input_keys, &mut rng).unwrap();
+
+        // XOR circuit has no AND gates
+        assert!(!iter.has_gates());
+
+        let output = iter.finish().unwrap();
+        assert_eq!(output.outputs.len(), circ.outputs().len());
+    }
+}
diff --git a/crates/garble-core/src/three_halves/garbler_tables.rs b/crates/garble-core/src/three_halves/garbler_tables.rs
new file mode 100644
index 00000000..1ba99ddf
--- /dev/null
+++ b/crates/garble-core/src/three_halves/garbler_tables.rs
@@ -0,0 +1,534 @@
+//! Precomputed Lookup Tables for Garbler Optimizations
+//!
+//! This module contains precomputed bitmask tables used by the garbler for
+//! branchless matrix application. These tables eliminate conditional branches
+//! that would otherwise cause significant performance overhead.
+
+// ============================================================================
+// Precomputed M Matrix Application
+// ============================================================================
+//
+// # Why This Optimization Exists
+//
+// Similar to `apply_r_to_inputs`, the naive `apply_m_to_hashes` function has
+// 48 conditional branches (8 rows × 6 columns). While M is a fixed matrix
+// (unlike R which has 16 variants), we can still eliminate branches using
+// precomputed bitmasks.
+//
+// # The M Matrix (from Paper Page 10, Table 2)
+//
+// M specifies which hash values to XOR for each evaluation equation:
+//
+// ```text
+// M = [ 1 0 0 0 1 0 ]  <- (0,0) left:  H(A₀) ⊕ H(A₀⊕B₀)
+//     [ 0 0 1 0 1 0 ]  <- (0,0) right: H(B₀) ⊕ H(A₀⊕B₀)
+//     [ 1 0 0 0 0 1 ]  <- (0,1) left:  H(A₀) ⊕ H(A₀⊕B₁)
+//     [ 0 0 0 1 0 1 ]  <- (0,1) right: H(B₁) ⊕ H(A₀⊕B₁)
+//     [ 0 1 0 0 0 1 ]  <- (1,0) left:  H(A₁) ⊕ H(A₀⊕B₁)
+//     [ 0 0 1 0 0 1 ]  <- (1,0) right: H(B₀) ⊕ H(A₀⊕B₁)
+//     [ 0 1 0 0 1 0 ]  <- (1,1) left:  H(A₁) ⊕ H(A₀⊕B₀)
+//     [ 0 0 0 1 1 0 ]  <- (1,1) right: H(B₁) ⊕ H(A₀⊕B₀)
+// ```
+//
+// Columns: [H(A₀), H(A₁), H(B₀), H(B₁), H(A₀⊕B₀), H(A₀⊕B₁)]
+//
+// # Bitmask Encoding
+//
+// Each u8 bitmask has bits [0..5] indicating which columns to XOR:
+//   bit 0 = H(A₀)
+//   bit 1 = H(A₁)
+//   bit 2 = H(B₀)
+//   bit 3 = H(B₁)
+//   bit 4 = H(A₀⊕B₀)
+//   bit 5 = H(A₀⊕B₁)
+
+/// Precomputed column bitmasks for the M matrix.
+///
+/// Since M is a fixed constant matrix, we only need one set of 8 bitmasks
+/// (one per row), unlike R which has 16 variants.
+///
+/// Derived from `M` in matrices.rs at compile time, so the table cannot drift
+/// from the matrix: bit `col` of `M_COLUMN_MASKS[row]` is `M[row][col]`.
+/// Resulting values: `[0x11, 0x14, 0x21, 0x28, 0x22, 0x24, 0x12, 0x18]`.
+pub(super) const M_COLUMN_MASKS: [u8; 8] = {
+    let m = super::matrices::M;
+    let mut masks = [0u8; 8];
+    // `for` loops are not allowed in const evaluation, hence `while`.
+    let mut row = 0;
+    while row < 8 {
+        let mut col = 0;
+        while col < 6 {
+            masks[row] |= m[row][col] << col;
+            col += 1;
+        }
+        row += 1;
+    }
+    masks
+};
+
+// ============================================================================
+// Precomputed R Matrix Application
+// ============================================================================
+//
+// # Why This Optimization Exists
+//
+// The naive `apply_r_to_inputs` function performs an 8×6 matrix-vector multiply
+// where each entry is a conditional XOR:
+//
+//   for row in 0..8:
+//       for col in 0..6:
+//           if R[row][col]:
+//               result[row] ^= inputs[col]
+//
+// This results in 48 conditional branches per AND gate. With millions of gates,
+// branch mispredictions become a significant bottleneck (~33% of garbling time
+// in profiling).
+//
+// # The Optimization
+//
+// The control matrix R comes from `sample_r_odd`, which returns one of only
+// **16 possible matrices** (4 permute bit combinations × 4 random bit
+// combinations). Instead of runtime conditionals, we:
+//
+// 1. Precompute a bitmask for each row of each R variant, indicating which
+//    columns (inputs) to XOR together
+// 2. At runtime, use branchless masking: `result ^= input & mask`
+//
+// This eliminates all branches and reduces the operation to pure XOR/AND.
+//
+// # Bitmask Format
+//
+// Each u8 bitmask has bits [0..5] corresponding to the 6 input columns:
+//   bit 0 = A₀_L (a0.left)
+//   bit 1 = A₀_R (a0.right)
+//   bit 2 = B₀_L (b0.left)
+//   bit 3 = B₀_R (b0.right)
+//   bit 4 = Δ_L  (delta.left)
+//   bit 5 = Δ_R  (delta.right)
+//
+// # Table Index
+//
+// Index = (pi_a << 3) | (pi_b << 2) | (r0 << 1) | r1
+// Same indexing as sample_r_odd in control.rs.
+
+/// Precomputed column bitmasks for each R matrix variant.
+///
+/// `R_COLUMN_MASKS[variant][row]` is a u8 where bit `col` indicates whether
+/// R[row][col] is true (i.e., whether to XOR input[col] into result[row]).
+///
+/// Each variant is written below as an 8×6 matrix of 0/1 entries and packed
+/// into its 8 row bitmasks at compile time by `matrix_to_masks`.
+pub(super) const R_COLUMN_MASKS: [[u8; 8]; 16] = {
+    // Pack a row [c0,c1,c2,c3,c4,c5] of 0/1 entries: entry `col` becomes bit `col`
+    const fn row_to_mask(row: [u8; 6]) -> u8 {
+        row[0] | (row[1] << 1) | (row[2] << 2) | (row[3] << 3) | (row[4] << 4) | (row[5] << 5)
+    }
+
+    // Convert a full 8×6 R matrix to 8 bitmasks, one per row, in row order
+    const fn matrix_to_masks(r: [[u8; 6]; 8]) -> [u8; 8] {
+        [
+            row_to_mask(r[0]),
+            row_to_mask(r[1]),
+            row_to_mask(r[2]),
+            row_to_mask(r[3]),
+            row_to_mask(r[4]),
+            row_to_mask(r[5]),
+            row_to_mask(r[6]),
+            row_to_mask(r[7]),
+        ]
+    }
+
+    // The 16 possible R matrices from sample_r_odd (control.rs), converted to
+    // bitmask form at compile time. Array position is the table index
+    // (pi_a << 3) | (pi_b << 2) | (r0 << 1) | r1, so the order must not change.
+    [
+        // Index 0: pi_a=0, pi_b=0, r0=0, r1=0
+        matrix_to_masks([
+            [0, 0, 1, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0],
+            [1, 0, 1, 1, 1, 1],
+            [0, 1, 1, 1, 1, 1],
+            [1, 1, 1, 0, 1, 1],
+            [1, 1, 0, 1, 1, 1],
+            [0, 1, 1, 1, 1, 0],
+            [1, 1, 1, 0, 0, 1],
+        ]),
+        // Index 1: pi_a=0, pi_b=0, r0=0, r1=1
+        matrix_to_masks([
+            [1, 0, 1, 1, 0, 0],
+            [0, 0, 1, 1, 0, 0],
+            [0, 0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 1, 1, 1, 0, 1],
+            [1, 0, 1, 0, 1, 0],
+            [1, 1, 1, 0, 0, 1],
+            [1, 0, 0, 1, 1, 1],
+        ]),
+        // Index 2: pi_a=0, pi_b=0, r0=1, r1=0
+        matrix_to_masks([
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 0, 1, 0, 0],
+            [0, 1, 0, 1, 0, 1],
+            [1, 1, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 1],
+            [1, 0, 0, 1, 1, 1],
+            [0, 1, 1, 1, 1, 0],
+        ]),
+        // Index 3: pi_a=0, pi_b=0, r0=1, r1=1
+        matrix_to_masks([
+            [0, 1, 0, 1, 0, 0],
+            [1, 0, 1, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 0, 0, 1, 0, 1],
+            [1, 0, 0, 1, 1, 0],
+            [0, 0, 1, 1, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+        ]),
+        // Index 4: pi_a=0, pi_b=1, r0=0, r1=0
+        matrix_to_masks([
+            [0, 0, 1, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0],
+            [0, 1, 0, 1, 0, 1],
+            [1, 1, 1, 0, 1, 0],
+            [1, 0, 0, 1, 1, 0],
+            [0, 0, 1, 1, 0, 0],
+            [1, 1, 1, 0, 0, 1],
+            [1, 0, 0, 1, 1, 1],
+        ]),
+        // Index 5: pi_a=0, pi_b=1, r0=0, r1=1
+        matrix_to_masks([
+            [1, 0, 1, 1, 0, 0],
+            [0, 0, 1, 1, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 0, 0, 1, 0, 1],
+            [0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 1],
+            [0, 1, 1, 1, 1, 0],
+            [1, 1, 1, 0, 0, 1],
+        ]),
+        // Index 6: pi_a=0, pi_b=1, r0=1, r1=0
+        matrix_to_masks([
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 0, 1, 0, 0],
+            [1, 0, 1, 1, 1, 1],
+            [0, 1, 1, 1, 1, 1],
+            [0, 1, 1, 1, 0, 1],
+            [1, 0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+        ]),
+        // Index 7: pi_a=0, pi_b=1, r0=1, r1=1
+        matrix_to_masks([
+            [0, 1, 0, 1, 0, 0],
+            [1, 0, 1, 0, 0, 0],
+            [0, 0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [1, 1, 1, 0, 1, 1],
+            [1, 1, 0, 1, 1, 1],
+            [1, 0, 0, 1, 1, 1],
+            [0, 1, 1, 1, 1, 0],
+        ]),
+        // Index 8: pi_a=1, pi_b=0, r0=0, r1=0
+        matrix_to_masks([
+            [0, 0, 1, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 0, 0, 1, 0, 1],
+            [0, 1, 1, 1, 0, 1],
+            [1, 0, 1, 0, 1, 0],
+            [1, 0, 0, 1, 1, 1],
+            [0, 1, 1, 1, 1, 0],
+        ]),
+        // Index 9: pi_a=1, pi_b=0, r0=0, r1=1
+        matrix_to_masks([
+            [1, 0, 1, 1, 0, 0],
+            [0, 0, 1, 1, 0, 0],
+            [0, 1, 0, 1, 0, 1],
+            [1, 1, 1, 0, 1, 0],
+            [1, 1, 1, 0, 1, 1],
+            [1, 1, 0, 1, 1, 1],
+            [0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+        ]),
+        // Index 10: pi_a=1, pi_b=0, r0=1, r1=0
+        matrix_to_masks([
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 0, 1, 0, 0],
+            [0, 0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [1, 0, 0, 1, 1, 0],
+            [0, 0, 1, 1, 0, 0],
+            [0, 1, 1, 1, 1, 0],
+            [1, 1, 1, 0, 0, 1],
+        ]),
+        // Index 11: pi_a=1, pi_b=0, r0=1, r1=1
+        matrix_to_masks([
+            [0, 1, 0, 1, 0, 0],
+            [1, 0, 1, 0, 0, 0],
+            [1, 0, 1, 1, 1, 1],
+            [0, 1, 1, 1, 1, 1],
+            [0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 1],
+            [1, 1, 1, 0, 0, 1],
+            [1, 0, 0, 1, 1, 1],
+        ]),
+        // Index 12: pi_a=1, pi_b=1, r0=0, r1=0
+        matrix_to_masks([
+            [0, 0, 1, 0, 0, 0],
+            [0, 1, 0, 0, 0, 0],
+            [0, 0, 1, 0, 1, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+            [0, 1, 0, 0, 0, 1],
+            [0, 0, 0, 0, 0, 0],
+            [0, 0, 0, 0, 0, 0],
+        ]),
+        // Index 13: pi_a=1, pi_b=1, r0=0, r1=1
+        matrix_to_masks([
+            [1, 0, 1, 1, 0, 0],
+            [0, 0, 1, 1, 0, 0],
+            [1, 0, 1, 1, 1, 1],
+            [0, 1, 1, 1, 1, 1],
+            [1, 0, 0, 1, 1, 0],
+            [0, 0, 1, 1, 0, 0],
+            [1, 0, 0, 1, 1, 1],
+            [0, 1, 1, 1, 1, 0],
+        ]),
+        // Index 14: pi_a=1, pi_b=1, r0=1, r1=0
+        matrix_to_masks([
+            [1, 1, 0, 0, 0, 0],
+            [1, 1, 0, 1, 0, 0],
+            [1, 1, 0, 0, 0, 0],
+            [1, 0, 0, 1, 0, 1],
+            [1, 1, 1, 0, 1, 1],
+            [1, 1, 0, 1, 1, 1],
+            [1, 1, 1, 0, 0, 1],
+            [1, 0, 0, 1, 1, 1],
+        ]),
+        // Index 15: pi_a=1, pi_b=1, r0=1, r1=1
+        matrix_to_masks([
+            [0, 1, 0, 1, 0, 0],
+            [1, 0, 1, 0, 0, 0],
+            [0, 1, 0, 1, 0, 1],
+            [1, 1, 1, 0, 1, 0],
+            [0, 1, 1, 1, 0, 1],
+            [1, 0, 1, 0, 1, 0],
+            [0, 1, 1, 1, 1, 0],
+            [1, 1, 1, 0, 0, 1],
+        ]),
+    ]
+};
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::three_halves::{matrices::M, slicing::SlicedLabel};
+    use mpz_core::Block;
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha12Rng;
+
+    /// XOR the 8 bytes of `src` into `dst` in place
+    fn xor_assign_8(dst: &mut [u8; 8], src: &[u8; 8]) {
+        for (d, s) in dst.iter_mut().zip(src.iter()) {
+            *d ^= *s;
+        }
+    }
+
+    /// Verification tests for precomputed lookup tables.
+    ///
+    /// These verify that optimized LUTs match their specifications:
+    /// - R_COLUMN_MASKS: Precomputed column masks for R matrix application
+    /// - M_COLUMN_MASKS: Precomputed column masks for M matrix application
+    ///
+    /// These tests only need to run once to verify correctness after changes.
+    ///
+    /// Run with: `cargo test garbler_tables::tests::table_verification --
+    /// --ignored`
+    mod table_verification {
+        use super::*;
+        use crate::three_halves::control::{
+            R_P, sample_r_odd,
+            tests::table_verification::{R_A, R_B, R_DOLLAR_BASIS_0, R_DOLLAR_BASIS_1},
+        };
+
+        /// Test-only variant of `sample_r_odd` that also returns the full R.
+        ///
+        /// R is rebuilt entry by entry from the formula
+        /// R = r0·R$_BASIS_0 ⊕ r1·R$_BASIS_1 ⊕ a·R_A ⊕ b·R_B ⊕ R_P
+        /// with a = ¬pi_a and b = ¬pi_b; the second tuple element is whatever
+        /// `sample_r_odd` returns for the same arguments.
+        fn sample_r_odd_with_r(
+            pi_a: bool,
+            pi_b: bool,
+            rand_bits: [bool; 2],
+        ) -> ([[u8; 6]; 8], [[u8; 2]; 4]) {
+            // Scalar coefficients of the four selectable terms, as 0/1 bytes.
+            let [r0, r1] = rand_bits.map(u8::from);
+            let (a, b) = (u8::from(!pi_a), u8::from(!pi_b));
+
+            let mut full_r = [[0u8; 6]; 8];
+            for (i, out_row) in full_r.iter_mut().enumerate() {
+                for (j, entry) in out_row.iter_mut().enumerate() {
+                    *entry = (r0 * R_DOLLAR_BASIS_0[i][j])
+                        ^ (r1 * R_DOLLAR_BASIS_1[i][j])
+                        ^ (a * R_A[i][j])
+                        ^ (b * R_B[i][j])
+                        ^ R_P[i][j];
+                }
+            }
+
+            (full_r, sample_r_odd(pi_a, pi_b, rand_bits))
+        }
+
+        /// Reference R application: XORs input column `j` into output row `i`
+        /// whenever `r[i][j]` is non-zero. Used to cross-check the mask tables.
+        fn apply_r_to_inputs_naive(
+            r: &[[u8; 6]; 8],
+            a0: &SlicedLabel,
+            b0: &SlicedLabel,
+            delta: &SlicedLabel,
+        ) -> [[u8; 8]; 8] {
+            let columns: [[u8; 8]; 6] = [
+                a0.left,
+                a0.right,
+                b0.left,
+                b0.right,
+                delta.left,
+                delta.right,
+            ];
+            let mut out = [[0u8; 8]; 8];
+            for (acc, r_row) in out.iter_mut().zip(r) {
+                for (entry, column) in r_row.iter().zip(&columns) {
+                    if *entry != 0 {
+                        xor_assign_8(acc, column);
+                    }
+                }
+            }
+            out
+        }
+
+        /// Branchless R application driven by `R_COLUMN_MASKS[r_index]`
+        /// (production implementation replica): each mask bit is widened to
+        /// 0x00/0xFF and ANDed with its input column before the XOR.
+        fn apply_r_to_inputs_fast(
+            r_index: usize,
+            a0: &SlicedLabel,
+            b0: &SlicedLabel,
+            delta: &SlicedLabel,
+        ) -> [[u8; 8]; 8] {
+            let columns: [[u8; 8]; 6] = [
+                a0.left,
+                a0.right,
+                b0.left,
+                b0.right,
+                delta.left,
+                delta.right,
+            ];
+            let row_masks = R_COLUMN_MASKS[r_index];
+
+            let mut out = [[0u8; 8]; 8];
+            for (acc, mask) in out.iter_mut().zip(row_masks) {
+                for (col, column) in columns.iter().enumerate() {
+                    // wrapping_neg maps the selected bit 0 -> 0x00 and 1 -> 0xFF.
+                    let keep = ((mask >> col) & 1).wrapping_neg();
+                    let masked = column.map(|byte| byte & keep);
+                    xor_assign_8(acc, &masked);
+                }
+            }
+            out
+        }
+
+        /// Reference M application over the `left` halves of the hashes only:
+        /// XORs `hashes[j].left` into row `i` whenever `M[i][j] == 1`.
+        fn apply_m_to_hashes_naive(hashes: &[SlicedLabel; 6]) -> [[u8; 8]; 8] {
+            let mut out = [[0u8; 8]; 8];
+            for (acc, m_row) in out.iter_mut().zip(M.iter()) {
+                let picked = m_row.iter().zip(hashes).filter(|(bit, _)| **bit == 1);
+                for (_, hash) in picked {
+                    xor_assign_8(acc, &hash.left);
+                }
+            }
+            out
+        }
+
+        /// Branchless M application driven by `M_COLUMN_MASKS` (production
+        /// implementation replica), over the `left` halves of the hashes
+        /// only.
+        fn apply_m_to_hashes_fast(hashes: &[SlicedLabel; 6]) -> [[u8; 8]; 8] {
+            let mut out = [[0u8; 8]; 8];
+
+            for (acc, mask) in out.iter_mut().zip(M_COLUMN_MASKS) {
+                for (col, hash) in hashes.iter().enumerate() {
+                    // Selected bit widened to a byte mask: 0 -> 0x00, 1 -> 0xFF.
+                    let keep = ((mask >> col) & 1).wrapping_neg();
+                    let masked = hash.left.map(|byte| byte & keep);
+                    xor_assign_8(acc, &masked);
+                }
+            }
+
+            out
+        }
+
+        /// Verify R_COLUMN_MASKS matches the original sample_r_odd matrices
+        ///
+        /// For each of the 16 table indices, the mask-driven result must equal
+        /// the naive application of the R matrix rebuilt by
+        /// `sample_r_odd_with_r` for the matching (pi_a, pi_b, r0, r1).
+        /// The assertion message names the first variant that disagrees.
+        #[test]
+        #[ignore]
+        fn verify_r_column_masks() {
+            // Test all 16 variants with random inputs
+            let mut rng = ChaCha12Rng::seed_from_u64(12345);
+            let a0 = SlicedLabel::from_block(Block::random(&mut rng));
+            let b0 = SlicedLabel::from_block(Block::random(&mut rng));
+            let delta = SlicedLabel::from_block(Block::random(&mut rng));
+
+            // Index = (pi_a << 3) | (pi_b << 2) | (r0 << 1) | r1, so counting
+            // up visits the variants in the same order as four nested loops
+            // with pi_a outermost and r1 innermost.
+            for r_index in 0..R_COLUMN_MASKS.len() {
+                let bit = |shift: usize| ((r_index >> shift) & 1) == 1;
+                let (pi_a, pi_b, r0, r1) = (bit(3), bit(2), bit(1), bit(0));
+
+                let (r_matrix, _) = sample_r_odd_with_r(pi_a, pi_b, [r0, r1]);
+
+                let expected = apply_r_to_inputs_naive(&r_matrix, &a0, &b0, &delta);
+                let actual = apply_r_to_inputs_fast(r_index, &a0, &b0, &delta);
+
+                assert_eq!(
+                    expected, actual,
+                    "R_COLUMN_MASKS[{}] incorrect for pi_a={}, pi_b={}, r0={}, r1={}",
+                    r_index, pi_a, pi_b, r0, r1
+                );
+            }
+        }
+
+        /// Verify M_COLUMN_MASKS matches the original M matrix
+        #[test]
+        #[ignore]
+        fn verify_m_column_masks() {
+            let mut rng = ChaCha12Rng::seed_from_u64(54321);
+            let mut random_label = || SlicedLabel::from_block(Block::random(&mut rng));
+
+            // Random stand-ins for the six hash outputs. Array elements are
+            // evaluated left to right, so they are drawn in index order.
+            let hashes: [SlicedLabel; 6] = [
+                random_label(),
+                random_label(),
+                random_label(),
+                random_label(),
+                random_label(),
+                random_label(),
+            ];
+
+            assert_eq!(
+                apply_m_to_hashes_naive(&hashes),
+                apply_m_to_hashes_fast(&hashes),
+                "M_COLUMN_MASKS doesn't match M matrix"
+            );
+        }
+    }
+}
diff --git a/crates/garble-core/src/three_halves/matrices.rs b/crates/garble-core/src/three_halves/matrices.rs
new file mode 100644
index 00000000..4a7c788a
--- /dev/null
+++ b/crates/garble-core/src/three_halves/matrices.rs
@@ -0,0 +1,402 @@
+//! # Core Matrices for Three-Halves Garbling
+//!
+//! This module defines the fixed matrices used in the three-halves garbling
+//! scheme. All matrices operate over GF(2) (binary field), so all arithmetic is
+//! mod 2.
+//!
+//! ## Matrix Dimensions and Roles
+//!
+//! | Matrix | Size  | Role |
+//! |--------|-------|------|
+//! | K      | 3×8   | Cokernel of gate space G; vectors v ∈ G satisfy Kv = 0 |
+//! | V      | 8×5   | Maps [output_label; gate_ciphertexts] to 8 evaluation equations |
+//! | M      | 8×6   | Maps 6 hash outputs to 8 evaluation equations |
+//! | V⁻¹    | 5×8   | Left-inverse of V; used by garbler to solve for outputs |
+//!
+//! ## The 8 Rows
+//!
+//! Each matrix has 8 rows corresponding to:
+//! - Rows 0-1: Input combination (0,0) - left and right halves
+//! - Rows 2-3: Input combination (0,1) - left and right halves
+//! - Rows 4-5: Input combination (1,0) - left and right halves
+//! - Rows 6-7: Input combination (1,1) - left and right halves
+
+/// Matrix K: Cokernel basis for the gate space G
+///
+/// From Paper Page 12:
+/// ```text
+/// K = [ 1 0 | 1 0 | 1 0 | 1 0 ]
+///     [ 0 1 | 0 1 | 0 1 | 0 1 ]
+///     [ 0 0 | 0 1 | 1 0 | 1 1 ]
+/// ```
+///
+/// **Property**: A vector v is in the gate space G if and only if Kv = 0
+///
+/// **Interpretation**:
+/// - Row 0: Sum of all left halves must be 0 (even parity across left halves)
+/// - Row 1: Sum of all right halves must be 0 (even parity across right halves)
+/// - Row 2: A specific linear relation involving right halves of (0,1), left of
+///   (1,0), both of (1,1)
+///
+/// The vertical bars separate the 2×2 blocks for each input combination:
+/// (0,0) | (0,1) | (1,0) | (1,1)
+pub const K: [[u8; 8]; 3] = [
+    //   (0,0)L  (0,0)R  (0,1)L  (0,1)R  (1,0)L  (1,0)R  (1,1)L  (1,1)R
+    // Row 0: checks parity of left halves
+    [1, 0, 1, 0, 1, 0, 1, 0],
+    // Row 1: checks parity of right halves
+    [0, 1, 0, 1, 0, 1, 0, 1],
+    // Row 2: (0,1)R ⊕ (1,0)L ⊕ (1,1)L ⊕ (1,1)R must be 0
+    [0, 0, 0, 1, 1, 0, 1, 1],
+];
+
+/// Matrix V: Maps [C_L, C_R, G₀, G₁, G₂] to 8 evaluation equations
+///
+/// From Paper Page 12:
+/// ```text
+/// V = [ 1 0 | 0 0 0 ]   <- (0,0) left:  uses C_L
+///     [ 0 1 | 0 0 0 ]   <- (0,0) right: uses C_R
+///     [ 1 0 | 0 0 1 ]   <- (0,1) left:  uses C_L ⊕ G₂
+///     [ 0 1 | 0 1 1 ]   <- (0,1) right: uses C_R ⊕ G₁ ⊕ G₂
+///     [ 1 0 | 1 0 1 ]   <- (1,0) left:  uses C_L ⊕ G₀ ⊕ G₂
+///     [ 0 1 | 0 0 1 ]   <- (1,0) right: uses C_R ⊕ G₂
+///     [ 1 0 | 1 0 0 ]   <- (1,1) left:  uses C_L ⊕ G₀
+///     [ 0 1 | 0 1 0 ]   <- (1,1) right: uses C_R ⊕ G₁
+/// ```
+///
+/// **Column interpretation**:
+/// - Column 0: C_L (left half of output label)
+/// - Column 1: C_R (right half of output label)
+/// - Column 2: G₀ (first gate ciphertext, κ/2 bits)
+/// - Column 3: G₁ (second gate ciphertext, κ/2 bits)
+/// - Column 4: G₂ (third gate ciphertext, κ/2 bits)
+///
+/// **Key property**: The columns of V span the gate space G (same as the
+/// columns of M). This means: colspace(V) = colspace(M) = G, and KV = 0.
+///
+/// **Note**: The columns for G₀, G₁, G₂ (rightmost 3 columns) are chosen to
+/// match columns of M corresponding to H(A₁), H(B₁), H(A₀⊕B₁). See paper page
+/// 12.
+pub const V: [[u8; 5]; 8] = [
+    //   C_L  C_R  G₀   G₁   G₂
+    // Row 0: (0,0) left half
+    [1, 0, 0, 0, 0],
+    // Row 1: (0,0) right half
+    [0, 1, 0, 0, 0],
+    // Row 2: (0,1) left half
+    [1, 0, 0, 0, 1],
+    // Row 3: (0,1) right half
+    [0, 1, 0, 1, 1],
+    // Row 4: (1,0) left half
+    [1, 0, 1, 0, 1],
+    // Row 5: (1,0) right half
+    [0, 1, 0, 0, 1],
+    // Row 6: (1,1) left half
+    [1, 0, 1, 0, 0],
+    // Row 7: (1,1) right half
+    [0, 1, 0, 1, 0],
+];
+
+/// Matrix M: Maps hash outputs to 8 evaluation equations
+///
+/// From Paper Page 10, Equation 3 (the non-? entries):
+/// ```text
+/// M = [ 1 0 | 0 0 | 1 0 ]   <- (0,0) left:  H(A₀) ⊕ H(A₀⊕B₀)
+///     [ 0 0 | 1 0 | 1 0 ]   <- (0,0) right: H(B₀) ⊕ H(A₀⊕B₀)
+///     [ 1 0 | 0 0 | 0 1 ]   <- (0,1) left:  H(A₀) ⊕ H(A₀⊕B₁)
+///     [ 0 0 | 0 1 | 0 1 ]   <- (0,1) right: H(B₁) ⊕ H(A₀⊕B₁)
+///     [ 0 1 | 0 0 | 0 1 ]   <- (1,0) left:  H(A₁) ⊕ H(A₀⊕B₁)
+///     [ 0 0 | 1 0 | 0 1 ]   <- (1,0) right: H(B₀) ⊕ H(A₀⊕B₁)
+///     [ 0 1 | 0 0 | 1 0 ]   <- (1,1) left:  H(A₁) ⊕ H(A₀⊕B₀)
+///     [ 0 0 | 0 1 | 1 0 ]   <- (1,1) right: H(B₁) ⊕ H(A₀⊕B₀)
+/// ```
+///
+/// **Column interpretation** (in order):
+/// - Column 0: H(A₀)
+/// - Column 1: H(A₁) = H(A₀ ⊕ Δ)
+/// - Column 2: H(B₀)
+/// - Column 3: H(B₁) = H(B₀ ⊕ Δ)
+/// - Column 4: H(A₀ ⊕ B₀)
+/// - Column 5: H(A₀ ⊕ B₁) = H(A₁ ⊕ B₀) due to free-XOR
+///
+/// **Key observation** (Paper Section 4.1):
+/// Because of free-XOR, A₀ ⊕ B₀ = A₁ ⊕ B₁ and A₀ ⊕ B₁ = A₁ ⊕ B₀.
+/// This creates useful redundancy - each H(A⊕B) term can be used in 2 rows.
+///
+/// **Which queries are available for each input combination**:
+/// - (0,0): Has A₀, B₀, so can query H(A₀), H(B₀), H(A₀⊕B₀)
+/// - (0,1): Has A₀, B₁, so can query H(A₀), H(B₁), H(A₀⊕B₁)
+/// - (1,0): Has A₁, B₀, so can query H(A₁), H(B₀), H(A₁⊕B₀) = H(A₀⊕B₁)
+/// - (1,1): Has A₁, B₁, so can query H(A₁), H(B₁), H(A₁⊕B₁) = H(A₀⊕B₀)
+///
+/// This is Table (2) from Paper Page 9.
+pub const M: [[u8; 6]; 8] = [
+    //   H(A₀) H(A₁) H(B₀) H(B₁) H(A₀⊕B₀) H(A₀⊕B₁)
+    // Row 0: (0,0) left - uses H(A₀) and H(A₀⊕B₀)
+    [1, 0, 0, 0, 1, 0],
+    // Row 1: (0,0) right - uses H(B₀) and H(A₀⊕B₀)
+    [0, 0, 1, 0, 1, 0],
+    // Row 2: (0,1) left - uses H(A₀) and H(A₀⊕B₁)
+    [1, 0, 0, 0, 0, 1],
+    // Row 3: (0,1) right - uses H(B₁) and H(A₀⊕B₁)
+    [0, 0, 0, 1, 0, 1],
+    // Row 4: (1,0) left - uses H(A₁) and H(A₀⊕B₁) [note: A₁⊕B₀ = A₀⊕B₁]
+    [0, 1, 0, 0, 0, 1],
+    // Row 5: (1,0) right - uses H(B₀) and H(A₀⊕B₁) [note: A₁⊕B₀ = A₀⊕B₁]
+    [0, 0, 1, 0, 0, 1],
+    // Row 6: (1,1) left - uses H(A₁) and H(A₀⊕B₀) [note: A₁⊕B₁ = A₀⊕B₀]
+    [0, 1, 0, 0, 1, 0],
+    // Row 7: (1,1) right - uses H(B₁) and H(A₀⊕B₀) [note: A₁⊕B₁ = A₀⊕B₀]
+    [0, 0, 0, 1, 1, 0],
+];
+
+/// Matrix V⁻¹: Left-inverse of V
+///
+/// From Paper Page 18, Equation 10:
+/// ```text
+/// V⁻¹ = [ 1 0 | 0 0 | 0 0 | 0 0 ]
+///       [ 0 1 | 0 0 | 0 0 | 0 0 ]
+///       [ 1 1 | 0 0 | 1 1 | 0 0 ]
+///       [ 1 1 | 1 1 | 0 0 | 0 0 ]
+///       [ 0 0 | 0 0 | 1 0 | 1 0 ]
+/// ```
+///
+/// **Property**: V⁻¹ · V = I₅ (5×5 identity matrix)
+/// This is a LEFT-inverse, not a full inverse (V is 8×5, not square).
+///
+/// **Usage**: The garbler computes [C; G⃗] = V⁻¹ · (right-hand side of Equation
+/// 4)
+///
+/// **Row interpretation**:
+/// - Row 0: Extracts C_L (output label left half)
+/// - Row 1: Extracts C_R (output label right half)
+/// - Row 2: Extracts G₀ (first gate ciphertext)
+/// - Row 3: Extracts G₁ (second gate ciphertext)
+/// - Row 4: Extracts G₂ (third gate ciphertext)
+///
+/// **Note**: The vertical bars separate the 2-column blocks corresponding to
+/// each input combination: (0,0) | (0,1) | (1,0) | (1,1)
+pub const V_INV: [[u8; 8]; 5] = [
+    //   (0,0)L  (0,0)R  (0,1)L  (0,1)R  (1,0)L  (1,0)R  (1,1)L  (1,1)R
+    // Row 0: extracts C_L - just takes (0,0) left directly
+    [1, 0, 0, 0, 0, 0, 0, 0],
+    // Row 1: extracts C_R - just takes (0,0) right directly
+    [0, 1, 0, 0, 0, 0, 0, 0],
+    // Row 2: extracts G₀ = (0,0)L ⊕ (0,0)R ⊕ (1,0)L ⊕ (1,0)R
+    [1, 1, 0, 0, 1, 1, 0, 0],
+    // Row 3: extracts G₁ = (0,0)L ⊕ (0,0)R ⊕ (0,1)L ⊕ (0,1)R
+    [1, 1, 1, 1, 0, 0, 0, 0],
+    // Row 4: extracts G₂ = (1,0)L ⊕ (1,1)L
+    [0, 0, 0, 0, 1, 0, 1, 0],
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Verification tests for core matrices K, V, M, V_INV.
+    ///
+    /// These verify that the hardcoded matrices were correctly copied from
+    /// the paper by checking their mathematical properties.
+    ///
+    /// These tests only need to run once to verify correctness after changes.
+    ///
+    /// Run with: `cargo test matrices::tests::table_verification -- --ignored`
+    mod table_verification {
+        use super::*;
+
+        // ====================================================================
+        // Matrix Operations (over GF(2))
+        // ====================================================================
+
+        /// Multiply two matrices over GF(2)
+        ///
+        /// Computes A × B where all arithmetic is mod 2: entry (i, j) is the XOR
+        /// over k of `a[i][k] & b[k][j]` (multiplication is AND, addition is
+        /// XOR).
+        fn matmul_gf2<const RA: usize, const CA: usize, const CB: usize>(
+            a: &[[u8; CA]; RA],
+            b: &[[u8; CB]; CA],
+        ) -> [[u8; CB]; RA] {
+            let mut product = [[0u8; CB]; RA];
+
+            for (out_row, a_row) in product.iter_mut().zip(a) {
+                for (j, cell) in out_row.iter_mut().enumerate() {
+                    *cell = a_row
+                        .iter()
+                        .zip(b)
+                        .fold(0u8, |acc, (&lhs, b_row)| acc ^ (lhs & b_row[j]));
+                }
+            }
+
+            product
+        }
+
+        /// Check if a matrix is all zeros
+        ///
+        /// Returns `true` when every entry is 0 (including a matrix with no
+        /// entries at all) and `false` otherwise.
+        fn is_zero_matrix<const R: usize, const C: usize>(m: &[[u8; C]; R]) -> bool {
+            // Row boundaries are irrelevant here, so scan the entries as one
+            // flat sequence; `any` stops at the first non-zero value.
+            let has_nonzero = m.iter().flatten().any(|&entry| entry != 0);
+
+            !has_nonzero
+        }
+
+        /// Check if matrix equals identity
+        ///
+        /// `true` iff every diagonal entry is 1 and every off-diagonal entry is
+        /// 0, i.e. `m[i][j] == (i == j) as u8` for all i, j.
+        fn is_identity<const N: usize>(m: &[[u8; N]; N]) -> bool {
+            // `all` short-circuits on the first entry that deviates.
+            m.iter().enumerate().all(|(i, row)| {
+                row.iter()
+                    .enumerate()
+                    .all(|(j, &entry)| entry == u8::from(i == j))
+            })
+        }
+
+        /// Compute rank of a matrix over GF(2) using Gaussian elimination
+        ///
+        /// # Algorithm
+        ///
+        /// Columns are scanned left to right. A column yields a pivot when some
+        /// row that has not received a pivot yet holds a 1 in it; that row is
+        /// swapped into place and XOR-ed into every other row with a 1 in the
+        /// same column. The rank is the number of pivots found.
+        ///
+        /// # Notes
+        ///
+        /// - The reduction runs on a private copy, so `m` is never modified.
+        /// - Entries are compared against 1, so the matrix is expected to
+        ///   contain only 0s and 1s.
+        ///
+        /// Returns the number of pivots, i.e. the rank.
+        fn rank_gf2<const R: usize, const C: usize>(m: &[[u8; C]; R]) -> usize {
+            let mut work = *m;
+            // Pivots found so far; also the index of the next row to receive one.
+            let mut rank = 0;
+
+            for col in 0..C {
+                // First row at or below `rank` with a 1 in this column, if any.
+                // Once all R rows hold pivots the range is empty and nothing is
+                // found, so the remaining columns are skipped.
+                let pivot = (rank..R).find(|&r| work[r][col] == 1);
+
+                if let Some(pivot) = pivot {
+                    // Bring the pivot row into position.
+                    work.swap(rank, pivot);
+
+                    // Clear this column in every other row, above and below, by
+                    // XOR-ing the pivot row into it (reduced row-echelon form).
+                    let pivot_row = work[rank];
+                    for (r, row) in work.iter_mut().enumerate() {
+                        if r != rank && row[col] == 1 {
+                            for (entry, p) in row.iter_mut().zip(pivot_row) {
+                                *entry ^= p;
+                            }
+                        }
+                    }
+
+                    rank += 1;
+                }
+            }
+
+            rank
+        }
+
+        /// Verify K × V = 0
+        ///
+        /// Paper Page 12: "Then V must satisfy rank(V) = 5 and KV = 0"
+        /// This confirms that all columns of V lie in the gate space G.
+        ///
+        /// K is 3×8 and V is 8×5, so the product checked here is 3×5.
+        #[test]
+        #[ignore]
+        fn test_k_times_v_is_zero() {
+            let product = matmul_gf2(&K, &V);
+            let vanishes = is_zero_matrix(&product);
+
+            assert!(vanishes, "K × V should be zero matrix. Got: {:?}", product);
+        }
+
+        /// Verify K × M = 0
+        ///
+        /// Paper Section 5.1: Since colspace(V) = colspace(M) = G,
+        /// and K is the cokernel of G, we must have KM = 0.
+        ///
+        /// K is 3×8 and M is 8×6, so the product checked here is 3×6.
+        #[test]
+        #[ignore]
+        fn test_k_times_m_is_zero() {
+            let product = matmul_gf2(&K, &M);
+            let vanishes = is_zero_matrix(&product);
+
+            assert!(vanishes, "K × M should be zero matrix. Got: {:?}", product);
+        }
+
+        /// Verify V⁻¹ × V = I₅
+        ///
+        /// Paper Page 18: V⁻¹ is defined as a left-inverse of V.
+        ///
+        /// V⁻¹ is 5×8 and V is 8×5, so the product checked here is 5×5.
+        #[test]
+        #[ignore]
+        fn test_v_inv_is_left_inverse() {
+            let product = matmul_gf2(&V_INV, &V);
+            let is_left_inverse = is_identity(&product);
+
+            assert!(is_left_inverse, "V⁻¹ × V should be 5×5 identity. Got: {:?}", product);
+        }
+
+        /// Verify ranks of V and M are both 5
+        ///
+        /// Paper Page 12: "The gate space G has dimension 5"
+        /// Both V and M should span this 5-dimensional space.
+        #[test]
+        #[ignore]
+        fn test_matrix_ranks() {
+            let v_rank = rank_gf2(&V);
+            assert_eq!(v_rank, 5, "V should have rank 5, got {}", v_rank);
+
+            let m_rank = rank_gf2(&M);
+            assert_eq!(m_rank, 5, "M should have rank 5, got {}", m_rank);
+        }
+
+        /// Verify K has rank 3 (3 independent constraints)
+        ///
+        /// Three independent constraints on 8-entry vectors leave a solution
+        /// space of dimension 8-3=5, the dimension of the gate space G.
+        #[test]
+        #[ignore]
+        fn test_k_rank() {
+            let k_rank = rank_gf2(&K);
+            assert_eq!(k_rank, 3, "K should have rank 3, got {}", k_rank);
+        }
+
+        /// Verify V⁻¹ × M matches expected value from paper
+        ///
+        /// Paper Equation 12 (Page 21) gives the explicit result.
+        #[test]
+        #[ignore]
+        fn test_v_inv_m_matches_paper() {
+            // Expected from Paper Equation 12
+            const EXPECTED: [[u8; 6]; 5] = [
+                [1, 0, 0, 0, 1, 0], // Row 0
+                [0, 0, 1, 0, 1, 0], // Row 1
+                [1, 1, 0, 0, 0, 0], // Row 2
+                [0, 0, 1, 1, 0, 0], // Row 3
+                [0, 0, 0, 0, 1, 1], // Row 4
+            ];
+
+            let actual = matmul_gf2(&V_INV, &M);
+
+            assert_eq!(
+                actual, EXPECTED,
+                "V⁻¹ × M doesn't match paper Equation 12.\nGot: {:?}\nExpected: {:?}",
+                actual, EXPECTED
+            );
+        }
+    }
+}
diff --git a/crates/garble-core/src/three_halves/mod.rs b/crates/garble-core/src/three_halves/mod.rs
new file mode 100644
index 00000000..8aa7e286
--- /dev/null
+++ b/crates/garble-core/src/three_halves/mod.rs
@@ -0,0 +1,130 @@
+//! # Three Halves Make a Whole - Garbled Circuit Implementation
+//!
+//! This module implements the "Three Halves Make a Whole" garbling scheme from:
+//!
+//! **Paper**: "Three Halves Make a Whole? Beating the Half-Gates Lower Bound
+//! for Garbled Circuits" **Authors**: Mike Rosulek, Lawrence Roy
+//! **Published**: CRYPTO 2021
+//! **ePrint**: <https://eprint.iacr.org/2021/749>
+//!
+//! ## Overview
+//!
+//! This scheme reduces AND gate size from 2κ bits (half-gates) to 1.5κ + 5 bits
+//! using two key techniques:
+//!
+//! 1. **Slicing**: Wire labels are split into left/right halves (κ/2 bits
+//!    each), and the evaluator computes each half using potentially different
+//!    linear combinations.
+//!
+//! 2. **Dicing**: The evaluator decrypts "control bits" that determine which
+//!    linear combinations to apply. These control bits are randomized to hide
+//!    the gate's truth table.
+//!
+//! ## Module Structure
+//!
+//! - [`matrices`]: Core matrices (K, V, M) that define the linear algebraic
+//!   structure
+//! - [`control`]: Control matrix system (R, S₁, S₂) for the "dicing" technique
+//! - [`slicing`]: Wire label slicing utilities
+//! - [`garbler`]: Garbling functions
+//! - [`evaluator`]: Evaluation functions
+//!
+//! ## Usage
+//!
+//! ```ignore
+//! use mpz_garble_core::three_halves::{Garbler, Evaluator, GarblerOutput, EvaluatorOutput};
+//!
+//! let mut gb = Garbler::default();
+//! let mut ev = Evaluator::default();
+//!
+//! let mut gb_iter = gb.generate(&circuit, delta, &input_keys, &mut rng)?;
+//! let mut ev_consumer = ev.evaluate(&circuit, &input_macs)?;
+//!
+//! while let Some(gate) = gb_iter.next() {
+//!     ev_consumer.next(gate);
+//! }
+//!
+//! let gb_output = gb_iter.finish()?;
+//! let ev_output = ev_consumer.finish()?;
+//! ```
+
+pub(crate) mod circuit;
+pub mod control;
+pub mod evaluator;
+pub mod garbler;
+mod garbler_tables;
+pub mod matrices;
+mod random_bits;
+pub mod slicing;
+
+/// Gate ciphertexts for a Three Halves AND gate.
+///
+/// Holds three ciphertexts of κ/2 = 64 bits each (24 bytes, 1.5κ bits in
+/// total), versus the 2κ bits per AND gate used by half-gates.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct ThreeHalvesGate {
+    /// Gate ciphertext G₀ (κ/2 = 64 bits)
+    pub g0: [u8; 8],
+    /// Gate ciphertext G₁ (κ/2 = 64 bits)
+    pub g1: [u8; 8],
+    /// Gate ciphertext G₂ (κ/2 = 64 bits)
+    pub g2: [u8; 8],
+}
+
+impl ThreeHalvesGate {
+    /// Create a new gate from three κ/2-bit ciphertexts, stored as given.
+    pub fn new(g0: [u8; 8], g1: [u8; 8], g2: [u8; 8]) -> Self {
+        Self { g0, g1, g2 }
+    }
+}
+
+/// Control bits for evaluator (compressed form).
+///
+/// The r_bar is a 4×2 matrix where each row r_bar[ij] contains the coefficients
+/// [c₁, c₂] for input position (i,j). The evaluator expands this to a 2×4
+/// marginal using: R_ij = c₁·S₁ ⊕ c₂·S₂
+///
+/// Stored as a packed u8: bits [2*ij, 2*ij+1] hold [c₁, c₂] for position ij.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct ControlBits(u8);
+
+impl ControlBits {
+    /// Create new control bits from the compressed r_bar representation.
+    ///
+    /// Only the low bit of each entry is kept; row `ij` lands in bits
+    /// `2*ij` (c₁) and `2*ij + 1` (c₂) of the packed byte.
+    pub fn new(r_bar: [[u8; 2]; 4]) -> Self {
+        let bits = r_bar.iter().enumerate().fold(0u8, |acc, (ij, row)| {
+            acc | ((row[0] & 1) << (2 * ij)) | ((row[1] & 1) << (2 * ij + 1))
+        });
+        Self(bits)
+    }
+
+    /// Get the coefficients [c₁, c₂] for input position `ij` (0..=3).
+    ///
+    /// # Panics
+    /// Panics if `ij >= 4` (previously a debug-only shift-overflow panic).
+    #[inline]
+    pub fn get(&self, ij: usize) -> [u8; 2] {
+        assert!(ij < 4, "ControlBits index out of range: {ij}");
+        let shift = 2 * ij;
+        [(self.0 >> shift) & 1, (self.0 >> (shift + 1)) & 1]
+    }
+}
+
+// Re-export circuit types
+pub use circuit::{EncryptedGate, EncryptedGateBatch, GarbledCircuit};
+
+// Re-export main types from garbler
+pub use garbler::{
+    EncryptedGateBatchIter, EncryptedGateIter, Garbler, GarblerError, GarblerOutput,
+};
+
+// Re-export main types from evaluator
+pub use evaluator::{
+    EncryptedGateBatchConsumer, EncryptedGateConsumer, Evaluator, EvaluatorError, EvaluatorOutput,
+    evaluate_garbled_circuits,
+};
+
+#[cfg(test)]
+mod tests;
diff --git a/crates/garble-core/src/three_halves/random_bits.rs b/crates/garble-core/src/three_halves/random_bits.rs
new file mode 100644
index 00000000..89ba30b6
--- /dev/null
+++ b/crates/garble-core/src/three_halves/random_bits.rs
@@ -0,0 +1,92 @@
+//! Random Bit Source for Garbling
+//!
+//! This module provides an efficient source of random bits for garbling
+//! operations. Random bits are pre-generated in bulk and consumed sequentially
+//! to avoid repeated RNG calls during circuit processing.
+
+use rand::Rng;
+
+/// Pre-generated random bits for efficient consumption during garbling.
+///
+/// Bits are stored in u64 words and handed out least-significant bit first.
+/// The pool is fixed at construction; reading past its last word panics.
+pub(super) struct RandomBitSource {
+    /// Pre-generated random words
+    data: Vec<u64>,
+    /// Index of the next bit to hand out
+    bit_idx: usize,
+}
+
+impl RandomBitSource {
+    /// Create a new source with pre-generated random bits.
+    ///
+    /// # Arguments
+    /// * `num_bits` - Total number of random bits needed (rounded up to whole u64 words)
+    /// * `rng` - Random number generator to use for generation
+    pub(super) fn new<R: Rng>(num_bits: usize, rng: &mut R) -> Self {
+        // `div_ceil` cannot overflow, unlike the manual `(num_bits + 63) / 64`.
+        let data: Vec<u64> = (0..num_bits.div_ceil(64)).map(|_| rng.random()).collect();
+        Self { data, bit_idx: 0 }
+    }
+
+    /// Get the next random bit; panics once the pre-generated words run out.
+    #[inline]
+    pub(super) fn next_bit(&mut self) -> bool {
+        let word_idx = self.bit_idx / 64;
+        let bit_pos = self.bit_idx % 64;
+        self.bit_idx += 1;
+        (self.data[word_idx] >> bit_pos) & 1 == 1
+    }
+
+    /// Get the next two random bits, in the order they are drawn.
+    #[inline]
+    pub(super) fn next_two_bits(&mut self) -> [bool; 2] {
+        [self.next_bit(), self.next_bit()]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::{Rng, SeedableRng};
+    use rand_chacha::ChaCha12Rng;
+
+    #[test]
+    fn test_random_bits_match_rng() {
+        let seed = 42;
+
+        let mut rng1 = ChaCha12Rng::seed_from_u64(seed);
+        let mut source = RandomBitSource::new(32, &mut rng1);
+        // Reference value: a u32 drawn directly from an identically seeded RNG
+        let mut rng2 = ChaCha12Rng::seed_from_u64(seed);
+        let expected: u32 = rng2.random();
+
+        // Extract 32 bits: 5 × (next_bit + next_two_bits) = 15 bits, then
+        // 5 × next_two_bits = 10 bits and 7 × next_bit = 7 bits (32 total)
+        let mut actual: u32 = 0;
+        let mut bit_idx = 0;
+
+        // First 15 bits: alternate one-bit and two-bit reads
+        for _ in 0..5 {
+            actual |= (source.next_bit() as u32) << bit_idx;
+            bit_idx += 1;
+            let two = source.next_two_bits();
+            actual |= (two[0] as u32) << bit_idx;
+            actual |= (two[1] as u32) << (bit_idx + 1);
+            bit_idx += 2;
+        }
+        // Remaining 17 bits: five two-bit reads, then seven one-bit reads
+        for _ in 0..5 {
+            let two = source.next_two_bits();
+            actual |= (two[0] as u32) << bit_idx;
+            actual |= (two[1] as u32) << (bit_idx + 1);
+            bit_idx += 2;
+        }
+        for _ in 0..7 {
+            actual |= (source.next_bit() as u32) << bit_idx;
+            bit_idx += 1;
+        }
+        // NOTE(review): assumes the first u64's low 32 bits equal the first u32 draw
+        assert_eq!(actual, expected);
+    }
+}
diff --git a/crates/garble-core/src/three_halves/slicing.rs b/crates/garble-core/src/three_halves/slicing.rs
new file mode 100644
index 00000000..7770f7a5
--- /dev/null
+++ b/crates/garble-core/src/three_halves/slicing.rs
@@ -0,0 +1,147 @@
+//! Wire Label Slicing for Three Halves Garbling
+//!
+//! This module implements the "slicing" technique from the Three Halves paper.
+//! Wire labels are split into left and right halves (κ/2 bits each), allowing
+//! the evaluator to compute each half using potentially different linear
+//! combinations.
+//!
+//! # Paper Reference
+//!
+//! Section 3.1 (Page 8):
+//! > "We slice a wire label W into two halves W_L and W_R, each of length κ/2."
+//!
+//! Section 5 (Page 11):
+//! > "The slicing technique means that 'half' of each wire label
+//! > (i.e., κ/2 bits) can be computed from a different linear
+//! > combination."
+//!
+//! # Layout
+//!
+//! A 128-bit Block is split as follows:
+//! ```text
+//! Block (128 bits):  [byte0, byte1, ..., byte7, byte8, ..., byte15]
+//!                    [======= left =======][======= right ========]
+//!                         (64 bits)              (64 bits)
+//! ```
+//!
+//! The left half occupies bytes 0-7, the right half occupies bytes 8-15.
+//! This matches the little-endian layout used throughout mpz.
+
+use mpz_core::Block;
+
+/// A wire label in sliced form: separate left and right halves.
+///
+/// Each half is κ/2 = 64 bits, stored as `[u8; 8]`.
+///
+/// # Paper Reference
+///
+/// For a wire label `A` the paper writes `A_L` and `A_R` for its left and
+/// right halves; the `left` and `right` fields hold those two values.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub struct SlicedLabel {
+    /// Left half of the wire label (κ/2 bits)
+    ///
+    /// In evaluation equations, this appears in even rows (0, 2, 4, 6).
+    pub left: [u8; 8],
+
+    /// Right half of the wire label (κ/2 bits)
+    ///
+    /// In evaluation equations, this appears in odd rows (1, 3, 5, 7).
+    pub right: [u8; 8],
+}
+
+impl SlicedLabel {
+    /// The all-zero SlicedLabel (both halves are eight zero bytes).
+    pub const ZERO: Self = Self::new([0u8; 8], [0u8; 8]);
+
+    /// Build a SlicedLabel from an explicit left and right half.
+    #[inline]
+    pub const fn new(left: [u8; 8], right: [u8; 8]) -> Self {
+        Self { left, right }
+    }
+
+    /// Split a 128-bit Block into left and right halves.
+    ///
+    /// # Layout
+    ///
+    /// ```text
+    /// Block bytes:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+    ///               [======= left ========][========== right ===========]
+    /// ```
+    #[inline]
+    pub fn from_block(block: Block) -> Self {
+        let halves: [[u8; 8]; 2] = bytemuck::cast(block);
+        Self::new(halves[0], halves[1])
+    }
+
+    /// Recombine left and right halves into a 128-bit Block.
+    ///
+    /// Inverse of [`SlicedLabel::from_block`].
+    #[inline]
+    pub fn to_block(&self) -> Block {
+        let halves = [self.left, self.right];
+        bytemuck::cast(halves)
+    }
+}
+
+impl From<Block> for SlicedLabel {
+    #[inline]
+    fn from(value: Block) -> Self {
+        SlicedLabel::from_block(value)
+    }
+}
+
+impl From<SlicedLabel> for Block {
+    #[inline]
+    fn from(value: SlicedLabel) -> Self {
+        SlicedLabel::to_block(&value)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::SeedableRng;
+    use rand_chacha::ChaCha12Rng;
+
+    /// Block → SlicedLabel → Block is the identity
+    ///
+    /// Checked for a random block, the all-zero block and the all-ones block.
+    #[test]
+    fn test_roundtrip() {
+        let mut rng = ChaCha12Rng::seed_from_u64(42);
+
+        // Random block: recombining the halves restores the input
+        let original = Block::random(&mut rng);
+        let halves = SlicedLabel::from_block(original);
+        let rebuilt = halves.to_block();
+        assert_eq!(original, rebuilt, "Round-trip failed");
+
+        // Zero block: both halves are zero
+        let zero = SlicedLabel::from_block(Block::ZERO);
+        assert_eq!(zero.left, [0u8; 8]);
+        assert_eq!(zero.right, [0u8; 8]);
+        assert_eq!(zero.to_block(), Block::ZERO);
+
+        // All-ones block: both halves are 0xff bytes
+        let ones = SlicedLabel::from_block(Block::ONES);
+        assert_eq!(ones.left, [0xffu8; 8]);
+        assert_eq!(ones.right, [0xffu8; 8]);
+        assert_eq!(ones.to_block(), Block::ONES);
+    }
+
+    /// `From`/`Into` conversions agree with the inherent methods
+    #[test]
+    fn test_from_into_traits() {
+        let mut rng = ChaCha12Rng::seed_from_u64(999);
+        let original = Block::random(&mut rng);
+
+        // Block -> SlicedLabel
+        let halves = SlicedLabel::from(original);
+        assert_eq!(halves, SlicedLabel::from_block(original));
+
+        // SlicedLabel -> Block
+        let rebuilt = Block::from(halves);
+        assert_eq!(rebuilt, original);
+    }
+}
diff --git a/crates/garble-core/src/three_halves/tests.rs b/crates/garble-core/src/three_halves/tests.rs
new file mode 100644
index 00000000..cfbeb056
--- /dev/null
+++ b/crates/garble-core/src/three_halves/tests.rs
@@ -0,0 +1,487 @@
+//! Integration tests for the three-halves garbling scheme
+//!
+//! These tests verify that the Garbler and Evaluator work correctly together.
+//! They are organized by circuit complexity.
+
+use aes::{
+    Aes128 as AesCipher,
+    cipher::{BlockCipherEncrypt, KeyInit},
+};
+use itybity::{FromBitIterator, IntoBitIterator, ToBits};
+use mpz_circuits::{AES128, CircuitBuilder, circuits::xor};
+use mpz_core::Block;
+use mpz_memory_core::correlated::{Delta, Key, Mac};
+use rand::{Rng, SeedableRng, rngs::StdRng};
+use rand_chacha::ChaCha12Rng;
+
+use super::{Evaluator, EvaluatorOutput, Garbler, GarblerOutput};
+
+// ==============================================================================
+// Small Circuit Tests (1-3 AND gates)
+//
+// Basic tests verify correctness on small circuits
+// ==============================================================================
+
+/// Garble-evaluate-verify helper shared by the small-circuit tests
+fn test_circuit_correctness<F>(circuit_builder: F, input_combinations: &[(Vec<bool>, bool)])
+where
+    F: Fn() -> mpz_circuits::Circuit,
+{
+    let circuit = circuit_builder();
+
+    // Each case is (input bits, expected value of output 0); every case
+    // starts from the same RNG seed.
+    for (input_bits, expected_output) in input_combinations {
+        let mut rng = ChaCha12Rng::seed_from_u64(42);
+        let delta = Delta::random(&mut rng);
+        let delta_block = *delta.as_block();
+
+        // One random zero-label per input wire
+        let input_keys: Vec<Key> = (0..input_bits.len())
+            .map(|_| Key::from(rng.random::<Block>()))
+            .collect();
+
+        // Garbler side: produce the encrypted gates, then take the output
+        // zero-labels once the gate stream is exhausted
+        let mut garbler = Garbler::default();
+        let mut gate_iter = garbler.generate(&circuit, delta, &input_keys, &mut rng).unwrap();
+
+        // Buffer every gate so it can be replayed to the evaluator
+        let mut encrypted_gates = Vec::new();
+        while let Some(gate) = gate_iter.next() {
+            encrypted_gates.push(gate);
+        }
+
+        let GarblerOutput {
+            outputs: output_labels,
+        } = gate_iter.finish().unwrap();
+
+        // Active input labels: the key for a 0 bit, key ⊕ Δ for a 1 bit
+        let input_macs: Vec<Mac> = input_keys
+            .iter()
+            .zip(input_bits)
+            .map(|(key, &bit)| {
+                let zero_label = *key.as_block();
+                if bit {
+                    Mac::from(zero_label ^ delta_block)
+                } else {
+                    Mac::from(zero_label)
+                }
+            })
+            .collect();
+
+        // Evaluator side: feed the gates back in the order they were produced
+        let mut evaluator = Evaluator::default();
+        let mut consumer = evaluator.evaluate(&circuit, &input_macs).unwrap();
+
+        for gate in encrypted_gates {
+            consumer.next(gate);
+        }
+
+        let EvaluatorOutput {
+            outputs: output_macs,
+        } = consumer.finish().unwrap();
+
+        // The first output must carry the label encoding the expected bit
+        let zero_output = *output_labels[0].as_block();
+        let expected_mac = if *expected_output {
+            Mac::from(zero_output ^ delta_block)
+        } else {
+            Mac::from(zero_output)
+        };
+
+        assert_eq!(
+            output_macs[0], expected_mac,
+            "Failed for inputs {:?}: expected {}, got wrong output",
+            input_bits, expected_output
+        );
+    }
+}
+
+/// Test: Single AND gate (1 gate), full truth table
+#[test]
+fn test_single_and_gate() {
+    let build_circuit = || {
+        let mut cb = CircuitBuilder::new();
+        let lhs = cb.add_input();
+        let rhs = cb.add_input();
+        let and_out = cb.add_and_gate(lhs, rhs);
+        cb.add_output(and_out);
+        cb.build().unwrap()
+    };
+
+    let truth_table = [
+        (vec![false, false], false),
+        (vec![false, true], false),
+        (vec![true, false], false),
+        (vec![true, true], true),
+    ];
+
+    test_circuit_correctness(build_circuit, &truth_table);
+}
+
+/// Test: Chained AND gates (2 gates) - (a AND b) AND c, full truth table
+#[test]
+fn test_chained_and_gates() {
+    let build_circuit = || {
+        let mut cb = CircuitBuilder::new();
+        let in_a = cb.add_input();
+        let in_b = cb.add_input();
+        let in_c = cb.add_input();
+        let first = cb.add_and_gate(in_a, in_b);
+        let second = cb.add_and_gate(first, in_c);
+        cb.add_output(second);
+        cb.build().unwrap()
+    };
+
+    let truth_table = [
+        (vec![false, false, false], false),
+        (vec![false, false, true], false),
+        (vec![false, true, false], false),
+        (vec![false, true, true], false),
+        (vec![true, false, false], false),
+        (vec![true, false, true], false),
+        (vec![true, true, false], false),
+        (vec![true, true, true], true),
+    ];
+
+    test_circuit_correctness(build_circuit, &truth_table);
+}
+
+/// Test: XOR feeding an AND (1 AND gate) - (a XOR b) AND c
+#[test]
+fn test_xor_then_and() {
+    let build_circuit = || {
+        let mut cb = CircuitBuilder::new();
+        let in_a = cb.add_input();
+        let in_b = cb.add_input();
+        let in_c = cb.add_input();
+        let a_xor_b = cb.add_xor_gate(in_a, in_b);
+        let gated = cb.add_and_gate(a_xor_b, in_c);
+        cb.add_output(gated);
+        cb.build().unwrap()
+    };
+
+    let truth_table = [
+        (vec![false, false, false], false), // (0 XOR 0) AND 0 = 0
+        (vec![false, false, true], false),  // (0 XOR 0) AND 1 = 0
+        (vec![false, true, false], false),  // (0 XOR 1) AND 0 = 0
+        (vec![false, true, true], true),    // (0 XOR 1) AND 1 = 1
+        (vec![true, false, false], false),  // (1 XOR 0) AND 0 = 0
+        (vec![true, false, true], true),    // (1 XOR 0) AND 1 = 1
+        (vec![true, true, false], false),   // (1 XOR 1) AND 0 = 0
+        (vec![true, true, true], false),    // (1 XOR 1) AND 1 = 0
+    ];
+
+    test_circuit_correctness(build_circuit, &truth_table);
+}
+
+/// Test: AND, XOR, AND (2 AND gates) - ((a AND b) XOR c) AND d, full truth table
+#[test]
+fn test_and_xor_and() {
+    let build_circuit = || {
+        let mut cb = CircuitBuilder::new();
+        let in_a = cb.add_input();
+        let in_b = cb.add_input();
+        let in_c = cb.add_input();
+        let in_d = cb.add_input();
+        let a_and_b = cb.add_and_gate(in_a, in_b);
+        let mixed = cb.add_xor_gate(a_and_b, in_c);
+        let gated = cb.add_and_gate(mixed, in_d);
+        cb.add_output(gated);
+        cb.build().unwrap()
+    };
+
+    let truth_table = [
+        (vec![false, false, false, false], false),
+        (vec![false, false, false, true], false),
+        (vec![false, false, true, false], false),
+        (vec![false, false, true, true], true),
+        (vec![false, true, false, false], false),
+        (vec![false, true, false, true], false),
+        (vec![false, true, true, false], false),
+        (vec![false, true, true, true], true),
+        (vec![true, false, false, false], false),
+        (vec![true, false, false, true], false),
+        (vec![true, false, true, false], false),
+        (vec![true, false, true, true], true),
+        (vec![true, true, false, false], false),
+        (vec![true, true, false, true], true),
+        (vec![true, true, true, false], false),
+        (vec![true, true, true, true], false),
+    ];
+
+    test_circuit_correctness(build_circuit, &truth_table);
+}
+
+/// Test: Four inputs through a tree of 3 AND gates - (a AND b) AND (c AND d)
+#[test]
+fn test_four_chained_and_gates() {
+    let build_circuit = || {
+        let mut cb = CircuitBuilder::new();
+        let in_a = cb.add_input();
+        let in_b = cb.add_input();
+        let in_c = cb.add_input();
+        let in_d = cb.add_input();
+        let left = cb.add_and_gate(in_a, in_b);
+        let right = cb.add_and_gate(in_c, in_d);
+        let root = cb.add_and_gate(left, right);
+        cb.add_output(root);
+        cb.build().unwrap()
+    };
+
+    // Only a subset of the 16 input combinations is exercised, for speed
+    let selected_cases = [
+        (vec![false, false, false, false], false),
+        (vec![true, false, true, false], false),
+        (vec![true, true, false, false], false),
+        (vec![true, true, true, false], false),
+        (vec![true, true, true, true], true),
+    ];
+
+    test_circuit_correctness(build_circuit, &selected_cases);
+}
+
+// ==============================================================================
+// Library Circuit Tests (XOR-only and AES128)
+//
+// These tests verify correctness on realistic circuits
+// ==============================================================================
+
+/// Test: XOR-only circuit produces no encrypted gates
+#[test]
+fn test_xor_only_circuit() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    // `xor(8)` comes from the circuit library; the test requires that it
+    // contains no AND gate at all
+    let circuit = xor(8);
+    assert_eq!(circuit.and_count(), 0);
+
+    let a = 1u8;
+    let b = 2u8;
+    let expected = a ^ b;
+    let input_bits: Vec<bool> = a.iter_lsb0().chain(b.iter_lsb0()).collect();
+
+    let delta = Delta::random(&mut rng);
+    let delta_block = *delta.as_block();
+
+    // One random zero-label per circuit input
+    let input_keys: Vec<Key> = (0..circuit.inputs().len())
+        .map(|_| Key::from(rng.random::<Block>()))
+        .collect();
+
+    // Garble in batches and flatten the batches into a single gate list
+    let mut garbler = Garbler::default();
+    let mut batch_iter = garbler
+        .generate_batched(&circuit, delta, &input_keys, &mut rng)
+        .unwrap();
+
+    let mut encrypted_gates = Vec::new();
+    for batch in batch_iter.by_ref() {
+        encrypted_gates.extend(batch.into_array());
+    }
+
+    assert_eq!(
+        encrypted_gates.len(),
+        0,
+        "XOR-only circuit should produce no encrypted gates"
+    );
+
+    let GarblerOutput {
+        outputs: output_labels,
+    } = batch_iter.finish().unwrap();
+
+    // Active input labels: the key for a 0 bit, key ⊕ Δ for a 1 bit
+    let input_macs: Vec<Mac> = input_keys
+        .iter()
+        .zip(&input_bits)
+        .map(|(key, &bit)| {
+            let zero_label = *key.as_block();
+            if bit {
+                Mac::from(zero_label ^ delta_block)
+            } else {
+                Mac::from(zero_label)
+            }
+        })
+        .collect();
+
+    // Evaluate (the gate list is empty, so nothing is fed to the consumer)
+    let mut evaluator = Evaluator::default();
+    let mut consumer = evaluator.evaluate(&circuit, &input_macs).unwrap();
+
+    for gate in encrypted_gates {
+        consumer.next(gate);
+    }
+
+    let EvaluatorOutput {
+        outputs: output_macs,
+    } = consumer.finish().unwrap();
+
+    // Decode: an output bit is 1 exactly when its MAC equals the 1-label,
+    // i.e. the garbler's output label XOR delta
+    let decoded_bits = output_macs
+        .into_iter()
+        .zip(&output_labels)
+        .map(|(mac, zero_label)| mac == Mac::from(*zero_label.as_block() ^ delta_block));
+    let output = u8::from_lsb0_iter(decoded_bits);
+
+    assert_eq!(output, expected);
+}
+
+/// Test: AES128 circuit (6800 AND gates)
+#[test]
+fn test_aes128_circuit() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let key = [69u8; 16];
+    let msg = [42u8; 16];
+
+    // Reference ciphertext computed with the `aes` crate
+    let expected: [u8; 16] = {
+        let cipher = AesCipher::new_from_slice(&key).unwrap();
+        let mut state = msg.into();
+        cipher.encrypt_block(&mut state);
+        state.into()
+    };
+
+    let input_bits: Vec<bool> = key.iter().copied().chain(msg).into_iter_lsb0().collect();
+
+    let delta = Delta::random(&mut rng);
+    let delta_block = *delta.as_block();
+
+    // One random zero-label per circuit input
+    let input_keys: Vec<Key> = (0..AES128.inputs().len())
+        .map(|_| Key::from(rng.random::<Block>()))
+        .collect();
+
+    // Garble in batches and flatten the batches into a single gate list
+    let mut garbler = Garbler::default();
+    let mut batch_iter = garbler
+        .generate_batched(&AES128, delta, &input_keys, &mut rng)
+        .unwrap();
+
+    let mut encrypted_gates = Vec::new();
+    for batch in batch_iter.by_ref() {
+        encrypted_gates.extend(batch.into_array());
+    }
+
+    let GarblerOutput {
+        outputs: output_labels,
+    } = batch_iter.finish().unwrap();
+
+    // Active input labels: the key for a 0 bit, key ⊕ Δ for a 1 bit
+    let input_macs: Vec<Mac> = input_keys
+        .iter()
+        .zip(&input_bits)
+        .map(|(key, &bit)| {
+            let zero_label = *key.as_block();
+            if bit {
+                Mac::from(zero_label ^ delta_block)
+            } else {
+                Mac::from(zero_label)
+            }
+        })
+        .collect();
+
+    // Evaluate: feed the gates back in the order they were produced
+    let mut evaluator = Evaluator::default();
+    let mut consumer = evaluator.evaluate(&AES128, &input_macs).unwrap();
+
+    for gate in encrypted_gates {
+        consumer.next(gate);
+    }
+
+    let EvaluatorOutput {
+        outputs: output_macs,
+    } = consumer.finish().unwrap();
+
+    // Decode: an output bit is 1 exactly when its MAC equals the 1-label,
+    // i.e. the garbler's output label XOR delta
+    let decoded_bits = output_macs
+        .into_iter()
+        .zip(&output_labels)
+        .map(|(mac, zero_label)| mac == Mac::from(*zero_label.as_block() ^ delta_block));
+    let output: Vec<u8> = Vec::from_lsb0_iter(decoded_bits);
+
+    assert_eq!(output, expected);
+}
+
+/// Test: AES128 with circuit reuse (preprocessed garbling)
+#[test]
+fn test_aes128_preprocessed() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let key = [69u8; 16];
+    let msg = [42u8; 16];
+
+    // Reference ciphertext computed with the `aes` crate
+    let expected: [u8; 16] = {
+        let cipher = AesCipher::new_from_slice(&key).unwrap();
+        let mut state = msg.into();
+        cipher.encrypt_block(&mut state);
+        state.into()
+    };
+
+    let input_bits: Vec<bool> = key.iter().copied().chain(msg).into_iter_lsb0().collect();
+
+    let delta = Delta::random(&mut rng);
+    let delta_block = *delta.as_block();
+
+    // One random zero-label per circuit input
+    let input_keys: Vec<Key> = (0..AES128.inputs().len())
+        .map(|_| Key::from(rng.random::<Block>()))
+        .collect();
+
+    let mut garbler = Garbler::default();
+    let mut batch_iter = garbler
+        .generate_batched(&AES128, delta, &input_keys, &mut rng)
+        .unwrap();
+
+    // Collect (preprocess): keep every gate so the list can be replayed
+    let mut encrypted_gates = Vec::new();
+    for batch in batch_iter.by_ref() {
+        encrypted_gates.extend(batch.into_array());
+    }
+
+    let GarblerOutput {
+        outputs: output_labels,
+    } = batch_iter.finish().unwrap();
+
+    // Active input labels: the key for a 0 bit, key ⊕ Δ for a 1 bit
+    let input_macs: Vec<Mac> = input_keys
+        .iter()
+        .zip(&input_bits)
+        .map(|(key, &bit)| {
+            let zero_label = *key.as_block();
+            if bit {
+                Mac::from(zero_label ^ delta_block)
+            } else {
+                Mac::from(zero_label)
+            }
+        })
+        .collect();
+
+    // Evaluate the same garbled gates twice, with a fresh evaluator each time
+    for _ in 0..2 {
+        let mut evaluator = Evaluator::default();
+        let mut consumer = evaluator.evaluate(&AES128, &input_macs).unwrap();
+
+        for gate in &encrypted_gates {
+            consumer.next(*gate);
+        }
+
+        let EvaluatorOutput {
+            outputs: output_macs,
+        } = consumer.finish().unwrap();
+
+        // Decode: a bit is 1 exactly when its MAC equals label XOR delta
+        let decoded_bits = output_macs
+            .iter()
+            .zip(&output_labels)
+            .map(|(mac, zero_label)| *mac == Mac::from(*zero_label.as_block() ^ delta_block));
+        let output: Vec<u8> = Vec::from_lsb0_iter(decoded_bits);
+
+        assert_eq!(output, expected);
+    }
+}