diff --git a/Cargo.toml b/Cargo.toml index 78a6bfc5..aa450071 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ members = [ "crates/vm-core", "crates/zk-core", "crates/zk", + "crates/predicate", ] exclude = ["crates/lpn-estimator"] resolver = "2" @@ -63,6 +64,7 @@ mpz-memory-core = { path = "crates/memory-core" } mpz-vm-core = { path = "crates/vm-core" } mpz-zk-core = { path = "crates/zk-core" } mpz-zk = { path = "crates/zk" } +mpz-predicate = { path = "crates/predicate" } clmul = { path = "crates/clmul" } matrix-transpose = { path = "crates/matrix-transpose" } rangeset = "0.4" diff --git a/crates/circuits-core/src/ops.rs b/crates/circuits-core/src/ops.rs index 60d5e9c2..b477506f 100644 --- a/crates/circuits-core/src/ops.rs +++ b/crates/circuits-core/src/ops.rs @@ -213,6 +213,108 @@ pub fn inv(builder: &mut CircuitBuilder, a: [Node; N]) -> std::array::from_fn(|n| builder.add_inv_gate(a[n])) } +/// Returns 1 if all input bits are 1, otherwise 0. +pub fn all(builder: &mut CircuitBuilder, inputs: &[Node]) -> Node { + assert!(!inputs.is_empty(), "all requires at least one input"); + inputs + .iter() + .copied() + .reduce(|acc, x| builder.add_and_gate(acc, x)) + .unwrap() +} + +/// Returns 1 if any input bit is 1, otherwise 0. +pub fn any(builder: &mut CircuitBuilder, inputs: &[Node]) -> Node { + assert!(!inputs.is_empty(), "any requires at least one input"); + inputs + .iter() + .copied() + .reduce(|acc, x| { + // OR = (A ⊕ B) ⊕ (A ∧ B) + let a_xor_b = builder.add_xor_gate(acc, x); + let a_and_b = builder.add_and_gate(acc, x); + builder.add_xor_gate(a_xor_b, a_and_b) + }) + .unwrap() +} + +/// Returns 1 if two nbit values are equal, otherwise 0. 
+pub fn eq( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // Two values are equal if all XOR bits are 0, i.e., NOR of all XORs + let xors: Vec<_> = a + .iter() + .zip(b.iter()) + .map(|(a, b)| builder.add_xor_gate(*a, *b)) + .collect(); + + // All XORs must be 0 for equality + let any_diff = any(builder, &xors); + builder.add_inv_gate(any_diff) +} + +/// Returns 1 if two nbit values are not equal, otherwise 0. +pub fn neq( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // Two values are not equal if any XOR bit is 1 + let xors: Vec<_> = a + .iter() + .zip(b.iter()) + .map(|(a, b)| builder.add_xor_gate(*a, *b)) + .collect(); + + any(builder, &xors) +} + +/// Returns 1 if a < b (unsigned), otherwise 0. +pub fn lt( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // a < b iff (a - b) underflows + let (_diff, underflow) = wrapping_sub(builder, &a, &b); + underflow +} + +/// Returns 1 if a <= b (unsigned), otherwise 0. +pub fn lte( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // a <= b iff NOT (a > b) iff NOT (b < a) + let b_lt_a = lt(builder, b, a); + builder.add_inv_gate(b_lt_a) +} + +/// Returns 1 if a > b (unsigned), otherwise 0. +pub fn gt( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // a > b iff b < a + lt(builder, b, a) +} + +/// Returns 1 if a >= b (unsigned), otherwise 0. 
+pub fn gte( + builder: &mut CircuitBuilder, + a: [Node; N], + b: [Node; N], +) -> Node { + // a >= b iff NOT (a < b) + let a_lt_b = lt(builder, a, b); + builder.add_inv_gate(a_lt_b) +} + #[cfg(test)] mod tests { use std::array::from_fn; diff --git a/crates/predicate/Cargo.toml b/crates/predicate/Cargo.toml new file mode 100644 index 00000000..c0fe39da --- /dev/null +++ b/crates/predicate/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "mpz-predicate" +version = "0.1.0-alpha.14-pre" +edition = "2021" + +[lints] +workspace = true + +[dependencies] +bytes = { workspace = true } +serde = { version = "1", features = ["derive"] } +rand = { workspace = true } +rand_core = { workspace = true } +rand_chacha = { workspace = true } +thiserror = { workspace = true } +rangeset = { workspace = true, features = ["serde"] } +mpz-circuits = { workspace = true } +serde_json = { version = "*", features = ["raw_value"] } + +[dev-dependencies] +rstest = { workspace = true } diff --git a/crates/predicate/src/compiler.rs b/crates/predicate/src/compiler.rs new file mode 100644 index 00000000..82b412fd --- /dev/null +++ b/crates/predicate/src/compiler.rs @@ -0,0 +1,285 @@ +//! Compiles predicates into boolean circuits. +//! +//! The [`Compiler`] transforms a predicate tree into a circuit that can be +//! evaluated on byte data. Each byte index referenced in the predicate becomes +//! 8 input feeds (one per bit), and the circuit outputs a single bit indicating +//! whether the predicate is satisfied. + +use std::collections::HashMap; + +use mpz_circuits::{itybity::ToBits, ops, Circuit, CircuitBuilder, Feed, Node}; + +use crate::{CmpOp, Pred, PredNode, Rhs}; + +/// Compiles predicates into boolean circuits. +/// +/// The compiler maintains internal state for mapping byte indices to circuit +/// feeds and caching processed predicate nodes to avoid redundant work. +pub struct Compiler { + /// Maps byte indices to their 8-bit circuit feed representation. 
+ map: HashMap; 8]>, + /// Caches processed predicate nodes by pointer address to avoid + /// recomputation. + cache: HashMap>, +} + +impl Compiler { + /// Creates a new compiler instance. + pub fn new() -> Self { + Self { + map: HashMap::new(), + cache: HashMap::new(), + } + } + + /// Compiles a predicate into a boolean circuit. + /// + /// # Arguments + /// * `pred` - The predicate to compile. + /// + /// # Returns + /// A circuit with: + /// - Inputs: 8 bits per unique byte index in the predicate (sorted order) + /// - Output: Single bit indicating predicate satisfaction + pub fn compile(&mut self, pred: &Pred) -> Circuit { + let mut builder = CircuitBuilder::new(); + + // Create 8-bit input feeds for each byte index referenced in the predicate + for idx in pred.indices() { + let feeds: Vec<_> = (0..8).map(|_| builder.add_input()).collect(); + self.map.insert(idx, feeds.try_into().unwrap()); + } + + let output = self.process(&mut builder, pred); + + builder.add_output(output); + builder.build().unwrap() + } + + /// Recursively processes a predicate node into circuit gates. + /// + /// # Arguments + /// * `builder` - The circuit builder to add gates to. + /// * `pred` - The predicate node to process. + /// + /// # Returns + /// A feed node representing the predicate's boolean result. 
+ fn process(&mut self, builder: &mut CircuitBuilder, pred: &Pred) -> Node { + // Check cache to avoid reprocessing shared predicate nodes + let key = pred.ptr_key(); + if let Some(&cached) = self.cache.get(&key) { + return cached; + } + + let result = match pred.inner() { + PredNode::And(children) => { + let outputs: Vec<_> = children.iter().map(|p| self.process(builder, p)).collect(); + ops::all(builder, &outputs) + } + PredNode::Or(children) => { + let outputs: Vec<_> = children.iter().map(|p| self.process(builder, p)).collect(); + ops::any(builder, &outputs) + } + PredNode::Not(child) => { + let child_out = self.process(builder, child); + ops::inv(builder, [child_out])[0] + } + PredNode::Atom(atom) => { + let lhs = self.map[&atom.index]; + let rhs = match atom.rhs { + Rhs::Const(c) => const_to_feeds(builder, c), + Rhs::Idx(idx) => self.map[&idx], + }; + match atom.op { + CmpOp::Eq => ops::eq(builder, lhs, rhs), + CmpOp::Ne => ops::neq(builder, lhs, rhs), + CmpOp::Lt => ops::lt(builder, lhs, rhs), + CmpOp::Lte => ops::lte(builder, lhs, rhs), + CmpOp::Gt => ops::gt(builder, lhs, rhs), + CmpOp::Gte => ops::gte(builder, lhs, rhs), + } + } + }; + + self.cache.insert(key, result); + result + } +} + +/// Converts a constant byte value to 8 circuit feed nodes. +/// +/// # Arguments +/// * `builder` - The circuit builder to get constant nodes from. +/// * `value` - The byte value to convert. +/// +/// # Returns +/// An array of 8 feed nodes representing the byte in LSB-first order. 
+fn const_to_feeds(builder: &CircuitBuilder, value: u8) -> [Node; 8] { + value + .iter_lsb0() + .map(|bit| { + if bit { + builder.get_const_one() + } else { + builder.get_const_zero() + } + }) + .collect::>() + .try_into() + .expect("u8 always has 8 bits") +} + +#[cfg(test)] +mod test { + use super::*; + use crate::{eq, gt, gte, lt, lte, ne}; + use mpz_circuits::evaluate; + + #[test] + fn test_compile_and() { + // data[0] < data[1] AND data[2] == 2 + let pred = Pred::and(vec![lt(0, 1usize), eq(2, 2u8)]); + + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [1u8, 2, 2]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [1u8, 2, 3]).unwrap(); + assert_eq!(res, false); // second fails + let res: bool = evaluate!(circ, [5u8, 2, 2]).unwrap(); + assert_eq!(res, false); // first fails + } + + #[test] + fn test_compile_or() { + // data[0] < data[1] OR data[2] == 2 + let pred = Pred::or(vec![lt(0, 1usize), eq(2, 2u8)]); + + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [1u8, 2, 0]).unwrap(); + assert_eq!(res, true); // first true + let res: bool = evaluate!(circ, [5u8, 2, 2]).unwrap(); + assert_eq!(res, true); // second true + let res: bool = evaluate!(circ, [5u8, 2, 0]).unwrap(); + assert_eq!(res, false); // both false + } + + #[test] + fn test_compile_not() { + // NOT (data[0] < data[1]) + let pred = Pred::not(lt(0, 1usize)); + + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [5u8, 3]).unwrap(); + assert_eq!(res, true); // 5 < 3 is false + let res: bool = evaluate!(circ, [1u8, 3]).unwrap(); + assert_eq!(res, false); // 1 < 3 is true + } + + #[test] + fn test_compile_const_rhs() { + // data[0] < 22 + let pred = lt(0, 22u8); + + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, 5u8).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, 23u8).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn 
test_compile_index_rhs() { + // data[0] < data[1] + let pred = lt(0, 1usize); + + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, 5u8, 10u8).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, 23u8, 5u8).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_same_index() { + // data[0] == data[0] (always true) + let pred1 = eq(0, 0usize); + // data[0] < data[0] (always false) + let pred2 = lt(0, 0usize); + + let res: bool = evaluate!(Compiler::new().compile(&pred1), 5u8).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(Compiler::new().compile(&pred2), 5u8).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_eq() { + let pred = eq(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [5u8, 5]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [1u8, 3]).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_ne() { + let pred = ne(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [5u8, 6]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [1u8, 1]).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_gt() { + let pred = gt(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [7u8, 6]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [1u8, 1]).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_gte() { + let pred = gte(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [7u8, 7]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [0u8, 1]).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_lt() { + let pred = lt(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [2u8, 7]).unwrap(); + assert_eq!(res, true); + let res: bool = 
evaluate!(circ, [4u8, 1]).unwrap(); + assert_eq!(res, false); + } + + #[test] + fn test_compile_lte() { + let pred = lte(0, 1usize); + let circ = Compiler::new().compile(&pred); + + let res: bool = evaluate!(circ, [2u8, 2]).unwrap(); + assert_eq!(res, true); + let res: bool = evaluate!(circ, [4u8, 1]).unwrap(); + assert_eq!(res, false); + } +} diff --git a/crates/predicate/src/http.rs b/crates/predicate/src/http.rs new file mode 100644 index 00000000..42f08055 --- /dev/null +++ b/crates/predicate/src/http.rs @@ -0,0 +1,67 @@ +//! Pre-built predicates for HTTP validation. + +use crate::{ne, Pred}; +use rangeset::prelude::RangeSet; + +/// Builds a predicate that validates an HTTP header value. +/// +/// HTTP header values must not contain carriage return (`\r`, ASCII 13). +pub fn validate_header_value(range: RangeSet) -> Pred { + let preds: Vec = range.iter_values().map(|idx| ne(idx, b'\r')).collect(); + Pred::and(preds) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::compiler::Compiler; + use mpz_circuits::evaluate; + + #[test] + fn test_validate_header_value_valid() { + let valid_cases = vec![ + "application/json", + "text/html; charset=utf-8", + "Bearer token123", + "gzip, deflate", + "Mon, 01 Jan 2024 00:00:00 GMT", + "bytes=0-1023", + "*/*", + "keep-alive", + "no-cache", + "https://example.com", + "hello world", + "value\twith\ttabs", // tabs are allowed + "value with spaces", + ]; + + for input in valid_cases { + let bytes = input.as_bytes(); + let pred = validate_header_value(RangeSet::from(0..bytes.len())); + let circ = Compiler::new().compile(&pred); + let out: bool = evaluate!(circ, bytes).unwrap(); + + assert!(out, "Expected valid header value for '{}'", input); + } + } + + #[test] + fn test_validate_header_value_invalid() { + let invalid_cases = vec![ + ("value\r\ninjection", "contains CRLF"), + ("value\ronly", "contains CR"), + ("\rstart", "starts with CR"), + ("end\r", "ends with CR"), + ("mid\rdle", "CR in middle"), + ]; + + for (input, 
desc) in invalid_cases { + let bytes = input.as_bytes(); + let pred = validate_header_value(RangeSet::from(0..bytes.len())); + let circ = Compiler::new().compile(&pred); + let out: bool = evaluate!(circ, bytes).unwrap(); + + assert!(!out, "Expected invalid header value for '{}' ({})", input, desc); + } + } +} diff --git a/crates/predicate/src/json.rs b/crates/predicate/src/json.rs new file mode 100644 index 00000000..72020c8a --- /dev/null +++ b/crates/predicate/src/json.rs @@ -0,0 +1,1167 @@ +//! Pre-built predicates for validating JSON objects. + +use crate::{eq, gte, lte, Pred}; +use rangeset::prelude::RangeSet; + +/// Builds a predicate that validates a non-empty JSON integer (digits only). +pub fn validate_integer(range: RangeSet) -> Pred { + let preds = range + .iter_values() + .map(|idx| Pred::and(vec![lte(idx, 57u8), gte(idx, 48u8)])) + .collect::>(); + Pred::and(preds) +} + +/// Builds a predicate that validates a JSON number. +/// +/// JSON number grammar: +/// ```text +/// number = [ "-" ] int [ frac ] [ exp ] +/// int = "0" | ( digit1-9 *digit ) +/// frac = "." 
1*digit +/// exp = ( "e" | "E" ) [ "+" | "-" ] 1*digit +/// ``` +pub fn validate_number(range: RangeSet) -> Pred { + let len = range.len(); + assert!(len > 0); + + let positions: Vec = range.iter_values().collect(); + + // Character class predicates for each position + let is_minus: Vec = positions.iter().map(|&p| eq(p, b'-')).collect(); + let is_zero: Vec = positions.iter().map(|&p| eq(p, b'0')).collect(); + let is_digit: Vec = positions + .iter() + .map(|&p| Pred::and(vec![gte(p, b'0'), lte(p, b'9')])) + .collect(); + let is_digit_1_9: Vec = positions + .iter() + .map(|&p| Pred::and(vec![gte(p, b'1'), lte(p, b'9')])) + .collect(); + let is_dot: Vec = positions.iter().map(|&p| eq(p, b'.')).collect(); + let is_exp: Vec = positions + .iter() + .map(|&p| Pred::or(vec![eq(p, b'e'), eq(p, b'E')])) + .collect(); + let is_sign: Vec = positions + .iter() + .map(|&p| Pred::or(vec![eq(p, b'+'), eq(p, b'-')])) + .collect(); + + // State tracking: after processing position i, which states are possible? 
+ // States represent where we are in the grammar: + // - in_int_zero: integer part is exactly "0" (or "-0") + // - in_int_digits: in integer digits (after 1-9) + // - in_frac_start: just saw '.', need at least one digit + // - in_frac_digits: in fractional digits + // - in_exp_start: just saw 'e'/'E', need sign or digit + // - in_exp_sign: just saw exp sign, need at least one digit + // - in_exp_digits: in exponent digits + + let mut in_int_zero: Vec> = Vec::with_capacity(len); + let mut in_int_digits: Vec> = Vec::with_capacity(len); + let mut in_frac_start: Vec> = Vec::with_capacity(len); + let mut in_frac_digits: Vec> = Vec::with_capacity(len); + let mut in_exp_start: Vec> = Vec::with_capacity(len); + let mut in_exp_sign: Vec> = Vec::with_capacity(len); + let mut in_exp_digits: Vec> = Vec::with_capacity(len); + + let mut is_valid: Vec = Vec::with_capacity(len); + + for i in 0..len { + if i == 0 { + // First character: '-', '0', or '1'-'9' + in_int_zero.push(Some(is_zero[i].clone())); + in_int_digits.push(Some(is_digit_1_9[i].clone())); + in_frac_start.push(None); + in_frac_digits.push(None); + in_exp_start.push(None); + in_exp_sign.push(None); + in_exp_digits.push(None); + + let valid = Pred::or(vec![ + is_minus[i].clone(), + is_zero[i].clone(), + is_digit_1_9[i].clone(), + ]); + is_valid.push(valid); + } else if i == 1 { + // Second character depends on first + let prev_was_minus = is_minus[0].clone(); + let prev_was_zero = is_zero[0].clone(); + let prev_was_digit_1_9 = is_digit_1_9[0].clone(); + + // Transitions to int_zero: '-' followed by '0' + let after_minus_zero = Pred::and(vec![prev_was_minus.clone(), is_zero[i].clone()]); + in_int_zero.push(Some(after_minus_zero)); + + // Transitions to int_digits: '-' + 1-9, or 1-9 + digit + let after_minus_digit = + Pred::and(vec![prev_was_minus.clone(), is_digit_1_9[i].clone()]); + let continue_int = Pred::and(vec![prev_was_digit_1_9.clone(), is_digit[i].clone()]); + 
in_int_digits.push(Some(Pred::or(vec![after_minus_digit, continue_int]))); + + // Transitions to frac_start: '0' + '.', or 1-9 + '.' + let zero_to_frac = Pred::and(vec![prev_was_zero.clone(), is_dot[i].clone()]); + let digit_to_frac = Pred::and(vec![prev_was_digit_1_9.clone(), is_dot[i].clone()]); + in_frac_start.push(Some(Pred::or(vec![zero_to_frac, digit_to_frac]))); + + in_frac_digits.push(None); + + // Transitions to exp_start: '0' + e/E, or 1-9 + e/E + let zero_to_exp = Pred::and(vec![prev_was_zero.clone(), is_exp[i].clone()]); + let digit_to_exp = Pred::and(vec![prev_was_digit_1_9.clone(), is_exp[i].clone()]); + in_exp_start.push(Some(Pred::or(vec![zero_to_exp, digit_to_exp]))); + + in_exp_sign.push(None); + in_exp_digits.push(None); + + let valid = Pred::or(vec![ + Pred::and(vec![prev_was_minus.clone(), is_zero[i].clone()]), + Pred::and(vec![prev_was_minus.clone(), is_digit_1_9[i].clone()]), + Pred::and(vec![prev_was_zero.clone(), is_dot[i].clone()]), + Pred::and(vec![prev_was_zero.clone(), is_exp[i].clone()]), + Pred::and(vec![prev_was_digit_1_9.clone(), is_digit[i].clone()]), + Pred::and(vec![prev_was_digit_1_9.clone(), is_dot[i].clone()]), + Pred::and(vec![prev_was_digit_1_9.clone(), is_exp[i].clone()]), + ]); + is_valid.push(valid); + } else { + // General case: i >= 2 + let new_int_zero: Vec = Vec::new(); + let mut new_int_digits: Vec = Vec::new(); + let mut new_frac_start: Vec = Vec::new(); + let mut new_frac_digits: Vec = Vec::new(); + let mut new_exp_start: Vec = Vec::new(); + let mut new_exp_sign: Vec = Vec::new(); + let mut new_exp_digits: Vec = Vec::new(); + let mut valid_preds: Vec = Vec::new(); + + // From int_zero: -> frac_start ('.') or exp_start ('e'/'E') + if let Some(ref prev) = in_int_zero[i - 1] { + let to_frac = Pred::and(vec![prev.clone(), is_dot[i].clone()]); + let to_exp = Pred::and(vec![prev.clone(), is_exp[i].clone()]); + new_frac_start.push(to_frac.clone()); + new_exp_start.push(to_exp.clone()); + valid_preds.push(to_frac); + 
valid_preds.push(to_exp); + } + + // From int_digits: -> digit (stay), frac_start ('.'), or exp_start ('e'/'E') + if let Some(ref prev) = in_int_digits[i - 1] { + let stay = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + let to_frac = Pred::and(vec![prev.clone(), is_dot[i].clone()]); + let to_exp = Pred::and(vec![prev.clone(), is_exp[i].clone()]); + new_int_digits.push(stay.clone()); + new_frac_start.push(to_frac.clone()); + new_exp_start.push(to_exp.clone()); + valid_preds.push(stay); + valid_preds.push(to_frac); + valid_preds.push(to_exp); + } + + // From frac_start: -> frac_digits (digit required) + if let Some(ref prev) = in_frac_start[i - 1] { + let to_digits = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + new_frac_digits.push(to_digits.clone()); + valid_preds.push(to_digits); + } + + // From frac_digits: -> digit (stay) or exp_start ('e'/'E') + if let Some(ref prev) = in_frac_digits[i - 1] { + let stay = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + let to_exp = Pred::and(vec![prev.clone(), is_exp[i].clone()]); + new_frac_digits.push(stay.clone()); + new_exp_start.push(to_exp.clone()); + valid_preds.push(stay); + valid_preds.push(to_exp); + } + + // From exp_start: -> exp_sign ('+'/'-') or exp_digits (digit) + if let Some(ref prev) = in_exp_start[i - 1] { + let to_sign = Pred::and(vec![prev.clone(), is_sign[i].clone()]); + let to_digits = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + new_exp_sign.push(to_sign.clone()); + new_exp_digits.push(to_digits.clone()); + valid_preds.push(to_sign); + valid_preds.push(to_digits); + } + + // From exp_sign: -> exp_digits (digit required) + if let Some(ref prev) = in_exp_sign[i - 1] { + let to_digits = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + new_exp_digits.push(to_digits.clone()); + valid_preds.push(to_digits); + } + + // From exp_digits: -> digit (stay) + if let Some(ref prev) = in_exp_digits[i - 1] { + let stay = Pred::and(vec![prev.clone(), is_digit[i].clone()]); + 
new_exp_digits.push(stay.clone()); + valid_preds.push(stay); + } + + in_int_zero.push(if new_int_zero.is_empty() { + None + } else { + Some(Pred::or(new_int_zero)) + }); + in_int_digits.push(if new_int_digits.is_empty() { + None + } else { + Some(Pred::or(new_int_digits)) + }); + in_frac_start.push(if new_frac_start.is_empty() { + None + } else { + Some(Pred::or(new_frac_start)) + }); + in_frac_digits.push(if new_frac_digits.is_empty() { + None + } else { + Some(Pred::or(new_frac_digits)) + }); + in_exp_start.push(if new_exp_start.is_empty() { + None + } else { + Some(Pred::or(new_exp_start)) + }); + in_exp_sign.push(if new_exp_sign.is_empty() { + None + } else { + Some(Pred::or(new_exp_sign)) + }); + in_exp_digits.push(if new_exp_digits.is_empty() { + None + } else { + Some(Pred::or(new_exp_digits)) + }); + + assert!( + !valid_preds.is_empty(), + "No valid transitions at position {i}" + ); + is_valid.push(Pred::or(valid_preds)); + } + } + + // Final validation: must end in a terminal state + // Terminal: int_zero, int_digits, frac_digits, exp_digits + // Non-terminal: frac_start, exp_start, exp_sign (all require more input) + let last = len - 1; + let mut terminal_states: Vec = Vec::new(); + + if len == 1 { + // Single character: must be a digit + terminal_states.push(is_digit[0].clone()); + } else { + if let Some(ref state) = in_int_zero[last] { + terminal_states.push(state.clone()); + } + if let Some(ref state) = in_int_digits[last] { + terminal_states.push(state.clone()); + } + if let Some(ref state) = in_frac_digits[last] { + terminal_states.push(state.clone()); + } + if let Some(ref state) = in_exp_digits[last] { + terminal_states.push(state.clone()); + } + } + + assert!( + !terminal_states.is_empty(), + "No terminal states possible for length {len}" + ); + + is_valid.push(Pred::or(terminal_states)); + Pred::and(is_valid) +} + +/// Builds a predicate that validates a non-empty JSON string (content between +/// quotes). 
+pub fn validate_string(range: RangeSet) -> Pred { + let len = range.len(); + assert!(len > 0); + + let positions: Vec = range.iter_values().collect(); + let mut data: Vec = positions.iter().map(|&pos| ByteData::new(pos)).collect(); + + // Track escape sequences - Option where None means "can't start here" + let mut is_escape_start: Vec> = Vec::with_capacity(len); + let mut is_unicode_escape_start: Vec> = Vec::with_capacity(len); + + // Track UTF-8 multi-byte sequences + let mut starts_utf8_2: Vec> = Vec::with_capacity(len); + let mut starts_utf8_3: Vec> = Vec::with_capacity(len); + let mut starts_utf8_4: Vec> = Vec::with_capacity(len); + + let mut is_valid = Vec::with_capacity(len); + + for i in 0..len { + // Compute if this position is consumed by a previous escape or UTF-8 sequence + let mut consumed_by: Vec> = Vec::new(); + + // Consumed by simple escape at i-1 (not unicode) + if i >= 1 { + if let Some(prev_starts) = &is_escape_start[i - 1] { + let prev_is_simple = match &is_unicode_escape_start[i - 1] { + Some(prev_is_unicode) => { + // prev_starts AND NOT prev_is_unicode + Pred::and(vec![ + prev_starts.clone(), + Pred::not(prev_is_unicode.clone()), + ]) + } + None => { + // No unicode escape possible, so if escape starts, it's simple + prev_starts.clone() + } + }; + consumed_by.push(Some(prev_is_simple)); + } + } + + // Consumed by unicode escape at i-1 through i-5 + for offset in 1..=5 { + if i >= offset { + consumed_by.push(is_unicode_escape_start[i - offset].clone()); + } + } + + // Consumed by UTF-8 2-byte sequence starting at i-1 + if i >= 1 { + consumed_by.push(starts_utf8_2[i - 1].clone()); + } + + // Consumed by UTF-8 3-byte sequence starting at i-1 or i-2 + for offset in 1..=2 { + if i >= offset { + consumed_by.push(starts_utf8_3[i - offset].clone()); + } + } + + // Consumed by UTF-8 4-byte sequence starting at i-1, i-2, or i-3 + for offset in 1..=3 { + if i >= offset { + consumed_by.push(starts_utf8_4[i - offset].clone()); + } + } + + let 
pos_consumed = or_opts(consumed_by); + + // Compute escape sequence starts (only if not consumed) + let is_escape = data[i].is_escape(); + + let (could_start_escape, could_start_unicode): (Option, Option) = if len - i > 1 + { + let next_is_simple_suffix = data[i + 1].is_escape_suffix(); + let next_is_unicode = data[i + 1].is_unicode_escape_suffix(); + + let is_valid_unicode: Option = if len - i > 5 { + let is_hex_0 = data[i + 2].is_hex(); + let is_hex_1 = data[i + 3].is_hex(); + let is_hex_2 = data[i + 4].is_hex(); + let is_hex_3 = data[i + 5].is_hex(); + Some(Pred::and(vec![ + next_is_unicode.clone(), + is_hex_0, + is_hex_1, + is_hex_2, + is_hex_3, + ])) + } else { + None + }; + + // Valid escape = simple suffix OR valid unicode + let is_valid_escape_seq = match &is_valid_unicode { + Some(unicode) => Pred::or(vec![next_is_simple_suffix, unicode.clone()]), + None => next_is_simple_suffix, + }; + + let could_start = Pred::and(vec![is_escape.clone(), is_valid_escape_seq]); + let could_unicode = is_valid_unicode.map(|u| Pred::and(vec![is_escape.clone(), u])); + + (Some(could_start), could_unicode) + } else { + (None, None) + }; + + // Apply "not consumed" constraint + let starts_escape = match (&could_start_escape, &pos_consumed) { + (Some(escape), Some(consumed)) => { + Some(Pred::and(vec![escape.clone(), Pred::not(consumed.clone())])) + } + (Some(escape), None) => Some(escape.clone()), + (None, _) => None, + }; + + let starts_unicode = match (&could_start_unicode, &pos_consumed) { + (Some(unicode), Some(consumed)) => Some(Pred::and(vec![ + unicode.clone(), + Pred::not(consumed.clone()), + ])), + (Some(unicode), None) => Some(unicode.clone()), + (None, _) => None, + }; + + is_escape_start.push(starts_escape.clone()); + is_unicode_escape_start.push(starts_unicode.clone()); + + // Compute UTF-8 sequence starts (only if not consumed) + // 2-byte: 0xC2-0xDF followed by 0x80-0xBF + let could_start_utf8_2: Option = if len - i > 1 { + let is_2byte_start = 
data[i].is_2byte_start(); + let next_is_cont = data[i + 1].is_continuation(); + Some(Pred::and(vec![is_2byte_start, next_is_cont])) + } else { + None + }; + + // 3-byte: 0xE0-0xEF with special cases + let could_start_utf8_3: Option = if len - i > 2 { + let is_3byte_start = data[i].is_3byte_start(); + let cont1 = data[i + 1].is_continuation(); + let cont2 = data[i + 2].is_continuation(); + + let is_e0 = data[i].is_e0(); + let is_ed = data[i].is_ed(); + let not_e0 = Pred::not(is_e0.clone()); + let not_ed = Pred::not(is_ed.clone()); + let is_normal_3byte = Pred::and(vec![is_3byte_start, not_e0, not_ed]); + + // E0: second byte must be A0-BF + let cont1_a0_bf = data[i + 1].is_cont_a0_bf(); + let e0_valid = Pred::and(vec![is_e0, cont1_a0_bf]); + + // ED: second byte must be 80-9F + let cont1_80_9f = data[i + 1].is_cont_80_9f(); + let ed_valid = Pred::and(vec![is_ed, cont1_80_9f]); + + // Normal 3-byte: second byte 80-BF + let normal_valid = Pred::and(vec![is_normal_3byte, cont1]); + + let first_two_valid = Pred::or(vec![e0_valid, ed_valid, normal_valid]); + Some(Pred::and(vec![first_two_valid, cont2])) + } else { + None + }; + + // 4-byte: 0xF0-0xF4 with special cases + let could_start_utf8_4: Option = if len - i > 3 { + let is_4byte_start = data[i].is_4byte_start(); + let cont1 = data[i + 1].is_continuation(); + let cont2 = data[i + 2].is_continuation(); + let cont3 = data[i + 3].is_continuation(); + + let is_f0 = data[i].is_f0(); + let is_f4 = data[i].is_f4(); + let not_f0 = Pred::not(is_f0.clone()); + let not_f4 = Pred::not(is_f4.clone()); + let is_normal_4byte = Pred::and(vec![is_4byte_start, not_f0, not_f4]); + + // F0: second byte must be 90-BF + let cont1_90_bf = data[i + 1].is_cont_90_bf(); + let f0_valid = Pred::and(vec![is_f0, cont1_90_bf]); + + // F4: second byte must be 80-8F + let cont1_80_8f = data[i + 1].is_cont_80_8f(); + let f4_valid = Pred::and(vec![is_f4, cont1_80_8f]); + + // Normal 4-byte (F1-F3): second byte 80-BF + let normal_valid = 
Pred::and(vec![is_normal_4byte, cont1]); + + let first_two_valid = Pred::or(vec![f0_valid, f4_valid, normal_valid]); + let first_three_valid = Pred::and(vec![first_two_valid, cont2]); + Some(Pred::and(vec![first_three_valid, cont3])) + } else { + None + }; + + // Apply "not consumed" constraint to UTF-8 starts + let utf8_2_start = match (&could_start_utf8_2, &pos_consumed) { + (Some(start), Some(consumed)) => { + Some(Pred::and(vec![start.clone(), Pred::not(consumed.clone())])) + } + (Some(start), None) => Some(start.clone()), + (None, _) => None, + }; + + let utf8_3_start = match (&could_start_utf8_3, &pos_consumed) { + (Some(start), Some(consumed)) => { + Some(Pred::and(vec![start.clone(), Pred::not(consumed.clone())])) + } + (Some(start), None) => Some(start.clone()), + (None, _) => None, + }; + + let utf8_4_start = match (&could_start_utf8_4, &pos_consumed) { + (Some(start), Some(consumed)) => { + Some(Pred::and(vec![start.clone(), Pred::not(consumed.clone())])) + } + (Some(start), None) => Some(start.clone()), + (None, _) => None, + }; + + starts_utf8_2.push(utf8_2_start.clone()); + starts_utf8_3.push(utf8_3_start.clone()); + starts_utf8_4.push(utf8_4_start.clone()); + + // Validate this position + let is_ctrl = data[i].is_ctrl(); + let is_not_ctrl = Pred::not(is_ctrl); + let is_quote = data[i].is_quote(); + let is_not_quote = Pred::not(is_quote); + let is_not_escape = Pred::not(is_escape); + + // Quote is valid if: not a quote, OR preceded by an escape start + let quote_ok = if i >= 1 { + match &is_escape_start[i - 1] { + Some(preceded_by_escape) => { + Pred::or(vec![is_not_quote, preceded_by_escape.clone()]) + } + None => is_not_quote, + } + } else { + is_not_quote + }; + + // Escape is valid if it starts a valid escape sequence + let escape_ok = match &starts_escape { + Some(starts) => Pred::or(vec![is_not_escape, starts.clone()]), + None => is_not_escape, + }; + + // UTF-8 validation + let is_2byte_start_byte = data[i].is_2byte_start(); + let 
is_3byte_start_byte = data[i].is_3byte_start(); + let is_4byte_start_byte = data[i].is_4byte_start(); + let is_continuation_byte = data[i].is_continuation(); + + // If it's a multi-byte start, it must start a valid sequence + let is_any_multibyte_start = Pred::or(vec![ + is_2byte_start_byte, + is_3byte_start_byte, + is_4byte_start_byte, + ]); + + let starts_valid_multibyte = + or_opts([utf8_2_start, utf8_3_start, utf8_4_start].into_iter()); + + // multibyte_ok: not a multibyte start, OR starts a valid sequence + let not_multibyte_start = Pred::not(is_any_multibyte_start); + let multibyte_ok = match starts_valid_multibyte { + Some(valid) => Pred::or(vec![not_multibyte_start, valid]), + None => not_multibyte_start, + }; + + // Continuation bytes are only valid if consumed by a previous sequence + let not_continuation = Pred::not(is_continuation_byte); + let continuation_ok = match &pos_consumed { + Some(consumed) => Pred::or(vec![not_continuation, consumed.clone()]), + None => not_continuation, + }; + + // Invalid bytes: 0xC0, 0xC1, 0xF5-0xFF + let is_c0 = eq(positions[i], 0xC0u8); + let is_c1 = eq(positions[i], 0xC1u8); + let gte_f5 = gte(positions[i], 0xF5u8); + let is_invalid_byte = Pred::or(vec![is_c0, is_c1, gte_f5]); + let not_invalid_byte = Pred::not(is_invalid_byte); + + // Combine all UTF-8 checks + let utf8_ok = Pred::and(vec![multibyte_ok, continuation_ok, not_invalid_byte]); + + // When not consumed: must pass ctrl, quote, escape, and UTF-8 checks + let valid_when_not_consumed = Pred::and(vec![is_not_ctrl, quote_ok, escape_ok, utf8_ok]); + + // Position is valid if: consumed OR (not consumed AND passes all checks) + let pos_valid = match pos_consumed { + Some(consumed) => { + let is_not_consumed = Pred::not(consumed.clone()); + let not_consumed_and_valid = + Pred::and(vec![is_not_consumed, valid_when_not_consumed]); + Pred::or(vec![consumed, not_consumed_and_valid]) + } + None => { + // Position can't be consumed, so it must pass all checks + 
valid_when_not_consumed
+            }
+        };
+
+        is_valid.push(pos_valid);
+    }
+
+    Pred::and(is_valid)
+}
+
+struct ByteData {
+    pos: usize,
+    is_ctrl: Option<Pred>,
+    is_quote: Option<Pred>,
+    is_escape: Option<Pred>,
+    is_unicode: Option<Pred>,
+    is_valid_escape: Option<Pred>,
+    is_hex: Option<Pred>,
+    // UTF-8 byte classification
+    is_continuation: Option<Pred>,
+    is_2byte_start: Option<Pred>,
+    is_3byte_start: Option<Pred>,
+    is_4byte_start: Option<Pred>,
+    // Special UTF-8 cases
+    is_e0: Option<Pred>,
+    is_ed: Option<Pred>,
+    is_f0: Option<Pred>,
+    is_f4: Option<Pred>,
+    // Continuation byte sub-ranges
+    is_cont_80_8f: Option<Pred>,
+    is_cont_80_9f: Option<Pred>,
+    is_cont_90_bf: Option<Pred>,
+    is_cont_a0_bf: Option<Pred>,
+}
+
+impl ByteData {
+    fn new(pos: usize) -> Self {
+        Self {
+            pos,
+            is_ctrl: None,
+            is_quote: None,
+            is_escape: None,
+            is_unicode: None,
+            is_valid_escape: None,
+            is_hex: None,
+            is_continuation: None,
+            is_2byte_start: None,
+            is_3byte_start: None,
+            is_4byte_start: None,
+            is_e0: None,
+            is_ed: None,
+            is_f0: None,
+            is_f4: None,
+            is_cont_80_8f: None,
+            is_cont_80_9f: None,
+            is_cont_90_bf: None,
+            is_cont_a0_bf: None,
+        }
+    }
+
+    fn is_ctrl(&mut self) -> Pred {
+        self.is_ctrl
+            .get_or_insert_with(|| lte(self.pos, 0x1Fu8))
+            .clone()
+    }
+
+    fn is_quote(&mut self) -> Pred {
+        self.is_quote
+            .get_or_insert_with(|| eq(self.pos, b'"'))
+            .clone()
+    }
+
+    fn is_escape(&mut self) -> Pred {
+        self.is_escape
+            .get_or_insert_with(|| eq(self.pos, b'\\'))
+            .clone()
+    }
+
+    fn is_unicode_escape_suffix(&mut self) -> Pred {
+        self.is_unicode
+            .get_or_insert_with(|| eq(self.pos, b'u'))
+            .clone()
+    }
+
+    fn is_escape_suffix(&mut self) -> Pred {
+        self.is_valid_escape
+            .get_or_insert_with(|| {
+                // Valid escape suffixes: " / \ b f n r t (unicode 'u' handled separately)
+                let chars = [b'"', b'/', b'\\', b'b', b'f', b'n', b'r', b't'];
+                let atoms: Vec<Pred> = chars.iter().map(|c| eq(self.pos, *c)).collect();
+                Pred::or(atoms)
+            })
+            .clone()
+    }
+
+    fn is_hex(&mut self) -> Pred {
+        self.is_hex
+            .get_or_insert_with(|| {
+                // 0-9, a-f, A-F
+                let is_digit = Pred::and(vec![gte(self.pos, b'0'), lte(self.pos, b'9')]);
+                let is_lower = Pred::and(vec![gte(self.pos, b'a'), lte(self.pos, b'f')]);
+                let is_upper = Pred::and(vec![gte(self.pos, b'A'), lte(self.pos, b'F')]);
+                Pred::or(vec![is_digit, is_lower, is_upper])
+            })
+            .clone()
+    }
+
+    // UTF-8 classification methods
+
+    fn is_continuation(&mut self) -> Pred {
+        self.is_continuation
+            .get_or_insert_with(|| {
+                // 0x80-0xBF (10xxxxxx)
+                Pred::and(vec![gte(self.pos, 0x80u8), lte(self.pos, 0xBFu8)])
+            })
+            .clone()
+    }
+
+    fn is_2byte_start(&mut self) -> Pred {
+        self.is_2byte_start
+            .get_or_insert_with(|| {
+                // 0xC2-0xDF (excludes 0xC0-0xC1 which are overlong)
+                Pred::and(vec![gte(self.pos, 0xC2u8), lte(self.pos, 0xDFu8)])
+            })
+            .clone()
+    }
+
+    fn is_3byte_start(&mut self) -> Pred {
+        self.is_3byte_start
+            .get_or_insert_with(|| {
+                // 0xE0-0xEF
+                Pred::and(vec![gte(self.pos, 0xE0u8), lte(self.pos, 0xEFu8)])
+            })
+            .clone()
+    }
+
+    fn is_4byte_start(&mut self) -> Pred {
+        self.is_4byte_start
+            .get_or_insert_with(|| {
+                // 0xF0-0xF4 (excludes 0xF5+ which encode > U+10FFFF)
+                Pred::and(vec![gte(self.pos, 0xF0u8), lte(self.pos, 0xF4u8)])
+            })
+            .clone()
+    }
+
+    fn is_e0(&mut self) -> Pred {
+        self.is_e0
+            .get_or_insert_with(|| eq(self.pos, 0xE0u8))
+            .clone()
+    }
+
+    fn is_ed(&mut self) -> Pred {
+        self.is_ed
+            .get_or_insert_with(|| eq(self.pos, 0xEDu8))
+            .clone()
+    }
+
+    fn is_f0(&mut self) -> Pred {
+        self.is_f0
+            .get_or_insert_with(|| eq(self.pos, 0xF0u8))
+            .clone()
+    }
+
+    fn is_f4(&mut self) -> Pred {
+        self.is_f4
+            .get_or_insert_with(|| eq(self.pos, 0xF4u8))
+            .clone()
+    }
+
+    fn is_cont_80_8f(&mut self) -> Pred {
+        self.is_cont_80_8f
+            .get_or_insert_with(|| Pred::and(vec![gte(self.pos, 0x80u8), lte(self.pos, 0x8Fu8)]))
+            .clone()
+    }
+
+    fn is_cont_80_9f(&mut self) -> Pred {
+        self.is_cont_80_9f
+            .get_or_insert_with(|| Pred::and(vec![gte(self.pos, 0x80u8), lte(self.pos, 0x9Fu8)]))
+            .clone()
+    }
+
+    fn is_cont_90_bf(&mut self) -> Pred {
+        self.is_cont_90_bf
+            .get_or_insert_with(|| Pred::and(vec![gte(self.pos, 0x90u8), lte(self.pos, 0xBFu8)]))
+            .clone()
+    }
+
+    fn is_cont_a0_bf(&mut self) -> Pred {
+        self.is_cont_a0_bf
+            .get_or_insert_with(|| Pred::and(vec![gte(self.pos, 0xA0u8), lte(self.pos, 0xBFu8)]))
+            .clone()
+    }
+}
+
+/// Helper to build OR from a collection of optional predicates.
+/// Returns None if no predicates are present (represents "always false" /
+/// impossible).
+fn or_opts(preds: impl IntoIterator<Item = Option<Pred>>) -> Option<Pred> {
+    let collected: Vec<Pred> = preds.into_iter().flatten().collect();
+    if collected.is_empty() {
+        None
+    } else {
+        Some(Pred::or(collected))
+    }
+}
+
+/// Helper to build AND from a collection of optional predicates.
+/// Returns None if any predicate is None (since AND with false = false).
+/// Returns the conjunction if all are present.
+fn and_opts(preds: impl IntoIterator<Item = Option<Pred>>) -> Option<Pred> {
+    let collected: Vec<Pred> = preds.into_iter().collect::<Option<Vec<_>>>()?;
+    if collected.is_empty() {
+        // AND of nothing = true, but we represent "no constraint" as None
+        // In practice this shouldn't happen in our usage
+        None
+    } else {
+        Some(Pred::and(collected))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::compiler::Compiler;
+    use mpz_circuits::evaluate;
+    use serde_json::value::RawValue;
+    use std::{fs, path::PathBuf};
+
+    #[test]
+    fn test_validate_string() {
+        const LENGTH: usize = 1024;
+
+        let pred = validate_string(RangeSet::from(0..LENGTH));
+
+        println!("done building predicate");
+
+        let circ = Compiler::new().compile(&pred);
+
+        println!(
+            "JSON string length: {:?}; circuit AND gate count {:?}",
+            LENGTH,
+            circ.and_count()
+        );
+    }
+
+    #[test]
+    fn test_validate_number_and_gates() {
+        const LENGTH: usize = 20;
+
+        let pred = validate_number(RangeSet::from(0..LENGTH));
+
+        println!("done building predicate");
+
+        let circ = Compiler::new().compile(&pred);
+
+        println!(
+            "JSON number length: {:?}; circuit AND gate count {:?}",
+            LENGTH,
+            circ.and_count()
+        );
+    }
+
+
#[test]
+    fn test_json_test_suite_pass() {
+        let folder = PathBuf::from("tests/json_test_suite_pass");
+
+        for entry in fs::read_dir(&folder).expect("Failed to read JSON directory") {
+            let entry = entry.expect("Invalid dir entry");
+            let path = entry.path();
+
+            if path.extension().and_then(|s| s.to_str()) != Some("json") {
+                continue;
+            }
+
+            let data =
+                fs::read_to_string(&path).unwrap_or_else(|_| panic!("Failed to read {:?}", path));
+
+            let parsed: Vec<Box<RawValue>> = serde_json::from_str(&data).unwrap();
+            let raw_element = parsed[0].get();
+            let inner = &raw_element[1..raw_element.len() - 1];
+
+            if inner.is_empty() {
+                // Empty string is vacuously valid
+                continue;
+            }
+
+            let pred = validate_string(RangeSet::from(0..inner.len()));
+            let circ = Compiler::new().compile(&pred);
+
+            let out: bool = evaluate!(circ, inner.as_bytes()).unwrap();
+            assert_eq!(out, true, "Failed for {:?}", path);
+        }
+    }
+
+    #[test]
+    fn test_json_test_suite_fail() {
+        let folder = PathBuf::from("tests/json_test_suite_fail");
+
+        for entry in fs::read_dir(&folder).expect("Failed to read JSON directory") {
+            let entry = entry.expect("Invalid dir entry");
+            let path = entry.path();
+
+            if path.extension().and_then(|s| s.to_str()) != Some("json") {
+                continue;
+            }
+
+            let raw = match fs::read_to_string(&path) {
+                Ok(raw) => {
+                    let trimmed = raw.trim();
+                    assert!(trimmed.starts_with('[') && trimmed.ends_with(']'));
+                    let inside = trimmed[1..trimmed.len() - 1].trim();
+                    assert!(inside.starts_with('"') && inside.ends_with('"'));
+                    let inner_raw = &inside[1..inside.len() - 1];
+                    inner_raw.as_bytes().to_vec()
+                }
+                Err(_) => {
+                    let bytes: Vec<u8> = fs::read(&path).expect("failed to read file");
+                    bytes[2..bytes.len() - 2].to_vec()
+                }
+            };
+
+            if raw.is_empty() {
+                continue;
+            }
+
+            let pred = validate_string(RangeSet::from(0..raw.len()));
+            let circ = Compiler::new().compile(&pred);
+
+            let out: bool = evaluate!(circ, raw).unwrap();
+            assert_eq!(out, false, "Failed for {:?}", path);
+        }
+    }
+
+    #[test]
+    fn test_invalid_utf8() {
+        let invalid_cases: Vec<(&str, Vec<u8>)> = vec![
+            // Lone continuation bytes
+            ("lone continuation byte 0x80", vec![0x80]),
+            ("lone continuation byte 0xBF", vec![0xBF]),
+            ("lone continuation byte in middle", vec![b'a', 0x80, b'b']),
+            // Overlong encodings
+            ("overlong 0xC0 0x80", vec![0xC0, 0x80]),
+            ("overlong 0xC1 0xBF", vec![0xC1, 0xBF]),
+            // Incomplete sequences
+            ("incomplete 2-byte at end", vec![0xC2]),
+            ("incomplete 2-byte followed by ASCII", vec![0xC2, b'a']),
+            ("incomplete 3-byte missing 2 bytes", vec![0xE0]),
+            ("incomplete 3-byte missing 1 byte", vec![0xE0, 0xA0]),
+            ("incomplete 4-byte missing 3 bytes", vec![0xF0]),
+            ("incomplete 4-byte missing 2 bytes", vec![0xF0, 0x90]),
+            ("incomplete 4-byte missing 1 byte", vec![0xF0, 0x90, 0x80]),
+            // E0 overlong
+            ("E0 overlong second byte 0x80", vec![0xE0, 0x80, 0x80]),
+            ("E0 overlong second byte 0x9F", vec![0xE0, 0x9F, 0x80]),
+            // ED surrogate
+            ("ED surrogate second byte 0xA0", vec![0xED, 0xA0, 0x80]),
+            ("ED surrogate second byte 0xBF", vec![0xED, 0xBF, 0x80]),
+            // F0 overlong
+            ("F0 overlong second byte 0x80", vec![0xF0, 0x80, 0x80, 0x80]),
+            ("F0 overlong second byte 0x8F", vec![0xF0, 0x8F, 0x80, 0x80]),
+            // F4 out of range
+            (
+                "F4 out of range second byte 0x90",
+                vec![0xF4, 0x90, 0x80, 0x80],
+            ),
+            (
+                "F4 out of range second byte 0xBF",
+                vec![0xF4, 0xBF, 0x80, 0x80],
+            ),
+            // Invalid start bytes
+            ("invalid start byte 0xF5", vec![0xF5, 0x80, 0x80, 0x80]),
+            ("invalid start byte 0xFF", vec![0xFF]),
+            // Mixed
+            (
+                "valid then invalid continuation",
+                vec![b'h', b'e', b'l', b'l', b'o', 0x80],
+            ),
+            (
+                "valid 2-byte then lone continuation",
+                vec![0xC2, 0x80, 0x80],
+            ),
+        ];
+
+        for (name, bytes) in invalid_cases {
+            if bytes.is_empty() {
+                continue;
+            }
+
+            let pred = validate_string(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, &bytes).unwrap();
+
+            assert_eq!(
+                out, false,
+                "Expected invalid UTF-8 for case '{}': {:02X?}",
+                name, bytes
+            );
+        }
+    }
+
+    #[test]
+    fn test_valid_utf8() {
+        let valid_cases: Vec<(&str, Vec<u8>)> = vec![
+            // ASCII
+            ("simple ASCII", b"hello world".to_vec()),
+            // 2-byte sequences
+            ("2-byte U+0080", vec![0xC2, 0x80]),
+            ("2-byte U+07FF", vec![0xDF, 0xBF]),
+            // 3-byte sequences
+            ("3-byte U+0800", vec![0xE0, 0xA0, 0x80]),
+            ("3-byte U+D7FF", vec![0xED, 0x9F, 0xBF]),
+            ("3-byte U+E000", vec![0xEE, 0x80, 0x80]),
+            ("3-byte U+FFFF", vec![0xEF, 0xBF, 0xBF]),
+            // 4-byte sequences
+            ("4-byte U+10000", vec![0xF0, 0x90, 0x80, 0x80]),
+            ("4-byte U+10FFFF", vec![0xF4, 0x8F, 0xBF, 0xBF]),
+            // Mixed
+            ("mixed ASCII and 2-byte", vec![b'a', 0xC2, 0x80, b'b']),
+            ("euro sign", vec![0xE2, 0x82, 0xAC]),
+            ("emoji", vec![0xF0, 0x9F, 0x98, 0x80]),
+        ];
+
+        for (name, bytes) in valid_cases {
+            if bytes.is_empty() {
+                continue;
+            }
+
+            let pred = validate_string(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, &bytes).unwrap();
+
+            assert_eq!(
+                out, true,
+                "Expected valid UTF-8 for case '{}': {:02X?}",
+                name, bytes
+            );
+        }
+    }
+
+    #[test]
+    fn test_validate_number_valid() {
+        let valid_cases = vec![
+            // Integers
+            "0",
+            "1",
+            "9",
+            "10",
+            "123",
+            "999999",
+            // Negative integers
+            "-0",
+            "-1",
+            "-123",
+            // Decimals
+            "0.0",
+            "0.1",
+            "0.123",
+            "1.0",
+            "1.23",
+            "123.456",
+            "-0.0",
+            "-1.23",
+            // Exponents
+            "1e0",
+            "1e1",
+            "1e10",
+            "1E0",
+            "1E10",
+            "1e+0",
+            "1e-0",
+            "1e+10",
+            "1e-10",
+            "0e0",
+            "-1e0",
+            // Decimals with exponents
+            "1.0e0",
+            "1.23e4",
+            "1.23e+4",
+            "1.23e-4",
+            "1.23E4",
+            "-1.23e-4",
+            // Edge cases
+            "0.0e0",
+            "123456789",
+        ];
+
+        for input in valid_cases {
+            let bytes = input.as_bytes();
+            let pred = validate_number(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, bytes).unwrap();
+
+            assert!(out, "Expected valid number for '{}'", input);
+        }
+    }
+
+    #[test]
+    fn test_validate_number_invalid() {
+        let invalid_cases = vec![
+            // Leading zeros (invalid in JSON)
+            "01", "00", "007", "-01", // Missing digits
+            ".", "-", "e", "E", ".1", // no leading digit
+            "1.", // trailing dot
+            "1e", // trailing e
+            "1e+", // trailing sign
+            "1e-", // trailing sign
+            "-.", // minus then dot
+            // Invalid characters
+            "+1", // leading plus (not allowed in JSON)
+            "1a", "1.2.3", "1ee1", "1e1e1", // Spaces (not part of number)
+            " 1", "1 ",
+            // Empty
+            // "", // can't test empty - asserts
+        ];
+
+        for input in invalid_cases {
+            let bytes = input.as_bytes();
+            if bytes.is_empty() {
+                continue;
+            }
+            let pred = validate_number(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, bytes).unwrap();
+
+            assert!(!out, "Expected invalid number for '{}'", input);
+        }
+    }
+
+    #[test]
+    fn test_validate_integer_valid() {
+        let valid_cases = vec![
+            "0",
+            "1",
+            "9",
+            "00",
+            "01",
+            "10",
+            "42",
+            "99",
+            "123",
+            "999",
+            "0000",
+            "1234",
+            "9999",
+            "123456789",
+            "00000000000",
+        ];
+
+        for input in valid_cases {
+            let bytes = input.as_bytes();
+            let pred = validate_integer(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, bytes).unwrap();
+
+            assert!(out, "Expected valid integer for '{}'", input);
+        }
+    }
+
+    #[test]
+    fn test_validate_integer_invalid() {
+        let invalid_cases = vec![
+            // Non-digit characters
+            "-1", "+1", "1.0", "1e0", "a", "1a", "a1", " 1", "1 ", ".", "-", "+", "hello", "12 34",
+            "12.34", "0x10",
+        ];
+
+        for input in invalid_cases {
+            let bytes = input.as_bytes();
+            if bytes.is_empty() {
+                continue;
+            }
+            let pred = validate_integer(RangeSet::from(0..bytes.len()));
+            let circ = Compiler::new().compile(&pred);
+            let out: bool = evaluate!(circ, bytes).unwrap();
+
+            assert!(!out, "Expected invalid integer for '{}'", input);
+        }
+    }
+}
diff --git a/crates/predicate/src/lib.rs b/crates/predicate/src/lib.rs
new
file mode 100644
index 00000000..a11dd8fe
--- /dev/null
+++ b/crates/predicate/src/lib.rs
@@ -0,0 +1,445 @@
+//! Boolean predicates over byte data.
+//!
+//! This module provides types for building and evaluating boolean predicates
+//! that operate on byte arrays. Predicates can be:
+//!
+//! - Compiled into boolean circuits (see [`compiler::Compiler`])
+//! - Evaluated directly via the crate-internal `eval_pred`
+//!
+//! # Building predicates
+//!
+//! Use helper functions to create atomic comparisons, and combine them with
+//! [`Pred::and`], [`Pred::or`], and [`Pred::not`]:
+//!
+//! ```ignore
+//! use mpz_predicate::{Pred, lt, eq, gte};
+//!
+//! // data[0] < data[1] AND data[2] == 42
+//! let pred = Pred::and(vec![
+//!     lt(0, 1usize), // data[0] < data[1] (compare two indices)
+//!     eq(2, 42u8),   // data[2] == 42 (compare index to constant)
+//! ]);
+//!
+//! // NOT (data[3] >= 0x80)
+//! let ascii_check = Pred::not(gte(3, 0x80u8));
+//! ```
+//!
+//! # Comparison helpers
+//!
+//! Each helper takes an index and an operand. The operand can be:
+//! - `usize` - compare with byte at another index
+//! - `u8` - compare with a constant value
+//!
+//! Available comparisons:
+//! - [`eq`] - equal (`==`)
+//! - [`ne`] - not equal (`!=`)
+//! - [`lt`] - less than (`<`)
+//! - [`lte`] - less than or equal (`<=`)
+//! - [`gt`] - greater than (`>`)
+//! - [`gte`] - greater than or equal (`>=`)
+
+pub mod compiler;
+pub mod http;
+pub mod json;
+
+use std::{collections::BTreeSet, fmt};
+
+use serde::{Deserialize, Serialize};
+use std::rc::Rc;
+
+/// Predicate handle with structural sharing.
+///
+/// Predicates are reference-counted to allow efficient sharing of common
+/// subexpressions. Use [`Pred::and`], [`Pred::or`], [`Pred::not`] to combine
+/// predicates, and helper functions like [`eq`], [`lt`], etc. to create
+/// atomic comparisons.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Pred(Rc<PredNode>);
+
+/// Internal predicate node representation.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub(crate) enum PredNode {
+    And(Vec<Pred>),
+    Or(Vec<Pred>),
+    Not(Pred),
+    Atom(Atom),
+}
+
+impl Pred {
+    /// Creates a conjunction (AND) of predicates.
+    ///
+    /// Returns true if all children are true.
+    pub fn and(children: Vec<Pred>) -> Pred {
+        Pred(Rc::new(PredNode::And(children)))
+    }
+
+    /// Creates a disjunction (OR) of predicates.
+    ///
+    /// Returns true if any child is true.
+    pub fn or(children: Vec<Pred>) -> Pred {
+        Pred(Rc::new(PredNode::Or(children)))
+    }
+
+    /// Creates a negation (NOT) of a predicate.
+    pub fn not(child: Pred) -> Pred {
+        Pred(Rc::new(PredNode::Not(child)))
+    }
+
+    /// Creates an atomic predicate from a comparison.
+    ///
+    /// Prefer using helper functions like [`eq`], [`lt`], etc. instead.
+    pub(crate) fn atom(atom: Atom) -> Pred {
+        Pred(Rc::new(PredNode::Atom(atom)))
+    }
+
+    /// Returns sorted unique byte indices referenced by this predicate.
+    pub fn indices(&self) -> Vec<usize> {
+        let mut collected = BTreeSet::new();
+        let mut visited = std::collections::HashSet::new();
+        collect_indices(self, &mut collected, &mut visited);
+        collected.into_iter().collect()
+    }
+
+    /// Returns the raw pointer address for use as a cache key.
+    pub(crate) fn ptr_key(&self) -> usize {
+        Rc::as_ptr(&self.0) as usize
+    }
+}
+
+// ============================================================================
+// Comparison helper functions
+// ============================================================================
+
+/// Creates an equality predicate: `data[index] == rhs`.
+pub fn eq(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Eq,
+        rhs: rhs.into().0,
+    })
+}
+
+/// Creates a not-equal predicate: `data[index] != rhs`.
+pub fn ne(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Ne,
+        rhs: rhs.into().0,
+    })
+}
+
+/// Creates a less-than predicate: `data[index] < rhs`.
+pub fn lt(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Lt,
+        rhs: rhs.into().0,
+    })
+}
+
+/// Creates a less-than-or-equal predicate: `data[index] <= rhs`.
+pub fn lte(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Lte,
+        rhs: rhs.into().0,
+    })
+}
+
+/// Creates a greater-than predicate: `data[index] > rhs`.
+pub fn gt(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Gt,
+        rhs: rhs.into().0,
+    })
+}
+
+/// Creates a greater-than-or-equal predicate: `data[index] >= rhs`.
+pub fn gte(index: usize, rhs: impl Into<Operand>) -> Pred {
+    Pred::atom(Atom {
+        index,
+        op: CmpOp::Gte,
+        rhs: rhs.into().0,
+    })
+}
+
+// ============================================================================
+// Operand type for ergonomic API
+// ============================================================================
+
+/// Wrapper for comparison operand, enabling ergonomic API.
+///
+/// This type is not meant to be used directly. Instead, pass `usize` (for
+/// index comparison) or `u8` (for constant comparison) to helper functions.
+pub struct Operand(Rhs);
+
+impl From<usize> for Operand {
+    /// Compare with byte at another index.
+    fn from(idx: usize) -> Self {
+        Operand(Rhs::Idx(idx))
+    }
+}
+
+impl From<u8> for Operand {
+    /// Compare with a constant byte value.
+    fn from(val: u8) -> Self {
+        Operand(Rhs::Const(val))
+    }
+}
+
+impl Pred {
+    /// Returns a reference to the inner node (internal use only).
+    pub(crate) fn inner(&self) -> &PredNode {
+        &self.0
+    }
+}
+
+impl fmt::Display for Pred {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.fmt_with_indent(f, 0)
+    }
+}
+
+impl Pred {
+    fn fmt_with_indent(&self, f: &mut fmt::Formatter<'_>, indent: usize) -> fmt::Result {
+        fn pad(f: &mut fmt::Formatter<'_>, indent: usize) -> fmt::Result {
+            write!(f, "{:indent$}", "", indent = indent * 2)
+        }
+
+        match self.inner() {
+            PredNode::And(preds) => {
+                pad(f, indent)?;
+                writeln!(f, "And(")?;
+                for p in preds {
+                    p.fmt_with_indent(f, indent + 1)?;
+                }
+                pad(f, indent)?;
+                writeln!(f, ")")
+            }
+            PredNode::Or(preds) => {
+                pad(f, indent)?;
+                writeln!(f, "Or(")?;
+                for p in preds {
+                    p.fmt_with_indent(f, indent + 1)?;
+                }
+                pad(f, indent)?;
+                writeln!(f, ")")
+            }
+            PredNode::Not(p) => {
+                pad(f, indent)?;
+                writeln!(f, "Not(")?;
+                p.fmt_with_indent(f, indent + 1)?;
+                pad(f, indent)?;
+                writeln!(f, ")")
+            }
+            PredNode::Atom(a) => {
+                pad(f, indent)?;
+                writeln!(f, "Atom({:?})", a)
+            }
+        }
+    }
+}
+
+/// Atomic predicate of the form: `data[index] op rhs`.
+///
+/// This is an internal type. Use helper functions like [`eq`], [`lt`], etc.
+/// to create predicates.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub(crate) struct Atom {
+    pub(crate) index: usize,
+    pub(crate) op: CmpOp,
+    pub(crate) rhs: Rhs,
+}
+
+/// Comparison operator for atomic predicates.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub(crate) enum CmpOp {
+    Eq,
+    Ne,
+    Gt,
+    Gte,
+    Lt,
+    Lte,
+}
+
+/// Right-hand side of a comparison (internal).
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
+pub(crate) enum Rhs {
+    /// Byte at index.
+    Idx(usize),
+    /// Literal constant.
+    Const(u8),
+}
+
+/// Evaluates the predicate on the input `data`.
+pub(crate) fn eval_pred(pred: &Pred, data: &[u8]) -> bool {
+    match pred.inner() {
+        PredNode::And(vec) => vec.iter().all(|p| eval_pred(p, data)),
+        PredNode::Or(vec) => vec.iter().any(|p| eval_pred(p, data)),
+        PredNode::Not(p) => !eval_pred(p, data),
+        PredNode::Atom(atom) => {
+            let lhs = data[atom.index];
+            let rhs = match atom.rhs {
+                Rhs::Const(c) => c,
+                Rhs::Idx(s) => data[s],
+            };
+            match atom.op {
+                CmpOp::Eq => lhs == rhs,
+                CmpOp::Ne => lhs != rhs,
+                CmpOp::Lt => lhs < rhs,
+                CmpOp::Lte => lhs <= rhs,
+                CmpOp::Gt => lhs > rhs,
+                CmpOp::Gte => lhs >= rhs,
+            }
+        }
+    }
+}
+
+/// Recursively collects byte indices from atoms in the predicate tree.
+///
+/// # Arguments
+/// * `collected` - accumulates the byte indices found in atoms
+/// * `visited` - tracks visited predicate nodes (by pointer) to avoid redundant
+///   traversal
+fn collect_indices(
+    pred: &Pred,
+    collected: &mut BTreeSet<usize>,
+    visited: &mut std::collections::HashSet<usize>,
+) {
+    let key = pred.ptr_key();
+    if !visited.insert(key) {
+        return;
+    }
+
+    match pred.inner() {
+        PredNode::And(vec) | PredNode::Or(vec) => {
+            for p in vec {
+                collect_indices(p, collected, visited);
+            }
+        }
+        PredNode::Not(p) => collect_indices(p, collected, visited),
+        PredNode::Atom(atom) => {
+            collected.insert(atom.index);
+            if let Rhs::Idx(idx) = atom.rhs {
+                collected.insert(idx);
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_and() {
+        // data[0] < data[2] AND data[1] == 2
+        let pred = Pred::and(vec![lt(0, 2usize), eq(1, 2u8)]);
+
+        assert_eq!(eval_pred(&pred, &[1u8, 2, 3]), true);
+        assert_eq!(eval_pred(&pred, &[1u8, 3, 3]), false);
+    }
+
+    #[test]
+    fn test_or() {
+        // data[0] < data[2] OR data[1] == 2
+        let pred = Pred::or(vec![lt(0, 2usize), eq(1, 2u8)]);
+
+        assert_eq!(eval_pred(&pred, &[1u8, 0, 3]), true); // first condition true
+        assert_eq!(eval_pred(&pred, &[1u8, 3, 0]), false); // both false
+    }
+
+    #[test]
+    fn test_not() {
+        // NOT (data[0] < data[1])
+        let pred = Pred::not(lt(0, 1usize));
+
+        assert_eq!(eval_pred(&pred, &[5u8, 3]), true); // 5 < 3 is false, NOT false = true
+        assert_eq!(eval_pred(&pred, &[1u8, 3]), false); // 1 < 3 is true, NOT
+                                                        // true = false
+    }
+
+    #[test]
+    fn test_rhs_const() {
+        // data[0] < 22
+        let pred = lt(0, 22u8);
+
+        assert_eq!(eval_pred(&pred, &[5u8]), true);
+        assert_eq!(eval_pred(&pred, &[23u8]), false);
+    }
+
+    #[test]
+    fn test_rhs_idx() {
+        // data[0] < data[1]
+        let pred = lt(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[5u8, 10u8]), true);
+        assert_eq!(eval_pred(&pred, &[23u8, 5u8]), false);
+    }
+
+    #[test]
+    fn test_same_idx() {
+        // data[0] == data[0] (always true)
+        let pred1 = eq(0, 0usize);
+        // data[0] < data[0] (always false)
+        let pred2 = lt(0, 0usize);
+
+        assert_eq!(eval_pred(&pred1, &[5u8]), true);
+        assert_eq!(eval_pred(&pred2, &[5u8]), false);
+    }
+
+    #[test]
+    fn test_eq() {
+        // data[0] == data[1]
+        let pred = eq(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[5u8, 5]), true);
+        assert_eq!(eval_pred(&pred, &[1u8, 3]), false);
+    }
+
+    #[test]
+    fn test_ne() {
+        // data[0] != data[1]
+        let pred = ne(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[5u8, 6]), true);
+        assert_eq!(eval_pred(&pred, &[1u8, 1]), false);
+    }
+
+    #[test]
+    fn test_gt() {
+        // data[0] > data[1]
+        let pred = gt(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[7u8, 6]), true);
+        assert_eq!(eval_pred(&pred, &[1u8, 1]), false);
+    }
+
+    #[test]
+    fn test_gte() {
+        // data[0] >= data[1]
+        let pred = gte(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[7u8, 7]), true);
+        assert_eq!(eval_pred(&pred, &[0u8, 1]), false);
+    }
+
+    #[test]
+    fn test_lt() {
+        // data[0] < data[1]
+        let pred = lt(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[2u8, 7]), true);
+        assert_eq!(eval_pred(&pred, &[4u8, 1]), false);
+    }
+
+    #[test]
+    fn test_lte() {
+        // data[0] <= data[1]
+        let pred = lte(0, 1usize);
+
+        assert_eq!(eval_pred(&pred, &[2u8, 2]), true);
+        assert_eq!(eval_pred(&pred, &[4u8, 1]), false);
+    }
+}
diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape.json b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape.json new file mode 100644 index 00000000..acec66d8 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape.json @@ -0,0 +1 @@ +["\uD800\"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u.json b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u.json new file mode 100644 index 00000000..e834b05e --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u.json @@ -0,0 +1 @@ +["\uD800\u"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1.json b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1.json new file mode 100644 index 00000000..a04cd348 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1.json @@ -0,0 +1 @@ +["\uD800\u1"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1x.json b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1x.json new file mode 100644 index 00000000..bfbd2340 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_1_surrogate_then_escape_u1x.json @@ -0,0 +1 @@ +["\uD800\u1x"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_backslash_00.json b/crates/predicate/tests/json_test_suite_fail/n_string_backslash_00.json new file mode 100644 index 00000000..b5bf267b Binary files /dev/null and b/crates/predicate/tests/json_test_suite_fail/n_string_backslash_00.json differ diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_escape_x.json 
b/crates/predicate/tests/json_test_suite_fail/n_string_escape_x.json new file mode 100644 index 00000000..fae29193 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_escape_x.json @@ -0,0 +1 @@ +["\x00"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_escaped_backslash_bad.json b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_backslash_bad.json new file mode 100755 index 00000000..016fcb47 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_backslash_bad.json @@ -0,0 +1 @@ +["\\\"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_escaped_ctrl_char_tab.json b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_ctrl_char_tab.json new file mode 100644 index 00000000..f35ea382 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_ctrl_char_tab.json @@ -0,0 +1 @@ +["\ "] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_escaped_emoji.json b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_emoji.json new file mode 100644 index 00000000..a2777542 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_escaped_emoji.json @@ -0,0 +1 @@ +["\🌀"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escape.json b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escape.json new file mode 100755 index 00000000..3415c33c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escape.json @@ -0,0 +1 @@ +["\"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escaped_character.json b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escaped_character.json new file mode 100755 index 00000000..0f2197ea --- /dev/null +++ 
b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_escaped_character.json @@ -0,0 +1 @@ +["\u00A"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate.json b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate.json new file mode 100755 index 00000000..75504a65 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate.json @@ -0,0 +1 @@ +["\uD834\uDd"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate_escape_invalid.json b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate_escape_invalid.json new file mode 100755 index 00000000..bd965606 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_incomplete_surrogate_escape_invalid.json @@ -0,0 +1 @@ +["\uD800\uD800\x"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_invalid-utf-8-in-escape.json b/crates/predicate/tests/json_test_suite_fail/n_string_invalid-utf-8-in-escape.json new file mode 100644 index 00000000..0c430064 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_invalid-utf-8-in-escape.json @@ -0,0 +1 @@ +["\u"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_invalid_backslash_esc.json b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_backslash_esc.json new file mode 100755 index 00000000..d1eb6092 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_backslash_esc.json @@ -0,0 +1 @@ +["\a"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_invalid_unicode_escape.json b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_unicode_escape.json new file mode 100644 index 00000000..7608cb6b --- /dev/null +++ 
b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_unicode_escape.json @@ -0,0 +1 @@ +["\uqqqq"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_invalid_utf8_after_escape.json b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_utf8_after_escape.json new file mode 100644 index 00000000..2f757a25 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_invalid_utf8_after_escape.json @@ -0,0 +1 @@ +["\"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_ctrl_char.json b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_ctrl_char.json new file mode 100755 index 00000000..9f213480 Binary files /dev/null and b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_ctrl_char.json differ diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_newline.json b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_newline.json new file mode 100644 index 00000000..700d3608 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_newline.json @@ -0,0 +1,2 @@ +["new +line"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_tab.json b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_tab.json new file mode 100644 index 00000000..160264a2 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_unescaped_tab.json @@ -0,0 +1 @@ +[" "] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_fail/n_string_unicode_CapitalU.json b/crates/predicate/tests/json_test_suite_fail/n_string_unicode_CapitalU.json new file mode 100644 index 00000000..3a49d412 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_fail/n_string_unicode_CapitalU.json @@ -0,0 +1,2 @@ +["\UA66D" +] \ No newline at end of file diff --git 
a/crates/predicate/tests/json_test_suite_pass/y_string_1_2_3_bytes_UTF-8_sequences.json b/crates/predicate/tests/json_test_suite_pass/y_string_1_2_3_bytes_UTF-8_sequences.json new file mode 100755 index 00000000..9967ddeb --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_1_2_3_bytes_UTF-8_sequences.json @@ -0,0 +1 @@ +["\u0060\u012a\u12AB"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pair.json b/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pair.json new file mode 100755 index 00000000..996875cc --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pair.json @@ -0,0 +1 @@ +["\uD801\udc37"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pairs.json b/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pairs.json new file mode 100755 index 00000000..3401021e --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_accepted_surrogate_pairs.json @@ -0,0 +1 @@ +["\ud83d\ude39\ud83d\udc8d"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_allowed_escapes.json b/crates/predicate/tests/json_test_suite_pass/y_string_allowed_escapes.json new file mode 100644 index 00000000..7f495532 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_allowed_escapes.json @@ -0,0 +1 @@ +["\"\\\/\b\f\n\r\t"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_backslash_and_u_escaped_zero.json b/crates/predicate/tests/json_test_suite_pass/y_string_backslash_and_u_escaped_zero.json new file mode 100755 index 00000000..d4439eda --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_backslash_and_u_escaped_zero.json @@ -0,0 +1 @@ +["\\u0000"] \ No newline at end of file diff --git 
a/crates/predicate/tests/json_test_suite_pass/y_string_backslash_doublequotes.json b/crates/predicate/tests/json_test_suite_pass/y_string_backslash_doublequotes.json new file mode 100644 index 00000000..ae03243b --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_backslash_doublequotes.json @@ -0,0 +1 @@ +["\""] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_comments.json b/crates/predicate/tests/json_test_suite_pass/y_string_comments.json new file mode 100644 index 00000000..2260c20c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_comments.json @@ -0,0 +1 @@ +["a/*b*/c/*d//e"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_a.json b/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_a.json new file mode 100644 index 00000000..6715d6f4 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_a.json @@ -0,0 +1 @@ +["\\a"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_n.json b/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_n.json new file mode 100644 index 00000000..44ca56c4 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_double_escape_n.json @@ -0,0 +1 @@ +["\\n"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_escaped_control_character.json b/crates/predicate/tests/json_test_suite_pass/y_string_escaped_control_character.json new file mode 100644 index 00000000..5b014a9c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_escaped_control_character.json @@ -0,0 +1 @@ +["\u0012"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_escaped_noncharacter.json b/crates/predicate/tests/json_test_suite_pass/y_string_escaped_noncharacter.json new file mode 100755 
index 00000000..2ff52e2c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_escaped_noncharacter.json @@ -0,0 +1 @@ +["\uFFFF"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_in_array.json b/crates/predicate/tests/json_test_suite_pass/y_string_in_array.json new file mode 100755 index 00000000..21d7ae4c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_in_array.json @@ -0,0 +1 @@ +["asd"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_in_array_with_leading_space.json b/crates/predicate/tests/json_test_suite_pass/y_string_in_array_with_leading_space.json new file mode 100755 index 00000000..9e1887c1 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_in_array_with_leading_space.json @@ -0,0 +1 @@ +[ "asd"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_last_surrogates_1_and_2.json b/crates/predicate/tests/json_test_suite_pass/y_string_last_surrogates_1_and_2.json new file mode 100644 index 00000000..3919cef7 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_last_surrogates_1_and_2.json @@ -0,0 +1 @@ +["\uDBFF\uDFFF"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_nbsp_uescaped.json b/crates/predicate/tests/json_test_suite_pass/y_string_nbsp_uescaped.json new file mode 100644 index 00000000..2085ab1a --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_nbsp_uescaped.json @@ -0,0 +1 @@ +["new\u00A0line"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+10FFFF.json b/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+10FFFF.json new file mode 100755 index 00000000..059e4d9d --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+10FFFF.json @@ -0,0 +1 
@@ +["􏿿"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+FFFF.json b/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+FFFF.json new file mode 100755 index 00000000..4c913bd4 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_nonCharacterInUTF-8_U+FFFF.json @@ -0,0 +1 @@ +["￿"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_null_escape.json b/crates/predicate/tests/json_test_suite_pass/y_string_null_escape.json new file mode 100644 index 00000000..c1ad8440 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_null_escape.json @@ -0,0 +1 @@ +["\u0000"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_one-byte-utf-8.json b/crates/predicate/tests/json_test_suite_pass/y_string_one-byte-utf-8.json new file mode 100644 index 00000000..15718592 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_one-byte-utf-8.json @@ -0,0 +1 @@ +["\u002c"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_pi.json b/crates/predicate/tests/json_test_suite_pass/y_string_pi.json new file mode 100644 index 00000000..9df11ae8 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_pi.json @@ -0,0 +1 @@ +["π"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_reservedCharacterInUTF-8_U+1BFFF.json b/crates/predicate/tests/json_test_suite_pass/y_string_reservedCharacterInUTF-8_U+1BFFF.json new file mode 100755 index 00000000..10a33a17 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_reservedCharacterInUTF-8_U+1BFFF.json @@ -0,0 +1 @@ +["𛿿"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_simple_ascii.json b/crates/predicate/tests/json_test_suite_pass/y_string_simple_ascii.json new 
file mode 100644 index 00000000..8cadf7d0 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_simple_ascii.json @@ -0,0 +1 @@ +["asd "] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json b/crates/predicate/tests/json_test_suite_pass/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json new file mode 100755 index 00000000..7620b665 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_surrogates_U+1D11E_MUSICAL_SYMBOL_G_CLEF.json @@ -0,0 +1 @@ +["\uD834\uDd1e"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_three-byte-utf-8.json b/crates/predicate/tests/json_test_suite_pass/y_string_three-byte-utf-8.json new file mode 100644 index 00000000..108f1d67 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_three-byte-utf-8.json @@ -0,0 +1 @@ +["\u0821"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_two-byte-utf-8.json b/crates/predicate/tests/json_test_suite_pass/y_string_two-byte-utf-8.json new file mode 100644 index 00000000..461503c3 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_two-byte-utf-8.json @@ -0,0 +1 @@ +["\u0123"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_u+2028_line_sep.json b/crates/predicate/tests/json_test_suite_pass/y_string_u+2028_line_sep.json new file mode 100755 index 00000000..897b6021 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_u+2028_line_sep.json @@ -0,0 +1 @@ +["
"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_u+2029_par_sep.json b/crates/predicate/tests/json_test_suite_pass/y_string_u+2029_par_sep.json new file mode 100755 index 00000000..8cd998c8 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_u+2029_par_sep.json @@ -0,0 +1 @@ +["
"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_uEscape.json b/crates/predicate/tests/json_test_suite_pass/y_string_uEscape.json new file mode 100755 index 00000000..f7b41a02 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_uEscape.json @@ -0,0 +1 @@ +["\u0061\u30af\u30EA\u30b9"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_uescaped_newline.json b/crates/predicate/tests/json_test_suite_pass/y_string_uescaped_newline.json new file mode 100644 index 00000000..3a5a220b --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_uescaped_newline.json @@ -0,0 +1 @@ +["new\u000Aline"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unescaped_char_delete.json b/crates/predicate/tests/json_test_suite_pass/y_string_unescaped_char_delete.json new file mode 100755 index 00000000..7d064f49 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unescaped_char_delete.json @@ -0,0 +1 @@ +[""] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode.json new file mode 100644 index 00000000..3598095b --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode.json @@ -0,0 +1 @@ +["\uA66D"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicodeEscapedBackslash.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicodeEscapedBackslash.json new file mode 100755 index 00000000..0bb3b51e --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicodeEscapedBackslash.json @@ -0,0 +1 @@ +["\u005C"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_2.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_2.json new file mode 
100644 index 00000000..a7dcb976 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_2.json @@ -0,0 +1 @@ +["⍂㈴⍂"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+10FFFE_nonchar.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+10FFFE_nonchar.json new file mode 100644 index 00000000..9a8370b9 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+10FFFE_nonchar.json @@ -0,0 +1 @@ +["\uDBFF\uDFFE"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+1FFFE_nonchar.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+1FFFE_nonchar.json new file mode 100644 index 00000000..c51f8ae4 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+1FFFE_nonchar.json @@ -0,0 +1 @@ +["\uD83F\uDFFE"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json new file mode 100644 index 00000000..626d5f81 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+200B_ZERO_WIDTH_SPACE.json @@ -0,0 +1 @@ +["\u200B"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+2064_invisible_plus.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+2064_invisible_plus.json new file mode 100644 index 00000000..1e23972c --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+2064_invisible_plus.json @@ -0,0 +1 @@ +["\u2064"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FDD0_nonchar.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FDD0_nonchar.json new file mode 100644 index 00000000..18ef151b --- /dev/null +++ 
b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FDD0_nonchar.json @@ -0,0 +1 @@ +["\uFDD0"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FFFE_nonchar.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FFFE_nonchar.json new file mode 100644 index 00000000..13d261fd --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_U+FFFE_nonchar.json @@ -0,0 +1 @@ +["\uFFFE"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_unicode_escaped_double_quote.json b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_escaped_double_quote.json new file mode 100755 index 00000000..4e625785 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_unicode_escaped_double_quote.json @@ -0,0 +1 @@ +["\u0022"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_utf8.json b/crates/predicate/tests/json_test_suite_pass/y_string_utf8.json new file mode 100644 index 00000000..40878435 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_utf8.json @@ -0,0 +1 @@ +["€𝄞"] \ No newline at end of file diff --git a/crates/predicate/tests/json_test_suite_pass/y_string_with_del_character.json b/crates/predicate/tests/json_test_suite_pass/y_string_with_del_character.json new file mode 100755 index 00000000..8bd24907 --- /dev/null +++ b/crates/predicate/tests/json_test_suite_pass/y_string_with_del_character.json @@ -0,0 +1 @@ +["aa"] \ No newline at end of file