Skip to content

Commit 4c55689

Browse files
authored
Merge pull request #54 from NodeDB-Lab/fix/issues-46-47-parser-planner-bugs
fix: issues #46 + #47 — parser, planner, and executor correctness bugs
2 parents a8acd1a + 96cbd80 commit 4c55689

File tree

53 files changed

+2747
-467
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+2747
-467
lines changed
Lines changed: 20 additions & 166 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616
//!
1717
//! Determinism validation: rejects `NOW()`, `RANDOM()`, `NEXTVAL()`, `UUID()`.
1818
19+
mod tokenizer;
20+
1921
use super::expr::{BinaryOp, SqlExpr};
2022
use nodedb_types::Value;
23+
use tokenizer::{Token, TokenKind, tokenize};
2124

2225
/// Parse a SQL expression string into an SqlExpr AST.
2326
///
@@ -46,160 +49,11 @@ pub fn parse_generated_expr(text: &str) -> Result<(SqlExpr, Vec<String>), String
4649
Ok((expr, deps))
4750
}
4851

49-
// ── Tokenizer ─────────────────────────────────────────────────────────

/// A single lexical token produced by [`tokenize`].
#[derive(Debug, Clone)]
struct Token {
    /// Token text. For string literals this is the *decoded* content
    /// (surrounding quotes stripped, `''` collapsed to a single `'`).
    text: String,
    kind: TokenKind,
}

/// Lexical category of a [`Token`].
#[derive(Debug, Clone, Copy, PartialEq)]
enum TokenKind {
    Ident,
    Number,
    StringLit,
    LParen,
    RParen,
    Comma,
    Op,
}

/// Split a SQL expression string into tokens.
///
/// Grammar accepted: identifiers (`[A-Za-z_][A-Za-z0-9_]*`), numbers
/// (digits with optional `.`, leading `.5` allowed), single-quoted string
/// literals with `''` as an escaped quote, parentheses, commas, and the
/// operators `+ - * / % = < > <= >= != <> ||`.
///
/// Returns `Err` for any character outside the grammar and for an
/// unterminated string literal.
///
/// UTF-8: string literals may contain arbitrary Unicode — multi-byte
/// sequences are copied verbatim, never truncated. Identifiers, numbers,
/// and operators remain ASCII-only.
fn tokenize(input: &str) -> Result<Vec<Token>, String> {
    let bytes = input.as_bytes();
    let mut tokens = Vec::new();
    let mut i = 0;

    // Invariant: at the top of the loop `i` is always on a UTF-8 char
    // boundary (every branch consumes whole characters).
    while i < bytes.len() {
        let b = bytes[i];

        // Skip whitespace.
        if b.is_ascii_whitespace() {
            i += 1;
            continue;
        }

        // Single-char punctuation.
        let punct = match b {
            b'(' => Some(TokenKind::LParen),
            b')' => Some(TokenKind::RParen),
            b',' => Some(TokenKind::Comma),
            _ => None,
        };
        if let Some(kind) = punct {
            tokens.push(Token {
                text: (b as char).to_string(),
                kind,
            });
            i += 1;
            continue;
        }

        // Two-char operators. Compare raw bytes rather than slicing
        // `&input[i..i + 2]`: that slice panics when the byte at `i + 1`
        // begins a multi-byte UTF-8 character (e.g. the input "a <é").
        if i + 1 < bytes.len() {
            let two = match (b, bytes[i + 1]) {
                (b'<', b'=') => Some("<="),
                (b'>', b'=') => Some(">="),
                (b'!', b'=') => Some("!="),
                (b'<', b'>') => Some("<>"),
                (b'|', b'|') => Some("||"),
                _ => None,
            };
            if let Some(op) = two {
                tokens.push(Token {
                    text: op.into(),
                    kind: TokenKind::Op,
                });
                i += 2;
                continue;
            }
        }

        // Single-char operators.
        if matches!(b, b'+' | b'-' | b'*' | b'/' | b'%' | b'=' | b'<' | b'>') {
            tokens.push(Token {
                text: (b as char).to_string(),
                kind: TokenKind::Op,
            });
            i += 1;
            continue;
        }

        // String literal: '...' with '' as an escaped quote.
        if b == b'\'' {
            let mut s = String::new();
            i += 1;
            let mut start = i; // start of the current verbatim run
            let mut closed = false;
            while i < bytes.len() {
                if bytes[i] == b'\'' {
                    // A quote byte (0x27) never occurs inside a UTF-8
                    // multi-byte sequence, so `start..i` always lies on
                    // char boundaries. Copying whole slices (instead of
                    // casting individual bytes to `char`) preserves
                    // multi-byte characters such as CJK text.
                    s.push_str(&input[start..i]);
                    if i + 1 < bytes.len() && bytes[i + 1] == b'\'' {
                        s.push('\'');
                        i += 2;
                        start = i;
                        continue;
                    }
                    i += 1;
                    closed = true;
                    break;
                }
                i += 1;
            }
            if !closed {
                // Previously an unterminated literal was silently accepted;
                // reject it so malformed expressions surface as errors.
                return Err("unterminated string literal".to_string());
            }
            tokens.push(Token {
                text: s,
                kind: TokenKind::StringLit,
            });
            continue;
        }

        // Number: digits with an optional decimal point (leading `.5` ok).
        if b.is_ascii_digit() || (b == b'.' && i + 1 < bytes.len() && bytes[i + 1].is_ascii_digit())
        {
            let start = i;
            while i < bytes.len() && (bytes[i].is_ascii_digit() || bytes[i] == b'.') {
                i += 1;
            }
            tokens.push(Token {
                text: input[start..i].to_string(),
                kind: TokenKind::Number,
            });
            continue;
        }

        // Identifier or keyword (ASCII letters, digits, underscore).
        if b.is_ascii_alphabetic() || b == b'_' {
            let start = i;
            while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
                i += 1;
            }
            tokens.push(Token {
                text: input[start..i].to_string(),
                kind: TokenKind::Ident,
            });
            continue;
        }

        // Anything else is outside the grammar. Decode the full Unicode
        // scalar for the error message instead of casting the raw byte,
        // which would report a garbage character for multi-byte input.
        let c = input[i..].chars().next().unwrap_or('?');
        return Err(format!("unexpected character: '{c}'"));
    }

    Ok(tokens)
}
195-
19652
// ── Recursive descent parser ──────────────────────────────────────────
19753

19854
/// Maximum recursion depth for nested parentheses / sub-expressions.
/// Exceeding this limit returns `Err` instead of overflowing the stack.
const MAX_EXPR_DEPTH: usize = 128;
20156

202-
/// Parse an expression (lowest precedence: OR).
///
/// Entry point of the recursive-descent chain: delegates straight to
/// `parse_or`, which in turn hands tighter-binding operators downward.
fn parse_expr(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result<SqlExpr, String> {
    parse_or(tokens, pos, depth)
}
@@ -304,13 +158,11 @@ fn parse_multiplicative(
304158
}
305159

306160
fn parse_unary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result<SqlExpr, String> {
307-
// Unary minus.
308161
if *pos < tokens.len() && tokens[*pos].kind == TokenKind::Op && tokens[*pos].text == "-" {
309162
*pos += 1;
310163
let expr = parse_primary(tokens, pos, depth)?;
311164
return Ok(SqlExpr::Negate(Box::new(expr)));
312165
}
313-
// NOT
314166
if peek_keyword(tokens, *pos, "NOT") {
315167
*pos += 1;
316168
let expr = parse_primary(tokens, pos, depth)?;
@@ -327,7 +179,6 @@ fn parse_primary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result
327179
let token = &tokens[*pos];
328180

329181
match token.kind {
330-
// Parenthesized expression.
331182
TokenKind::LParen => {
332183
*depth += 1;
333184
if *depth > MAX_EXPR_DEPTH {
@@ -342,7 +193,6 @@ fn parse_primary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result
342193
Ok(expr)
343194
}
344195

345-
// Number literal.
346196
TokenKind::Number => {
347197
*pos += 1;
348198
if let Ok(i) = token.text.parse::<i64>() {
@@ -354,13 +204,11 @@ fn parse_primary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result
354204
}
355205
}
356206

357-
// String literal.
358207
TokenKind::StringLit => {
359208
*pos += 1;
360209
Ok(SqlExpr::Literal(Value::String(token.text.clone())))
361210
}
362211

363-
// Identifier: column ref, function call, keyword (NULL, TRUE, FALSE, CASE, COALESCE).
364212
TokenKind::Ident => {
365213
let name = token.text.clone();
366214
let upper = name.to_uppercase();
@@ -376,15 +224,13 @@ fn parse_primary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result
376224
Ok(SqlExpr::Coalesce(args))
377225
}
378226
_ => {
379-
// Function call: IDENT(args).
380227
if *pos < tokens.len() && tokens[*pos].kind == TokenKind::LParen {
381228
let args = parse_arg_list(tokens, pos, depth)?;
382229
Ok(SqlExpr::Function {
383230
name: name.to_lowercase(),
384231
args,
385232
})
386233
} else {
387-
// Column reference.
388234
Ok(SqlExpr::Column(name.to_lowercase()))
389235
}
390236
}
@@ -395,7 +241,6 @@ fn parse_primary(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result
395241
}
396242
}
397243

398-
/// Parse `CASE WHEN cond THEN result [WHEN ... THEN ...] [ELSE result] END`.
399244
fn parse_case(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result<SqlExpr, String> {
400245
let mut when_thens = Vec::new();
401246
let mut else_expr = None;
@@ -429,7 +274,6 @@ fn parse_case(tokens: &[Token], pos: &mut usize, depth: &mut usize) -> Result<Sq
429274
})
430275
}
431276

432-
/// Parse a parenthesized, comma-separated argument list: `(expr, expr, ...)`.
433277
fn parse_arg_list(
434278
tokens: &[Token],
435279
pos: &mut usize,
@@ -488,7 +332,6 @@ fn expect_token(
488332

489333
// ── Validation ────────────────────────────────────────────────────────
490334

491-
/// Non-deterministic functions that are rejected in GENERATED ALWAYS AS.
492335
const NON_DETERMINISTIC: &[&str] = &[
493336
"now",
494337
"current_timestamp",
@@ -613,7 +456,6 @@ mod tests {
613456
assert_eq!(deps, vec!["price", "tax_rate"]);
614457
let doc = Value::from(serde_json::json!({"price": 100.0, "tax_rate": 0.08}));
615458
let result = expr.eval(&doc);
616-
// eval returns integer when result is whole number.
617459
assert_eq!(result.as_f64(), Some(108.0));
618460
}
619461

@@ -693,15 +535,27 @@ mod tests {
693535

694536
#[test]
fn deeply_nested_parentheses_return_error_not_stack_overflow() {
    // The parser enforces a recursion depth limit (`MAX_EXPR_DEPTH`) so
    // that pathologically deep nesting returns Err rather than
    // overflowing the call stack and crashing the process.
    let depth = 10_000;
    let input = format!("{}x{}", "(".repeat(depth), ")".repeat(depth));
    let result = parse_generated_expr(&input);
    assert!(
        result.is_err(),
        "parse_generated_expr must return Err for {depth}-deep nesting"
    );
}
546+
547+
#[test]
fn cjk_string_in_concat() {
    // Guards against mangling of multi-byte UTF-8 string-literal content:
    // the CJK text must round-trip through tokenizer, parser, and eval.
    let (expr, _) = parse_ok("CONCAT('你好', name)");
    let doc = Value::from(serde_json::json!({"name": "world"}));
    assert_eq!(expr.eval(&doc), Value::String("你好world".into()));
}
553+
554+
#[test]
fn comparison_with_utf8_literal() {
    // A UTF-8 string literal on the right-hand side of `!=` must survive
    // tokenization intact and must not affect dependency extraction.
    let (expr, deps) = parse_ok("name != '禁止'");
    assert_eq!(deps, vec!["name"]);
    let doc = Value::from(serde_json::json!({"name": "allowed"}));
    assert_eq!(expr.eval(&doc), Value::Bool(true));
}
707561
}

0 commit comments

Comments
 (0)