diff --git a/Cargo.lock b/Cargo.lock index 826b13f0..068b20ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -385,9 +385,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chdb-rust" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73d357c85065d85b9760b036a2ff011a049fedd5b67f434f1a74b02ce5d93f86" +checksum = "3db53787943679165395651367ba9122092078256ec5ee1fb53b81c692d77667" dependencies = [ "bindgen", "flate2", diff --git a/Cargo.toml b/Cargo.toml index b86c916e..64da4684 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,7 @@ lazy_static = "1.5" regex = "1.12.3" async-trait = "0.1" # chdb: embedded ClickHouse (optional — only compiled when feature "embedded" is enabled) -chdb-rust = { version = "1.3.0", optional = true } +chdb-rust = { version = "1.3.1", optional = true } [target.'cfg(not(target_env = "msvc"))'.dependencies] tikv-jemallocator = { version = "0.6", optional = true } diff --git a/README.md b/README.md index 7d1aae62..e119cd42 100644 --- a/README.md +++ b/README.md @@ -4,16 +4,17 @@ # ClickGraph -#### ClickGraph - A high-performance, stateless, read-only graph query service for ClickHouse, written in Rust, with Neo4j ecosystem compatibility - Cypher and Bolt Protocol 5.8 support. Now supports embedded mode with local writes, and exporting query results to external destinations, with Golang, Python bindings, in addition to native Rust. +#### ClickGraph - A high-performance, stateless, read-only graph query service for ClickHouse, written in Rust, with Neo4j ecosystem compatibility - Cypher and Bolt Protocol 5.8 support. Now supports embedded mode with local writes and exporting query results to external destinations, with Golang and Python bindings in addition to native Rust. The new `cg` CLI tool supports agentic workflows. > **Note: ClickGraph dev release is at beta quality for view-based graph analytics applications. Kindly raise an issue if you encounter any problem.** ---- -## Motivation and Rationale -- Viewing ClickHouse databases (including external sources) as graph data with graph analytics capability brings another level of abstraction and boosts productivity with graph tools, and enables agentic GraphRAG support with local writes. -- Research shows relational analytics with columnar stores and vectorized execution engines like ClickHouse provide superior analytical performance and scalability to graph-native technologies, which usually leverage explicit adjacency representations and are more suitable for local-area graph traversals. -- View-based graph analytics offer the benefits of zero-ETL without the hassle of data migration and duplicate cost, yet better performance and scalability than most of the native graph analytics options. -- Neo4j Bolt protocol support gives access to the tools available based on the Bolt protocol. +`ClickGraph` now provides three modes: +- Stateless service +- Embedded mode with in-process `chDB` +- Hybrid mode with remote querying and local storage + +See [motivation and rationale](docs/motivation.md). + --- ## What's New in v0.6.6-dev @@ -75,7 +76,7 @@ See [CHANGELOG.md](CHANGELOG.md) for complete release history.
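To make the new embedded mode concrete, here is a minimal sketch of opening an in-memory database and running a statement. It is illustrative only — the crate import path, the schema file, and the query are placeholders — and it sticks to the calls that appear elsewhere in this changeset (`Database::in_memory`, `SystemConfig`, `Connection::new`, `execute_sql`).

```rust
// Hedged sketch of embedded mode; the import path and schema file are assumptions.
use clickgraph_embedded::{Connection, Database, SystemConfig};

fn main() {
    // YAML graph schema mapping ClickHouse tables to nodes/relationships (placeholder path).
    let schema_path = std::path::PathBuf::from("examples/social_schema.yaml");

    // Every SystemConfig field is optional; cap resources the same way the TCK harness does.
    let config = SystemConfig {
        max_threads: Some(4),
        max_memory_usage_bytes: Some(2 * 1024 * 1024 * 1024), // 2 GiB per query
        ..SystemConfig::default()
    };

    // In-memory chdb session: data lives only for the lifetime of the process.
    let db = Database::in_memory(&schema_path, config).expect("create embedded database");

    {
        let conn = Connection::new(&db).expect("open connection");
        // Plain SQL against the embedded engine (kept to calls confirmed by this changeset).
        conn.execute_sql("SELECT 1").expect("run query");
    }

    // The TCK harness leaks its Database (Box::leak) because chdb aborts on Drop;
    // mirror that here rather than letting `db` drop at the end of main.
    std::mem::forget(db);
}
```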
## Architecture -ClickGraph runs as a lightweight stateless query translator alongside ClickHouse: +The ClickGraph service runs as a lightweight stateless query translator alongside ClickHouse: ```mermaid flowchart LR diff --git a/clickgraph-embedded/src/connection.rs b/clickgraph-embedded/src/connection.rs index 4e0c6131..6df3366e 100644 --- a/clickgraph-embedded/src/connection.rs +++ b/clickgraph-embedded/src/connection.rs @@ -626,9 +626,8 @@ impl<'db> Connection<'db> { /// Delete nodes matching the given label and filter criteria. /// - /// Uses `ALTER TABLE DELETE` which is a ClickHouse mutation — asynchronous - /// and resource-heavy. Not suitable for high-frequency use in tight loops. - /// For bulk cleanup, prefer fewer calls with broader filters. + /// Uses a lightweight `DELETE FROM` — synchronous and low-overhead compared to + /// the old `ALTER TABLE DELETE` mutation path. pub fn delete_nodes( &self, label: &str, @@ -653,7 +652,6 @@ impl<'db> Connection<'db> { } /// Delete edges matching the given type and filter criteria. - /// See [`delete_nodes`] for mutation performance caveats. pub fn delete_edges( &self, edge_type: &str, @@ -1569,7 +1567,7 @@ graph_schema: assert!(conn.delete_nodes("Person", filters).is_ok()); let sqls = captured.lock().unwrap(); let sql = &sqls[0]; - assert!(sql.contains("ALTER TABLE") && sql.contains("DELETE WHERE")); + assert!(sql.contains("DELETE FROM") && sql.contains("WHERE")); assert!(sql.contains("full_name") && sql.contains("'Alice'")); } diff --git a/clickgraph-embedded/src/database.rs b/clickgraph-embedded/src/database.rs index 89dbf840..c1963c7a 100644 --- a/clickgraph-embedded/src/database.rs +++ b/clickgraph-embedded/src/database.rs @@ -56,6 +56,9 @@ impl std::fmt::Debug for RemoteConfig { /// Configuration for an embedded database session. /// /// Mirrors `kuzu::SystemConfig`. +/// +/// All fields are `Option` so that callers can use `..SystemConfig::default()`, +/// letting new fields be added later without breaking existing struct literals. #[derive(Debug, Clone, Default)] pub struct SystemConfig { /// Directory where chdb stores its session data. @@ -69,9 +72,13 @@ pub struct SystemConfig { /// Maximum number of threads for chdb query execution. /// `None` uses the chdb default (typically number of CPU cores). - /// Reserved for future use -- not yet passed to chdb session. pub max_threads: Option<usize>, + /// Maximum memory a single query may use, in bytes. + /// `None` uses the chdb/ClickHouse default (no cap). + /// Set this in test environments to prevent runaway memory usage. + pub max_memory_usage_bytes: Option<u64>, + /// Storage credentials for remote sources (S3, GCS, Azure Blob, Iceberg).
/// /// Applied as ClickHouse session-level `SET` commands before any VIEWs are @@ -160,6 +167,17 @@ impl Database { ChdbExecutor::new_with_credentials(&session_dir, auto_cleanup, &config.credentials) .map_err(|e| EmbeddedError::Executor(e.to_string()))?; + if let Some(threads) = config.max_threads { + executor + .execute_blocking_ddl(&format!("SET max_threads = {threads}")) + .map_err(|e| EmbeddedError::Executor(e.to_string()))?; + } + if let Some(bytes) = config.max_memory_usage_bytes { + executor + .execute_blocking_ddl(&format!("SET max_memory_usage = {bytes}")) + .map_err(|e| EmbeddedError::Executor(e.to_string()))?; + } + let view_count = clickgraph::executor::data_loader::load_schema_sources(&executor, &schema) .map_err(|e| EmbeddedError::Executor(e.to_string()))?; if view_count > 0 { diff --git a/clickgraph-embedded/src/write_helpers.rs b/clickgraph-embedded/src/write_helpers.rs index 1b9fc1f3..a1a279fe 100644 --- a/clickgraph-embedded/src/write_helpers.rs +++ b/clickgraph-embedded/src/write_helpers.rs @@ -103,7 +103,7 @@ pub fn build_insert_sql( ) } -/// Build an `ALTER TABLE ... DELETE WHERE ...` SQL statement. +/// Build a `DELETE FROM ... WHERE ...` SQL statement (lightweight delete). /// /// Maps filter keys from Cypher property names to ClickHouse column names using /// `property_mappings`, and renders filter values via `Value::to_sql_literal()`. @@ -146,7 +146,7 @@ pub fn build_delete_sql( } Ok(format!( - "ALTER TABLE `{}`.`{}` DELETE WHERE {}", + "DELETE FROM `{}`.`{}` WHERE {}", db, table, conditions.join(" AND ") @@ -393,7 +393,7 @@ mod tests { let sql = build_delete_sql("mydb", "users", &filters, &mappings, &["user_id"]).unwrap(); assert_eq!( sql, - "ALTER TABLE `mydb`.`users` DELETE WHERE `full_name` = 'Alice'" + "DELETE FROM `mydb`.`users` WHERE `full_name` = 'Alice'" ); } @@ -406,10 +406,7 @@ mod tests { filters.insert("user_id".to_string(), Value::String("u123".to_string())); let sql = build_delete_sql("mydb", "users", &filters, &mappings, &["user_id"]).unwrap(); - assert_eq!( - sql, - "ALTER TABLE `mydb`.`users` DELETE WHERE `user_id` = 'u123'" - ); + assert_eq!(sql, "DELETE FROM `mydb`.`users` WHERE `user_id` = 'u123'"); } #[test] @@ -426,7 +423,7 @@ mod tests { // Keys are sorted for deterministic output assert_eq!( sql, - "ALTER TABLE `mydb`.`users` DELETE WHERE `user_age` = 30 AND `full_name` = 'Bob'" + "DELETE FROM `mydb`.`users` WHERE `user_age` = 30 AND `full_name` = 'Bob'" ); } diff --git a/clickgraph-tck/Cargo.toml b/clickgraph-tck/Cargo.toml index 1e373f51..c750c3e3 100644 --- a/clickgraph-tck/Cargo.toml +++ b/clickgraph-tck/Cargo.toml @@ -15,7 +15,7 @@ path = "tests/tck.rs" harness = false [dev-dependencies] -clickgraph-embedded = { path = "../clickgraph-embedded" } +clickgraph-embedded = { path = "../clickgraph-embedded", features = ["embedded"] } cucumber = "0.21" futures = "0.3" uuid = { version = "1", features = ["v4"] } diff --git a/clickgraph-tck/README.md b/clickgraph-tck/README.md new file mode 100644 index 00000000..3a6e15a0 --- /dev/null +++ b/clickgraph-tck/README.md @@ -0,0 +1,80 @@ +# clickgraph-tck + +openCypher [Technology Compatibility Kit (TCK)](https://github.com/opencypher/openCypher/tree/master/tck) runner for ClickGraph, using the embedded chdb engine. + +## Current status + +**383 / 402 scenarios passing (95.3%)** — 19 skipped (`@NegativeTests` / `@skip`), 0 failing. + +## What it tests + +The TCK is the openCypher project's official conformance suite. 
Each scenario is a Gherkin `.feature` file that specifies a graph setup, a Cypher query, and expected results. This crate runs a subset of those scenarios against ClickGraph's embedded query engine. + +**Current coverage** — 402 scenarios across 20 feature files: + +| Category | Feature files | Scenarios | +|----------|--------------|-----------| +| `MATCH` / `OPTIONAL MATCH` | Match1–3, MatchWhere1–2 | 66 | +| `RETURN` / `ORDER BY` / `SKIP`+`LIMIT` | Return1–3, ReturnOrderBy1, ReturnSkipLimit1 | 46 | +| `WITH` | With1–3 | 9 | +| Aggregation (`count`, `min`, `max`) | Aggregation1–2 | 14 | +| Boolean expressions | Boolean1 | 7 | +| Comparison expressions | Comparison1 | 13 | +| List expressions | List1 | 5 | +| Null handling | Null1 | 5 | +| String functions | String1 | 1 | + +Scenarios tagged `@NegativeTests`, `@skip`, `@fails`, `@crash`, or `@wip` are skipped. + +### Known gaps + +Write operations (`SET`, `DELETE`, `MERGE`) are not covered — ClickGraph is a read-query engine. The TCK's write-oriented feature files are not included. + +## How it works + +1. **Schema generation** — at startup, all `.feature` files are scanned to extract every `CREATE` block. Node labels and relationship types are collected into a universal schema (`SchemaCatalog`), which is written as a ClickGraph YAML schema and used to create `ReplacingMergeTree` tables in chdb. + +2. **One chdb session per process** — chdb supports only one active session per process. A single `Database` is created at startup and shared across all scenarios via `LazyLock`. Tables are **truncated** between scenarios rather than recreated. + +3. **Test execution** — each scenario follows the standard Cucumber lifecycle: + - *Given* `an empty graph` / `having executed:` — truncates tables, then runs Cypher `CREATE` statements to populate data + - *When* `executing query:` — translates Cypher to SQL and executes it via the embedded engine + - *Then* `the result should be (in any order / in order)` — normalises output (bools, nulls, floats) and compares with the expected Gherkin table + +## Running + +```bash +# Requires CLICKGRAPH_CHDB_TESTS=1 to opt in to chdb e2e tests +CLICKGRAPH_CHDB_TESTS=1 cargo test -p clickgraph-tck --test tck + +# Show SQL generated for failing scenarios (written to /tmp/tck_failing_sql.txt) +CLICKGRAPH_CHDB_TESTS=1 cargo test -p clickgraph-tck --test tck 2>&1 | grep FAIL +``` + +> **Important**: Never run multiple instances of these tests concurrently. chdb is a +> full in-process ClickHouse engine and is memory-intensive. The test harness caps +> each session to 4 threads and 4 GiB per query; running several instances in +> parallel will still saturate available RAM. + +## Adding feature files + +1. Copy the `.feature` file from the [openCypher TCK](https://github.com/opencypher/openCypher/tree/master/tck/features) into `tests/features/clauses/` or `tests/features/expressions/`. +2. Update `tests/features/FEATURES_VERSION` with the source commit. +3. Run the tests — the schema generator picks up new labels/rel-types automatically. +4. Tag scenarios that rely on unsupported features with `@skip` and add a comment explaining why. 
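As a rough illustration of step 3 above — the schema generator picking new labels and relationship types out of `CREATE` blocks — here is a toy sketch. It is not the real `tests/schema_gen.rs`; the function name and regexes are assumptions, and multi-label patterns or back-ticked names are not handled.

```rust
use std::collections::BTreeSet;

// Toy label/rel-type scan over the Cypher CREATE blocks found in .feature files.
// Illustrative only — the real inference lives in tests/schema_gen.rs.
fn collect_labels(create_blocks: &[&str]) -> (BTreeSet<String>, BTreeSet<String>) {
    let node_label = regex::Regex::new(r"\(\s*\w*:(\w+)").unwrap(); // matches (:Label or (n:Label
    let rel_type = regex::Regex::new(r"\[\s*\w*:(\w+)").unwrap();   // matches [:TYPE or [r:TYPE
    let mut labels = BTreeSet::new();
    let mut rel_types = BTreeSet::new();
    for block in create_blocks {
        for cap in node_label.captures_iter(block) {
            labels.insert(cap[1].to_string());
        }
        for cap in rel_type.captures_iter(block) {
            rel_types.insert(cap[1].to_string());
        }
    }
    (labels, rel_types)
}

fn main() {
    let (labels, rels) = collect_labels(&["CREATE (a:A {name: 'x'})-[:KNOWS]->(b:B)"]);
    assert!(labels.contains("A") && labels.contains("B"));
    assert!(rels.contains("KNOWS"));
}
```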
+ ## Directory structure + +``` +clickgraph-tck/ +├── Cargo.toml +└── tests/ + ├── tck.rs # Cucumber test harness (step definitions, world state) + ├── create_parser.rs # Re-export of the embedded Cypher CREATE parser + ├── schema_gen.rs # Universal schema inference from feature files + ├── result_fmt.rs # Result normalisation and Gherkin table parsing + └── features/ + ├── FEATURES_VERSION + ├── clauses/ # MATCH, WITH, RETURN, ORDER BY, SKIP/LIMIT + └── expressions/ # Aggregation, Boolean, Comparison, List, Null, String +``` diff --git a/clickgraph-tck/tests/tck.rs b/clickgraph-tck/tests/tck.rs index dacc177f..ef718490 100644 --- a/clickgraph-tck/tests/tck.rs +++ b/clickgraph-tck/tests/tck.rs @@ -48,8 +48,15 @@ static SHARED: LazyLock<&'static TckDatabase> = LazyLock::new(|| { let schema_path = std::env::temp_dir().join("clickgraph_tck_schema.yaml"); std::fs::write(&schema_path, &yaml).expect("write TCK schema YAML"); - let db = - Database::in_memory(&schema_path, SystemConfig::default()).expect("create TCK database"); + // Cap resource usage: TCK queries are trivial; 4 threads and 4 GiB per + // query is plenty and prevents runaway memory if multiple test processes + // are accidentally started in parallel. + let config = SystemConfig { + max_threads: Some(4), + max_memory_usage_bytes: Some(4 * 1024 * 1024 * 1024), // 4 GiB + ..SystemConfig::default() + }; + let db = Database::in_memory(&schema_path, config).expect("create TCK database"); // Leak intentionally: chdb SIGABRT on Drop; same pattern as chdb_e2e.rs Box::leak(Box::new(TckDatabase { db, tables })) @@ -67,7 +74,6 @@ fn all_tables() -> &'static [String] { fn truncate_all_tables() { let db = shared_db(); if let Ok(conn) = Connection::new(db) { - let _ = conn.execute_sql("SET mutations_sync=2"); for table in all_tables() { let r = conn.execute_sql(&format!("TRUNCATE TABLE IF EXISTS `default`.`{table}`")); if let Err(ref e) = r { diff --git a/src/query_planner/analyzer/type_inference.rs b/src/query_planner/analyzer/type_inference.rs index 400212bc..4b8181ba 100644 --- a/src/query_planner/analyzer/type_inference.rs +++ b/src/query_planner/analyzer/type_inference.rs @@ -1381,6 +1381,7 @@ impl TypeInference { graph_schema: &GraphSchema, relationships: &[RelationshipPattern], property_accesses: &HashMap<String, HashSet<String>>, + null_check_accesses: &HashMap<String, HashSet<String>>, ) -> AnalyzerResult<Arc<LogicalPlan>> { log::info!( "🔀 UnifiedTypeInference: Generating UNION for {} untyped nodes", @@ -1478,22 +1479,52 @@ // Constraint 2: Filter by accessed properties // Uses property_mappings keys (Cypher names), NOT column_names (ClickHouse names) - if let Some(props) = property_accesses.get(var_name) { - candidates.retain(|type_name| { - if let Ok(node_schema) = graph_schema.node_schema(type_name) { - props - .iter() - .all(|prop| node_schema.has_cypher_property(prop)) - } else { - false - } - }); - log::debug!( - "🔍 Constrained '{}' by properties {:?}: {:?}", - var_name, - props, - candidates - ); + // + // We use ANY-semantics: a type is kept if it has at least one of the accessed + // properties. This correctly handles OR predicates (`WHERE n.p1 = 12 OR n.p2 = 13`) + // where different types satisfy different branches. + // + // Properties are split into two sets: + // - `regular`: accessed in normal expressions (equality, comparisons, …) + // - `null_check`: accessed inside IS NULL / IS NOT NULL predicates + // + // ALL-semantics would incorrectly eliminate types in disjunctive predicates.
+ // Pruning only on `regular` props (ignoring null_check) would incorrectly drop + // types that only appear in an IS NOT NULL branch of an OR, e.g.: + // `WHERE n.p IS NOT NULL OR n.q = 5` — type A={p}, type B={q}: both must survive. + // Using `regular ∪ null_check` with ANY-semantics handles this correctly. + // + // Special case: if a variable has ONLY null-check accesses and no regular accesses, + // skip pruning entirely — missing properties return NULL, not an error + // (e.g., `WHERE n.missing IS NULL` should not eliminate any type candidates). + let regular_props = property_accesses.get(var_name); + let nc_props = null_check_accesses.get(var_name); + if regular_props.is_some() || nc_props.is_some() { + let has_regular = regular_props.map_or(false, |p| !p.is_empty()); + if has_regular { + // Prune using regular ∪ null_check with ANY semantics + let all_props: HashSet<&String> = regular_props + .iter() + .flat_map(|s| s.iter()) + .chain(nc_props.iter().flat_map(|s| s.iter())) + .collect(); + candidates.retain(|type_name| { + if let Ok(node_schema) = graph_schema.node_schema(type_name) { + all_props + .iter() + .any(|prop| node_schema.has_cypher_property(prop)) + } else { + false + } + }); + log::debug!( + "🔍 Constrained '{}' by properties {:?}: {:?}", + var_name, + all_props, + candidates + ); + } + // else: only null-check accesses — skip pruning (missing props → NULL) } if candidates.is_empty() { @@ -1501,10 +1532,8 @@ "🔍 No valid types for '{}' after constraints — query returns empty result", var_name ); - // No type satisfies the constraints for this variable, so the entire - // query pattern produces no rows. Return Empty immediately so that - // downstream passes (FilterTagging, rendering) produce a correct - // zero-row plan (SELECT 1 AS "_empty" WHERE false). + // No type has ANY of the accessed properties (or relationship constraints + // eliminated all candidates). The query can produce no rows. return Ok(Arc::new(LogicalPlan::Empty)); } @@ -3657,6 +3686,13 @@ impl AnalyzerPass for TypeInference { .cloned() .collect(); let arm_props: HashMap<_, _> = original_property_accesses + .regular .iter() .filter(|(k, _)| arm_untyped.contains(k.as_str())) .map(|(k, v)| (k.clone(), v.clone())) .collect(); + let arm_null_check: HashMap<_, _> = original_property_accesses + .null_check + .iter() + .filter(|(k, _)| arm_untyped.contains(k.as_str())) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); @@ -3668,6 +3704,7 @@ impl AnalyzerPass for TypeInference { graph_schema, &arm_rels, &arm_props, + &arm_null_check, )?; new_inputs.push(expanded); } @@ -3686,7 +3723,8 @@ impl AnalyzerPass for TypeInference { plan_ctx, graph_schema, &original_relationships, - &original_property_accesses, + &original_property_accesses.regular, + &original_property_accesses.null_check, )?; } } else { @@ -4176,76 +4214,101 @@ fn extract_patterns_recursive( } } +/// Property accesses extracted from a plan, split by whether they appear inside +/// IS NULL / IS NOT NULL predicates. +/// +/// - `regular`: properties accessed in normal expressions (equality, comparison, functions, …) +/// - `null_check`: properties accessed *only* as the operand of IS NULL / IS NOT NULL +/// +/// Keeping them separate lets the pruning step use ANY-semantics on the full set while +/// still allowing relaxation when a variable's accesses are exclusively null-checks +/// (e.g., `WHERE n.missing IS NULL` — missing properties return NULL, not an error).
+struct PropertyAccesses { + regular: HashMap<String, HashSet<String>>, + null_check: HashMap<String, HashSet<String>>, +} + /// Extract property accesses from the plan tree, grouped by variable name. -/// Returns a map: variable_alias → set of Cypher property names accessed. /// Used to constrain node type candidates to only those types that have the accessed properties. -fn extract_property_accesses(plan: &Arc<LogicalPlan>) -> HashMap<String, HashSet<String>> { - let mut accesses: HashMap<String, HashSet<String>> = HashMap::new(); - extract_props_from_plan(plan.as_ref(), &mut accesses); - accesses +fn extract_property_accesses(plan: &Arc<LogicalPlan>) -> PropertyAccesses { + let mut regular: HashMap<String, HashSet<String>> = HashMap::new(); + let mut null_check: HashMap<String, HashSet<String>> = HashMap::new(); + extract_props_from_plan(plan.as_ref(), &mut regular, &mut null_check); + PropertyAccesses { + regular, + null_check, + } } -fn extract_props_from_plan(plan: &LogicalPlan, accesses: &mut HashMap<String, HashSet<String>>) { +fn extract_props_from_plan( + plan: &LogicalPlan, + accesses: &mut HashMap<String, HashSet<String>>, + null_check: &mut HashMap<String, HashSet<String>>, +) { match plan { LogicalPlan::Filter(filter) => { - extract_props_from_expr(&filter.predicate, accesses); - extract_props_from_plan(&filter.input, accesses); + extract_props_from_expr(&filter.predicate, accesses, null_check); + extract_props_from_plan(&filter.input, accesses, null_check); } LogicalPlan::Projection(proj) => { for item in &proj.items { - extract_props_from_expr(&item.expression, accesses); + extract_props_from_expr(&item.expression, accesses, null_check); } - extract_props_from_plan(&proj.input, accesses); + extract_props_from_plan(&proj.input, accesses, null_check); } LogicalPlan::OrderBy(ob) => { for item in &ob.items { - extract_props_from_expr(&item.expression, accesses); + extract_props_from_expr(&item.expression, accesses, null_check); } - extract_props_from_plan(&ob.input, accesses); + extract_props_from_plan(&ob.input, accesses, null_check); } LogicalPlan::GroupBy(gb) => { for item in &gb.expressions { - extract_props_from_expr(item, accesses); + extract_props_from_expr(item, accesses, null_check); } if let Some(ref having) = gb.having_clause { - extract_props_from_expr(having, accesses); + extract_props_from_expr(having, accesses, null_check); } - extract_props_from_plan(&gb.input, accesses); + extract_props_from_plan(&gb.input, accesses, null_check); } LogicalPlan::GraphRel(rel) => { - extract_props_from_plan(&rel.left, accesses); - extract_props_from_plan(&rel.center, accesses); - extract_props_from_plan(&rel.right, accesses); + extract_props_from_plan(&rel.left, accesses, null_check); + extract_props_from_plan(&rel.center, accesses, null_check); + extract_props_from_plan(&rel.right, accesses, null_check); if let Some(ref pred) = rel.where_predicate { - extract_props_from_expr(pred, accesses); + extract_props_from_expr(pred, accesses, null_check); } } LogicalPlan::GraphNode(node) => { - extract_props_from_plan(&node.input, accesses); + extract_props_from_plan(&node.input, accesses, null_check); } LogicalPlan::Union(u) => { for input in &u.inputs { - extract_props_from_plan(input, accesses); + extract_props_from_plan(input, accesses, null_check); } } LogicalPlan::WithClause(wc) => { - extract_props_from_plan(&wc.input, accesses); + extract_props_from_plan(&wc.input, accesses, null_check); } LogicalPlan::CartesianProduct(cp) => { - extract_props_from_plan(&cp.left, accesses); - extract_props_from_plan(&cp.right, accesses); + extract_props_from_plan(&cp.left, accesses, null_check); + extract_props_from_plan(&cp.right, accesses, null_check); } LogicalPlan::Limit(lim) => { - extract_props_from_plan(&lim.input, accesses); +
extract_props_from_plan(&lim.input, accesses, null_check); } LogicalPlan::Skip(s) => { - extract_props_from_plan(&s.input, accesses); + extract_props_from_plan(&s.input, accesses, null_check); } _ => {} } } -fn extract_props_from_expr(expr: &LogicalExpr, accesses: &mut HashMap<String, HashSet<String>>) { +fn extract_props_from_expr( + expr: &LogicalExpr, + accesses: &mut HashMap<String, HashSet<String>>, + null_check: &mut HashMap<String, HashSet<String>>, +) { match expr { LogicalExpr::PropertyAccessExp(pa) => { let var = pa.table_alias.0.clone(); @@ -4253,35 +4316,55 @@ fn extract_props_from_expr(expr: &LogicalExpr, accesses: &mut HashMap<String, HashSet<String>>) { + use crate::query_planner::logical_expr::Operator as Op; + if matches!(op.operator, Op::IsNull | Op::IsNotNull) { + // Record properties accessed inside IS NULL / IS NOT NULL in the + // null_check map, NOT in regular accesses. They still participate in + // ANY-semantics pruning (fixing the OR disjunction case, e.g. + // `WHERE n.p IS NOT NULL OR n.q = 5` keeps types with either p or q), + // but when a variable's ONLY accesses are null-checks the pruning is + // relaxed (missing properties return NULL, not an error — see pruning + // logic in generate_union_for_untyped_nodes). + // IS NULL / IS NOT NULL always takes a PropertyAccessExp operand — + // extract directly to avoid a double-mutable-borrow of null_check. + for operand in &op.operands { + if let LogicalExpr::PropertyAccessExp(pa) = operand { + let var = pa.table_alias.0.clone(); + let prop = pa.column.raw().to_string(); + null_check.entry(var).or_default().insert(prop); + } + } + return; + } for operand in &op.operands { - extract_props_from_expr(operand, accesses); + extract_props_from_expr(operand, accesses, null_check); } } LogicalExpr::ScalarFnCall(f) => { for arg in &f.args { - extract_props_from_expr(arg, accesses); + extract_props_from_expr(arg, accesses, null_check); } } LogicalExpr::AggregateFnCall(f) => { for arg in &f.args { - extract_props_from_expr(arg, accesses); + extract_props_from_expr(arg, accesses, null_check); } } LogicalExpr::Case(c) => { if let Some(ref e) = c.expr { - extract_props_from_expr(e, accesses); + extract_props_from_expr(e, accesses, null_check); } for (w, t) in &c.when_then { - extract_props_from_expr(w, accesses); - extract_props_from_expr(t, accesses); + extract_props_from_expr(w, accesses, null_check); + extract_props_from_expr(t, accesses, null_check); } if let Some(ref e) = c.else_expr { - extract_props_from_expr(e, accesses); + extract_props_from_expr(e, accesses, null_check); } } LogicalExpr::List(items) => { for item in items { - extract_props_from_expr(item, accesses); + extract_props_from_expr(item, accesses, null_check); } } _ => {}
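To make the new pruning rule concrete, here is a small, self-contained sketch in plain Rust — toy types rather than the analyzer's real `GraphSchema`/`LogicalPlan` structures — that mirrors the ANY-semantics retain and the null-check-only relaxation, using the `WHERE n.p IS NOT NULL OR n.q = 5` example from the comments above.

```rust
use std::collections::{HashMap, HashSet};

// Toy version of the pruning rule: keep a candidate label if it has ANY accessed
// property from regular ∪ null_check, and skip pruning entirely when the only
// accesses are null-checks.
fn prune<'a>(
    candidates: &mut Vec<&'a str>,
    schema: &HashMap<&'a str, HashSet<&'a str>>, // label → properties it defines
    regular: &HashSet<&'a str>,                  // props used in normal expressions
    null_check: &HashSet<&'a str>,               // props used only under IS [NOT] NULL
) {
    if regular.is_empty() {
        return; // null-check-only: a missing property evaluates to NULL, so keep every candidate
    }
    let all: HashSet<&'a str> = regular.iter().chain(null_check.iter()).copied().collect();
    candidates.retain(|label| {
        schema
            .get(label)
            .map_or(false, |props| all.iter().any(|p| props.contains(*p)))
    });
}

fn main() {
    // Type A has {p}, type B has {q}; predicate: WHERE n.p IS NOT NULL OR n.q = 5
    let schema = HashMap::from([("A", HashSet::from(["p"])), ("B", HashSet::from(["q"]))]);
    let regular = HashSet::from(["q"]);    // from `n.q = 5`
    let null_check = HashSet::from(["p"]); // from `n.p IS NOT NULL`
    let mut candidates = vec!["A", "B"];
    prune(&mut candidates, &schema, &regular, &null_check);
    assert_eq!(candidates, vec!["A", "B"]); // both types survive, exactly as the pass intends
}
```

Under the old ALL-semantics, type A would have been eliminated because it lacks `q`, silently dropping rows that only satisfy the `IS NOT NULL` branch.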