# release: v0.6.6 — TCK 383/402 (95.3%), type inference fix, chdb resource caps #269
```diff
@@ -69,9 +69,13 @@ pub struct SystemConfig {
     /// Maximum number of threads for chdb query execution.
     /// `None` uses the chdb default (typically the number of CPU cores).
     /// Passed to the chdb session as `SET max_threads` on startup.
     pub max_threads: Option<usize>,

     /// Maximum memory a single query may use, in bytes.
     /// `None` uses the chdb/ClickHouse default (no cap).
     /// Set this in test environments to prevent runaway memory usage.
     pub max_memory_usage_bytes: Option<u64>,

     /// Storage credentials for remote sources (S3, GCS, Azure Blob, Iceberg).
     ///
     /// Applied as ClickHouse session-level `SET` commands before any VIEWs are
```
```diff
@@ -160,6 +164,17 @@ impl Database {
         ChdbExecutor::new_with_credentials(&session_dir, auto_cleanup, &config.credentials)
             .map_err(|e| EmbeddedError::Executor(e.to_string()))?;

+        if let Some(threads) = config.max_threads {
+            executor
+                .execute_blocking_ddl(&format!("SET max_threads = {threads}"))
+                .map_err(|e| EmbeddedError::Executor(e.to_string()))?;
+        }
+        if let Some(bytes) = config.max_memory_usage_bytes {
+            executor
+                .execute_blocking_ddl(&format!("SET max_memory_usage = {bytes}"))
+                .map_err(|e| EmbeddedError::Executor(e.to_string()))?;
+        }
+
         let view_count = clickgraph::executor::data_loader::load_schema_sources(&executor, &schema)
             .map_err(|e| EmbeddedError::Executor(e.to_string()))?;
         if view_count > 0 {
```
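The two caps follow the same pattern: an optional config field that, when set, becomes a session-level `SET` statement, and otherwise leaves the chdb default untouched. A minimal runnable sketch of that pattern — the `SystemConfig` here is a stripped-down stand-in for the real struct, and `cap_statements` is a hypothetical helper, not the crate's API:

```rust
// Stand-in for the real `SystemConfig`; only the two cap fields are modeled.
struct SystemConfig {
    max_threads: Option<usize>,
    max_memory_usage_bytes: Option<u64>,
}

/// Build the session-level SET statements the constructor would issue.
/// A `None` field produces no statement, so chdb's defaults stay in effect.
fn cap_statements(config: &SystemConfig) -> Vec<String> {
    let mut stmts = Vec::new();
    if let Some(threads) = config.max_threads {
        stmts.push(format!("SET max_threads = {threads}"));
    }
    if let Some(bytes) = config.max_memory_usage_bytes {
        stmts.push(format!("SET max_memory_usage = {bytes}"));
    }
    stmts
}

fn main() {
    let config = SystemConfig {
        max_threads: Some(4),
        max_memory_usage_bytes: Some(4 * 1024 * 1024 * 1024), // 4 GiB
    };
    for stmt in cap_statements(&config) {
        println!("{stmt}");
    }
}
```

Because the statements run before any views are created, they bound every query the session later executes.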
New file (`@@ -0,0 +1,80 @@`), the crate's README:

# clickgraph-tck

openCypher [Technology Compatibility Kit (TCK)](https://github.com/opencypher/openCypher/tree/master/tck) runner for ClickGraph, using the embedded chdb engine.

## Current status

**383 / 402 scenarios passing (95.3%)** — 19 skipped (`@NegativeTests` / `@skip`), 0 failing.
## What it tests

The TCK is the openCypher project's official conformance suite. Each scenario is a Gherkin `.feature` file that specifies a graph setup, a Cypher query, and expected results. This crate runs a subset of those scenarios against ClickGraph's embedded query engine.

**Current coverage** — 402 scenarios across 20 feature files:

| Category | Feature files | Scenarios |
|----------|--------------|-----------|
| `MATCH` / `OPTIONAL MATCH` | Match1–3, MatchWhere1–2 | 66 |
| `RETURN` / `ORDER BY` / `SKIP`+`LIMIT` | Return1–3, ReturnOrderBy1, ReturnSkipLimit1 | 46 |
| `WITH` | With1–3 | 9 |
| Aggregation (`count`, `min`, `max`) | Aggregation1–2 | 14 |
| Boolean expressions | Boolean1 | 7 |
| Comparison expressions | Comparison1 | 13 |
| List expressions | List1 | 5 |
| Null handling | Null1 | 5 |
| String functions | String1 | 1 |

Scenarios tagged `@NegativeTests`, `@skip`, `@fails`, `@crash`, or `@wip` are skipped.
### Known gaps

Write operations (`SET`, `DELETE`, `MERGE`) are not covered — ClickGraph is a read-query engine. The TCK's write-oriented feature files are not included.
## How it works

1. **Schema generation** — at startup, all `.feature` files are scanned to extract every `CREATE` block. Node labels and relationship types are collected into a universal schema (`SchemaCatalog`), which is written as a ClickGraph YAML schema and used to create `ReplacingMergeTree` tables in chdb.

2. **One chdb session per process** — chdb supports only one active session per process. A single `Database` is created at startup and shared across all scenarios via `LazyLock`. Tables are **truncated** between scenarios rather than recreated.

3. **Test execution** — each scenario follows the standard Cucumber lifecycle:
   - *Given* `an empty graph` / `having executed:` — truncates tables, then runs Cypher `CREATE` statements to populate data
   - *When* `executing query:` — translates Cypher to SQL and executes it via the embedded engine
   - *Then* `the result should be (in any order / in order)` — normalises output (bools, nulls, floats) and compares with the expected Gherkin table
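The one-session-per-process constraint is what forces the `LazyLock` singleton and the truncate-don't-recreate pattern. A minimal sketch of that shape, using an in-memory stand-in for the real `Database` (the types and helpers here are illustrative, not the harness's actual API):

```rust
use std::collections::HashMap;
use std::sync::{LazyLock, Mutex};

// Stand-in for the embedded `Database`; the real type wraps a chdb session.
struct Database {
    rows: Mutex<HashMap<String, Vec<String>>>,
}

// One shared instance per process, created lazily on first use. chdb allows
// only one active session per process, so every scenario borrows this one.
static DB: LazyLock<Database> = LazyLock::new(|| Database {
    rows: Mutex::new(HashMap::new()),
});

fn insert(table: &str, row: &str) {
    DB.rows.lock().unwrap().entry(table.into()).or_default().push(row.into());
}

// Between scenarios, tables are emptied rather than dropped and recreated:
// the schema stays in place, only the data is reset.
fn truncate_all() {
    for rows in DB.rows.lock().unwrap().values_mut() {
        rows.clear();
    }
}

fn row_count(table: &str) -> usize {
    DB.rows.lock().unwrap().get(table).map_or(0, |r| r.len())
}

fn main() {
    insert("nodes", "row1");
    insert("edges", "row1");
    truncate_all(); // the *Given* step's reset between scenarios
    println!("{} {}", row_count("nodes"), row_count("edges")); // prints "0 0"
}
```

Truncating instead of recreating keeps scenario setup cheap, since the `ReplacingMergeTree` tables only need to be created once per process.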
## Running

```bash
# Requires CLICKGRAPH_CHDB_TESTS=1 to opt in to chdb e2e tests
CLICKGRAPH_CHDB_TESTS=1 cargo test -p clickgraph-tck --test tck

# List failing scenarios; the SQL generated for them is written to /tmp/tck_failing_sql.txt
CLICKGRAPH_CHDB_TESTS=1 cargo test -p clickgraph-tck --test tck 2>&1 | grep FAIL
```
> **Important**: Never run multiple instances of these tests concurrently. chdb is a
> full in-process ClickHouse engine and is memory-intensive. The test harness caps
> each session to 4 threads and 4 GiB per query; running several instances in
> parallel will still saturate available RAM.
## Adding feature files

1. Copy the `.feature` file from the [openCypher TCK](https://github.com/opencypher/openCypher/tree/master/tck/features) into `tests/features/clauses/` or `tests/features/expressions/`.
2. Update `tests/features/FEATURES_VERSION` with the source commit.
3. Run the tests — the schema generator picks up new labels/rel-types automatically.
4. Tag scenarios that rely on unsupported features with `@skip` and add a comment explaining why.
## Directory structure

```
clickgraph-tck/
├── Cargo.toml
└── tests/
    ├── tck.rs            # Cucumber test harness (step definitions, world state)
    ├── create_parser.rs  # Re-export of the embedded Cypher CREATE parser
    ├── schema_gen.rs     # Universal schema inference from feature files
    ├── result_fmt.rs     # Result normalisation and Gherkin table parsing
    └── features/
        ├── FEATURES_VERSION
        ├── clauses/      # MATCH, WITH, RETURN, ORDER BY, SKIP/LIMIT
        └── expressions/  # Aggregation, Boolean, Comparison, List, Null, String
```
```diff
@@ -1478,12 +1478,22 @@ impl TypeInference {
         // Constraint 2: Filter by accessed properties
         // Uses property_mappings keys (Cypher names), NOT column_names (ClickHouse names)
+        //
+        // We use ANY-semantics: a type is kept if it has at least one of the accessed
+        // properties. This correctly handles OR predicates (`WHERE n.p1 = 12 OR n.p2 = 13`)
+        // where different types satisfy different branches. IS NULL accesses are excluded
+        // from `property_accesses` entirely (see extract_props_from_expr) because
+        // accessing a missing property yields NULL, not an error.
+        //
+        // ALL-semantics would incorrectly eliminate types when properties appear in
+        // disjunctive (OR) predicates or when a type has only some of the accessed
+        // properties (those it lacks resolve to NULL and are filtered by the WHERE clause).
         if let Some(props) = property_accesses.get(var_name) {
             candidates.retain(|type_name| {
                 if let Ok(node_schema) = graph_schema.node_schema(type_name) {
                     props
                         .iter()
-                        .all(|prop| node_schema.has_cypher_property(prop))
+                        .any(|prop| node_schema.has_cypher_property(prop))
                 } else {
                     false
                 }
```
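The switch from `.all` to `.any` can be illustrated in isolation. A self-contained sketch with a toy schema map — the labels, properties, and `retain_candidates` helper are illustrative stand-ins, not the planner's real types:

```rust
use std::collections::{HashMap, HashSet};

// ANY-semantics: keep a candidate type if it has at least one of the
// accessed properties. `schemas` maps a node label to its property names.
fn retain_candidates<'a>(
    candidates: &mut Vec<&'a str>,
    accessed: &HashSet<&str>,
    schemas: &HashMap<&'a str, HashSet<&'a str>>,
) {
    candidates.retain(|label| {
        schemas
            .get(label)
            .map_or(false, |props| accessed.iter().any(|p| props.contains(*p)))
    });
}

fn main() {
    // WHERE n.p1 = 12 OR n.p2 = 13 — each branch is satisfied by a different type.
    let schemas: HashMap<&str, HashSet<&str>> = HashMap::from([
        ("TypeA", HashSet::from(["p1"])),
        ("TypeB", HashSet::from(["p2"])),
        ("TypeC", HashSet::from(["other"])),
    ]);
    let accessed: HashSet<&str> = HashSet::from(["p1", "p2"]);

    let mut candidates = vec!["TypeA", "TypeB", "TypeC"];
    retain_candidates(&mut candidates, &accessed, &schemas);
    // ANY keeps TypeA and TypeB; ALL-semantics would have dropped both,
    // incorrectly turning the OR query into an empty result.
    println!("{candidates:?}"); // prints ["TypeA", "TypeB"]
}
```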
```diff
@@ -1501,10 +1511,8 @@ impl TypeInference {
                 "🔍 No valid types for '{}' after constraints — query returns empty result",
                 var_name
             );
-            // No type satisfies the constraints for this variable, so the entire
-            // query pattern produces no rows. Return Empty immediately so that
-            // downstream passes (FilterTagging, rendering) produce a correct
-            // zero-row plan (SELECT 1 AS "_empty" WHERE false).
+            // No type has ANY of the accessed properties (or relationship constraints
+            // eliminated all candidates). The query can produce no rows.
             return Ok(Arc::new(LogicalPlan::Empty));
         }
```
```diff
@@ -4253,6 +4261,13 @@ fn extract_props_from_expr(expr: &LogicalExpr, accesses: &mut HashMap<String, Ha
         accesses.entry(var).or_default().insert(prop);
     }
     LogicalExpr::Operator(op) | LogicalExpr::OperatorApplicationExp(op) => {
+        // Skip IS NULL / IS NOT NULL — accessing a missing property returns NULL,
+        // so the property not existing in a type's schema is semantically valid.
+        // Including it would incorrectly eliminate types from the candidate set.
+        use crate::query_planner::logical_expr::Operator as Op;
+        if matches!(op.operator, Op::IsNull | Op::IsNotNull) {
+            return;
+        }
         for operand in &op.operands {
             extract_props_from_expr(operand, accesses);
         }
```
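The IS NULL exclusion can be sketched against a toy expression tree — this `Expr` enum and `extract_props` walker are stand-ins for the real `LogicalExpr` machinery, modeling only the shapes needed here:

```rust
use std::collections::{HashMap, HashSet};

// Toy expression tree: a property access, an IS NULL wrapper, and an OR node.
enum Expr {
    PropertyAccess { var: String, prop: String },
    IsNull(Box<Expr>),
    Or(Vec<Expr>),
}

// Collect `var -> accessed properties`, skipping anything under IS NULL:
// reading a missing property yields NULL rather than an error, so
// `n.p IS NULL` must not eliminate types that lack `p`.
fn extract_props(expr: &Expr, accesses: &mut HashMap<String, HashSet<String>>) {
    match expr {
        Expr::PropertyAccess { var, prop } => {
            accesses.entry(var.clone()).or_default().insert(prop.clone());
        }
        // IS NULL operands are excluded entirely — early return, no recursion.
        Expr::IsNull(_) => {}
        Expr::Or(operands) => {
            for operand in operands {
                extract_props(operand, accesses);
            }
        }
    }
}

fn main() {
    // WHERE n.p1 = 12 OR n.p2 IS NULL  (the equality is modeled as a bare access)
    let expr = Expr::Or(vec![
        Expr::PropertyAccess { var: "n".into(), prop: "p1".into() },
        Expr::IsNull(Box::new(Expr::PropertyAccess { var: "n".into(), prop: "p2".into() })),
    ]);
    let mut accesses = HashMap::new();
    extract_props(&expr, &mut accesses);
    // Only p1 is recorded; p2 sits under IS NULL and is skipped, so types
    // without p2 remain valid candidates.
    println!("{:?}", accesses["n"]);
}
```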
> **Review comment:** The README links to repo-absolute GitHub URLs (e.g. `https://github.com/.../blob/main/docs/motivation.md`). This breaks for forks, offline browsing, and non-`main` branches/tags. Prefer relative links like `docs/motivation.md` or `skills/README.md` so the README stays correct across branches and in published crate docs.