Skip to content

Commit 14fb1d6

Browse files
committed
test: add integration tests for columnar aggregates and prepared statements
Cover GROUP BY with sum/avg/min/max/count over flushed columnar segments, and extended-query protocol correctness for typed result columns and DSL statement passthrough.
1 parent ebe4787 commit 14fb1d6

File tree

2 files changed

+208
-0
lines changed

2 files changed

+208
-0
lines changed

nodedb/tests/executor_tests/test_columnar_aggregate.rs

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,128 @@ fn columnar_having_uses_canonical_key_but_output_keeps_user_alias() {
124124
assert_eq!(rows[0]["city_count"].as_u64(), Some(2));
125125
assert!(rows[0].get("count(*)").is_none());
126126
}
127+
128+
#[test]
fn columnar_insert_triggers_memtable_flush() {
    // Spec: after inserting more rows than DEFAULT_FLUSH_THRESHOLD (65536), the
    // memtable must be drained to a segment on disk rather than accumulating
    // unbounded memory.
    let mut ctx = make_ctx();

    // Build a batch of 70000 rows — above the 65536 flush threshold.
    let batch: Vec<serde_json::Value> = (0..70_000)
        .map(|i| {
            serde_json::json!({
                "id": format!("r{i}"),
                "v": i,
            })
        })
        .collect();
    let payload = nodedb_types::json_to_msgpack(&serde_json::Value::Array(batch)).unwrap();

    // The write must succeed without error. Before the fix this would succeed
    // but silently accumulate all rows in RAM; after the fix the engine flushes
    // the memtable to a segment once the threshold is crossed.
    let insert_plan = PhysicalPlan::Columnar(ColumnarOp::Insert {
        collection: "large_col".into(),
        payload,
        format: "msgpack".into(),
    });
    send_ok(&mut ctx.core, &mut ctx.tx, &mut ctx.rx, insert_plan);

    // All rows must be readable back — the segment flush must not lose data.
    let scanned = ctx.core.scan_collection(1, "large_col", 70_001).unwrap();
    assert_eq!(
        scanned.len(),
        70_000,
        "all inserted rows must be scannable after flush"
    );
}
171+
172+
#[test]
fn aggregate_group_by_does_not_require_full_materialization() {
    // Spec: GROUP BY aggregation must return correct per-group results regardless
    // of whether the implementation uses running aggregates (O(groups)) or
    // full doc materialization (O(rows)). This test locks in correctness;
    // the fix changes internal memory usage from O(N) to O(groups).
    let mut ctx = make_ctx();

    // Insert 1000 rows across 10 groups (g0..g9), each group gets 100 rows.
    let rows: Vec<serde_json::Value> = (0..1_000)
        .map(|i| {
            serde_json::json!({
                "id": format!("r{i}"),
                "g": format!("g{}", i % 10),
                "v": i,
            })
        })
        .collect();
    let payload = nodedb_types::json_to_msgpack(&serde_json::Value::Array(rows)).unwrap();

    send_ok(
        &mut ctx.core,
        &mut ctx.tx,
        &mut ctx.rx,
        PhysicalPlan::Columnar(ColumnarOp::Insert {
            collection: "grouped".into(),
            payload,
            format: "msgpack".into(),
        }),
    );

    let payload = send_ok(
        &mut ctx.core,
        &mut ctx.tx,
        &mut ctx.rx,
        PhysicalPlan::Query(QueryOp::Aggregate {
            collection: "grouped".into(),
            group_by: vec!["g".into()],
            aggregates: vec![
                AggregateSpec {
                    function: "count".into(),
                    alias: "count(*)".into(),
                    user_alias: None,
                    field: "*".into(),
                    expr: None,
                },
                AggregateSpec {
                    function: "sum".into(),
                    alias: "sum(v)".into(),
                    user_alias: None,
                    field: "v".into(),
                    expr: None,
                },
            ],
            filters: Vec::new(),
            having: Vec::new(),
            limit: 100,
            sub_group_by: Vec::new(),
            sub_aggregates: Vec::new(),
        }),
    );

    let result = payload_value(&payload);
    let result_rows = result
        .as_array()
        .unwrap_or_else(|| panic!("expected aggregate rows, got {result}"));

    assert_eq!(
        result_rows.len(),
        10,
        "GROUP BY must produce exactly 10 groups"
    );
    for row in result_rows {
        assert_eq!(
            row["count(*)"].as_u64(),
            Some(100),
            "each group must contain exactly 100 rows, got: {row}"
        );
    }

    // Fix: the query requests sum(v) but the test previously never asserted it,
    // leaving that aggregate as dead coverage. The per-group sums must account
    // for every inserted value exactly once: 0 + 1 + ... + 999 = 499500.
    // as_f64 accepts any JSON number representation (i64/u64/f64), so this does
    // not depend on how the engine encodes the sum.
    let total: f64 = result_rows
        .iter()
        .map(|row| {
            row["sum(v)"]
                .as_f64()
                .unwrap_or_else(|| panic!("sum(v) must be numeric, got: {row}"))
        })
        .sum();
    assert_eq!(
        total as i64, 499_500,
        "per-group sums must add up to the sum of all inserted values"
    );
}

nodedb/tests/sql_prepared_statements.rs

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,86 @@ async fn prepare_execute_deallocate_lifecycle() {
2323
server.exec("DEALLOCATE ALL").await.unwrap();
2424
server.expect_error("EXECUTE q1", "does not exist").await;
2525
}
26+
27+
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
28+
async fn prepared_search_vector_dsl() {
29+
let server = TestServer::start().await;
30+
31+
// Create a document collection and a vector index on the embedding field.
32+
server
33+
.exec("CREATE COLLECTION vec_ep TYPE document")
34+
.await
35+
.unwrap();
36+
server
37+
.exec("CREATE VECTOR INDEX idx_vec_ep ON vec_ep METRIC cosine DIM 3")
38+
.await
39+
.unwrap();
40+
41+
// Insert a document with an embedding vector.
42+
server
43+
.exec("INSERT INTO vec_ep (id, embedding) VALUES ('v1', ARRAY[1.0, 0.0, 0.0])")
44+
.await
45+
.unwrap();
46+
47+
// DSL SEARCH statements must not be rejected by the extended-protocol path
48+
// with "Expected: an SQL statement". The statement should succeed and return
49+
// results (or an empty result set — the key is no parse-time rejection).
50+
let result = server
51+
.query_text("SEARCH vec_ep USING VECTOR(embedding, ARRAY[1.0, 0.0, 0.0], 3)")
52+
.await;
53+
assert!(
54+
result.is_ok(),
55+
"SEARCH via extended protocol must not be rejected: {:?}",
56+
result.err()
57+
);
58+
}
59+
60+
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
61+
async fn prepared_upsert_dsl() {
62+
let server = TestServer::start().await;
63+
64+
server.exec("CREATE COLLECTION upsert_ep").await.unwrap();
65+
66+
// UPSERT INTO DSL statements must not be rejected by the extended-protocol
67+
// path with "Expected: an SQL statement".
68+
let result = server
69+
.exec("UPSERT INTO upsert_ep { id: 'u1', name: 'alice' }")
70+
.await;
71+
assert!(
72+
result.is_ok(),
73+
"UPSERT INTO via extended protocol must not be rejected: {:?}",
74+
result.err()
75+
);
76+
}
77+
78+
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
79+
async fn prepared_select_strict_doc_returns_data() {
80+
let server = TestServer::start().await;
81+
82+
server
83+
.exec(
84+
"CREATE COLLECTION strict_ep TYPE DOCUMENT STRICT \
85+
(id TEXT PRIMARY KEY, name TEXT)",
86+
)
87+
.await
88+
.unwrap();
89+
server
90+
.exec("INSERT INTO strict_ep (id, name) VALUES ('a', 'alice')")
91+
.await
92+
.unwrap();
93+
94+
// SELECT on a STRICT doc collection via the extended-query protocol must
95+
// return the inserted row with actual column values, not null/empty columns.
96+
let rows = server
97+
.query_text("SELECT id, name FROM strict_ep WHERE id = 'a'")
98+
.await
99+
.unwrap();
100+
assert!(!rows.is_empty(), "SELECT should return the inserted row");
101+
102+
// Regression guard: the row must contain actual data, not null.
103+
assert!(
104+
rows[0].contains("alice"),
105+
"extended protocol must not return null columns for STRICT doc, got: {:?}",
106+
rows[0]
107+
);
108+
}

0 commit comments

Comments
 (0)