Skip to content

Commit e5901c5

Browse files
authored
Merge pull request #27 from NodeDB-Lab/fix/vector-index-ddl-quantization
fix(pgwire): wire quantization params through CREATE/ALTER VECTOR INDEX DDL
2 parents a02a387 + 298785e commit e5901c5

File tree

16 files changed

+994
-673
lines changed

16 files changed

+994
-673
lines changed

Cargo.toml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -235,11 +235,3 @@ incremental = false
235235
inherits = "dev"
236236
debug = true
237237

238-
# core2 0.4.0 is yanked on crates.io but no 0.4.1+ exists, and libflate /
239-
# libflate_lz77 (pulled in via the `jieba` feature of nodedb-fts →
240-
# include-flate → include-flate-compress) still require `core2 ^0.4`.
241-
# Point cargo at the upstream git source so fresh resolution (CI without
242-
# a committed lockfile) succeeds — git sources bypass the registry's yank
243-
# check. Safe to remove once libflate releases a bump off core2.
244-
[patch.crates-io]
245-
core2 = { git = "https://github.com/technocreatives/core2", rev = "545e84bcb0f235b12e21351e0c69767958efe2a7" }

nodedb-fts/Cargo.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ homepage.workspace = true
1212
[features]
1313
default = []
1414
lang-ja = ["dep:lindera"]
15-
lang-zh = ["dep:jieba-rs"]
15+
# lang-zh currently falls back to CJK bigrams. The previous dictionary-based
16+
# implementation pulled jieba-rs → include-flate → libflate → yanked core2,
17+
# which breaks `cargo publish`. Re-enable once the upstream chain is fixed.
18+
lang-zh = []
1619
lang-ko = ["dep:lindera"]
1720
lang-th = ["dep:icu_segmenter"]
1821
lang-detect = ["dep:whatlang"]
@@ -29,7 +32,6 @@ thiserror = { workspace = true }
2932

3033
# Optional: dictionary-based CJK segmentation
3134
lindera = { version = "2.3", optional = true }
32-
jieba-rs = { version = "0.9", optional = true }
3335
icu_segmenter = { version = "1", optional = true }
3436
whatlang = { version = "0.18", optional = true }
3537

nodedb-fts/src/analyzer/language/cjk/segmenter.rs

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
//!
77
//! Feature gates:
88
//! - `lang-ja`: lindera with IPADIC for Japanese
9-
//! - `lang-zh`: jieba-rs for Chinese
9+
//! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml)
1010
//! - `lang-ko`: lindera with ko-dic for Korean
1111
//! - `lang-th`: icu_segmenter for Thai
1212
@@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> {
3737
}
3838
}
3939

40-
/// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise.
40+
/// Chinese segmentation: CJK bigrams (dictionary segmentation temporarily disabled).
4141
fn segment_chinese(text: &str) -> Vec<String> {
42-
#[cfg(feature = "lang-zh")]
43-
{
44-
jieba_segment(text)
45-
}
46-
#[cfg(not(feature = "lang-zh"))]
47-
{
48-
tokenize_cjk(text)
49-
}
42+
tokenize_cjk(text)
5043
}
5144

5245
/// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise.
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
9285
.collect()
9386
}
9487

95-
#[cfg(feature = "lang-zh")]
96-
fn jieba_segment(text: &str) -> Vec<String> {
97-
use jieba_rs::Jieba;
98-
let jieba = Jieba::new();
99-
jieba
100-
.cut(text, false)
101-
.into_iter()
102-
.map(|s| s.to_string())
103-
.filter(|s| !s.trim().is_empty())
104-
.collect()
105-
}
106-
10788
#[cfg(feature = "lang-th")]
10889
fn icu_segment_thai(text: &str) -> Vec<String> {
10990
use icu_segmenter::WordSegmenter;
@@ -124,21 +105,11 @@ mod tests {
124105
use super::*;
125106

126107
#[test]
127-
#[cfg(not(feature = "lang-zh"))]
128-
fn fallback_to_bigrams_chinese() {
129-
// Without lang-zh feature, should use CJK bigrams.
108+
fn bigrams_chinese() {
130109
let tokens = segment("全文検索", "zh");
131110
assert_eq!(tokens, vec!["全文", "文検", "検索"]);
132111
}
133112

134-
#[test]
135-
#[cfg(feature = "lang-zh")]
136-
fn dictionary_segmentation_chinese() {
137-
// With lang-zh feature, jieba produces dictionary-based tokens.
138-
let tokens = segment("全文検索", "zh");
139-
assert!(!tokens.is_empty());
140-
}
141-
142113
#[test]
143114
#[cfg(not(feature = "lang-ja"))]
144115
fn fallback_to_bigrams_japanese() {

0 commit comments

Comments (0)