66//!
77//! Feature gates:
88//! - `lang-ja`: lindera with IPADIC for Japanese
9- //! - `lang-zh`: jieba-rs for Chinese
9+ //! - `lang-zh`: currently falls back to CJK bigrams (see Cargo.toml)
1010//! - `lang-ko`: lindera with ko-dic for Korean
1111//! - `lang-th`: icu_segmenter for Thai
1212
@@ -37,16 +37,9 @@ fn segment_japanese(text: &str) -> Vec<String> {
3737 }
3838}
3939
40- /// Chinese segmentation: jieba when `lang-zh` is enabled, bigrams otherwise .
40+ /// Chinese segmentation: always CJK bigrams (dictionary segmentation is temporarily disabled, even when `lang-zh` is enabled).
4141fn segment_chinese ( text : & str ) -> Vec < String > {
42- #[ cfg( feature = "lang-zh" ) ]
43- {
44- jieba_segment ( text)
45- }
46- #[ cfg( not( feature = "lang-zh" ) ) ]
47- {
48- tokenize_cjk ( text)
49- }
42+ tokenize_cjk ( text)
5043}
5144
5245/// Korean segmentation: lindera/ko-dic when `lang-ko` is enabled, bigrams otherwise.
@@ -92,18 +85,6 @@ fn lindera_segment(text: &str, _dict: &str) -> Vec<String> {
9285 . collect ( )
9386}
9487
95- #[ cfg( feature = "lang-zh" ) ]
96- fn jieba_segment ( text : & str ) -> Vec < String > {
97- use jieba_rs:: Jieba ;
98- let jieba = Jieba :: new ( ) ;
99- jieba
100- . cut ( text, false )
101- . into_iter ( )
102- . map ( |s| s. to_string ( ) )
103- . filter ( |s| !s. trim ( ) . is_empty ( ) )
104- . collect ( )
105- }
106-
10788#[ cfg( feature = "lang-th" ) ]
10889fn icu_segment_thai ( text : & str ) -> Vec < String > {
10990 use icu_segmenter:: WordSegmenter ;
@@ -124,21 +105,11 @@ mod tests {
124105 use super :: * ;
125106
126107 #[ test]
127- #[ cfg( not( feature = "lang-zh" ) ) ]
128- fn fallback_to_bigrams_chinese ( ) {
129- // Without lang-zh feature, should use CJK bigrams.
108+ fn bigrams_chinese ( ) {
130109 let tokens = segment ( "全文検索" , "zh" ) ;
131110 assert_eq ! ( tokens, vec![ "全文" , "文検" , "検索" ] ) ;
132111 }
133112
134- #[ test]
135- #[ cfg( feature = "lang-zh" ) ]
136- fn dictionary_segmentation_chinese ( ) {
137- // With lang-zh feature, jieba produces dictionary-based tokens.
138- let tokens = segment ( "全文検索" , "zh" ) ;
139- assert ! ( !tokens. is_empty( ) ) ;
140- }
141-
142113 #[ test]
143114 #[ cfg( not( feature = "lang-ja" ) ) ]
144115 fn fallback_to_bigrams_japanese ( ) {
0 commit comments