Skip to content

Commit 2d5a6ce

Browse files
Merge pull request #129 from contour-terminal/feature/issue-6-word-segmentation
word: implement UAX #29 Word Boundary segmentation (fixes #6)
2 parents d6e1207 + 44fec06 commit 2d5a6ce

File tree

11 files changed: +834 -122 lines changed

.github/workflows/build.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -169,9 +169,9 @@ jobs:
169169
PREPARE_ONLY_EMBEDS=OFF SYSDEP_ASSUME_YES=ON ./scripts/install-deps.sh
170170
dnf install -y unicode-ucd
171171
- name: configure
172-
run: cmake --preset gcc-debug -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd
172+
run: cmake --preset gcc-release -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd
173173
- name: build
174-
run: cmake --build --preset gcc-debug -j$(nproc)
174+
run: cmake --build --preset gcc-release -j$(nproc)
175175
- name: test
176176
run: |
177-
ctest --preset gcc-debug
177+
ctest --preset gcc-release

scripts/install-deps.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,11 +37,11 @@ fetch_and_unpack()
3737
FULL_DISTFILE="$SYSDEPS_DIST_DIR/$DISTFILE"
3838

3939
if ! test -f "$FULL_DISTFILE"; then
40-
if which curl &>/dev/null; then
40+
if command -v curl &>/dev/null; then
4141
curl -L -o "$FULL_DISTFILE" "$URL"
42-
elif which wget &>/dev/null; then
42+
elif command -v wget &>/dev/null; then
4343
wget -O "$FULL_DISTFILE" "$URL"
44-
elif which fetch &>/dev/null; then
44+
elif command -v fetch &>/dev/null; then
4545
# FreeBSD
4646
fetch -o "$FULL_DISTFILE" "$URL"
4747
else

src/libunicode/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -160,6 +160,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
160160
emoji_segmenter.cpp
161161
grapheme_segmenter.cpp
162162
normalization.cpp
163+
word_segmenter.cpp
163164
scan.cpp
164165
script_segmenter.cpp
165166
utf8.cpp
@@ -323,6 +324,7 @@ if(LIBUNICODE_TESTING)
323324
endif()
324325

325326
target_link_libraries(unicode_test unicode Catch2::Catch2WithMain)
327+
target_compile_definitions(unicode_test PRIVATE LIBUNICODE_UCD_DIR="${LIBUNICODE_UCD_DIR}")
326328
add_test(unicode_test unicode_test)
327329
endif()
328330
# }}}

src/libunicode/case_mapping.cpp

Lines changed: 34 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -223,42 +223,42 @@ std::u32string to_titlecase(std::u32string_view text)
223223
std::u32string result;
224224
result.reserve(text.size());
225225

226-
bool at_word_start = true;
227-
228-
for (char32_t cp: text)
226+
auto seg = word_segmenter(text);
227+
while (true)
229228
{
230-
// Simple word boundary detection: after space/punctuation
231-
bool const is_letter = is_cased(cp);
229+
auto const word = *seg;
230+
if (word.empty() && !seg.codepointsAvailable())
231+
break;
232232

233-
if (at_word_start && is_letter)
233+
auto firstCased = true;
234+
for (auto const cp: word)
234235
{
235-
// Titlecase the first letter of a word
236-
auto const mapping = full_titlecase(cp);
237-
if (mapping.is_identity())
238-
result.push_back(cp);
239-
else
240-
result.append(mapping.view());
241-
at_word_start = false;
242-
}
243-
else if (is_letter)
244-
{
245-
// Lowercase the rest of the word
246-
auto const mapping = full_lowercase(cp);
247-
if (mapping.is_identity())
248-
result.push_back(cp);
236+
if (firstCased && is_cased(cp))
237+
{
238+
auto const mapping = full_titlecase(cp);
239+
if (mapping.is_identity())
240+
result.push_back(cp);
241+
else
242+
result.append(mapping.view());
243+
firstCased = false;
244+
}
245+
else if (!firstCased && is_cased(cp))
246+
{
247+
auto const mapping = full_lowercase(cp);
248+
if (mapping.is_identity())
249+
result.push_back(cp);
250+
else
251+
result.append(mapping.view());
252+
}
249253
else
250-
result.append(mapping.view());
251-
}
252-
else
253-
{
254-
result.push_back(cp);
255-
// Check if this is a word boundary
256-
if (cp == ' ' || cp == '\t' || cp == '\n' || cp == '\r' || general_category::is_dash_punctuation(cp)
257-
|| general_category::is_open_punctuation(cp) || general_category::is_close_punctuation(cp))
258254
{
259-
at_word_start = true;
255+
result.push_back(cp);
260256
}
261257
}
258+
259+
if (!seg.codepointsAvailable())
260+
break;
261+
++seg;
262262
}
263263

264264
return result;
@@ -357,9 +357,12 @@ bool is_case_ignorable(char32_t codepoint) noexcept
357357
// Case_Ignorable includes:
358358
// - General_Category = Mn, Me, Cf, Lm, Sk
359359
// - Word_Break = MidLetter, MidNumLet, Single_Quote
360-
auto const gc = general_category::get(codepoint);
360+
auto const props = codepoint_properties::get(codepoint);
361+
auto const gc = props.general_category;
362+
auto const wb = props.word_break;
361363
return gc == General_Category::Nonspacing_Mark || gc == General_Category::Enclosing_Mark || gc == General_Category::Format
362-
|| gc == General_Category::Modifier_Letter || gc == General_Category::Modifier_Symbol;
364+
|| gc == General_Category::Modifier_Letter || gc == General_Category::Modifier_Symbol || wb == Word_Break::MidLetter
365+
|| wb == Word_Break::MidNumLet || wb == Word_Break::Single_Quote;
363366
}
364367

365368
bool changes_when_uppercased(char32_t codepoint) noexcept

src/libunicode/codepoint_properties.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ struct LIBUNICODE_PACKED codepoint_properties
3434
EmojiSegmentationCategory emoji_segmentation_category = EmojiSegmentationCategory::Invalid;
3535
Age age = Age::Unassigned;
3636
Indic_Conjunct_Break indic_conjunct_break = Indic_Conjunct_Break::None;
37+
Word_Break word_break = Word_Break::Other;
3738

3839
static uint8_t constexpr FlagEmoji = 0x01; // NOLINT(readability-identifier-naming)
3940
static uint8_t constexpr FlagEmojiPresentation = 0x02; // NOLINT(readability-identifier-naming)

src/libunicode/tablegen/multistage_generator.cpp

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -80,10 +80,11 @@ namespace
8080
int8_t emoji_segmentation_category = ESC_Invalid;
8181
uint8_t age = 0;
8282
uint8_t indic_conjunct_break = 3; // Default: Indic_Conjunct_Break::None
83+
uint8_t word_break = 18; // Default: Word_Break::Other
8384
};
8485
#pragma pack(pop)
8586

86-
static_assert(sizeof(CodepointRecord) == 9, "CodepointRecord must be exactly 9 bytes");
87+
static_assert(sizeof(CodepointRecord) == 10, "CodepointRecord must be exactly 10 bytes");
8788

8889
inline bool operator==(CodepointRecord const& a, CodepointRecord const& b) noexcept
8990
{
@@ -238,17 +239,18 @@ namespace
238239
}
239240

240241
/// Reverse lookup: index -> name for a PVA-based enum.
242+
/// Prefers the longest matching name to avoid returning abbreviations.
241243
std::string reverseLookup(std::map<std::string, uint8_t> const& index,
242244
uint8_t value,
243245
std::string const& defaultName = "Unknown")
244246
{
247+
std::string bestName;
245248
for (auto const& [name, idx]: index)
246249
{
247-
// Skip abbreviation keys (single letters or 2-letter) — only match full names
248-
if (idx == value && name.size() > 2)
249-
return name;
250+
if (idx == value && name.size() > bestName.size())
251+
bestName = name;
250252
}
251-
return defaultName;
253+
return bestName.empty() ? defaultName : bestName;
252254
}
253255

254256
std::string_view escName(int8_t idx)
@@ -341,6 +343,7 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
341343
auto const ageIndex = buildAgeIndex(findPva("Age"));
342344
auto const gcbIndex = buildPvaBasedIndex(findPva("Grapheme_Cluster_Break"), "Undefined");
343345
auto const incbIndex = buildPvaBasedIndex(findPva("Indic_Conjunct_Break"));
346+
auto const wbIndex = buildPvaBasedIndex(findPva("Word_Break"));
344347

345348
// Name vectors for output
346349
auto const scriptNames = buildScriptNames(parser.scripts());
@@ -388,13 +391,15 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
388391
// General_Category::Unassigned is a member name in the GC enum
389392
auto gcUnassigned = gcIndex.count("Unassigned") ? gcIndex.at("Unassigned") : uint8_t(0);
390393
auto incbNone = incbIndex.count("None") ? incbIndex.at("None") : uint8_t(3);
394+
auto wbOther = wbIndex.count("Other") ? wbIndex.at("Other") : uint8_t(18);
391395
for (auto& rec: records)
392396
{
393397
rec.script = scriptUnknown;
394398
rec.grapheme_cluster_break = gcbOther;
395399
rec.east_asian_width = eawNarrow;
396400
rec.general_category = gcUnassigned;
397401
rec.indic_conjunct_break = incbNone;
402+
rec.word_break = wbOther;
398403
}
399404
}
400405

@@ -481,6 +486,19 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
481486
records[static_cast<size_t>(cp)].indic_conjunct_break = it->second;
482487
}
483488

489+
// Word Break
490+
for (auto const& [propName, ranges]: parser.wordBreakProps())
491+
{
492+
for (auto const& r: ranges)
493+
{
494+
auto it = wbIndex.find(r.property);
495+
if (it == wbIndex.end())
496+
continue;
497+
for (auto cp = r.first; cp <= r.last; ++cp)
498+
records[static_cast<size_t>(cp)].word_break = it->second;
499+
}
500+
}
501+
484502
// East Asian Width
485503
for (auto const& r: parser.eastAsianWidths())
486504
{
@@ -615,7 +633,8 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
615633
<< ", "
616634
<< "EmojiSegmentationCategory::" << escName(rec.emoji_segmentation_category) << ", "
617635
<< "Age::" << (rec.age < ageNames.size() ? ageNames[rec.age] : "Unassigned") << ", "
618-
<< "Indic_Conjunct_Break::" << reverseLookup(incbIndex, rec.indic_conjunct_break, "None") << "},\n";
636+
<< "Indic_Conjunct_Break::" << reverseLookup(incbIndex, rec.indic_conjunct_break, "None") << ", "
637+
<< "Word_Break::" << reverseLookup(wbIndex, rec.word_break, "Other") << "},\n";
619638
}
620639
impl << "}};\n\n";
621640

src/libunicode/tablegen/ucd_parser.cpp

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -193,6 +193,7 @@ void UcdParser::parseAll()
193193

194194
// Phase 2: UCD API files
195195
loadGraphemeBreakProps();
196+
loadWordBreakProps();
196197
loadEastAsianWidths();
197198
loadHangulSyllableType();
198199
loadEmojiProps();
@@ -437,6 +438,13 @@ void UcdParser::loadGraphemeBreakProps()
437438
_graphemeBreakProps = loadGroupedProperties(_ucdDir + "/auxiliary/GraphemeBreakProperty.txt", "Property");
438439
}
439440

441+
// ---- Word Break Properties ----
442+
443+
void UcdParser::loadWordBreakProps()
444+
{
445+
_wordBreakProps = loadGroupedProperties(_ucdDir + "/auxiliary/WordBreakProperty.txt", "Property");
446+
}
447+
440448
// ---- East Asian Widths ----
441449

442450
void UcdParser::loadEastAsianWidths()

src/libunicode/tablegen/ucd_parser.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,9 @@ class UcdParser
119119
/// Grapheme cluster break properties grouped by property name.
120120
[[nodiscard]] auto const& graphemeBreakProps() const noexcept { return _graphemeBreakProps; }
121121

122+
/// Word break properties grouped by property name.
123+
[[nodiscard]] auto const& wordBreakProps() const noexcept { return _wordBreakProps; }
124+
122125
/// East Asian Width ranges, sorted by start codepoint.
123126
[[nodiscard]] auto const& eastAsianWidths() const noexcept { return _eastAsianWidths; }
124127

@@ -176,6 +179,7 @@ class UcdParser
176179
void loadScriptExtensions();
177180
void loadBlocks();
178181
void loadGraphemeBreakProps();
182+
void loadWordBreakProps();
179183
void loadEastAsianWidths();
180184
void loadHangulSyllableType();
181185
void loadEmojiProps();
@@ -230,6 +234,9 @@ class UcdParser
230234
// Grapheme break props
231235
std::map<std::string, std::vector<PropertyRange>> _graphemeBreakProps;
232236

237+
// Word break props
238+
std::map<std::string, std::vector<PropertyRange>> _wordBreakProps;
239+
233240
// East Asian Width
234241
std::vector<PropertyRange> _eastAsianWidths;
235242

0 commit comments

Comments
 (0)