Merge pull request #129 from contour-terminal/feature/issue-6-word-segmentation

christianparpart · web-flow · commit 2d5a6ce62748 · 2026-04-08T18:30:21.000-04:00
word: implement UAX #29 Word Boundary segmentation (fixes #6)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -169,9 +169,9 @@ jobs:
           PREPARE_ONLY_EMBEDS=OFF SYSDEP_ASSUME_YES=ON ./scripts/install-deps.sh
           dnf install -y unicode-ucd
       - name: configure
-        run: cmake --preset gcc-debug -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd
+        run: cmake --preset gcc-release -DLIBUNICODE_UCD_DIR=/usr/share/unicode/ucd
       - name: build
-        run: cmake --build --preset gcc-debug -j$(nproc)
+        run: cmake --build --preset gcc-release -j$(nproc)
       - name: test
         run: |
-          ctest --preset gcc-debug
+          ctest --preset gcc-release
diff --git a/scripts/install-deps.sh b/scripts/install-deps.sh
@@ -37,11 +37,11 @@ fetch_and_unpack()
     FULL_DISTFILE="$SYSDEPS_DIST_DIR/$DISTFILE"
 
     if ! test -f "$FULL_DISTFILE"; then
-        if which curl &>/dev/null; then
+        if command -v curl &>/dev/null; then
             curl -L -o "$FULL_DISTFILE" "$URL"
-        elif which wget &>/dev/null; then
+        elif command -v wget &>/dev/null; then
             wget -O "$FULL_DISTFILE" "$URL"
-        elif which fetch &>/dev/null; then
+        elif command -v fetch &>/dev/null; then
             # FreeBSD
             fetch -o "$FULL_DISTFILE" "$URL"
         else
diff --git a/src/libunicode/CMakeLists.txt b/src/libunicode/CMakeLists.txt
@@ -160,6 +160,7 @@ add_library(unicode ${LIBUNICODE_LIB_MODE}
     emoji_segmenter.cpp
     grapheme_segmenter.cpp
     normalization.cpp
+    word_segmenter.cpp
     scan.cpp
     script_segmenter.cpp
     utf8.cpp
@@ -323,6 +324,7 @@ if(LIBUNICODE_TESTING)
     endif()
 
     target_link_libraries(unicode_test unicode Catch2::Catch2WithMain)
+    target_compile_definitions(unicode_test PRIVATE LIBUNICODE_UCD_DIR="${LIBUNICODE_UCD_DIR}")
     add_test(unicode_test unicode_test)
 endif()
 # }}}
diff --git a/src/libunicode/case_mapping.cpp b/src/libunicode/case_mapping.cpp
@@ -223,42 +223,42 @@ std::u32string to_titlecase(std::u32string_view text)
     std::u32string result;
     result.reserve(text.size());
 
-    bool at_word_start = true;
-
-    for (char32_t cp: text)
+    auto seg = word_segmenter(text);
+    while (true)
     {
-        // Simple word boundary detection: after space/punctuation
-        bool const is_letter = is_cased(cp);
+        auto const word = *seg;
+        if (word.empty() && !seg.codepointsAvailable())
+            break;
 
-        if (at_word_start && is_letter)
+        auto firstCased = true;
+        for (auto const cp: word)
         {
-            // Titlecase the first letter of a word
-            auto const mapping = full_titlecase(cp);
-            if (mapping.is_identity())
-                result.push_back(cp);
-            else
-                result.append(mapping.view());
-            at_word_start = false;
-        }
-        else if (is_letter)
-        {
-            // Lowercase the rest of the word
-            auto const mapping = full_lowercase(cp);
-            if (mapping.is_identity())
-                result.push_back(cp);
+            if (firstCased && is_cased(cp))
+            {
+                auto const mapping = full_titlecase(cp);
+                if (mapping.is_identity())
+                    result.push_back(cp);
+                else
+                    result.append(mapping.view());
+                firstCased = false;
+            }
+            else if (!firstCased && is_cased(cp))
+            {
+                auto const mapping = full_lowercase(cp);
+                if (mapping.is_identity())
+                    result.push_back(cp);
+                else
+                    result.append(mapping.view());
+            }
             else
-                result.append(mapping.view());
-        }
-        else
-        {
-            result.push_back(cp);
-            // Check if this is a word boundary
-            if (cp == ' ' || cp == '\t' || cp == '\n' || cp == '\r' || general_category::is_dash_punctuation(cp)
-                || general_category::is_open_punctuation(cp) || general_category::is_close_punctuation(cp))
             {
-                at_word_start = true;
+                result.push_back(cp);
             }
         }
+
+        if (!seg.codepointsAvailable())
+            break;
+        ++seg;
     }
 
     return result;
@@ -357,9 +357,12 @@ bool is_case_ignorable(char32_t codepoint) noexcept
     // Case_Ignorable includes:
     // - General_Category = Mn, Me, Cf, Lm, Sk
     // - Word_Break = MidLetter, MidNumLet, Single_Quote
-    auto const gc = general_category::get(codepoint);
+    auto const props = codepoint_properties::get(codepoint);
+    auto const gc = props.general_category;
+    auto const wb = props.word_break;
     return gc == General_Category::Nonspacing_Mark || gc == General_Category::Enclosing_Mark || gc == General_Category::Format
-           || gc == General_Category::Modifier_Letter || gc == General_Category::Modifier_Symbol;
+           || gc == General_Category::Modifier_Letter || gc == General_Category::Modifier_Symbol || wb == Word_Break::MidLetter
+           || wb == Word_Break::MidNumLet || wb == Word_Break::Single_Quote;
 }
 
 bool changes_when_uppercased(char32_t codepoint) noexcept
diff --git a/src/libunicode/codepoint_properties.h b/src/libunicode/codepoint_properties.h
@@ -34,6 +34,7 @@ struct LIBUNICODE_PACKED codepoint_properties
     EmojiSegmentationCategory emoji_segmentation_category = EmojiSegmentationCategory::Invalid;
     Age age = Age::Unassigned;
     Indic_Conjunct_Break indic_conjunct_break = Indic_Conjunct_Break::None;
+    Word_Break word_break = Word_Break::Other;
 
     static uint8_t constexpr FlagEmoji = 0x01;                // NOLINT(readability-identifier-naming)
     static uint8_t constexpr FlagEmojiPresentation = 0x02;    // NOLINT(readability-identifier-naming)
diff --git a/src/libunicode/tablegen/multistage_generator.cpp b/src/libunicode/tablegen/multistage_generator.cpp
@@ -80,10 +80,11 @@ namespace
         int8_t emoji_segmentation_category = ESC_Invalid;
         uint8_t age = 0;
         uint8_t indic_conjunct_break = 3; // Default: Indic_Conjunct_Break::None
+        uint8_t word_break = 18;          // Default: Word_Break::Other
     };
 #pragma pack(pop)
 
-    static_assert(sizeof(CodepointRecord) == 9, "CodepointRecord must be exactly 9 bytes");
+    static_assert(sizeof(CodepointRecord) == 10, "CodepointRecord must be exactly 10 bytes");
 
     inline bool operator==(CodepointRecord const& a, CodepointRecord const& b) noexcept
     {
@@ -238,17 +239,18 @@ namespace
     }
 
     /// Reverse lookup: index -> name for a PVA-based enum.
+    /// Picks the longest name mapped to `value` (so abbreviations lose); ties keep the first in map order, no match yields `defaultName`.
     std::string reverseLookup(std::map<std::string, uint8_t> const& index,
                               uint8_t value,
                               std::string const& defaultName = "Unknown")
     {
+        std::string bestName;
         for (auto const& [name, idx]: index)
         {
-            // Skip abbreviation keys (single letters or 2-letter) — only match full names
-            if (idx == value && name.size() > 2)
-                return name;
+            if (idx == value && name.size() > bestName.size())
+                bestName = name;
         }
-        return defaultName;
+        return bestName.empty() ? defaultName : bestName;
     }
 
     std::string_view escName(int8_t idx)
@@ -341,6 +343,7 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
     auto const ageIndex = buildAgeIndex(findPva("Age"));
     auto const gcbIndex = buildPvaBasedIndex(findPva("Grapheme_Cluster_Break"), "Undefined");
     auto const incbIndex = buildPvaBasedIndex(findPva("Indic_Conjunct_Break"));
+    auto const wbIndex = buildPvaBasedIndex(findPva("Word_Break"));
 
     // Name vectors for output
     auto const scriptNames = buildScriptNames(parser.scripts());
@@ -388,13 +391,15 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
         // General_Category::Unassigned is a member name in the GC enum
         auto gcUnassigned = gcIndex.count("Unassigned") ? gcIndex.at("Unassigned") : uint8_t(0);
         auto incbNone = incbIndex.count("None") ? incbIndex.at("None") : uint8_t(3);
+        auto wbOther = wbIndex.count("Other") ? wbIndex.at("Other") : uint8_t(18);
         for (auto& rec: records)
         {
             rec.script = scriptUnknown;
             rec.grapheme_cluster_break = gcbOther;
             rec.east_asian_width = eawNarrow;
             rec.general_category = gcUnassigned;
             rec.indic_conjunct_break = incbNone;
+            rec.word_break = wbOther;
         }
     }
 
@@ -481,6 +486,19 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
                 records[static_cast<size_t>(cp)].indic_conjunct_break = it->second;
     }
 
+    // Word Break
+    for (auto const& [propName, ranges]: parser.wordBreakProps())
+    {
+        for (auto const& r: ranges)
+        {
+            auto it = wbIndex.find(r.property);
+            if (it == wbIndex.end())
+                continue;
+            for (auto cp = r.first; cp <= r.last; ++cp)
+                records[static_cast<size_t>(cp)].word_break = it->second;
+        }
+    }
+
     // East Asian Width
     for (auto const& r: parser.eastAsianWidths())
     {
@@ -615,7 +633,8 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
              << ", "
              << "EmojiSegmentationCategory::" << escName(rec.emoji_segmentation_category) << ", "
              << "Age::" << (rec.age < ageNames.size() ? ageNames[rec.age] : "Unassigned") << ", "
-             << "Indic_Conjunct_Break::" << reverseLookup(incbIndex, rec.indic_conjunct_break, "None") << "},\n";
+             << "Indic_Conjunct_Break::" << reverseLookup(incbIndex, rec.indic_conjunct_break, "None") << ", "
+             << "Word_Break::" << reverseLookup(wbIndex, rec.word_break, "Other") << "},\n";
     }
     impl << "}};\n\n";
 
diff --git a/src/libunicode/tablegen/ucd_parser.cpp b/src/libunicode/tablegen/ucd_parser.cpp
@@ -193,6 +193,7 @@ void UcdParser::parseAll()
 
     // Phase 2: UCD API files
     loadGraphemeBreakProps();
+    loadWordBreakProps();
     loadEastAsianWidths();
     loadHangulSyllableType();
     loadEmojiProps();
@@ -437,6 +438,13 @@ void UcdParser::loadGraphemeBreakProps()
     _graphemeBreakProps = loadGroupedProperties(_ucdDir + "/auxiliary/GraphemeBreakProperty.txt", "Property");
 }
 
+// ---- Word Break Properties ----
+
+void UcdParser::loadWordBreakProps()
+{
+    _wordBreakProps = loadGroupedProperties(_ucdDir + "/auxiliary/WordBreakProperty.txt", "Property");
+}
+
 // ---- East Asian Widths ----
 
 void UcdParser::loadEastAsianWidths()
diff --git a/src/libunicode/tablegen/ucd_parser.h b/src/libunicode/tablegen/ucd_parser.h
@@ -119,6 +119,9 @@ class UcdParser
     /// Grapheme cluster break properties grouped by property name.
     [[nodiscard]] auto const& graphemeBreakProps() const noexcept { return _graphemeBreakProps; }
 
+    /// Word break properties grouped by property name.
+    [[nodiscard]] auto const& wordBreakProps() const noexcept { return _wordBreakProps; }
+
     /// East Asian Width ranges, sorted by start codepoint.
     [[nodiscard]] auto const& eastAsianWidths() const noexcept { return _eastAsianWidths; }
 
@@ -176,6 +179,7 @@ class UcdParser
     void loadScriptExtensions();
     void loadBlocks();
     void loadGraphemeBreakProps();
+    void loadWordBreakProps();
     void loadEastAsianWidths();
     void loadHangulSyllableType();
     void loadEmojiProps();
@@ -230,6 +234,9 @@ class UcdParser
     // Grapheme break props
     std::map<std::string, std::vector<PropertyRange>> _graphemeBreakProps;
 
+    // Word break props
+    std::map<std::string, std::vector<PropertyRange>> _wordBreakProps;
+
     // East Asian Width
     std::vector<PropertyRange> _eastAsianWidths;
 
diff --git a/src/libunicode/word_segmenter.cpp b/src/libunicode/word_segmenter.cpp
diff --git a/src/libunicode/word_segmenter.h b/src/libunicode/word_segmenter.h
diff --git a/src/libunicode/word_segmenter_test.cpp b/src/libunicode/word_segmenter_test.cpp