@@ -80,10 +80,11 @@ namespace
8080 int8_t emoji_segmentation_category = ESC_Invalid;
8181 uint8_t age = 0 ;
8282 uint8_t indic_conjunct_break = 3 ; // Default: Indic_Conjunct_Break::None
83+ uint8_t word_break = 18 ; // Default: Word_Break::Other
8384 };
8485#pragma pack(pop)
8586
86- static_assert (sizeof (CodepointRecord) == 9 , " CodepointRecord must be exactly 9 bytes" );
87+ static_assert (sizeof (CodepointRecord) == 10 , " CodepointRecord must be exactly 10 bytes" );
8788
8889 inline bool operator ==(CodepointRecord const & a, CodepointRecord const & b) noexcept
8990 {
@@ -238,17 +239,18 @@ namespace
238239 }
239240
/// Reverse lookup: index -> name for a PVA-based enum.
///
/// Walks every (name, value) entry of @p index and picks the longest name that is mapped
/// to @p value, so that a full name is preferred over a shorter abbreviation that shares
/// the same value.
///
/// @param index       Name -> value table to search.
/// @param value       Value whose name is wanted.
/// @param defaultName Returned when no non-empty name maps to @p value.
///
/// @return The longest matching name. When several matching names share the maximum
///         length, the one that comes first in the map's key order is returned.
///         Empty keys are never returned; @p defaultName is used instead.
// NOTE(review): the default literal below is reproduced exactly as it appears in this
// view; its leading space looks like an extraction artifact -- confirm against the file.
std::string reverseLookup(std::map<std::string, uint8_t> const& index,
                          uint8_t value,
                          std::string const& defaultName = " Unknown")
{
    // Track the best candidate by address: map keys stay put for the duration of the
    // scan, so the only string copy happens once, on return.
    std::string const* longest = nullptr;
    for (auto const& [name, idx]: index)
    {
        if (idx != value || name.empty())
            continue;
        // Strictly longer only, so the first key in map order wins a length tie.
        if (longest == nullptr || name.size() > longest->size())
            longest = &name;
    }
    return longest != nullptr ? *longest : defaultName;
}
253255
254256 std::string_view escName (int8_t idx)
@@ -341,6 +343,7 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
341343 auto const ageIndex = buildAgeIndex (findPva (" Age" ));
342344 auto const gcbIndex = buildPvaBasedIndex (findPva (" Grapheme_Cluster_Break" ), " Undefined" );
343345 auto const incbIndex = buildPvaBasedIndex (findPva (" Indic_Conjunct_Break" ));
346+ auto const wbIndex = buildPvaBasedIndex (findPva (" Word_Break" ));
344347
345348 // Name vectors for output
346349 auto const scriptNames = buildScriptNames (parser.scripts ());
@@ -388,13 +391,15 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
388391 // General_Category::Unassigned is a member name in the GC enum
389392 auto gcUnassigned = gcIndex.count (" Unassigned" ) ? gcIndex.at (" Unassigned" ) : uint8_t (0 );
390393 auto incbNone = incbIndex.count (" None" ) ? incbIndex.at (" None" ) : uint8_t (3 );
394+ auto wbOther = wbIndex.count (" Other" ) ? wbIndex.at (" Other" ) : uint8_t (18 );
391395 for (auto & rec: records)
392396 {
393397 rec.script = scriptUnknown;
394398 rec.grapheme_cluster_break = gcbOther;
395399 rec.east_asian_width = eawNarrow;
396400 rec.general_category = gcUnassigned;
397401 rec.indic_conjunct_break = incbNone;
402+ rec.word_break = wbOther;
398403 }
399404 }
400405
@@ -481,6 +486,19 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
481486 records[static_cast <size_t >(cp)].indic_conjunct_break = it->second ;
482487 }
483488
489+ // Word Break
490+ for (auto const & [propName, ranges]: parser.wordBreakProps ())
491+ {
492+ for (auto const & r: ranges)
493+ {
494+ auto it = wbIndex.find (r.property );
495+ if (it == wbIndex.end ())
496+ continue ;
497+ for (auto cp = r.first ; cp <= r.last ; ++cp)
498+ records[static_cast <size_t >(cp)].word_break = it->second ;
499+ }
500+ }
501+
484502 // East Asian Width
485503 for (auto const & r: parser.eastAsianWidths ())
486504 {
@@ -615,7 +633,8 @@ void generateMultistageFiles(UcdParser const& parser, std::string const& outputD
615633 << " , "
616634 << " EmojiSegmentationCategory::" << escName (rec.emoji_segmentation_category ) << " , "
617635 << " Age::" << (rec.age < ageNames.size () ? ageNames[rec.age ] : " Unassigned" ) << " , "
618- << " Indic_Conjunct_Break::" << reverseLookup (incbIndex, rec.indic_conjunct_break , " None" ) << " },\n " ;
636+ << " Indic_Conjunct_Break::" << reverseLookup (incbIndex, rec.indic_conjunct_break , " None" ) << " , "
637+ << " Word_Break::" << reverseLookup (wbIndex, rec.word_break , " Other" ) << " },\n " ;
619638 }
620639 impl << " }};\n\n " ;
621640
0 commit comments