#
# file: char.txt
#
-# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-# See Unicode Standard Annex #29.
-# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
-# Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+# ICU Character Break Rules
+# These rules are based on the Extended Grapheme Cluster rules from
+# Unicode UAX #29 Revision 34 for Unicode Version 12.0
!!quoted_literals_only;
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
$Control = [[\p{Grapheme_Cluster_Break = Control}]];
-# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
-#$Virama = [[\p{Grapheme_Cluster_Break = Virama}]];
-#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
$Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
$ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
#
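+# The following three sets are taken from the CLDR segmentation data in
+# cldr/common/properties/segments/ and from issue CLDR-10994.
+# $Virama and $LinkingConsonant are limited to the six scripts listed;
+# $ExtCccZwj is Extend characters with a non-zero combining class, plus ZWJ.
+#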
+$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
+$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
+$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];
+
# Korean Syllable Definitions
#
$L = [\p{Grapheme_Cluster_Break = L}];
# Emoji definitions
-# Data for Extended Pictographic (now a regular property)
$Extended_Pict = [:ExtPict:];
# The following classes are no longer needed for ICU rules but may
# The first is still valid using the Unicode 11 properties:
# $EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\u00230-9©®™〰〽]];
# The other two are no longer valid because no characters have GCB=EB or GCB=EBG anymore:
-# $E_Base = [[\p{Grapheme_Cluster_Break = EB}] \U0001F46A-\U0001F46D\U0001F46F\U0001F91D\U0001F93C];
+# $E_Base = [\p{Grapheme_Cluster_Break = EB}];
# $E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
# They must be replaced with updated versions as follows
-# $E_Base = [[:EBase:] \U0001F46A-\U0001F46D\U0001F46F\U0001F91D\U0001F93C];
-# $E_Base_GAZ = [\U000026F9\U0001F466-\U0001F469]; # EBase that also occur after ZWJ in emoji-zwj-sequences
+# $E_Base = [:EBase:];
+# $E_Base_GAZ = [\U0001F466-\U0001F469\U0001F91D\U0001F9D1]; # EBase that also occur after ZWJ in emoji-zwj-sequences
## -------------------------------------------------
!!chain;
# GB 9
[^$Control $CR $LF] ($Extend | $ZWJ);
-# GB 9a (only for extended grapheme clusters)
+# GB 9a
[^$Control $CR $LF] $SpacingMark;
# GB 9b
$Prepend [^$Control $CR $LF];
-# The following replaces the Unicode 10 rules GB 10 and GB 11.
+# GB 9.3, from CLDR-10994
+$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;
+
# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;