X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/729e4ab9bc6618bc3d8a898e575df7f4019e29ca..b801cf366c7671a99bdcef84d1e9c0ec64b36723:/icuSources/data/brkitr/char.txt diff --git a/icuSources/data/brkitr/char.txt b/icuSources/data/brkitr/char.txt index 5f2417c1..16c5928d 100644 --- a/icuSources/data/brkitr/char.txt +++ b/icuSources/data/brkitr/char.txt @@ -1,12 +1,12 @@ # -# Copyright (C) 2002-2010, International Business Machines Corporation and others. +# Copyright (C) 2002-2015, International Business Machines Corporation and others. # All Rights Reserved. # # file: char.txt # # ICU Character Break Rules, also known as Grapheme Cluster Boundaries # See Unicode Standard Annex #29. -# These rules are based on TR29 Revision 16, for Unicode Version 6.0 +# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2 # # @@ -15,9 +15,19 @@ $CR = [\p{Grapheme_Cluster_Break = CR}]; $LF = [\p{Grapheme_Cluster_Break = LF}]; $Control = [\p{Grapheme_Cluster_Break = Control}]; -$Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; +# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; $Extend = [\p{Grapheme_Cluster_Break = Extend}]; $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; +$RI = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; + +# Special character classes for people & body part emoji: +# Subsets of $Extend: +$ZWJ = \u200D; +$EmojiVar = [\uFE0F]; +# The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here +$EmojiForSeqs = [\u2764 \U0001F441 \U0001F466-\U0001F469 \U0001F48B \U0001F5E8]; +$EmojiForMods = [\u261D \u26F9 \u270A-\u270D \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA-\U0001F3CB \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F590 \U0001F595 \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0 \U0001F918]; +$EmojiMods = [\U0001F3FB-\U0001F3FF]; # # Korean Syllable Definitions @@ -32,6 +42,7 @@ $LVT = [\p{Grapheme_Cluster_Break = LVT}]; ## ------------------------------------------------- !!chain; +!!RINoChain; !!forward; @@ -41,10 +52,18 @@ $L ($L | $V | $LV | $LVT); ($LV | $V) ($V | $T); ($LVT | $T) $T; +$RI $RI $Extend* / $RI; +$RI $RI $Extend*; + [^$Control $CR $LF] $Extend; [^$Control $CR $LF] $SpacingMark; -$Prepend [^$Control $CR $LF]; +# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF]; + +# Special forward rules for people & body part emoji: +# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods +$ZWJ $EmojiForSeqs; +$EmojiForMods $EmojiVar? $EmojiMods; ## ------------------------------------------------- @@ -55,17 +74,32 @@ $LF $CR; ($V | $T) ($LV | $V); $T ($LVT | $T); +$Extend* $RI $RI / $Extend* $RI $RI; +$Extend* $RI $RI; + $Extend [^$Control $CR $LF]; $SpacingMark [^$Control $CR $LF]; -[^$Control $CR $LF] $Prepend; +# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend; +# Special reverse rules for people & body part emoji: +# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods +$EmojiForSeqs $ZWJ; +$EmojiMods $EmojiVar? $EmojiForMods; ## ------------------------------------------------- +# We don't logically need safe char break rules, but if we don't provide any at all +# the engine for preceding() and following() will fall back to the +# old style inefficient algorithm. !!safe_reverse; - +$LF $CR; +$RI $RI+; +[$EmojiVar $EmojiMods]+ $EmojiForMods; ## ------------------------------------------------- !!safe_forward; +$CR $LF; +$RI $RI+; +$EmojiForMods [$EmojiVar $EmojiMods]+;