]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/char.txt
ICU-551.51.tar.gz
[apple/icu.git] / icuSources / data / brkitr / char.txt
1 #
2 # Copyright (C) 2002-2015, International Business Machines Corporation and others.
3 # All Rights Reserved.
4 #
5 # file: char.txt
6 #
7 # ICU Character Break Rules, also known as Grapheme Cluster Boundaries
8 # See Unicode Standard Annex #29.
9 # These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
10 #
11
12 #
13 # Character Class Definitions.
14 #
# Base classes. Each variable below is the set of all characters that carry the
# named value of the Unicode Grapheme_Cluster_Break property; the rule sections
# further down refer to these sets by variable name.
15 $CR = [\p{Grapheme_Cluster_Break = CR}];
16 $LF = [\p{Grapheme_Cluster_Break = LF}];
17 $Control = [\p{Grapheme_Cluster_Break = Control}];
# $Prepend is intentionally not defined here: its definition survives only inside
# the TODO below, and the forward/reverse rules that would use it are likewise
# kept only as commented-out TODO lines.
18 # TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
19 $Extend = [\p{Grapheme_Cluster_Break = Extend}];
20 $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
21 $RI = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
22
23 # Special character classes for people & body part emoji:
24 # Subsets of $Extend:
# $ZWJ is the single code point U+200D written as a bare literal, while $EmojiVar
# is a one-member set holding U+FE0F. Of the two, only $EmojiVar is later placed
# inside a [...] set expression (in the safe rule sections).
25 $ZWJ = \u200D;
26 $EmojiVar = [\uFE0F];
27 # The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here
# $EmojiForSeqs: characters that the emoji rules keep attached to a preceding $ZWJ.
28 $EmojiForSeqs = [\u2764 \U0001F441 \U0001F466-\U0001F469 \U0001F48B \U0001F5E8];
# $EmojiForMods: base characters that the emoji rules keep attached to a following
# $EmojiMods, optionally with one $EmojiVar in between. Note that the range
# \U0001F466-\U0001F469 is listed in both $EmojiForSeqs and $EmojiForMods.
29 $EmojiForMods = [\u261D \u26F9 \u270A-\u270D \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3CA-\U0001F3CB \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F575 \U0001F590 \U0001F595 \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0 \U0001F918];
# $EmojiMods: the five code points U+1F3FB..U+1F3FF that attach to an $EmojiForMods base.
30 $EmojiMods = [\U0001F3FB-\U0001F3FF];
31
32 #
33 # Korean Syllable Definitions
34 #
# Classes for Korean syllable handling, again taken directly from
# Grapheme_Cluster_Break property values. They are referenced only by the three
# syllable rules in each of the forward and reverse sections; the safe rule
# sections do not use them.
35 $L = [\p{Grapheme_Cluster_Break = L}];
36 $V = [\p{Grapheme_Cluster_Break = V}];
37 $T = [\p{Grapheme_Cluster_Break = T}];
38
39 $LV = [\p{Grapheme_Cluster_Break = LV}];
40 $LVT = [\p{Grapheme_Cluster_Break = LVT}];
41
42
43 ## -------------------------------------------------
# Rule-compiler options, followed by the forward rule section.
# NOTE(review): !!chain is understood to let a match continue from one rule into
# another when they overlap on a character, and !!RINoChain to exempt the $RI
# rules from that chaining -- confirm both against the ICU rule-compiler docs.
44 !!chain;
45 !!RINoChain;
46
47 !!forward;
48
# Every rule in this section lists a sequence that must not be split by a
# boundary (the same "don't break" sense used by the emoji comment further down).
# Keep CR followed by LF together.
49 $CR $LF;
50
# Korean syllable sequences: $L before L/V/LV/LVT; $LV or $V before V/T;
# $LVT or $T before T.
51 $L ($L | $V | $LV | $LVT);
52 ($LV | $V) ($V | $T);
53 ($LVT | $T) $T;
54
# Pairs of $RI with any trailing $Extend. The first form contains '/'.
# NOTE(review): presumably the look-ahead operator, i.e. the match ends at the
# slash but only applies when another $RI follows -- confirm.
55 $RI $RI $Extend* / $RI;
56 $RI $RI $Extend*;
57
# $Extend and $SpacingMark stay attached to any preceding character that is not
# in $Control, $CR or $LF.
58 [^$Control $CR $LF] $Extend;
59
60 [^$Control $CR $LF] $SpacingMark;
61 # TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
62
63 # Special forward rules for people & body part emoji:
64 # don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
65 $ZWJ $EmojiForSeqs;
# At most one $EmojiVar may sit between the base emoji and its modifier.
66 $EmojiForMods $EmojiVar? $EmojiMods;
67
68
69 ## -------------------------------------------------
70
# Reverse rule section. Each forward rule reappears here with its operands in the
# opposite order; the one exception is the $RI look-ahead rule, whose context
# after '/' is "$Extend* $RI $RI" rather than a single $RI.
71 !!reverse;
# LF then CR: the CR LF pair seen from the end.
72 $LF $CR;
# Korean syllable sequences, operands swapped relative to the forward section.
73 ($L | $V | $LV | $LVT) $L;
74 ($V | $T) ($LV | $V);
75 $T ($LVT | $T);
76
# Optional $Extend run followed by a pair of $RI, with and without look-ahead context.
77 $Extend* $RI $RI / $Extend* $RI $RI;
78 $Extend* $RI $RI;
79
# $Extend / $SpacingMark followed by any character outside $Control, $CR and $LF.
80 $Extend [^$Control $CR $LF];
81 $SpacingMark [^$Control $CR $LF];
82 # TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
83
84 # Special reverse rules for people & body part emoji:
85 # don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
# In this section the later character is written first: $EmojiForSeqs then $ZWJ,
# and $EmojiMods (with an optional $EmojiVar) then $EmojiForMods.
86 $EmojiForSeqs $ZWJ;
87 $EmojiMods $EmojiVar? $EmojiForMods;
88
89 ## -------------------------------------------------
90 # We don't logically need safe char break rules, but if we don't provide any at all
91 # the engine for preceding() and following() will fall back to the
92 # old style inefficient algorithm.
93
# Safe-reverse section. It lists three sequences only: LF then CR, a run of two or
# more $RI, and one or more $EmojiVar/$EmojiMods characters followed by an
# $EmojiForMods base. The operand order matches the reverse section, not the
# forward one.
94 !!safe_reverse;
95 $LF $CR;
96 $RI $RI+;
97 [$EmojiVar $EmojiMods]+ $EmojiForMods;
98
99 ## -------------------------------------------------
100
# Safe-forward section. It lists three sequences only: CR then LF, a run of two or
# more $RI, and an $EmojiForMods base followed by one or more $EmojiVar/$EmojiMods
# characters. The operand order matches the forward section.
101 !!safe_forward;
102 $CR $LF;
103 $RI $RI+;
104 $EmojiForMods [$EmojiVar $EmojiMods]+;
105