]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/char.txt
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / data / brkitr / char.txt
1 #
2 # Copyright (C) 2002-2015, International Business Machines Corporation and others.
3 # All Rights Reserved.
4 #
5 # file: char.txt
6 #
7 # ICU Character Break Rules, also known as Grapheme Cluster Boundaries
8 # See Unicode Standard Annex #29.
9 # These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
10 #
11
12 #
13 # Character Class Definitions.
14 # Each variable is the set of characters having one Grapheme_Cluster_Break property value.
15 $CR = [\p{Grapheme_Cluster_Break = CR}]; # joined to a following $LF by the "$CR $LF" rule
16 $LF = [\p{Grapheme_Cluster_Break = LF}]; # joined to a preceding $CR by the "$CR $LF" rule
17 $Control = [\p{Grapheme_Cluster_Break = Control}]; # excluded, like $CR and $LF, as a base for $Extend / $SpacingMark
18 # TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
19 $Extend = [\p{Grapheme_Cluster_Break = Extend}]; # stays attached to the preceding non-control character
20 $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; # stays attached to the preceding non-control character
21
22 $RI_A = \U0001F1E6; # lead letter A; trail letters accepted (see $RI_A_End): E T U
23 $RI_B = \U0001F1E7; # lead letter B; trail letters accepted (see $RI_B_End): E R
24 $RI_C = \U0001F1E8; # lead letter C; trail letters accepted (see $RI_C_End): A H L N O
25 $RI_D = \U0001F1E9; # lead letter D; trail letters accepted (see $RI_D_End): E K
26 $RI_E = \U0001F1EA; # lead letter E; trail letter accepted (see $RI_E_End): S
27 $RI_F = \U0001F1EB; # lead letter F; trail letters accepted (see $RI_F_End): I R
28 $RI_G = \U0001F1EC; # lead letter G; trail letter accepted (see $RI_G_End): B
29 $RI_H = \U0001F1ED; # lead letter H; trail letter accepted (see $RI_H_End): K
30 $RI_I = \U0001F1EE; # lead letter I; trail letters accepted (see $RI_I_End): D E L N T
31 $RI_J = \U0001F1EF; # lead letter J; trail letters accepted (see $RI_J_End): O P
32 $RI_K = \U0001F1F0; # lead letter K; trail letter accepted (see $RI_K_End): R
33 $RI_M = \U0001F1F2; # lead letter M; trail letters accepted (see $RI_M_End): O X Y
34 $RI_N = \U0001F1F3; # lead letter N; trail letters accepted (see $RI_N_End): L O Z
35 $RI_P = \U0001F1F5; # lead letter P; trail letters accepted (see $RI_P_End): H L R T
36 $RI_R = \U0001F1F7; # lead letter R; trail letter accepted (see $RI_R_End): U
37 $RI_S = \U0001F1F8; # lead letter S; trail letters accepted (see $RI_S_End): A E G
38 $RI_T = \U0001F1F9; # lead letter T; trail letter accepted (see $RI_T_End): R
39 $RI_U = \U0001F1FA; # lead letter U; trail letter accepted (see $RI_U_End): S
40 $RI_V = \U0001F1FB; # lead letter V; trail letter accepted (see $RI_V_End): N
41 $RI_Z = \U0001F1FF; # lead letter Z; trail letter accepted (see $RI_Z_End): A
42 # Letters that never occur as a trail value, so they are safe to use as additional lead values: C F J M Q V W
43 # Letters that never occur as a lead value, so they are safe to use as additional trail values: L O Q W X Y
44
45 $RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # trail letters allowed after lead A: E T U
46 $RI_B_End = [\U0001F1EA \U0001F1F7]; # trail letters allowed after lead B: E R
47 $RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # trail letters allowed after lead C: A H L N O
48 $RI_D_End = [\U0001F1EA \U0001F1F0]; # trail letters allowed after lead D: E K
49 $RI_E_End = \U0001F1F8; # trail letter allowed after lead E: S
50 $RI_F_End = [\U0001F1EE \U0001F1F7]; # trail letters allowed after lead F: I R
51 $RI_G_End = \U0001F1E7; # trail letter allowed after lead G: B
52 $RI_H_End = \U0001F1F0; # trail letter allowed after lead H: K
53 $RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # trail letters allowed after lead I: D E L N T
54 $RI_J_End = [\U0001F1F5 \U0001F1F4]; # trail letters allowed after lead J: O P (written here in the order P, O)
55 $RI_K_End = \U0001F1F7; # trail letter allowed after lead K: R
56 $RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # trail letters allowed after lead M: O X Y
57 $RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # trail letters allowed after lead N: L O Z
58 $RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # trail letters allowed after lead P: H L R T
59 $RI_R_End = \U0001F1FA; # trail letter allowed after lead R: U
60 $RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # trail letters allowed after lead S: A E G
61 $RI_T_End = \U0001F1F7; # trail letter allowed after lead T: R
62 $RI_U_End = \U0001F1F8; # trail letter allowed after lead U: S
63 $RI_V_End = \U0001F1F3; # trail letter allowed after lead V: N
64 $RI_Z_End = \U0001F1E6; # trail letter allowed after lead Z: A
65
66 # Special character classes for people & body part emoji:
67 # Subsets of $Extend:
68 $ZWJ = \u200D; # joiner that is kept together with a following $EmojiForSeqs
69 $EmojiVar = [\uFE0F]; # may optionally sit between $EmojiForMods and $EmojiMods
70 # The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here
71 $EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B]; # emoji that stay attached to a preceding $ZWJ
72 $EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0]; # emoji that stay attached to a following $EmojiMods
73 $EmojiMods = [\U0001F3FB-\U0001F3FF]; # modifiers that attach to a preceding $EmojiForMods
74
75 #
76 # Korean Syllable Definitions
77 # One set per Grapheme_Cluster_Break value used by the syllable rules; trailing notes give what may follow without a break.
78 $L = [\p{Grapheme_Cluster_Break = L}]; # may be followed by $L, $V, $LV or $LVT
79 $V = [\p{Grapheme_Cluster_Break = V}]; # may be followed by $V or $T
80 $T = [\p{Grapheme_Cluster_Break = T}]; # may be followed by $T
81
82 $LV = [\p{Grapheme_Cluster_Break = LV}]; # may be followed by $V or $T
83 $LVT = [\p{Grapheme_Cluster_Break = LVT}]; # may be followed by $T
84
85
86 ## -------------------------------------------------
87 !!chain; # ICU rule option that enables rule chaining (see the ICU break-rules documentation)
88
89 !!forward; # start of the forward rules; each rule lists characters that must NOT be split
90
91 $CR $LF; # keep a $CR together with the $LF that follows it
92
93 $L ($L | $V | $LV | $LVT); # Korean syllable: no break after $L before $L, $V, $LV or $LVT
94 ($LV | $V) ($V | $T); # Korean syllable: no break after $LV or $V before $V or $T
95 ($LVT | $T) $T; # Korean syllable: no break after $LVT or $T before $T
96
97 $RI_A $RI_A_End; # each rule here keeps one lead letter together with one of its permitted trail letters
98 $RI_B $RI_B_End;
99 $RI_C $RI_C_End;
100 $RI_D $RI_D_End;
101 $RI_E $RI_E_End;
102 $RI_F $RI_F_End;
103 $RI_G $RI_G_End;
104 $RI_H $RI_H_End;
105 $RI_I $RI_I_End;
106 $RI_J $RI_J_End;
107 $RI_K $RI_K_End;
108 $RI_M $RI_M_End;
109 $RI_N $RI_N_End;
110 $RI_P $RI_P_End;
111 $RI_R $RI_R_End;
112 $RI_S $RI_S_End;
113 $RI_T $RI_T_End;
114 $RI_U $RI_U_End;
115 $RI_V $RI_V_End;
116 $RI_Z $RI_Z_End; # last lead/trail pair rule; leads without a definition above have no pair rule
117
118 [^$Control $CR $LF] $Extend; # no break before $Extend unless it follows $Control, $CR or $LF
119
120 [^$Control $CR $LF] $SpacingMark; # no break before $SpacingMark unless it follows $Control, $CR or $LF
121 # TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
122
123 # Special forward rules for people & body part emoji:
124 # don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
125 $ZWJ $EmojiForSeqs; # joiner followed by one of the sequence-forming emoji
126 $EmojiForMods $EmojiVar? $EmojiMods; # modifiable emoji, optional $EmojiVar, then the modifier
127
128
129 ## -------------------------------------------------
130
131 !!reverse; # start of the reverse rules; each one is a forward rule with its operands in the opposite order
132 $LF $CR; # reverse of "$CR $LF"
133 ($L | $V | $LV | $LVT) $L; # reverse of "$L ($L | $V | $LV | $LVT)"
134 ($V | $T) ($LV | $V); # reverse of "($LV | $V) ($V | $T)"
135 $T ($LVT | $T); # reverse of "($LVT | $T) $T"
136
137 $RI_A_End $RI_A; # reverse of the forward lead/trail pair rules: trail set first, then its lead letter
138 $RI_B_End $RI_B;
139 $RI_C_End $RI_C;
140 $RI_D_End $RI_D;
141 $RI_E_End $RI_E;
142 $RI_F_End $RI_F;
143 $RI_G_End $RI_G;
144 $RI_H_End $RI_H;
145 $RI_I_End $RI_I;
146 $RI_J_End $RI_J;
147 $RI_K_End $RI_K;
148 $RI_M_End $RI_M;
149 $RI_N_End $RI_N;
150 $RI_P_End $RI_P;
151 $RI_R_End $RI_R;
152 $RI_S_End $RI_S;
153 $RI_T_End $RI_T;
154 $RI_U_End $RI_U;
155 $RI_V_End $RI_V;
156 $RI_Z_End $RI_Z; # last reversed pair rule; the list matches the forward pair rules one for one
157
158 $Extend [^$Control $CR $LF]; # reverse of "[^$Control $CR $LF] $Extend"
159 $SpacingMark [^$Control $CR $LF]; # reverse of "[^$Control $CR $LF] $SpacingMark"
160 # TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
161
162 # Special reverse rules for people & body part emoji:
163 # moving backward, don't break $EmojiForSeqs from the $ZWJ before it; don't break $EmojiMods from the relevant emoji before it
164 $EmojiForSeqs $ZWJ; # reverse of "$ZWJ $EmojiForSeqs"
165 $EmojiMods $EmojiVar? $EmojiForMods; # reverse of "$EmojiForMods $EmojiVar? $EmojiMods"
166
167 ## -------------------------------------------------
168 # We don't logically need safe char break rules, but if we don't provide any at all
169 # the engine for preceding() and following() will fall back to the
170 # old-style, inefficient algorithm.
171
172 !!safe_reverse; # start of the safe-reverse rules
173 $LF $CR; # step back over an $LF to the $CR before it
174 [$EmojiVar $EmojiMods]+ $EmojiForMods; # step back over any run of $EmojiVar / $EmojiMods to the $EmojiForMods before it
175
176 ## -------------------------------------------------
177
178 !!safe_forward; # start of the safe-forward rules
179 $CR $LF; # step forward over a $CR to the $LF after it
180 $EmojiForMods [$EmojiVar $EmojiMods]+; # step forward from $EmojiForMods over any run of $EmojiVar / $EmojiMods
181