]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/char.txt
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / data / brkitr / char.txt
CommitLineData
b75a7d8f 1#
08b89b0a 2# Copyright (C) 2002-2015, International Business Machines Corporation and others.
b75a7d8f
A
3# All Rights Reserved.
4#
5# file: char.txt
6#
7# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
8# See Unicode Standard Annex #29.
51004dcb 9# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
b75a7d8f
A
10#
11
12#
13# Character Class Definitions.
b75a7d8f 14#
46f4442e
A
# Base character classes. Each set is taken directly from the Unicode
# Grapheme_Cluster_Break property value of the same name.
15$CR = [\p{Grapheme_Cluster_Break = CR}];
16$LF = [\p{Grapheme_Cluster_Break = LF}];
17$Control = [\p{Grapheme_Cluster_Break = Control}];
4388f060 18# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
46f4442e
A
# $Extend and $SpacingMark are the classes that the rules further down keep
# attached to the preceding character.
19$Extend = [\p{Grapheme_Cluster_Break = Extend}];
20$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
b75a7d8f 21
08b89b0a
A
# Lead symbols of the recognized two-symbol pairs. U+1F1E6..U+1F1FF are the
# Unicode regional indicator symbol letters A..Z, so $RI_x is the symbol for
# letter x. The "Trail" comment on each line lists, as letters, the second
# symbols accepted after that lead; it matches the corresponding $RI_x_End set.
# Letters with no definition here (L O Q W X Y) never start a pair.
22$RI_A = \U0001F1E6; # Trail ETU
23$RI_B = \U0001F1E7; # Trail ER
24$RI_C = \U0001F1E8; # Trail AHLNO
51004dcb 25$RI_D = \U0001F1E9; # Trail EK
08b89b0a 26$RI_E = \U0001F1EA; # Trail S
51004dcb 27$RI_F = \U0001F1EB; # Trail IR
08b89b0a
A
28$RI_G = \U0001F1EC; # Trail B
29$RI_H = \U0001F1ED; # Trail K
30$RI_I = \U0001F1EE; # Trail DELNT
51004dcb
A
31$RI_J = \U0001F1EF; # Trail OP
32$RI_K = \U0001F1F0; # Trail R
51004dcb 33$RI_M = \U0001F1F2; # Trail OXY
08b89b0a
A
34$RI_N = \U0001F1F3; # Trail LOZ
35$RI_P = \U0001F1F5; # Trail HLRT
36$RI_R = \U0001F1F7; # Trail U
37$RI_S = \U0001F1F8; # Trail AEG
38$RI_T = \U0001F1F9; # Trail R
39$RI_U = \U0001F1FA; # Trail S
51004dcb 40$RI_V = \U0001F1FB; # Trail N
08b89b0a
A
41$RI_Z = \U0001F1FF; # Trail A
# The two lines below record which letters are still free: a letter never used
# as a trail symbol can be added as a lead (and vice versa) without making an
# existing pair ambiguous. NOTE(review): that rationale is inferred from the
# word "safe"; confirm before relying on it.
42# unused trail values, safe as addl lead: C F J M Q V W
43# unused lead values, safe as addl trail: L O Q W X Y
51004dcb 44
08b89b0a
A
# Trail sets: $RI_x_End holds every symbol allowed as the second half of a
# pair whose first half is $RI_x. Single-member sets are written as a bare
# code point rather than a bracketed set. The trailing comment spells the
# members out as letters (A = U+1F1E6 ... Z = U+1F1FF).
45$RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # ETU
46$RI_B_End = [\U0001F1EA \U0001F1F7]; # ER
47$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # AHLNO
51004dcb 48$RI_D_End = [\U0001F1EA \U0001F1F0]; # EK
08b89b0a 49$RI_E_End = \U0001F1F8; # S
51004dcb 50$RI_F_End = [\U0001F1EE \U0001F1F7]; # IR
08b89b0a
A
51$RI_G_End = \U0001F1E7; # B
52$RI_H_End = \U0001F1F0; # K
53$RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DELNT
# Members below are listed as P then O; set order is irrelevant, the comment
# gives them alphabetically.
54$RI_J_End = [\U0001F1F5 \U0001F1F4]; # OP
51004dcb 55$RI_K_End = \U0001F1F7; # R
51004dcb 56$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # OXY
08b89b0a
A
57$RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # LOZ
58$RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # HLRT
59$RI_R_End = \U0001F1FA; # U
60$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # AEG
61$RI_T_End = \U0001F1F7; # R
62$RI_U_End = \U0001F1F8; # S
51004dcb 63$RI_V_End = \U0001F1F3; # N
08b89b0a
A
64$RI_Z_End = \U0001F1E6; # A
65
66# Special character classes for people & body part emoji:
67# Subsets of $Extend:
# $ZWJ is U+200D (ZERO WIDTH JOINER); $EmojiVar is U+FE0F (VARIATION SELECTOR-16).
68$ZWJ = \u200D;
69$EmojiVar = [\uFE0F];
70# The following are subsets of \p{Grapheme_Cluster_Break = Other} which is not otherwise used here
# $EmojiForSeqs: characters the rules keep attached to a preceding $ZWJ.
71$EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B];
# $EmojiForMods: base characters the rules keep attached to a following
# $EmojiMods (optionally with $EmojiVar in between).
72$EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0];
# $EmojiMods: the five modifier characters U+1F3FB..U+1F3FF.
73$EmojiMods = [\U0001F3FB-\U0001F3FF];
51004dcb 74
b75a7d8f
A
75#
76# Korean Syllable Definitions
77#
73c04bcf
A
# Hangul classes, taken from the Grapheme_Cluster_Break property. Per UAX #29,
# L, V and T are the leading-consonant, vowel and trailing-consonant jamo.
78$L = [\p{Grapheme_Cluster_Break = L}];
79$V = [\p{Grapheme_Cluster_Break = V}];
80$T = [\p{Grapheme_Cluster_Break = T}];
b75a7d8f 81
73c04bcf
A
# Per UAX #29, LV and LVT are the precomposed syllables without and with a
# trailing consonant.
82$LV = [\p{Grapheme_Cluster_Break = LV}];
83$LVT = [\p{Grapheme_Cluster_Break = LVT}];
b75a7d8f 84
b75a7d8f 85
374ca955 86## -------------------------------------------------
# !!chain turns on rule chaining (see the ICU break-rule documentation): a
# match may continue from the last character of the previous match, so the
# two-character rules below combine into longer clusters.
46f4442e 87!!chain;
374ca955
A
88

# Forward rules: each rule names a pair (or short sequence) of characters
# that must NOT be separated by a boundary when scanning forward.
89!!forward;
90
# CR followed by LF is one cluster.
b75a7d8f 91$CR $LF;
46f4442e
A
92
# Hangul syllable sequences: L may be followed by L, V, LV or LVT;
# LV or V by V or T; LVT or T by T.
93$L ($L | $V | $LV | $LVT);
94($LV | $V) ($V | $T);
95($LVT | $T) $T;
96

51004dcb
A
# Pairs: one rule per lead symbol, joining it to any member of its trail set.
# A lead followed by a symbol outside its trail set matches no rule here.
97$RI_A $RI_A_End;
98$RI_B $RI_B_End;
99$RI_C $RI_C_End;
100$RI_D $RI_D_End;
101$RI_E $RI_E_End;
102$RI_F $RI_F_End;
103$RI_G $RI_G_End;
104$RI_H $RI_H_End;
105$RI_I $RI_I_End;
106$RI_J $RI_J_End;
107$RI_K $RI_K_End;
51004dcb
A
108$RI_M $RI_M_End;
109$RI_N $RI_N_End;
110$RI_P $RI_P_End;
111$RI_R $RI_R_End;
112$RI_S $RI_S_End;
113$RI_T $RI_T_End;
114$RI_U $RI_U_End;
115$RI_V $RI_V_End;
08b89b0a 116$RI_Z $RI_Z_End;
51004dcb 117

46f4442e
A
# Any character other than Control, CR or LF stays joined to a following
# Extend or SpacingMark character.
118[^$Control $CR $LF] $Extend;
119
120[^$Control $CR $LF] $SpacingMark;
4388f060 121# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
46f4442e 122

08b89b0a
A
123# Special forward rules for people & body part emoji:
124# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
125$ZWJ $EmojiForSeqs;
# The optional $EmojiVar allows a variation selector between the base
# character and the modifier.
126$EmojiForMods $EmojiVar? $EmojiMods;
127
b75a7d8f 128
374ca955 129## -------------------------------------------------
b75a7d8f 130
# Reverse rules: each rule below is the corresponding forward rule with its
# operands written in the opposite order, for scanning backward through text.
374ca955 131!!reverse;
46f4442e
A
# LF preceded by CR; then the three Hangul rules, mirrored.
132$LF $CR;
133($L | $V | $LV | $LVT) $L;
134($V | $T) ($LV | $V);
135$T ($LVT | $T);
136

51004dcb
A
# Pairs, mirrored: a trail-set member followed (in reverse order) by its lead.
137$RI_A_End $RI_A;
138$RI_B_End $RI_B;
139$RI_C_End $RI_C;
140$RI_D_End $RI_D;
141$RI_E_End $RI_E;
142$RI_F_End $RI_F;
143$RI_G_End $RI_G;
144$RI_H_End $RI_H;
145$RI_I_End $RI_I;
146$RI_J_End $RI_J;
147$RI_K_End $RI_K;
51004dcb
A
148$RI_M_End $RI_M;
149$RI_N_End $RI_N;
150$RI_P_End $RI_P;
151$RI_R_End $RI_R;
152$RI_S_End $RI_S;
153$RI_T_End $RI_T;
154$RI_U_End $RI_U;
155$RI_V_End $RI_V;
08b89b0a 156$RI_Z_End $RI_Z;
51004dcb 157

46f4442e
A
# Extend / SpacingMark stay joined to the character before them in the text,
# unless that character is Control, CR or LF.
158$Extend [^$Control $CR $LF];
159$SpacingMark [^$Control $CR $LF];
4388f060 160# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
374ca955 161

08b89b0a
A
162# Special reverse rules for people & body part emoji:
163# don't break $ZWJ from subsequent $EmojiForSeqs; don't break between relevant emoji and $EmojiMods
164$EmojiForSeqs $ZWJ;
165$EmojiMods $EmojiVar? $EmojiForMods;
374ca955
A
166
167## -------------------------------------------------
51004dcb
A
168# We don't logically need safe char break rules, but if we don't provide any at all,
169# the engine for preceding() and following() will fall back to the
170# old-style, inefficient algorithm.
374ca955
A
171
# Safe-reverse rules. Both are written in reverse operand order: LF then CR,
# and a run of one or more $EmojiVar / $EmojiMods characters then the
# $EmojiForMods base character.
172!!safe_reverse;
51004dcb 173$LF $CR;
08b89b0a 174[$EmojiVar $EmojiMods]+ $EmojiForMods;
374ca955
A
175
176## -------------------------------------------------
177
# Safe-forward rules. CR then LF, and an $EmojiForMods base character
# followed by a run of one or more $EmojiVar / $EmojiMods characters.
178!!safe_forward;
51004dcb 179$CR $LF;
08b89b0a 180$EmojiForMods [$EmojiVar $EmojiMods]+;
374ca955 181