]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/data/brkitr/char.txt
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / data / brkitr / char.txt
... / ...
CommitLineData
1#
2# Copyright (C) 2002-2013, International Business Machines Corporation and others.
3# All Rights Reserved.
4#
5# file: char.txt
6#
7# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
8# See Unicode Standard Annex #29.
9# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
10#
11
12#
13# Character Class Definitions.
14#
15$CR = [\p{Grapheme_Cluster_Break = CR}]; # Characters whose Grapheme_Cluster_Break property is CR
16$LF = [\p{Grapheme_Cluster_Break = LF}]; # Characters whose Grapheme_Cluster_Break property is LF
17$Control = [\p{Grapheme_Cluster_Break = Control}]; # Excluded (with $CR and $LF) as a base in the Extend/SpacingMark rules
18# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
19$Extend = [\p{Grapheme_Cluster_Break = Extend}]; # Kept with the preceding non-control character by the rules below
20$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; # Kept with the preceding non-control character by the rules below
21
22$RI_A = \U0001F1E6; # Regional indicator A (lead); allowed trailing letters: E R T U
23$RI_B = \U0001F1E7; # Regional indicator B (lead); allowed trailing letters: E G R
24$RI_C = \U0001F1E8; # Regional indicator C (lead); allowed trailing letters: A H L N Z
25$RI_D = \U0001F1E9; # Regional indicator D (lead); allowed trailing letters: E K
26$RI_E = \U0001F1EA; # Regional indicator E (lead); allowed trailing letters: G S
27$RI_F = \U0001F1EB; # Regional indicator F (lead); allowed trailing letters: I R
28$RI_G = \U0001F1EC; # Regional indicator G (lead); allowed trailing letters: B R
29$RI_H = \U0001F1ED; # Regional indicator H (lead); allowed trailing letters: K U
30$RI_I = \U0001F1EE; # Regional indicator I (lead); allowed trailing letters: D L N T
31$RI_J = \U0001F1EF; # Regional indicator J (lead); allowed trailing letters: O P
32$RI_K = \U0001F1F0; # Regional indicator K (lead); allowed trailing letter: R
33$RI_L = \U0001F1F1; # Regional indicator L (lead); allowed trailing letter: B
34$RI_M = \U0001F1F2; # Regional indicator M (lead); allowed trailing letters: O X Y
35$RI_N = \U0001F1F3; # Regional indicator N (lead); allowed trailing letters: L O
36$RI_P = \U0001F1F5; # Regional indicator P (lead); allowed trailing letters: L T
37$RI_R = \U0001F1F7; # Regional indicator R (lead); allowed trailing letters: O U
38$RI_S = \U0001F1F8; # Regional indicator S (lead); allowed trailing letters: A E G K
39$RI_T = \U0001F1F9; # Regional indicator T (lead); allowed trailing letters: H R W
40$RI_U = \U0001F1FA; # Regional indicator U (lead); allowed trailing letters: A S
41$RI_V = \U0001F1FB; # Regional indicator V (lead); allowed trailing letter: N
42
43$RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # Regional indicators allowed after A: E R T U
44$RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # Regional indicators allowed after B: E G R
45$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # Regional indicators allowed after C: A H L N Z
46$RI_D_End = [\U0001F1EA \U0001F1F0]; # Regional indicators allowed after D: E K
47$RI_E_End = [\U0001F1EC \U0001F1F8]; # Regional indicators allowed after E: G S
48$RI_F_End = [\U0001F1EE \U0001F1F7]; # Regional indicators allowed after F: I R
49$RI_G_End = [\U0001F1E7 \U0001F1F7]; # Regional indicators allowed after G: B R
50$RI_H_End = [\U0001F1F0 \U0001F1FA]; # Regional indicators allowed after H: K U
51$RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # Regional indicators allowed after I: D L N T
52$RI_J_End = [\U0001F1F4 \U0001F1F5]; # Regional indicators allowed after J: O P
53$RI_K_End = \U0001F1F7; # Only regional indicator allowed after K: R
54$RI_L_End = \U0001F1E7; # Only regional indicator allowed after L: B
55$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # Regional indicators allowed after M: O X Y
56$RI_N_End = [\U0001F1F1 \U0001F1F4]; # Regional indicators allowed after N: L O
57$RI_P_End = [\U0001F1F1 \U0001F1F9]; # Regional indicators allowed after P: L T
58$RI_R_End = [\U0001F1F4 \U0001F1FA]; # Regional indicators allowed after R: O U
59$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # Regional indicators allowed after S: A E G K
60$RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # Regional indicators allowed after T: H R W
61$RI_U_End = [\U0001F1E6 \U0001F1F8]; # Regional indicators allowed after U: A S
62$RI_V_End = \U0001F1F3; # Only regional indicator allowed after V: N
63
64#
65# Korean Syllable Definitions
66#
67$L = [\p{Grapheme_Cluster_Break = L}]; # Hangul leading-consonant jamo
68$V = [\p{Grapheme_Cluster_Break = V}]; # Hangul vowel jamo
69$T = [\p{Grapheme_Cluster_Break = T}]; # Hangul trailing-consonant jamo
70
71$LV = [\p{Grapheme_Cluster_Break = LV}]; # Precomposed Hangul syllables of the form leading + vowel
72$LVT = [\p{Grapheme_Cluster_Break = LVT}]; # Precomposed Hangul syllables of the form leading + vowel + trailing
73
74
75## -------------------------------------------------
76!!chain; # Enable rule chaining: a match may continue from the end of one rule into another
77
78!!forward; # The rules that follow are used for forward iteration
79
80$CR $LF; # CR immediately followed by LF stays together
81
82$L ($L | $V | $LV | $LVT); # Hangul: a leading jamo stays with a following L, V, LV or LVT
83($LV | $V) ($V | $T); # Hangul: LV or V stays with a following V or T
84($LVT | $T) $T; # Hangul: LVT or T stays with a following T
85
86$RI_A $RI_A_End; # This and the next 19 rules keep a lead regional indicator with one of its listed trailing indicators
87$RI_B $RI_B_End;
88$RI_C $RI_C_End;
89$RI_D $RI_D_End;
90$RI_E $RI_E_End;
91$RI_F $RI_F_End;
92$RI_G $RI_G_End;
93$RI_H $RI_H_End;
94$RI_I $RI_I_End;
95$RI_J $RI_J_End;
96$RI_K $RI_K_End;
97$RI_L $RI_L_End;
98$RI_M $RI_M_End;
99$RI_N $RI_N_End;
100$RI_P $RI_P_End;
101$RI_R $RI_R_End;
102$RI_S $RI_S_End;
103$RI_T $RI_T_End;
104$RI_U $RI_U_End;
105$RI_V $RI_V_End;
106
107[^$Control $CR $LF] $Extend; # Any character other than Control, CR or LF stays with a following Extend
108
109[^$Control $CR $LF] $SpacingMark; # Any character other than Control, CR or LF stays with a following SpacingMark
110# TODO: Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
111
112
113## -------------------------------------------------
114
115!!reverse; # The rules that follow are used for reverse iteration; each is a forward rule with its operands swapped
116$LF $CR; # Reverse of: $CR $LF
117($L | $V | $LV | $LVT) $L; # Reverse of: $L ($L | $V | $LV | $LVT)
118($V | $T) ($LV | $V); # Reverse of: ($LV | $V) ($V | $T)
119$T ($LVT | $T); # Reverse of: ($LVT | $T) $T
120
121$RI_A_End $RI_A; # This and the next 19 rules are the reversed regional-indicator pairs (trailing indicator first, then lead)
122$RI_B_End $RI_B;
123$RI_C_End $RI_C;
124$RI_D_End $RI_D;
125$RI_E_End $RI_E;
126$RI_F_End $RI_F;
127$RI_G_End $RI_G;
128$RI_H_End $RI_H;
129$RI_I_End $RI_I;
130$RI_J_End $RI_J;
131$RI_K_End $RI_K;
132$RI_L_End $RI_L;
133$RI_M_End $RI_M;
134$RI_N_End $RI_N;
135$RI_P_End $RI_P;
136$RI_R_End $RI_R;
137$RI_S_End $RI_S;
138$RI_T_End $RI_T;
139$RI_U_End $RI_U;
140$RI_V_End $RI_V;
141
142$Extend [^$Control $CR $LF]; # Reverse of: [^$Control $CR $LF] $Extend
143$SpacingMark [^$Control $CR $LF]; # Reverse of: [^$Control $CR $LF] $SpacingMark
144# TODO: Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
145
146
147## -------------------------------------------------
148# Safe rules are not logically needed for character breaks, but if none are provided
149# at all, the engine falls back to the old-style, inefficient algorithm for
150# preceding() and following().
151
152!!safe_reverse; # Start of the safe-reverse rule section
153$LF $CR; # Only safe-reverse rule: LF followed by CR, the same pair as in the !!reverse section
154
155## -------------------------------------------------
156
157!!safe_forward; # Start of the safe-forward rule section
158$CR $LF; # Only safe-forward rule: CR followed by LF, the same pair as in the !!forward section
159