]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
08b89b0a | 2 | # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
b75a7d8f A |
3 | # All Rights Reserved. |
4 | # | |
5 | # file: char.txt | |
6 | # | |
7 | # ICU Character Break Rules, also known as Grapheme Cluster Boundaries | |
8 | # See Unicode Standard Annex #29. | |
51004dcb | 9 | # These rules are based on UAX #29 Revision 20 for Unicode Version 6.2 |
b75a7d8f A |
10 | # |
11 | ||
12 | # | |
13 | # Character Class Definitions. | |
b75a7d8f | 14 | # |
46f4442e A |
15 | $CR = [\p{Grapheme_Cluster_Break = CR}]; |
16 | $LF = [\p{Grapheme_Cluster_Break = LF}]; | |
17 | $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
4388f060 | 18 | # TODO: $Prepend is deliberately not defined here. Restore this definition (and the matching forward/reverse rules) if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; |
46f4442e A |
19 | $Extend = [\p{Grapheme_Cluster_Break = Extend}]; |
20 | $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; | |
b75a7d8f | 21 | |
08b89b0a A |
22 | $RI_A = \U0001F1E6; # Lead letter A; paired with trail letters E T U |
23 | $RI_B = \U0001F1E7; # Lead letter B; paired with trail letters E R | |
24 | $RI_C = \U0001F1E8; # Lead letter C; paired with trail letters A H L N O | |
51004dcb | 25 | $RI_D = \U0001F1E9; # Lead letter D; paired with trail letters E K |
08b89b0a | 26 | $RI_E = \U0001F1EA; # Lead letter E; paired with trail letter S |
51004dcb | 27 | $RI_F = \U0001F1EB; # Lead letter F; paired with trail letters I R |
08b89b0a A |
28 | $RI_G = \U0001F1EC; # Lead letter G; paired with trail letter B |
29 | $RI_H = \U0001F1ED; # Lead letter H; paired with trail letter K | |
30 | $RI_I = \U0001F1EE; # Lead letter I; paired with trail letters D E L N T | |
51004dcb A |
31 | $RI_J = \U0001F1EF; # Lead letter J; paired with trail letters O P |
32 | $RI_K = \U0001F1F0; # Lead letter K; paired with trail letter R | |
51004dcb | 33 | $RI_M = \U0001F1F2; # Lead letter M; paired with trail letters O X Y |
08b89b0a A |
34 | $RI_N = \U0001F1F3; # Lead letter N; paired with trail letters L O Z |
35 | $RI_P = \U0001F1F5; # Lead letter P; paired with trail letters H L R T | |
36 | $RI_R = \U0001F1F7; # Lead letter R; paired with trail letter U | |
37 | $RI_S = \U0001F1F8; # Lead letter S; paired with trail letters A E G | |
38 | $RI_T = \U0001F1F9; # Lead letter T; paired with trail letter R | |
39 | $RI_U = \U0001F1FA; # Lead letter U; paired with trail letter S | |
51004dcb | 40 | $RI_V = \U0001F1FB; # Lead letter V; paired with trail letter N |
08b89b0a A |
41 | $RI_Z = \U0001F1FF; # Lead letter Z; paired with trail letter A |
42 | # Letters not used as trail values above, so safe to add as additional leads: C F J M Q V W | |
43 | # Letters not used as lead values above, so safe to add as additional trails: L O Q W X Y | |
51004dcb | 44 | |
08b89b0a A |
45 | $RI_A_End = [\U0001F1EA \U0001F1F9 \U0001F1FA]; # Trail letters allowed after lead A: E T U |
46 | $RI_B_End = [\U0001F1EA \U0001F1F7]; # Trail letters allowed after lead B: E R | |
47 | $RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1F4]; # Trail letters allowed after lead C: A H L N O | |
51004dcb | 48 | $RI_D_End = [\U0001F1EA \U0001F1F0]; # Trail letters allowed after lead D: E K |
08b89b0a | 49 | $RI_E_End = \U0001F1F8; # Trail letter allowed after lead E: S |
51004dcb | 50 | $RI_F_End = [\U0001F1EE \U0001F1F7]; # Trail letters allowed after lead F: I R |
08b89b0a A |
51 | $RI_G_End = \U0001F1E7; # Trail letter allowed after lead G: B |
52 | $RI_H_End = \U0001F1F0; # Trail letter allowed after lead H: K | |
53 | $RI_I_End = [\U0001F1E9 \U0001F1EA \U0001F1F1 \U0001F1F3 \U0001F1F9]; # Trail letters allowed after lead I: D E L N T | |
54 | $RI_J_End = [\U0001F1F5 \U0001F1F4]; # Trail letters allowed after lead J: O P (listed here as P, O; set order is irrelevant) | |
51004dcb | 55 | $RI_K_End = \U0001F1F7; # Trail letter allowed after lead K: R |
51004dcb | 56 | $RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # Trail letters allowed after lead M: O X Y |
08b89b0a A |
57 | $RI_N_End = [\U0001F1F1 \U0001F1F4 \U0001F1FF]; # Trail letters allowed after lead N: L O Z |
58 | $RI_P_End = [\U0001F1ED \U0001F1F1 \U0001F1F7 \U0001F1F9]; # Trail letters allowed after lead P: H L R T | |
59 | $RI_R_End = \U0001F1FA; # Trail letter allowed after lead R: U | |
60 | $RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC]; # Trail letters allowed after lead S: A E G | |
61 | $RI_T_End = \U0001F1F7; # Trail letter allowed after lead T: R | |
62 | $RI_U_End = \U0001F1F8; # Trail letter allowed after lead U: S | |
51004dcb | 63 | $RI_V_End = \U0001F1F3; # Trail letter allowed after lead V: N |
08b89b0a A |
64 | $RI_Z_End = \U0001F1E6; # Trail letter allowed after lead Z: A |
65 | ||
66 | # Special character classes for people & body part emoji, used by the emoji-specific rules in each rule section: | |
67 | # $ZWJ and $EmojiVar are subsets of $Extend: | |
68 | $ZWJ = \u200D; | |
69 | $EmojiVar = [\uFE0F]; | |
70 | # The remaining sets are subsets of \p{Grapheme_Cluster_Break = Other}, a property value that is not otherwise used in these rules: | |
71 | $EmojiForSeqs = [\u2764 \U0001F466-\U0001F469 \U0001F48B]; | |
72 | $EmojiForMods = [\u261D \u270A-\u270C \U0001F385 \U0001F3C3-\U0001F3C4 \U0001F3C7 \U0001F3CA \U0001F442-\U0001F443 \U0001F446-\U0001F450 \U0001F466-\U0001F469 \U0001F46E-\U0001F478 \U0001F47C \U0001F481-\U0001F483 \U0001F485-\U0001F487 \U0001F4AA \U0001F596 \U0001F645-\U0001F647 \U0001F64B-\U0001F64F \U0001F6A3 \U0001F6B4-\U0001F6B6 \U0001F6C0]; | |
73 | $EmojiMods = [\U0001F3FB-\U0001F3FF]; | |
51004dcb | 74 | |
b75a7d8f A |
75 | # |
76 | # Korean Syllable Definitions: one set per syllable-related Grapheme_Cluster_Break value (L, V, T, LV, LVT) | |
77 | # | |
73c04bcf A |
78 | $L = [\p{Grapheme_Cluster_Break = L}]; |
79 | $V = [\p{Grapheme_Cluster_Break = V}]; | |
80 | $T = [\p{Grapheme_Cluster_Break = T}]; | |
b75a7d8f | 81 | |
73c04bcf A |
82 | $LV = [\p{Grapheme_Cluster_Break = LV}]; |
83 | $LVT = [\p{Grapheme_Cluster_Break = LVT}]; | |
b75a7d8f | 84 | |
b75a7d8f | 85 | |
374ca955 | 86 | ## ------------------------------------------------- Forward rules: each rule lists a sequence that is kept together (no break inside it) |
46f4442e | 87 | !!chain; |
374ca955 A |
88 | |
89 | !!forward; | |
90 | ||
b75a7d8f | 91 | $CR $LF; |
46f4442e A |
92 | |
93 | $L ($L | $V | $LV | $LVT); | |
94 | ($LV | $V) ($V | $T); | |
95 | ($LVT | $T) $T; | |
96 | ||
51004dcb A |
97 | $RI_A $RI_A_End; |
98 | $RI_B $RI_B_End; | |
99 | $RI_C $RI_C_End; | |
100 | $RI_D $RI_D_End; | |
101 | $RI_E $RI_E_End; | |
102 | $RI_F $RI_F_End; | |
103 | $RI_G $RI_G_End; | |
104 | $RI_H $RI_H_End; | |
105 | $RI_I $RI_I_End; | |
106 | $RI_J $RI_J_End; | |
107 | $RI_K $RI_K_End; | |
51004dcb A |
108 | $RI_M $RI_M_End; |
109 | $RI_N $RI_N_End; | |
110 | $RI_P $RI_P_End; | |
111 | $RI_R $RI_R_End; | |
112 | $RI_S $RI_S_End; | |
113 | $RI_T $RI_T_End; | |
114 | $RI_U $RI_U_End; | |
115 | $RI_V $RI_V_End; | |
08b89b0a | 116 | $RI_Z $RI_Z_End; |
51004dcb | 117 | |
46f4442e A |
118 | [^$Control $CR $LF] $Extend; |
119 | ||
120 | [^$Control $CR $LF] $SpacingMark; | |
4388f060 | 121 | # TODO: The forward Prepend rule is disabled while $Prepend is undefined. Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF]; |
46f4442e | 122 | |
08b89b0a A |
123 | # Special forward rules for people & body part emoji: |
124 | # do not break between $ZWJ and a following $EmojiForSeqs; do not break between an $EmojiForMods character (optionally followed by $EmojiVar) and a following $EmojiMods | |
125 | $ZWJ $EmojiForSeqs; | |
126 | $EmojiForMods $EmojiVar? $EmojiMods; | |
127 | ||
b75a7d8f | 128 | |
374ca955 | 129 | ## ------------------------------------------------- Reverse rules: the forward sequences written with their operands in the opposite order |
b75a7d8f | 130 | |
374ca955 | 131 | !!reverse; |
46f4442e A |
132 | $LF $CR; |
133 | ($L | $V | $LV | $LVT) $L; | |
134 | ($V | $T) ($LV | $V); | |
135 | $T ($LVT | $T); | |
136 | ||
51004dcb A |
137 | $RI_A_End $RI_A; |
138 | $RI_B_End $RI_B; | |
139 | $RI_C_End $RI_C; | |
140 | $RI_D_End $RI_D; | |
141 | $RI_E_End $RI_E; | |
142 | $RI_F_End $RI_F; | |
143 | $RI_G_End $RI_G; | |
144 | $RI_H_End $RI_H; | |
145 | $RI_I_End $RI_I; | |
146 | $RI_J_End $RI_J; | |
147 | $RI_K_End $RI_K; | |
51004dcb A |
148 | $RI_M_End $RI_M; |
149 | $RI_N_End $RI_N; | |
150 | $RI_P_End $RI_P; | |
151 | $RI_R_End $RI_R; | |
152 | $RI_S_End $RI_S; | |
153 | $RI_T_End $RI_T; | |
154 | $RI_U_End $RI_U; | |
155 | $RI_V_End $RI_V; | |
08b89b0a | 156 | $RI_Z_End $RI_Z; |
51004dcb | 157 | |
46f4442e A |
158 | $Extend [^$Control $CR $LF]; |
159 | $SpacingMark [^$Control $CR $LF]; | |
4388f060 | 160 | # TODO: The reverse Prepend rule is disabled while $Prepend is undefined. Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend; |
374ca955 | 161 | |
08b89b0a A |
162 | # Special reverse rules for people & body part emoji (operands reversed relative to the forward rules): |
163 | # do not break between $EmojiForSeqs and the $ZWJ that precedes it in the text; do not break between $EmojiMods and the $EmojiForMods character (optionally followed by $EmojiVar) that precedes it in the text | |
164 | $EmojiForSeqs $ZWJ; | |
165 | $EmojiMods $EmojiVar? $EmojiForMods; | |
374ca955 A |
166 | |
167 | ## ------------------------------------------------- Safe reverse rules | |
51004dcb A |
168 | # Safe character break rules are not logically required, but if none are provided at all, |
169 | # the engine for preceding() and following() will fall back to the | |
170 | # old-style, inefficient algorithm. | |
374ca955 A |
171 | |
172 | !!safe_reverse; | |
51004dcb | 173 | $LF $CR; |
08b89b0a | 174 | [$EmojiVar $EmojiMods]+ $EmojiForMods; |
374ca955 A |
175 | |
176 | ## ------------------------------------------------- Safe forward rules: the safe reverse rules with their operands in the opposite order | |
177 | ||
178 | !!safe_forward; | |
51004dcb | 179 | $CR $LF; |
08b89b0a | 180 | $EmojiForMods [$EmojiVar $EmojiMods]+; |
374ca955 | 181 |