]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
51004dcb | 2 | # Copyright (C) 2002-2013, International Business Machines Corporation and others. |
b75a7d8f A |
3 | # All Rights Reserved. |
4 | # | |
5 | # file: char.txt | |
6 | # | |
7 | # ICU Character Break Rules, also known as Grapheme Cluster Boundaries | |
8 | # See Unicode Standard Annex #29. | |
51004dcb | 9 | # These rules are based on UAX #29 Revision 20 for Unicode Version 6.2 |
b75a7d8f A |
10 | # |
11 | ||
12 | # | |
13 | # Character Class Definitions: each variable is the set of code points with the named Grapheme_Cluster_Break property value. | |
b75a7d8f | 14 | # |
46f4442e A |
15 | $CR = [\p{Grapheme_Cluster_Break = CR}]; |
16 | $LF = [\p{Grapheme_Cluster_Break = LF}]; | |
17 | $Control = [\p{Grapheme_Cluster_Break = Control}]; | |
4388f060 | 18 | # TODO: $Prepend is not defined because the Prepend set is currently empty. Restore if it becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; |
46f4442e A |
19 | $Extend = [\p{Grapheme_Cluster_Break = Extend}]; |
20 | $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; | |
b75a7d8f | 21 | |
51004dcb A |
22 | $RI_A = \U0001F1E6; # Lead letter A; valid trailing letters: E R T U |
23 | $RI_B = \U0001F1E7; # Lead letter B; valid trailing letters: E G R | |
24 | $RI_C = \U0001F1E8; # Lead letter C; valid trailing letters: A H L N Z | |
25 | $RI_D = \U0001F1E9; # Lead letter D; valid trailing letters: E K | |
26 | $RI_E = \U0001F1EA; # Lead letter E; valid trailing letters: G S | |
27 | $RI_F = \U0001F1EB; # Lead letter F; valid trailing letters: I R | |
28 | $RI_G = \U0001F1EC; # Lead letter G; valid trailing letters: B R | |
29 | $RI_H = \U0001F1ED; # Lead letter H; valid trailing letters: K U | |
30 | $RI_I = \U0001F1EE; # Lead letter I; valid trailing letters: D L N T | |
31 | $RI_J = \U0001F1EF; # Lead letter J; valid trailing letters: O P | |
32 | $RI_K = \U0001F1F0; # Lead letter K; valid trailing letter: R | |
33 | $RI_L = \U0001F1F1; # Lead letter L; valid trailing letter: B | |
34 | $RI_M = \U0001F1F2; # Lead letter M; valid trailing letters: O X Y | |
35 | $RI_N = \U0001F1F3; # Lead letter N; valid trailing letters: L O | |
36 | $RI_P = \U0001F1F5; # Lead letter P; valid trailing letters: L T | |
37 | $RI_R = \U0001F1F7; # Lead letter R; valid trailing letters: O U | |
38 | $RI_S = \U0001F1F8; # Lead letter S; valid trailing letters: A E G K | |
39 | $RI_T = \U0001F1F9; # Lead letter T; valid trailing letters: H R W | |
40 | $RI_U = \U0001F1FA; # Lead letter U; valid trailing letters: A S | |
41 | $RI_V = \U0001F1FB; # Lead letter V; valid trailing letter: N | |
42 | ||
43 | $RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # Trails allowed after A: E R T U | |
44 | $RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # Trails allowed after B: E G R | |
45 | $RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # Trails allowed after C: A H L N Z | |
46 | $RI_D_End = [\U0001F1EA \U0001F1F0]; # Trails allowed after D: E K | |
47 | $RI_E_End = [\U0001F1EC \U0001F1F8]; # Trails allowed after E: G S | |
48 | $RI_F_End = [\U0001F1EE \U0001F1F7]; # Trails allowed after F: I R | |
49 | $RI_G_End = [\U0001F1E7 \U0001F1F7]; # Trails allowed after G: B R | |
50 | $RI_H_End = [\U0001F1F0 \U0001F1FA]; # Trails allowed after H: K U | |
51 | $RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # Trails allowed after I: D L N T | |
52 | $RI_J_End = [\U0001F1F4 \U0001F1F5]; # Trails allowed after J: O P | |
53 | $RI_K_End = \U0001F1F7; # Trail allowed after K: R | |
54 | $RI_L_End = \U0001F1E7; # Trail allowed after L: B | |
55 | $RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # Trails allowed after M: O X Y | |
56 | $RI_N_End = [\U0001F1F1 \U0001F1F4]; # Trails allowed after N: L O | |
57 | $RI_P_End = [\U0001F1F1 \U0001F1F9]; # Trails allowed after P: L T | |
58 | $RI_R_End = [\U0001F1F4 \U0001F1FA]; # Trails allowed after R: O U | |
59 | $RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # Trails allowed after S: A E G K | |
60 | $RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # Trails allowed after T: H R W | |
61 | $RI_U_End = [\U0001F1E6 \U0001F1F8]; # Trails allowed after U: A S | |
62 | $RI_V_End = \U0001F1F3; # Trail allowed after V: N | |
63 | ||
b75a7d8f A |
64 | # |
65 | # Korean Syllable Definitions: sets for the L, V, T, LV and LVT values of Grapheme_Cluster_Break, combined by the syllable rules in the forward and reverse sections. | |
66 | # | |
73c04bcf A |
67 | $L = [\p{Grapheme_Cluster_Break = L}]; |
68 | $V = [\p{Grapheme_Cluster_Break = V}]; | |
69 | $T = [\p{Grapheme_Cluster_Break = T}]; | |
b75a7d8f | 70 | |
73c04bcf A |
71 | $LV = [\p{Grapheme_Cluster_Break = LV}]; |
72 | $LVT = [\p{Grapheme_Cluster_Break = LVT}]; | |
b75a7d8f | 73 | |
b75a7d8f | 74 | |
374ca955 | 75 | ## ---- Forward rules: CR LF; Korean syllable sequences; each $RI_x lead followed by its $RI_x_End trail; any character other than Control/CR/LF followed by Extend or SpacingMark ---- |
46f4442e | 76 | !!chain; |
374ca955 A |
77 | |
78 | !!forward; | |
79 | ||
b75a7d8f | 80 | $CR $LF; |
46f4442e A |
81 | |
82 | $L ($L | $V | $LV | $LVT); | |
83 | ($LV | $V) ($V | $T); | |
84 | ($LVT | $T) $T; | |
85 | ||
51004dcb A |
86 | $RI_A $RI_A_End; |
87 | $RI_B $RI_B_End; | |
88 | $RI_C $RI_C_End; | |
89 | $RI_D $RI_D_End; | |
90 | $RI_E $RI_E_End; | |
91 | $RI_F $RI_F_End; | |
92 | $RI_G $RI_G_End; | |
93 | $RI_H $RI_H_End; | |
94 | $RI_I $RI_I_End; | |
95 | $RI_J $RI_J_End; | |
96 | $RI_K $RI_K_End; | |
97 | $RI_L $RI_L_End; | |
98 | $RI_M $RI_M_End; | |
99 | $RI_N $RI_N_End; | |
100 | $RI_P $RI_P_End; | |
101 | $RI_R $RI_R_End; | |
102 | $RI_S $RI_S_End; | |
103 | $RI_T $RI_T_End; | |
104 | $RI_U $RI_U_End; | |
105 | $RI_V $RI_V_End; | |
106 | ||
46f4442e A |
107 | [^$Control $CR $LF] $Extend; |
108 | ||
109 | [^$Control $CR $LF] $SpacingMark; | |
4388f060 | 110 | # TODO: No Prepend rule because the Prepend set is currently empty. Restore if it becomes non-empty again: $Prepend [^$Control $CR $LF]; |
46f4442e | 111 | |
b75a7d8f | 112 | |
374ca955 | 113 | ## ---- Reverse rules: the same sequences as the forward rules, each written with its elements in the opposite order ---- |
b75a7d8f | 114 | |
374ca955 | 115 | !!reverse; |
46f4442e A |
116 | $LF $CR; |
117 | ($L | $V | $LV | $LVT) $L; | |
118 | ($V | $T) ($LV | $V); | |
119 | $T ($LVT | $T); | |
120 | ||
51004dcb A |
121 | $RI_A_End $RI_A; |
122 | $RI_B_End $RI_B; | |
123 | $RI_C_End $RI_C; | |
124 | $RI_D_End $RI_D; | |
125 | $RI_E_End $RI_E; | |
126 | $RI_F_End $RI_F; | |
127 | $RI_G_End $RI_G; | |
128 | $RI_H_End $RI_H; | |
129 | $RI_I_End $RI_I; | |
130 | $RI_J_End $RI_J; | |
131 | $RI_K_End $RI_K; | |
132 | $RI_L_End $RI_L; | |
133 | $RI_M_End $RI_M; | |
134 | $RI_N_End $RI_N; | |
135 | $RI_P_End $RI_P; | |
136 | $RI_R_End $RI_R; | |
137 | $RI_S_End $RI_S; | |
138 | $RI_T_End $RI_T; | |
139 | $RI_U_End $RI_U; | |
140 | $RI_V_End $RI_V; | |
141 | ||
46f4442e A |
142 | $Extend [^$Control $CR $LF]; |
143 | $SpacingMark [^$Control $CR $LF]; | |
4388f060 | 144 | # TODO: No Prepend rule because the Prepend set is currently empty. Restore if it becomes non-empty again: [^$Control $CR $LF] $Prepend; |
374ca955 | 145 | |
374ca955 A |
146 | |
147 | ## ---- Safe reverse rules ---- | |
51004dcb A |
148 | # Safe char break rules are not logically needed, but if none are provided at all, |
149 | # the engine for preceding() and following() will fall back to the | |
150 | # old-style, inefficient algorithm, so a minimal CR/LF rule is supplied in each direction. | |
374ca955 A |
151 | |
152 | !!safe_reverse; | |
51004dcb | 153 | $LF $CR; |
374ca955 A |
154 | |
155 | ## ---- Safe forward rules ---- | |
156 | ||
157 | !!safe_forward; | |
51004dcb | 158 | $CR $LF; |
374ca955 | 159 |