]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | # |
374ca955 A |
2 | # Copyright (C) 2002-2004, International Business Machines Corporation |
3 | # and others. All Rights Reserved. | |
b75a7d8f | 4 | # |
374ca955 | 5 | # file: word.txt |
b75a7d8f | 6 | # |
374ca955 | 7 | # ICU Word Break Rules |
b75a7d8f | 8 | # See Unicode Standard Annex #29. |
374ca955 | 9 | # These rules are based on Version 4.1 draft, dated 2004-11-11 |
b75a7d8f A |
10 | # |
11 | ||
374ca955 | 12 | ############################################################################## |
b75a7d8f A |
13 | # |
14 | # Character class definitions from TR 29 (Unicode Standard Annex #29): $Katakana, $ALetter, $MidLetter, $MidNum, $Numeric, $ExtendNumLet | |
15 | # | |
374ca955 A |
16 | ############################################################################## |
17 | ||
18 | !!chain; | |
19 | ||
20 | $Katakana = [[:Script = KATAKANA:] | |
21 | [:name = VERTICAL KANA REPEAT MARK:] | |
22 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:] | |
23 | [:name = VERTICAL KANA REPEAT MARK UPPER HALF:] | |
24 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:] | |
25 | [:name = VERTICAL KANA REPEAT MARK LOWER HALF:] | |
26 | [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:] | |
27 | [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:] | |
28 | [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:] | |
29 | [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
30 | [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
31 | [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] | |
32 | [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; | |
33 | ||
34 | ||
35 | $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] | |
36 | - [:Ideographic:] | |
37 | - $Katakana | |
38 | - [:Script = Hiragana:] | |
39 | - [:Script = Lao:] | |
40 | - [:Grapheme_Extend = TRUE:]]; | |
41 | ||
42 | $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] | |
43 | [:name = HEBREW PUNCTUATION GERSHAYIM:] | |
44 | [:name = RIGHT SINGLE QUOTATION MARK:] | |
45 | [:name = HYPHENATION POINT:] | |
46 | [:name = COLON:]]; | |
47 | ||
48 | ||
49 | $MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]]; | |
50 | $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]]; | |
51 | $ExtendNumLet = [[:Connector_Punctuation:] | |
52 | - [:name = KATAKANA MIDDLE DOT:] | |
53 | - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]]; | |
54 | ||
55 | ||
b75a7d8f A |
56 | |
57 | # | |
58 | # Character Class Definitions: $CR, $LF, $Extend, $Control, $Format, $Hiragana, $Ideographic. | |
59 | # The names are those from TR29. Each *Ex variable further down is its base class followed by $Extend*. | |
60 | # | |
b75a7d8f | 61 | |
374ca955 A |
62 | $CR = \u000d; |
63 | $LF = \u000a; | |
64 | $Extend = [[:Grapheme_Extend = TRUE:]]; | |
65 | $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; | |
66 | $Format = [[:Cf:] - $Extend]; | |
67 | $Hiragana = [:Hiragana:]; | |
68 | $Ideographic = [:IDEOGRAPHIC:]; | |
b75a7d8f | 69 | |
374ca955 A |
70 | $ALetterEx = $ALetter $Extend*; |
71 | $NumericEx = $Numeric $Extend*; | |
72 | $MidNumEx = $MidNum $Extend*; | |
73 | $MidLetterEx = $MidLetter $Extend*; | |
74 | $KatakanaEx = $Katakana $Extend*; | |
75 | $ExtendNumLetEx = $ExtendNumLet $Extend*; | |
b75a7d8f | 76 | |
374ca955 | 77 | ## ------------------------------------------------- |
b75a7d8f | 78 | |
374ca955 | 79 | !!forward; |
b75a7d8f | 80 | |
b75a7d8f | 81 | |
374ca955 A |
82 | # Rule 3 - don't break grapheme clusters ($CR $LF stays together; $Extend characters stay attached to the preceding non-$Control character). |
83 | # see character breaks. A trailing {100}/{200}/{300}/{400} is the status value attached to that rule (presumably ICU's number/letter/kana/ideograph word tags - confirm against ubrk.h). | |
b75a7d8f | 84 | |
374ca955 A |
85 | $CR $LF; |
86 | #[^$Control] $Extend*; | |
87 | #$NumericEx $Extend* {100}; | |
88 | #$ALetterEx $Extend* {200}; | |
89 | [^$Control] $Extend+; | |
90 | $NumericEx {100}; | |
91 | $ALetterEx {200}; | |
92 | $KatakanaEx {300}; | |
b75a7d8f | 93 | |
374ca955 | 94 | # rule 5 - no break between letters ($Format* lets format characters sit in between, here and in the rules below) |
b75a7d8f | 95 | |
374ca955 | 96 | $ALetterEx $Format* $ALetterEx {200}; |
b75a7d8f | 97 | |
374ca955 A |
98 | # rule 6 and 7 - no break in the sequence letter, $MidLetter, letter |
99 | $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200}; | |
b75a7d8f | 100 | |
374ca955 | 101 | # rule 8 - no break between digits |
b75a7d8f | 102 | |
374ca955 | 103 | $NumericEx $Format* $NumericEx {100}; |
b75a7d8f | 104 | |
374ca955 | 105 | # rule 9 - no break between a letter and a following digit |
b75a7d8f | 106 | |
374ca955 | 107 | $ALetterEx $Format* $NumericEx {200}; |
b75a7d8f | 108 | |
374ca955 | 109 | # rule 10 - no break between a digit and a following letter |
b75a7d8f | 110 | |
374ca955 | 111 | $NumericEx $Format* $ALetterEx {200}; |
b75a7d8f | 112 | |
374ca955 A |
113 | # rule 11 and 12 - no break in the sequence digit, $MidNum, digit |
114 | ||
115 | $NumericEx $Format* $MidNumEx $Format* $NumericEx {100}; | |
116 | ||
117 | # rule 13 - no break between Katakana; the $Hiragana and $Ideographic rules only attach $Extend* and a status value | |
118 | ||
119 | $KatakanaEx $Format* $KatakanaEx {300}; | |
120 | $Hiragana $Extend* {300}; | |
121 | $Ideographic $Extend* {400}; | |
122 | ||
123 | # rule 13a/b - $ExtendNumLet joins to letters, digits, Katakana and itself (13a: $ExtendNumLet follows; 13b: $ExtendNumLet leads) | |
124 | ||
125 | $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a) | |
126 | $NumericEx $Format* $ExtendNumLetEx {100}; # (13a) | |
127 | $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a) | |
128 | $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a) | |
129 | ||
130 | $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b) | |
131 | $ExtendNumLetEx $Format* $NumericEx {100}; # (13b) | |
132 | $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b) | |
133 | ||
134 | ||
135 | ||
136 | ## ------------------------------------------------- | |
b75a7d8f | 137 | |
374ca955 A |
138 | !!reverse; |
139 | ||
140 | $BackALetterEx = $Extend* $ALetter; | |
141 | $BackNumericEx = $Extend* $Numeric; | |
142 | $BackMidNumEx = $Extend* $MidNum; | |
143 | $BackMidLetterEx = $Extend* $MidLetter; | |
144 | $BackKatakanaEx = $Extend* $Katakana; | |
145 | $BackExtendNumLetEx= $Extend* $ExtendNumLet; | |
146 | ||
147 | $LF $CR; | |
148 | ||
149 | # see character breaks. The rules in this section mirror the forward rules with each sequence written in reverse order and without {status} values; the Back*Ex classes put $Extend* before the base character. | |
150 | ||
151 | $Extend* [^$Control]; | |
152 | ||
153 | # rule 5 - no break between letters | |
154 | ||
155 | $BackALetterEx $Format* $BackALetterEx; | |
156 | ||
157 | # rule 6 and 7 - no break in the sequence letter, $MidLetter, letter | |
158 | ||
159 | $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx; | |
160 | ||
161 | ||
162 | # rule 8 - no break between digits | |
163 | ||
164 | $BackNumericEx $Format* $BackNumericEx; | |
165 | ||
166 | # rule 9 - letter then digit in text order; written in reverse here, so the digit comes first | |
167 | ||
168 | $BackNumericEx $Format* $BackALetterEx; | |
169 | ||
170 | # rule 10 - digit then letter in text order; written in reverse here, so the letter comes first | |
171 | ||
172 | $BackALetterEx $Format* $BackNumericEx; | |
173 | ||
174 | # rule 11 and 12 - no break in the sequence digit, $MidNum, digit | |
175 | ||
176 | $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx; | |
177 | ||
178 | # rule 13 - no break between Katakana | |
179 | ||
180 | $BackKatakanaEx $Format* $BackKatakanaEx; | |
181 | ||
182 | # rules 13 a/b - $ExtendNumLet next to letters, digits, Katakana or another $ExtendNumLet | |
b75a7d8f | 183 | # |
374ca955 A |
184 | ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx; |
185 | $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); | |
186 | ||
187 | ## ------------------------------------------------- | |
188 | ||
189 | !!safe_reverse; | |
190 | ||
191 | # rule 3 - a run of $Extend characters, then one non-$Extend character | |
192 | $Extend+ [^$Extend]; | |
193 | $Extend+; # comes into play when buffer _begins_ with an $Extend+. | |
194 | ||
195 | # rule 4 - a run of $Format characters, then any one of the six Back*Ex classes | |
196 | $Format+ $BackALetterEx; | |
197 | $Format+ $BackNumericEx; | |
198 | $Format+ $BackMidLetterEx; | |
199 | $Format+ $BackMidNumEx; | |
200 | $Format+ $BackKatakanaEx; | |
201 | $Format+ $BackExtendNumLetEx; | |
202 | ||
203 | ||
204 | # rule 6 - $MidLetter, optional $Format, then a letter (uses bare $MidLetter, not $BackMidLetterEx) | |
205 | $MidLetter $Format* $BackALetterEx; | |
206 | ||
207 | # rule 11 - $MidNum, optional $Format, then a digit (uses bare $MidNum, not $BackMidNumEx) | |
208 | $MidNum $Format* $BackNumericEx; | |
209 | ||
210 | ## ------------------------------------------------- | |
211 | ||
212 | !!safe_forward; | |
213 | ||
214 | # rule 3 - a run of $Extend characters | |
215 | $Extend+; | |
216 | ||
217 | # rule 4 - leading $Extend/$Format characters before any of the six *Ex classes; the first group requires at least one $Format, the second at least one $Extend | |
218 | $Extend* $Format+ $ALetterEx; | |
219 | $Extend* $Format+ $NumericEx; | |
220 | $Extend* $Format+ $MidLetterEx; | |
221 | $Extend* $Format+ $MidNumEx; | |
222 | $Extend* $Format+ $KatakanaEx; | |
223 | $Extend* $Format+ $ExtendNumLetEx; | |
224 | ||
225 | $Extend+ $Format* $ALetterEx; | |
226 | $Extend+ $Format* $NumericEx; | |
227 | $Extend+ $Format* $MidLetterEx; | |
228 | $Extend+ $Format* $MidNumEx; | |
229 | $Extend+ $Format* $KatakanaEx; | |
230 | $Extend+ $Format* $ExtendNumLetEx; | |
231 | ||
232 | # rule 6 - $MidLetterEx, optional $Format, then a letter | |
233 | $MidLetterEx $Format* $ALetterEx; | |
b75a7d8f | 234 | |
374ca955 A |
235 | # rule 11 - $MidNumEx, optional $Format, then a digit |
236 | $MidNumEx $Format* $NumericEx; |