]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | # |
2 | # Copyright (C) 2002-2004, International Business Machines Corporation | |
3 | # and others. All Rights Reserved. | |
4 | # | |
5 | # file: word_POSIX.txt | |
6 | # | |
7 | # ICU Word Break Rules | |
8 | # See Unicode Standard Annex #29. | |
9 | # These rules are based on Version 4.1 draft, dated 2004-11-11 | |
10 | # | |
11 | ||
12 | ############################################################################## | |
13 | # Word-break character classes referenced by all rule sections of this file. | |
14 | # Character class definitions from TR 29 (Unicode Standard Annex #29). | |
15 | # $ALetter is built by subtraction: Ideographic, $Katakana, Hiragana, Lao and Grapheme_Extend characters are removed from the alphabetic set. | |
16 | ############################################################################## | |
17 | ||
18 | !!chain; | |
19 | ||
20 | $Katakana = [[:Script = KATAKANA:] | |
21 | [:name = VERTICAL KANA REPEAT MARK:] | |
22 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:] | |
23 | [:name = VERTICAL KANA REPEAT MARK UPPER HALF:] | |
24 | [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:] | |
25 | [:name = VERTICAL KANA REPEAT MARK LOWER HALF:] | |
26 | [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:] | |
27 | [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:] | |
28 | [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:] | |
29 | [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
30 | [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] | |
31 | [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] | |
32 | [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; | |
33 | ||
34 | ||
35 | $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:] | |
36 | - [:Ideographic:] | |
37 | - $Katakana | |
38 | - [:Script = Hiragana:] | |
39 | - [:Script = Lao:] | |
40 | - [:Grapheme_Extend = TRUE:]]; | |
41 | ||
42 | $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] | |
43 | [:name = HEBREW PUNCTUATION GERSHAYIM:] | |
44 | [:name = RIGHT SINGLE QUOTATION MARK:] | |
45 | [:name = HYPHENATION POINT:]]; | |
46 | ||
47 | ||
48 | $MidNum = [[:LineBreak = Infix_Numeric:]]; | |
49 | $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]]; | |
50 | $ExtendNumLet = [[:Connector_Punctuation:] | |
51 | - [:name = KATAKANA MIDDLE DOT:] | |
52 | - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]]; | |
53 | ||
54 | ||
55 | ||
56 | # | |
57 | # Character Class Definitions. | |
58 | # The names are those from TR29. | |
59 | # Each *Ex variable at the end is its base class followed by zero or more $Extend characters. | |
60 | ||
61 | $CR = \u000d; | |
62 | $LF = \u000a; | |
63 | $Extend = [[:Grapheme_Extend = TRUE:]]; | |
64 | $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; | |
65 | $Format = [[:Cf:] - $Extend]; | |
66 | $Hiragana = [:Hiragana:]; | |
67 | $Ideographic = [:IDEOGRAPHIC:]; | |
68 | ||
69 | $ALetterEx = $ALetter $Extend*; | |
70 | $NumericEx = $Numeric $Extend*; | |
71 | $MidNumEx = $MidNum $Extend*; | |
72 | $MidLetterEx = $MidLetter $Extend*; | |
73 | $KatakanaEx = $Katakana $Extend*; | |
74 | $ExtendNumLetEx = $ExtendNumLet $Extend*; | |
75 | ||
76 | ## ---------------- forward rules ---------------- | |
77 | ||
78 | !!forward; | |
79 | ||
80 | ||
81 | # Rule 3 - don't break grapheme clusters. | |
82 | # see character breaks. Rules below end in a {nnn} tag: 100 (numeric), 200 (letter), 300 (kana) or 400 (ideographic). | |
83 | ||
84 | $CR $LF; | |
85 | #[^$Control] $Extend*; | |
86 | #$NumericEx $Extend* {100}; | |
87 | #$ALetterEx $Extend* {200}; | |
88 | [^$Control] $Extend+; | |
89 | $NumericEx {100}; | |
90 | $ALetterEx {200}; | |
91 | $KatakanaEx {300}; | |
92 | ||
93 | # rule 5 - letter followed by letter ($Format characters may sit in between, here and in the rules below) | |
94 | ||
95 | $ALetterEx $Format* $ALetterEx {200}; | |
96 | ||
97 | # rule 6 and 7 - two letters joined by a single $MidLetter | |
98 | $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200}; | |
99 | ||
100 | # rule 8 - numeric followed by numeric | |
101 | ||
102 | $NumericEx $Format* $NumericEx {100}; | |
103 | ||
104 | # rule 9 - letter followed by numeric | |
105 | ||
106 | $ALetterEx $Format* $NumericEx {200}; | |
107 | ||
108 | # rule 10 - numeric followed by letter | |
109 | ||
110 | $NumericEx $Format* $ALetterEx {200}; | |
111 | ||
112 | # rule 11 and 12 - two numerics joined by a single $MidNum | |
113 | ||
114 | $NumericEx $Format* $MidNumEx $Format* $NumericEx {100}; | |
115 | ||
116 | # rule 13 - katakana followed by katakana; $Hiragana and $Ideographic only take their trailing $Extend characters | |
117 | ||
118 | $KatakanaEx $Format* $KatakanaEx {300}; | |
119 | $Hiragana $Extend* {300}; | |
120 | $Ideographic $Extend* {400}; | |
121 | ||
122 | # rule 13a/b - $ExtendNumLet after (13a) or before (13b) a letter, numeric or katakana; 13a also covers $ExtendNumLet after $ExtendNumLet | |
123 | ||
124 | $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a) | |
125 | $NumericEx $Format* $ExtendNumLetEx {100}; # (13a) | |
126 | $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a) | |
127 | $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a) | |
128 | ||
129 | $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b) | |
130 | $ExtendNumLetEx $Format* $NumericEx {100}; # (13b) | |
131 | $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b) | |
132 | ||
133 | ||
134 | ||
135 | ## ---------------- reverse rules: patterns are written back to front, so $Extend* leads in the Back*Ex variables ---------------- | |
136 | ||
137 | !!reverse; | |
138 | ||
139 | $BackALetterEx = $Extend* $ALetter; | |
140 | $BackNumericEx = $Extend* $Numeric; | |
141 | $BackMidNumEx = $Extend* $MidNum; | |
142 | $BackMidLetterEx = $Extend* $MidLetter; | |
143 | $BackKatakanaEx = $Extend* $Katakana; | |
144 | $BackExtendNumLetEx= $Extend* $ExtendNumLet; | |
145 | ||
146 | $LF $CR; | |
147 | ||
148 | # rule 3 - keep $Extend characters with the non-$Control character they follow; see character breaks | |
149 | ||
150 | $Extend* [^$Control]; | |
151 | ||
152 | # rule 5 - letter next to letter | |
153 | ||
154 | $BackALetterEx $Format* $BackALetterEx; | |
155 | ||
156 | # rule 6 and 7 - two letters joined by a single $MidLetter | |
157 | ||
158 | $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx; | |
159 | ||
160 | ||
161 | # rule 8 - numeric next to numeric | |
162 | ||
163 | $BackNumericEx $Format* $BackNumericEx; | |
164 | ||
165 | # rule 9 - letter followed by numeric (written here numeric first) | |
166 | ||
167 | $BackNumericEx $Format* $BackALetterEx; | |
168 | ||
169 | # rule 10 - numeric followed by letter (written here letter first) | |
170 | ||
171 | $BackALetterEx $Format* $BackNumericEx; | |
172 | ||
173 | # rule 11 and 12 - two numerics joined by a single $MidNum | |
174 | ||
175 | $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx; | |
176 | ||
177 | # rule 13 - katakana next to katakana | |
178 | ||
179 | $BackKatakanaEx $Format* $BackKatakanaEx; | |
180 | ||
181 | # rules 13 a/b - $ExtendNumLet next to a letter, numeric, katakana or another $ExtendNumLet | |
182 | # | |
183 | ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx; | |
184 | $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx); | |
185 | ||
186 | ## ---------------- safe reverse rules ---------------- | |
187 | ||
188 | !!safe_reverse; | |
189 | ||
190 | # rule 3 - a run of $Extend characters together with one non-$Extend character | |
191 | $Extend+ [^$Extend]; | |
192 | $Extend+; # comes into play when buffer _begins_ with an $Extend+. | |
193 | ||
194 | # rule 4 - a run of $Format characters next to any of the six word classes | |
195 | $Format+ $BackALetterEx; | |
196 | $Format+ $BackNumericEx; | |
197 | $Format+ $BackMidLetterEx; | |
198 | $Format+ $BackMidNumEx; | |
199 | $Format+ $BackKatakanaEx; | |
200 | $Format+ $BackExtendNumLetEx; | |
201 | ||
202 | ||
203 | # rule 6 - $MidLetter next to a letter | |
204 | $MidLetter $Format* $BackALetterEx; | |
205 | ||
206 | # rule 11 - $MidNum next to a numeric | |
207 | $MidNum $Format* $BackNumericEx; | |
208 | ||
209 | ## ---------------- safe forward rules ---------------- | |
210 | ||
211 | !!safe_forward; | |
212 | ||
213 | # rule 3 - a run of $Extend characters | |
214 | $Extend+; | |
215 | ||
216 | # rule 4 - each word class preceded by $Extend and/or $Format characters; the two groups below require at least one $Format or at least one $Extend respectively | |
217 | $Extend* $Format+ $ALetterEx; | |
218 | $Extend* $Format+ $NumericEx; | |
219 | $Extend* $Format+ $MidLetterEx; | |
220 | $Extend* $Format+ $MidNumEx; | |
221 | $Extend* $Format+ $KatakanaEx; | |
222 | $Extend* $Format+ $ExtendNumLetEx; | |
223 | ||
224 | $Extend+ $Format* $ALetterEx; | |
225 | $Extend+ $Format* $NumericEx; | |
226 | $Extend+ $Format* $MidLetterEx; | |
227 | $Extend+ $Format* $MidNumEx; | |
228 | $Extend+ $Format* $KatakanaEx; | |
229 | $Extend+ $Format* $ExtendNumLetEx; | |
230 | ||
231 | # rule 6 - $MidLetter followed by a letter | |
232 | $MidLetterEx $Format* $ALetterEx; | |
233 | ||
234 | # rule 11 - $MidNum followed by a numeric | |
235 | $MidNumEx $Format* $NumericEx; |