]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/word_POSIX.txt
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word_POSIX.txt
1 #
2 # Copyright (C) 2002-2004, International Business Machines Corporation
3 # and others. All Rights Reserved.
4 #
5 # file: word_POSIX.txt
6 #
7 # ICU Word Break Rules
8 # See Unicode Standard Annex #29.
9 # These rules are based on Version 4.1 draft, dated 2004-11-11
10 #
11
12 ##############################################################################
13 #
14 # Character class definitions from TR 29
15 #
16 ##############################################################################
17
# Rule-file option, set once before any definitions.
# NOTE(review): per the ICU break-rules documentation, !!chain turns on rule
# chaining (a match of one rule may continue into another rule); confirm
# against the ICU version this file is compiled with.
18 !!chain;
19
# $Katakana: every character whose Script property is KATAKANA, plus the
# individually named characters listed below: the vertical kana repeat marks,
# the katakana-hiragana voiced / semi-voiced sound marks, double hyphen and
# prolonged sound mark, and the halfwidth prolonged / voiced / semi-voiced
# sound marks.
20 $Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
33
34
# $ALetter: all Alphabetic characters plus HEBREW PUNCTUATION GERESH, with
# these removed by set difference: Ideographic characters, $Katakana, the
# Hiragana and Lao scripts, and everything with Grapheme_Extend = TRUE
# (the same property that defines $Extend later in this file).
35 $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
36 - [:Ideographic:]
37 - $Katakana
38 - [:Script = Hiragana:]
39 - [:Script = Lao:]
40 - [:Grapheme_Extend = TRUE:]];
41
# $MidLetter: exactly the five named punctuation characters below.
# In the forward rules of this file it is only matched when it sits between
# two $ALetterEx (the rule labelled "rule 6 and 7").
42 $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]];
46
47
# $MidNum:       characters whose LineBreak property is Infix_Numeric.
# $Numeric:      General_Category Nd characters plus the ARABIC DECIMAL
#                SEPARATOR and ARABIC THOUSANDS SEPARATOR.
# $ExtendNumLet: Connector_Punctuation characters, excluding KATAKANA MIDDLE
#                DOT and HALFWIDTH KATAKANA MIDDLE DOT.
48 $MidNum = [[:LineBreak = Infix_Numeric:]];
49 $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
50 $ExtendNumLet = [[:Connector_Punctuation:]
51 - [:name = KATAKANA MIDDLE DOT:]
52 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
53
54
55
56 #
57 # Character Class Definitions.
58 # The names are those from TR29.
59 #
60
# $CR / $LF:    the single characters U+000D and U+000A.
# $Extend:      characters with Grapheme_Extend = TRUE.
# $Control:     Zl, Zp, Cc and Cf characters, minus $Extend.
# $Format:      Cf characters, minus $Extend. Because $Control also contains
#               Cf, every $Format character is also a $Control character.
# $Hiragana:    characters matching the [:Hiragana:] property expression.
# $Ideographic: characters matching the [:IDEOGRAPHIC:] property expression.
61 $CR = \u000d;
62 $LF = \u000a;
63 $Extend = [[:Grapheme_Extend = TRUE:]];
64 $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
65 $Format = [[:Cf:] - $Extend];
66 $Hiragana = [:Hiragana:];
67 $Ideographic = [:IDEOGRAPHIC:];
68
# "Ex" variants: each is the base class followed by zero or more $Extend
# characters, so a base character and the $Extend characters trailing it are
# matched together as one unit by the forward and safe_forward rules.
69 $ALetterEx = $ALetter $Extend*;
70 $NumericEx = $Numeric $Extend*;
71 $MidNumEx = $MidNum $Extend*;
72 $MidLetterEx = $MidLetter $Extend*;
73 $KatakanaEx = $Katakana $Extend*;
74 $ExtendNumLetEx = $ExtendNumLet $Extend*;
75
76 ## -------------------------------------------------
77
# Start of the forward rules section.
# The {100}/{200}/{300}/{400} suffixes on rules in this section are rule
# status tags.
# NOTE(review): ICU's ubrk.h documents these ranges as number (100),
# letter (200), kana (300) and ideograph (400) word statuses -- confirm
# against the ICU headers shipped with this tree.
78 !!forward;
79
80
81 # Rule 3 - don't break grapheme clusters.
82 # see character breaks
83
# CR followed by LF is matched as a single unit.
84 $CR $LF;
# The three '#'-prefixed rules below are disabled; the active rules that
# follow replace them ($Extend+ instead of $Extend*, and the *Ex classes
# without an additional trailing $Extend*).
85 #[^$Control] $Extend*;
86 #$NumericEx $Extend* {100};
87 #$ALetterEx $Extend* {200};
# Any non-$Control character followed by at least one $Extend (no status tag).
88 [^$Control] $Extend+;
# A single numeric, letter or katakana unit, tagged 100 / 200 / 300.
89 $NumericEx {100};
90 $ALetterEx {200};
91 $KatakanaEx {300};
92
# In every rule of this group, $Format* allows any number of $Format
# characters between the two units being joined.
93 # rule 5
# letter followed by letter
94
95 $ALetterEx $Format* $ALetterEx {200};
96
97 # rule 6 and 7
# letter, one $MidLetter unit, letter
98 $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
99
100 # rule 8
# numeric followed by numeric
101
102 $NumericEx $Format* $NumericEx {100};
103
104 # rule 9
# letter followed by numeric (tagged 200, like the letter rules)
105
106 $ALetterEx $Format* $NumericEx {200};
107
108 # rule 10
# numeric followed by letter (tagged 200, like the letter rules)
109
110 $NumericEx $Format* $ALetterEx {200};
111
112 # rule 11 and 12
# numeric, one $MidNum unit, numeric
113
114 $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
115
116 # rule 13
# katakana followed by katakana; the Hiragana and Ideographic rules below
# each match only a single character plus any trailing $Extend.
117
118 $KatakanaEx $Format* $KatakanaEx {300};
119 $Hiragana $Extend* {300};
120 $Ideographic $Extend* {400};
121
122 # rule 13a/b
# (13a): a letter, numeric, katakana or $ExtendNumLet unit followed by an
#        $ExtendNumLet unit.
# (13b): an $ExtendNumLet unit followed by a letter, numeric or katakana unit.
# Each rule carries the status tag of its letter/numeric/katakana operand;
# the $ExtendNumLet-to-$ExtendNumLet rule is tagged 200.
123
124 $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
125 $NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
126 $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
127 $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
128
129 $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
130 $ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
131 $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
132
133
134
135 ## -------------------------------------------------
136
# Start of the reverse rules section.
# The $Back*Ex classes mirror the forward *Ex classes with the operands
# swapped: $Extend* comes first and the base character second.
# NOTE(review): presumably because reverse rules consume the text from the
# end towards the start, so trailing $Extend characters are seen before
# their base -- confirm against the ICU break-rules documentation.
137 !!reverse;
138
139 $BackALetterEx = $Extend* $ALetter;
140 $BackNumericEx = $Extend* $Numeric;
141 $BackMidNumEx = $Extend* $MidNum;
142 $BackMidLetterEx = $Extend* $MidLetter;
143 $BackKatakanaEx = $Extend* $Katakana;
144 $BackExtendNumLetEx= $Extend* $ExtendNumLet;
145
# Reverse counterparts of the forward rules, written with the operands in
# the opposite order and using the $Back*Ex classes. None of the rules in
# this section carries a {nnn} status tag.

# Mirror of the forward "$CR $LF" rule.
146 $LF $CR;
147
148 # see character breaks
# Mirror of the forward grapheme-cluster rule: optional $Extend characters
# followed by one non-$Control character.
149
150 $Extend* [^$Control];
151
152 # rule 5
# letter / letter
153
154 $BackALetterEx $Format* $BackALetterEx;
155
156 # rule 6 and 7
# letter / $MidLetter / letter
157
158 $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
159
160
161 # rule 8
# numeric / numeric
162
163 $BackNumericEx $Format* $BackNumericEx;
164
165 # rule 9
# forward rule 9 is "letter then numeric"; here the operands are swapped
166
167 $BackNumericEx $Format* $BackALetterEx;
168
169 # rule 10
# forward rule 10 is "numeric then letter"; here the operands are swapped
170
171 $BackALetterEx $Format* $BackNumericEx;
172
173 # rule 11 and 12
# numeric / $MidNum / numeric
174
175 $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
176
177 # rule 13
# katakana / katakana (this section has no dedicated Hiragana or
# Ideographic rule)
178
179 $BackKatakanaEx $Format* $BackKatakanaEx;
180
181 # rules 13 a/b
182 #
# The two rules below fold the seven forward 13a/13b rules into two
# alternations: any of letter/numeric/katakana/$ExtendNumLet joined with an
# $ExtendNumLet, and an $ExtendNumLet joined with letter/numeric/katakana.
183 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
184 $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
185
186 ## -------------------------------------------------
187
# Start of the safe_reverse rules section.
# NOTE(review): per the ICU break-rules documentation, safe rules are used to
# move from an arbitrary text position to one from which normal iteration
# can resume -- confirm for the ICU version in use.
188 !!safe_reverse;
189
190 # rule 3
# One or more $Extend followed by a single non-$Extend character, or a run
# of $Extend on its own.
191 $Extend+ [^$Extend];
192 $Extend+; # comes into play when buffer _begins_ with an $Extend+.
193
194 # rule 4
# One or more $Format characters followed by any one of the six $Back*Ex
# classes.
195 $Format+ $BackALetterEx;
196 $Format+ $BackNumericEx;
197 $Format+ $BackMidLetterEx;
198 $Format+ $BackMidNumEx;
199 $Format+ $BackKatakanaEx;
200 $Format+ $BackExtendNumLetEx;
201
202
203 # rule 6
# A bare $MidLetter (not $BackMidLetterEx), optional $Format, then a letter.
204 $MidLetter $Format* $BackALetterEx;
205
206 # rule 11
# A bare $MidNum (not $BackMidNumEx), optional $Format, then a numeric.
207 $MidNum $Format* $BackNumericEx;
208
209 ## -------------------------------------------------
210
# Start of the safe_forward rules section.
# NOTE(review): per the ICU break-rules documentation, safe rules are used to
# move from an arbitrary text position to one from which normal iteration
# can resume -- confirm for the ICU version in use.
211 !!safe_forward;
212
213 # rule 3
# A run of one or more $Extend characters.
214 $Extend+;
215
216 # rule 4
# First group: optional $Extend, then at least one $Format, then any one of
# the six *Ex classes.
217 $Extend* $Format+ $ALetterEx;
218 $Extend* $Format+ $NumericEx;
219 $Extend* $Format+ $MidLetterEx;
220 $Extend* $Format+ $MidNumEx;
221 $Extend* $Format+ $KatakanaEx;
222 $Extend* $Format+ $ExtendNumLetEx;
223
# Second group: at least one $Extend, then optional $Format, then any one of
# the six *Ex classes.
224 $Extend+ $Format* $ALetterEx;
225 $Extend+ $Format* $NumericEx;
226 $Extend+ $Format* $MidLetterEx;
227 $Extend+ $Format* $MidNumEx;
228 $Extend+ $Format* $KatakanaEx;
229 $Extend+ $Format* $ExtendNumLetEx;
230
231 # rule 6
# A $MidLetter unit, optional $Format, then a letter unit.
232 $MidLetterEx $Format* $ALetterEx;
233
234 # rule 11
# A $MidNum unit, optional $Format, then a numeric unit.
235 $MidNumEx $Format* $NumericEx;