git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/word.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word.txt
1 #
2 # Copyright (C) 2002-2004, International Business Machines Corporation
3 # and others. All Rights Reserved.
4 #
5 # file: word.txt
6 #
7 # ICU Word Break Rules
8 # See Unicode Standard Annex #29.
9 # These rules are based on the Version 4.1 draft of that annex, dated 2004-11-11.
10 #
11
12 ##############################################################################
13 #
14 # Character class definitions from TR 29
15 #
16 ##############################################################################
17 # Rule-set option. NOTE(review): presumably enables rule chaining as defined by ICU's break-rule syntax - confirm there; this file does not show its effect.
18 !!chain;
19 # $Katakana: every Katakana-script character plus the individually named kana repeat, voiced / semi-voiced sound, double hyphen and prolonged sound marks listed below.
20 $Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
33 # $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with ideographs, $Katakana,
34 # Hiragana-script and Lao-script characters, and Grapheme_Extend characters removed.
35 $ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
36 - [:Ideographic:]
37 - $Katakana
38 - [:Script = Hiragana:]
39 - [:Script = Lao:]
40 - [:Grapheme_Extend = TRUE:]];
41 # $MidLetter: the six named punctuation characters below. The "rule 6 and 7" patterns accept one of them between two letters.
42 $MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]
46 [:name = COLON:]];
47 # $MidNum: the Infix_Numeric line-break class without COLON (COLON is listed in $MidLetter instead). $Numeric: decimal digits (Nd) plus the two Arabic separators.
48 # $ExtendNumLet: Connector_Punctuation with the two Katakana middle dots removed.
49 $MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
50 $Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
51 $ExtendNumLet = [[:Connector_Punctuation:]
52 - [:name = KATAKANA MIDDLE DOT:]
53 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
54
55
56
57 #
58 # Character Class Definitions.
59 # The names are those from TR29.
60 #
61 # These classes are the building blocks referenced by the rule sections that follow.
62 $CR = \u000d; # U+000D, carriage return
63 $LF = \u000a; # U+000A, line feed
64 $Extend = [[:Grapheme_Extend = TRUE:]]; # the same property that is subtracted from $ALetter
65 $Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend]; # line and paragraph separators, control and format characters, minus $Extend
66 $Format = [[:Cf:] - $Extend]; # format characters that are not $Extend; most rules allow $Format* between their elements
67 $Hiragana = [:Hiragana:]; # NOTE(review): $ALetter spells this as [:Script = Hiragana:] - presumably the same set, confirm
68 $Ideographic = [:IDEOGRAPHIC:]; # the property $ALetter subtracts as [:Ideographic:]
69 # *Ex helpers: one base character of the named class followed by zero or more $Extend characters.
70 $ALetterEx = $ALetter $Extend*;
71 $NumericEx = $Numeric $Extend*;
72 $MidNumEx = $MidNum $Extend*;
73 $MidLetterEx = $MidLetter $Extend*;
74 $KatakanaEx = $Katakana $Extend*;
75 $ExtendNumLetEx = $ExtendNumLet $Extend*;
76
77 ## -------------------------------------------------
78
79 !!forward;
80 # Forward rules. A brace-enclosed number ending a rule is attached to that rule; this file uses 100, 200, 300 and 400.
81 # NOTE(review): these look like ICU's word-break status categories (number, letter, kana, ideograph) - confirm against the ICU break iterator API.
82 # Rule 3 - don't break grapheme clusters.
83 # see character breaks
84 # The three lines commented out below are an earlier spelling of the rules that follow them.
85 $CR $LF;
86 #[^$Control] $Extend*;
87 #$NumericEx $Extend* {100};
88 #$ALetterEx $Extend* {200};
89 [^$Control] $Extend+; # any non-$Control character together with one or more following $Extend characters
90 $NumericEx {100}; # a lone numeric character (with its $Extend run)
91 $ALetterEx {200}; # a lone letter (with its $Extend run)
92 $KatakanaEx {300}; # a lone Katakana character (with its $Extend run)
93
94 # rule 5
95 # letter followed by letter; here and below, $Format* lets format characters sit between the elements
96 $ALetterEx $Format* $ALetterEx {200};
97
98 # rule 6 and 7 - letter, one $MidLetter character, letter
99 $ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
100
101 # rule 8
102 # number followed by number
103 $NumericEx $Format* $NumericEx {100};
104
105 # rule 9
106 # letter followed by number
107 $ALetterEx $Format* $NumericEx {200};
108
109 # rule 10
110 # number followed by letter
111 $NumericEx $Format* $ALetterEx {200};
112
113 # rule 11 and 12
114 # number, one $MidNum character, number
115 $NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
116
117 # rule 13
118 # Katakana followed by Katakana, then single Hiragana and ideographic characters with their $Extend runs
119 $KatakanaEx $Format* $KatakanaEx {300};
120 $Hiragana $Extend* {300};
121 $Ideographic $Extend* {400};
122
123 # rule 13a/b
124 # 13a: a letter, number, Katakana or $ExtendNumLet character followed by $ExtendNumLet. 13b: $ExtendNumLet followed by a letter, number or Katakana.
125 $ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
126 $NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
127 $KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
128 $ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
129
130 $ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
131 $ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
132 $ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
133
134
135
136 ## -------------------------------------------------
137
138 !!reverse;
139 # Back* helpers mirror the *Ex helpers: the $Extend* run is written first, then the base character.
140 $BackALetterEx = $Extend* $ALetter;
141 $BackNumericEx = $Extend* $Numeric;
142 $BackMidNumEx = $Extend* $MidNum;
143 $BackMidLetterEx = $Extend* $MidLetter;
144 $BackKatakanaEx = $Extend* $Katakana;
145 $BackExtendNumLetEx= $Extend* $ExtendNumLet;
146 # Mirror of the forward "$CR $LF" rule. NOTE(review): patterns in this section appear to be written in the order met when moving backward through the text - confirm.
147 $LF $CR;
148
149 # see character breaks
150 # zero or more $Extend characters, then any character that is not $Control
151 $Extend* [^$Control];
152
153 # rule 5
154 # letter and letter
155 $BackALetterEx $Format* $BackALetterEx;
156
157 # rule 6 and 7
158 # letter, one $MidLetter character, letter
159 $BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
160
161
162 # rule 8
163 # number and number
164 $BackNumericEx $Format* $BackNumericEx;
165
166 # rule 9
167 # pattern lists the number first, then the letter (the forward rule 9 lists letter, then number)
168 $BackNumericEx $Format* $BackALetterEx;
169
170 # rule 10
171 # pattern lists the letter first, then the number (the forward rule 10 lists number, then letter)
172 $BackALetterEx $Format* $BackNumericEx;
173
174 # rule 11 and 12
175 # number, one $MidNum character, number
176 $BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
177
178 # rule 13
179 # Katakana and Katakana
180 $BackKatakanaEx $Format* $BackKatakanaEx;
181
182 # rules 13 a/b
183 # Same class pairs as the forward 13a/13b rules, written with alternation; no brace-enclosed numbers are attached in this section.
184 ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
185 $BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
186
187 ## -------------------------------------------------
188
189 !!safe_reverse;
190 # NOTE(review): the role of this section is defined by ICU's rule compiler, not by this file - presumably it finds a position from which the other rules can be applied safely; confirm.
191 # rule 3
192 $Extend+ [^$Extend];
193 $Extend+; # comes into play when buffer _begins_ with an $Extend+.
194
195 # rule 4
196 $Format+ $BackALetterEx;
197 $Format+ $BackNumericEx;
198 $Format+ $BackMidLetterEx;
199 $Format+ $BackMidNumEx;
200 $Format+ $BackKatakanaEx;
201 $Format+ $BackExtendNumLetEx;
202
203 # The two rules below pair a $MidLetter / $MidNum character with the letter / number class, with optional $Format between them.
204 # rule 6
205 $MidLetter $Format* $BackALetterEx;
206
207 # rule 11
208 $MidNum $Format* $BackNumericEx;
209
210 ## -------------------------------------------------
211
212 !!safe_forward;
213 # NOTE(review): the role of this section is defined by ICU's rule compiler, not by this file - presumably the forward-direction counterpart of !!safe_reverse; confirm.
214 # rule 3
215 $Extend+;
216
217 # rule 4
218 $Extend* $Format+ $ALetterEx;
219 $Extend* $Format+ $NumericEx;
220 $Extend* $Format+ $MidLetterEx;
221 $Extend* $Format+ $MidNumEx;
222 $Extend* $Format+ $KatakanaEx;
223 $Extend* $Format+ $ExtendNumLetEx;
224 # Same six classes again, this time requiring at least one $Extend and making $Format optional.
225 $Extend+ $Format* $ALetterEx;
226 $Extend+ $Format* $NumericEx;
227 $Extend+ $Format* $MidLetterEx;
228 $Extend+ $Format* $MidNumEx;
229 $Extend+ $Format* $KatakanaEx;
230 $Extend+ $Format* $ExtendNumLetEx;
231
232 # rule 6
233 $MidLetterEx $Format* $ALetterEx;
234
235 # rule 11
236 $MidNumEx $Format* $NumericEx;