git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/word.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word.txt
CommitLineData
b75a7d8f 1#
374ca955
A
2# Copyright (C) 2002-2004, International Business Machines Corporation
3# and others. All Rights Reserved.
b75a7d8f 4#
374ca955 5# file: word.txt
b75a7d8f 6#
374ca955 7# ICU Word Break Rules
b75a7d8f 8# See Unicode Standard Annex #29.
374ca955 9# These rules are based on Version 4.1 draft, dated 2004-11-11
b75a7d8f
A
10#
11
374ca955 12##############################################################################
b75a7d8f
A
13#
14# Character class definitions from TR 29
15#
374ca955
A
16##############################################################################
17
# NOTE(review): `!!chain` is a rule-compiler option; presumably it turns on
# ICU's rule chaining for the rules in this file -- confirm against the ICU
# break-rule documentation.
18!!chain;
19
# $Katakana: every character whose Script property is KATAKANA, plus the
# individually named kana repeat marks, voiced / semi-voiced sound marks,
# double hyphen and prolonged sound marks (full- and half-width) listed below.
20$Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
33
34
# $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with
# ideographs, $Katakana, Hiragana-script and Lao-script characters, and all
# Grapheme_Extend characters removed.
35$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
36 - [:Ideographic:]
37 - $Katakana
38 - [:Script = Hiragana:]
39 - [:Script = Lao:]
40 - [:Grapheme_Extend = TRUE:]];
41
# $MidLetter: the six named punctuation characters that may sit between two
# letters (see the "rule 6 and 7" rules, which use $MidLetterEx).
42$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]
46 [:name = COLON:]];
47
48
# $MidNum: LineBreak=Infix_Numeric characters, except COLON, which is listed
# in $MidLetter instead.
# $Numeric: decimal digits (Nd) plus the two Arabic numeric separators.
# $ExtendNumLet: Connector_Punctuation, minus the two Katakana middle dots.
49$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = COLON:]];
50$Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
51$ExtendNumLet = [[:Connector_Punctuation:]
52 - [:name = KATAKANA MIDDLE DOT:]
53 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
54
55
b75a7d8f
A
56
57#
58# Additional character class definitions.
59# The names are those from TR29 (Unicode Standard Annex #29).
60#
b75a7d8f 61
374ca955
A
# Line-ending characters and the grapheme/format helper classes.
# $Control and $Format both exclude $Extend; $Format is the [:Cf:] portion of
# $Control, so anything matched by $Format is also matched by $Control.
62$CR = \u000d;
63$LF = \u000a;
64$Extend = [[:Grapheme_Extend = TRUE:]];
65$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
66$Format = [[:Cf:] - $Extend];
67$Hiragana = [:Hiragana:];
68$Ideographic = [:IDEOGRAPHIC:];
b75a7d8f 69
374ca955
A
# "...Ex" variants: one base character of the class followed by any number of
# $Extend characters. The forward and safe_forward rules use these instead of
# the bare classes.
70$ALetterEx = $ALetter $Extend*;
71$NumericEx = $Numeric $Extend*;
72$MidNumEx = $MidNum $Extend*;
73$MidLetterEx = $MidLetter $Extend*;
74$KatakanaEx = $Katakana $Extend*;
75$ExtendNumLetEx = $ExtendNumLet $Extend*;
b75a7d8f 76
374ca955 77## -------------------------------------------------
b75a7d8f 78
# Forward rules. Each "rule N" comment names the rule it implements
# (numbering per the TR29 reference in the file header).
# Most rules end in a {N} tag: 100 on rules for numbers, 200 on rules for
# letters (including mixed letter/number and ExtendNumLet-only sequences),
# 300 on Katakana and Hiragana rules, 400 on ideographs.
# NOTE(review): presumably these tags are the rule-status values ICU reports
# to callers for the matched word -- confirm against the ICU documentation.
# $Format* between the elements of a rule lets format characters occur inside
# a word without ending the match.
374ca955 79!!forward;
b75a7d8f 80
b75a7d8f 81
374ca955
A
82# Rule 3 - don't break grapheme clusters.
83# see character breaks
b75a7d8f 84
374ca955
A
# Lines 86-88 are an earlier formulation left commented out; the four rules
# after them are the active ones. Note the active generic rule requires at
# least one $Extend ($Extend+), where the commented-out one used $Extend*.
85$CR $LF;
86#[^$Control] $Extend*;
87#$NumericEx $Extend* {100};
88#$ALetterEx $Extend* {200};
89[^$Control] $Extend+;
90$NumericEx {100};
91$ALetterEx {200};
92$KatakanaEx {300};
b75a7d8f 93
374ca955 94# rule 5
b75a7d8f 95
374ca955 96$ALetterEx $Format* $ALetterEx {200};
b75a7d8f 97
374ca955
A
98# rule 6 and 7
99$ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
b75a7d8f 100
374ca955 101# rule 8
b75a7d8f 102
374ca955 103$NumericEx $Format* $NumericEx {100};
b75a7d8f 104
374ca955 105# rule 9
b75a7d8f 106
374ca955 107$ALetterEx $Format* $NumericEx {200};
b75a7d8f 108
374ca955 109# rule 10
b75a7d8f 110
374ca955 111$NumericEx $Format* $ALetterEx {200};
b75a7d8f 112
374ca955
A
113# rule 11 and 12
114
115$NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
116
117# rule 13
118
# Only Katakana gets a two-element joining rule; Hiragana and Ideographic
# appear here only as a single character plus trailing $Extend.
119$KatakanaEx $Format* $KatakanaEx {300};
120$Hiragana $Extend* {300};
121$Ideographic $Extend* {400};
122
123# rule 13a/b
124
# 13a: letter / number / Katakana / ExtendNumLet followed by ExtendNumLet.
# 13b: ExtendNumLet followed by letter / number / Katakana.
# The tag follows the non-ExtendNumLet side; ExtendNumLet alone uses 200.
125$ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
126$NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
127$KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
128$ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
129
130$ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
131$ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
132$ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
133
134
135
136## -------------------------------------------------
b75a7d8f 137
374ca955
A
# Reverse rules. These are written in backwards text order: compare
# `$LF $CR` here with `$CR $LF` in the forward section. For the same reason
# the $Back...Ex classes put the optional $Extend run *before* the base
# character, mirroring the forward "...Ex" definitions.
# No {N} tags are attached to any rule in this section.
138!!reverse;
139
140$BackALetterEx = $Extend* $ALetter;
141$BackNumericEx = $Extend* $Numeric;
142$BackMidNumEx = $Extend* $MidNum;
143$BackMidLetterEx = $Extend* $MidLetter;
144$BackKatakanaEx = $Extend* $Katakana;
145$BackExtendNumLetEx= $Extend* $ExtendNumLet;
146
147$LF $CR;
148
149# see character breaks
150
151$Extend* [^$Control];
152
153# rule 5
154
155$BackALetterEx $Format* $BackALetterEx;
156
157# rule 6 and 7
158
159$BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
160
161
162# rule 8
163
164$BackNumericEx $Format* $BackNumericEx;
165
166# rule 9
167
# Read backwards, Numeric-then-ALetter is the text sequence ALetter Numeric
# (forward rule 9); rule 10 below is the mirror case.
168$BackNumericEx $Format* $BackALetterEx;
169
170# rule 10
171
172$BackALetterEx $Format* $BackNumericEx;
173
174# rule 11 and 12
175
176$BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
177
178# rule 13
179
180$BackKatakanaEx $Format* $BackKatakanaEx;
181
182# rules 13 a/b
b75a7d8f 183#
374ca955
A
# Two alternation rules replace the seven forward 13a/13b rules. Read in
# backwards text order, the first covers ExtendNumLet followed in the text by
# a letter / number / Katakana / ExtendNumLet; the second covers a
# letter / number / Katakana followed in the text by ExtendNumLet.
184($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
185$BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
186
187## -------------------------------------------------
188
# Safe-reverse rules.
# NOTE(review): presumably ICU uses this section to back up from an arbitrary
# position to one where normal rule matching can restart -- confirm against
# the ICU break-rule documentation.
# The rules reuse the $Back...Ex classes defined in the !!reverse section and
# cover the forward rules that span ignorable or middle characters: $Extend
# runs (rule 3), $Format runs (rule 4), $MidLetter (rule 6), $MidNum (rule 11).
189!!safe_reverse;
190
191# rule 3
192$Extend+ [^$Extend];
193$Extend+; # comes into play when buffer _begins_ with an $Extend+.
194
195# rule 4
# One rule per word-forming class: a run of $Format characters followed
# (in backwards order) by a character of that class.
196$Format+ $BackALetterEx;
197$Format+ $BackNumericEx;
198$Format+ $BackMidLetterEx;
199$Format+ $BackMidNumEx;
200$Format+ $BackKatakanaEx;
201$Format+ $BackExtendNumLetEx;
202
203
204# rule 6
# Uses the bare $MidLetter / $MidNum classes here (no leading $Extend*),
# unlike the $Back...Ex classes used for the letter / number side.
205$MidLetter $Format* $BackALetterEx;
206
207# rule 11
208$MidNum $Format* $BackNumericEx;
209
210## -------------------------------------------------
211
# Safe-forward rules: the forward-direction counterpart of !!safe_reverse,
# written with the forward "...Ex" classes.
# NOTE(review): presumably used by ICU to move forward to a position where
# normal rule matching can restart -- confirm against the ICU documentation.
212!!safe_forward;
213
214# rule 3
215$Extend+;
216
217# rule 4
# Two families per word-forming class: `$Extend* $Format+ X` and
# `$Extend+ $Format* X`. Together they match X preceded by at least one
# $Extend or $Format character.
218$Extend* $Format+ $ALetterEx;
219$Extend* $Format+ $NumericEx;
220$Extend* $Format+ $MidLetterEx;
221$Extend* $Format+ $MidNumEx;
222$Extend* $Format+ $KatakanaEx;
223$Extend* $Format+ $ExtendNumLetEx;
224
225$Extend+ $Format* $ALetterEx;
226$Extend+ $Format* $NumericEx;
227$Extend+ $Format* $MidLetterEx;
228$Extend+ $Format* $MidNumEx;
229$Extend+ $Format* $KatakanaEx;
230$Extend+ $Format* $ExtendNumLetEx;
231
232# rule 6
233$MidLetterEx $Format* $ALetterEx;
b75a7d8f 234
374ca955
A
235# rule 11
236$MidNumEx $Format* $NumericEx;