]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/word_POSIX.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word_POSIX.txt
CommitLineData
374ca955
A
1#
2# Copyright (C) 2002-2004, International Business Machines Corporation
3# and others. All Rights Reserved.
4#
5# file: word_POSIX.txt
6#
7# ICU Word Break Rules
8# See Unicode Standard Annex #29.
9# These rules are based on Version 4.1 draft, dated 2004-11-11
10#
11
12##############################################################################
13#
14# Character class definitions from TR 29
15#
16##############################################################################
17
# Rule-builder option.
# NOTE(review): in ICU's break-rule syntax "!!chain" is understood to turn on
# rule chaining (a match may continue from one rule into another) -- confirm
# against the ICU break-rule documentation.
18!!chain;
19
# $Katakana: every character whose Script property is KATAKANA, plus the
# individually named kana repeat marks, voiced / semi-voiced sound marks,
# double hyphen and prolonged sound marks listed below (including the
# half-width forms).
20$Katakana = [[:Script = KATAKANA:]
21 [:name = VERTICAL KANA REPEAT MARK:]
22 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK:]
23 [:name = VERTICAL KANA REPEAT MARK UPPER HALF:]
24 [:name = VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF:]
25 [:name = VERTICAL KANA REPEAT MARK LOWER HALF:]
26 [:name = KATAKANA-HIRAGANA VOICED SOUND MARK:]
27 [:name = KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK:]
28 [:name = KATAKANA-HIRAGANA DOUBLE HYPHEN:]
29 [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
30 [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
31 [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
32 [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];
33
34
# $ALetter: Alphabetic characters plus HEBREW PUNCTUATION GERESH, with these
# removed: Ideographic characters, $Katakana, the Hiragana and Lao scripts,
# and every character with Grapheme_Extend = TRUE.
# NOTE(review): this reading assumes the set operators apply left to right
# (union first, then each "-" term subtracted) -- confirm against UnicodeSet.
35$ALetter = [[:Alphabetic:] [:name= HEBREW PUNCTUATION GERESH:]
36 - [:Ideographic:]
37 - $Katakana
38 - [:Script = Hiragana:]
39 - [:Script = Lao:]
40 - [:Grapheme_Extend = TRUE:]];
41
# $MidLetter: punctuation that the "rule 6 and 7" rules allow between two
# letters without ending the match.
# NOTE(review): COLON is not in this list; presumably that omission is the
# POSIX tailoring of the TR29 MidLetter class -- confirm against the root
# word break rules.
42$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:]
43 [:name = HEBREW PUNCTUATION GERSHAYIM:]
44 [:name = RIGHT SINGLE QUOTATION MARK:]
45 [:name = HYPHENATION POINT:]];
46
47
# $MidNum       - characters whose LineBreak property is Infix_Numeric; the
#                 "rule 11 and 12" rules allow one between two numeric runs.
# $Numeric      - the [:Nd:] set plus the Arabic decimal and thousands
#                 separators.
# $ExtendNumLet - Connector_Punctuation, excluding the full-width and
#                 half-width Katakana middle dots.
48$MidNum = [[:LineBreak = Infix_Numeric:]];
49$Numeric = [[:Nd:][:name = ARABIC DECIMAL SEPARATOR:][:name = ARABIC THOUSANDS SEPARATOR:]];
50$ExtendNumLet = [[:Connector_Punctuation:]
51 - [:name = KATAKANA MIDDLE DOT:]
52 - [:name = HALFWIDTH KATAKANA MIDDLE DOT:]];
53
54
55
56#
57# Character Class Definitions.
58# The names are those from TR29.
59#
60
# Helper classes used by the rules further down.
#   $CR, $LF     - carriage return (U+000D) and line feed (U+000A).
#   $Extend      - characters with Grapheme_Extend = TRUE.
#   $Control     - the Zl, Zp, Cc and Cf sets, minus anything in $Extend.
#   $Format      - the Cf set minus $Extend. This is a subset of $Control, so
#                  [^$Control] never matches a $Format character.
#   $Hiragana    - the [:Hiragana:] set.
#   $Ideographic - the [:IDEOGRAPHIC:] set.
61$CR = \u000d;
62$LF = \u000a;
63$Extend = [[:Grapheme_Extend = TRUE:]];
64$Control = [[:Zl:] [:Zp:] [:Cc:] [:Cf:] - $Extend];
65$Format = [[:Cf:] - $Extend];
66$Hiragana = [:Hiragana:];
67$Ideographic = [:IDEOGRAPHIC:];
68
# "Ex" forms: each base class followed by zero or more $Extend characters, so
# a base character and the $Extend characters after it are matched as a unit.
69$ALetterEx = $ALetter $Extend*;
70$NumericEx = $Numeric $Extend*;
71$MidNumEx = $MidNum $Extend*;
72$MidLetterEx = $MidLetter $Extend*;
73$KatakanaEx = $Katakana $Extend*;
74$ExtendNumLetEx = $ExtendNumLet $Extend*;
75
76## -------------------------------------------------
77
# Start of the forward rule section.
# NOTE(review): the {nnn} suffix on a rule is understood to be the rule status
# tag reported for a boundary produced by that rule; 100 / 200 / 300 / 400
# look like ICU's number / letter / kana / ideograph word-status values
# (UBRK_WORD_* in ubrk.h) -- confirm.
78!!forward;
79
80
81# Rule 3 - don't break grapheme clusters.
82# see character breaks
83
# Keep a CR LF pair together.
84$CR $LF;
85#[^$Control] $Extend*;
86#$NumericEx $Extend* {100};
87#$ALetterEx $Extend* {200};
# Active replacements for the three commented-out rules above. The first now
# requires at least one $Extend after the non-control character. The numeric
# and letter rules drop the extra $Extend*, which the Ex definitions already
# include.
88[^$Control] $Extend+;
89$NumericEx {100};
90$ALetterEx {200};
91$KatakanaEx {300};
92
# Rules 5-13: pairs (or triples) of classes that are matched as one unit.
# The $Format* between operands lets any number of $Format characters occur
# at that point without ending the match.
93# rule 5
94
95$ALetterEx $Format* $ALetterEx {200};
96
97# rule 6 and 7
98$ALetterEx $Format* $MidLetterEx $Format* $ALetterEx {200};
99
100# rule 8
101
102$NumericEx $Format* $NumericEx {100};
103
104# rule 9
105
106$ALetterEx $Format* $NumericEx {200};
107
108# rule 10
109
110$NumericEx $Format* $ALetterEx {200};
111
112# rule 11 and 12
113
114$NumericEx $Format* $MidNumEx $Format* $NumericEx {100};
115
116# rule 13
117
118$KatakanaEx $Format* $KatakanaEx {300};
# The next two rules each match a single Hiragana or Ideographic character
# with its trailing $Extend characters, tagged 300 and 400 respectively.
# They are not Katakana pairs, although they sit under the "rule 13" heading.
119$Hiragana $Extend* {300};
120$Ideographic $Extend* {400};
121
# 13a: a letter, number, Katakana or $ExtendNumLet unit followed by an
#      $ExtendNumLet unit.
# 13b: an $ExtendNumLet unit followed by a letter, number or Katakana unit.
# Each rule carries the tag of its letter / number / Katakana operand; the
# $ExtendNumLet-only pair is tagged 200.
122# rule 13a/b
123
124$ALetterEx $Format* $ExtendNumLetEx {200}; # (13a)
125$NumericEx $Format* $ExtendNumLetEx {100}; # (13a)
126$KatakanaEx $Format* $ExtendNumLetEx {300}; # (13a)
127$ExtendNumLetEx $Format* $ExtendNumLetEx{200}; # (13a)
128
129$ExtendNumLetEx $Format* $ALetterEx {200}; # (13b)
130$ExtendNumLetEx $Format* $NumericEx {100}; # (13b)
131$ExtendNumLetEx $Format* $KatakanaEx {300}; # (13b)
132
133
134
135## -------------------------------------------------
136
# Start of the reverse rule section.
# NOTE(review): these rules are presumably matched while moving backwards
# through the text, which would explain why every pattern below is written
# in the opposite order to its forward counterpart -- confirm against the ICU
# break-rule documentation.
137!!reverse;
138
# "Back" forms: mirror images of the forward "Ex" forms, with the $Extend*
# run first and the base character last.
139$BackALetterEx = $Extend* $ALetter;
140$BackNumericEx = $Extend* $Numeric;
141$BackMidNumEx = $Extend* $MidNum;
142$BackMidLetterEx = $Extend* $MidLetter;
143$BackKatakanaEx = $Extend* $Katakana;
144$BackExtendNumLetEx= $Extend* $ExtendNumLet;
145
# Mirror of the forward "$CR $LF" rule.
146$LF $CR;
147
148# see character breaks
149
# Zero or more $Extend characters followed by one non-control character.
150$Extend* [^$Control];
151
# Mirrors of the forward rules 5-13b, with the operands written in the
# opposite order and using the Back* forms. None of the rules in this group
# carries a {nnn} status tag.
152# rule 5
153
154$BackALetterEx $Format* $BackALetterEx;
155
156# rule 6 and 7
157
158$BackALetterEx $Format* $BackMidLetterEx $Format* $BackALetterEx;
159
160
161# rule 8
162
163$BackNumericEx $Format* $BackNumericEx;
164
165# rule 9
166
167$BackNumericEx $Format* $BackALetterEx;
168
169# rule 10
170
171$BackALetterEx $Format* $BackNumericEx;
172
173# rule 11 and 12
174
175$BackNumericEx $Format* $BackMidNumEx $Format* $BackNumericEx;
176
177# rule 13
178
179$BackKatakanaEx $Format* $BackKatakanaEx;
180
181# rules 13 a/b
182#
# The forward section spells 13a/b out as seven separate rules; here the
# same combinations are written as two rules using alternation.
183($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx) $Format* $BackExtendNumLetEx;
184$BackExtendNumLetEx $Format* ($BackALetterEx | $BackNumericEx | $BackKatakanaEx);
185
186## -------------------------------------------------
187
# Start of the safe-reverse rule section.
# NOTE(review): in ICU these rules are understood to be used for backing up
# from an arbitrary text position to a point from which normal matching can
# safely resume -- confirm against the ICU break-rule documentation.
188!!safe_reverse;
189
190# rule 3
191$Extend+ [^$Extend];
192$Extend+; # comes into play when buffer _begins_ with an $Extend+.
193
# One or more $Format characters followed by any of the Back* units.
194# rule 4
195$Format+ $BackALetterEx;
196$Format+ $BackNumericEx;
197$Format+ $BackMidLetterEx;
198$Format+ $BackMidNumEx;
199$Format+ $BackKatakanaEx;
200$Format+ $BackExtendNumLetEx;
201
202
# A bare $MidLetter (no $Extend prefix), optional $Format characters, then a
# letter unit.
203# rule 6
204$MidLetter $Format* $BackALetterEx;
205
# Same shape as rule 6, for $MidNum followed by a numeric unit.
206# rule 11
207$MidNum $Format* $BackNumericEx;
208
209## -------------------------------------------------
210
# Start of the safe-forward rule section.
# NOTE(review): presumably the forward-moving counterpart of the safe-reverse
# rules -- confirm against the ICU break-rule documentation.
211!!safe_forward;
212
213# rule 3
214$Extend+;
215
# Two groups of six rules. The first group requires at least one $Format
# character before the Ex unit, with optional $Extend characters before that.
# The second group requires at least one $Extend character, with optional
# $Format characters after it. Together they cover any $Extend* $Format*
# prefix that contains at least one character.
216# rule 4
217$Extend* $Format+ $ALetterEx;
218$Extend* $Format+ $NumericEx;
219$Extend* $Format+ $MidLetterEx;
220$Extend* $Format+ $MidNumEx;
221$Extend* $Format+ $KatakanaEx;
222$Extend* $Format+ $ExtendNumLetEx;
223
224$Extend+ $Format* $ALetterEx;
225$Extend+ $Format* $NumericEx;
226$Extend+ $Format* $MidLetterEx;
227$Extend+ $Format* $MidNumEx;
228$Extend+ $Format* $KatakanaEx;
229$Extend+ $Format* $ExtendNumLetEx;
230
# A $MidLetter unit, optional $Format characters, then a letter unit.
231# rule 6
232$MidLetterEx $Format* $ALetterEx;
233
# A $MidNum unit, optional $Format characters, then a numeric unit.
234# rule 11
235$MidNumEx $Format* $NumericEx;