]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/word.txt
ICU-511.34.tar.gz
[apple/icu.git] / icuSources / data / brkitr / word.txt
CommitLineData
b75a7d8f 1#
51004dcb 2# Copyright (C) 2002-2013, International Business Machines Corporation
374ca955 3# and others. All Rights Reserved.
b75a7d8f 4#
374ca955 5# file: word.txt
b75a7d8f 6#
374ca955 7# ICU Word Break Rules
b75a7d8f 8# See Unicode Standard Annex #29.
51004dcb 9# These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
b75a7d8f 10#
73c04bcf 11# Note: Updates to word.txt will usually need to be merged into
51004dcb 12# word_POSIX.txt also.
b75a7d8f 13
374ca955 14##############################################################################
b75a7d8f
A
15#
16# Character class definitions from TR 29
17#
374ca955
A
18##############################################################################
19
20!!chain;
21
b75a7d8f
A
22
23#
24# Character Class Definitions.
b75a7d8f 25#
b75a7d8f 26
46f4442e
A
27$CR = [\p{Word_Break = CR}];
28$LF = [\p{Word_Break = LF}];
29$Newline = [\p{Word_Break = Newline}];
30$Extend = [\p{Word_Break = Extend}];
73c04bcf 31$Format = [\p{Word_Break = Format}];
51004dcb 32$Hiragana = [:Hiragana:];
46f4442e 33$Katakana = [\p{Word_Break = Katakana}];
51004dcb 34$Han = [:Han:];
73c04bcf 35$ALetter = [\p{Word_Break = ALetter}];
46f4442e 36$MidNumLet = [\p{Word_Break = MidNumLet}];
73c04bcf
A
37$MidLetter = [\p{Word_Break = MidLetter}];
38$MidNum = [\p{Word_Break = MidNum}];
39$Numeric = [\p{Word_Break = Numeric}];
40$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
41
51004dcb
A
42$RI_A = \U0001F1E6; # Trail ERTU
43$RI_B = \U0001F1E7; # Trail EGR
44$RI_C = \U0001F1E8; # Trail AHLNZ
45$RI_D = \U0001F1E9; # Trail EK
46$RI_E = \U0001F1EA; # Trail GS
47$RI_F = \U0001F1EB; # Trail IR
48$RI_G = \U0001F1EC; # Trail BR
49$RI_H = \U0001F1ED; # Trail KU
50$RI_I = \U0001F1EE; # Trail DLNT
51$RI_J = \U0001F1EF; # Trail OP
52$RI_K = \U0001F1F0; # Trail R
53$RI_L = \U0001F1F1; # Trail B
54$RI_M = \U0001F1F2; # Trail OXY
55$RI_N = \U0001F1F3; # Trail LO
56$RI_P = \U0001F1F5; # Trail LT
57$RI_R = \U0001F1F7; # Trail OU
58$RI_S = \U0001F1F8; # Trail AEGK
59$RI_T = \U0001F1F9; # Trail HRW
60$RI_U = \U0001F1FA; # Trail AS
61$RI_V = \U0001F1FB; # Trail N
62
63$RI_A_End = [\U0001F1EA \U0001F1F7 \U0001F1F9 \U0001F1FA]; # ERTU
64$RI_B_End = [\U0001F1EA \U0001F1EC \U0001F1F7]; # EGR
65$RI_C_End = [\U0001F1E6 \U0001F1ED \U0001F1F1 \U0001F1F3 \U0001F1FF]; # AHLNZ
66$RI_D_End = [\U0001F1EA \U0001F1F0]; # EK
67$RI_E_End = [\U0001F1EC \U0001F1F8]; # GS
68$RI_F_End = [\U0001F1EE \U0001F1F7]; # IR
69$RI_G_End = [\U0001F1E7 \U0001F1F7]; # BR
70$RI_H_End = [\U0001F1F0 \U0001F1FA]; # KU
71$RI_I_End = [\U0001F1E9 \U0001F1F1 \U0001F1F3 \U0001F1F9]; # DLNT
72$RI_J_End = [\U0001F1F4 \U0001F1F5]; # OP
73$RI_K_End = \U0001F1F7; # R
74$RI_L_End = \U0001F1E7; # B
75$RI_M_End = [\U0001F1F4 \U0001F1FD \U0001F1FE]; # OXY
76$RI_N_End = [\U0001F1F1 \U0001F1F4]; # LO
77$RI_P_End = [\U0001F1F1 \U0001F1F9]; # LT
78$RI_R_End = [\U0001F1F4 \U0001F1FA]; # OU
79$RI_S_End = [\U0001F1E6 \U0001F1EA \U0001F1EC \U0001F1F0]; # AEGK
80$RI_T_End = [\U0001F1ED \U0001F1F7 \U0001F1FC]; # HRW
81$RI_U_End = [\U0001F1E6 \U0001F1F8]; # AS
82$RI_V_End = \U0001F1F3; # N
83
73c04bcf 84
73c04bcf
A
85# Dictionary character set, for triggering language-based break engines. Covers
86# LineBreak=Complex_Context plus the CJK set ($dictionaryCJK). Note that this only works
87# in Unicode 5.0 or later as the definition of Complex_Context was corrected to include
88# all characters requiring dictionary break.
89
46f4442e 90$Control = [\p{Grapheme_Cluster_Break = Control}];
51004dcb
A
91$HangulSyllable = [\uac00-\ud7a3];
92$ComplexContext = [:LineBreak = Complex_Context:];
93$KanaKanji = [$Han $Hiragana $Katakana];
94$dictionaryCJK = [$KanaKanji $HangulSyllable];
95$dictionary = [$ComplexContext $dictionaryCJK];
96
97# leave CJK scripts out of ALetterPlus
98$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
99
73c04bcf
A
100
101#
46f4442e
A
102# Rule 4 - Ignore Format and Extend characters,
103# except when they appear at the beginning of a region of text.
73c04bcf 104#
51004dcb 105# TODO: check if handling of katakana in dictionary makes rules incorrect/void
73c04bcf
A
106$KatakanaEx = $Katakana ($Extend | $Format)*;
107$ALetterEx = $ALetterPlus ($Extend | $Format)*;
46f4442e 108$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
73c04bcf
A
109$MidLetterEx = $MidLetter ($Extend | $Format)*;
110$MidNumEx = $MidNum ($Extend | $Format)*;
111$NumericEx = $Numeric ($Extend | $Format)*;
112$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
113
46f4442e 114$Ideographic = [\p{Ideographic}];
73c04bcf
A
115$HiraganaEx = $Hiragana ($Extend | $Format)*;
116$IdeographicEx = $Ideographic ($Extend | $Format)*;
b75a7d8f 117
374ca955 118## -------------------------------------------------
b75a7d8f 119
374ca955 120!!forward;
b75a7d8f 121
b75a7d8f 122
73c04bcf 123# Rule 3 - CR x LF
46f4442e
A
124#
125$CR $LF;
73c04bcf
A
126
127# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
128# of a region of Text. The rule here comes into play when the start of text
129# begins with a group of Format chars, or with a "word" consisting of a single
130# char that is not in any of the listed word break categories followed by
51004dcb 131# format char(s), or is not a CJK dictionary character.
46f4442e 132[^$CR $LF $Newline]? ($Extend | $Format)+;
b75a7d8f 133
374ca955
A
134$NumericEx {100};
135$ALetterEx {200};
51004dcb
A
136$HangulSyllable {200};
137$KatakanaEx {400}; # note: these status values override those from rule 5
138$HiraganaEx {400}; # by virtue of being numerically larger.
46f4442e 139$IdeographicEx {400}; #
b75a7d8f 140
46f4442e 141#
374ca955 142# rule 5
46f4442e
A
143# Do not break between most letters.
144#
73c04bcf 145$ALetterEx $ALetterEx {200};
b75a7d8f 146
374ca955 147# rule 6 and 7
46f4442e 148$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
b75a7d8f 149
374ca955 150# rule 8
b75a7d8f 151
73c04bcf 152$NumericEx $NumericEx {100};
b75a7d8f 153
374ca955 154# rule 9
b75a7d8f 155
46f4442e 156$ALetterEx $NumericEx {200};
b75a7d8f 157
374ca955 158# rule 10
b75a7d8f 159
73c04bcf 160$NumericEx $ALetterEx {200};
b75a7d8f 161
374ca955
A
162# rule 11 and 12
163
46f4442e 164$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
374ca955
A
165
166# rule 13
51004dcb
A
167# to be consistent with $KanaKanji $KanaKanji, the rule status was changed
168# from 300 to 400.
169# See also TestRuleStatus in intltest/rbbiapts.cpp
170$KatakanaEx $KatakanaEx {400};
374ca955
A
171
172# rule 13a/b
173
73c04bcf
A
174$ALetterEx $ExtendNumLetEx {200}; # (13a)
175$NumericEx $ExtendNumLetEx {100}; # (13a)
51004dcb 176$KatakanaEx $ExtendNumLetEx {400}; # (13a)
46f4442e 177$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
374ca955 178
73c04bcf
A
179$ExtendNumLetEx $ALetterEx {200}; # (13b)
180$ExtendNumLetEx $NumericEx {100}; # (13b)
51004dcb
A
181$ExtendNumLetEx $KatakanaEx {400}; # (13b)
182
183# rule 13c
184
185$RI_A ($Extend|$Format)* $RI_A_End ($Extend|$Format)*;
186$RI_B ($Extend|$Format)* $RI_B_End ($Extend|$Format)*;
187$RI_C ($Extend|$Format)* $RI_C_End ($Extend|$Format)*;
188$RI_D ($Extend|$Format)* $RI_D_End ($Extend|$Format)*;
189$RI_E ($Extend|$Format)* $RI_E_End ($Extend|$Format)*;
190$RI_F ($Extend|$Format)* $RI_F_End ($Extend|$Format)*;
191$RI_G ($Extend|$Format)* $RI_G_End ($Extend|$Format)*;
192$RI_H ($Extend|$Format)* $RI_H_End ($Extend|$Format)*;
193$RI_I ($Extend|$Format)* $RI_I_End ($Extend|$Format)*;
194$RI_J ($Extend|$Format)* $RI_J_End ($Extend|$Format)*;
195$RI_K ($Extend|$Format)* $RI_K_End ($Extend|$Format)*;
196$RI_L ($Extend|$Format)* $RI_L_End ($Extend|$Format)*;
197$RI_M ($Extend|$Format)* $RI_M_End ($Extend|$Format)*;
198$RI_N ($Extend|$Format)* $RI_N_End ($Extend|$Format)*;
199$RI_P ($Extend|$Format)* $RI_P_End ($Extend|$Format)*;
200$RI_R ($Extend|$Format)* $RI_R_End ($Extend|$Format)*;
201$RI_S ($Extend|$Format)* $RI_S_End ($Extend|$Format)*;
202$RI_T ($Extend|$Format)* $RI_T_End ($Extend|$Format)*;
203$RI_U ($Extend|$Format)* $RI_U_End ($Extend|$Format)*;
204$RI_V ($Extend|$Format)* $RI_V_End ($Extend|$Format)*;
205
206# special handling for CJK characters: chain for later dictionary segmentation
207$HangulSyllable $HangulSyllable {200};
208$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
374ca955
A
209
210
211## -------------------------------------------------
b75a7d8f 212
374ca955
A
213!!reverse;
214
51004dcb
A
215$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
216$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
217$BackNumericEx = ($Format | $Extend)* $Numeric;
218$BackMidNumEx = ($Format | $Extend)* $MidNum;
219$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
220$BackKatakanaEx = ($Format | $Extend)* $Katakana;
221$BackHiraganaEx = ($Format | $Extend)* $Hiragana;
222$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;
374ca955 223
73c04bcf 224# rule 3
46f4442e 225$LF $CR;
374ca955 226
73c04bcf 227# rule 4
46f4442e 228($Format | $Extend)* [^$CR $LF $Newline]?;
374ca955
A
229
230# rule 5
231
73c04bcf 232$BackALetterEx $BackALetterEx;
374ca955
A
233
234# rule 6 and 7
235
46f4442e 236$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
374ca955
A
237
238
239# rule 8
240
73c04bcf 241$BackNumericEx $BackNumericEx;
374ca955
A
242
243# rule 9
244
73c04bcf 245$BackNumericEx $BackALetterEx;
374ca955
A
246
247# rule 10
248
73c04bcf 249$BackALetterEx $BackNumericEx;
374ca955
A
250
251# rule 11 and 12
252
46f4442e 253$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
374ca955
A
254
255# rule 13
256
73c04bcf 257$BackKatakanaEx $BackKatakanaEx;
374ca955
A
258
259# rules 13 a/b
b75a7d8f 260#
46f4442e
A
261$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
262($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
374ca955 263
51004dcb
A
264# rule 13c
265
266($Format|$Extend)* $RI_A_End ($Format|$Extend)* $RI_A;
267($Format|$Extend)* $RI_B_End ($Format|$Extend)* $RI_B;
268($Format|$Extend)* $RI_C_End ($Format|$Extend)* $RI_C;
269($Format|$Extend)* $RI_D_End ($Format|$Extend)* $RI_D;
270($Format|$Extend)* $RI_E_End ($Format|$Extend)* $RI_E;
271($Format|$Extend)* $RI_F_End ($Format|$Extend)* $RI_F;
272($Format|$Extend)* $RI_G_End ($Format|$Extend)* $RI_G;
273($Format|$Extend)* $RI_H_End ($Format|$Extend)* $RI_H;
274($Format|$Extend)* $RI_I_End ($Format|$Extend)* $RI_I;
275($Format|$Extend)* $RI_J_End ($Format|$Extend)* $RI_J;
276($Format|$Extend)* $RI_K_End ($Format|$Extend)* $RI_K;
277($Format|$Extend)* $RI_L_End ($Format|$Extend)* $RI_L;
278($Format|$Extend)* $RI_M_End ($Format|$Extend)* $RI_M;
279($Format|$Extend)* $RI_N_End ($Format|$Extend)* $RI_N;
280($Format|$Extend)* $RI_P_End ($Format|$Extend)* $RI_P;
281($Format|$Extend)* $RI_R_End ($Format|$Extend)* $RI_R;
282($Format|$Extend)* $RI_S_End ($Format|$Extend)* $RI_S;
283($Format|$Extend)* $RI_T_End ($Format|$Extend)* $RI_T;
284($Format|$Extend)* $RI_U_End ($Format|$Extend)* $RI_U;
285($Format|$Extend)* $RI_V_End ($Format|$Extend)* $RI_V;
286
287# special handling for CJK characters: chain for later dictionary segmentation
288$HangulSyllable $HangulSyllable;
289$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
290
374ca955
A
291## -------------------------------------------------
292
293!!safe_reverse;
294
295# rule 4
73c04bcf 296($Extend | $Format)+ .?;
374ca955
A
297
298# rule 6
46f4442e 299($MidLetter | $MidNumLet) $BackALetterEx;
374ca955
A
300
301# rule 11
46f4442e 302($MidNum | $MidNumLet) $BackNumericEx;
73c04bcf
A
303
304# For dictionary-based break
305$dictionary $dictionary;
374ca955
A
306
307## -------------------------------------------------
308
309!!safe_forward;
310
374ca955 311# rule 4
73c04bcf 312($Extend | $Format)+ .?;
374ca955
A
313
314# rule 6
46f4442e 315($MidLetterEx | $MidNumLetEx) $ALetterEx;
b75a7d8f 316
374ca955 317# rule 11
46f4442e 318($MidNumEx | $MidNumLetEx) $NumericEx;
73c04bcf
A
319
320# For dictionary-based break
321$dictionary $dictionary;