]> git.saurik.com Git - apple/icu.git/blame - icuSources/data/brkitr/rules/word_fi_sv.txt
ICU-57166.0.1.tar.gz
[apple/icu.git] / icuSources / data / brkitr / rules / word_fi_sv.txt
CommitLineData
b75a7d8f 1#
2ca993e8 2# Copyright (C) 2002-2016, International Business Machines Corporation
374ca955 3# and others. All Rights Reserved.
b75a7d8f 4#
2ca993e8 5# file: word_fi_sv.txt
b75a7d8f 6#
2ca993e8 7# ICU Word Break Rules, fi/sv locales (these are actually the standard UAX #29 rules)
b75a7d8f 8# See Unicode Standard Annex #29.
2ca993e8
A
9# These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
10# with additions from L2/16-011R3 for Emoji sequences.
b75a7d8f 11#
73c04bcf 12# Note: Updates to word.txt will usually need to be merged into
51004dcb 13# word_POSIX.txt also.
b75a7d8f 14
374ca955 15##############################################################################
b75a7d8f
A
16#
17# Character class definitions from TR 29
18#
374ca955
A
19##############################################################################
20
21!!chain;
22
b75a7d8f
A
23
24#
25# Character Class Definitions.
b75a7d8f 26#
b75a7d8f 27
2ca993e8 28# Apple $EmojiForMods becomes $E_Base here
a62d09fc 29$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C2-\U0001F3C4\U0001F3C7\U0001F3CA-\U0001F3CC\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F574-\U0001F575\U0001F57A\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F6CC\U0001F918-\U0001F91E\U0001F926\U0001F930\U0001F933-\U0001F939\U0001F93C-\U0001F93E];
2ca993e8
A
30# Apple $EmojiMods becomes $E_Modifier here, same chars
31$E_Modifier = [\U0001F3FB-\U0001F3FF];
32$ZWJ = [\u200D];
33# Apple $EmojiForSeqs becomes $GAZ here (only emoji that follow a ZWJ)
a62d09fc 34$GAZ = [\u2640\u2642\u2764\u2695-\u2696\u2708\U0001F308\U0001F33E\U0001F373\U0001F393\U0001F3A4\U0001F3A8\U0001F3EB\U0001F3ED\U0001F466-\U0001F469\U0001F48B\U0001F4BB-\U0001F4BC\U0001F527\U0001F52C\U0001F5E8\U0001F680\U0001F692];
2ca993e8 35
57a6839d
A
36$CR = [\p{Word_Break = CR}];
37$LF = [\p{Word_Break = LF}];
2ca993e8
A
38$Newline = [\p{Word_Break = Newline} ];
39$Extend = [[\p{Word_Break = Extend}][:Block=Tags:]];
40$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
41$Format = [[\p{Word_Break = Format}] - [:Block=Tags:]];
57a6839d
A
42$Katakana = [\p{Word_Break = Katakana}];
43$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
44$ALetter = [\p{Word_Break = ALetter}];
45$Single_Quote = [\p{Word_Break = Single_Quote}];
46$Double_Quote = [\p{Word_Break = Double_Quote}];
47$MidNumLet = [\p{Word_Break = MidNumLet}];
48$MidLetter = [\p{Word_Break = MidLetter}];
49$MidNum = [\p{Word_Break = MidNum}];
50$Numeric = [\p{Word_Break = Numeric}];
51$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
52
53$Han = [:Han:];
54$Hiragana = [:Hiragana:];
73c04bcf
A
55
56
73c04bcf
A
57# Dictionary character set, for triggering language-based break engines. Currently
58# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
59# 5.0 or later as the definition of Complex_Context was corrected to include all
60# characters requiring dictionary break.
61
2ca993e8 62$Control = [\p{Grapheme_Cluster_Break = Control}];
51004dcb
A
63$HangulSyllable = [\uac00-\ud7a3];
64$ComplexContext = [:LineBreak = Complex_Context:];
65$KanaKanji = [$Han $Hiragana $Katakana];
66$dictionaryCJK = [$KanaKanji $HangulSyllable];
67$dictionary = [$ComplexContext $dictionaryCJK];
68
69# leave CJK scripts out of ALetterPlus
70$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
71
73c04bcf
A
72
73#
2ca993e8 74# Rule 4: Ignore Format and Extend characters,
46f4442e 75# except when they appear at the beginning of a region of text.
73c04bcf 76#
51004dcb 77# TODO: check if handling of katakana in dictionary makes rules incorrect/void
2ca993e8
A
78$KatakanaEx = $Katakana ($Extend | $Format | $ZWJ)*;
79$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format | $ZWJ)*;
80$ALetterEx = $ALetterPlus ($Extend | $Format | $ZWJ)*;
81$Single_QuoteEx = $Single_Quote ($Extend | $Format | $ZWJ)*;
82$Double_QuoteEx = $Double_Quote ($Extend | $Format | $ZWJ)*;
83$MidNumLetEx = $MidNumLet ($Extend | $Format | $ZWJ)*;
84$MidLetterEx = $MidLetter ($Extend | $Format | $ZWJ)*;
85$MidNumEx = $MidNum ($Extend | $Format | $ZWJ)*;
86$NumericEx = $Numeric ($Extend | $Format | $ZWJ)*;
87$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format | $ZWJ)*;
88$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format | $ZWJ)*;
73c04bcf 89
46f4442e 90$Ideographic = [\p{Ideographic}];
2ca993e8
A
91$HiraganaEx = $Hiragana ($Extend | $Format | $ZWJ)*;
92$IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
b75a7d8f 93
374ca955 94## -------------------------------------------------
b75a7d8f 95
374ca955 96!!forward;
b75a7d8f 97
b75a7d8f 98
73c04bcf 99# Rule 3 - CR x LF
46f4442e
A
100#
101$CR $LF;
73c04bcf 102
2ca993e8
A
103# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
104#
105$ZWJ $GAZ;
106
107
73c04bcf
A
108# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
109# of a region of Text. The rule here comes into play when the start of text
110# begins with a group of Format chars, or with a "word" consisting of a single
111# char that is not in any of the listed word break categories followed by
51004dcb 112# format char(s), or is not a CJK dictionary character.
2ca993e8 113[^$CR $LF $Newline]? ($Extend | $Format | $ZWJ)+;
b75a7d8f 114
374ca955
A
115$NumericEx {100};
116$ALetterEx {200};
51004dcb 117$HangulSyllable {200};
57a6839d 118$Hebrew_LetterEx{200};
51004dcb
A
119$KatakanaEx {400}; # note: these status values override those from rule 5
120$HiraganaEx {400}; # by virtue of being numerically larger.
46f4442e 121$IdeographicEx {400}; #
b75a7d8f 122
2ca993e8
A
123$E_Base ($Extend | $Format | $ZWJ)*;
124$E_Modifier ($Extend | $Format | $ZWJ)*;
125$GAZ ($Extend | $Format | $ZWJ)*;
126
46f4442e 127#
374ca955 128# rule 5
46f4442e
A
129# Do not break between most letters.
130#
57a6839d 131($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
b75a7d8f 132
374ca955 133# rule 6 and 7
57a6839d
A
134($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
135
136# rule 7a
137$Hebrew_LetterEx $Single_QuoteEx {200};
138
139# rule 7b and 7c
140$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
b75a7d8f 141
374ca955 142# rule 8
b75a7d8f 143
73c04bcf 144$NumericEx $NumericEx {100};
b75a7d8f 145
374ca955 146# rule 9
b75a7d8f 147
57a6839d 148($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
b75a7d8f 149
374ca955 150# rule 10
b75a7d8f 151
57a6839d 152$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
b75a7d8f 153
2ca993e8 154# rule 11 and 12
374ca955 155
57a6839d 156$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
374ca955
A
157
158# rule 13
51004dcb
A
159# to be consistent with $KanaKanji $KanaKanji, changed
160# from 300 to 400.
161# See also TestRuleStatus in intltest/rbbiapts.cpp
162$KatakanaEx $KatakanaEx {400};
374ca955
A
163
164# rule 13a/b
165
57a6839d
A
166$ALetterEx $ExtendNumLetEx {200}; # (13a)
167$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
168$NumericEx $ExtendNumLetEx {100}; # (13a)
169$KatakanaEx $ExtendNumLetEx {400}; # (13a)
170$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
374ca955 171
57a6839d
A
172$ExtendNumLetEx $ALetterEx {200}; # (13b)
173$ExtendNumLetEx $Hebrew_LetterEx {200}; # (13b) Ex form, consistent with the other 13a/13b rules
174$ExtendNumLetEx $NumericEx {100}; # (13b)
175$ExtendNumLetEx $KatakanaEx {400}; # (13b)
51004dcb
A
176
177# rule 13c
2ca993e8
A
178# Pairs of Regional Indicators stay together.
179# With rule chaining disabled by ^, this rule will match exactly two of them.
180# No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
181#
182^$Regional_IndicatorEx $Regional_IndicatorEx;
51004dcb
A
183
184# special handling for CJK characters: chain for later dictionary segmentation
185$HangulSyllable $HangulSyllable {200};
2ca993e8
A
186$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
187
188# rule 13d
189# E_Base x E_Modifier
190#
191($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
374ca955
A
192
193
194## -------------------------------------------------
b75a7d8f 195
374ca955
A
196!!reverse;
197
2ca993e8
A
198$BackHebrew_LetterEx = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
199$BackALetterEx = ($Format | $Extend | $ZWJ)* $ALetterPlus;
200$BackSingle_QuoteEx = ($Format | $Extend | $ZWJ)* $Single_Quote;
201$BackDouble_QuoteEx = ($Format | $Extend | $ZWJ)* $Double_Quote;
202$BackMidNumLetEx = ($Format | $Extend | $ZWJ)* $MidNumLet;
203$BackNumericEx = ($Format | $Extend | $ZWJ)* $Numeric;
204$BackMidNumEx = ($Format | $Extend | $ZWJ)* $MidNum;
205$BackMidLetterEx = ($Format | $Extend | $ZWJ)* $MidLetter;
206$BackKatakanaEx = ($Format | $Extend | $ZWJ)* $Katakana;
207$BackHiraganaEx = ($Format | $Extend | $ZWJ)* $Hiragana;
208$BackExtendNumLetEx = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
209$BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
374ca955 210
73c04bcf 211# rule 3
46f4442e 212$LF $CR;
374ca955 213
2ca993e8
A
214# Rule 3c ZWJ x GAZ. Precedes WB4, so no intervening Extend chars allowed.
215#
216$GAZ $ZWJ;
217
73c04bcf 218# rule 4
2ca993e8 219($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
374ca955
A
220
221# rule 5
222
57a6839d 223($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
374ca955
A
224
225# rule 6 and 7
226
57a6839d
A
227($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
228
229# rule 7a
230$BackSingle_QuoteEx $BackHebrew_LetterEx;
374ca955 231
57a6839d
A
232# Rule 7b and 7c
233$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
374ca955
A
234
235# rule 8
236
73c04bcf 237$BackNumericEx $BackNumericEx;
374ca955
A
238
239# rule 9
240
57a6839d 241$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
374ca955
A
242
243# rule 10
244
57a6839d 245($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
374ca955
A
246
247# rule 11 and 12
248
57a6839d 249$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
374ca955
A
250
251# rule 13
252
73c04bcf 253$BackKatakanaEx $BackKatakanaEx;
374ca955
A
254
255# rules 13 a/b
b75a7d8f 256#
57a6839d 257$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
2ca993e8 258($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
374ca955 259
51004dcb
A
260# rule 13c
261
2ca993e8
A
# Reverse rule 13c: four variants of the same pattern. Each matches either one or two
# Regional Indicators (optionally preceded by "$GAZ $ZWJ" in the last two variants),
# followed after the "/" by: zero or more further pairs of Regional Indicators, any
# Format/Extend/ZWJ characters, and then a non-Regional_Indicator character or {eof}.
# The leading "^" on the first two variants disables rule chaining into them.
# NOTE(review): "/" is presumably the RBBI look-ahead marker (the break is placed there and
# the rest is context only), so these rules pair indicators counting from the far end of
# the run -- confirm against the ICU break-rule documentation.
262^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
263 ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
264^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
265 ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
08b89b0a 266
2ca993e8
A
267$GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
268 ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
269$GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
270 ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
51004dcb
A
271
272# special handling for CJK characters: chain for later dictionary segmentation
273$HangulSyllable $HangulSyllable;
274$KanaKanji $KanaKanji; #different rule status if both kanji and kana found
275
2ca993e8
A
276# rule 13d
277
278$E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
279
280
281
374ca955
A
282## -------------------------------------------------
283
284!!safe_reverse;
285
286# rule 4
2ca993e8 287($Extend | $Format | $ZWJ)+ .?;
374ca955
A
288
289# rule 6
57a6839d
A
290($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
291
292# rule 7b
293$Double_Quote $BackHebrew_LetterEx;
294
374ca955
A
295
296# rule 11
57a6839d 297($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
73c04bcf 298
2ca993e8
A
299# rule 13c
300$BackRegional_IndicatorEx*;
301
73c04bcf
A
302# For dictionary-based break
303$dictionary $dictionary;
374ca955
A
304
305## -------------------------------------------------
306
307!!safe_forward;
308
374ca955 309# rule 4
2ca993e8 310($Extend | $Format | $ZWJ)+ .?;
374ca955
A
311
312# rule 6
57a6839d
A
313($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
314
315# rule 7b
316$Double_QuoteEx $Hebrew_LetterEx;
b75a7d8f 317
374ca955 318# rule 11
57a6839d 319($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
73c04bcf 320
2ca993e8
A
321# rule 13c
322$Regional_IndicatorEx*;
323
73c04bcf
A
324# For dictionary-based break
325$dictionary $dictionary;