]> git.saurik.com Git - apple/icu.git/blob - icuSources/data/brkitr/line.txt
ICU-6.2.13.tar.gz
[apple/icu.git] / icuSources / data / brkitr / line.txt
1 # Copyright (c) 2002-2004 International Business Machines Corporation and
2 # others. All Rights Reserved.
3 #
4 # file: line.txt
5 #
6 # Line Breaking Rules
7 # Implement default line breaking as defined by Unicode TR 14.
8 #
9
10
11 #
12 # Character Classes defined by TR 14.
13 #
14
15 !!chain; # Rule-set option. NOTE(review): presumably lets a match of one rule chain into another rule - confirm against the RBBI rule documentation.
16 !!LBCMNoChain; # Rule-set option. NOTE(review): the name suggests chaining is suppressed on LineBreak=CM characters - confirm.
17 !!lookAheadHardBreak; # Rule-set option. NOTE(review): presumably changes how the "/" look-ahead rules in the reverse sections are resolved - confirm.
18
19 $AI = [:LineBreak = Ambiguous:]; # Folded into $ALPlus together with $AL, $SA and $XX.
20 $AL = [:LineBreak = Alphabetic:];
21 $BA = [:LineBreak = Break_After:];
22 $BB = [:LineBreak = Break_Before:];
23 $BK = [:LineBreak = Mandatory_Break:]; # Member of $LB3Breaks.
24 $B2 = [:LineBreak = Break_Both:];
25 $CB = [:LineBreak = Contingent_Break:];
26 $CL = [:LineBreak = Close_Punctuation:];
27 $CM = [:LineBreak = Combining_Mark:]; # Appended as $CM* in every $Xcm definition.
28 $CR = [:LineBreak = Carriage_Return:]; # Member of $LB3Breaks.
29 $EX = [:LineBreak = Exclamation:];
30 $GL = [:LineBreak = Glue:];
31 $HY = [:LineBreak = Hyphen:];
32 $ID = [:LineBreak = Ideographic:];
33 $IN = [:LineBreak = Inseperable:]; # NOTE(review): "Inseperable" looks misspelled, but it is presumably the property value name the rule compiler expects - confirm before changing.
34 $IS = [:LineBreak = Infix_Numeric:];
35 $LF = [:LineBreak = Line_Feed:]; # Member of $LB3Breaks.
36 $NL = [:LineBreak = Next_Line:]; # Member of $LB3Breaks.
37 $NS = [:LineBreak = Nonstarter:];
38 $NU = [:LineBreak = Numeric:];
39 $OP = [:LineBreak = Open_Punctuation:];
40 $PO = [:LineBreak = Postfix_Numeric:];
41 $PR = [:LineBreak = Prefix_Numeric:];
42 $QU = [:LineBreak = Quotation:];
43 $SA = [:LineBreak = Complex_Context:]; # Folded into $ALPlus.
44 $SG = [:LineBreak = Surrogate:]; # Not referenced by any rule visible in this file.
45 $SP = [:LineBreak = Space:];
46 $SY = [:LineBreak = Break_Symbols:];
47 $WJ = [:LineBreak = Word_Joiner:];
48 $XX = [:LineBreak = Unknown:]; # Folded into $ALPlus.
49 $ZW = [:LineBreak = ZWSpace:];
50
51
52 #
53 # Korean Syllable Definitions
54 #
55 $L = [:Hangul_Syllable_Type = L:]; # Leading jamo.
56 $V = [:Hangul_Syllable_Type = V:]; # Vowel jamo.
57 $T = [:Hangul_Syllable_Type = T:]; # Trailing jamo.
58
59 $LV = [:Hangul_Syllable_Type = LV:]; # Precomposed leading+vowel syllable.
60 $LVT = [:Hangul_Syllable_Type = LVT:]; # Precomposed leading+vowel+trailing syllable.
61
62 $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+; # Three alternatives: a run of $L; optional $L run + vowel core ($LV? $V+, $LV or $LVT) + optional $T run; or a run of $T.
63
64 #
65 # Rule LB1. By default, treat AI (characters with ambiguous East Asian width),
66 # SA (South East Asian: Thai, Lao, Khmer), and
67 # XX (Unknown, unassigned)
68 # as $AL (Alphabetic).
69 #
70 $ALPlus = $AL | $AI | $SA | $XX; # The class used wherever the rules below mean "alphabetic".
71
72 #
73 # Combining Marks. X $CM* behaves as if it were X. Rule LB6.
74 #
75 $ALcm = $ALPlus $CM*; # Built on $ALPlus, not on plain $AL.
76 $BAcm = $BA $CM*;
77 $BBcm = $BB $CM*;
78 $B2cm = $B2 $CM*;
79 $CLcm = $CL $CM*;
80 $EXcm = $EX $CM*; # Not referenced by any rule visible in this file.
81 $GLcm = $GL $CM*;
82 $HYcm = $HY $CM*;
83 $IDcm = ($ID | $HangulSyllable) $CM*; # A whole Hangul syllable is handled like a single $ID character here.
84 $INcm = $IN $CM*;
85 $IScm = $IS $CM*;
86 $NScm = $NS $CM*;
87 $NUcm = $NU $CM*;
88 $OPcm = $OP $CM*;
89 $POcm = $PO $CM*;
90 $PRcm = $PR $CM*;
91 $QUcm = $QU $CM*;
92 $SPcm = $SP $CM*; # Not referenced by any rule visible in this file.
93 $SYcm = $SY $CM*;
94 $WJcm = $WJ $CM*;
95
96 #
97 # Each character class can stand by itself as an unbroken token, together with any trailing combining marks.
98 #
99 $ALPlus $CM+; # One rule per class: the base character followed by one or more combining marks.
100 $BA $CM+;
101 $BB $CM+;
102 $B2 $CM+;
103 $CL $CM+;
104 $EX $CM+;
105 $GL $CM+;
106 $HY $CM+;
107 $ID $CM+; # Plain $ID only; this rule does not mention $HangulSyllable.
108 $IN $CM+;
109 $IS $CM+;
110 $NS $CM+;
111 $NU $CM+;
112 $OP $CM+;
113 $PO $CM+;
114 $PR $CM+;
115 $QU $CM+;
116 $SP $CM+; # Space followed by combining marks also forms one token.
117 $SY $CM+;
118 $WJ $CM+;
119
120 ## -------------------------------------------------
121
122 !!forward; # Start of the forward rule section.
123
124 #
125 # Rule LB 3
126 $LB3Breaks = [$BK $CR $LF $NL]; # The four classes that end the rules carrying status {100} below.
127 $LB3NonBreaks = [^$BK $CR $LF $NL]; # Complement of $LB3Breaks.
128 $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]]; # $LB3NonBreaks with $ZW removed as well.
129
130 $LB3NonBreaks? $LB3Breaks {100}; # NOTE(review): {100} is presumably the status value reported for a mandatory break - confirm.
131 $LB5NonBreaks $CM* $LB3Breaks {100}; # Same, with combining marks allowed between the character and the break.
132 $CR $LF {100}; # A CR LF pair matches as one unit.
133
134 # LB 4 x SP
135 # x ZW
136 $ZW [$SP $ZW]; # $ZW followed by a space or another $ZW.
137 $LB5NonBreaks $CM* [$SP $ZW]; # Any other non-break character (plus marks) followed by a space or $ZW.
138
139 # LB 5 Break after zero width space
140 $LB5Breaks = [$LB3Breaks $ZW]; # Only referenced as look-ahead context ("/ $LB5Breaks") in the reverse rules.
141
142 # LB 6
143 #
144 # Korean Syllable Definitions
145 #
146
147 ($HangulSyllable) $CM*; # A whole Hangul syllable plus any trailing combining marks matches as one unit.
148
149 # LB 7 Combining marks. $SP $CM needs to behave like $ID.
150 # X $CM needs to behave like X, where X is not $SP.
151 # $CM not covered by the above needs to behave like $AL
152 #
153 $LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
154
155 # LB 8
156 $LB5NonBreaks $CM* $CL; # The four LB 8 rules keep $CL, $EX, $IS and $SY attached to the preceding non-break character and its combining marks.
157 $LB5NonBreaks $CM* $EX;
158 $LB5NonBreaks $CM* $IS;
159 $LB5NonBreaks $CM* $SY;
160
161 # LB 9
162 $OPcm $SP* .?; # Opening punctuation, optional spaces, then optionally any single character.
163 $OPcm $SP* $LB5NonBreaks $CM*; # Same, but trailing combining marks are only taken after a character that is not a break class or $ZW.
164
165 # LB 10
166 $QUcm $SP* $OPcm; # Quotation, optional spaces, opening punctuation.
167
168 # LB 11
169 $CLcm $SP* $NScm; # Closing punctuation, optional spaces, nonstarter.
170
171 # LB 11a
172 ($B2cm)+; # A run of Break_Both characters, each with its combining marks.
173
174 # LB 11b
175 $LB5NonBreaks $CM* ($GLcm | $WJcm); # Glue / word joiner stays attached to what precedes it ...
176 ($GLcm | $WJcm) .?; # ... and to the single character that follows it, if any.
177
178 # LB 12
179 $LB12NonBreaks = [$LB5NonBreaks - $SP]; # $LB5NonBreaks with $SP removed.
180
181 # LB 14
182 $LB12NonBreaks $CM* $QUcm+ .?; # Non-space character, one or more quotation marks, then optionally any single character.
183 $LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*; # Same, with combining marks allowed after a following non-break character.
184 $SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
185 $SP $CM+ $QUcm+ $LB5NonBreaks $CM*; # $SP $CM+ variant of the rule two lines above.
186
187 $QUcm $LB3NonBreaks?; # A quotation mark optionally followed by one character outside $LB3Breaks.
188 $QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
189
190 # LB 14a
191 $LB14NonBreaks = [$LB12NonBreaks - $CB]; # $LB12NonBreaks with $CB removed.
192 $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+; # Either such a character with optional marks, or a space carrying at least one mark.
193
194 # LB 15
195 $LB14CanBreakAfter ($BAcm | $HYcm | $NScm); # $BA, $HY and $NS stay attached to what precedes them.
196 $BBcm [^$CB]; # $BB stays attached to the following character unless that character is $CB.
197 $BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*; # Same, with combining marks allowed when the following character is also not a break class or $ZW.
198
199 # LB 16
200 $ALcm $INcm;
201 $CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
202 $IDcm $INcm;
203 $SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
204 $INcm $INcm;
205 $NUcm $INcm;
206
207
208 # LB 17
209 ($IDcm | $SP $CM+) $POcm; # Ideographic (or $SP $CM+, which behaves like $ID) followed by a postfix.
210 $ALcm+ $NUcm; # includes $LB19
211 $CM+ $NUcm; # Rule 7c
212 $NUcm $ALcm+; # Digit followed by a run of alphabetics.
213
214 # LB 18
215 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm | $SYcm)* $CLcm? $POcm?; # Numeric expression: optional prefix, optional opening punctuation or hyphen, a digit, any further digits / infix separators / symbols, optional closing punctuation, optional postfix.
216 $PRcm $ALcm; # Prefix followed by an alphabetic.
217 $PRcm $IDcm; # Prefix followed by an ideographic or Hangul syllable.
218
219 # LB 19
220 $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
221 $IScm $ALcm; # Infix separator followed by an alphabetic.
222
223 #
224 # Reverse Rules.
225 #
226 ## -------------------------------------------------
227
228 !!reverse; # Start of the reverse rule section; sequences here are written in mirrored order relative to the forward rules.
229
230 $CM+ $ALPlus; # Mirror of the forward "<class> $CM+" token rules: the combining marks come first, then the base character.
231 $CM+ $BA;
232 $CM+ $BB;
233 $CM+ $B2;
234 $CM+ $CL;
235 $CM+ $EX;
236 $CM+ $GL;
237 $CM+ $HY;
238 $CM+ $ID;
239 $CM+ $IN;
240 $CM+ $IS;
241 $CM+ $NS;
242 $CM+ $NU;
243 $CM+ $OP;
244 $CM+ $PO;
245 $CM+ $PR;
246 $CM+ $QU;
247 $CM+ $SP;
248 $CM+ $SY;
249 $CM+ $WJ;
250
251 # LB 3
252
253 $LB3Breaks $LB3NonBreaks; # Mirror of forward LB 3; note that no {100} status is attached in the reverse rules and the non-break character is not optional here.
254 $LB3Breaks $CM* $LB5NonBreaks;
255 $LF $CR; # Mirror of "$CR $LF".
256
257 # LB 4 x SP
258 # x ZW
259 [$SP $ZW] $LB3NonBreaks; # NOTE(review): the forward counterpart is "$ZW [$SP $ZW]"; this rule accepts any $LB3NonBreaks character before the space/$ZW - confirm this widening is intended.
260 [$SP $ZW] $CM* $LB5NonBreaks;
261
262 # LB 5 Break after zero width space
263
264 # LB 6 Jamo is treated like an alphabet
265
266 $BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+; # $HangulSyllable with each sequence written back to front.
267 $CM* $BackHangulSyllable; # Mirror of "($HangulSyllable) $CM*".
268
269 # LB 7 Combining marks.
270 # $SP $CM needs to behave like $ID.
271 # X $CM needs to behave like X, where X is not $SP.
272 # $CM not covered by the above needs to behave like $AL
273 # Stick together any combining sequences that don't match other rules.
274 $CM+ $LB5NonBreaks;
275
276 # LB 8
277 $CL $CM* $LB5NonBreaks; # Mirrors of the four forward LB 8 rules.
278 $EX $CM* $LB5NonBreaks;
279 $IS $CM* $LB5NonBreaks;
280 $SY $CM* $LB5NonBreaks;
281
282 # LB 9
283 $LB5NonBreaks $SP* $CM* $OP; # Scanning backwards: a non-break character, optional spaces, optional marks, then the opening punctuation.
284
285 # LB 10
286 $CM* $OP $SP* $CM* $QU; # Mirror of "$QUcm $SP* $OPcm".
287
288 # LB 11
289 $CM* $NS $SP* $CM* $CL; # Mirror of "$CLcm $SP* $NScm".
290
291 # LB 11a
292 ($CM* $B2)+; # Mirror of "($B2cm)+".
293
294 # LB 11b
295 $CM* ($GL | $WJ) $CM* $LB5NonBreaks; # Glue / word joiner preceded (in text order) by a non-break character.
296 $CM* $LB5NonBreaks $CM* ($GL | $WJ); # Non-break character preceded (in text order) by glue / word joiner.
297 . $CM* ($GL | $WJ); # Any single character preceded (in text order) by glue / word joiner.
298
299 # LB 12
300
301 # LB 14
302 $CM* $QU $CM* $LB12NonBreaks; # Quotation preceded (in text order) by a non-space, non-break character.
303 $CM* $QU $CM+ $SP; # Quotation preceded (in text order) by $SP $CM+.
304 $CM* $LB5NonBreaks $CM* $QU; # Non-break character preceded (in text order) by a quotation.
305
306 # LB 14a
307 $BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP); # Reverse form of $LB14CanBreakAfter; the base character is explicitly required not to be $CM.
308
309 # LB 15
310 $CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
311 ($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks; # "/ $LB5Breaks" is look-ahead context: marks with no base character before them other than a break class or $ZW.
312 [$CR $LF $BK $NL $ZW] $CM* $BB; # Mirror of "$BBcm [^$CB]" for the break classes and $ZW.
313 $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB; # Mirror of "$BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*".
314
315 # LB 16
316 $CM* $IN $CM* $ALPlus;
317 # by rule 7c, any otherwise unattached CM behaves as AL
318 $CM* $IN $CM+ / $LB5Breaks;
319
320 $CM* $IN $CM* ($ID | $CM $SP); # NOTE(review): the forward rule uses $IDcm, which includes Hangul syllables, but $BackHangulSyllable is not listed here (it is in the LB 18 "$PR" rule) - confirm this is intentional.
321 $CM* $IN $CM* $IN;
322 $CM* $IN $CM* $NU;
323
324 # LB 17
325 $CM* $PO $CM* ($ID | $CM $SP); # NOTE(review): $BackHangulSyllable is not listed here either, although the forward rule uses $IDcm - confirm.
326 $CM* $NU ($CM* $ALPlus)+; # includes $LB19
327 $CM* $NU $CM+ / $LB5Breaks; # Rule 7c
328
329 $CM* $ALPlus $CM* $NU; # Mirror of "$NUcm $ALcm+", one alphabetic at a time.
330
331 # LB 18
332 ($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?; # The forward numeric-expression rule with its elements in reverse order.
333 $CM* $ALPlus $CM* $PR;
334 $CM* ($ID | $BackHangulSyllable) $CM* $PR;
335
336 # LB 19
337 $CM* $ALPlus $CM* $ALPlus;
338 # The $CM* is from rule 7C, and unattached CM is treated as AL
339 $CM* $ALPlus $CM* $IS;
340 $CM* $ALPlus $CM+ / $LB5Breaks;
341
342 ## Problem: the state table can't handle look-ahead when it is at the
343 ## start of the string; this is currently handled in the rbbi code.
344 ## TODO: fix this.
345
346 ## -------------------------------------------------
347
348 !!safe_reverse; # Start of the safe_reverse rule section. NOTE(review): presumably used to back up to a position from which matching can safely restart - confirm against the RBBI documentation.
349
350 # LB 6
351 $V+ $L; # Scanning backwards: vowel jamo run, then a leading jamo.
352
353 # LB 7
354 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; # Marks, then a base that is none of $CM, a break class, $ZW or $SP.
355 $CM+ $SP / .; # Marks, then a space; "/ ." requires one more character as look-ahead context.
356
357 # LB 9
358 $SP+ $CM* $OP; # Spaces preceded (in text order) by opening punctuation.
359
360 # LB 10
361 $SP+ $CM* $QU; # Spaces preceded (in text order) by a quotation mark.
362
363 # LB 11
364 $SP+ $CM* $CL; # Spaces preceded (in text order) by closing punctuation.
365
366 # LB 18
367 ($CM* ($IS | $SY))+ $CM* $NU; # Infix separators / symbols preceded (in text order) by a digit.
368 $CL $CM* ($NU | $IS | $SY); # Closing punctuation preceded (in text order) by a digit, infix separator or symbol.
369
370 ## -------------------------------------------------
371
372 !!safe_forward; # Start of the safe_forward rule section. NOTE(review): presumably the forward-direction counterpart of the safe_reverse rules - confirm against the RBBI documentation.
373
374 # LB 6
375 $V+ $T; # Vowel jamo run followed by a trailing jamo.
376
377 # LB 7
378 [^$BK $CR $LF $NL $ZW $SP] $CM+; # A base that is not a break class, $ZW or $SP, followed by marks.
379 $SP $CM+ / [^$CM]; # Space plus marks; "/ [^$CM]" requires a following non-$CM character as look-ahead context.
380
381 # LB 9
382 $OP $CM* $SP+; # Opening punctuation followed by spaces.
383
384 # LB 10
385 $QU $CM* $SP+; # Quotation mark followed by spaces.
386
387 # LB 11
388 $CL $CM* $SP+; # Closing punctuation followed by spaces.
389
390 # LB 18
391 $CM* $PRcm? ($OPcm | $HYcm)? $NU; # Leading part of the LB 18 numeric expression, up to its first digit.