1 # Copyright (c) 2002-2004 International Business Machines Corporation and
2 # others. All Rights Reserved.
7 # Implement default line breaking as defined by Unicode TR 14.
12 # Character Classes defined by TR 14.
# Each variable below is bound to the set of characters whose LineBreak
# property has the named value; the rules that follow are written in terms
# of these variables.
19 $AI = [:LineBreak = Ambiguous:];
20 $AL = [:LineBreak = Alphabetic:];
21 $BA = [:LineBreak = Break_After:];
22 $BB = [:LineBreak = Break_Before:];
23 $BK = [:LineBreak = Mandatory_Break:];
24 $B2 = [:LineBreak = Break_Both:];
25 $CB = [:LineBreak = Contingent_Break:];
26 $CL = [:LineBreak = Close_Punctuation:];
27 $CM = [:LineBreak = Combining_Mark:];
28 $CR = [:LineBreak = Carriage_Return:];
29 $EX = [:LineBreak = Exclamation:];
30 $GL = [:LineBreak = Glue:];
31 $HY = [:LineBreak = Hyphen:];
32 $ID = [:LineBreak = Ideographic:];
33 $IN = [:LineBreak = Inseperable:]; # NOTE(review): "Inseperable" is presumably the historical property-value alias spelling - confirm before "correcting" it
34 $IS = [:LineBreak = Infix_Numeric:];
35 $LF = [:LineBreak = Line_Feed:];
36 $NL = [:LineBreak = Next_Line:];
37 $NS = [:LineBreak = Nonstarter:];
38 $NU = [:LineBreak = Numeric:];
39 $OP = [:LineBreak = Open_Punctuation:];
40 $PO = [:LineBreak = Postfix_Numeric:];
41 $PR = [:LineBreak = Prefix_Numeric:];
42 $QU = [:LineBreak = Quotation:];
43 $SA = [:LineBreak = Complex_Context:];
44 $SG = [:LineBreak = Surrogate:];
45 $SP = [:LineBreak = Space:];
46 $SY = [:LineBreak = Break_Symbols:];
47 $WJ = [:LineBreak = Word_Joiner:];
48 $XX = [:LineBreak = Unknown:];
49 $ZW = [:LineBreak = ZWSpace:];
53 # Korean Syllable Definitions
# $L, $V, $T, $LV and $LVT select characters by the value of their
# Hangul_Syllable_Type property.
55 $L = [:Hangul_Syllable_Type = L:];
56 $V = [:Hangul_Syllable_Type = V:];
57 $T = [:Hangul_Syllable_Type = T:];
59 $LV = [:Hangul_Syllable_Type = LV:];
60 $LVT = [:Hangul_Syllable_Type = LVT:];
# $HangulSyllable matches one of three shapes:
#   - one or more $L;
#   - zero or more $L, then ($LV? $V+ | $LV | $LVT), then zero or more $T;
#   - one or more $T.
62 $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+;
65 # Rule LB1. By default, treat AI (characters with ambiguous East Asian width),
66 # SA (South East Asian: Thai, Lao, Khmer) and
67 # XX (Unknown, unassigned) the same as AL: $ALPlus is the union of all four classes.
70 $ALPlus = $AL | $AI | $SA | $XX;
73 # Combining Marks. X $CM* behaves as if it were X. Rule LB6.
# $IDcm: a single $ID character or a whole $HangulSyllable, followed by any
# number of combining marks.
83 $IDcm = ($ID | $HangulSyllable) $CM*;
97 # Each class of character can stand by itself as an unbroken token, with trailing combining marks
120 ## -------------------------------------------------
# $LB3Breaks is the union of BK, CR, LF and NL; $LB3NonBreaks is its
# complement; $LB5NonBreaks additionally excludes ZW.
126 $LB3Breaks = [$BK $CR $LF $NL];
127 $LB3NonBreaks = [^$BK $CR $LF $NL];
128 $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# An optional preceding character followed by one of the $LB3Breaks characters.
# {100} is the rule status value attached to these two rules.
# NOTE(review): 100 is presumably the "hard line break" status - confirm against the break iterator API.
130 $LB3NonBreaks? $LB3Breaks {100};
131 $LB5NonBreaks $CM* $LB3Breaks {100};
# A character, its combining marks, then a following SP or ZW.
137 $LB5NonBreaks $CM* [$SP $ZW];
139 # LB 5 Break after zero width space
# $LB5Breaks is $LB3Breaks plus ZW.
140 $LB5Breaks = [$LB3Breaks $ZW];
144 # Korean Syllable Definitions
# A whole Hangul syllable followed by any number of combining marks.
147 ($HangulSyllable) $CM*;
149 # LB 7 Combining marks. $SP $CM needs to behave like $ID.
150 # X $CM needs to behave like X, where X is not $SP.
151 # $CM not covered by the above needs to behave like $AL
153 $LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules.
# A character (with its combining marks) followed by CL, EX, IS or SY.
156 $LB5NonBreaks $CM* $CL;
157 $LB5NonBreaks $CM* $EX;
158 $LB5NonBreaks $CM* $IS;
159 $LB5NonBreaks $CM* $SY;
# $OPcm, any number of spaces, then the next character and its combining marks.
# NOTE(review): $OPcm (and $GLcm, $WJcm below) are defined outside this excerpt;
# presumably "<class> $CM*" by analogy with $IDcm - confirm.
163 $OPcm $SP* $LB5NonBreaks $CM*;
# A character (with its combining marks) followed by $GLcm or $WJcm.
175 $LB5NonBreaks $CM* ($GLcm | $WJcm);
# $LB12NonBreaks is $LB5NonBreaks with SP removed.
179 $LB12NonBreaks = [$LB5NonBreaks - $SP];
# Quotation marks: a non-space character (or SP followed by CM+), one or more
# $QUcm, then either any single optional character (.?) or a $LB5NonBreaks
# character with its combining marks.
# NOTE(review): $QUcm is defined outside this excerpt.
182 $LB12NonBreaks $CM* $QUcm+ .?;
183 $LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*;
184 $SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID
185 $SP $CM+ $QUcm+ $LB5NonBreaks $CM*;
# A $QUcm followed by the next character.
187 $QUcm $LB3NonBreaks?;
188 $QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.
# $LB14NonBreaks is $LB12NonBreaks with CB removed.
191 $LB14NonBreaks = [$LB12NonBreaks - $CB];
# Either such a character with optional combining marks, or SP followed by at
# least one combining mark.
192 $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+;
# One of the above followed by $BAcm, $HYcm or $NScm.
195 $LB14CanBreakAfter ($BAcm | $HYcm | $NScm);
# $BBcm followed by any character other than CB, CR, LF, BK, NL or ZW, plus its combining marks.
197 $BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*;
# NOTE(review): the "...cm" variables used below ($INcm, $POcm, $ALcm, $NUcm,
# $PRcm, $OPcm, $HYcm, $IScm, $SYcm, $CLcm) are defined outside this excerpt.
201 $CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL
203 $SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID
# $IDcm, or SP followed by CM+, followed by $POcm.
209 ($IDcm | $SP $CM+) $POcm;
210 $ALcm+ $NUcm; # includes $LB19
211 $CM+ $NUcm; # Rule 7c
# Numeric expression: optional $PRcm, optional $OPcm or $HYcm, one $NUcm,
# any run of $NUcm / $IScm / $SYcm, then optional $CLcm and optional $POcm.
215 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm | $SYcm)* $CLcm? $POcm?;
220 $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL
226 ## -------------------------------------------------
# From here on the patterns list their operands in the opposite order from the
# earlier rules (e.g. $LB3Breaks comes first, the preceding character last).
# NOTE(review): presumably these are matched while scanning backwards; the
# section directive is not visible in this excerpt - confirm.
253 $LB3Breaks $LB3NonBreaks;
254 $LB3Breaks $CM* $LB5NonBreaks;
# SP or ZW, then the character on its other side.
259 [$SP $ZW] $LB3NonBreaks;
260 [$SP $ZW] $CM* $LB5NonBreaks;
262 # LB 5 Break after zero width space
264 # LB 6 Jamo is treated like an alphabet
# Same three alternatives as $HangulSyllable, with the $L* / $T* ends and the
# $LV? / $V+ order swapped.
266 $BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+;
# Combining marks come first here, then the syllable.
267 $CM* $BackHangulSyllable;
269 # LB 7 Combining marks.
270 # $SP $CM needs to behave like $ID.
271 # X $CM needs to behave like X, where X is not $SP.
272 # $CM not covered by the above needs to behave like $AL
273 # Stick together any combining sequences that don't match other rules.
# CL, EX, IS or SY, then combining marks, then a $LB5NonBreaks character
# (same elements as the earlier "$LB5NonBreaks $CM* $CL" group, order reversed).
277 $CL $CM* $LB5NonBreaks;
278 $EX $CM* $LB5NonBreaks;
279 $IS $CM* $LB5NonBreaks;
280 $SY $CM* $LB5NonBreaks;
# A character, optional spaces, optional combining marks, then OP.
283 $LB5NonBreaks $SP* $CM* $OP;
# OP and QU with optional spaces and combining marks between them.
286 $CM* $OP $SP* $CM* $QU;
# NS and CL with optional spaces and combining marks between them.
289 $CM* $NS $SP* $CM* $CL;
# GL or WJ on either side of a $LB5NonBreaks character.
295 $CM* ($GL | $WJ) $CM* $LB5NonBreaks;
296 $CM* $LB5NonBreaks $CM* ($GL | $WJ);
# QU next to a non-space character ($LB12NonBreaks), and any $LB5NonBreaks
# character next to QU.
302 $CM* $QU $CM* $LB12NonBreaks;
304 $CM* $LB5NonBreaks $CM* $QU;
# Counterpart of $LB14CanBreakAfter with the elements in the opposite order:
# combining marks then a non-CM $LB14NonBreaks character, or CM+ then SP.
307 $BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);
# BA, HY or NS (with leading combining marks) followed by the above.
310 $CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;
# One or more (CM* then BA/HY/NS) groups, then CM+; the $LB5Breaks after "/"
# is lookahead context rather than part of the match.
311 ($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks;
# BB preceded (in pattern order) by a CR/LF/BK/NL/ZW character and combining marks.
312 [$CR $LF $BK $NL $ZW] $CM* $BB;
# BB preceded (in pattern order) by any character other than CB, CR, LF, BK, NL or ZW.
313 $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;
# IN followed by an $ALPlus character (combining marks allowed around each).
316 $CM* $IN $CM* $ALPlus;
317 # by rule 7c, any otherwise unattached CM behaves as AL
# IN then CM+, with $LB5Breaks as lookahead context after "/".
318 $CM* $IN $CM+ / $LB5Breaks;
# IN followed by ID, or by a CM and then SP.
320 $CM* $IN $CM* ($ID | $CM $SP);
# PO followed by ID, or by a CM and then SP.
325 $CM* $PO $CM* ($ID | $CM $SP);
326 $CM* $NU ($CM* $ALPlus)+; # includes $LB19
327 $CM* $NU $CM+ / $LB5Breaks; # Rule 7c
# An $ALPlus character followed by NU.
329 $CM* $ALPlus $CM* $NU;
# Numeric expression with its parts in the opposite order from the earlier
# "$PRcm? ... $POcm?" rule: optional PO, optional CL, any run of NU / IS / SY,
# one NU, optional OP or HY, optional PR; combining marks may precede each part.
332 ($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?;
# PR after an $ALPlus character, or after ID / a Hangul syllable.
333 $CM* $ALPlus $CM* $PR;
334 $CM* ($ID | $BackHangulSyllable) $CM* $PR;
# Two $ALPlus characters in a row.
337 $CM* $ALPlus $CM* $ALPlus;
338 # The $CM* is from rule 7C, and unattached CM is treated as AL
339 $CM* $ALPlus $CM* $IS;
# An $ALPlus character then CM+, with $LB5Breaks as lookahead context after "/".
340 $CM* $ALPlus $CM+ / $LB5Breaks;
342 ## Problem: the state table can't handle lookahead when it is at the
343 ## start of the string; this is currently handled in the rbbi code.
346 ## -------------------------------------------------
# NOTE(review): the section directive for these rules is not visible in this excerpt.
# One or more combining marks followed by a character that is not CM, BK, CR,
# LF, NL, ZW or SP.
354 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
# One or more IS/SY characters (each with optional leading combining marks), then NU.
367 ($CM* ($IS | $SY))+ $CM* $NU;
# CL, optional combining marks, then NU, IS or SY.
368 $CL $CM* ($NU | $IS | $SY);
370 ## -------------------------------------------------
# NOTE(review): the section directive for these rules is not visible in this excerpt.
# A character that is not BK, CR, LF, NL, ZW or SP, followed by one or more combining marks.
378 [^$BK $CR $LF $NL $ZW $SP] $CM+;
# Optional combining marks, optional $PRcm, optional $OPcm or $HYcm, then NU.
# NOTE(review): $PRcm, $OPcm and $HYcm are defined outside this excerpt.
391 $CM* $PRcm? ($OPcm | $HYcm)? $NU;