]>
Commit | Line | Data |
---|---|---|
1 | # Copyright (c) 2002-2004 International Business Machines Corporation and | |
2 | # others. All Rights Reserved. | |
3 | # | |
4 | # file: line.txt | |
5 | # | |
6 | # Line Breaking Rules | |
7 | # Implement default line breaking as defined by Unicode TR 14. | |
8 | # | |
9 | ||
10 | ||
11 | # | |
12 | # Character classes defined by Unicode TR 14: each two-letter variable below is the set of characters whose LineBreak property has the named value. NOTE(review): the value name in $IN is spelled "Inseperable" here; verify it is an accepted alias before correcting it. | |
13 | # The three !!-prefixed lines that follow are options for the rule builder, not character classes. NOTE(review): confirm their exact effect against the RBBI rule compiler. | |
14 | ||
15 | !!chain; | |
16 | !!LBCMNoChain; | |
17 | !!lookAheadHardBreak; | |
18 | ||
19 | $AI = [:LineBreak = Ambiguous:]; | |
20 | $AL = [:LineBreak = Alphabetic:]; | |
21 | $BA = [:LineBreak = Break_After:]; | |
22 | $BB = [:LineBreak = Break_Before:]; | |
23 | $BK = [:LineBreak = Mandatory_Break:]; | |
24 | $B2 = [:LineBreak = Break_Both:]; | |
25 | $CB = [:LineBreak = Contingent_Break:]; | |
26 | $CL = [:LineBreak = Close_Punctuation:]; | |
27 | $CM = [:LineBreak = Combining_Mark:]; | |
28 | $CR = [:LineBreak = Carriage_Return:]; | |
29 | $EX = [:LineBreak = Exclamation:]; | |
30 | $GL = [:LineBreak = Glue:]; | |
31 | $HY = [:LineBreak = Hyphen:]; | |
32 | $ID = [:LineBreak = Ideographic:]; | |
33 | $IN = [:LineBreak = Inseperable:]; | |
34 | $IS = [:LineBreak = Infix_Numeric:]; | |
35 | $LF = [:LineBreak = Line_Feed:]; | |
36 | $NL = [:LineBreak = Next_Line:]; | |
37 | $NS = [:LineBreak = Nonstarter:]; | |
38 | $NU = [:LineBreak = Numeric:]; | |
39 | $OP = [:LineBreak = Open_Punctuation:]; | |
40 | $PO = [:LineBreak = Postfix_Numeric:]; | |
41 | $PR = [:LineBreak = Prefix_Numeric:]; | |
42 | $QU = [:LineBreak = Quotation:]; | |
43 | $SA = [:LineBreak = Complex_Context:]; | |
44 | $SG = [:LineBreak = Surrogate:]; | |
45 | $SP = [:LineBreak = Space:]; | |
46 | $SY = [:LineBreak = Break_Symbols:]; | |
47 | $WJ = [:LineBreak = Word_Joiner:]; | |
48 | $XX = [:LineBreak = Unknown:]; | |
49 | $ZW = [:LineBreak = ZWSpace:]; | |
50 | ||
51 | ||
52 | # | |
53 | # Korean Syllable Definitions, built from the Hangul_Syllable_Type property values L, V, T, LV and LVT. | |
54 | # $HangulSyllable matches one of: a run of $L; optional $L, then a core ($LV? $V+, or $LV, or $LVT), then optional $T; or a run of $T. | |
55 | $L = [:Hangul_Syllable_Type = L:]; | |
56 | $V = [:Hangul_Syllable_Type = V:]; | |
57 | $T = [:Hangul_Syllable_Type = T:]; | |
58 | ||
59 | $LV = [:Hangul_Syllable_Type = LV:]; | |
60 | $LVT = [:Hangul_Syllable_Type = LVT:]; | |
61 | ||
62 | $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+; | |
63 | ||
64 | # | |
65 | # Rule LB1. By default, treat AI (characters with ambiguous East Asian width), | |
66 | # SA (South East Asian: Thai, Lao, Khmer) | |
67 | # XX (Unknown, unassigned) | |
68 | # as $AL (Alphabetic) | |
69 | # $ALPlus is the resulting union; it is used below wherever a rule needs an "alphabetic" character. | |
70 | $ALPlus = $AL | $AI | $SA | $XX; | |
71 | ||
72 | # | |
73 | # Combining Marks. X $CM* behaves as if it were X. Rule LB6. | |
74 | # Each $XXcm variable is its base class followed by zero or more $CM. $ALcm is based on $ALPlus, and $IDcm accepts either $ID or a $HangulSyllable as its base. | |
75 | $ALcm = $ALPlus $CM*; | |
76 | $BAcm = $BA $CM*; | |
77 | $BBcm = $BB $CM*; | |
78 | $B2cm = $B2 $CM*; | |
79 | $CLcm = $CL $CM*; | |
80 | $EXcm = $EX $CM*; | |
81 | $GLcm = $GL $CM*; | |
82 | $HYcm = $HY $CM*; | |
83 | $IDcm = ($ID | $HangulSyllable) $CM*; | |
84 | $INcm = $IN $CM*; | |
85 | $IScm = $IS $CM*; | |
86 | $NScm = $NS $CM*; | |
87 | $NUcm = $NU $CM*; | |
88 | $OPcm = $OP $CM*; | |
89 | $POcm = $PO $CM*; | |
90 | $PRcm = $PR $CM*; | |
91 | $QUcm = $QU $CM*; | |
92 | $SPcm = $SP $CM*; | |
93 | $SYcm = $SY $CM*; | |
94 | $WJcm = $WJ $CM*; | |
95 | ||
96 | # | |
97 | # Each class of character, followed by one or more combining marks ($CM+), can stand by itself as an unbroken token. | |
98 | # These rules appear before the first section marker (!!forward) in this file. | |
99 | $ALPlus $CM+; | |
100 | $BA $CM+; | |
101 | $BB $CM+; | |
102 | $B2 $CM+; | |
103 | $CL $CM+; | |
104 | $EX $CM+; | |
105 | $GL $CM+; | |
106 | $HY $CM+; | |
107 | $ID $CM+; | |
108 | $IN $CM+; | |
109 | $IS $CM+; | |
110 | $NS $CM+; | |
111 | $NU $CM+; | |
112 | $OP $CM+; | |
113 | $PO $CM+; | |
114 | $PR $CM+; | |
115 | $QU $CM+; | |
116 | $SP $CM+; | |
117 | $SY $CM+; | |
118 | $WJ $CM+; | |
119 | ||
120 | ## ------------------------------------------------- | |
121 | ||
122 | !!forward; | |
123 | ||
124 | # Forward rules. | |
125 | # Rule LB 3: $LB3Breaks is the hard-break set ($BK $CR $LF $NL); $LB3NonBreaks is its complement, and $LB5NonBreaks additionally excludes $ZW. The three rules ending in a hard break carry the tag {100} (NOTE(review): presumably the status value reported for hard breaks; confirm). | |
126 | $LB3Breaks = [$BK $CR $LF $NL]; | |
127 | $LB3NonBreaks = [^$BK $CR $LF $NL]; | |
128 | $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]]; | |
129 | ||
130 | $LB3NonBreaks? $LB3Breaks {100}; | |
131 | $LB5NonBreaks $CM* $LB3Breaks {100}; | |
132 | $CR $LF {100}; | |
133 | ||
134 | # LB 4 x SP (keep a following $SP or $ZW attached to the preceding $ZW, | |
135 | # x ZW or to a preceding $LB5NonBreaks character and its combining marks) | |
136 | $ZW [$SP $ZW]; | |
137 | $LB5NonBreaks $CM* [$SP $ZW]; | |
138 | ||
139 | # LB 5 Break after zero width space. $LB5Breaks is the LB 3 hard-break set plus $ZW. | |
140 | $LB5Breaks = [$LB3Breaks $ZW]; | |
141 | ||
142 | # LB 6: keep a Hangul syllable together with any combining marks that follow it. | |
143 | # | |
144 | # Korean Syllable Definitions ($HangulSyllable is defined earlier in this file, after the character classes) | |
145 | # | |
146 | ||
147 | ($HangulSyllable) $CM*; | |
148 | ||
149 | # LB 7 Combining marks. $SP $CM needs to behave like $ID. | |
150 | # X $CM needs to behave like X, where X is not $SP. | |
151 | # $CM not covered by the above needs to behave like $AL. | |
152 | # | |
153 | $LB5NonBreaks $CM+; # Stick together any combining sequences that don't match other rules. | |
154 | ||
155 | # LB 8: after any $LB5NonBreaks character (and its combining marks), do not break before $CL, $EX, $IS or $SY. | |
156 | $LB5NonBreaks $CM* $CL; | |
157 | $LB5NonBreaks $CM* $EX; | |
158 | $LB5NonBreaks $CM* $IS; | |
159 | $LB5NonBreaks $CM* $SY; | |
160 | ||
161 | # LB 9: do not break after opening punctuation ($OP), even across intervening spaces. | |
162 | $OPcm $SP* .?; | |
163 | $OPcm $SP* $LB5NonBreaks $CM*; | |
164 | ||
165 | # LB 10: do not break between a quotation mark ($QU) and opening punctuation ($OP), even across intervening spaces. | |
166 | $QUcm $SP* $OPcm; | |
167 | ||
168 | # LB 11: do not break between closing punctuation ($CL) and a nonstarter ($NS), even across intervening spaces. | |
169 | $CLcm $SP* $NScm; | |
170 | ||
171 | # LB 11a: a run of $B2 characters (each with its combining marks) stays together. | |
172 | ($B2cm)+; | |
173 | ||
174 | # LB 11b: do not break before or after $GL or $WJ. | |
175 | $LB5NonBreaks $CM* ($GLcm | $WJcm); | |
176 | ($GLcm | $WJcm) .?; | |
177 | ||
178 | # LB 12: $LB12NonBreaks is $LB5NonBreaks without $SP. | |
179 | $LB12NonBreaks = [$LB5NonBreaks - $SP]; | |
180 | ||
181 | # LB 14: keep quotation marks ($QU) attached to the non-space character (or $SP $CM+ sequence) before them and to the character after them. | |
182 | $LB12NonBreaks $CM* $QUcm+ .?; | |
183 | $LB12NonBreaks $CM* $QUcm+ $LB5NonBreaks $CM*; | |
184 | $SP $CM+ $QUcm+ .?; # LB7a SP CM+ behaves as ID | |
185 | $SP $CM+ $QUcm+ $LB5NonBreaks $CM*; | |
186 | ||
187 | $QUcm $LB3NonBreaks?; | |
188 | $QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. | |
189 | ||
190 | # LB 14a: $LB14NonBreaks is $LB12NonBreaks without $CB; $LB14CanBreakAfter is such a character with its combining marks, or $SP followed by at least one $CM. | |
191 | $LB14NonBreaks = [$LB12NonBreaks - $CB]; | |
192 | $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+; | |
193 | ||
194 | # LB 15: do not break before $BA, $HY or $NS when a $LB14CanBreakAfter sequence precedes; do not break after $BB unless the next character is $CB. | |
195 | $LB14CanBreakAfter ($BAcm | $HYcm | $NScm); | |
196 | $BBcm [^$CB]; | |
197 | $BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*; | |
198 | ||
199 | # LB 16: do not break before $IN when it follows $AL, $ID, $IN or $NU (or a sequence that behaves like one of them). | |
200 | $ALcm $INcm; | |
201 | $CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL | |
202 | $IDcm $INcm; | |
203 | $SP $CM+ $INcm; # by rule 7a, $SP $CM behaves like ID | |
204 | $INcm $INcm; | |
205 | $NUcm $INcm; | |
206 | ||
207 | ||
208 | # LB 17: do not break between $ID and a following $PO, between $AL and a following $NU, or between $NU and a following $AL. | |
209 | ($IDcm | $SP $CM+) $POcm; | |
210 | $ALcm+ $NUcm; # includes $LB19 | |
211 | $CM+ $NUcm; # Rule 7c | |
212 | $NUcm $ALcm+; | |
213 | ||
214 | # LB 18: a numeric expression stays together: optional $PR, optional $OP or $HY, a $NU, any further $NU/$IS/$SY, then optional $CL and optional $PO. $PR also stays with a following $AL or $ID. | |
215 | $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm | $SYcm)* $CLcm? $POcm?; | |
216 | $PRcm $ALcm; | |
217 | $PRcm $IDcm; | |
218 | ||
219 | # LB 19: do not break within a run of alphabetics, or between $IS and a following alphabetic. | |
220 | $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL | |
221 | $IScm $ALcm; | |
219 | # LB 19 | |
220 | $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL | |
221 | $IScm $ALcm; | |
222 | ||
223 | # | |
224 | # Reverse Rules. Patterns in this section are written in reverse text order compared with the forward rules (for example $LF $CR here versus $CR $LF there). | |
225 | # The first group below is the reverse-order form of the "class followed by $CM+" token rules: the combining marks come first, then the base class. | |
226 | ## ------------------------------------------------- | |
227 | ||
228 | !!reverse; | |
229 | ||
230 | $CM+ $ALPlus; | |
231 | $CM+ $BA; | |
232 | $CM+ $BB; | |
233 | $CM+ $B2; | |
234 | $CM+ $CL; | |
235 | $CM+ $EX; | |
236 | $CM+ $GL; | |
237 | $CM+ $HY; | |
238 | $CM+ $ID; | |
239 | $CM+ $IN; | |
240 | $CM+ $IS; | |
241 | $CM+ $NS; | |
242 | $CM+ $NU; | |
243 | $CM+ $OP; | |
244 | $CM+ $PO; | |
245 | $CM+ $PR; | |
246 | $CM+ $QU; | |
247 | $CM+ $SP; | |
248 | $CM+ $SY; | |
249 | $CM+ $WJ; | |
250 | ||
251 | # LB 3: hard breaks, in reverse order (the break character is matched first, then what precedes it in the text). | |
252 | ||
253 | $LB3Breaks $LB3NonBreaks; | |
254 | $LB3Breaks $CM* $LB5NonBreaks; | |
255 | $LF $CR; | |
256 | ||
257 | # LB 4 x SP (reverse order: the $SP or $ZW is matched first, | |
258 | # x ZW then the character it stays attached to) | |
259 | [$SP $ZW] $LB3NonBreaks; | |
260 | [$SP $ZW] $CM* $LB5NonBreaks; | |
261 | ||
262 | # LB 5 Break after zero width space (no rule is given for it in this section) | |
263 | ||
264 | # LB 6 Jamo is treated like an alphabet. $BackHangulSyllable is $HangulSyllable with its components in reverse order. | |
265 | ||
266 | $BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+; | |
267 | $CM* $BackHangulSyllable; | |
268 | ||
269 | # LB 7 Combining marks. | |
270 | # $SP $CM needs to behave like $ID. | |
271 | # X $CM needs to behave like X, where X is not $SP. | |
272 | # $CM not covered by the above needs to behave like $AL. | |
273 | # Stick together any combining sequences that don't match other rules. | |
274 | $CM+ $LB5NonBreaks; | |
275 | ||
276 | # LB 8: reverse form of "do not break before $CL, $EX, $IS or $SY". | |
277 | $CL $CM* $LB5NonBreaks; | |
278 | $EX $CM* $LB5NonBreaks; | |
279 | $IS $CM* $LB5NonBreaks; | |
280 | $SY $CM* $LB5NonBreaks; | |
281 | ||
282 | # LB 9: reverse form of "do not break after $OP, even across spaces". | |
283 | $LB5NonBreaks $SP* $CM* $OP; | |
284 | ||
285 | # LB 10: reverse form of "do not break between $QU and $OP, even across spaces". | |
286 | $CM* $OP $SP* $CM* $QU; | |
287 | ||
288 | # LB 11: reverse form of "do not break between $CL and $NS, even across spaces". | |
289 | $CM* $NS $SP* $CM* $CL; | |
290 | ||
291 | # LB 11a: a run of $B2 characters, each preceded (in reverse order) by its combining marks. | |
292 | ($CM* $B2)+; | |
293 | ||
294 | # LB 11b: do not break before or after $GL or $WJ (reverse order). | |
295 | $CM* ($GL | $WJ) $CM* $LB5NonBreaks; | |
296 | $CM* $LB5NonBreaks $CM* ($GL | $WJ); | |
297 | . $CM* ($GL | $WJ); | |
298 | ||
299 | # LB 12 (no rule here; $LB12NonBreaks from the forward section is reused below) | |
300 | ||
301 | # LB 14: quotation marks ($QU), reverse order. | |
302 | $CM* $QU $CM* $LB12NonBreaks; | |
303 | $CM* $QU $CM+ $SP; | |
304 | $CM* $LB5NonBreaks $CM* $QU; | |
305 | ||
306 | # LB 14a: reverse-order counterpart of $LB14CanBreakAfter (combining marks first, then a non-$CM $LB14NonBreaks character or $SP). | |
307 | $BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP); | |
308 | ||
309 | # LB 15: $BA, $HY, $NS and $BB, reverse order. The rule containing "/ $LB5Breaks" uses the rule language's look-ahead operator. NOTE(review): presumably this covers unattached $CM directly after a break character; confirm. | |
310 | $CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter; | |
311 | ($CM* ($BA | $HY | $NS))+ $CM+ / $LB5Breaks; | |
312 | [$CR $LF $BK $NL $ZW] $CM* $BB; | |
313 | $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB; | |
314 | ||
315 | # LB 16: $IN after $AL, $ID, $IN or $NU, reverse order. | |
316 | $CM* $IN $CM* $ALPlus; | |
317 | # By rule 7c, any otherwise unattached CM behaves as AL. | |
318 | $CM* $IN $CM+ / $LB5Breaks; | |
319 | ||
320 | $CM* $IN $CM* ($ID | $CM $SP); | |
321 | $CM* $IN $CM* $IN; | |
322 | $CM* $IN $CM* $NU; | |
323 | ||
324 | # LB 17: $ID before $PO, and $AL next to $NU, reverse order. | |
325 | $CM* $PO $CM* ($ID | $CM $SP); | |
326 | $CM* $NU ($CM* $ALPlus)+; # includes $LB19 | |
327 | $CM* $NU $CM+ / $LB5Breaks; # Rule 7c | |
328 | ||
329 | $CM* $ALPlus $CM* $NU; | |
330 | ||
331 | # LB 18: the numeric-expression pattern with its components in reverse order ($PO, $CL, $NU/$IS/$SY, $NU, $OP or $HY, $PR), plus $AL or $ID followed in the text by... NOTE(review): the last two rules match $ALPlus / $ID first and $PR second in reverse order; confirm the intended text order. | |
332 | ($CM* $PO)? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* $PR)?; | |
333 | $CM* $ALPlus $CM* $PR; | |
334 | $CM* ($ID | $BackHangulSyllable) $CM* $PR; | |
335 | ||
336 | # LB 19: runs of alphabetics and $IS before an alphabetic, reverse order. | |
337 | $CM* $ALPlus $CM* $ALPlus; | |
338 | # The $CM* is from rule 7C, and unattached CM is treated as AL. | |
339 | $CM* $ALPlus $CM* $IS; | |
340 | $CM* $ALPlus $CM+ / $LB5Breaks; | |
341 | ||
342 | ## Known problem: the state table can't handle look-ahead when it is at the | |
343 | ## start of the string; this is currently handled in the rbbi code. | |
344 | ## TODO: fix this. | |
345 | ||
346 | ## ------------------------------------------------- | |
347 | ||
348 | !!safe_reverse; | |
349 | ||
350 | # LB 6: a run of $V followed (in reverse order) by $L. NOTE(review): the rules in this section presumably back up to a position from which forward iteration can safely restart; confirm against the RBBI documentation. | |
351 | $V+ $L; | |
352 | ||
353 | # LB 7: combining marks followed (in reverse order) by their base character; the $SP case uses the look-ahead operator "/". | |
354 | $CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; | |
355 | $CM+ $SP / .; | |
356 | ||
357 | # LB 9: spaces, then optional combining marks, then $OP (reverse order). | |
358 | $SP+ $CM* $OP; | |
359 | ||
360 | # LB 10: spaces, then optional combining marks, then $QU (reverse order). | |
361 | $SP+ $CM* $QU; | |
362 | ||
363 | # LB 11: spaces, then optional combining marks, then $CL (reverse order). | |
364 | $SP+ $CM* $CL; | |
365 | ||
366 | # LB 18: pieces of a numeric expression ($IS/$SY separators back to a $NU, and $CL back to $NU/$IS/$SY), reverse order. | |
367 | ($CM* ($IS | $SY))+ $CM* $NU; | |
368 | $CL $CM* ($NU | $IS | $SY); | |
369 | ||
370 | ## ------------------------------------------------- | |
371 | ||
372 | !!safe_forward; | |
373 | ||
374 | # LB 6: a run of $V followed by $T. NOTE(review): the rules in this section presumably advance to a position from which reverse iteration can safely restart; confirm against the RBBI documentation. | |
375 | $V+ $T; | |
376 | ||
377 | # LB 7: a base character (not a hard break, $ZW or $SP) followed by combining marks; the $SP case uses the look-ahead operator "/". | |
378 | [^$BK $CR $LF $NL $ZW $SP] $CM+; | |
379 | $SP $CM+ / [^$CM]; | |
380 | ||
381 | # LB 9: $OP, optional combining marks, then one or more spaces. | |
382 | $OP $CM* $SP+; | |
383 | ||
384 | # LB 10: $QU, optional combining marks, then one or more spaces. | |
385 | $QU $CM* $SP+; | |
386 | ||
387 | # LB 11: $CL, optional combining marks, then one or more spaces. | |
388 | $CL $CM* $SP+; | |
389 | ||
390 | # LB 18: the start of a numeric expression: optional $PR, optional $OP or $HY, then $NU. | |
391 | $CM* $PRcm? ($OPcm | $HYcm)? $NU; |