]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | # Copyright (c) 2002-2006 International Business Machines Corporation and\r |
2 | # others. All Rights Reserved.\r | |
3 | #\r | |
4 | # file: line.txt\r | |
5 | #\r | |
6 | # Line Breaking Rules\r | |
7 | # Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0\r | |
8 | # http://www.unicode.org/reports/tr14/\r | |
9 | \r | |
10 | \r | |
11 | \r | |
12 | #\r | |
13 | # Character Classes defined by TR 14.\r | |
14 | #\r | |
15 | \r | |
16 | !!chain;\r | |
17 | !!LBCMNoChain;\r | |
18 | \r | |
19 | \r | |
20 | !!lookAheadHardBreak;\r | |
21 | #\r | |
22 | # !!lookAheadHardBreak Described here because it is (as yet) undocumented elsewhere\r | |
23 | # and only used for the line break rules.\r | |
24 | #\r | |
25 | # It is used in the implementation of the incredibly annoying rule LB 10\r | |
26 | # which says to treat any combining mark that is not attached to a base\r | |
27 | # character as if it were of class AL (alphabetic).\r | |
28 | #\r | |
29 | # The problem occurs in the reverse rules.\r | |
30 | #\r | |
31 | # Consider a sequence like the following, with the correct breaks marked by ^\r | |
32 | # LF ID CM AL AL\r | |
33 | # ^ ^ ^\r | |
34 | # Then consider the sequence without the initial ID (ideographic)\r | |
35 | # LF CM AL AL\r | |
36 | # ^ ^\r | |
37 | # Our CM, which in the first example was attached to the ideograph,\r | |
38 | # is now unattached, becomes an alpha, and joins in with the other\r | |
39 | # alphas.\r | |
40 | #\r | |
41 | # When iterating forwards, these sequences do not present any problems.\r | |
42 | # When iterating backwards, we need to look ahead when encountering\r | |
43 | # a CM to see whether it attaches to something further on or not.\r | |
44 | # (Look-ahead in a reverse rule is looking towards the start of the text.)\r | |
45 | #\r | |
46 | # If the CM is unattached, we need to force a break.\r | |
47 | #\r | |
48 | # !!lookAheadHardBreak forces the run time state machine to\r | |
49 | # stop immediately when a look ahead rule ( '/' operator) matches,\r | |
50 | # and set the match position to that of the look-ahead operator,\r | |
51 | # no matter what other rules may be in play at the time.\r | |
52 | #\r | |
53 | # See the '/' look-ahead rules near the top of the !!reverse section for an example.\r | |
54 | #\r | |
55 | \r | |
56 | $AI = [:LineBreak = Ambiguous:];\r | |
57 | $AL = [:LineBreak = Alphabetic:];\r | |
58 | $BA = [:LineBreak = Break_After:];\r | |
59 | $BB = [:LineBreak = Break_Before:];\r | |
60 | $BK = [:LineBreak = Mandatory_Break:];\r | |
61 | $B2 = [:LineBreak = Break_Both:];\r | |
62 | $CB = [:LineBreak = Contingent_Break:];\r | |
63 | $CL = [:LineBreak = Close_Punctuation:];\r | |
64 | $CM = [:LineBreak = Combining_Mark:];\r | |
65 | $CR = [:LineBreak = Carriage_Return:];\r | |
66 | $EX = [:LineBreak = Exclamation:];\r | |
67 | $GL = [:LineBreak = Glue:];\r | |
68 | $HY = [:LineBreak = Hyphen:];\r | |
69 | $H2 = [:LineBreak = H2:];\r | |
70 | $H3 = [:LineBreak = H3:];\r | |
71 | $ID = [:LineBreak = Ideographic:];\r | |
72 | $IN = [:LineBreak = Inseperable:];\r | |
73 | $IS = [:LineBreak = Infix_Numeric:];\r | |
74 | $JL = [:LineBreak = JL:];\r | |
75 | $JV = [:LineBreak = JV:];\r | |
76 | $JT = [:LineBreak = JT:];\r | |
77 | $LF = [:LineBreak = Line_Feed:];\r | |
78 | $NL = [:LineBreak = Next_Line:];\r | |
79 | $NS = [:LineBreak = Nonstarter:];\r | |
80 | $NU = [:LineBreak = Numeric:];\r | |
81 | $OP = [:LineBreak = Open_Punctuation:];\r | |
82 | $PO = [:LineBreak = Postfix_Numeric:];\r | |
83 | $PR = [:LineBreak = Prefix_Numeric:];\r | |
84 | $QU = [:LineBreak = Quotation:];\r | |
85 | $SA = [:LineBreak = Complex_Context:];\r | |
86 | $SG = [:LineBreak = Surrogate:];\r | |
87 | $SP = [:LineBreak = Space:];\r | |
88 | $SY = [:LineBreak = Break_Symbols:];\r | |
89 | $WJ = [:LineBreak = Word_Joiner:];\r | |
90 | $XX = [:LineBreak = Unknown:];\r | |
91 | $ZW = [:LineBreak = ZWSpace:];\r | |
92 | \r | |
93 | # Dictionary character set, for triggering language-based break engines. Currently\r | |
94 | # limited to LineBreak=Complex_Context. Note that this set only works in Unicode\r | |
95 | # 5.0 or later, where the definition of Complex_Context was corrected to include all\r | |
96 | # characters requiring dictionary break. (In the class list above, "Inseperable" is an accepted Unicode alias of Inseparable, not a typo.)\r | |
97 | \r | |
98 | $dictionary = [:LineBreak = Complex_Context:];\r | |
99 | \r | |
100 | #\r | |
101 | # Rule LB 1. By default, treat AI (characters with ambiguous East Asian width),\r | |
102 | # SA (South East Asian: Thai, Lao, Khmer),\r | |
103 | # SG (unpaired surrogates) and\r | |
104 | # XX (unknown, unassigned)\r | |
105 | # as $AL (Alphabetic). $ALPlus is the resulting combined set.\r | |
106 | #\r | |
107 | $ALPlus = [$AL $AI $SA $SG $XX];\r | |
108 | \r | |
109 | #\r | |
110 | # Combining Marks. X $CM* behaves as if it were X. Rule LB 9. Each $XXcm below is class XX followed by any number of $CM.\r | |
111 | #\r | |
112 | $ALcm = $ALPlus $CM*;\r | |
113 | $BAcm = $BA $CM*;\r | |
114 | $BBcm = $BB $CM*;\r | |
115 | $B2cm = $B2 $CM*;\r | |
116 | $CLcm = $CL $CM*;\r | |
117 | $EXcm = $EX $CM*;\r | |
118 | $GLcm = $GL $CM*;\r | |
119 | $HYcm = $HY $CM*;\r | |
120 | $H2cm = $H2 $CM*;\r | |
121 | $H3cm = $H3 $CM*;\r | |
122 | $IDcm = $ID $CM*;\r | |
123 | $INcm = $IN $CM*;\r | |
124 | $IScm = $IS $CM*;\r | |
125 | $JLcm = $JL $CM*;\r | |
126 | $JVcm = $JV $CM*;\r | |
127 | $JTcm = $JT $CM*;\r | |
128 | $NScm = $NS $CM*;\r | |
129 | $NUcm = $NU $CM*;\r | |
130 | $OPcm = $OP $CM*;\r | |
131 | $POcm = $PO $CM*;\r | |
132 | $PRcm = $PR $CM*;\r | |
133 | $QUcm = $QU $CM*;\r | |
134 | $SYcm = $SY $CM*;\r | |
135 | $WJcm = $WJ $CM*;\r | |
136 | \r | |
137 | ## -------------------------------------------------\r | |
138 | \r | |
139 | !!forward;\r | |
140 | \r | |
141 | #\r | |
142 | # A character of each class followed by one or more combining marks ($CM+) can stand by itself as an unbroken token.\r | |
143 | #\r | |
144 | $ALPlus $CM+;\r | |
145 | $BA $CM+;\r | |
146 | $BB $CM+;\r | |
147 | $B2 $CM+;\r | |
148 | $CL $CM+;\r | |
149 | $EX $CM+;\r | |
150 | $GL $CM+;\r | |
151 | $HY $CM+;\r | |
152 | $H2 $CM+;\r | |
153 | $H3 $CM+;\r | |
154 | $ID $CM+;\r | |
155 | $IN $CM+;\r | |
156 | $IS $CM+;\r | |
157 | $JL $CM+;\r | |
158 | $JV $CM+;\r | |
159 | $JT $CM+;\r | |
160 | $NS $CM+;\r | |
161 | $NU $CM+;\r | |
162 | $OP $CM+;\r | |
163 | $PO $CM+;\r | |
164 | $PR $CM+;\r | |
165 | $QU $CM+;\r | |
166 | $SY $CM+;\r | |
167 | $WJ $CM+;\r | |
168 | \r | |
169 | #\r | |
170 | # CAN_CM is the set of characters that may combine with CM combining chars.\r | |
171 | # Note that UAX #14's (line breaking) concept of a combining char and the rules\r | |
172 | # for what they can combine with are _very_ different from the rest of Unicode.\r | |
173 | #\r | |
174 | # Note that $CM itself is left out of this set. If CM is needed as a base\r | |
175 | # it must be listed separately in the rule.\r | |
176 | #\r | |
177 | $CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs\r | |
178 | $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs\r | |
179 | \r | |
180 | #\r | |
181 | # AL_FOLLOW set of chars that can unconditionally follow an AL\r | |
182 | # Needed in rules where stand-alone $CM s are treated as AL.\r | |
183 | # Chaining is disabled with CM because it causes other failures,\r | |
184 | # so for this one case we need to manually list out longer sequences.\r | |
185 | # The _NOCM part cannot take combining marks (compare $CANT_CM); the _CM part can (compare $CAN_CM).\r | |
186 | $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];\r | |
187 | $AL_FOLLOW_CM = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];\r | |
188 | $AL_FOLLOW = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];\r | |
189 | \r | |
190 | \r | |
191 | #\r | |
192 | # Rule LB 4, 5 Mandatory (Hard) breaks. CR LF is kept together as one break.\r | |
193 | #\r | |
194 | $LB4Breaks = [$BK $CR $LF $NL];\r | |
195 | $LB4NonBreaks = [^$BK $CR $LF $NL];\r | |
196 | $CR $LF {100};\r | |
197 | \r | |
198 | #\r | |
199 | # LB 6 Do not break before hard line breaks.\r | |
200 | #\r | |
201 | $LB4NonBreaks? $LB4Breaks {100}; # LB 6 do not break before hard breaks.\r | |
202 | $CAN_CM $CM* $LB4Breaks {100};\r | |
203 | $CM+ $LB4Breaks {100};\r | |
204 | \r | |
205 | # LB 7 x SP (do not break before spaces or zero width space)\r | |
206 | # x ZW\r | |
207 | $LB4NonBreaks [$SP $ZW];\r | |
208 | $CAN_CM $CM* [$SP $ZW];\r | |
209 | $CM+ [$SP $ZW];\r | |
210 | \r | |
211 | #\r | |
212 | # LB 8 Break after zero width space\r | |
213 | #\r | |
214 | $LB8Breaks = [$LB4Breaks $ZW];\r | |
215 | $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];\r | |
216 | \r | |
217 | \r | |
218 | # LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK, $CR, $LF, $NL or $ZW.\r | |
219 | # LB 10 $CM not covered by the above needs to behave like $AL.\r | |
220 | # See definition of $CAN_CM.\r | |
221 | \r | |
222 | $CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.\r | |
223 | $CM+;\r | |
224 | \r | |
225 | #\r | |
226 | # LB 11 Do not break before or after WORD JOINER & related characters.\r | |
227 | #\r | |
228 | $CAN_CM $CM* $WJcm;\r | |
229 | $LB8NonBreaks $WJcm;\r | |
230 | $CM+ $WJcm;\r | |
231 | \r | |
232 | $WJcm [^$CAN_CM];\r | |
233 | $WJcm $CAN_CM $CM*;\r | |
234 | \r | |
235 | #\r | |
236 | # LB 12 Do not break before or after NBSP and related characters.\r | |
237 | #\r | |
238 | # (!SP) x GL\r | |
239 | [$LB8NonBreaks-$SP] $CM* $GLcm;\r | |
240 | $CM+ $GLcm;\r | |
241 | \r | |
242 | # GL x\r | |
243 | $GLcm ($LB8Breaks | $SP);\r | |
244 | $GLcm [$LB8NonBreaks-$SP] $CM*; # Don't let a combining mark go onto $CR, $BK, etc.\r | |
245 | # TODO: I don't think we need this rule (the $GLcm [$LB8NonBreaks-$SP] $CM* rule just above).\r | |
246 | # All but $CM will chain off of preceding rule.\r | |
247 | # $GLcm will pick up the CM case by itself.\r | |
248 | \r | |
249 | \r | |
250 | \r | |
251 | \r | |
252 | #\r | |
253 | # LB 13 Don't break before ']' or '!' or ';' or '/' (classes CL, EX, IS, SY), even after spaces.\r | |
254 | #\r | |
255 | $LB8NonBreaks $CL;\r | |
256 | $CAN_CM $CM* $CL;\r | |
257 | $CM+ $CL; # by LB 10, stand-alone CM behaves as AL\r | |
258 | \r | |
259 | $LB8NonBreaks $EX;\r | |
260 | $CAN_CM $CM* $EX;\r | |
261 | $CM+ $EX; # by LB 10, stand-alone CM behaves as AL\r | |
262 | \r | |
263 | $LB8NonBreaks $IS;\r | |
264 | $CAN_CM $CM* $IS;\r | |
265 | $CM+ $IS; # by LB 10, stand-alone CM behaves as AL\r | |
266 | \r | |
267 | $LB8NonBreaks $SY;\r | |
268 | $CAN_CM $CM* $SY;\r | |
269 | $CM+ $SY; # by LB 10, stand-alone CM behaves as AL\r | |
270 | \r | |
271 | \r | |
272 | #\r | |
273 | # LB 14 Do not break after OP, even after spaces\r | |
274 | #\r | |
275 | $OPcm $SP* $CAN_CM $CM*;\r | |
276 | $OPcm $SP* $CANT_CM;\r | |
277 | \r | |
278 | $OPcm $SP+ $CM+ $AL_FOLLOW?; # by LB 10, stand-alone CM behaves as AL\r | |
279 | \r | |
280 | # LB 15 QU SP* x OP\r | |
281 | $QUcm $SP* $OPcm;\r | |
282 | \r | |
283 | # LB 16 CL SP* x NS\r | |
284 | $CLcm $SP* $NScm;\r | |
285 | \r | |
286 | # LB 17 B2 SP* x B2\r | |
287 | $B2cm $SP* $B2cm;\r | |
288 | \r | |
289 | #\r | |
290 | # LB 18 Break after spaces.\r | |
291 | #\r | |
292 | $LB18NonBreaks = [$LB8NonBreaks - [$SP]];\r | |
293 | $LB18Breaks = [$LB8Breaks $SP];\r | |
294 | \r | |
295 | \r | |
296 | # LB 19 Do not break before or after quotation marks.\r | |
297 | # x QU\r | |
298 | $LB18NonBreaks $CM* $QUcm;\r | |
299 | $CM+ $QUcm;\r | |
300 | \r | |
301 | # QU x\r | |
302 | $QUcm .?;\r | |
303 | $QUcm $LB18NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc.\r | |
304 | # TODO: I don't think this rule ($QUcm $LB18NonBreaks $CM*, just above) is needed.\r | |
305 | \r | |
306 | \r | |
307 | # LB 20 Break before and after CB. No explicit rule; $CB is only excluded from the set below.\r | |
308 | # <break> $CB\r | |
309 | # $CB <break>\r | |
310 | \r | |
311 | $LB20NonBreaks = [$LB18NonBreaks - $CB];\r | |
312 | \r | |
313 | # LB 21 x (BA | HY | NS)\r | |
314 | # BB x\r | |
315 | #\r | |
316 | $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm); \r | |
317 | \r | |
318 | $BBcm [^$CB]; # $BB x\r | |
319 | $BBcm $LB20NonBreaks $CM*;\r | |
320 | \r | |
321 | # LB 22 (AL | ID | IN | NU) x IN\r | |
322 | $ALcm $INcm;\r | |
323 | $CM+ $INcm; # by LB 10, any otherwise unattached CM behaves as AL\r | |
324 | $IDcm $INcm;\r | |
325 | $INcm $INcm;\r | |
326 | $NUcm $INcm;\r | |
327 | \r | |
328 | \r | |
329 | # LB 23 ID x PO, AL x NU, NU x AL\r | |
330 | $IDcm $POcm;\r | |
331 | $ALcm $NUcm; # includes $LB19\r | |
332 | $CM+ $NUcm; # LB 10, any otherwise unattached CM behaves as AL\r | |
333 | $NUcm $ALcm;\r | |
334 | \r | |
335 | #\r | |
336 | # LB 24 PR x ID, PR x AL, PO x AL\r | |
337 | #\r | |
338 | $PRcm $IDcm;\r | |
339 | $PRcm $ALcm;\r | |
340 | $POcm $ALcm;\r | |
341 | \r | |
342 | #\r | |
343 | # LB 25 Numbers.\r | |
344 | #\r | |
345 | ($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?;\r | |
346 | \r | |
347 | # LB 26 Do not break a Korean syllable\r | |
348 | #\r | |
349 | $JLcm ($JLcm | $JVcm | $H2cm | $H3cm);\r | |
350 | ($JVcm | $H2cm) ($JVcm | $JTcm);\r | |
351 | ($JTcm | $H3cm) $JTcm;\r | |
352 | \r | |
353 | # LB 27 Treat Korean syllable blocks the same as ID with respect to IN, PO and PR (don't break)\r | |
354 | ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;\r | |
355 | ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;\r | |
356 | $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);\r | |
357 | \r | |
358 | \r | |
359 | # LB 28 Do not break between alphabetics\r | |
360 | #\r | |
361 | $ALcm $ALcm;\r | |
362 | $CM+ $ALcm; # The $CM+ is from LB 10, an unattached CM is treated as AL\r | |
363 | \r | |
364 | # LB 29 IS x AL\r | |
365 | $IScm $ALcm;\r | |
366 | \r | |
367 | #\r | |
368 | # LB 30 Do not break between letters, numbers or ordinary symbols\r | |
369 | # and opening or closing punctuation\r | |
370 | #\r | |
371 | ($ALcm | $NUcm) $OPcm;\r | |
372 | $CM+ $OPcm;\r | |
373 | $CLcm ($ALcm | $NUcm);\r | |
374 | \r | |
375 | \r | |
376 | \r | |
377 | #\r | |
378 | # Reverse Rules. Each rule is written back to front relative to its forward counterpart.\r | |
379 | #\r | |
380 | ## -------------------------------------------------\r | |
381 | \r | |
382 | !!reverse;\r | |
383 | \r | |
384 | $CM+ $ALPlus;\r | |
385 | $CM+ $BA;\r | |
386 | $CM+ $BB;\r | |
387 | $CM+ $B2;\r | |
388 | $CM+ $CL;\r | |
389 | $CM+ $EX;\r | |
390 | $CM+ $GL;\r | |
391 | $CM+ $HY;\r | |
392 | $CM+ $H2;\r | |
393 | $CM+ $H3;\r | |
394 | $CM+ $ID;\r | |
395 | $CM+ $IN;\r | |
396 | $CM+ $IS;\r | |
397 | $CM+ $JL;\r | |
398 | $CM+ $JV;\r | |
399 | $CM+ $JT;\r | |
400 | $CM+ $NS;\r | |
401 | $CM+ $NU;\r | |
402 | $CM+ $OP;\r | |
403 | $CM+ $PO;\r | |
404 | $CM+ $PR;\r | |
405 | $CM+ $QU;\r | |
406 | $CM+ $SY;\r | |
407 | $CM+ $WJ;\r | |
408 | $CM+;\r | |
409 | \r | |
410 | \r | |
411 | #\r | |
412 | # Sequences of the form (shown forwards)\r | |
413 | # [CANT_CM] <break> [CM] [whatever]\r | |
414 | # The CM needs to behave as an AL\r | |
415 | #\r | |
416 | $AL_FOLLOW $CM+ / (\r | |
417 | [$BK $CR $LF $NL $ZW {eof}] |\r | |
418 | $SP+ $CM+ $SP |\r | |
419 | $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}])); # if LB 14 will match, need to suppress this break.\r | |
420 | # LB 14 says OP SP* x .\r | |
421 | # becomes OP SP* x AL\r | |
422 | # becomes OP SP* x CM+ AL_FOLLOW\r | |
423 | #\r | |
424 | # Further note: the $AL in [$AL {eof}] is only to work around\r | |
425 | # a rule compiler bug which complains about\r | |
426 | # empty sets otherwise.\r | |
427 | \r | |
428 | #\r | |
429 | # Sequences of the form (shown forwards)\r | |
430 | # [CANT_CM] <break> [CM] <break> [PR]\r | |
431 | # The CM needs to behave as an AL\r | |
432 | # This rule is concerned with getting the second of the two <breaks> in place.\r | |
433 | #\r | |
434 | \r | |
435 | [$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];\r | |
436 | \r | |
437 | \r | |
438 | \r | |
439 | # LB 4, 5, 6\r | |
440 | \r | |
441 | $LB4Breaks [$LB4NonBreaks-$CM];\r | |
442 | $LB4Breaks $CM+ $CAN_CM;\r | |
443 | $LF $CR;\r | |
444 | \r | |
445 | \r | |
446 | # LB 7 x SP\r | |
447 | # x ZW\r | |
448 | [$SP $ZW] [$LB4NonBreaks-$CM];\r | |
449 | [$SP $ZW] $CM+ $CAN_CM;\r | |
450 | \r | |
451 | # LB 8 Break after zero width space (nothing explicit needed here)\r | |
452 | \r | |
453 | \r | |
454 | # LB 9, 10 Combining marks.\r | |
455 | # X $CM needs to behave like X, where X is not $SP or controls.\r | |
456 | # $CM not covered by the above needs to behave like $AL\r | |
457 | # Stick together any combining sequences that don't match other rules.\r | |
458 | $CM+ $CAN_CM;\r | |
459 | \r | |
460 | \r | |
461 | # LB 11 Do not break before or after WJ (rules written in reverse order)\r | |
462 | $CM* $WJ $CM* $CAN_CM;\r | |
463 | $CM* $WJ [$LB8NonBreaks-$CM];\r | |
464 | \r | |
465 | $CANT_CM $CM* $WJ;\r | |
466 | $CM* $CAN_CM $CM* $WJ;\r | |
467 | \r | |
468 | # LB 12 Do not break before or after GL\r | |
469 | # x GL\r | |
470 | #\r | |
471 | $CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];\r | |
472 | \r | |
473 | #\r | |
474 | # GL x\r | |
475 | #\r | |
476 | $CANT_CM $CM* $GL;\r | |
477 | $CM* $CAN_CM $CM* $GL;\r | |
478 | \r | |
479 | \r | |
480 | # LB 13 x (CL | EX | IS | SY)\r | |
481 | $CL $CM+ $CAN_CM;\r | |
482 | $EX $CM+ $CAN_CM;\r | |
483 | $IS $CM+ $CAN_CM;\r | |
484 | $SY $CM+ $CAN_CM;\r | |
485 | \r | |
486 | $CL [$LB8NonBreaks-$CM];\r | |
487 | $EX [$LB8NonBreaks-$CM];\r | |
488 | $IS [$LB8NonBreaks-$CM];\r | |
489 | $SY [$LB8NonBreaks-$CM];\r | |
490 | \r | |
491 | # LB 13 & 14 taken together for an edge case.\r | |
492 | # Match this, shown forward\r | |
493 | # OP SP+ ($CM+ behaving as $AL) (CL | EX | IS | SY)\r | |
494 | # This really wants to chain at the $CM+ (which is acting as an $AL)\r | |
495 | # except for $CM chaining being disabled.\r | |
496 | [$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP; \r | |
497 | \r | |
498 | # LB 14 OP SP* x\r | |
499 | #\r | |
500 | $CM* $CAN_CM $SP* $CM* $OP;\r | |
501 | $CANT_CM $SP* $CM* $OP;\r | |
502 | $AL_FOLLOW? $CM+ $SP $SP* $CM* $OP; # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP\r | |
503 | \r | |
504 | $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;\r | |
505 | $CM* $AL_FOLLOW_CM $CM+ $SP+ $CM* $OP;\r | |
506 | $SY $CM $SP+ $OP; # TODO: Experiment. Remove.\r | |
507 | \r | |
508 | \r | |
509 | \r | |
510 | # LB 15 QU SP* x OP (shown forwards)\r | |
511 | $CM* $OP $SP* $CM* $QU;\r | |
512 | \r | |
513 | # LB 16 CL SP* x NS (shown forwards)\r | |
514 | $CM* $NS $SP* $CM* $CL;\r | |
515 | \r | |
516 | # LB 17 B2 SP* x B2 (shown forwards)\r | |
517 | $CM* $B2 $SP* $CM* $B2;\r | |
518 | \r | |
519 | # LB 18 break after spaces\r | |
520 | # Nothing explicit needed here.\r | |
521 | \r | |
522 | \r | |
523 | #\r | |
524 | # LB 19 Do not break before or after QU\r | |
525 | #\r | |
526 | $CM* $QU $CM* $CAN_CM; # . x QU\r | |
527 | $CM* $QU $LB18NonBreaks;\r | |
528 | \r | |
529 | \r | |
530 | $CM* $CAN_CM $CM* $QU; # QU x .\r | |
531 | $CANT_CM $CM* $QU;\r | |
532 | \r | |
533 | #\r | |
534 | # LB 20 Break before and after CB.\r | |
535 | # Nothing explicit needed here.\r | |
536 | #\r | |
537 | \r | |
538 | # LB 21 x (BA | HY | NS) and BB x\r | |
539 | $CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM]; # . x (BA | HY | NS)\r | |
540 | \r | |
541 | $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x .\r | |
542 | [^$CB] $CM* $BB; # BB x (anything but CB)\r | |
543 | \r | |
544 | \r | |
545 | \r | |
546 | # LB 22 (AL | ID | IN | NU) x IN (shown forwards)\r | |
547 | $CM* $IN $CM* $ALPlus;\r | |
548 | $CM* $IN $CM* $ID;\r | |
549 | $CM* $IN $CM* $IN;\r | |
550 | $CM* $IN $CM* $NU;\r | |
551 | \r | |
552 | # LB 23 ID x PO, AL x NU, NU x AL (shown forwards)\r | |
553 | $CM* $PO $CM* $ID;\r | |
554 | $CM* $NU $CM* $ALPlus;\r | |
555 | $CM* $ALPlus $CM* $NU;\r | |
556 | \r | |
557 | # LB 24 PR x ID, PR x AL, PO x AL (shown forwards)\r | |
558 | $CM* $ID $CM* $PR;\r | |
559 | $CM* $ALPlus $CM* $PR;\r | |
560 | $CM* $ALPlus $CM* $PO;\r | |
561 | \r | |
562 | \r | |
563 | # LB 25 Numbers.\r | |
564 | ($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;\r | |
565 | \r | |
566 | # LB 26 Do not break a Korean syllable\r | |
567 | $CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;\r | |
568 | $CM* ($JT | $JV) $CM* ($H2 | $JV);\r | |
569 | $CM* $JT $CM* ($H3 | $JT);\r | |
570 | \r | |
571 | # LB 27 Korean syllable blocks with IN, PO and PR\r | |
572 | $CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);\r | |
573 | $CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);\r | |
574 | $CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;\r | |
575 | \r | |
576 | # LB 28 Do not break between alphabetics\r | |
577 | $CM* $ALPlus $CM* $ALPlus;\r | |
578 | \r | |
579 | \r | |
580 | # LB 29 IS x AL (shown forwards)\r | |
581 | $CM* $ALPlus $CM* $IS;\r | |
582 | \r | |
583 | # LB 30 (AL | NU) x OP, CL x (AL | NU) (shown forwards)\r | |
584 | $CM* $OP $CM* ($NU | $ALPlus);\r | |
585 | $CM* ($NU | $ALPlus) $CM* $CL;\r | |
586 | \r | |
587 | \r | |
588 | ## -------------------------------------------------\r | |
589 | \r | |
590 | !!safe_reverse;\r | |
591 | \r | |
592 | # LB 9, 10 Combining marks (numbered to match the !!forward and !!reverse sections)\r | |
593 | $CM+ [^$CM $BK $CR $LF $NL $ZW $SP];\r | |
594 | $CM+ $SP / .;\r | |
595 | \r | |
596 | # LB 14 OP SP* x\r | |
597 | $SP+ $CM* $OP;\r | |
598 | \r | |
599 | # LB 15 QU SP* x OP\r | |
600 | $SP+ $CM* $QU;\r | |
601 | \r | |
602 | # LB 16, 17 CL SP* x NS, B2 SP* x B2\r | |
603 | $SP+ $CM* $CL;\r | |
604 | $SP+ $CM* $B2;\r | |
605 | \r | |
606 | # LB 25 Numbers\r | |
607 | ($CM* ($IS | $SY))+ $CM* $NU;\r | |
608 | $CL $CM* ($NU | $IS | $SY);\r | |
609 | \r | |
610 | # For dictionary-based break\r | |
611 | $dictionary $dictionary;\r | |
612 | \r | |
613 | ## -------------------------------------------------\r | |
614 | \r | |
615 | !!safe_forward;\r | |
616 | \r | |
617 | # Skip forward over all character classes that are involved in\r | |
618 | # rules containing patterns with possibly more than one char\r | |
619 | # of context.\r | |
620 | #\r | |
621 | # It might be slightly more efficient to have specific rules\r | |
622 | # instead of one generic one, but only if we could\r | |
623 | # turn off rule chaining. We don't want to move more\r | |
624 | # than necessary.\r | |
625 | # NOTE(review): unlike the first set, the terminating negated set does not list $SP -- confirm this is intended.\r | |
626 | [$CM $OP $QU $CL $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $dictionary];\r | |
627 | $dictionary $dictionary;\r | |
628 | \r |