icuSources/data/brkitr/rules/line_loose_cj.txt

   1 # Copyright (c) 2002-2016  International Business Machines Corporation and
   2 # others. All Rights Reserved.
   3 #
   4 #  file:  line_loose_cj.txt
   5 #
   6 #         Line Breaking Rules
   7 #         Implement default line breaking as defined by
   8 #         Unicode Standard Annex #14 Revision 35 for Unicode 8.0
   9 #         http://www.unicode.org/reports/tr14/
  10 #
  11 #         Includes the Emoji breaking proposals from Unicode L2/16-011R3.
  12 #         http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
  13 #
  14 #         tailored as noted in the 2nd paragraph below.
  15 #
  16 #         TODO:  Rule LB 8 remains as it was in Unicode 5.2
  17 #         This is only because of a limitation of ICU break engine implementation,
  18 #         not because the older behavior is desirable.
  19 #
  20 #         This tailors the line break behavior to correspond to CSS
  21 #         line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
  22 #         It sets characters of class CJ to behave like ID.
  23 #         In addition, it allows breaks:
  24 #         * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
  25 #         * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
  26 #         * between characters of LineBreak class IN such as 2026
  27 #         * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
  28 #           FF65 (all NS) and FF01, FF1F (both EX).
  29 #         * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
  30 #           this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
  31 #         * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
  32 #           this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
  33
  34
  35 #
  36 #  Character Classes defined by TR 14.
  37 #
  38
  39 !!chain;
  40
  41 !!lookAheadHardBreak;
  42 #
  43 #  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
  44 #                          and only used for the line break rules.
  45 #
  46 #           It is used in the implementation of rule LB 10
  47 #           which says to treat any combining mark that is not attached to a base
  48 #           character as if it were of class AL  (alphabetic).
  49 #
  50 #           The problem occurs in the reverse rules.
  51 #
  52 #           Consider a sequence like, with correct breaks as shown
  53 #               LF  ID  CM  AL  AL
  54 #                  ^       ^       ^
  55 #           Then consider the sequence without the initial ID (ideographic)
  56 #                 LF  CM  AL  AL
  57 #                    ^           ^
  58 #           Our CM, which in the first example was attached to the ideograph,
  59 #           is now unattached, becomes an alpha, and joins in with the other
  60 #           alphas.
  61 #
  62 #           When iterating forwards, these sequences do not present any problems
  63 #           When iterating backwards, we need to look ahead when encountering
  64 #           a CM to see whether it attaches to something further on or not.
  65 #           (Look-ahead in a reverse rule is looking towards the start)
  66 #
  67 #           If the CM is unattached, we need to force a break.
  68 #
  69 #           !!lookAheadHardBreak forces the run time state machine to
  70 #           stop immediately when a look ahead rule ( '/' operator) matches,
  71 #           and set the match position to that of the look-ahead operator,
  72 #           no matter what other rules may be in play at the time.
  73 #
  74 #           See the first look-ahead ('/') rule in the reverse section (unattached CM behaving as AL) for an example.
  75 #
  76
  77 # Temporary definitions of Emoji Base and Emoji Modifiers, until properties are available.
  78
  79 $EB = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
  80 $EM = [\U0001F3FB-\U0001F3FF];
  81
  82 $AI = [:LineBreak =  Ambiguous:];
  83 $AL = [[:LineBreak =  Alphabetic:] - [$EM\u2764]];
  84 $BAX = [\u2010 \u2013];
  85 $BA = [[:LineBreak =  Break_After:] - $BAX];
  86 $BB = [:LineBreak =  Break_Before:];
  87 $BK = [:LineBreak =  Mandatory_Break:];
  88 $B2 = [:LineBreak =  Break_Both:];
  89 $CB = [:LineBreak =  Contingent_Break:];
  90 $CJ = [:LineBreak =  Conditional_Japanese_Starter:];
  91 $CL = [:LineBreak =  Close_Punctuation:];
  92 $CM = [[:LineBreak =  Combining_Mark:] \u200d];
  93 $CP = [:LineBreak =  Close_Parenthesis:];
  94 $CR = [:LineBreak =  Carriage_Return:];
  95 $EXX = [\uFF01 \uFF1F];
  96 $EX = [[:LineBreak =  Exclamation:] - $EXX];
  97 $GL = [:LineBreak =  Glue:];
  98 $HL = [:LineBreak =  Hebrew_Letter:];
  99 $HY = [:LineBreak =  Hyphen:];
 100 $H2 = [:LineBreak =  H2:];
 101 $H3 = [:LineBreak =  H3:];
 102 $ID = [[:LineBreak =  Ideographic:] $CJ [\u2764] - $EB];
 103 $IN = [:LineBreak =  Inseperable:];
 104 $IS = [:LineBreak =  Infix_Numeric:];
 105 $JL = [:LineBreak =  JL:];
 106 $JV = [:LineBreak =  JV:];
 107 $JT = [:LineBreak =  JT:];
 108 $LF = [:LineBreak =  Line_Feed:];
 109 $NL = [:LineBreak =  Next_Line:];
 110 $NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
 111 $NS = [[:LineBreak =  Nonstarter:] - $NSX];
 112 $NU = [:LineBreak =  Numeric:];
 113 $OP = [:LineBreak =  Open_Punctuation:];
 114 $POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
 115 $PO = [[:LineBreak =  Postfix_Numeric:] - $POX];
 116 $PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
 117 $PR = [[:LineBreak =  Prefix_Numeric:] - $PRX];
 118 $QU = [:LineBreak =  Quotation:];
 119 $RI = [:LineBreak =  Regional_Indicator:];
 120 $SA = [:LineBreak =  Complex_Context:];
 121 $SG = [:LineBreak =  Surrogate:];
 122 $SP = [:LineBreak =  Space:];
 123 $SY = [:LineBreak =  Break_Symbols:];
 124 $WJ = [:LineBreak =  Word_Joiner:];
 125 $XX = [:LineBreak =  Unknown:];
 126 $ZW = [:LineBreak =  ZWSpace:];
 127 $ZWJ = [\u200d];
 128
 129 #   Dictionary character set, for triggering language-based break engines. Currently
 130 #   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
 131 #   5.0 or later as the definition of Complex_Context was corrected to include all
 132 #   characters requiring dictionary break.
 133
 134 $dictionary = [:LineBreak = Complex_Context:];
 135
 136 #
 137 #  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
 138 #                               SA  (South East Asian: Thai, Lao, Khmer)
 139 #                               SG  (Unpaired Surrogates)
 140 #                               XX  (Unknown, unassigned)
 141 #                         as $AL  (Alphabetic)
 142 #
 143 $ALPlus = [$AL $AI $SA $SG $XX];
 144
 145 #
 146 #  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB6.
 147 #
 148 $ALcm = $ALPlus $CM*;
 149 $BAcm = $BA $CM*;
 150 $BAXcm = $BAX $CM*;
 151 $BBcm = $BB $CM*;
 152 $B2cm = $B2 $CM*;
 153 $CLcm = $CL $CM*;
 154 $CPcm = $CP $CM*;
 155 $EXcm = $EX $CM*;
 156 $EXXcm = $EXX $CM*;
 157 $GLcm = $GL $CM*;
 158 $HLcm = $HL $CM*;
 159 $HYcm = $HY $CM*;
 160 $H2cm = $H2 $CM*;
 161 $H3cm = $H3 $CM*;
 162 $INcm = $IN $CM*;
 163 $IScm = $IS $CM*;
 164 $JLcm = $JL $CM*;
 165 $JVcm = $JV $CM*;
 166 $JTcm = $JT $CM*;
 167 $NScm = $NS $CM*;
 168 $NSXcm = $NSX $CM*;
 169 $NUcm = $NU $CM*;
 170 $OPcm = $OP $CM*;
 171 $POcm = $PO $CM*;
 172 $POXcm = $POX $CM*;
 173 $PRcm = $PR $CM*;
 174 $PRXcm = $PRX $CM*;
 175 $QUcm = $QU $CM*;
 176 $RIcm = $RI $CM*;
 177 $SYcm = $SY $CM*;
 178 $WJcm = $WJ $CM*;
 179
 180 ## -------------------------------------------------
 181
 182 !!forward;
 183
 184 #
 185 #  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
 186 #
 187 $ALPlus $CM+;
 188 $BA $CM+;
 189 $BAX $CM+;
 190 $BB $CM+;
 191 $B2 $CM+;
 192 $CL $CM+;
 193 $CP $CM+;
 194 $EB $CM+;
 195 $EM $CM+;
 196 $EX $CM+;
 197 $EXX $CM+;
 198 $GL $CM+;
 199 $HL $CM+;
 200 $HY $CM+;
 201 $H2 $CM+;
 202 $H3 $CM+;
 203 $ID $CM+;
 204 $IN $CM+;
 205 $IS $CM+;
 206 $JL $CM+;
 207 $JV $CM+;
 208 $JT $CM+;
 209 $NS $CM+;
 210 $NSX $CM+;
 211 $NU $CM+;
 212 $OP $CM+;
 213 $PO $CM+;
 214 $POX $CM+;
 215 $PR $CM+;
 216 $PRX $CM+;
 217 $QU $CM+;
 218 $RI $CM+;
 219 $SY $CM+;
 220 $WJ $CM+;
 221
 222 #
 223 # CAN_CM  is the set of characters that may combine with CM combining chars.
 224 #         Note that Linebreak UAX 14's concept of a combining char and the rules
 225 #         for what they can combine with are _very_ different from the rest of Unicode.
 226 #
 227 #         Note that $CM itself is left out of this set.  If CM is needed as a base
 228 #         it must be listed separately in the rule.
 229 #
 230 $CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
 231 $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs
 232
 233 #
 234 # AL_FOLLOW  set of chars that can unconditionally follow an AL
 235 #            Needed in rules where stand-alone $CM s are treated as AL.
 236 #            Chaining is disabled with CM because it causes other failures,
 237 #            so for this one case we need to manually list out longer sequences.
 238 #
 239 $AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
 240 $AL_FOLLOW_CM   = [$CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
 241 $AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];
 242
 243
 244 #
 245 #  Rule LB 4, 5    Mandatory (Hard) breaks.
 246 #
 247 $LB4Breaks    = [$BK $CR $LF $NL];
 248 $LB4NonBreaks = [^$BK $CR $LF $NL $CM];
 249 $CR $LF {100};
 250
 251 #
 252 #  LB 6    Do not break before hard line breaks.
 253 #
 254 $LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
 255 $CAN_CM $CM*    $LB4Breaks {100};
 256 ^$CM+           $LB4Breaks {100};    # by rule 10, stand-alone CM behaves as AL
 257
 258 # LB 7         x SP
 259 #              x ZW
 260 $LB4NonBreaks [$SP $ZW];
 261 $CAN_CM $CM*  [$SP $ZW];
 262 ^$CM+         [$SP $ZW];
 263
 264 #
 265 # LB 8         Break after zero width space
 266 #              TODO:  ZW SP* <break>
 267 #              An engine change is required to write the reverse rule for this.
 268 #              For now, leave the Unicode 5.2 rule, ZW <break>
 269 #
 270 $LB8Breaks    = [$LB4Breaks $ZW];
 271 $LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
 272
 273 # LB 8a        ZWJ x ID    Emoji proposal.
 274 #
 275 $ZWJ ($ID | $EB | $EM);
 276
 277 # LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
 278 #                                $CM not covered by the above needs to behave like $AL
 279 #                                See definition of $CAN_CM.
 280
 281 $CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
 282 ^$CM+;
 283
 284 #
 285 # LB 11  Do not break before or after WORD JOINER & related characters.
 286 #
 287 $CAN_CM $CM*  $WJcm;
 288 $LB8NonBreaks $WJcm;
 289 ^$CM+         $WJcm;
 290
 291 $WJcm $CANT_CM;
 292 $WJcm $CAN_CM $CM*;
 293
 294 #
 295 # LB 12  Do not break after NBSP and related characters.
 296 #         GL  x
 297 #
 298 $GLcm $CAN_CM $CM*;
 299 $GLcm $CANT_CM;
 300
 301 #
 302 # LB 12a  Do not break before NBSP and related characters ...
 303 #            [^SP BA HY] x GL
 304 #
 305 [[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GLcm;
 306 ^$CM+ $GLcm;
 307
 308
 309 #
 310 # LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
 311 #
 312 # Do not include $EXX here
 313 $LB8NonBreaks $CL;
 314 $CAN_CM $CM*  $CL;
 315 ^$CM+         $CL;              # by rule 10, stand-alone CM behaves as AL
 316
 317 $LB8NonBreaks $CP;
 318 $CAN_CM $CM*  $CP;
 319 ^$CM+         $CP;              # by rule 10, stand-alone CM behaves as AL
 320
 321 $LB8NonBreaks $EX;
 322 $CAN_CM $CM*  $EX;
 323 ^$CM+         $EX;              # by rule 10, stand-alone CM behaves as AL
 324
 325 $LB8NonBreaks $IS;
 326 $CAN_CM $CM*  $IS;
 327 ^$CM+         $IS;              # by rule 10, stand-alone CM behaves as AL
 328
 329 $LB8NonBreaks $SY;
 330 $CAN_CM $CM*  $SY;
 331 ^$CM+         $SY;              # by rule 10, stand-alone CM behaves as AL
 332
 333
 334 #
 335 # LB 14  Do not break after OP, even after spaces
 336 #
 337 $OPcm $SP* $CAN_CM $CM*;
 338 $OPcm $SP* $CANT_CM;
 339
 340 $OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL
 341
 342 # LB 15
 343 $QUcm $SP* $OPcm;
 344
 345 # LB 16
 346 # Do not break between closing punctuation and $NS, even with intervening spaces
 347 # But DO allow a break between closing punctuation and $NSX, don't include it here
 348 ($CLcm | $CPcm) $SP* $NScm;
 349
 350 # LB 17
 351 $B2cm $SP* $B2cm;
 352
 353 #
 354 # LB 18  Break after spaces.
 355 #
 356 $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
 357 $LB18Breaks    = [$LB8Breaks $SP];
 358
 359
 360 # LB 19
 361 #         x QU
 362 $LB18NonBreaks $CM* $QUcm;
 363 ^$CM+               $QUcm;
 364
 365 #         QU  x
 366 $QUcm .?;
 367
 368
 369 # LB 20
 370 #        <break>  $CB
 371 #        $CB   <break>
 372
 373 $LB20NonBreaks = [$LB18NonBreaks - $CB];
 374
 375 # LB 21        x   (BA | HY | NS)
 376 #           BB x
 377 #
 378 # DO allow breaks here before $BAXcm and $NSXcm, so don't include them
 379 $LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);
 380 ^$CM+ ($BAcm | $HYcm | $NScm);
 381
 382 $BBcm [^$CB];                                  #  $BB  x
 383 $BBcm $LB20NonBreaks $CM*;
 384
 385 # LB 21a Don't break after Hebrew + Hyphen
 386 #   HL (HY | BA) x
 387 #
 388 $HLcm ($HYcm | $BAcm | $BAXcm) [^$CB]?;
 389
 390 # LB 21b (forward) Don't break between SY and HL
 391 # (break between HL and SY already disallowed by LB 13 above)
 392 $SYcm $HLcm;
 393
 394 # LB 22
 395 ($ALcm | $HLcm) $INcm;
 396 ^$CM+    $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
 397 $EXcm    $INcm;
 398 ($ID | $EB | $EM) $CM*  $INcm;
 399 # $INcm  $INcm; # delete this rule for CSS loose
 400 $NUcm    $INcm;
 401
 402
 403 # LB 23
 404 # Do not include $POX here
 405 ($ID | $EB | $EM) $CM*  $POcm;
 406 $ALcm  $NUcm;       # includes $LB19
 407 $HLcm  $NUcm;
 408 ^$CM+  $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
 409 $NUcm  $ALcm;
 410 $NUcm  $HLcm;
 411
 412 #
 413 # LB 24
 414 #
 415 # Do not include $PRX here
 416 $PRcm ($ID | $EB | $EM);
 417 $PRcm ($ALcm | $HLcm);
 418 ($POcm | $POXcm) ($ALcm | $HLcm);
 419 ($ALcm | $HLcm) ($PRcm | $POcm | $POXcm);
 420 ^$CM+ ($PRcm | $POcm | $POXcm);       # Rule 10, any otherwise unattached CM behaves as AL
 421
 422 #
 423 # LB 25   Numbers.
 424 #
 425 # Here do not include $PRX at the beginning or $POX at the end
 426 ($PRcm | $POcm | $POXcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $PRXcm | $POcm)?;
 427
 428 # LB 26  Do not break a Korean syllable
 429 #
 430 $JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
 431 ($JVcm | $H2cm) ($JVcm | $JTcm);
 432 ($JTcm | $H3cm) $JTcm;
 433
 434 # LB 27  Treat a Korean syllable block the same as ID  (don't break it)
 435 # Do not include $POX or $PRX here
 436 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
 437 ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
 438 $PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);
 439
 440
 441 # LB 28   Do not break between alphabetics
 442 #
 443 ($ALcm | $HLcm) ($ALcm | $HLcm);
 444 ^$CM+ ($ALcm | $HLcm);      # The $CM+ is from rule 10, an unattached CM is treated as AL
 445
 446 # LB 29
 447 $IScm ($ALcm | $HLcm);
 448
 449 # LB 30
 450 ($ALcm | $HLcm | $NUcm) $OPcm;
 451 ^$CM+ $OPcm;         # The $CM+ is from rule 10, an unattached CM is treated as AL.
 452 $CPcm ($ALcm | $HLcm | $NUcm);
 453
 454 # LB 30a  Do not break between regional indicators. Break after pairs of them.
 455 #         Tricky interaction with LB8a: ZWJ x ID
 456 $RIcm $RI                / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
 457 $RIcm $RI $CM*  $ZWJ     / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
 458 $RIcm $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
 459
 460 $RIcm $RIcm [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
 461 $RIcm $RIcm $ZWJ ($ID | $EB | $EM);
 462
 463 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
 464 $EB $CM* $EM;
 465
 466 #
 467 #  Reverse Rules.
 468 #
 469 ## -------------------------------------------------
 470
 471 !!reverse;
 472
 473 ^$CM+ $ALPlus;
 474 ^$CM+ $BA;
 475 ^$CM+ $BAX;
 476 ^$CM+ $BB;
 477 ^$CM+ $B2;
 478 ^$CM+ $CL;
 479 ^$CM+ $CP;
 480 ^$CM+ $EB;
 481 ^$CM+ $EM;
 482 ^$CM+ $EX;
 483 ^$CM+ $EXX;
 484 ^$CM+ $GL;
 485 ^$CM+ $HL;
 486 ^$CM+ $HY;
 487 ^$CM+ $H2;
 488 ^$CM+ $H3;
 489 ^$CM+ $ID;
 490 ^$CM+ $IN;
 491 ^$CM+ $IS;
 492 ^$CM+ $JL;
 493 ^$CM+ $JV;
 494 ^$CM+ $JT;
 495 ^$CM+ $NS;
 496 ^$CM+ $NSX;
 497 ^$CM+ $NU;
 498 ^$CM+ $OP;
 499 ^$CM+ $PO;
 500 ^$CM+ $POX;
 501 ^$CM+ $PR;
 502 ^$CM+ $PRX;
 503 ^$CM+ $QU;
 504 ^$CM+ $RI;
 505 ^$CM+ $SY;
 506 ^$CM+ $WJ;
 507 ^$CM+;
 508
 509
 510 #
 511 #  Sequences of the form  (shown forwards)
 512 #      [CANT_CM]  <break>  [CM]  [whatever]
 513 #  The CM needs to behave as an AL
 514 #
 515 $AL_FOLLOW $CM+ / (
 516           [$BK $CR $LF $NL $ZW {eof}] |
 517           $SP+ $CM+ $SP |
 518           $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
 519                                                #  LB14 says    OP SP* x .
 520                                                #    becomes    OP SP* x AL
 521                                                #    becomes    OP SP* x CM+ AL_FOLLOW
 522                                                #
 523                                                # Further note:  the $AL in [$AL {eof}] is only to work around
 524                                                #                a rule compiler bug which complains about
 525                                                #                empty sets otherwise.
 526
 527 #
 528 #  Sequences of the form  (shown forwards)
 529 #      [CANT_CM]  <break> [CM]  <break>  [PR]
 530 #  The CM needs to behave as an AL
 531 #  This rule is concerned about getting the second of the two <breaks> in place.
 532 #
 533
 534 # Apple early addition, remove $PR from this, superseded by LB24
 535 # [$PR $PRX  ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
 536 [$PRX  ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];
 537
 538
 539
 540 # LB 4, 5, 6
 541
 542 $LB4Breaks [$LB4NonBreaks-$CM];
 543 $LB4Breaks $CM+ $CAN_CM;
 544 $LF $CR;
 545
 546
 547 # LB 7         x SP
 548 #              x ZW
 549 [$SP $ZW] [$LB4NonBreaks-$CM];
 550 [$SP $ZW] $CM+ $CAN_CM;
 551
 552 # LB 8 ZW SP* <break>
 553 #     TODO: to implement this, we need more than one look-ahead hard break in play at a time.
 554 #           Requires an engine enhancement.
 555 #   / $SP* $ZW
 556
 557 # LB 8a        ZWJ x ID    Unicode Emoji proposal L2/16-011R3
 558 #                          The ZWJ will look like a CM to whatever precedes it.
 559 #
 560 ($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
 561
 562
 563 # LB 9,10  Combining marks.
 564 #    X   $CM needs to behave like X, where X is not $SP or controls.
 565 #    $CM not covered by the above needs to behave like $AL
 566 # Stick together any combining sequences that don't match other rules.
 567 ^$CM+ $CAN_CM;
 568
 569
 570 # LB 11
 571 #
 572 $WJ $CM* $CAN_CM;
 573 $WJ      [$LB8NonBreaks-$CM];
 574
 575      $CANT_CM $CM* $WJ;
 576 $CAN_CM  $CM* $WJ;
 577
 578 # LB 12a
 579 #      [^SP BA HY] x GL
 580 #
 581 $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $BAX $HY]];
 582
 583 # LB 12
 584 #     GL  x
 585 #
 586 $CANT_CM $CM* $GL;
 587 $CAN_CM $CM* $GL;
 588
 589
 590 # LB 13
 591 # Do not include $EXX here
 592 $CL $CM+ $CAN_CM;
 593 $CP $CM+ $CAN_CM;
 594 $EX $CM+ $CAN_CM;
 595 $IS $CM+ $CAN_CM;
 596 $SY $CM+ $CAN_CM;
 597
 598 $CL [$LB8NonBreaks-$CM];
 599 $CP [$LB8NonBreaks-$CM];
 600 $EX [$LB8NonBreaks-$CM];
 601 $IS [$LB8NonBreaks-$CM];
 602 $SY [$LB8NonBreaks-$CM];
 603
 604 # Rule 13 & 14 taken together for an edge case.
 605 #   Match this, shown forward
 606 #     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
 607 #   This really wants to chain at the $CM+ (which is acting as an $AL)
 608 #   except for $CM chaining being disabled.
 609 [$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;
 610
 611 # LB 14    OP SP* x
 612 #
 613 $CAN_CM    $SP* $CM* $OP;
 614      $CANT_CM   $SP* $CM* $OP;
 615 $AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP
 616
 617      $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
 618 $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
 619
 620
 621 # LB 15
 622 $OP $SP* $CM* $QU;
 623
 624 # LB 16
 625 # Don't include $NSX here
 626 $NS $SP* $CM* ($CL | $CP);
 627
 628 # LB 17
 629 $B2 $SP* $CM* $B2;
 630
 631 # LB 18  break after spaces
 632 #        Nothing explicit needed here.
 633
 634
 635 #
 636 # LB 19
 637 #
 638 $QU $CM* $CAN_CM;                                #   . x QU
 639 $QU      $LB18NonBreaks;
 640
 641
 642 $CAN_CM  $CM* $QU;                               #   QU x .
 643      $CANT_CM $CM* $QU;
 644
 645 #
 646 #  LB 20  Break before and after CB.
 647 #         nothing needed here.
 648 #
 649
 650 # LB 21
 651 # Don't include $BAX or $NSX here
 652 ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)
 653
 654 [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
 655 [^$CB] $CM* $BB;                                      #
 656
 657 # LB21a
 658 [^$CB]? $CM* ($HY | $BA | $BAX) $CM* $HL;
 659
 660 # LB21b (reverse)
 661 $HL $CM* $SY;
 662
 663 # LB 22
 664 $IN $CM* ($ALPlus | $HL);
 665 $IN $CM* $EX;
 666 $IN $CM* ($ID | $EB | $EM);
 667 # $IN $CM* $IN; # delete this rule for CSS loose
 668 $CM* $IN $CM* $NU;
 669
 670 # LB 23
 671 # Do not include $POX here
 672 $PO $CM* ($ID | $EB | $EM);
 673 $NU $CM* ($ALPlus | $HL);
 674 ($ALPlus | $HL) $CM* $NU;
 675
 676 # LB 24
 677 # Do not include $PRX here
 678 ($ID | $EB | $EM) $CM* $PR;
 679 ($ALPlus | $HL) $CM* $PR;
 680 ($ALPlus | $HL) $CM* ($PO | $POX);
 681 $CM* ($PR | $PO | $POX) $CM* ($ALPlus | $HL);
 682 $CM* ($PR | $PO | $POX) $CM+ / [$BK $CR $LF $NL $ZW $SP {eof}];
 683
 684 # LB 25
 685 # Here do not include $POX at the beginning or $PRX at the end
 686 ($CM* ($PR | $PRX | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO | $POX))?;
 687
 688 # LB 26
 689 ($H3 | $H2 | $JV | $JL) $CM* $JL;
 690 ($JT | $JV) $CM* ($H2 | $JV);
 691 $JT $CM* ($H3 | $JT);
 692
 693 # LB 27
 694 # Do not include $POX or $PRX here
 695 $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
 696 $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
 697 ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;
 698
 699 # LB 28
 700 ($ALPlus | $HL) $CM* ($ALPlus | $HL);
 701
 702
 703 # LB 29
 704 ($ALPlus | $HL) $CM* $IS;
 705
 706 # LB 30
 707 $OP $CM* ($ALPlus | $HL | $NU);
 708 ($ALPlus | $HL | $NU) $CM* $CP;
 709
 710 # LB 30a
 711 #    Pairs of Regional Indicators.
 712 #    The following two rules are nearly identical. The first matches only sequences with an odd number of adjacent RIs,
 713 #    the second with an even number. Stripping away the cruft they look like
 714 #         [^RI] RI / (RI RI)+ ^RI;
 715 #         [^RI] RI RI / (RI RI)+ ^RI;
 716 #
 717 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
 718 [{bof} $NS $HY $BA $QU $CL $CP $EX $IS $SY $WJ $GL $ZW $SP $BK $CR $LF $NL $ZWJ] $CM* $RI $CM* $RI / ($CM* $RI $CM* $RI)+ $CM* [{eof}[^$RI $CM]];
 719
 720 # In general, adjacent RIs stay together. The hard-break rules, above, override this, forcing in the boundaries between pairs.
 721 $RI $CM* $RI;
 722
 723 #    WJ, GL, QU, etc. are classes with rules like "WJ x "   which includes "WJ x RI".
 724 $RI $CM* ([$WJ $GL $QU $BB] |  (($HY | $BA)$CM* $HL));
 725
 726
 727 # LB 30b Do not break between an Emoji Base and an Emoji Modifier
 728 $EM $CM* $EB;
 729
 730
 731 ## -------------------------------------------------
 732
 733 !!safe_reverse;
 734
 735 # LB 9
 736 ^$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
 737 ^$CM+ $SP / .;
 738
 739 # LB 14
 740 $SP+ $CM* $OP;
 741
 742 # LB 15
 743 $SP+ $CM* $QU;
 744
 745 # LB 16
 746 $SP+ $CM* ($CL | $CP);
 747
 748 # LB 17
 749 $SP+ $CM* $B2;
 750
 751 # LB 21a
 752 $CM* ($HY | $BA | $BAX) $CM* $HL;
 753
 754 # LB 25
 755 ($CM* ($IS | $SY))+ $CM* $NU;
 756 ($CL | $CP) $CM* ($NU | $IS | $SY);
 757
 758 #  LB 30a
 759 ($CM* $RI)+;
 760
 761 # For dictionary-based break
 762 $dictionary $dictionary;
 763
 764 ## -------------------------------------------------
 765
 766 !!safe_forward;
 767
 768 # Skip forward over all character classes that are involved in
 769 #   rules containing patterns with possibly more than one char
 770 #   of context.
 771 #
 772 #  It might be slightly more efficient to have specific rules
 773 #  instead of one generic one, but only if we could
 774 #  turn off rule chaining.  We don't want to move more
 775 #  than necessary.
 776 #
 777 ^[$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $SP $RI $ZWJ $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $PRX $HY $BA $BAX $RI $ZWJ $dictionary];
 778 $dictionary $dictionary;
 779