icuSources/data/brkitr/rules/word.txt

   1 #
   2 # Copyright (C) 2002-2016, International Business Machines Corporation
   3 # and others. All Rights Reserved.
   4 #
   5 # file:  word.txt
   6 #
   7 # ICU Word Break Rules (modified from standard to remove colon from $MidLetter)
   8 #      See Unicode Standard Annex #29.
   9 #      These rules are based on UAX #29 Revision 27 for Unicode Version 8.0
  10 #      with additions from L2/16-011R3 for Emoji sequences.
  11 #
  12 # Note:  Updates to word.txt will usually need to be merged into
  13 #        word_POSIX.txt also.
  14
  15 ##############################################################################
  16 #
  17 #  Character class definitions from TR 29
  18 #
  19 ##############################################################################
  20
  21 !!chain;
  22
  23
  24 #
  25 #  Character Class Definitions.
  26 #
  27
  28 # $E_Base: emoji that may be followed by an $E_Modifier (see rule 13d). Apple $EmojiForMods becomes $E_Base here
  29 $E_Base      = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
  30 # $E_Modifier: U+1F3FB..U+1F3FF. Apple $EmojiMods becomes $E_Modifier here, same chars
  31 $E_Modifier  = [\U0001F3FB-\U0001F3FF];
  32 $ZWJ = [\u200D];    # U+200D ZERO WIDTH JOINER
  33 # $GAZ: used by rule 3c (ZWJ x GAZ). Apple $EmojiForSeqs becomes $GAZ here (only emoji that follow a ZWJ)
  34 $GAZ         = [\u2640\u2642\u2764\U0001F308\U0001F466-\U0001F469\U0001F48B\U0001F5E8];
  35
  36 $CR                 = [\p{Word_Break = CR}];
  37 $LF                 = [\p{Word_Break = LF}];
  38 $Newline            = [\p{Word_Break = Newline} ];
  39 $Extend             = [[\p{Word_Break = Extend}][:Block=Tags:]];       # Word_Break=Extend plus the Tags block
  40 $Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
  41 $Format             = [[\p{Word_Break = Format}] - [:Block=Tags:]];    # Word_Break=Format minus the Tags block
  42 $Katakana           = [\p{Word_Break = Katakana}];
  43 $Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
  44 $ALetter            = [\p{Word_Break = ALetter}];
  45 $Single_Quote       = [\p{Word_Break = Single_Quote}];
  46 $Double_Quote       = [\p{Word_Break = Double_Quote}];
  47 $MidNumLet          = [\p{Word_Break = MidNumLet}];
  48 $MidLetter          = [\p{Word_Break = MidLetter} - [\:]];             # colon removed from the standard set
  49 $MidNum             = [\p{Word_Break = MidNum}];
  50 $Numeric            = [\p{Word_Break = Numeric}];
  51 $ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
  52 # $Han and $Hiragana are combined with $Katakana into $KanaKanji further down.
  53 $Han                = [:Han:];
  54 $Hiragana           = [:Hiragana:];
  55
  56
  57 #   Dictionary character set, for triggering language-based break engines. It is the
  58 #   union of LineBreak=Complex_Context and the CJK sets defined below. Note that the
  59 #   Complex_Context part only works in Unicode 5.0 or later, as its definition was
  60 #   corrected to include all characters requiring dictionary break.
  61
  62 $Control        = [\p{Grapheme_Cluster_Break = Control}];
  63 $HangulSyllable = [\uac00-\ud7a3];                      # U+AC00..U+D7A3
  64 $ComplexContext = [:LineBreak = Complex_Context:];
  65 $KanaKanji      = [$Han $Hiragana $Katakana];
  66 $dictionaryCJK  = [$KanaKanji $HangulSyllable];
  67 $dictionary     = [$ComplexContext $dictionaryCJK];
  68
  69 # $ALetterPlus: $ALetter with the CJK dictionary characters left out, plus Complex_Context characters that are neither Extend nor Control
  70 $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
  71
  72
  73 #
  74 #  Rule 4     Ignore Format and Extend characters,
  75 #             except when they appear at the beginning of a region of text.
  76 #             Each ...Ex variable below is a base class followed by any run of Extend, Format or ZWJ.
  77 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
  78 $KatakanaEx           = $Katakana           ($Extend |  $Format | $ZWJ)*;
  79 $Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format | $ZWJ)*;
  80 $ALetterEx            = $ALetterPlus        ($Extend |  $Format | $ZWJ)*;    # built on $ALetterPlus, not $ALetter
  81 $Single_QuoteEx       = $Single_Quote       ($Extend |  $Format | $ZWJ)*;
  82 $Double_QuoteEx       = $Double_Quote       ($Extend |  $Format | $ZWJ)*;
  83 $MidNumLetEx          = $MidNumLet          ($Extend |  $Format | $ZWJ)*;
  84 $MidLetterEx          = $MidLetter          ($Extend |  $Format | $ZWJ)*;
  85 $MidNumEx             = $MidNum             ($Extend |  $Format | $ZWJ)*;
  86 $NumericEx            = $Numeric            ($Extend |  $Format | $ZWJ)*;
  87 $ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format | $ZWJ)*;
  88 $Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format | $ZWJ)*;
  89 # Same pattern for Hiragana and Ideographic characters.
  90 $Ideographic    = [\p{Ideographic}];
  91 $HiraganaEx     = $Hiragana     ($Extend |  $Format | $ZWJ)*;
  92 $IdeographicEx  = $Ideographic  ($Extend |  $Format | $ZWJ)*;
  93
  94 ## -------------------------------------------------
  95
  96 !!forward;
  97
  98
  99 # Rule 3 - CR x LF
 100 #
 101 $CR $LF;
 102
 103 # Rule 3c   ZWJ x GAZ.  Precedes WB4, so no intervening Extend chars allowed.
 104 #
 105 $ZWJ $GAZ;
 106
 107
 108 # Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
 109 #          of a region of Text.   The rule here comes into play when the start of text
 110 #          begins with a group of Format chars, or with a "word" consisting of a single
 111 #          char that is not in any of the listed word break categories followed by
 112 #          format char(s), or is not a CJK dictionary character.
 113 [^$CR $LF $Newline]? ($Extend |  $Format | $ZWJ)+;
 114 # Single-character matches (with trailing Extend/Format/ZWJ); the number in {} is the status value for the match.
 115 $NumericEx {100};
 116 $ALetterEx {200};
 117 $HangulSyllable {200};
 118 $Hebrew_LetterEx{200};
 119 $KatakanaEx {400};       # note:  these status values override those from rule 5
 120 $HiraganaEx {400};       #        by virtue of being numerically larger.
 121 $IdeographicEx {400};    #
 122 # Emoji characters with trailing Extend/Format/ZWJ; these rules carry no status tag.
 123 $E_Base ($Extend | $Format | $ZWJ)*;
 124 $E_Modifier ($Extend | $Format | $ZWJ)*;
 125 $GAZ ($Extend | $Format | $ZWJ)*;
 125 $GAZ ($Extend | $Format | $ZWJ)*;
 126
 127 #
 128 # rule 5
 129 #    Do not break between most letters.
 130 #
 131 ($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
 132
 133 # rule 6 and 7 - letter, then one MidLetter / MidNumLet / Single_Quote, then letter
 134 ($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
 135
 136 # rule 7a - Hebrew letter followed by Single_Quote
 137 $Hebrew_LetterEx $Single_QuoteEx {200};
 138
 139 # rule 7b and 7c - Hebrew letters on both sides of a Double_Quote
 140 $Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
 141
 142 # rule 8 - Numeric followed by Numeric
 143
 144 $NumericEx $NumericEx {100};
 145
 146 # rule 9 - letter followed by Numeric
 147
 148 ($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
 149
 150 # rule 10 - Numeric followed by letter
 151
 152 $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
 153
 154 # rule 11 and 12 - Numeric, then one MidNum / MidNumLet / Single_Quote, then Numeric
 155
 156 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
 157
 158 # rule 13 - Katakana followed by Katakana
 159 # to be consistent with $KanaKanji $KanaKanji, changed
 160 # from 300 to 400.
 161 # See also TestRuleStatus in intltest/rbbiapts.cpp
 162 $KatakanaEx  $KatakanaEx {400};
 163
 164 # rule 13a/b
 165 #    13a: no break before an ExtendNumLet that follows a letter, Numeric, Katakana or ExtendNumLet; 13b: no break after an ExtendNumLet that precedes a letter, Numeric or Katakana.
 166 $ALetterEx       $ExtendNumLetEx {200};    #  (13a)
 167 $Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
 168 $NumericEx       $ExtendNumLetEx {100};    #  (13a)
 169 $KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
 170 $ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
 171 #    Every operand uses its ...Ex form so that trailing Extend/Format/ZWJ are absorbed by the rule itself.
 172 $ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
 173 $ExtendNumLetEx  $Hebrew_LetterEx {200};   #  (13b)
 174 $ExtendNumLetEx  $NumericEx      {100};    #  (13b)
 175 $ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)
 176
 177 # rule 13c
 178 #    Pairs of Regional Indicators stay together.
 179 #    With rule chaining disabled by ^, this rule will match exactly two of them.
 180 #    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
 181 #
 182 ^$Regional_IndicatorEx $Regional_IndicatorEx;
 183
 184 # special handling for CJK characters: chain for later dictionary segmentation
 185 $HangulSyllable $HangulSyllable {200};
 186 $KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
 187
 188 # rule 13d
 189 #    E_Base x E_Modifier  ($GAZ is also accepted as the base; Format/Extend/ZWJ may sit in between)
 190 #
 191 ($E_Base | $GAZ) ($Format | $Extend | $ZWJ)* $E_Modifier;
 192
 193
 194 ## -------------------------------------------------
 195
 196 !!reverse;
 197 # $Back...Ex: mirror of the forward ...Ex variables - the Format/Extend/ZWJ run comes first, then the base class.
 198 $BackHebrew_LetterEx      = ($Format | $Extend | $ZWJ)* $Hebrew_Letter;
 199 $BackALetterEx            = ($Format | $Extend | $ZWJ)* $ALetterPlus;    # built on $ALetterPlus, like $ALetterEx
 200 $BackSingle_QuoteEx       = ($Format | $Extend | $ZWJ)* $Single_Quote;
 201 $BackDouble_QuoteEx       = ($Format | $Extend | $ZWJ)* $Double_Quote;
 202 $BackMidNumLetEx          = ($Format | $Extend | $ZWJ)* $MidNumLet;
 203 $BackNumericEx            = ($Format | $Extend | $ZWJ)* $Numeric;
 204 $BackMidNumEx             = ($Format | $Extend | $ZWJ)* $MidNum;
 205 $BackMidLetterEx          = ($Format | $Extend | $ZWJ)* $MidLetter;
 206 $BackKatakanaEx           = ($Format | $Extend | $ZWJ)* $Katakana;
 207 $BackHiraganaEx           = ($Format | $Extend | $ZWJ)* $Hiragana;
 208 $BackExtendNumLetEx       = ($Format | $Extend | $ZWJ)* $ExtendNumLet;
 209 $BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
 210
 211 # rule 3 - CR x LF, operands in reverse order
 212 $LF $CR;
 213
 214 # Rule 3c   ZWJ x GAZ.  Precedes WB4, so no intervening Extend chars allowed.
 215 #
 216 $GAZ $ZWJ;
 217
 218 # rule 4 - a run of Format/Extend/ZWJ, then at most one character that is not CR, LF or Newline
 219 ($Format | $Extend | $ZWJ)*  [^$CR $LF $Newline]?;
 220
 221 # rule 5 - letter next to letter
 222
 223 ($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
 224
 225 # rule 6 and 7 - letter, one MidLetter / MidNumLet / Single_Quote, letter
 226
 227 ($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
 228
 229 # rule 7a - Single_Quote then Hebrew letter (reverse of the forward rule)
 230 $BackSingle_QuoteEx $BackHebrew_LetterEx;
 231
 232 # Rule 7b and 7c - Hebrew letters on both sides of a Double_Quote
 233 $BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
 234
 235 # rule 8 - Numeric next to Numeric
 236
 237 $BackNumericEx $BackNumericEx;
 238
 239 # rule 9 - Numeric then letter (reverse of the forward rule)
 240
 241 $BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
 242
 243 # rule 10 - letter then Numeric (reverse of the forward rule)
 244
 245 ($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
 246
 247 # rule 11 and 12 - Numeric, one MidNum / MidNumLet / Single_Quote, Numeric
 248
 249 $BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
 250
 251 # rule 13 - Katakana next to Katakana
 252
 253 $BackKatakanaEx $BackKatakanaEx;
 254
 255 # rules 13 a/b
 256 #    ExtendNumLet on either side of a letter, Numeric or Katakana, or next to another ExtendNumLet
 257 $BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
 258 ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
 259
 260 # rule 13c - Regional_Indicator pairs; ^ disables chaining into these rules, as in the forward rule 13c.
 261 #    In ICU rule syntax '/' marks the break position; the context after it is any number of further Regional_Indicator pairs, then a character that is not Regional_Indicator/Format/Extend/ZWJ, or {eof}.
 262 ^$BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
 263         ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
 264 ^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
 265         ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
 266 # The same two patterns prefixed by $GAZ $ZWJ (these two rules have no ^).
 267 $GAZ $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
 268         ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
 269 $GAZ $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
 270         ($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
 271
 272 # special handling for CJK characters: chain for later dictionary segmentation
 273 $HangulSyllable $HangulSyllable;
 274 $KanaKanji $KanaKanji; #different rule status if both kanji and kana found
 275
 276 # rule 13d - E_Modifier, optional Format/Extend/ZWJ, then E_Base or GAZ (reverse of the forward rule)
 277
 278 $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $GAZ);
 279
 280
 281
 282 ## -------------------------------------------------
 283
 284 !!safe_reverse;
 285
 286 # rule 4 - a run of Extend/Format/ZWJ followed by at most one further character
 287 ($Extend | $Format | $ZWJ)+ .?;
 288
 289 # rule 6 - MidLetter / MidNumLet / Single_Quote followed by a letter
 290 ($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
 291
 292 # rule 7b - Double_Quote followed by a Hebrew letter
 293 $Double_Quote $BackHebrew_LetterEx;
 294
 295
 296 # rule 11 - MidNum / MidNumLet / Single_Quote followed by Numeric
 297 ($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
 298
 299 # rule 13c - any run of Regional_Indicators
 300 $BackRegional_IndicatorEx*;
 301
 302 # For dictionary-based break - two adjacent $dictionary characters
 303 $dictionary $dictionary;
 304
 305 ## -------------------------------------------------
 306
 307 !!safe_forward;
 308
 309 # rule 4 - a run of Extend/Format/ZWJ followed by at most one further character
 310 ($Extend | $Format | $ZWJ)+ .?;
 311
 312 # rule 6 - MidLetter / MidNumLet / Single_Quote followed by a letter
 313 ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
 314
 315 # rule 7b - Double_Quote followed by a Hebrew letter
 316 $Double_QuoteEx $Hebrew_LetterEx;
 317
 318 # rule 11 - MidNum / MidNumLet / Single_Quote followed by Numeric
 319 ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
 320
 321 # rule 13c - any run of Regional_Indicators
 322 $Regional_IndicatorEx*;
 323
 324 # For dictionary-based break - two adjacent $dictionary characters
 325 $dictionary $dictionary;