icuSources/i18n/regexcst.txt

   1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
   2 # License & terms of use: http://www.unicode.org/copyright.html
   3 #*****************************************************************************
   4 #
   5 #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
   6 #   All Rights Reserved.
   7 #
   8 #*****************************************************************************
   9 #
  10 #  file:  regexcst.txt
  11 #  ICU Regular Expression Parser State Table
  12 #
  13 #     This state table is used when reading and parsing a regular expression pattern
  14 #     The pattern parser uses a state machine; the data in this file define the
  15 #     state transitions that occur for each input character.
  16 #
  17 #     *** This file defines the regex pattern grammar.   This is it.
  18 #     *** The determination of what is accepted is here.
  19 #
  20 #     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
  21 #     that are then built with the rule parser.
  22 #
  23
  24 #
  25 # Here is the syntax of the state definitions in this file:
  26 #
  27 #
  28 #StateName:
  29 #   input-char           n next-state           ^push-state     action
  30 #   input-char           n next-state           ^push-state     action
  31 #       |                |   |                      |             |
  32 #       |                |   |                      |             |--- action to be performed by state machine
  33 #       |                |   |                      |                  See function RegexCompile::doParseActions()
  34 #       |                |   |                      |
  35 #       |                |   |                      |--- Push this named state onto the state stack.
  36 #       |                |   |                           Later, when next state is specified as "pop",
  37 #       |                |   |                           the pushed state will become the current state.
  38 #       |                |   |
  39 #       |                |   |--- Transition to this state if the current input character matches the input
  40 #       |                |        character or char class in the left hand column.  "pop" causes the next
  41 #       |                |        state to be popped from the state stack.
  42 #       |                |
  43 #       |                |--- When making the state transition specified on this line, advance to the next
  44 #       |                     character from the input only if 'n' appears here.
  45 #       |
  46 #       |--- Character or named character classes to test for.  If the current character being scanned
  47 #            matches, perform the actions and go to the state specified on this line.
  48 #            The input character is tested sequentially, in the order written.  The characters and
  49 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
  50 #
  51
  52
  53
  54
  55 #
  56 #  start state, scan position is at the beginning of the pattern.
  57 #
  58 start:
  59    default                 term                                     doPatStart  # no 'n': the first pattern char is left for 'term' to test
  60
  61
  62
  63
  64 #
  65 #  term.  At a position where we can accept the start of most items in a pattern.
  66 #
  67 term:
  68     quoted               n expr-quant                               doLiteralChar
  69     rule_char            n expr-quant                               doLiteralChar
  70     '['                  n set-open       ^set-finish               doSetBegin    # [set]: set-finish is pushed, so the pop at the set's closing ']' lands there
  71     '('                  n open-paren                                             # the kind of group is decided in open-paren
  72     '.'                  n expr-quant                               doDotAny
  73     '^'                  n expr-quant                               doCaret
  74     '$'                  n expr-quant                               doDollar
  75     '\'                  n backslash                                              # the escape is classified in backslash
  76     '|'                  n  term                                    doOrOperator
  77     ')'                  n  pop                                     doCloseParen  # resume at the state pushed when the group was opened
  78     eof                    term                                     doPatFinish   # NOTE(review): next state is term without advancing; presumably doPatFinish ends the parse - confirm
  79     default                errorDeath                               doRuleError   # nothing else can begin a term
  80
  81
  82
  83 #
  84 #   expr-quant    We've just finished scanning a term, now look for the optional
  85 #                 trailing quantifier - *, +, ?, *?,  etc.
  86 #
  87 expr-quant:
  88     '*'                  n  quant-star
  89     '+'                  n  quant-plus
  90     '?'                  n  quant-opt
  91     '{'                  n  interval-open                          doIntervalInit
  92     '('                  n  open-paren-quant                                        # might be a (?#comment) between the term and its quantifier
  93     default                 expr-cont                                               # no quantifier; the char is not consumed
  94
  95
  96 #
  97 #  expr-cont      Expression, continuation.  At a point where additional terms are
  98 #                                            allowed, but not required.  No Quantifiers
  99 #
 100 expr-cont:
 101     '|'                  n  term                                    doOrOperator
 102     ')'                  n  pop                                     doCloseParen    # resume at the state pushed when the group was opened
 103     default                 term                                                    # anything else starts another term; the char is not consumed
 104
 105
 106 #
 107 #   open-paren-quant   Special case handling for comments appearing before a quantifier,
 108 #                        e.g.   x(?#comment )*
 109 #                      Open parens from expr-quant come here; anything but a (?# comment
 110 #                      branches into the normal parenthesis sequence as quickly as possible.
 111 #
 112 open-paren-quant:
 113     '?'                  n  open-paren-quant2                      doSuppressComments
 114     default                 open-paren                                              # plain '(': hand over to the normal paren state without consuming
 115
 116 open-paren-quant2:                                                                  # scanned "(?" directly after a term
 117     '#'                  n  paren-comment   ^expr-quant                             #  (?#  after the comment's ')' pops, resume looking for a quantifier
 118     default                 open-paren-extended                                     # any other "(?x" is handled exactly as when reached from open-paren
 119
 120
 121 #
 122 #   open-paren    We've got an open paren.  We need to scan further to
 123 #                 determine what kind of paren it is - plain (, (?:, (?>, or whatever.
 124 #
 125 open-paren:
 126     '?'                  n  open-paren-extended                     doSuppressComments   #  (?   the next char selects the kind of group
 127     default                 term            ^expr-quant             doOpenCaptureParen   #  (    group body is parsed from term; its ')' pops to expr-quant
 128
 129 open-paren-extended:                                                                       # scanned "(?"; the next char selects the kind of group
 130     ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
 131     '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
 132     '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=   ')' pops to expr-cont, so no quantifier is looked for afterwards
 133     '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!   ')' pops to expr-cont, so no quantifier is looked for afterwards
 134     '<'                  n  open-paren-lookbehind                                          #  (?<   look-behind or named capture
 135     '#'                  n  paren-comment   ^term                                          #  (?#   comment; afterwards continue at term
 136     'i'                     paren-flag                              doBeginMatchMode       # flag chars are not consumed here (no 'n'); paren-flag scans them
 137     'd'                     paren-flag                              doBeginMatchMode
 138     'm'                     paren-flag                              doBeginMatchMode
 139     's'                     paren-flag                              doBeginMatchMode
 140     'u'                     paren-flag                              doBeginMatchMode
 141     'w'                     paren-flag                              doBeginMatchMode
 142     'x'                     paren-flag                              doBeginMatchMode
 143     '-'                     paren-flag                              doBeginMatchMode
 144     '('                  n  errorDeath                              doConditionalExpr      #  (?(   always leads to errorDeath
 145     '{'                  n  errorDeath                              doPerlInline           #  (?{   always leads to errorDeath
 146     default                 errorDeath                              doBadOpenParenType
 147
 148 open-paren-lookbehind:                                                                     # scanned "(?<"
 149     '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
 150     '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
 151     ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name   the letter is not consumed; named-capture scans it
 152     default                 errorDeath                              doBadOpenParenType
 153
 154
 155 #
 156 #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
 157 #
 158 paren-comment:
 159     ')'                  n  pop                                                             # end of comment; resume at the state the caller pushed
 160     eof                         errorDeath                              doMismatchedParenErr  # pattern ended inside the comment
 161     default              n  paren-comment                                                   # consume and discard comment text
 162
 163 #
 164 #  paren-flag    Scanned a (?ismx-ismx  flag setting
 165 #
 166 paren-flag:
 167     'i'                  n  paren-flag                              doMatchMode
 168     'd'                  n  paren-flag                              doMatchMode
 169     'm'                  n  paren-flag                              doMatchMode
 170     's'                  n  paren-flag                              doMatchMode
 171     'u'                  n  paren-flag                              doMatchMode
 172     'w'                  n  paren-flag                              doMatchMode
 173     'x'                  n  paren-flag                              doMatchMode
 174     '-'                  n  paren-flag                              doMatchMode
 175     ')'                  n  term                                    doSetMatchMode      #  (?flags)   nothing is pushed; parsing continues at term
 176     ':'                  n  term              ^expr-quant           doMatchModeParen    #  (?flags:   opens a group; its ')' pops to expr-quant
 177     default                 errorDeath                              doBadModeFlag       # any other char is not accepted as a flag
 178
 179 #
 180 #  named-capture    (?<name> ... ), position currently on the name.
 181 #
 182 named-capture:                                                                              # entered from open-paren-lookbehind on an unconsumed ascii_letter, so a name starts with a letter
 183     ascii_letter         n  named-capture                           doContinueNamedCapture
 184     digit_char           n  named-capture                           doContinueNamedCapture
 185     '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
 186     default                 errorDeath                              doBadNamedCapture       # any other char inside the name is an error
 187
 188 #
 189 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
 190 #                 between plain '*', '*?', '*+'
 191 #
 192 quant-star:
 193      '?'                 n  expr-cont                               doNGStar               #  *?
 194      '+'                 n  expr-cont                               doPossessiveStar       #  *+
 195      default                expr-cont                               doStar                 #  *   (the following char is not consumed)
 196
 197
 198 #
 199 #  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
 200 #                 between plain '+', '+?', '++'
 201 #
 202 quant-plus:
 203      '?'                 n  expr-cont                               doNGPlus               #  +?
 204      '+'                 n  expr-cont                               doPossessivePlus       #  ++
 205      default                expr-cont                               doPlus                 #  +   (the following char is not consumed)
 206
 207
 208 #
 209 #  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
 210 #                  between plain '?', '??', '?+'
 211 #
 212 quant-opt:
 213      '?'                 n  expr-cont                               doNGOpt                 #  ??
 214      '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
 215      default                expr-cont                               doOpt                   #  ?   (the following char is not consumed)
 216
 217
 218 #
 219 #   Interval         scanning a '{', the opening delimiter for an interval specification
 220 #                                   {number} or {min, max} or {min,}
 221 #
 222 interval-open:
 223     digit_char              interval-lower                                                  # at least one digit must follow '{'; it is consumed in interval-lower
 224     default                 errorDeath                              doIntervalError
 225
 226 interval-lower:                                                                             # scanning the digits of the lower bound
 227     digit_char           n  interval-lower                          doIntevalLowerDigit
 228     ','                          n  interval-upper
 229     '}'                  n  interval-type                           doIntervalSame             # {n}
 230     default                 errorDeath                              doIntervalError
 231
 232 interval-upper:                                                                             # after the ','; zero or more digits of the upper bound
 233     digit_char           n  interval-upper                          doIntervalUpperDigit
 234     '}'                  n  interval-type                                                   # reached for both {n,m} and {n,}
 235     default                 errorDeath                              doIntervalError
 236
 237 interval-type:                                                                              # after the closing '}': greedy, non-greedy or possessive?
 238     '?'                  n  expr-cont                               doNGInterval                # {n,m}?
 239     '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
 240     default                 expr-cont                               doInterval                  # {n,m}
 241
 242
 243 #
 244 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
 245 #                                  The low level next-char function will have preprocessed
 246 #                                  some of them already; those won't come here.
 247 backslash:
 248    'A'                   n  term                                    doBackslashA     # next state is term, not expr-quant: no quantifier is looked for
 249    'B'                   n  term                                    doBackslashB
 250    'b'                   n  term                                    doBackslashb
 251    'd'                   n  expr-quant                              doBackslashd
 252    'D'                   n  expr-quant                              doBackslashD
 253    'G'                   n  term                                    doBackslashG
 254    'h'                   n  expr-quant                              doBackslashh
 255    'H'                   n  expr-quant                              doBackslashH
 256    'k'                   n  named-backref                                            #   \k<name>  named back reference
 257    'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char.  No 'n' on this row; NOTE(review): presumably the action scans the {...} itself - confirm
 258    'p'                      expr-quant                              doProperty       #   \p{Lu}  style property.  No 'n' on this row either
 259    'P'                      expr-quant                              doProperty       #   \P{Lu}  No 'n' on this row either
 260    'R'                   n  expr-quant                              doBackslashR
 261    'Q'                   n  term                                    doEnterQuoteMode
 262    'S'                   n  expr-quant                              doBackslashS
 263    's'                   n  expr-quant                              doBackslashs
 264    'v'                   n  expr-quant                              doBackslashv
 265    'V'                   n  expr-quant                              doBackslashV
 266    'W'                   n  expr-quant                              doBackslashW
 267    'w'                   n  expr-quant                              doBackslashw
 268    'X'                   n  expr-quant                              doBackslashX
 269    'Z'                   n  term                                    doBackslashZ
 270    'z'                   n  term                                    doBackslashz
 271    digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
 272    eof                      errorDeath                              doEscapeError     #  pattern ended right after the backslash
 273    default               n  expr-quant                              doEscapedLiteralChar
 274
 275
 276 # named-backref   Scanned \k
 277 #                 Leading to \k<captureName>
 278 #                 Failure to get the full sequence is an error.
 279 #
 280 named-backref:
 281     '<'                  n  named-backref-2                         doBeginNamedBackRef
 282     default                 errorDeath                              doBadNamedCapture
 283
 284 named-backref-2:                                                                            # scanned \k<   the first name char must be a letter
 285     ascii_letter         n  named-backref-3                         doContinueNamedBackRef
 286     default                 errorDeath                              doBadNamedCapture
 287
 288 named-backref-3:                                                                            # inside the name: letters and digits until '>'
 289     ascii_letter         n  named-backref-3                         doContinueNamedBackRef
 290     digit_char           n  named-backref-3                         doContinueNamedBackRef
 291     '>'                  n  expr-quant                              doCompleteNamedBackRef  # a quantifier may follow the back reference
 292     default                 errorDeath                              doBadNamedCapture
 293
 294
 295 #
 296 # [set expression] parsing,
 297 #    All states involved in parsing set expressions have names beginning with "set-"
 298 #
 299
 300 set-open:                                                                                   # just after a '['
 301    '^'                   n  set-open2                               doSetNegate             #  [^
 302    ':'                      set-posix                               doSetPosixProp          #  [:   the ':' is not consumed by this row; see set-posix
 303    default                  set-open2
 304
 305 set-open2:                                                                                  # after '[' or '[^'
 306    ']'                   n  set-after-lit                           doSetLiteral            # a ']' in this position is taken as a literal, not as the end of the set
 307    default                  set-start
 308
 309 #  set-posix:
 310 #                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
 311 #                  moved the scan to the closing ']'.  If it wasn't a property
 312 #                  expression, the scan will still be at the opening ':', which should
 313 #                  be interpreted as a normal set expression.
 314 set-posix:
 315     ']'                  n   pop                                    doSetEnd                 # it was a [:property:]; the scan is on its closing ']'
 316     ':'                      set-start                                                       # not a property; re-scan the ':' as ordinary set content
 317     default                  errorDeath                             doRuleError  # should not be possible.
 318
 319 #
 320 #   set-start   after the [ and special case leading characters (^ and/or ]) but before
 321 #               everything else.   A '-' is literal at this point.
 322 #
 323 set-start:
 324     ']'                  n  pop                                     doSetEnd
 325     '['                  n  set-open      ^set-after-set            doSetBeginUnion   # nested [set]; when it closes, the pop resumes at set-after-set
 326     '\'                  n  set-escape
 327     '-'                  n  set-start-dash                                             # leading '-': literal unless it begins "--"
 328     '&'                  n  set-start-amp                                              # leading '&': literal unless it begins "&&"
 329     default              n  set-after-lit                           doSetLiteral      # NOTE(review): no eof row in this state (set-after-lit has one) - confirm how eof is meant to be handled here
 330
 331 #    set-start-dash    Turn "[--" into a syntax error.
 332 #                           "[-x" is good, - and x are literals.
 333 #
 334 set-start-dash:
 335     '-'                     errorDeath                              doRuleError       #  [--
 336     default                 set-after-lit                           doSetAddDash      #  [-x   the char after the '-' is not consumed here
 337
 338 #    set-start-amp     Turn "[&&" into a syntax error.
 339 #                           "[&x" is good, & and x are literals.
 340 #
 341 set-start-amp:
 342     '&'                     errorDeath                              doRuleError       #  [&&
 343     default                 set-after-lit                           doSetAddAmp       #  [&x   the char after the '&' is not consumed here
 344
 345 #
 346 #   set-after-lit    The last thing scanned was a literal character within a set.
 347 #                    Can be followed by anything.  Single '-' or '&' are
 348 #                    literals in this context, not operators.
 349 set-after-lit:
 350     ']'                  n  pop                                     doSetEnd
 351     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 352     '-'                  n  set-lit-dash                                               # range, "--" operator or literal: decided in set-lit-dash
 353     '&'                  n  set-lit-amp                                                # "&&" operator or literal: decided in set-lit-amp
 354     '\'                  n  set-escape
 355     eof                     errorDeath                              doSetNoCloseError  # pattern ended before the closing ']'
 356     default              n  set-after-lit                           doSetLiteral
 357
 358 set-after-set:                                                                         # same rows as set-after-lit except for where '-' and '&' lead
 359     ']'                  n  pop                                     doSetEnd
 360     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 361     '-'                  n  set-set-dash                                               # difference operator, "--" or literal: decided in set-set-dash
 362     '&'                  n  set-set-amp                                                # intersection operator, "&&" or literal: decided in set-set-amp
 363     '\'                  n  set-escape
 364     eof                     errorDeath                              doSetNoCloseError  # pattern ended before the closing ']'
 365     default              n  set-after-lit                           doSetLiteral
 366
 367 set-after-range:                                                                       # same rows as set-after-lit except for where '-' and '&' lead
 368     ']'                  n  pop                                     doSetEnd
 369     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 370     '-'                  n  set-range-dash                                             # "--" operator or literal: decided in set-range-dash
 371     '&'                  n  set-range-amp                                              # "&&" operator or literal: decided in set-range-amp
 372     '\'                  n  set-escape
 373     eof                     errorDeath                              doSetNoCloseError  # pattern ended before the closing ']'
 374     default              n  set-after-lit                           doSetLiteral
 375
 376
 377 # set-after-op
 378 #     After a --  or &&
 379 #     It is an error to close a set at this point.
 380 #
 381 set-after-op:
 382     '['                  n  set-open         ^set-after-set         doSetBeginUnion
 383     ']'                     errorDeath                              doSetOpError       # an operator must be followed by an operand
 384     '\'                  n  set-escape
 385     default              n  set-after-lit                           doSetLiteral       # NOTE(review): no eof row in this state - confirm how eof is meant to be handled here
 386
 387 #
 388 #   set-set-amp
 389 #      Have scanned [[set]&
 390 #      Could be a '&' intersection operator, if a set follows.
 391 #      Could be the start of a '&&' operator.
 392 #      Otherwise is a literal.
 393 set-set-amp:
 394     '['                  n  set-open      ^set-after-set           doSetBeginIntersection1   #  [set]&[
 395     '&'                  n  set-after-op                           doSetIntersection2        #  [set]&&
 396     default                 set-after-lit                          doSetAddAmp               #  literal '&'; the following char is not consumed here
 397
 398
 399 # set-lit-amp   Have scanned "[literals&"
 400 #               Could be a start of "&&" operator or a literal
 401 #               In [abc&[def]],   the '&' is a literal
 402 #
 403 set-lit-amp:
 404     '&'                  n  set-after-op                            doSetIntersection2       #  &&
 405     default                 set-after-lit                           doSetAddAmp              #  literal '&'; the following char (even '[') is not consumed here
 406
 407
 408 #
 409 #  set-set-dash
 410 #      Have scanned [set]-
 411 #      Could be a '-' difference operator, if a [set] follows.
 412 #      Could be the start of a '--' operator.
 413 #      Otherwise is a literal.
 414 set-set-dash:
 415     '['                  n  set-open      ^set-after-set           doSetBeginDifference1     #  [set]-[
 416     '-'                  n  set-after-op                           doSetDifference2          #  [set]--
 417     default                 set-after-lit                          doSetAddDash              #  literal '-'; the following char is not consumed here
 418
 419
 420 #
 421 #  set-range-dash
 422 #      scanned  a-b-  or \w-
 423 #         any set or range like item where the trailing single '-' should
 424 #         be literal, not a set difference operation.
 425 #         A trailing "--" is still a difference operator.
 426 set-range-dash:
 427     '-'                  n  set-after-op                           doSetDifference2          #  --
 428     default                 set-after-lit                          doSetAddDash              #  literal '-'; the following char is not consumed here
 429
 430
 431 set-range-amp:                                                                               # scanned a range-like item followed by '&' (reached from set-after-range)
 432     '&'                  n  set-after-op                           doSetIntersection2        #  &&
 433     default                 set-after-lit                          doSetAddAmp               #  literal '&'; the following char is not consumed here
 434
 435
 436 #  set-lit-dash
 437 #     Have scanned "[literals-" Could be a range or a -- operator or a literal
 438 #     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
 439 #        [abc-\p{xx}  the '-' is an error
 440 #        [abc-]       the '-' is a literal
 441 #        [ab-xy]      the '-' is a range
 442 #
 443 set-lit-dash:
 444     '-'                  n  set-after-op                            doSetDifference2         #  --
 445     '['                     set-after-lit                           doSetAddDash             #  literal '-'; the '[' is left for set-after-lit
 446     ']'                     set-after-lit                           doSetAddDash             #  literal '-'; the ']' is left for set-after-lit
 447     '\'                  n  set-lit-dash-escape                                              #  range end may be an escaped char
 448     default              n  set-after-range                         doSetRange               #  a-b range
 449
 450 # set-lit-dash-escape
 451 #
 452 #    scanned "[literal-\"
 453 #    Could be a range, if the \ introduces an escaped literal char or a named char.
 454 #    Otherwise it is an error.
 455 #
 456 set-lit-dash-escape:
 457    's'                      errorDeath                             doSetOpError              # these six escapes cannot be the end of a range
 458    'S'                      errorDeath                             doSetOpError
 459    'w'                      errorDeath                             doSetOpError
 460    'W'                      errorDeath                             doSetOpError
 461    'd'                      errorDeath                             doSetOpError
 462    'D'                      errorDeath                             doSetOpError
 463    'N'                      set-after-range                        doSetNamedRange           # named char as range end; no 'n' on this row
 464    default               n  set-after-range                        doSetRange                # NOTE(review): 'p' 'P' 'h' 'H' 'v' 'V' have rows in set-escape but none here, so they reach this row - confirm intended
 465
 466
 467 #
 468 #  set-escape
 469 #       Common back-slash escape processing within set expressions
 470 #
 471 set-escape:
 472    'p'                      set-after-set                           doSetProp                # no 'n' on this row; continues as if a nested set had been scanned
 473    'P'                      set-after-set                           doSetProp
 474    'N'                      set-after-lit                           doSetNamedChar           # no 'n' on this row; continues as after a literal
 475    's'                   n  set-after-range                         doSetBackslash_s         # continues at set-after-range, where a single trailing '-' is a literal
 476    'S'                   n  set-after-range                         doSetBackslash_S
 477    'w'                   n  set-after-range                         doSetBackslash_w
 478    'W'                   n  set-after-range                         doSetBackslash_W
 479    'd'                   n  set-after-range                         doSetBackslash_d
 480    'D'                   n  set-after-range                         doSetBackslash_D
 481    'h'                   n  set-after-range                         doSetBackslash_h
 482    'H'                   n  set-after-range                         doSetBackslash_H
 483    'v'                   n  set-after-range                         doSetBackslash_v
 484    'V'                   n  set-after-range                         doSetBackslash_V
 485    default               n  set-after-lit                           doSetLiteralEscaped      # any other escaped char continues as after a literal
 486
 487 #
 488 # set-finish
 489 #     Have just encountered the final ']' that completes a [set], and
 490 #     arrived here via a pop.  From here, we exit the set parsing world, and go
 491 #     back to generic regular expression parsing.
 492 #
 493 set-finish:
 494     default                 expr-quant                              doSetFinish              # nothing is consumed; a quantifier may follow the set
 495
 496
 497 #
 498 # errorDeath.   This state is specified as the next state whenever a syntax error
 499 #               in the pattern is detected.  Barring bugs, the state machine will never
 500 #               actually get here, but will stop because of the action associated with the error.
 501 #               But, just in case, this state asks the state machine to exit.
 502 errorDeath:
 503     default              n errorDeath                               doExit                   # matches any input and stays in this state
 504
 505