icuSources/i18n/regexcst.txt

   1
   2 #*****************************************************************************
   3 #
   4 #   Copyright (C) 2002-2007, International Business Machines Corporation and others.
   5 #   All Rights Reserved.
   6 #
   7 #*****************************************************************************
   8 #
   9 #  file:  regexcst.txt
  10 #  ICU Regular Expression Parser State Table
  11 #
  12 #     This state table is used when reading and parsing a regular expression pattern
  13 #     The pattern parser uses a state machine; the data in this file define the
  14 #     state transitions that occur for each input character.
  15 #
  16 #     *** This file defines the regex pattern grammar.   This is it.
  17 #     *** The determination of what is accepted is here.
  18 #
  19 #     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
  20 #     that are then built with the rule parser.
  21 #
  22
  23 #
  24 # Here is the syntax of the state definitions in this file:
  25 #
  26 #
  27 #StateName:
  28 #   input-char           n next-state           ^push-state     action
  29 #   input-char           n next-state           ^push-state     action
  30 #       |                |   |                      |             |
  31 #       |                |   |                      |             |--- action to be performed by state machine
  32 #       |                |   |                      |                  See function RegexCompile::doParseActions()
  33 #       |                |   |                      |
  34 #       |                |   |                      |--- Push this named state onto the state stack.
  35 #       |                |   |                           Later, when next state is specified as "pop",
  36 #       |                |   |                           the pushed state will become the current state.
  37 #       |                |   |
  38 #       |                |   |--- Transition to this state if the current input character matches the input
  39 #       |                |        character or char class in the left hand column.  "pop" causes the next
  40 #       |                |        state to be popped from the state stack.
  41 #       |                |
  42 #       |                |--- When making the state transition specified on this line, advance to the next
  43 #       |                     character from the input only if 'n' appears here.
  44 #       |
  45 #       |--- Character or named character classes to test for.  If the current character being scanned
  46 #            matches, perform the actions and go to the state specified on this line.
  47 #            The input character is tested sequentially, in the order written.  The characters and
  48 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
  49 #
  50
  51
  52
  53
  54 #
  55 #  start state, scan position is at the beginning of the pattern.
  56 #
  57 start:
  58    default                 term                                     doPatStart      #  any input: run doPatStart and go to term; no 'n', so nothing is consumed
  59
  60
  61
  62
  63 #
  64 #  term.  At a position where we can accept the start of most items in a pattern.
  65 #
  66 term:
  67     quoted               n expr-quant                               doLiteralChar
  68     rule_char            n expr-quant                               doLiteralChar
  69     '['                  n set-open       ^set-finish               doSetBegin      #  push set-finish; the final ']' of the set pops back to it
  70     '('                  n open-paren
  71     '.'                  n expr-quant                               doDotAny
  72     '^'                  n expr-quant                               doCaret
  73     '$'                  n expr-quant                               doDollar
  74     '\'                  n backslash
  75     '|'                  n  term                                    doOrOperator
  76     ')'                  n  pop                                     doCloseParen    #  pop: continue in the state pushed at the matching open paren
  77     eof                    term                                     doPatFinish     #  end of the pattern text
  78     default                errorDeath                               doRuleError     #  none of the rows above matched
  79
  80
  81
  82 #
  83 #   expr-quant    We've just finished scanning a term, now look for the optional
  84 #                 trailing quantifier - *, +, ?, *?,  etc.
  85 #
  86 expr-quant:
  87     '*'                  n  quant-star
  88     '+'                  n  quant-plus
  89     '?'                  n  quant-opt
  90     '{'                  n  interval-open                          doIntervalInit
  91     '('                  n  open-paren-quant                                        #  could be a (?# comment sitting in front of the quantifier
  92     default                 expr-cont                                               #  no quantifier here; character is not consumed
  93
  94
  95 #
  96 #  expr-cont      Expression, continuation.  At a point where additional terms are
  97 #                                            allowed, but not required.  No quantifiers are accepted here.
  98 #
  99 expr-cont:
 100     '|'                  n  term                                    doOrOperator
 101     ')'                  n  pop                                     doCloseParen    #  pop: continue in the state pushed at the matching open paren
 102     default                 term                                                    #  anything else is re-examined, unconsumed, by term
 103
 104
 105 #
 106 #   open-paren-quant   Special case handling for comments appearing before a quantifier,
 107 #                        e.g.   x(?#comment )*
 108 #                      Open parens from expr-quant come here; anything but a (?# comment
 109 #                      branches into the normal parenthesis sequence as quickly as possible.
 110 #
 111 open-paren-quant:
 112     '?'                  n  open-paren-quant2                      doSuppressComments   #  "(?"  still need to see whether a '#' follows
 113     default                 open-paren                                                   #  plain "(": rejoin the normal open-paren state, nothing consumed
 114
 115 open-paren-quant2:
 116     '#'                  n  paren-comment   ^expr-quant                                  #  "(?#"  after the comment, pop back to expr-quant so a quantifier can still follow
 117     default                 open-paren-extended                                          #  any other "(?x": rejoin the normal (? handling, nothing consumed
 118
 119
 120 #
 121 #   open-paren    We've got an open paren.  We need to scan further to
 122 #                 determine what kind of group it is - plain (, (?:, (?>, or whatever.
 123 #
 124 open-paren:
 125     '?'                  n  open-paren-extended                     doSuppressComments
 126     default                 term            ^expr-quant             doOpenCaptureParen     #  plain "(": push expr-quant for the closing ')' to pop to
 127
 128 open-paren-extended:
 129     ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
 130     '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
 131     '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=   pushes expr-cont, so no quantifier state follows the ')'
 132     '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!   pushes expr-cont, so no quantifier state follows the ')'
 133     '<'                  n  open-paren-lookbehind                                          #  (?<
 134     '#'                  n  paren-comment   ^term                                          #  (?#   comment; pop back to term when it ends
 135     'i'                     paren-flag                              doBeginMatchMode       #  (?flags   flag letters and '-' are not consumed here; paren-flag consumes them
 136     'd'                     paren-flag                              doBeginMatchMode
 137     'm'                     paren-flag                              doBeginMatchMode
 138     's'                     paren-flag                              doBeginMatchMode
 139     'u'                     paren-flag                              doBeginMatchMode
 140     'w'                     paren-flag                              doBeginMatchMode
 141     'x'                     paren-flag                              doBeginMatchMode
 142     '-'                     paren-flag                              doBeginMatchMode
 143     '('                  n  errorDeath                              doConditionalExpr      #  (?(   goes straight to errorDeath
 144     '{'                  n  errorDeath                              doPerlInline           #  (?{   goes straight to errorDeath
 145     default                 errorDeath                              doBadOpenParenType     #  any other character after "(?"
 146
 147 open-paren-lookbehind:
 148     '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
 149     '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
 150     default                 errorDeath                              doBadOpenParenType     #  "(?<" followed by anything else
 151
 152
 153 #
 154 #   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
 155 #
 156 paren-comment:
 157     ')'                  n  pop                                                            #  end of comment: resume in the state pushed by whoever sent us here
 158     eof                         errorDeath                              doMismatchedParenErr   #  pattern ended inside the comment
 159     default              n  paren-comment                                                  #  consume and ignore one character of comment text
 160
 161 #
 162 #  paren-flag    Scanned a (?idmsuwx-idmsuwx  flag setting
 163 #
 164 paren-flag:
 165     'i'                  n  paren-flag                              doMatchMode
 166     'd'                  n  paren-flag                              doMatchMode
 167     'm'                  n  paren-flag                              doMatchMode
 168     's'                  n  paren-flag                              doMatchMode
 169     'u'                  n  paren-flag                              doMatchMode
 170     'w'                  n  paren-flag                              doMatchMode
 171     'x'                  n  paren-flag                              doMatchMode
 172     '-'                  n  paren-flag                              doMatchMode
 173     ')'                  n  term                                    doSetMatchMode       #  (?flags)   nothing is pushed; continue at term
 174     ':'                  n  term              ^expr-quant           doMatchModeParen     #  (?flags:   opens a group; its ')' pops to expr-quant
 175     default                 errorDeath                              doBadModeFlag        #  any other character inside the flag list
 176
 177
 178 #
 179 #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
 180 #                 between plain '*', '*?', '*+'
 181 #
 182 quant-star:
 183      '?'                 n  expr-cont                               doNGStar               #  *?
 184      '+'                 n  expr-cont                               doPossessiveStar       #  *+
 185      default                expr-cont                               doStar                 #  *   look-ahead character is not consumed
 186
 187
 188 #
 189 #  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
 190 #                 between plain '+', '+?', '++'
 191 #
 192 quant-plus:
 193      '?'                 n  expr-cont                               doNGPlus               #  +?
 194      '+'                 n  expr-cont                               doPossessivePlus       #  ++
 195      default                expr-cont                               doPlus                 #  +   look-ahead character is not consumed
 196
 197
 198 #
 199 #  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
 200 #                  between plain '?', '??', '?+'
 201 #
 202 quant-opt:
 203      '?'                 n  expr-cont                               doNGOpt                 #  ??
 204      '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
 205      default                expr-cont                               doOpt                   #  ?   look-ahead character is not consumed
 206
 207
 208 #
 209 #   Interval         scanning a '{', the opening delimiter for an interval specification
 210 #                                   {number} or {min, max} or {min,}
 211 #
 212 interval-open:
 213     digit_char              interval-lower                                                 #  first digit is only tested here; interval-lower consumes it
 214     default                 errorDeath                              doIntervalError        #  '{' must be followed by a digit
 215
 216 interval-lower:
 217     digit_char           n  interval-lower                          doIntevalLowerDigit        # NOTE(review): "Inteval" spelling is the action's name; presumably must match the C++ side - do not fix here alone
 218     ','                          n  interval-upper                                              # {n,
 219     '}'                  n  interval-type                           doIntervalSame             # {n}
 220     default                 errorDeath                              doIntervalError
 221
 222 interval-upper:
 223     digit_char           n  interval-upper                          doIntervalUpperDigit
 224     '}'                  n  interval-type                                                      # {n,m}  or  {n,}  when no upper digits were seen
 225     default                 errorDeath                              doIntervalError
 226
 227 interval-type:
 228     '?'                  n  expr-cont                               doNGInterval                # {n,m}?
 229     '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
 230     default                 expr-cont                               doInterval                  # {n,m}   look-ahead character is not consumed
 231
 232
 233 #
 234 #  backslash        #  Backslash.  Figure out which of the \thingies we have encountered.
 235 #                                  The low level next-char function will have preprocessed
 236 #                                  some of them already; those won't come here.
 237 backslash:
 238    'A'                   n  term                                    doBackslashA     #   \A \B \b \G \Q \Z \z continue at term, not expr-quant
 239    'B'                   n  term                                    doBackslashB
 240    'b'                   n  term                                    doBackslashb
 241    'd'                   n  expr-quant                              doBackslashd
 242    'D'                   n  expr-quant                              doBackslashD
 243    'G'                   n  term                                    doBackslashG
 244    'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
 245    'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
 246    'P'                      expr-quant                              doProperty       #   no 'n' on the N/p/P rows; presumably the action scans the {...} itself - confirm
 247    'Q'                   n  term                                    doEnterQuoteMode
 248    'S'                   n  expr-quant                              doBackslashS
 249    's'                   n  expr-quant                              doBackslashs
 250    'W'                   n  expr-quant                              doBackslashW
 251    'w'                   n  expr-quant                              doBackslashw
 252    'X'                   n  expr-quant                              doBackslashX
 253    'Z'                   n  term                                    doBackslashZ
 254    'z'                   n  term                                    doBackslashz
 255    digit_char            n  expr-quant                              doBackRef         #  Will scan multiple digits
 256    eof                      errorDeath                              doEscapeError     #  pattern ended right after the backslash
 257    default               n  expr-quant                              doEscapedLiteralChar   #  any other escaped character
 258
 259
 260
 261 #
 262 # [set expression] parsing,
 263 #    All states involved in parsing set expressions have names beginning with "set-"
 264 #
 265
 266 set-open:
 267    '^'                   n  set-open2                               doSetNegate       #  "[^"
 268    ':'                      set-posix                               doSetPosixProp    #  "[:"  the ':' is not consumed here; see the set-posix comment
 269    default                  set-open2                                                 #  nothing consumed
 270
 271 set-open2:
 272    ']'                   n  set-after-lit                           doSetLiteral      #  a ']' in leading position is added as a literal, it does not close the set
 273    default                  set-start                                                 #  nothing consumed
 274
 275 #  set-posix:
 276 #                  scanned a '[:'  If it really is a [:property:], doSetPosixProp will have
 277 #                  moved the scan to the closing ']'.  If it wasn't a property
 278 #                  expression, the scan will still be at the opening ':', which should
 279 #                  be interpreted as a normal set expression.
 280 set-posix:
 281     ']'                  n   pop                                    doSetEnd     # it was a [:property:]; close the set
 282     ':'                      set-start                                           # not a property; rescan the ':' as ordinary set content
 283     default                  errorDeath                             doRuleError  # should not be possible.
 284
 285 #
 286 #   set-start   after the [ and special case leading characters (^ and/or ]) but before
 287 #               everything else.   A '-' is literal at this point.
 288 #
 289 set-start:
 290     ']'                  n  pop                                     doSetEnd
 291     '['                  n  set-open      ^set-after-set            doSetBeginUnion     #  nested set; its closing ']' pops to set-after-set
 292     '\'                  n  set-escape
 293     '-'                  n  set-start-dash                                              #  leading '-'; set-start-dash rejects "--"
 294     '&'                  n  set-start-amp                                               #  leading '&'; set-start-amp rejects "&&"
 295     default              n  set-after-lit                           doSetLiteral
 296
 297 #    set-start-dash    Turn "[--" into a syntax error.
 298 #                           "[-x" is good, - and x are literals.
 299 #
 300 set-start-dash:
 301     '-'                     errorDeath                              doRuleError
 302     default                 set-after-lit                           doSetAddDash       #  add the '-' as a literal; the next character is rescanned by set-after-lit
 303
 304 #    set-start-amp     Turn "[&&" into a syntax error.
 305 #                           "[&x" is good, & and x are literals.
 306 #
 307 set-start-amp:
 308     '&'                     errorDeath                              doRuleError
 309     default                 set-after-lit                           doSetAddAmp        #  add the '&' as a literal; the next character is rescanned by set-after-lit
 310
 311 #
 312 #   set-after-lit    The last thing scanned was a literal character within a set.
 313 #                    Can be followed by anything.  Single '-' or '&' are
 314 #                    literals in this context, not operators.
 315 set-after-lit:
 316     ']'                  n  pop                                     doSetEnd
 317     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 318     '-'                  n  set-lit-dash                                                #  range, "--" operator or literal '-'; decided in set-lit-dash
 319     '&'                  n  set-lit-amp                                                 #  "&&" operator or literal '&'; decided in set-lit-amp
 320     '\'                  n  set-escape
 321     eof                     errorDeath                              doSetNoCloseError   #  pattern ended before the closing ']'
 322     default              n  set-after-lit                           doSetLiteral
 323
 324 set-after-set:
 325     ']'                  n  pop                                     doSetEnd
 326     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 327     '-'                  n  set-set-dash                                                #  after a [set] a single '-' may be a difference operator; see set-set-dash
 328     '&'                  n  set-set-amp                                                 #  after a [set] a single '&' may be an intersection operator; see set-set-amp
 329     '\'                  n  set-escape
 330     eof                     errorDeath                              doSetNoCloseError   #  pattern ended before the closing ']'
 331     default              n  set-after-lit                           doSetLiteral
 332
 333 set-after-range:
 334     ']'                  n  pop                                     doSetEnd
 335     '['                  n  set-open      ^set-after-set            doSetBeginUnion
 336     '-'                  n  set-range-dash                                              #  after a range-like item a single '-' is literal; "--" is still an operator
 337     '&'                  n  set-range-amp                                               #  single '&' is literal; "&&" is still an operator
 338     '\'                  n  set-escape
 339     eof                     errorDeath                              doSetNoCloseError   #  pattern ended before the closing ']'
 340     default              n  set-after-lit                           doSetLiteral
 341
 342
 343 # set-after-op
 344 #     After a --  or &&
 345 #     It is an error to close a set at this point.
 346 #
 347 set-after-op:
 348     '['                  n  set-open         ^set-after-set         doSetBeginUnion
 349     ']'                     errorDeath                              doSetOpError        #  operator with no right-hand operand
 350     '\'                  n  set-escape
 351     default              n  set-after-lit                           doSetLiteral
 352
 353 #
 354 #   set-set-amp
 355 #      Have scanned [[set]&
 356 #      Could be a '&' intersection operator, if a set follows.
 357 #      Could be the start of a '&&' operator.
 358 #      Otherwise is a literal.
 359 set-set-amp:
 360     '['                  n  set-open      ^set-after-set           doSetBeginIntersection1   #  [set]&[set]
 361     '&'                  n  set-after-op                           doSetIntersection2        #  [set]&&
 362     default                 set-after-lit                          doSetAddAmp               #  literal '&'; the next character is rescanned by set-after-lit
 363
 364
 365 # set-lit-amp   Have scanned "[literals&"
 366 #               Could be a start of "&&" operator or a literal
 367 #               In [abc&[def]],   the '&' is a literal
 368 #
 369 set-lit-amp:
 370     '&'                  n  set-after-op                            doSetIntersection2      #  &&
 371     default                 set-after-lit                           doSetAddAmp             #  literal '&'; the next character is rescanned by set-after-lit
 372
 373
 374 #
 375 #  set-set-dash
 376 #      Have scanned [set]-
 377 #      Could be a '-' difference operator, if a [set] follows.
 378 #      Could be the start of a '--' operator.
 379 #      Otherwise is a literal.
 380 set-set-dash:
 381     '['                  n  set-open      ^set-after-set           doSetBeginDifference1     #  [set]-[set]
 382     '-'                  n  set-after-op                           doSetDifference2          #  [set]--
 383     default                 set-after-lit                          doSetAddDash              #  literal '-'; the next character is rescanned by set-after-lit
 384
 385
 386 #
 387 #  set-range-dash
 388 #      scanned  a-b-  or \w-
 389 #         any set or range like item where the trailing single '-' should
 390 #         be literal, not a set difference operation.
 391 #         A trailing "--" is still a difference operator.
 392 set-range-dash:
 393     '-'                  n  set-after-op                           doSetDifference2          #  --
 394     default                 set-after-lit                          doSetAddDash              #  literal '-'; the next character is rescanned by set-after-lit
 395
 396
 397 set-range-amp:
 398     '&'                  n  set-after-op                           doSetIntersection2        #  &&
 399     default                 set-after-lit                          doSetAddAmp               #  literal '&'; the next character is rescanned by set-after-lit
 400
 401
 402 #  set-lit-dash
 403 #     Have scanned "[literals-" Could be a range or a -- operator or a literal
 404 #     In [abc-[def]], the '-' is a literal (confirmed with a Java test)
 405 #        [abc-\p{xx}  the '-' is an error   NOTE(review): set-lit-dash-escape has no 'p'/'P' row, so \p appears to reach its default - confirm
 406 #        [abc-]       the '-' is a literal
 407 #        [ab-xy]      the '-' is a range
 408 #
 409 set-lit-dash:
 410     '-'                  n  set-after-op                            doSetDifference2        #  --
 411     '['                     set-after-lit                           doSetAddDash            #  literal '-'; the '[' is rescanned by set-after-lit
 412     ']'                     set-after-lit                           doSetAddDash            #  literal '-'; the ']' is rescanned by set-after-lit
 413     '\'                  n  set-lit-dash-escape
 414     default              n  set-after-range                         doSetRange              #  this character is consumed as the end of the range
 415
 416 # set-lit-dash-escape
 417 #    NOTE(review): there are no 'p' / 'P' rows here, so "[literal-\p" falls to the default row (doSetRange) - confirm this is intended.
 418 #    scanned "[literal-\"
 419 #    Could be a range, if the \ introduces an escaped literal char or a named char.
 420 #    Otherwise it is an error.
 421 #
 422 set-lit-dash-escape:
 423    's'                      errorDeath                             doSetOpError
 424    'S'                      errorDeath                             doSetOpError
 425    'w'                      errorDeath                             doSetOpError
 426    'W'                      errorDeath                             doSetOpError
 427    'd'                      errorDeath                             doSetOpError
 428    'D'                      errorDeath                             doSetOpError
 429    'N'                      set-after-range                        doSetNamedRange         #  no 'n': the 'N' is not consumed by this transition
 430    default               n  set-after-range                        doSetRange              #  any other escaped character ends the range
 431
 432
 433 #
 434 #  set-escape
 435 #       Common back-slash escape processing within set expressions
 436 #
 437 set-escape:
 438    'p'                      set-after-set                           doSetProp              #  continues as "after a set"; no 'n' on the p/P/N rows
 439    'P'                      set-after-set                           doSetProp
 440    'N'                      set-after-lit                           doSetNamedChar         #  continues as "after a literal"
 441    's'                   n  set-after-range                         doSetBackslash_s       #  \s \S \w \W \d \D continue as "after a range" (a following single '-' is literal)
 442    'S'                   n  set-after-range                         doSetBackslash_S
 443    'w'                   n  set-after-range                         doSetBackslash_w
 444    'W'                   n  set-after-range                         doSetBackslash_W
 445    'd'                   n  set-after-range                         doSetBackslash_d
 446    'D'                   n  set-after-range                         doSetBackslash_D
 447    default               n  set-after-lit                           doSetLiteralEscaped    #  any other escaped character is a literal
 448
 449 #
 450 # set-finish
 451 #     Have just encountered the final ']' that completes a [set], and
 452 #     arrived here via a pop.  From here, we exit the set parsing world, and go
 453 #     back to generic regular expression parsing.
 454 #
 455 set-finish:
 456     default                 expr-quant                              doSetFinish           #  nothing consumed; the set can now take a quantifier
 457
 458
 459 #
 460 # errorDeath.   This state is specified as the next state whenever a syntax error
 461 #               in the pattern is detected.  Barring bugs, the state machine will never
 462 #               actually get here, but will stop because of the action associated with the error.
 463 #               But, just in case, this state asks the state machine to exit.
 464 errorDeath:
 465     default              n errorDeath                               doExit                #  loops on itself, consuming input, with doExit on every step
 466
 467