2 #*****************************************************************************
4 # Copyright (C) 2002-2015, International Business Machines Corporation and others.
7 #*****************************************************************************
10 # ICU Regular Expression Parser State Table
12 # This state table is used when reading and parsing a regular expression pattern
13 # The pattern parser uses a state machine; the data in this file define the
14 # state transitions that occur for each input character.
16 # *** This file defines the regex pattern grammar. This is it.
17 # *** The determination of what is accepted is here.
19 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20 # that are then built with the rule parser.
24 # Here is the syntax of the state definitions in this file:
28 # input-char n next-state ^push-state action
29 # input-char n next-state ^push-state action
31 # | | | | |--- action to be performed by state machine
32 # | | | | See function RegexCompile::doParseActions()
34 # | | | |--- Push this named state onto the state stack.
35 # | | | Later, when next state is specified as "pop",
36 # | | | the pushed state will become the current state.
38 # | | |--- Transition to this state if the current input character matches the input
39 # | | character or char class in the left hand column. "pop" causes the next
40 # | | state to be popped from the state stack.
42 # | |--- When making the state transition specified on this line, advance to the next
43 # | character from the input only if 'n' appears here.
45 # |--- Character or named character classes to test for. If the current character being scanned
46 # matches, perform the actions and go to the state specified on this line.
47 # The input character is tested sequentially, in the order written. The characters and
48 # character classes tested for do not need to be mutually exclusive. The first match wins.
55 # start state, scan position is at the beginning of the pattern.
58 default term doPatStart
64 # term. At a position where we can accept the start of most items in a pattern.
67 quoted n expr-quant doLiteralChar
68 rule_char n expr-quant doLiteralChar
69 '[' n set-open ^set-finish doSetBegin
71 '.' n expr-quant doDotAny
72 '^' n expr-quant doCaret
73 '$' n expr-quant doDollar
75 '|' n term doOrOperator
76 ')' n pop doCloseParen
78 default errorDeath doRuleError
83 # expr-quant We've just finished scanning a term, now look for the optional
84 # trailing quantifier - *, +, ?, *?, etc.
90 '{' n interval-open doIntervalInit
91 '(' n open-paren-quant
96 # expr-cont Expression, continuation. At a point where additional terms are
97 # allowed, but not required. No Quantifiers
100 '|' n term doOrOperator
101 ')' n pop doCloseParen
106 # open-paren-quant Special case handling for comments appearing before a quantifier,
107 # e.g. x(?#comment )*
108 # Open parens from expr-quant come here; anything but a (?# comment
109 # branches into the normal parenthesis sequence as quickly as possible.
112 '?' n open-paren-quant2 doSuppressComments
116 '#' n paren-comment ^expr-quant
117 default open-paren-extended
121 # open-paren We've got an open paren. We need to scan further to
122 # determine what kind of paren it is - plain (, (?:, (?>, or whatever.
125 '?' n open-paren-extended doSuppressComments
126 default term ^expr-quant doOpenCaptureParen
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
130 '>' n term ^expr-quant doOpenAtomicParen # (?>
131 '=' n term ^expr-cont doOpenLookAhead # (?=
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?!
133 '<' n open-paren-lookbehind
134 '#' n paren-comment ^term
135 'i' paren-flag doBeginMatchMode
136 'd' paren-flag doBeginMatchMode
137 'm' paren-flag doBeginMatchMode
138 's' paren-flag doBeginMatchMode
139 'u' paren-flag doBeginMatchMode
140 'w' paren-flag doBeginMatchMode
141 'x' paren-flag doBeginMatchMode
142 '-' paren-flag doBeginMatchMode
143 '(' n errorDeath doConditionalExpr
144 '{' n errorDeath doPerlInline
145 default errorDeath doBadOpenParenType
147 open-paren-lookbehind:
148 '=' n term ^expr-cont doOpenLookBehind # (?<=
149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
150 ascii_letter named-capture doBeginNamedCapture # (?<name
151 default errorDeath doBadOpenParenType
155 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
159 eof errorDeath doMismatchedParenErr
160 default n paren-comment
163 # paren-flag Scanned a (?ismx-ismx flag setting
166 'i' n paren-flag doMatchMode
167 'd' n paren-flag doMatchMode
168 'm' n paren-flag doMatchMode
169 's' n paren-flag doMatchMode
170 'u' n paren-flag doMatchMode
171 'w' n paren-flag doMatchMode
172 'x' n paren-flag doMatchMode
173 '-' n paren-flag doMatchMode
174 ')' n term doSetMatchMode
175 ':' n term ^expr-quant doMatchModeParen
176 default errorDeath doBadModeFlag
179 # named-capture (?<name> ... ), position currently on the name.
182 ascii_letter n named-capture doContinueNamedCapture
183 digit_char n named-capture doContinueNamedCapture
184 '>' n term ^expr-quant doOpenCaptureParen # same action as for a non-named capture.
185 default errorDeath doBadNamedCapture
188 # quant-star Scanning a '*' quantifier. Need to look ahead to decide
189 # between plain '*', '*?', '*+'
192 '?' n expr-cont doNGStar # *?
193 '+' n expr-cont doPossessiveStar # *+
194 default expr-cont doStar # *
198 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide
199 # between plain '+', '+?', '++'
202 '?' n expr-cont doNGPlus # +?
203 '+' n expr-cont doPossessivePlus # ++
204 default expr-cont doPlus # +
208 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide
209 # between plain '?', '??', '?+'
212 '?' n expr-cont doNGOpt # ??
213 '+' n expr-cont doPossessiveOpt # ?+
214 default expr-cont doOpt # ?
218 # Interval scanning a '{', the opening delimiter for an interval specification
219 # {number} or {min, max} or {min,}
222 digit_char interval-lower
223 default errorDeath doIntervalError
226 digit_char n interval-lower doIntevalLowerDigit
228 '}' n interval-type doIntervalSame # {n}
229 default errorDeath doIntervalError
232 digit_char n interval-upper doIntervalUpperDigit
234 default errorDeath doIntervalError
237 '?' n expr-cont doNGInterval # {n,m}?
238 '+' n expr-cont doPossessiveInterval # {n,m}+
239 default expr-cont doInterval # {n,m}
243 # backslash Backslash. Figure out which of the \thingies we have encountered.
244 # The low level next-char function will have preprocessed
245 # some of them already; those won't come here.
247 'A' n term doBackslashA
248 'B' n term doBackslashB
249 'b' n term doBackslashb
250 'd' n expr-quant doBackslashd
251 'D' n expr-quant doBackslashD
252 'G' n term doBackslashG
253 'h' n expr-quant doBackslashh
254 'H' n expr-quant doBackslashH
256 'N' expr-quant doNamedChar # \N{NAME} named char
257 'p' expr-quant doProperty # \p{Lu} style property
258 'P' expr-quant doProperty
259 'R' n expr-quant doBackslashR
260 'Q' n term doEnterQuoteMode
261 'S' n expr-quant doBackslashS
262 's' n expr-quant doBackslashs
263 'v' n expr-quant doBackslashv
264 'V' n expr-quant doBackslashV
265 'W' n expr-quant doBackslashW
266 'w' n expr-quant doBackslashw
267 'X' n expr-quant doBackslashX
268 'Z' n term doBackslashZ
269 'z' n term doBackslashz
270 digit_char n expr-quant doBackRef # Will scan multiple digits
271 eof errorDeath doEscapeError
272 default n expr-quant doEscapedLiteralChar
275 # named-backref Scanned \k
276 # Leading to \k<captureName>
277 # Failure to get the full sequence is an error.
280 '<' n named-backref-2 doBeginNamedBackRef
281 default errorDeath doBadNamedCapture
284 ascii_letter n named-backref-3 doContinueNamedBackRef
285 default errorDeath doBadNamedCapture
288 ascii_letter n named-backref-3 doContinueNamedBackRef
289 digit_char n named-backref-3 doContinueNamedBackRef
290 '>' n expr-quant doCompleteNamedBackRef
291 default errorDeath doBadNamedCapture
295 # [set expression] parsing,
296 # All states involved in parsing set expressions have names beginning with "set-"
300 '^' n set-open2 doSetNegate
301 ':' set-posix doSetPosixProp
305 ']' n set-after-lit doSetLiteral
309 # scanned a '[:' If it really is a [:property:], doSetPosixProp will have
310 # moved the scan to the closing ']'. If it wasn't a property
311 # expression, the scan will still be at the opening ':', which should
312 # be interpreted as a normal set expression.
316 default errorDeath doRuleError # should not be possible.
319 # set-start after the [ and special case leading characters (^ and/or ]) but before
320 # everything else. A '-' is literal at this point.
324 '[' n set-open ^set-after-set doSetBeginUnion
328 default n set-after-lit doSetLiteral
330 # set-start-dash Turn "[--" into a syntax error.
331 # "[-x" is good, - and x are literals.
334 '-' errorDeath doRuleError
335 default set-after-lit doSetAddDash
337 # set-start-amp Turn "[&&" into a syntax error.
338 # "[&x" is good, & and x are literals.
341 '&' errorDeath doRuleError
342 default set-after-lit doSetAddAmp
345 # set-after-lit The last thing scanned was a literal character within a set.
346 # Can be followed by anything. Single '-' or '&' are
347 # literals in this context, not operators.
350 '[' n set-open ^set-after-set doSetBeginUnion
354 eof errorDeath doSetNoCloseError
355 default n set-after-lit doSetLiteral
359 '[' n set-open ^set-after-set doSetBeginUnion
363 eof errorDeath doSetNoCloseError
364 default n set-after-lit doSetLiteral
368 '[' n set-open ^set-after-set doSetBeginUnion
372 eof errorDeath doSetNoCloseError
373 default n set-after-lit doSetLiteral
378 # It is an error to close a set at this point.
381 '[' n set-open ^set-after-set doSetBeginUnion
382 ']' errorDeath doSetOpError
384 default n set-after-lit doSetLiteral
388 # Have scanned [[set]&
389 # Could be a '&' intersection operator, if a set follows.
390 # Could be the start of a '&&' operator.
391 # Otherwise is a literal.
393 '[' n set-open ^set-after-set doSetBeginIntersection1
394 '&' n set-after-op doSetIntersection2
395 default set-after-lit doSetAddAmp
398 # set-lit-amp Have scanned "[literals&"
399 # Could be a start of "&&" operator or a literal
400 # In [abc&[def]], the '&' is a literal
403 '&' n set-after-op doSetIntersection2
404 default set-after-lit doSetAddAmp
409 # Have scanned [set]-
410 # Could be a '-' difference operator, if a [set] follows.
411 # Could be the start of a '--' operator.
412 # Otherwise is a literal.
414 '[' n set-open ^set-after-set doSetBeginDifference1
415 '-' n set-after-op doSetDifference2
416 default set-after-lit doSetAddDash
421 # scanned a-b- or \w-
422 # any set or range like item where the trailing single '-' should
423 # be literal, not a set difference operation.
424 # A trailing "--" is still a difference operator.
426 '-' n set-after-op doSetDifference2
427 default set-after-lit doSetAddDash
431 '&' n set-after-op doSetIntersection2
432 default set-after-lit doSetAddAmp
436 # Have scanned "[literals-" Could be a range or a -- operator or a literal
437 # In [abc-[def]], the '-' is a literal (confirmed with a Java test)
438 # [abc-\p{xx} the '-' is an error
439 # [abc-] the '-' is a literal
440 # [ab-xy] the '-' is a range
443 '-' n set-after-op doSetDifference2
444 '[' set-after-lit doSetAddDash
445 ']' set-after-lit doSetAddDash
446 '\' n set-lit-dash-escape
447 default n set-after-range doSetRange
449 # set-lit-dash-escape
451 # scanned "[literal-\"
452 # Could be a range, if the \ introduces an escaped literal char or a named char.
453 # Otherwise it is an error.
456 's' errorDeath doSetOpError
457 'S' errorDeath doSetOpError
458 'w' errorDeath doSetOpError
459 'W' errorDeath doSetOpError
460 'd' errorDeath doSetOpError
461 'D' errorDeath doSetOpError
462 'N' set-after-range doSetNamedRange
463 default n set-after-range doSetRange
468 # Common back-slash escape processing within set expressions
471 'p' set-after-set doSetProp
472 'P' set-after-set doSetProp
473 'N' set-after-lit doSetNamedChar
474 's' n set-after-range doSetBackslash_s
475 'S' n set-after-range doSetBackslash_S
476 'w' n set-after-range doSetBackslash_w
477 'W' n set-after-range doSetBackslash_W
478 'd' n set-after-range doSetBackslash_d
479 'D' n set-after-range doSetBackslash_D
480 'h' n set-after-range doSetBackslash_h
481 'H' n set-after-range doSetBackslash_H
482 'v' n set-after-range doSetBackslash_v
483 'V' n set-after-range doSetBackslash_V
484 default n set-after-lit doSetLiteralEscaped
488 # Have just encountered the final ']' that completes a [set], and
489 # arrived here via a pop. From here, we exit the set parsing world, and go
490 # back to generic regular expression parsing.
493 default expr-quant doSetFinish
497 # errorDeath. This state is specified as the next state whenever a syntax error
498 # in the source rules is detected. Barring bugs, the state machine will never
499 # actually get here, but will stop because of the action associated with the error.
500 # But, just in case, this state asks the state machine to exit.
502 default n errorDeath doExit