1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
2 # License & terms of use: http://www.unicode.org/copyright.html
3 #*****************************************************************************
5 # Copyright (C) 2002-2015, International Business Machines Corporation and others.
8 #*****************************************************************************
11 # ICU Regular Expression Parser State Table
13 # This state table is used when reading and parsing a regular expression pattern
14 # The pattern parser uses a state machine; the data in this file define the
15 # state transitions that occur for each input character.
17 # *** This file defines the regex pattern grammar. This is it.
18 # *** The determination of what is accepted is here.
20 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
21 # that are then built with the rule parser.
25 # Here is the syntax of the state definitions in this file:
29 # input-char n next-state ^push-state action
30 # input-char n next-state ^push-state action
32 # | | | | |--- action to be performed by state machine
33 # | | | | See function RegexCompile::doParseActions()
35 # | | | |--- Push this named state onto the state stack.
36 # | | | Later, when next state is specified as "pop",
37 # | | | the pushed state will become the current state.
39 # | | |--- Transition to this state if the current input character matches the input
40 # | | character or char class in the left hand column. "pop" causes the next
41 # | | state to be popped from the state stack.
43 # | |--- When making the state transition specified on this line, advance to the next
44 # | character from the input only if 'n' appears here.
46 # |--- Character or named character classes to test for. If the current character being scanned
47 # matches, perform the actions and go to the state specified on this line.
48 # The input character is tested sequentially, in the order written. The characters and
49 # character classes tested for do not need to be mutually exclusive. The first match wins.
56 # start state, scan position is at the beginning of the pattern.
59 default term doPatStart
65 # term. At a position where we can accept the start of most items in a pattern.
68 quoted n expr-quant doLiteralChar
69 rule_char n expr-quant doLiteralChar
70 '[' n set-open ^set-finish doSetBegin
72 '.' n expr-quant doDotAny
73 '^' n expr-quant doCaret
74 '$' n expr-quant doDollar
76 '|' n term doOrOperator
77 ')' n pop doCloseParen
79 default errorDeath doRuleError
84 # expr-quant We've just finished scanning a term, now look for the optional
85 # trailing quantifier - *, +, ?, *?, etc.
91 '{' n interval-open doIntervalInit
92 '(' n open-paren-quant
97 # expr-cont Expression, continuation. At a point where additional terms are
98 # allowed, but not required. No Quantifiers
101 '|' n term doOrOperator
102 ')' n pop doCloseParen
107 # open-paren-quant Special case handling for comments appearing before a quantifier,
108 # e.g. x(?#comment )*
109 # Open parens from expr-quant come here; anything but a (?# comment
110 # branches into the normal parenthesis sequence as quickly as possible.
113 '?' n open-paren-quant2 doSuppressComments
117 '#' n paren-comment ^expr-quant
118 default open-paren-extended
122 # open-paren We've got an open paren. We need to scan further to
123 # determine what kind of parenthesized group it is - plain (, (?:, (?>, or whatever.
126 '?' n open-paren-extended doSuppressComments
127 default term ^expr-quant doOpenCaptureParen
130 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
131 '>' n term ^expr-quant doOpenAtomicParen # (?>
132 '=' n term ^expr-cont doOpenLookAhead # (?=
133 '!' n term ^expr-cont doOpenLookAheadNeg # (?!
134 '<' n open-paren-lookbehind
135 '#' n paren-comment ^term
136 'i' paren-flag doBeginMatchMode
137 'd' paren-flag doBeginMatchMode
138 'm' paren-flag doBeginMatchMode
139 's' paren-flag doBeginMatchMode
140 'u' paren-flag doBeginMatchMode
141 'w' paren-flag doBeginMatchMode
142 'x' paren-flag doBeginMatchMode
143 '-' paren-flag doBeginMatchMode
144 '(' n errorDeath doConditionalExpr
145 '{' n errorDeath doPerlInline
146 default errorDeath doBadOpenParenType
148 open-paren-lookbehind:
149 '=' n term ^expr-cont doOpenLookBehind # (?<=
150 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
151 ascii_letter named-capture doBeginNamedCapture # (?<name (no 'n' here: the letter is left for named-capture to consume)
152 default errorDeath doBadOpenParenType # (?< followed by anything else is an error
156 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
160 eof errorDeath doMismatchedParenErr
161 default n paren-comment
164 # paren-flag Scanned a (?ismx-ismx flag setting
167 'i' n paren-flag doMatchMode
168 'd' n paren-flag doMatchMode
169 'm' n paren-flag doMatchMode
170 's' n paren-flag doMatchMode
171 'u' n paren-flag doMatchMode
172 'w' n paren-flag doMatchMode
173 'x' n paren-flag doMatchMode
174 '-' n paren-flag doMatchMode
175 ')' n term doSetMatchMode
176 ':' n term ^expr-quant doMatchModeParen
177 default errorDeath doBadModeFlag
180 # named-capture (?<name> ... ), position currently on the name.
183 ascii_letter n named-capture doContinueNamedCapture
184 digit_char n named-capture doContinueNamedCapture
185 '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
186 default errorDeath doBadNamedCapture
189 # quant-star Scanning a '*' quantifier. Need to look ahead to decide
190 # between plain '*', '*?', '*+'
193 '?' n expr-cont doNGStar # *?
194 '+' n expr-cont doPossessiveStar # *+
195 default expr-cont doStar
199 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide
200 # between plain '+', '+?', '++'
203 '?' n expr-cont doNGPlus # +?
204 '+' n expr-cont doPossessivePlus # ++
205 default expr-cont doPlus # +
209 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide
210 # between plain '?', '??', '?+'
213 '?' n expr-cont doNGOpt # ??
214 '+' n expr-cont doPossessiveOpt # ?+
215 default expr-cont doOpt # ?
219 # Interval scanning a '{', the opening delimiter for an interval specification
220 # {number} or {min, max} or {min,}
223 digit_char interval-lower
224 default errorDeath doIntervalError
227 digit_char n interval-lower doIntevalLowerDigit
229 '}' n interval-type doIntervalSame # {n}
230 default errorDeath doIntervalError
233 digit_char n interval-upper doIntervalUpperDigit
235 default errorDeath doIntervalError
238 '?' n expr-cont doNGInterval # {n,m}?
239 '+' n expr-cont doPossessiveInterval # {n,m}+
240 default expr-cont doInterval # {n,m}
244 # backslash # Backslash. Figure out which of the \thingies we have encountered.
245 # The low level next-char function will have preprocessed
246 # some of them already; those won't come here.
248 'A' n term doBackslashA
249 'B' n term doBackslashB
250 'b' n term doBackslashb
251 'd' n expr-quant doBackslashd
252 'D' n expr-quant doBackslashD
253 'G' n term doBackslashG
254 'h' n expr-quant doBackslashh
255 'H' n expr-quant doBackslashH
257 'N' expr-quant doNamedChar # \N{NAME} named char
258 'p' expr-quant doProperty # \p{Lu} style property
259 'P' expr-quant doProperty
260 'R' n expr-quant doBackslashR
261 'Q' n term doEnterQuoteMode
262 'S' n expr-quant doBackslashS
263 's' n expr-quant doBackslashs
264 'v' n expr-quant doBackslashv
265 'V' n expr-quant doBackslashV
266 'W' n expr-quant doBackslashW
267 'w' n expr-quant doBackslashw
268 'X' n expr-quant doBackslashX
269 'Z' n term doBackslashZ
270 'z' n term doBackslashz
271 digit_char n expr-quant doBackRef # Will scan multiple digits
272 eof errorDeath doEscapeError
273 default n expr-quant doEscapedLiteralChar
276 # named-backref Scanned \k
277 # Leading to \k<captureName>
278 # Failure to get the full sequence is an error.
281 '<' n named-backref-2 doBeginNamedBackRef
282 default errorDeath doBadNamedCapture
285 ascii_letter n named-backref-3 doContinueNamedBackRef
286 default errorDeath doBadNamedCapture
289 ascii_letter n named-backref-3 doContinueNamedBackRef
290 digit_char n named-backref-3 doContinueNamedBackRef
291 '>' n expr-quant doCompleteNamedBackRef
292 default errorDeath doBadNamedCapture
296 # [set expression] parsing,
297 # All states involved in parsing set expressions have names beginning with "set-"
301 '^' n set-open2 doSetNegate
302 ':' set-posix doSetPosixProp
306 ']' n set-after-lit doSetLiteral
310 # scanned a '[:' If it really is a [:property:], doSetPosixProp will have
311 # moved the scan to the closing ']'. If it wasn't a property
312 # expression, the scan will still be at the opening ':', which should
313 # be interpreted as a normal set expression.
317 default errorDeath doRuleError # should not be possible.
320 # set-start after the [ and special case leading characters (^ and/or ]) but before
321 # everything else. A '-' is literal at this point.
325 '[' n set-open ^set-after-set doSetBeginUnion
329 default n set-after-lit doSetLiteral
331 # set-start-dash Turn "[--" into a syntax error.
332 # "[-x" is good, - and x are literals.
335 '-' errorDeath doRuleError
336 default set-after-lit doSetAddDash
338 # set-start-amp Turn "[&&" into a syntax error.
339 # "[&x" is good, & and x are literals.
342 '&' errorDeath doRuleError
343 default set-after-lit doSetAddAmp
346 # set-after-lit The last thing scanned was a literal character within a set.
347 # Can be followed by anything. Single '-' or '&' are
348 # literals in this context, not operators.
351 '[' n set-open ^set-after-set doSetBeginUnion
355 eof errorDeath doSetNoCloseError
356 default n set-after-lit doSetLiteral
360 '[' n set-open ^set-after-set doSetBeginUnion
364 eof errorDeath doSetNoCloseError
365 default n set-after-lit doSetLiteral
369 '[' n set-open ^set-after-set doSetBeginUnion
373 eof errorDeath doSetNoCloseError
374 default n set-after-lit doSetLiteral
379 # It is an error to close a set at this point.
382 '[' n set-open ^set-after-set doSetBeginUnion
383 ']' errorDeath doSetOpError
385 default n set-after-lit doSetLiteral
389 # Have scanned [[set]&
390 # Could be a '&' intersection operator, if a set follows.
391 # Could be the start of a '&&' operator.
392 # Otherwise is a literal.
394 '[' n set-open ^set-after-set doSetBeginIntersection1
395 '&' n set-after-op doSetIntersection2
396 default set-after-lit doSetAddAmp
399 # set-lit-amp Have scanned "[literals&"
400 # Could be a start of "&&" operator or a literal
401 # In [abc&[def]], the '&' is a literal
404 '&' n set-after-op doSetIntersection2
405 default set-after-lit doSetAddAmp
410 # Have scanned [set]-
411 # Could be a '-' difference operator, if a [set] follows.
412 # Could be the start of a '--' operator.
413 # Otherwise is a literal.
415 '[' n set-open ^set-after-set doSetBeginDifference1
416 '-' n set-after-op doSetDifference2
417 default set-after-lit doSetAddDash
422 # scanned a-b- or \w-
423 # any set or range like item where the trailing single '-' should
424 # be literal, not a set difference operation.
425 # A trailing "--" is still a difference operator.
427 '-' n set-after-op doSetDifference2
428 default set-after-lit doSetAddDash
432 '&' n set-after-op doSetIntersection2
433 default set-after-lit doSetAddAmp
437 # Have scanned "[literals-" Could be a range or a -- operator or a literal
438 # In [abc-[def]], the '-' is a literal (confirmed with a Java test)
439 # [abc-\p{xx} the '-' is an error
440 # [abc-] the '-' is a literal
441 # [ab-xy] the '-' is a range
444 '-' n set-after-op doSetDifference2
445 '[' set-after-lit doSetAddDash
446 ']' set-after-lit doSetAddDash
447 '\' n set-lit-dash-escape
448 default n set-after-range doSetRange
450 # set-lit-dash-escape
452 # scanned "[literal-\"
453 # Could be a range, if the \ introduces an escaped literal char or a named char.
454 # Otherwise it is an error.
457 's' errorDeath doSetOpError
458 'S' errorDeath doSetOpError
459 'w' errorDeath doSetOpError
460 'W' errorDeath doSetOpError
461 'd' errorDeath doSetOpError
462 'D' errorDeath doSetOpError
463 'N' set-after-range doSetNamedRange
464 default n set-after-range doSetRange
469 # Common back-slash escape processing within set expressions
472 'p' set-after-set doSetProp
473 'P' set-after-set doSetProp
474 'N' set-after-lit doSetNamedChar
475 's' n set-after-range doSetBackslash_s
476 'S' n set-after-range doSetBackslash_S
477 'w' n set-after-range doSetBackslash_w
478 'W' n set-after-range doSetBackslash_W
479 'd' n set-after-range doSetBackslash_d
480 'D' n set-after-range doSetBackslash_D
481 'h' n set-after-range doSetBackslash_h
482 'H' n set-after-range doSetBackslash_H
483 'v' n set-after-range doSetBackslash_v
484 'V' n set-after-range doSetBackslash_V
485 default n set-after-lit doSetLiteralEscaped
489 # Have just encountered the final ']' that completes a [set], and
490 # arrived here via a pop. From here, we exit the set parsing world, and go
491 # back to generic regular expression parsing.
494 default expr-quant doSetFinish
498 # errorDeath. This state is specified as the next state whenever a syntax error
499 # in the source rules is detected. Barring bugs, the state machine will never
500 # actually get here, but will stop because of the action associated with the error.
501 # But, just in case, this state asks the state machine to exit.
503 default n errorDeath doExit