2 #*****************************************************************************
4 # Copyright (C) 2002-2007, International Business Machines Corporation and others.
7 #*****************************************************************************
10 # ICU Regular Expression Parser State Table
12 # This state table is used when reading and parsing a regular expression pattern
13 # The pattern parser uses a state machine; the data in this file define the
14 # state transitions that occur for each input character.
16 # *** This file defines the regex pattern grammar. This is it.
17 # *** The determination of what is accepted is here.
19 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20 # that are then built with the rule parser.
24 # Here is the syntax of the state definitions in this file:
28 # input-char n next-state ^push-state action
29 # input-char n next-state ^push-state action
31 # | | | | |--- action to be performed by state machine
32 # | | | | See function RegexCompile::doParseActions()
34 # | | | |--- Push this named state onto the state stack.
35 # | | | Later, when next state is specified as "pop",
36 # | | | the pushed state will become the current state.
38 # | | |--- Transition to this state if the current input character matches the input
39 # | | character or char class in the left hand column. "pop" causes the next
40 # | | state to be popped from the state stack.
42 # | |--- When making the state transition specified on this line, advance to the next
43 # | character from the input only if 'n' appears here.
45 # |--- Character or named character classes to test for. If the current character being scanned
46 # matches, perform the actions and go to the state specified on this line.
47 # The input character is tested sequentially, in the order written. The characters and
48 # character classes tested for do not need to be mutually exclusive. The first match wins.
55 # start state, scan position is at the beginning of the pattern.
58 default term doPatStart
64 # term. At a position where we can accept the start of most items in a pattern.
67 quoted n expr-quant doLiteralChar
68 rule_char n expr-quant doLiteralChar
69 '[' n set-open ^set-finish doSetBegin
71 '.' n expr-quant doDotAny
72 '^' n expr-quant doCaret
73 '$' n expr-quant doDollar
75 '|' n term doOrOperator
76 ')' n pop doCloseParen
78 default errorDeath doRuleError
83 # expr-quant We've just finished scanning a term, now look for the optional
84 # trailing quantifier - *, +, ?, *?, etc.
90 '{' n interval-open doIntervalInit
91 '(' n open-paren-quant
96 # expr-cont Expression, continuation. At a point where additional terms are
97 # allowed, but not required. No Quantifiers
100 '|' n term doOrOperator
101 ')' n pop doCloseParen
106 # open-paren-quant Special case handling for comments appearing before a quantifier,
107 # e.g. x(?#comment )*
108 # Open parens from expr-quant come here; anything but a (?# comment
109 # branches into the normal parenthesis sequence as quickly as possible.
112 '?' n open-paren-quant2 doSuppressComments
116 '#' n paren-comment ^expr-quant
117 default open-paren-extended
121 # open-paren We've got an open paren. We need to scan further to
122 # determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
125 '?' n open-paren-extended doSuppressComments
126 default term ^expr-quant doOpenCaptureParen
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
130 '>' n term ^expr-quant doOpenAtomicParen # (?>
131 '=' n term ^expr-cont doOpenLookAhead # (?=
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?!
133 '<' n open-paren-lookbehind
134 '#' n paren-comment ^term
135 'i' paren-flag doBeginMatchMode
136 'd' paren-flag doBeginMatchMode
137 'm' paren-flag doBeginMatchMode
138 's' paren-flag doBeginMatchMode
139 'u' paren-flag doBeginMatchMode
140 'w' paren-flag doBeginMatchMode
141 'x' paren-flag doBeginMatchMode
142 '-' paren-flag doBeginMatchMode
143 '(' n errorDeath doConditionalExpr
144 '{' n errorDeath doPerlInline
145 default errorDeath doBadOpenParenType
# open-paren-lookbehind   Have scanned "(?<". The next character must select one of the
#                         two look-behind forms, "(?<=" or "(?<!". Any other character
#                         takes the default row and goes to errorDeath with doBadOpenParenType.
147 open-paren-lookbehind:
148 '=' n term ^expr-cont doOpenLookBehind # (?<=
149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
150 default errorDeath doBadOpenParenType
154 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
158 eof errorDeath doMismatchedParenErr
159 default n paren-comment
162 # paren-flag Scanned a (?ismx-ismx flag setting
165 'i' n paren-flag doMatchMode
166 'd' n paren-flag doMatchMode
167 'm' n paren-flag doMatchMode
168 's' n paren-flag doMatchMode
169 'u' n paren-flag doMatchMode
170 'w' n paren-flag doMatchMode
171 'x' n paren-flag doMatchMode
172 '-' n paren-flag doMatchMode
173 ')' n term doSetMatchMode
174 ':' n term ^expr-quant doMatchModeParen
175 default errorDeath doBadModeFlag
179 # quant-star Scanning a '*' quantifier. Need to look ahead to decide
180 # between plain '*', '*?', '*+'
183 '?' n expr-cont doNGStar # *?
184 '+' n expr-cont doPossessiveStar # *+
185 default expr-cont doStar
189 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide
190 # between plain '+', '+?', '++'
193 '?' n expr-cont doNGPlus # +?
194 '+' n expr-cont doPossessivePlus # ++
195 default expr-cont doPlus # +
199 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide
200 # between plain '?', '??', '?+'
203 '?' n expr-cont doNGOpt # ??
204 '+' n expr-cont doPossessiveOpt # ?+
205 default expr-cont doOpt # ?
209 # Interval scanning a '{', the opening delimiter for an interval specification
210 # {number} or {min, max} or {min,}
213 digit_char interval-lower
214 default errorDeath doIntervalError
217 digit_char n interval-lower doIntevalLowerDigit
219 '}' n interval-type doIntervalSame # {n}
220 default errorDeath doIntervalError
223 digit_char n interval-upper doIntervalUpperDigit
225 default errorDeath doIntervalError
228 '?' n expr-cont doNGInterval # {n,m}?
229 '+' n expr-cont doPossessiveInterval # {n,m}+
230 default expr-cont doInterval # {n,m}
234 # backslash Backslash. Figure out which of the \thingies we have encountered.
235 # The low level next-char function will have preprocessed
236 # some of them already; those won't come here.
238 'A' n term doBackslashA
239 'B' n term doBackslashB
240 'b' n term doBackslashb
241 'd' n expr-quant doBackslashd
242 'D' n expr-quant doBackslashD
243 'G' n term doBackslashG
244 'N' expr-quant doNamedChar # \N{NAME} named char
245 'p' expr-quant doProperty # \p{Lu} style property
246 'P' expr-quant doProperty
247 'Q' n term doEnterQuoteMode
248 'S' n expr-quant doBackslashS
249 's' n expr-quant doBackslashs
250 'W' n expr-quant doBackslashW
251 'w' n expr-quant doBackslashw
252 'X' n expr-quant doBackslashX
253 'Z' n term doBackslashZ
254 'z' n term doBackslashz
255 digit_char n expr-quant doBackRef # Will scan multiple digits
256 eof errorDeath doEscapeError
257 default n expr-quant doEscapedLiteralChar
262 # [set expression] parsing,
263 # All states involved in parsing set expressions have names beginning with "set-"
267 '^' n set-open2 doSetNegate
268 ':' set-posix doSetPosixProp
272 ']' n set-after-lit doSetLiteral
276 # scanned a '[:' If it really is a [:property:], doSetPosixProp will have
277 # moved the scan to the closing ']'. If it wasn't a property
278 # expression, the scan will still be at the opening ':', which should
279 # be interpreted as a normal set expression.
283 default errorDeath doRuleError # should not be possible.
286 # set-start after the [ and special case leading characters (^ and/or ]) but before
287 # everything else. A '-' is literal at this point.
291 '[' n set-open ^set-after-set doSetBeginUnion
295 default n set-after-lit doSetLiteral
297 # set-start-dash Turn "[--" into a syntax error.
298 # "[-x" is good, - and x are literals.
301 '-' errorDeath doRuleError
302 default set-after-lit doSetAddDash
304 # set-start-amp Turn "[&&" into a syntax error.
305 # "[&x" is good, & and x are literals.
308 '&' errorDeath doRuleError
309 default set-after-lit doSetAddAmp
312 # set-after-lit The last thing scanned was a literal character within a set.
313 # Can be followed by anything. Single '-' or '&' are
314 # literals in this context, not operators.
317 '[' n set-open ^set-after-set doSetBeginUnion
321 eof errorDeath doSetNoCloseError
322 default n set-after-lit doSetLiteral
326 '[' n set-open ^set-after-set doSetBeginUnion
330 eof errorDeath doSetNoCloseError
331 default n set-after-lit doSetLiteral
335 '[' n set-open ^set-after-set doSetBeginUnion
339 eof errorDeath doSetNoCloseError
340 default n set-after-lit doSetLiteral
345 # It is an error to close a set at this point.
348 '[' n set-open ^set-after-set doSetBeginUnion
349 ']' errorDeath doSetOpError
351 default n set-after-lit doSetLiteral
355 # Have scanned [[set]&
356 # Could be a '&' intersection operator, if a set follows.
357 # Could be the start of a '&&' operator.
358 # Otherwise is a literal.
360 '[' n set-open ^set-after-set doSetBeginIntersection1
361 '&' n set-after-op doSetIntersection2
362 default set-after-lit doSetAddAmp
365 # set-lit-amp Have scanned "[literals&"
366 # Could be a start of "&&" operator or a literal
367 # In [abc&[def]], the '&' is a literal
370 '&' n set-after-op doSetIntersection2
371 default set-after-lit doSetAddAmp
376 # Have scanned [set]-
377 # Could be a '-' difference operator, if a [set] follows.
378 # Could be the start of a '--' operator.
379 # Otherwise is a literal.
381 '[' n set-open ^set-after-set doSetBeginDifference1
382 '-' n set-after-op doSetDifference2
383 default set-after-lit doSetAddDash
388 # scanned a-b- or \w-
389 # any set or range like item where the trailing single '-' should
390 # be literal, not a set difference operation.
391 # A trailing "--" is still a difference operator.
393 '-' n set-after-op doSetDifference2
394 default set-after-lit doSetAddDash
398 '&' n set-after-op doSetIntersection2
399 default set-after-lit doSetAddAmp
403 # Have scanned "[literals-" Could be a range or a -- operator or a literal
404 # In [abc-[def]], the '-' is a literal (confirmed with a Java test)
405 # [abc-\p{xx} the '-' is an error
406 # [abc-] the '-' is a literal
407 # [ab-xy] the '-' is a range
410 '-' n set-after-op doSetDifference2
411 '[' set-after-lit doSetAddDash
412 ']' set-after-lit doSetAddDash
413 '\' n set-lit-dash-escape
414 default n set-after-range doSetRange
416 # set-lit-dash-escape
418 # scanned "[literal-\"
419 # Could be a range, if the \ introduces an escaped literal char or a named char.
420 # Otherwise it is an error.
423 's' errorDeath doSetOpError
424 'S' errorDeath doSetOpError
425 'w' errorDeath doSetOpError
426 'W' errorDeath doSetOpError
427 'd' errorDeath doSetOpError
428 'D' errorDeath doSetOpError
429 'N' set-after-range doSetNamedRange
430 default n set-after-range doSetRange
435 # Common back-slash escape processing within set expressions
438 'p' set-after-set doSetProp
439 'P' set-after-set doSetProp
440 'N' set-after-lit doSetNamedChar
441 's' n set-after-range doSetBackslash_s
442 'S' n set-after-range doSetBackslash_S
443 'w' n set-after-range doSetBackslash_w
444 'W' n set-after-range doSetBackslash_W
445 'd' n set-after-range doSetBackslash_d
446 'D' n set-after-range doSetBackslash_D
447 default n set-after-lit doSetLiteralEscaped
451 # Have just encountered the final ']' that completes a [set], and
452 # arrived here via a pop. From here, we exit the set parsing world, and go
453 # back to generic regular expression parsing.
456 default expr-quant doSetFinish
460 # errorDeath. This state is specified as the next state whenever a syntax error
461 # in the source rules is detected. Barring bugs, the state machine will never
462 # actually get here, but will stop because of the action associated with the error.
463 # But, just in case, this state asks the state machine to exit.
465 default n errorDeath doExit