]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | # Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | # License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | #***************************************************************************** |
4 | # | |
b331163b | 5 | # Copyright (C) 2002-2015, International Business Machines Corporation and others. |
b75a7d8f A |
6 | # All Rights Reserved. |
7 | # | |
8 | #***************************************************************************** | |
9 | # | |
10 | # file: regexcst.txt | |
11 | # ICU Regular Expression Parser State Table | |
12 | # | |
13 | # This state table is used when reading and parsing a regular expression pattern | |
14 | # The pattern parser uses a state machine; the data in this file define the | |
15 | # state transitions that occur for each input character. | |
16 | # | |
17 | # *** This file defines the regex pattern grammar. This is it. | |
18 | # *** The determination of what is accepted is here. | |
19 | # | |
20 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
21 | # that are then built with the pattern parser. | |
22 | # | |
23 | ||
24 | # | |
25 | # Here is the syntax of the state definitions in this file: | |
26 | # | |
27 | # | |
28 | #StateName: | |
46f4442e A |
29 | # input-char n next-state ^push-state action |
30 | # input-char n next-state ^push-state action | |
b75a7d8f A |
31 | # | | | | | |
32 | # | | | | |--- action to be performed by state machine | |
33 | # | | | | See function RegexCompile::doParseActions() | |
34 | # | | | | | |
35 | # | | | |--- Push this named state onto the state stack. | |
36 | # | | | Later, when next state is specified as "pop", | |
37 | # | | | the pushed state will become the current state. | |
38 | # | | | | |
39 | # | | |--- Transition to this state if the current input character matches the input | |
40 | # | | character or char class in the left hand column. "pop" causes the next | |
41 | # | | state to be popped from the state stack. | |
42 | # | | | |
43 | # | |--- When making the state transition specified on this line, advance to the next | |
44 | # | character from the input only if 'n' appears here. | |
45 | # | | |
46 | # |--- Character or named character classes to test for. If the current character being scanned | |
47 | # matches, perform the actions and go to the state specified on this line. | |
48 | # The input character is tested sequentially, in the order written. The characters and | |
49 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
46f4442e | 50 | # |
b75a7d8f A |
51 | |
52 | ||
53 | ||
54 | ||
55 | # | |
56 | # start state, scan position is at the beginning of the pattern. | |
57 | # | |
58 | start: | |
59 | default term doPatStart | |
b75a7d8f | 60 | |
46f4442e A |
61 | |
62 | ||
63 | ||
b75a7d8f A |
64 | # |
65 | # term. At a position where we can accept the start of most items in a pattern. | |
66 | # | |
67 | term: | |
68 | quoted n expr-quant doLiteralChar | |
69 | rule_char n expr-quant doLiteralChar | |
46f4442e A |
70 | '[' n set-open ^set-finish doSetBegin |
71 | '(' n open-paren | |
b75a7d8f | 72 | '.' n expr-quant doDotAny |
46f4442e A |
73 | '^' n expr-quant doCaret |
74 | '$' n expr-quant doDollar | |
b75a7d8f A |
75 | '\' n backslash |
76 | '|' n term doOrOperator | |
77 | ')' n pop doCloseParen | |
78 | eof term doPatFinish | |
79 | default errorDeath doRuleError | |
46f4442e | 80 | |
b75a7d8f A |
81 | |
82 | ||
83 | # | |
84 | # expr-quant We've just finished scanning a term, now look for the optional | |
85 | # trailing quantifier - *, +, ?, *?, etc. | |
86 | # | |
87 | expr-quant: | |
46f4442e A |
88 | '*' n quant-star |
89 | '+' n quant-plus | |
90 | '?' n quant-opt | |
b75a7d8f A |
91 | '{' n interval-open doIntervalInit |
92 | '(' n open-paren-quant | |
46f4442e A |
93 | default expr-cont |
94 | ||
95 | ||
b75a7d8f A |
96 | # |
97 | # expr-cont Expression, continuation. At a point where additional terms are | |
98 | # allowed, but not required. No Quantifiers | |
99 | # | |
100 | expr-cont: | |
101 | '|' n term doOrOperator | |
102 | ')' n pop doCloseParen | |
46f4442e A |
103 | default term |
104 | ||
b75a7d8f A |
105 | |
106 | # | |
107 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
108 | # e.g. x(?#comment )* | |
109 | # Open parens from expr-quant come here; anything but a (?# comment | |
110 | # branches into the normal parenthesis sequence as quickly as possible. | |
111 | # | |
112 | open-paren-quant: | |
113 | '?' n open-paren-quant2 doSuppressComments | |
114 | default open-paren | |
46f4442e | 115 | |
b75a7d8f A |
116 | open-paren-quant2: |
117 | '#' n paren-comment ^expr-quant | |
118 | default open-paren-extended | |
46f4442e A |
119 | |
120 | ||
b75a7d8f A |
121 | # |
122 | # open-paren We've got an open paren. We need to scan further to | |
123 | # determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever. | |
124 | # | |
125 | open-paren: | |
126 | '?' n open-paren-extended doSuppressComments | |
127 | default term ^expr-quant doOpenCaptureParen | |
46f4442e | 128 | |
b75a7d8f A |
129 | open-paren-extended: |
130 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
131 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
132 | '=' n term ^expr-cont doOpenLookAhead # (?= | |
133 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! | |
134 | '<' n open-paren-lookbehind | |
135 | '#' n paren-comment ^term | |
136 | 'i' paren-flag doBeginMatchMode | |
46f4442e | 137 | 'd' paren-flag doBeginMatchMode |
b75a7d8f A |
138 | 'm' paren-flag doBeginMatchMode |
139 | 's' paren-flag doBeginMatchMode | |
46f4442e | 140 | 'u' paren-flag doBeginMatchMode |
374ca955 | 141 | 'w' paren-flag doBeginMatchMode |
b75a7d8f A |
142 | 'x' paren-flag doBeginMatchMode |
143 | '-' paren-flag doBeginMatchMode | |
144 | '(' n errorDeath doConditionalExpr | |
145 | '{' n errorDeath doPerlInline | |
146 | default errorDeath doBadOpenParenType | |
46f4442e | 147 | |
b75a7d8f A |
148 | open-paren-lookbehind: |
149 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
150 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
b331163b | 151 | ascii_letter named-capture doBeginNamedCapture # (?<name |
b75a7d8f | 152 | default errorDeath doBadOpenParenType |
46f4442e | 153 | |
b75a7d8f A |
154 | |
155 | # | |
156 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' | |
b75a7d8f A |
157 | # |
158 | paren-comment: | |
159 | ')' n pop | |
160 | eof errorDeath doMismatchedParenErr | |
161 | default n paren-comment | |
162 | ||
163 | # | |
46f4442e A |
164 | # paren-flag Scanned a (?ismx-ismx flag setting |
165 | # | |
b75a7d8f A |
166 | paren-flag: |
167 | 'i' n paren-flag doMatchMode | |
46f4442e | 168 | 'd' n paren-flag doMatchMode |
b75a7d8f A |
169 | 'm' n paren-flag doMatchMode |
170 | 's' n paren-flag doMatchMode | |
46f4442e | 171 | 'u' n paren-flag doMatchMode |
374ca955 | 172 | 'w' n paren-flag doMatchMode |
b75a7d8f A |
173 | 'x' n paren-flag doMatchMode |
174 | '-' n paren-flag doMatchMode | |
175 | ')' n term doSetMatchMode | |
176 | ':' n term ^expr-quant doMatchModeParen | |
374ca955 | 177 | default errorDeath doBadModeFlag |
46f4442e | 178 | |
b331163b A |
179 | # |
180 | # named-capture (?<name> ... ), position currently on the name. | |
181 | # | |
182 | named-capture: | |
183 | ascii_letter n named-capture doContinueNamedCapture | |
184 | digit_char n named-capture doContinueNamedCapture | |
185 | '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture. | |
186 | default errorDeath doBadNamedCapture | |
46f4442e | 187 | |
b75a7d8f A |
188 | # |
189 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
190 | # between plain '*', '*?', '*+' | |
191 | # | |
192 | quant-star: | |
193 | '?' n expr-cont doNGStar # *? | |
194 | '+' n expr-cont doPossessiveStar # *+ | |
195 | default expr-cont doStar | |
196 | ||
197 | ||
198 | # | |
199 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
200 | # between plain '+', '+?', '++' | |
201 | # | |
202 | quant-plus: | |
203 | '?' n expr-cont doNGPlus # +? | |
204 | '+' n expr-cont doPossessivePlus # ++ | |
205 | default expr-cont doPlus | |
206 | ||
207 | ||
208 | # | |
209 | # quant-opt Scanning a '?' quantifier. Need to look ahead to decide | |
210 | # between plain '?', '??', '?+' | |
211 | # | |
212 | quant-opt: | |
213 | '?' n expr-cont doNGOpt # ?? | |
214 | '+' n expr-cont doPossessiveOpt # ?+ | |
215 | default expr-cont doOpt # ? | |
216 | ||
217 | ||
218 | # | |
219 | # Interval scanning a '{', the opening delimiter for an interval specification | |
46f4442e | 220 | # {number} or {min, max} or {min,} |
b75a7d8f A |
221 | # |
222 | interval-open: | |
46f4442e | 223 | digit_char interval-lower |
b75a7d8f | 224 | default errorDeath doIntervalError |
46f4442e | 225 | |
b75a7d8f A |
226 | interval-lower: |
227 | digit_char n interval-lower doIntevalLowerDigit | |
228 | ',' n interval-upper | |
229 | '}' n interval-type doIntervalSame # {n} | |
230 | default errorDeath doIntervalError | |
231 | ||
232 | interval-upper: | |
233 | digit_char n interval-upper doIntervalUpperDigit | |
234 | '}' n interval-type | |
235 | default errorDeath doIntervalError | |
46f4442e | 236 | |
b75a7d8f A |
237 | interval-type: |
238 | '?' n expr-cont doNGInterval # {n,m}? | |
239 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
240 | default expr-cont doInterval # {n,m} | |
46f4442e A |
241 | |
242 | ||
b75a7d8f A |
243 | # |
244 | # backslash # Backslash. Figure out which of the \thingies we have encountered. | |
245 | # The low level next-char function will have preprocessed | |
246 | # some of them already; those won't come here. | |
247 | backslash: | |
248 | 'A' n term doBackslashA | |
249 | 'B' n term doBackslashB | |
250 | 'b' n term doBackslashb | |
251 | 'd' n expr-quant doBackslashd | |
252 | 'D' n expr-quant doBackslashD | |
253 | 'G' n term doBackslashG | |
b331163b A |
254 | 'h' n expr-quant doBackslashh |
255 | 'H' n expr-quant doBackslashH | |
256 | 'k' n named-backref | |
46f4442e | 257 | 'N' expr-quant doNamedChar # \N{NAME} named char |
b75a7d8f A |
258 | 'p' expr-quant doProperty # \p{Lu} style property |
259 | 'P' expr-quant doProperty | |
b331163b | 260 | 'R' n expr-quant doBackslashR |
b75a7d8f A |
261 | 'Q' n term doEnterQuoteMode |
262 | 'S' n expr-quant doBackslashS | |
263 | 's' n expr-quant doBackslashs | |
b331163b A |
264 | 'v' n expr-quant doBackslashv |
265 | 'V' n expr-quant doBackslashV | |
b75a7d8f A |
266 | 'W' n expr-quant doBackslashW |
267 | 'w' n expr-quant doBackslashw | |
268 | 'X' n expr-quant doBackslashX | |
269 | 'Z' n term doBackslashZ | |
270 | 'z' n term doBackslashz | |
46f4442e | 271 | digit_char n expr-quant doBackRef # Will scan multiple digits |
b75a7d8f | 272 | eof errorDeath doEscapeError |
46f4442e A |
273 | default n expr-quant doEscapedLiteralChar |
274 | ||
b75a7d8f | 275 | |
b331163b A |
276 | # named-backref Scanned \k |
277 | # Leading to \k<captureName> | |
278 | # Failure to get the full sequence is an error. | |
279 | # | |
280 | named-backref: | |
281 | '<' n named-backref-2 doBeginNamedBackRef | |
282 | default errorDeath doBadNamedCapture | |
283 | ||
284 | named-backref-2: | |
285 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
286 | default errorDeath doBadNamedCapture | |
287 | ||
288 | named-backref-3: | |
289 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
290 | digit_char n named-backref-3 doContinueNamedBackRef | |
291 | '>' n expr-quant doCompleteNamedBackRef | |
292 | default errorDeath doBadNamedCapture | |
293 | ||
46f4442e A |
294 | |
295 | # | |
296 | # [set expression] parsing, | |
297 | # All states involved in parsing set expressions have names beginning with "set-" | |
298 | # | |
299 | ||
300 | set-open: | |
301 | '^' n set-open2 doSetNegate | |
302 | ':' set-posix doSetPosixProp | |
303 | default set-open2 | |
304 | ||
305 | set-open2: | |
306 | ']' n set-after-lit doSetLiteral | |
307 | default set-start | |
308 | ||
309 | # set-posix: | |
310 | # scanned a '[:' If it really is a [:property:], doSetPosixProp will have | |
311 | # moved the scan to the closing ']'. If it wasn't a property | |
312 | # expression, the scan will still be at the opening ':', which should | |
313 | # be interpreted as a normal set expression. | |
314 | set-posix: | |
315 | ']' n pop doSetEnd | |
316 | ':' set-start | |
317 | default errorDeath doRuleError # should not be possible. | |
318 | ||
319 | # | |
320 | # set-start after the [ and special case leading characters (^ and/or ]) but before | |
321 | # everything else. A '-' is literal at this point. | |
322 | # | |
323 | set-start: | |
324 | ']' n pop doSetEnd | |
325 | '[' n set-open ^set-after-set doSetBeginUnion | |
326 | '\' n set-escape | |
327 | '-' n set-start-dash | |
328 | '&' n set-start-amp | |
329 | default n set-after-lit doSetLiteral | |
330 | ||
331 | # set-start-dash Turn "[--" into a syntax error. | |
332 | # "[-x" is good, - and x are literals. | |
333 | # | |
334 | set-start-dash: | |
335 | '-' errorDeath doRuleError | |
336 | default set-after-lit doSetAddDash | |
337 | ||
338 | # set-start-amp Turn "[&&" into a syntax error. | |
339 | # "[&x" is good, & and x are literals. | |
340 | # | |
341 | set-start-amp: | |
342 | '&' errorDeath doRuleError | |
343 | default set-after-lit doSetAddAmp | |
344 | ||
345 | # | |
346 | # set-after-lit The last thing scanned was a literal character within a set. | |
347 | # Can be followed by anything. Single '-' or '&' are | |
348 | # literals in this context, not operators. | |
349 | set-after-lit: | |
350 | ']' n pop doSetEnd | |
351 | '[' n set-open ^set-after-set doSetBeginUnion | |
352 | '-' n set-lit-dash | |
353 | '&' n set-lit-amp | |
354 | '\' n set-escape | |
355 | eof errorDeath doSetNoCloseError | |
356 | default n set-after-lit doSetLiteral | |
357 | ||
358 | set-after-set: | |
359 | ']' n pop doSetEnd | |
360 | '[' n set-open ^set-after-set doSetBeginUnion | |
361 | '-' n set-set-dash | |
362 | '&' n set-set-amp | |
363 | '\' n set-escape | |
364 | eof errorDeath doSetNoCloseError | |
365 | default n set-after-lit doSetLiteral | |
366 | ||
367 | set-after-range: | |
368 | ']' n pop doSetEnd | |
369 | '[' n set-open ^set-after-set doSetBeginUnion | |
370 | '-' n set-range-dash | |
371 | '&' n set-range-amp | |
372 | '\' n set-escape | |
373 | eof errorDeath doSetNoCloseError | |
374 | default n set-after-lit doSetLiteral | |
b75a7d8f | 375 | |
46f4442e A |
376 | |
377 | # set-after-op | |
378 | # After a -- or && | |
379 | # It is an error to close a set at this point. | |
380 | # | |
381 | set-after-op: | |
382 | '[' n set-open ^set-after-set doSetBeginUnion | |
383 | ']' errorDeath doSetOpError | |
384 | '\' n set-escape | |
385 | default n set-after-lit doSetLiteral | |
386 | ||
387 | # | |
388 | # set-set-amp | |
389 | # Have scanned [[set]& | |
390 | # Could be a '&' intersection operator, if a set follows. | |
391 | # Could be the start of a '&&' operator. | |
392 | # Otherwise it is a literal. | |
393 | set-set-amp: | |
394 | '[' n set-open ^set-after-set doSetBeginIntersection1 | |
395 | '&' n set-after-op doSetIntersection2 | |
396 | default set-after-lit doSetAddAmp | |
397 | ||
398 | ||
399 | # set-lit-amp Have scanned "[literals&" | |
400 | # Could be a start of "&&" operator or a literal | |
401 | # In [abc&[def]], the '&' is a literal | |
402 | # | |
403 | set-lit-amp: | |
404 | '&' n set-after-op doSetIntersection2 | |
405 | default set-after-lit doSetAddAmp | |
406 | ||
407 | ||
408 | # | |
409 | # set-set-dash | |
410 | # Have scanned [set]- | |
411 | # Could be a '-' difference operator, if a [set] follows. | |
412 | # Could be the start of a '--' operator. | |
413 | # Otherwise it is a literal. | |
414 | set-set-dash: | |
415 | '[' n set-open ^set-after-set doSetBeginDifference1 | |
416 | '-' n set-after-op doSetDifference2 | |
417 | default set-after-lit doSetAddDash | |
418 | ||
419 | ||
420 | # | |
421 | # set-range-dash | |
422 | # scanned a-b- or \w- | |
423 | # any set or range like item where the trailing single '-' should | |
424 | # be literal, not a set difference operation. | |
425 | # A trailing "--" is still a difference operator. | |
426 | set-range-dash: | |
427 | '-' n set-after-op doSetDifference2 | |
428 | default set-after-lit doSetAddDash | |
429 | ||
430 | ||
431 | set-range-amp: | |
432 | '&' n set-after-op doSetIntersection2 | |
433 | default set-after-lit doSetAddAmp | |
434 | ||
435 | ||
436 | # set-lit-dash | |
437 | # Have scanned "[literals-" Could be a range or a -- operator or a literal | |
438 | # In [abc-[def]], the '-' is a literal (confirmed with a Java test) | |
439 | # [abc-\p{xx} the '-' is an error | |
440 | # [abc-] the '-' is a literal | |
441 | # [ab-xy] the '-' is a range | |
442 | # | |
443 | set-lit-dash: | |
444 | '-' n set-after-op doSetDifference2 | |
445 | '[' set-after-lit doSetAddDash | |
446 | ']' set-after-lit doSetAddDash | |
447 | '\' n set-lit-dash-escape | |
448 | default n set-after-range doSetRange | |
449 | ||
450 | # set-lit-dash-escape | |
451 | # | |
452 | # scanned "[literal-\" | |
453 | # Could be a range, if the \ introduces an escaped literal char or a named char. | |
454 | # Otherwise it is an error. | |
455 | # | |
456 | set-lit-dash-escape: | |
457 | 's' errorDeath doSetOpError | |
458 | 'S' errorDeath doSetOpError | |
459 | 'w' errorDeath doSetOpError | |
460 | 'W' errorDeath doSetOpError | |
461 | 'd' errorDeath doSetOpError | |
462 | 'D' errorDeath doSetOpError | |
463 | 'N' set-after-range doSetNamedRange | |
464 | default n set-after-range doSetRange | |
465 | ||
466 | ||
467 | # | |
468 | # set-escape | |
469 | # Common back-slash escape processing within set expressions | |
470 | # | |
471 | set-escape: | |
472 | 'p' set-after-set doSetProp | |
473 | 'P' set-after-set doSetProp | |
474 | 'N' set-after-lit doSetNamedChar | |
475 | 's' n set-after-range doSetBackslash_s | |
476 | 'S' n set-after-range doSetBackslash_S | |
477 | 'w' n set-after-range doSetBackslash_w | |
478 | 'W' n set-after-range doSetBackslash_W | |
479 | 'd' n set-after-range doSetBackslash_d | |
480 | 'D' n set-after-range doSetBackslash_D | |
b331163b A |
481 | 'h' n set-after-range doSetBackslash_h |
482 | 'H' n set-after-range doSetBackslash_H | |
483 | 'v' n set-after-range doSetBackslash_v | |
484 | 'V' n set-after-range doSetBackslash_V | |
46f4442e A |
485 | default n set-after-lit doSetLiteralEscaped |
486 | ||
487 | # | |
488 | # set-finish | |
489 | # Have just encountered the final ']' that completes a [set], and | |
490 | # arrived here via a pop. From here, we exit the set parsing world, and go | |
491 | # back to generic regular expression parsing. | |
492 | # | |
493 | set-finish: | |
494 | default expr-quant doSetFinish | |
495 | ||
496 | ||
b75a7d8f A |
497 | # |
498 | # errorDeath. This state is specified as the next state whenever a syntax error | |
499 | # in the source pattern is detected. Barring bugs, the state machine will never | |
500 | # actually get here, but will stop because of the action associated with the error. | |
501 | # But, just in case, this state asks the state machine to exit. | |
502 | errorDeath: | |
503 | default n errorDeath doExit | |
504 | ||
505 |