]>
Commit | Line | Data |
---|---|---|
1 | # Copyright (C) 2016 and later: Unicode, Inc. and others. | |
2 | # License & terms of use: http://www.unicode.org/copyright.html | |
3 | #***************************************************************************** | |
4 | # | |
5 | # Copyright (C) 2002-2015, International Business Machines Corporation and others. | |
6 | # All Rights Reserved. | |
7 | # | |
8 | #***************************************************************************** | |
9 | # | |
10 | # file: regexcst.txt | |
11 | # ICU Regular Expression Parser State Table | |
12 | # | |
13 | # This state table is used when reading and parsing a regular expression pattern | |
14 | # The pattern parser uses a state machine; the data in this file define the | |
15 | # state transitions that occur for each input character. | |
16 | # | |
17 | # *** This file defines the regex pattern grammar. This is it. | |
18 | # *** The determination of what is accepted is here. | |
19 | # | |
20 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
21 | # that are then built into the regular expression pattern parser. | |
22 | # | |
23 | ||
24 | # | |
25 | # Here is the syntax of the state definitions in this file: | |
26 | # | |
27 | # | |
28 | #StateName: | |
29 | # input-char n next-state ^push-state action | |
30 | # input-char n next-state ^push-state action | |
31 | # | | | | | | |
32 | # | | | | |--- action to be performed by state machine | |
33 | # | | | | See function RegexCompile::doParseActions() | |
34 | # | | | | | |
35 | # | | | |--- Push this named state onto the state stack. | |
36 | # | | | Later, when next state is specified as "pop", | |
37 | # | | | the pushed state will become the current state. | |
38 | # | | | | |
39 | # | | |--- Transition to this state if the current input character matches the input | |
40 | # | | character or char class in the left hand column. "pop" causes the next | |
41 | # | | state to be popped from the state stack. | |
42 | # | | | |
43 | # | |--- When making the state transition specified on this line, advance to the next | |
44 | # | character from the input only if 'n' appears here. | |
45 | # | | |
46 | # |--- Character or named character classes to test for. If the current character being scanned | |
47 | # matches, perform the actions and go to the state specified on this line. | |
48 | # The input character is tested sequentially, in the order written. The characters and | |
49 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
50 | # | |
51 | ||
52 | ||
53 | ||
54 | ||
55 | # | |
56 | # start state, scan position is at the beginning of the pattern. | |
57 | # | |
58 | start: | |
59 | default term doPatStart | |
60 | ||
61 | ||
62 | ||
63 | ||
64 | # | |
65 | # term. At a position where we can accept the start of most items in a pattern. | |
66 | # | |
67 | term: | |
68 | quoted n expr-quant doLiteralChar | |
69 | rule_char n expr-quant doLiteralChar | |
70 | '[' n set-open ^set-finish doSetBegin | |
71 | '(' n open-paren | |
72 | '.' n expr-quant doDotAny | |
73 | '^' n expr-quant doCaret | |
74 | '$' n expr-quant doDollar | |
75 | '\' n backslash | |
76 | '|' n term doOrOperator | |
77 | ')' n pop doCloseParen | |
78 | eof term doPatFinish | |
79 | default errorDeath doRuleError | |
80 | ||
81 | ||
82 | ||
83 | # | |
84 | # expr-quant We've just finished scanning a term, now look for the optional | |
85 | # trailing quantifier - *, +, ?, *?, etc. | |
86 | # | |
87 | expr-quant: | |
88 | '*' n quant-star | |
89 | '+' n quant-plus | |
90 | '?' n quant-opt | |
91 | '{' n interval-open doIntervalInit | |
92 | '(' n open-paren-quant | |
93 | default expr-cont | |
94 | ||
95 | ||
96 | # | |
97 | # expr-cont Expression, continuation. At a point where additional terms are | |
98 | # allowed, but not required. No Quantifiers | |
99 | # | |
100 | expr-cont: | |
101 | '|' n term doOrOperator | |
102 | ')' n pop doCloseParen | |
103 | default term | |
104 | ||
105 | ||
106 | # | |
107 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
108 | # e.g. x(?#comment )* | |
109 | # Open parens from expr-quant come here; anything but a (?# comment | |
110 | # branches into the normal parenthesis sequence as quickly as possible. | |
111 | # | |
112 | open-paren-quant: | |
113 | '?' n open-paren-quant2 doSuppressComments | |
114 | default open-paren | |
115 | ||
116 | open-paren-quant2: | |
117 | '#' n paren-comment ^expr-quant | |
118 | default open-paren-extended | |
119 | ||
120 | ||
121 | # | |
122 | # open-paren We've got an open paren. We need to scan further to | |
123 | # determine what kind of group it is - plain (, (?:, (?>, or whatever. | |
124 | # | |
125 | open-paren: | |
126 | '?' n open-paren-extended doSuppressComments | |
127 | default term ^expr-quant doOpenCaptureParen | |
128 | ||
129 | open-paren-extended: | |
130 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
131 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
132 | '=' n term ^expr-cont doOpenLookAhead # (?= | |
133 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! | |
134 | '<' n open-paren-lookbehind # (?< | |
135 | '#' n paren-comment ^term # (?# | |
136 | 'i' paren-flag doBeginMatchMode # (?i No 'n' on the flag rows: the flag char is left for paren-flag to scan. | |
137 | 'd' paren-flag doBeginMatchMode # (?d | |
138 | 'm' paren-flag doBeginMatchMode # (?m | |
139 | 's' paren-flag doBeginMatchMode # (?s | |
140 | 'u' paren-flag doBeginMatchMode # (?u | |
141 | 'w' paren-flag doBeginMatchMode # (?w | |
142 | 'x' paren-flag doBeginMatchMode # (?x | |
143 | '-' paren-flag doBeginMatchMode # (?- | |
144 | '(' n errorDeath doConditionalExpr # (?( goes to errorDeath | |
145 | '{' n errorDeath doPerlInline # (?{ goes to errorDeath | |
146 | default errorDeath doBadOpenParenType | |
147 | ||
148 | open-paren-lookbehind: | |
149 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
150 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
151 | ascii_letter named-capture doBeginNamedCapture # (?<name | |
152 | default errorDeath doBadOpenParenType | |
153 | ||
154 | ||
155 | # | |
156 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' | |
157 | # | |
158 | paren-comment: | |
159 | ')' n pop | |
160 | eof errorDeath doMismatchedParenErr | |
161 | default n paren-comment | |
162 | ||
163 | # | |
164 | # paren-flag Scanned a (?ismx-ismx flag setting | |
165 | # | |
166 | paren-flag: | |
167 | 'i' n paren-flag doMatchMode | |
168 | 'd' n paren-flag doMatchMode | |
169 | 'm' n paren-flag doMatchMode | |
170 | 's' n paren-flag doMatchMode | |
171 | 'u' n paren-flag doMatchMode | |
172 | 'w' n paren-flag doMatchMode | |
173 | 'x' n paren-flag doMatchMode | |
174 | '-' n paren-flag doMatchMode | |
175 | ')' n term doSetMatchMode | |
176 | ':' n term ^expr-quant doMatchModeParen | |
177 | default errorDeath doBadModeFlag | |
178 | ||
179 | # | |
180 | # named-capture (?<name> ... ), position currently on the name. | |
181 | # | |
182 | named-capture: | |
183 | ascii_letter n named-capture doContinueNamedCapture | |
184 | digit_char n named-capture doContinueNamedCapture | |
185 | '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture. | |
186 | default errorDeath doBadNamedCapture | |
187 | ||
188 | # | |
189 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
190 | # between plain '*', '*?', '*+' | |
191 | # | |
192 | quant-star: | |
193 | '?' n expr-cont doNGStar # *? | |
194 | '+' n expr-cont doPossessiveStar # *+ | |
195 | default expr-cont doStar # * | |
196 | ||
197 | ||
198 | # | |
199 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
200 | # between plain '+', '+?', '++' | |
201 | # | |
202 | quant-plus: | |
203 | '?' n expr-cont doNGPlus # +? | |
204 | '+' n expr-cont doPossessivePlus # ++ | |
205 | default expr-cont doPlus # + | |
206 | ||
207 | ||
208 | # | |
209 | # quant-opt Scanning a '?' quantifier. Need to look ahead to decide | |
210 | # between plain '?', '??', '?+' | |
211 | # | |
212 | quant-opt: | |
213 | '?' n expr-cont doNGOpt # ?? | |
214 | '+' n expr-cont doPossessiveOpt # ?+ | |
215 | default expr-cont doOpt # ? | |
216 | ||
217 | ||
218 | # | |
219 | # Interval scanning a '{', the opening delimiter for an interval specification | |
220 | # {number} or {min, max} or {min,} | |
221 | # | |
222 | interval-open: | |
223 | digit_char interval-lower | |
224 | default errorDeath doIntervalError | |
225 | ||
226 | interval-lower: | |
227 | digit_char n interval-lower doIntevalLowerDigit | |
228 | ',' n interval-upper | |
229 | '}' n interval-type doIntervalSame # {n} | |
230 | default errorDeath doIntervalError | |
231 | ||
232 | interval-upper: | |
233 | digit_char n interval-upper doIntervalUpperDigit | |
234 | '}' n interval-type | |
235 | default errorDeath doIntervalError | |
236 | ||
237 | interval-type: | |
238 | '?' n expr-cont doNGInterval # {n,m}? | |
239 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
240 | default expr-cont doInterval # {n,m} | |
241 | ||
242 | ||
243 | # | |
244 | # backslash # Backslash. Figure out which of the \thingies we have encountered. | |
245 | # The low level next-char function will have preprocessed | |
246 | # some of them already; those won't come here. | |
247 | backslash: | |
248 | 'A' n term doBackslashA | |
249 | 'B' n term doBackslashB | |
250 | 'b' n term doBackslashb | |
251 | 'd' n expr-quant doBackslashd | |
252 | 'D' n expr-quant doBackslashD | |
253 | 'G' n term doBackslashG | |
254 | 'h' n expr-quant doBackslashh | |
255 | 'H' n expr-quant doBackslashH | |
256 | 'k' n named-backref | |
257 | 'N' expr-quant doNamedChar # \N{NAME} named char | |
258 | 'p' expr-quant doProperty # \p{Lu} style property | |
259 | 'P' expr-quant doProperty | |
260 | 'R' n expr-quant doBackslashR | |
261 | 'Q' n term doEnterQuoteMode | |
262 | 'S' n expr-quant doBackslashS | |
263 | 's' n expr-quant doBackslashs | |
264 | 'v' n expr-quant doBackslashv | |
265 | 'V' n expr-quant doBackslashV | |
266 | 'W' n expr-quant doBackslashW | |
267 | 'w' n expr-quant doBackslashw | |
268 | 'X' n expr-quant doBackslashX | |
269 | 'Z' n term doBackslashZ | |
270 | 'z' n term doBackslashz | |
271 | digit_char n expr-quant doBackRef # Will scan multiple digits | |
272 | eof errorDeath doEscapeError | |
273 | default n expr-quant doEscapedLiteralChar | |
274 | ||
275 | ||
276 | # named-backref Scanned \k | |
277 | # Leading to \k<captureName> | |
278 | # Failure to get the full sequence is an error. | |
279 | # | |
280 | named-backref: | |
281 | '<' n named-backref-2 doBeginNamedBackRef | |
282 | default errorDeath doBadNamedCapture | |
283 | ||
284 | named-backref-2: | |
285 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
286 | default errorDeath doBadNamedCapture | |
287 | ||
288 | named-backref-3: | |
289 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
290 | digit_char n named-backref-3 doContinueNamedBackRef | |
291 | '>' n expr-quant doCompleteNamedBackRef | |
292 | default errorDeath doBadNamedCapture | |
293 | ||
294 | ||
295 | # | |
296 | # [set expression] parsing, | |
297 | # All states involved in parsing set expressions have names beginning with "set-" | |
298 | # | |
299 | ||
300 | set-open: | |
301 | '^' n set-open2 doSetNegate | |
302 | ':' set-posix doSetPosixProp | |
303 | default set-open2 | |
304 | ||
305 | set-open2: | |
306 | ']' n set-after-lit doSetLiteral | |
307 | default set-start | |
308 | ||
309 | # set-posix: | |
310 | # scanned a '[:' If it really is a [:property:], doSetPosixProp will have | |
311 | # moved the scan to the closing ']'. If it wasn't a property | |
312 | # expression, the scan will still be at the opening ':', which should | |
313 | # be interpreted as a normal set expression. | |
314 | set-posix: | |
315 | ']' n pop doSetEnd | |
316 | ':' set-start | |
317 | default errorDeath doRuleError # should not be possible. | |
318 | ||
319 | # | |
320 | # set-start after the [ and special case leading characters (^ and/or ]) but before | |
321 | # everything else. A '-' is literal at this point. | |
322 | # | |
323 | set-start: | |
324 | ']' n pop doSetEnd | |
325 | '[' n set-open ^set-after-set doSetBeginUnion | |
326 | '\' n set-escape | |
327 | '-' n set-start-dash | |
328 | '&' n set-start-amp | |
329 | default n set-after-lit doSetLiteral | |
330 | ||
331 | # set-start-dash Turn "[--" into a syntax error. | |
332 | # "[-x" is good, - and x are literals. | |
333 | # | |
334 | set-start-dash: | |
335 | '-' errorDeath doRuleError | |
336 | default set-after-lit doSetAddDash | |
337 | ||
338 | # set-start-amp Turn "[&&" into a syntax error. | |
339 | # "[&x" is good, & and x are literals. | |
340 | # | |
341 | set-start-amp: | |
342 | '&' errorDeath doRuleError | |
343 | default set-after-lit doSetAddAmp | |
344 | ||
345 | # | |
346 | # set-after-lit The last thing scanned was a literal character within a set. | |
347 | # Can be followed by anything. Single '-' or '&' are | |
348 | # literals in this context, not operators. | |
349 | set-after-lit: | |
350 | ']' n pop doSetEnd | |
351 | '[' n set-open ^set-after-set doSetBeginUnion | |
352 | '-' n set-lit-dash | |
353 | '&' n set-lit-amp | |
354 | '\' n set-escape | |
355 | eof errorDeath doSetNoCloseError | |
356 | default n set-after-lit doSetLiteral | |
357 | ||
358 | set-after-set: | |
359 | ']' n pop doSetEnd | |
360 | '[' n set-open ^set-after-set doSetBeginUnion | |
361 | '-' n set-set-dash | |
362 | '&' n set-set-amp | |
363 | '\' n set-escape | |
364 | eof errorDeath doSetNoCloseError | |
365 | default n set-after-lit doSetLiteral | |
366 | ||
367 | set-after-range: | |
368 | ']' n pop doSetEnd | |
369 | '[' n set-open ^set-after-set doSetBeginUnion | |
370 | '-' n set-range-dash | |
371 | '&' n set-range-amp | |
372 | '\' n set-escape | |
373 | eof errorDeath doSetNoCloseError | |
374 | default n set-after-lit doSetLiteral | |
375 | ||
376 | ||
377 | # set-after-op | |
378 | # After a -- or && | |
379 | # It is an error to close a set at this point. | |
380 | # | |
381 | set-after-op: | |
382 | '[' n set-open ^set-after-set doSetBeginUnion | |
383 | ']' errorDeath doSetOpError | |
384 | '\' n set-escape | |
385 | default n set-after-lit doSetLiteral | |
386 | ||
387 | # | |
388 | # set-set-amp | |
389 | # Have scanned [[set]& | |
390 | # Could be a '&' intersection operator, if a set follows. | |
391 | # Could be the start of a '&&' operator. | |
392 | # Otherwise it is a literal. | |
393 | set-set-amp: | |
394 | '[' n set-open ^set-after-set doSetBeginIntersection1 | |
395 | '&' n set-after-op doSetIntersection2 | |
396 | default set-after-lit doSetAddAmp | |
397 | ||
398 | ||
399 | # set-lit-amp Have scanned "[literals&" | |
400 | # Could be a start of "&&" operator or a literal | |
401 | # In [abc&[def]], the '&' is a literal | |
402 | # | |
403 | set-lit-amp: | |
404 | '&' n set-after-op doSetIntersection2 | |
405 | default set-after-lit doSetAddAmp | |
406 | ||
407 | ||
408 | # | |
409 | # set-set-dash | |
410 | # Have scanned [set]- | |
411 | # Could be a '-' difference operator, if a [set] follows. | |
412 | # Could be the start of a '--' operator. | |
413 | # Otherwise it is a literal. | |
414 | set-set-dash: | |
415 | '[' n set-open ^set-after-set doSetBeginDifference1 | |
416 | '-' n set-after-op doSetDifference2 | |
417 | default set-after-lit doSetAddDash | |
418 | ||
419 | ||
420 | # | |
421 | # set-range-dash | |
422 | # scanned a-b- or \w- | |
423 | # any set or range like item where the trailing single '-' should | |
424 | # be literal, not a set difference operation. | |
425 | # A trailing "--" is still a difference operator. | |
426 | set-range-dash: | |
427 | '-' n set-after-op doSetDifference2 | |
428 | default set-after-lit doSetAddDash | |
429 | ||
430 | ||
431 | set-range-amp: | |
432 | '&' n set-after-op doSetIntersection2 | |
433 | default set-after-lit doSetAddAmp | |
434 | ||
435 | ||
436 | # set-lit-dash | |
437 | # Have scanned "[literals-" Could be a range or a -- operator or a literal | |
438 | # In [abc-[def]], the '-' is a literal (confirmed with a Java test) | |
439 | # [abc-\p{xx}] the '-' is an error | |
440 | # [abc-] the '-' is a literal | |
441 | # [ab-xy] the '-' is a range | |
442 | # | |
443 | set-lit-dash: | |
444 | '-' n set-after-op doSetDifference2 | |
445 | '[' set-after-lit doSetAddDash | |
446 | ']' set-after-lit doSetAddDash | |
447 | '\' n set-lit-dash-escape | |
448 | default n set-after-range doSetRange | |
449 | ||
450 | # set-lit-dash-escape | |
451 | # | |
452 | # scanned "[literal-\" | |
453 | # Could be a range, if the \ introduces an escaped literal char or a named char. | |
454 | # Otherwise it is an error. | |
455 | # | |
456 | set-lit-dash-escape: | |
457 | 's' errorDeath doSetOpError | |
458 | 'S' errorDeath doSetOpError | |
459 | 'w' errorDeath doSetOpError | |
460 | 'W' errorDeath doSetOpError | |
461 | 'd' errorDeath doSetOpError | |
462 | 'D' errorDeath doSetOpError | |
463 | 'N' set-after-range doSetNamedRange | |
464 | default n set-after-range doSetRange | |
465 | ||
466 | ||
467 | # | |
468 | # set-escape | |
469 | # Common back-slash escape processing within set expressions | |
470 | # | |
471 | set-escape: | |
472 | 'p' set-after-set doSetProp | |
473 | 'P' set-after-set doSetProp | |
474 | 'N' set-after-lit doSetNamedChar | |
475 | 's' n set-after-range doSetBackslash_s | |
476 | 'S' n set-after-range doSetBackslash_S | |
477 | 'w' n set-after-range doSetBackslash_w | |
478 | 'W' n set-after-range doSetBackslash_W | |
479 | 'd' n set-after-range doSetBackslash_d | |
480 | 'D' n set-after-range doSetBackslash_D | |
481 | 'h' n set-after-range doSetBackslash_h | |
482 | 'H' n set-after-range doSetBackslash_H | |
483 | 'v' n set-after-range doSetBackslash_v | |
484 | 'V' n set-after-range doSetBackslash_V | |
485 | default n set-after-lit doSetLiteralEscaped | |
486 | ||
487 | # | |
488 | # set-finish | |
489 | # Have just encountered the final ']' that completes a [set], and | |
490 | # arrived here via a pop. From here, we exit the set parsing world, and go | |
491 | # back to generic regular expression parsing. | |
492 | # | |
493 | set-finish: | |
494 | default expr-quant doSetFinish | |
495 | ||
496 | ||
497 | # | |
498 | # errorDeath. This state is specified as the next state whenever a syntax error | |
499 | # in the source pattern is detected. Barring bugs, the state machine will never | |
500 | # actually get here, but will stop because of the action associated with the error. | |
501 | # But, just in case, this state asks the state machine to exit. | |
502 | errorDeath: | |
503 | default n errorDeath doExit | |
504 | ||
505 |