]>
Commit | Line | Data |
---|---|---|
1 | ||
2 | #***************************************************************************** | |
3 | # | |
4 | # Copyright (C) 2002-2015, International Business Machines Corporation and others. | |
5 | # All Rights Reserved. | |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: regexcst.txt | |
10 | # ICU Regular Expression Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a regular expression pattern | |
13 | # The pattern parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the regex pattern grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
20 | # that are then built with the regular expression pattern compiler. | |
21 | # | |
22 | ||
23 | # | |
24 | # Here is the syntax of the state definitions in this file: | |
25 | # | |
26 | # | |
27 | #StateName: | |
28 | # input-char n next-state ^push-state action | |
29 | # input-char n next-state ^push-state action | |
30 | # | | | | | | |
31 | # | | | | |--- action to be performed by state machine | |
32 | # | | | | See function RegexCompile::doParseActions() | |
33 | # | | | | | |
34 | # | | | |--- Push this named state onto the state stack. | |
35 | # | | | Later, when next state is specified as "pop", | |
36 | # | | | the pushed state will become the current state. | |
37 | # | | | | |
38 | # | | |--- Transition to this state if the current input character matches the input | |
39 | # | | character or char class in the left hand column. "pop" causes the next | |
40 | # | | state to be popped from the state stack. | |
41 | # | | | |
42 | # | |--- When making the state transition specified on this line, advance to the next | |
43 | # | character from the input only if 'n' appears here. | |
44 | # | | |
45 | # |--- Character or named character classes to test for. If the current character being scanned | |
46 | # matches, perform the actions and go to the state specified on this line. | |
47 | # The input character is tested sequentially, in the order written. The characters and | |
48 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
49 | # | |
50 | ||
51 | ||
52 | ||
53 | ||
54 | # | |
55 | # start state, scan position is at the beginning of the pattern. | |
56 | # | |
57 | start: | |
58 | default term doPatStart | |
59 | ||
60 | ||
61 | ||
62 | ||
63 | # | |
64 | # term. At a position where we can accept the start of most items in a pattern. | |
65 | # | |
66 | term: | |
67 | quoted n expr-quant doLiteralChar | |
68 | rule_char n expr-quant doLiteralChar | |
69 | '[' n set-open ^set-finish doSetBegin | |
70 | '(' n open-paren | |
71 | '.' n expr-quant doDotAny | |
72 | '^' n expr-quant doCaret | |
73 | '$' n expr-quant doDollar | |
74 | '\' n backslash | |
75 | '|' n term doOrOperator | |
76 | ')' n pop doCloseParen | |
77 | eof term doPatFinish | |
78 | default errorDeath doRuleError | |
79 | ||
80 | ||
81 | ||
82 | # | |
83 | # expr-quant We've just finished scanning a term, now look for the optional | |
84 | # trailing quantifier - *, +, ?, *?, etc. | |
85 | # | |
86 | expr-quant: | |
87 | '*' n quant-star | |
88 | '+' n quant-plus | |
89 | '?' n quant-opt | |
90 | '{' n interval-open doIntervalInit | |
91 | '(' n open-paren-quant | |
92 | default expr-cont | |
93 | ||
94 | ||
95 | # | |
96 | # expr-cont Expression, continuation. At a point where additional terms are | |
97 | # allowed, but not required. No Quantifiers | |
98 | # | |
99 | expr-cont: | |
100 | '|' n term doOrOperator | |
101 | ')' n pop doCloseParen | |
102 | default term | |
103 | ||
104 | ||
105 | # | |
106 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
107 | # e.g. x(?#comment )* | |
108 | # Open parens from expr-quant come here; anything but a (?# comment | |
109 | # branches into the normal parenthesis sequence as quickly as possible. | |
110 | # | |
111 | open-paren-quant: | |
112 | '?' n open-paren-quant2 doSuppressComments | |
113 | default open-paren | |
114 | ||
115 | open-paren-quant2: | |
116 | '#' n paren-comment ^expr-quant | |
117 | default open-paren-extended | |
118 | ||
119 | ||
120 | # | |
121 | # open-paren We've got an open paren. We need to scan further to | |
122 | # determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever. | |
123 | # | |
124 | open-paren: | |
125 | '?' n open-paren-extended doSuppressComments | |
126 | default term ^expr-quant doOpenCaptureParen | |
127 | ||
128 | open-paren-extended: | |
129 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
130 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
131 | '=' n term ^expr-cont doOpenLookAhead # (?= | |
132 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! | |
133 | '<' n open-paren-lookbehind | |
134 | '#' n paren-comment ^term | |
135 | 'i' paren-flag doBeginMatchMode # (?i The flag rows have no 'n': the char is left for paren-flag to scan. | |
136 | 'd' paren-flag doBeginMatchMode # (?d | |
137 | 'm' paren-flag doBeginMatchMode # (?m | |
138 | 's' paren-flag doBeginMatchMode # (?s | |
139 | 'u' paren-flag doBeginMatchMode # (?u | |
140 | 'w' paren-flag doBeginMatchMode # (?w | |
141 | 'x' paren-flag doBeginMatchMode # (?x | |
142 | '-' paren-flag doBeginMatchMode # (?- | |
143 | '(' n errorDeath doConditionalExpr # (?( goes to errorDeath | |
144 | '{' n errorDeath doPerlInline # (?{ goes to errorDeath | |
145 | default errorDeath doBadOpenParenType | |
146 | ||
147 | open-paren-lookbehind: | |
148 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
149 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
150 | ascii_letter named-capture doBeginNamedCapture # (?<name | |
151 | default errorDeath doBadOpenParenType | |
152 | ||
153 | ||
154 | # | |
155 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' | |
156 | # | |
157 | paren-comment: | |
158 | ')' n pop | |
159 | eof errorDeath doMismatchedParenErr | |
160 | default n paren-comment | |
161 | ||
162 | # | |
163 | # paren-flag Scanned a (?ismx-ismx flag setting | |
164 | # | |
165 | paren-flag: | |
166 | 'i' n paren-flag doMatchMode | |
167 | 'd' n paren-flag doMatchMode | |
168 | 'm' n paren-flag doMatchMode | |
169 | 's' n paren-flag doMatchMode | |
170 | 'u' n paren-flag doMatchMode | |
171 | 'w' n paren-flag doMatchMode | |
172 | 'x' n paren-flag doMatchMode | |
173 | '-' n paren-flag doMatchMode | |
174 | ')' n term doSetMatchMode # (?ismx-ismx) | |
175 | ':' n term ^expr-quant doMatchModeParen # (?ismx-ismx: | |
176 | default errorDeath doBadModeFlag | |
177 | ||
178 | # | |
179 | # named-capture (?<name> ... ), position currently on the name. | |
180 | # | |
181 | named-capture: | |
182 | ascii_letter n named-capture doContinueNamedCapture | |
183 | digit_char n named-capture doContinueNamedCapture | |
184 | '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture. | |
185 | default errorDeath doBadNamedCapture | |
186 | ||
187 | # | |
188 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
189 | # between plain '*', '*?', '*+' | |
190 | # | |
191 | quant-star: | |
192 | '?' n expr-cont doNGStar # *? | |
193 | '+' n expr-cont doPossessiveStar # *+ | |
194 | default expr-cont doStar # * | |
195 | ||
196 | ||
197 | # | |
198 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
199 | # between plain '+', '+?', '++' | |
200 | # | |
201 | quant-plus: | |
202 | '?' n expr-cont doNGPlus # +? | |
203 | '+' n expr-cont doPossessivePlus # ++ | |
204 | default expr-cont doPlus # + | |
205 | ||
206 | ||
207 | # | |
208 | # quant-opt Scanning a '?' quantifier. Need to look ahead to decide | |
209 | # between plain '?', '??', '?+' | |
210 | # | |
211 | quant-opt: | |
212 | '?' n expr-cont doNGOpt # ?? | |
213 | '+' n expr-cont doPossessiveOpt # ?+ | |
214 | default expr-cont doOpt # ? | |
215 | ||
216 | ||
217 | # | |
218 | # Interval scanning a '{', the opening delimiter for an interval specification | |
219 | # {number} or {min, max} or {min,} | |
220 | # | |
221 | interval-open: | |
222 | digit_char interval-lower | |
223 | default errorDeath doIntervalError | |
224 | ||
225 | interval-lower: | |
226 | digit_char n interval-lower doIntevalLowerDigit | |
227 | ',' n interval-upper | |
228 | '}' n interval-type doIntervalSame # {n} | |
229 | default errorDeath doIntervalError | |
230 | ||
231 | interval-upper: | |
232 | digit_char n interval-upper doIntervalUpperDigit | |
233 | '}' n interval-type | |
234 | default errorDeath doIntervalError | |
235 | ||
236 | interval-type: | |
237 | '?' n expr-cont doNGInterval # {n,m}? | |
238 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
239 | default expr-cont doInterval # {n,m} | |
240 | ||
241 | ||
242 | # | |
243 | # backslash Backslash. Figure out which of the \thingies we have encountered. | |
244 | # The low level next-char function will have preprocessed | |
245 | # some of them already; those won't come here. | |
246 | backslash: | |
247 | 'A' n term doBackslashA | |
248 | 'B' n term doBackslashB | |
249 | 'b' n term doBackslashb | |
250 | 'd' n expr-quant doBackslashd | |
251 | 'D' n expr-quant doBackslashD | |
252 | 'G' n term doBackslashG | |
253 | 'h' n expr-quant doBackslashh | |
254 | 'H' n expr-quant doBackslashH | |
255 | 'k' n named-backref | |
256 | 'N' expr-quant doNamedChar # \N{NAME} named char | |
257 | 'p' expr-quant doProperty # \p{Lu} style property | |
258 | 'P' expr-quant doProperty | |
259 | 'R' n expr-quant doBackslashR | |
260 | 'Q' n term doEnterQuoteMode | |
261 | 'S' n expr-quant doBackslashS | |
262 | 's' n expr-quant doBackslashs | |
263 | 'v' n expr-quant doBackslashv | |
264 | 'V' n expr-quant doBackslashV | |
265 | 'W' n expr-quant doBackslashW | |
266 | 'w' n expr-quant doBackslashw | |
267 | 'X' n expr-quant doBackslashX | |
268 | 'Z' n term doBackslashZ | |
269 | 'z' n term doBackslashz | |
270 | digit_char n expr-quant doBackRef # Will scan multiple digits | |
271 | eof errorDeath doEscapeError | |
272 | default n expr-quant doEscapedLiteralChar | |
273 | ||
274 | ||
275 | # named-backref Scanned \k | |
276 | # Leading to \k<captureName> | |
277 | # Failure to get the full sequence is an error. | |
278 | # | |
279 | named-backref: | |
280 | '<' n named-backref-2 doBeginNamedBackRef | |
281 | default errorDeath doBadNamedCapture | |
282 | ||
283 | named-backref-2: | |
284 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
285 | default errorDeath doBadNamedCapture | |
286 | ||
287 | named-backref-3: | |
288 | ascii_letter n named-backref-3 doContinueNamedBackRef | |
289 | digit_char n named-backref-3 doContinueNamedBackRef | |
290 | '>' n expr-quant doCompleteNamedBackRef # \k<name> | |
291 | default errorDeath doBadNamedCapture | |
292 | ||
293 | ||
294 | # | |
295 | # [set expression] parsing, | |
296 | # All states involved in parsing set expressions have names beginning with "set-" | |
297 | # | |
298 | ||
299 | set-open: | |
300 | '^' n set-open2 doSetNegate | |
301 | ':' set-posix doSetPosixProp | |
302 | default set-open2 | |
303 | ||
304 | set-open2: | |
305 | ']' n set-after-lit doSetLiteral | |
306 | default set-start | |
307 | ||
308 | # set-posix: | |
309 | # scanned a '[:' If it really is a [:property:], doSetPosixProp will have | |
310 | # moved the scan to the closing ']'. If it wasn't a property | |
311 | # expression, the scan will still be at the opening ':', which should | |
312 | # be interpreted as a normal set expression. | |
313 | set-posix: | |
314 | ']' n pop doSetEnd | |
315 | ':' set-start | |
316 | default errorDeath doRuleError # should not be possible. | |
317 | ||
318 | # | |
319 | # set-start after the [ and special case leading characters (^ and/or ]) but before | |
320 | # everything else. A '-' is literal at this point. | |
321 | # | |
322 | set-start: | |
323 | ']' n pop doSetEnd | |
324 | '[' n set-open ^set-after-set doSetBeginUnion | |
325 | '\' n set-escape | |
326 | '-' n set-start-dash | |
327 | '&' n set-start-amp | |
328 | default n set-after-lit doSetLiteral | |
329 | ||
330 | # set-start-dash Turn "[--" into a syntax error. | |
331 | # "[-x" is good, - and x are literals. | |
332 | # | |
333 | set-start-dash: | |
334 | '-' errorDeath doRuleError | |
335 | default set-after-lit doSetAddDash | |
336 | ||
337 | # set-start-amp Turn "[&&" into a syntax error. | |
338 | # "[&x" is good, & and x are literals. | |
339 | # | |
340 | set-start-amp: | |
341 | '&' errorDeath doRuleError | |
342 | default set-after-lit doSetAddAmp | |
343 | ||
344 | # | |
345 | # set-after-lit The last thing scanned was a literal character within a set. | |
346 | # Can be followed by anything. Single '-' or '&' are | |
347 | # literals in this context, not operators. | |
348 | set-after-lit: | |
349 | ']' n pop doSetEnd | |
350 | '[' n set-open ^set-after-set doSetBeginUnion | |
351 | '-' n set-lit-dash | |
352 | '&' n set-lit-amp | |
353 | '\' n set-escape | |
354 | eof errorDeath doSetNoCloseError | |
355 | default n set-after-lit doSetLiteral | |
356 | ||
357 | set-after-set: | |
358 | ']' n pop doSetEnd | |
359 | '[' n set-open ^set-after-set doSetBeginUnion | |
360 | '-' n set-set-dash | |
361 | '&' n set-set-amp | |
362 | '\' n set-escape | |
363 | eof errorDeath doSetNoCloseError | |
364 | default n set-after-lit doSetLiteral | |
365 | ||
366 | set-after-range: | |
367 | ']' n pop doSetEnd | |
368 | '[' n set-open ^set-after-set doSetBeginUnion | |
369 | '-' n set-range-dash | |
370 | '&' n set-range-amp | |
371 | '\' n set-escape | |
372 | eof errorDeath doSetNoCloseError | |
373 | default n set-after-lit doSetLiteral | |
374 | ||
375 | ||
376 | # set-after-op | |
377 | # After a -- or && | |
378 | # It is an error to close a set at this point. | |
379 | # | |
380 | set-after-op: | |
381 | '[' n set-open ^set-after-set doSetBeginUnion | |
382 | ']' errorDeath doSetOpError | |
383 | '\' n set-escape | |
384 | default n set-after-lit doSetLiteral | |
385 | ||
386 | # | |
387 | # set-set-amp | |
388 | # Have scanned [[set]& | |
389 | # Could be a '&' intersection operator, if a set follows. | |
390 | # Could be the start of a '&&' operator. | |
391 | # Otherwise is a literal. | |
392 | set-set-amp: | |
393 | '[' n set-open ^set-after-set doSetBeginIntersection1 | |
394 | '&' n set-after-op doSetIntersection2 | |
395 | default set-after-lit doSetAddAmp | |
396 | ||
397 | ||
398 | # set-lit-amp Have scanned "[literals&" | |
399 | # Could be a start of "&&" operator or a literal | |
400 | # In [abc&[def]], the '&' is a literal | |
401 | # | |
402 | set-lit-amp: | |
403 | '&' n set-after-op doSetIntersection2 | |
404 | default set-after-lit doSetAddAmp | |
405 | ||
406 | ||
407 | # | |
408 | # set-set-dash | |
409 | # Have scanned [set]- | |
410 | # Could be a '-' difference operator, if a [set] follows. | |
411 | # Could be the start of a '--' operator. | |
412 | # Otherwise is a literal. | |
413 | set-set-dash: | |
414 | '[' n set-open ^set-after-set doSetBeginDifference1 | |
415 | '-' n set-after-op doSetDifference2 | |
416 | default set-after-lit doSetAddDash | |
417 | ||
418 | ||
419 | # | |
420 | # set-range-dash | |
421 | # scanned a-b- or \w- | |
422 | # any set or range like item where the trailing single '-' should | |
423 | # be literal, not a set difference operation. | |
424 | # A trailing "--" is still a difference operator. | |
425 | set-range-dash: | |
426 | '-' n set-after-op doSetDifference2 | |
427 | default set-after-lit doSetAddDash | |
428 | ||
429 | ||
430 | set-range-amp: | |
431 | '&' n set-after-op doSetIntersection2 | |
432 | default set-after-lit doSetAddAmp | |
433 | ||
434 | ||
435 | # set-lit-dash | |
436 | # Have scanned "[literals-" Could be a range or a -- operator or a literal | |
437 | # In [abc-[def]], the '-' is a literal (confirmed with a Java test) | |
438 | # [abc-\p{xx} the '-' is an error | |
439 | # [abc-] the '-' is a literal | |
440 | # [ab-xy] the '-' is a range | |
441 | # | |
442 | set-lit-dash: | |
443 | '-' n set-after-op doSetDifference2 | |
444 | '[' set-after-lit doSetAddDash | |
445 | ']' set-after-lit doSetAddDash | |
446 | '\' n set-lit-dash-escape | |
447 | default n set-after-range doSetRange | |
448 | ||
449 | # set-lit-dash-escape | |
450 | # | |
451 | # scanned "[literal-\" | |
452 | # Could be a range, if the \ introduces an escaped literal char or a named char. | |
453 | # Otherwise it is an error. | |
454 | # | |
455 | set-lit-dash-escape: | |
456 | 's' errorDeath doSetOpError | |
457 | 'S' errorDeath doSetOpError | |
458 | 'w' errorDeath doSetOpError | |
459 | 'W' errorDeath doSetOpError | |
460 | 'd' errorDeath doSetOpError | |
461 | 'D' errorDeath doSetOpError | |
462 | 'N' set-after-range doSetNamedRange | |
463 | default n set-after-range doSetRange | |
464 | ||
465 | ||
466 | # | |
467 | # set-escape | |
468 | # Common back-slash escape processing within set expressions | |
469 | # | |
470 | set-escape: | |
471 | 'p' set-after-set doSetProp | |
472 | 'P' set-after-set doSetProp | |
473 | 'N' set-after-lit doSetNamedChar | |
474 | 's' n set-after-range doSetBackslash_s | |
475 | 'S' n set-after-range doSetBackslash_S | |
476 | 'w' n set-after-range doSetBackslash_w | |
477 | 'W' n set-after-range doSetBackslash_W | |
478 | 'd' n set-after-range doSetBackslash_d | |
479 | 'D' n set-after-range doSetBackslash_D | |
480 | 'h' n set-after-range doSetBackslash_h | |
481 | 'H' n set-after-range doSetBackslash_H | |
482 | 'v' n set-after-range doSetBackslash_v | |
483 | 'V' n set-after-range doSetBackslash_V | |
484 | default n set-after-lit doSetLiteralEscaped | |
485 | ||
486 | # | |
487 | # set-finish | |
488 | # Have just encountered the final ']' that completes a [set], and | |
489 | # arrived here via a pop. From here, we exit the set parsing world, and go | |
490 | # back to generic regular expression parsing. | |
491 | # | |
492 | set-finish: | |
493 | default expr-quant doSetFinish | |
494 | ||
495 | ||
496 | # | |
497 | # errorDeath. This state is specified as the next state whenever a syntax error | |
498 | # in the pattern is detected. Barring bugs, the state machine will never | |
499 | # actually get here, but will stop because of the action associated with the error. | |
500 | # But, just in case, this state asks the state machine to exit. | |
501 | errorDeath: | |
502 | default n errorDeath doExit | |
503 | ||
504 |