]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | |
2 | #***************************************************************************** | |
3 | # | |
46f4442e | 4 | # Copyright (C) 2002-2007, International Business Machines Corporation and others. |
b75a7d8f A |
5 | # All Rights Reserved. |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: regexcst.txt | |
10 | # ICU Regular Expression Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a regular expression pattern | |
13 | # The pattern parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the regex pattern grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
20 | # that are then built with the pattern parser. | |
21 | # | |
22 | ||
23 | # | |
24 | # Here is the syntax of the state definitions in this file: | |
25 | # | |
26 | # | |
27 | #StateName: | |
46f4442e A |
28 | # input-char n next-state ^push-state action |
29 | # input-char n next-state ^push-state action | |
b75a7d8f A |
30 | # | | | | | |
31 | # | | | | |--- action to be performed by state machine | |
32 | # | | | | See function RegexCompile::doParseActions() | |
33 | # | | | | | |
34 | # | | | |--- Push this named state onto the state stack. | |
35 | # | | | Later, when next state is specified as "pop", | |
36 | # | | | the pushed state will become the current state. | |
37 | # | | | | |
38 | # | | |--- Transition to this state if the current input character matches the input | |
39 | # | | character or char class in the left hand column. "pop" causes the next | |
40 | # | | state to be popped from the state stack. | |
41 | # | | | |
42 | # | |--- When making the state transition specified on this line, advance to the next | |
43 | # | character from the input only if 'n' appears here. | |
44 | # | | |
45 | # |--- Character or named character classes to test for. If the current character being scanned | |
46 | # matches, perform the actions and go to the state specified on this line. | |
47 | # The input character is tested sequentially, in the order written. The characters and | |
48 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
46f4442e | 49 | # |
b75a7d8f A |
50 | |
51 | ||
52 | ||
53 | ||
54 | # | |
55 | # start state, scan position is at the beginning of the pattern. | |
56 | # | |
57 | start: | |
58 | default term doPatStart | |
b75a7d8f | 59 | |
46f4442e A |
60 | |
61 | ||
62 | ||
b75a7d8f A |
63 | # |
64 | # term. At a position where we can accept the start of most items in a pattern. | |
65 | # | |
66 | term: | |
67 | quoted n expr-quant doLiteralChar | |
68 | rule_char n expr-quant doLiteralChar | |
46f4442e A |
69 | '[' n set-open ^set-finish doSetBegin |
70 | '(' n open-paren | |
b75a7d8f | 71 | '.' n expr-quant doDotAny |
46f4442e A |
72 | '^' n expr-quant doCaret |
73 | '$' n expr-quant doDollar | |
b75a7d8f A |
74 | '\' n backslash |
75 | '|' n term doOrOperator | |
76 | ')' n pop doCloseParen | |
77 | eof term doPatFinish | |
78 | default errorDeath doRuleError | |
46f4442e | 79 | |
b75a7d8f A |
80 | |
81 | ||
82 | # | |
83 | # expr-quant We've just finished scanning a term, now look for the optional | |
84 | # trailing quantifier - *, +, ?, *?, etc. | |
85 | # | |
86 | expr-quant: | |
46f4442e A |
87 | '*' n quant-star |
88 | '+' n quant-plus | |
89 | '?' n quant-opt | |
b75a7d8f A |
90 | '{' n interval-open doIntervalInit |
91 | '(' n open-paren-quant | |
46f4442e A |
92 | default expr-cont |
93 | ||
94 | ||
b75a7d8f A |
95 | # |
96 | # expr-cont Expression, continuation. At a point where additional terms are | |
97 | # allowed, but not required. No Quantifiers | |
98 | # | |
99 | expr-cont: | |
100 | '|' n term doOrOperator | |
101 | ')' n pop doCloseParen | |
46f4442e A |
102 | default term |
103 | ||
b75a7d8f A |
104 | |
105 | # | |
106 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
107 | # e.g. x(?#comment )* | |
108 | # Open parens from expr-quant come here; anything but a (?# comment | |
109 | # branches into the normal parenthesis sequence as quickly as possible. | |
110 | # | |
111 | open-paren-quant: | |
112 | '?' n open-paren-quant2 doSuppressComments | |
113 | default open-paren | |
46f4442e | 114 | |
b75a7d8f A |
115 | open-paren-quant2: |
116 | '#' n paren-comment ^expr-quant | |
117 | default open-paren-extended | |
46f4442e A |
118 | |
119 | ||
b75a7d8f A |
120 | # |
121 | # open-paren We've got an open paren. We need to scan further to | |
122 | # determine what kind of group it is - plain (, (?:, (?>, or whatever. | |
123 | # | |
124 | open-paren: | |
125 | '?' n open-paren-extended doSuppressComments | |
126 | default term ^expr-quant doOpenCaptureParen | |
46f4442e | 127 | |
b75a7d8f A |
128 | open-paren-extended: |
129 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
130 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
131 | '=' n term ^expr-cont doOpenLookAhead # (?= | |
132 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! | |
133 | '<' n open-paren-lookbehind | |
134 | '#' n paren-comment ^term | |
135 | 'i' paren-flag doBeginMatchMode | |
46f4442e | 136 | 'd' paren-flag doBeginMatchMode |
b75a7d8f A |
137 | 'm' paren-flag doBeginMatchMode |
138 | 's' paren-flag doBeginMatchMode | |
46f4442e | 139 | 'u' paren-flag doBeginMatchMode |
374ca955 | 140 | 'w' paren-flag doBeginMatchMode |
b75a7d8f A |
141 | 'x' paren-flag doBeginMatchMode |
142 | '-' paren-flag doBeginMatchMode | |
143 | '(' n errorDeath doConditionalExpr | |
144 | '{' n errorDeath doPerlInline | |
145 | default errorDeath doBadOpenParenType | |
46f4442e | 146 | |
b75a7d8f A |
147 | open-paren-lookbehind: |
148 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
149 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
150 | default errorDeath doBadOpenParenType | |
46f4442e | 151 | |
b75a7d8f A |
152 | |
153 | # | |
154 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' | |
b75a7d8f A |
155 | # |
156 | paren-comment: | |
157 | ')' n pop | |
158 | eof errorDeath doMismatchedParenErr | |
159 | default n paren-comment | |
160 | ||
161 | # | |
46f4442e A |
162 | # paren-flag Scanned a (?ismx-ismx flag setting |
163 | # | |
b75a7d8f A |
164 | paren-flag: |
165 | 'i' n paren-flag doMatchMode | |
46f4442e | 166 | 'd' n paren-flag doMatchMode |
b75a7d8f A |
167 | 'm' n paren-flag doMatchMode |
168 | 's' n paren-flag doMatchMode | |
46f4442e | 169 | 'u' n paren-flag doMatchMode |
374ca955 | 170 | 'w' n paren-flag doMatchMode |
b75a7d8f A |
171 | 'x' n paren-flag doMatchMode |
172 | '-' n paren-flag doMatchMode | |
173 | ')' n term doSetMatchMode | |
174 | ':' n term ^expr-quant doMatchModeParen | |
374ca955 | 175 | default errorDeath doBadModeFlag |
46f4442e A |
176 | |
177 | ||
b75a7d8f A |
178 | # |
179 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
180 | # between plain '*', '*?', '*+' | |
181 | # | |
182 | quant-star: | |
183 | '?' n expr-cont doNGStar # *? | |
184 | '+' n expr-cont doPossessiveStar # *+ | |
185 | default expr-cont doStar | |
186 | ||
187 | ||
188 | # | |
189 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
190 | # between plain '+', '+?', '++' | |
191 | # | |
192 | quant-plus: | |
193 | '?' n expr-cont doNGPlus # +? | |
194 | '+' n expr-cont doPossessivePlus # ++ | |
195 | default expr-cont doPlus | |
196 | ||
197 | ||
198 | # | |
199 | # quant-opt Scanning a '?' quantifier. Need to look ahead to decide | |
200 | # between plain '?', '??', '?+' | |
201 | # | |
202 | quant-opt: | |
203 | '?' n expr-cont doNGOpt # ?? | |
204 | '+' n expr-cont doPossessiveOpt # ?+ | |
205 | default expr-cont doOpt # ? | |
206 | ||
207 | ||
208 | # | |
209 | # Interval scanning a '{', the opening delimiter for an interval specification | |
46f4442e | 210 | # {number} or {min, max} or {min,} |
b75a7d8f A |
211 | # |
212 | interval-open: | |
46f4442e | 213 | digit_char interval-lower |
b75a7d8f | 214 | default errorDeath doIntervalError |
46f4442e | 215 | |
b75a7d8f A |
216 | interval-lower: |
217 | digit_char n interval-lower doIntevalLowerDigit | |
218 | ',' n interval-upper | |
219 | '}' n interval-type doIntervalSame # {n} | |
220 | default errorDeath doIntervalError | |
221 | ||
222 | interval-upper: | |
223 | digit_char n interval-upper doIntervalUpperDigit | |
224 | '}' n interval-type | |
225 | default errorDeath doIntervalError | |
46f4442e | 226 | |
b75a7d8f A |
227 | interval-type: |
228 | '?' n expr-cont doNGInterval # {n,m}? | |
229 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
230 | default expr-cont doInterval # {n,m} | |
46f4442e A |
231 | |
232 | ||
b75a7d8f A |
233 | # |
234 | # backslash # Backslash. Figure out which of the \thingies we have encountered. | |
235 | # The low level next-char function will have preprocessed | |
236 | # some of them already; those won't come here. | |
237 | backslash: | |
238 | 'A' n term doBackslashA | |
239 | 'B' n term doBackslashB | |
240 | 'b' n term doBackslashb | |
241 | 'd' n expr-quant doBackslashd | |
242 | 'D' n expr-quant doBackslashD | |
243 | 'G' n term doBackslashG | |
46f4442e | 244 | 'N' expr-quant doNamedChar # \N{NAME} named char |
b75a7d8f A |
245 | 'p' expr-quant doProperty # \p{Lu} style property |
246 | 'P' expr-quant doProperty | |
247 | 'Q' n term doEnterQuoteMode | |
248 | 'S' n expr-quant doBackslashS | |
249 | 's' n expr-quant doBackslashs | |
250 | 'W' n expr-quant doBackslashW | |
251 | 'w' n expr-quant doBackslashw | |
252 | 'X' n expr-quant doBackslashX | |
253 | 'Z' n term doBackslashZ | |
254 | 'z' n term doBackslashz | |
46f4442e | 255 | digit_char n expr-quant doBackRef # Will scan multiple digits |
b75a7d8f | 256 | eof errorDeath doEscapeError |
46f4442e A |
257 | default n expr-quant doEscapedLiteralChar |
258 | ||
b75a7d8f | 259 | |
46f4442e A |
260 | |
261 | # | |
262 | # [set expression] parsing, | |
263 | # All states involved in parsing set expressions have names beginning with "set-" | |
264 | # | |
265 | ||
266 | set-open: | |
267 | '^' n set-open2 doSetNegate | |
268 | ':' set-posix doSetPosixProp | |
269 | default set-open2 | |
270 | ||
271 | set-open2: | |
272 | ']' n set-after-lit doSetLiteral | |
273 | default set-start | |
274 | ||
275 | # set-posix: | |
276 | # scanned a '[:' If it really is a [:property:], doSetPosixProp will have | |
277 | # moved the scan to the closing ']'. If it wasn't a property | |
278 | # expression, the scan will still be at the opening ':', which should | |
279 | # be interpreted as a normal set expression. | |
280 | set-posix: | |
281 | ']' n pop doSetEnd | |
282 | ':' set-start | |
283 | default errorDeath doRuleError # should not be possible. | |
284 | ||
285 | # | |
286 | # set-start after the [ and special case leading characters (^ and/or ]) but before | |
287 | # everything else. A '-' is literal at this point. | |
288 | # | |
289 | set-start: | |
290 | ']' n pop doSetEnd | |
291 | '[' n set-open ^set-after-set doSetBeginUnion | |
292 | '\' n set-escape | |
293 | '-' n set-start-dash | |
294 | '&' n set-start-amp | |
295 | default n set-after-lit doSetLiteral | |
296 | ||
297 | # set-start-dash Turn "[--" into a syntax error. | |
298 | # "[-x" is good, - and x are literals. | |
299 | # | |
300 | set-start-dash: | |
301 | '-' errorDeath doRuleError | |
302 | default set-after-lit doSetAddDash | |
303 | ||
304 | # set-start-amp Turn "[&&" into a syntax error. | |
305 | # "[&x" is good, & and x are literals. | |
306 | # | |
307 | set-start-amp: | |
308 | '&' errorDeath doRuleError | |
309 | default set-after-lit doSetAddAmp | |
310 | ||
311 | # | |
312 | # set-after-lit The last thing scanned was a literal character within a set. | |
313 | # Can be followed by anything. Single '-' or '&' are | |
314 | # literals in this context, not operators. | |
315 | set-after-lit: | |
316 | ']' n pop doSetEnd | |
317 | '[' n set-open ^set-after-set doSetBeginUnion | |
318 | '-' n set-lit-dash | |
319 | '&' n set-lit-amp | |
320 | '\' n set-escape | |
321 | eof errorDeath doSetNoCloseError | |
322 | default n set-after-lit doSetLiteral | |
323 | ||
324 | set-after-set: | |
325 | ']' n pop doSetEnd | |
326 | '[' n set-open ^set-after-set doSetBeginUnion | |
327 | '-' n set-set-dash | |
328 | '&' n set-set-amp | |
329 | '\' n set-escape | |
330 | eof errorDeath doSetNoCloseError | |
331 | default n set-after-lit doSetLiteral | |
332 | ||
333 | set-after-range: | |
334 | ']' n pop doSetEnd | |
335 | '[' n set-open ^set-after-set doSetBeginUnion | |
336 | '-' n set-range-dash | |
337 | '&' n set-range-amp | |
338 | '\' n set-escape | |
339 | eof errorDeath doSetNoCloseError | |
340 | default n set-after-lit doSetLiteral | |
b75a7d8f | 341 | |
46f4442e A |
342 | |
343 | # set-after-op | |
344 | # After a -- or && | |
345 | # It is an error to close a set at this point. | |
346 | # | |
347 | set-after-op: | |
348 | '[' n set-open ^set-after-set doSetBeginUnion | |
349 | ']' errorDeath doSetOpError | |
350 | '\' n set-escape | |
351 | default n set-after-lit doSetLiteral | |
352 | ||
353 | # | |
354 | # set-set-amp | |
355 | # Have scanned [[set]& | |
356 | # Could be a '&' intersection operator, if a set follows. | |
357 | # Could be the start of a '&&' operator. | |
358 | # Otherwise it is a literal. | |
359 | set-set-amp: | |
360 | '[' n set-open ^set-after-set doSetBeginIntersection1 | |
361 | '&' n set-after-op doSetIntersection2 | |
362 | default set-after-lit doSetAddAmp | |
363 | ||
364 | ||
365 | # set-lit-amp Have scanned "[literals&" | |
366 | # Could be a start of "&&" operator or a literal | |
367 | # In [abc&[def]], the '&' is a literal | |
368 | # | |
369 | set-lit-amp: | |
370 | '&' n set-after-op doSetIntersection2 | |
371 | default set-after-lit doSetAddAmp | |
372 | ||
373 | ||
374 | # | |
375 | # set-set-dash | |
376 | # Have scanned [set]- | |
377 | # Could be a '-' difference operator, if a [set] follows. | |
378 | # Could be the start of a '--' operator. | |
379 | # Otherwise it is a literal. | |
380 | set-set-dash: | |
381 | '[' n set-open ^set-after-set doSetBeginDifference1 | |
382 | '-' n set-after-op doSetDifference2 | |
383 | default set-after-lit doSetAddDash | |
384 | ||
385 | ||
386 | # | |
387 | # set-range-dash | |
388 | # scanned a-b- or \w- | |
389 | # any set or range like item where the trailing single '-' should | |
390 | # be literal, not a set difference operation. | |
391 | # A trailing "--" is still a difference operator. | |
392 | set-range-dash: | |
393 | '-' n set-after-op doSetDifference2 | |
394 | default set-after-lit doSetAddDash | |
395 | ||
396 | ||
397 | set-range-amp: | |
398 | '&' n set-after-op doSetIntersection2 | |
399 | default set-after-lit doSetAddAmp | |
400 | ||
401 | ||
402 | # set-lit-dash | |
403 | # Have scanned "[literals-" Could be a range or a -- operator or a literal | |
404 | # In [abc-[def]], the '-' is a literal (confirmed with a Java test) | |
405 | # [abc-\p{xx}] the '-' is an error | |
406 | # [abc-] the '-' is a literal | |
407 | # [ab-xy] the '-' is a range | |
408 | # | |
409 | set-lit-dash: | |
410 | '-' n set-after-op doSetDifference2 | |
411 | '[' set-after-lit doSetAddDash | |
412 | ']' set-after-lit doSetAddDash | |
413 | '\' n set-lit-dash-escape | |
414 | default n set-after-range doSetRange | |
415 | ||
416 | # set-lit-dash-escape | |
417 | # | |
418 | # scanned "[literal-\" | |
419 | # Could be a range, if the \ introduces an escaped literal char or a named char. | |
420 | # Otherwise it is an error. | |
421 | # | |
422 | set-lit-dash-escape: | |
423 | 's' errorDeath doSetOpError | |
424 | 'S' errorDeath doSetOpError | |
425 | 'w' errorDeath doSetOpError | |
426 | 'W' errorDeath doSetOpError | |
427 | 'd' errorDeath doSetOpError | |
428 | 'D' errorDeath doSetOpError | |
429 | 'N' set-after-range doSetNamedRange | |
430 | default n set-after-range doSetRange | |
431 | ||
432 | ||
433 | # | |
434 | # set-escape | |
435 | # Common back-slash escape processing within set expressions | |
436 | # | |
437 | set-escape: | |
438 | 'p' set-after-set doSetProp | |
439 | 'P' set-after-set doSetProp | |
440 | 'N' set-after-lit doSetNamedChar | |
441 | 's' n set-after-range doSetBackslash_s | |
442 | 'S' n set-after-range doSetBackslash_S | |
443 | 'w' n set-after-range doSetBackslash_w | |
444 | 'W' n set-after-range doSetBackslash_W | |
445 | 'd' n set-after-range doSetBackslash_d | |
446 | 'D' n set-after-range doSetBackslash_D | |
447 | default n set-after-lit doSetLiteralEscaped | |
448 | ||
449 | # | |
450 | # set-finish | |
451 | # Have just encountered the final ']' that completes a [set], and | |
452 | # arrived here via a pop. From here, we exit the set parsing world, and go | |
453 | # back to generic regular expression parsing. | |
454 | # | |
455 | set-finish: | |
456 | default expr-quant doSetFinish | |
457 | ||
458 | ||
b75a7d8f A |
459 | # |
460 | # errorDeath. This state is specified as the next state whenever a syntax error | |
461 | # in the regular expression pattern is detected. Barring bugs, the state machine will never | |
462 | # actually get here, but will stop because of the action associated with the error. | |
463 | # But, just in case, this state asks the state machine to exit. | |
464 | errorDeath: | |
465 | default n errorDeath doExit | |
466 | ||
467 |