]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/regexcst.txt
ICU-66108.tar.gz
[apple/icu.git] / icuSources / i18n / regexcst.txt
1 # Copyright (C) 2016 and later: Unicode, Inc. and others.
2 # License & terms of use: http://www.unicode.org/copyright.html
3 #*****************************************************************************
4 #
5 # Copyright (C) 2002-2015, International Business Machines Corporation and others.
6 # All Rights Reserved.
7 #
8 #*****************************************************************************
9 #
10 # file: regexcst.txt
11 # ICU Regular Expression Parser State Table
12 #
13 # This state table is used when reading and parsing a regular expression pattern
14 # The pattern parser uses a state machine; the data in this file define the
15 # state transitions that occur for each input character.
16 #
17 # *** This file defines the regex pattern grammar. This is it.
18 # *** The determination of what is accepted is here.
19 #
20 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
21 # that are then compiled into the regular expression pattern parser.
22 #
23
24 #
25 # Here is the syntax of the state definitions in this file:
26 #
27 #
28 #StateName:
29 # input-char n next-state ^push-state action
30 # input-char n next-state ^push-state action
31 # | | | | |
32 # | | | | |--- action to be performed by state machine
33 # | | | | See function RegexCompile::doParseActions()
34 # | | | |
35 # | | | |--- Push this named state onto the state stack.
36 # | | | Later, when next state is specified as "pop",
37 # | | | the pushed state will become the current state.
38 # | | |
39 # | | |--- Transition to this state if the current input character matches the input
40 # | | character or char class in the left hand column. "pop" causes the next
41 # | | state to be popped from the state stack.
42 # | |
43 # | |--- When making the state transition specified on this line, advance to the next
44 # | character from the input only if 'n' appears here.
45 # |
46 # |--- Character or named character classes to test for. If the current character being scanned
47 # matches, perform the actions and go to the state specified on this line.
48 # The input character is tested sequentially, in the order written. The characters and
49 # character classes tested for do not need to be mutually exclusive. The first match wins.
50 #
51
52
53
54
55 #
56 # start state, scan position is at the beginning of the pattern.
57 #
58 start:
59 default term doPatStart # no 'n': the first pattern character is left for 'term' to examine
60
61
62
63
64 #
65 # term. At a position where we can accept the start of most items in a pattern.
66 #
67 term:
68 quoted n expr-quant doLiteralChar
69 rule_char n expr-quant doLiteralChar
70 '[' n set-open ^set-finish doSetBegin # push set-finish; the ']' that closes the set pops back to it
71 '(' n open-paren
72 '.' n expr-quant doDotAny
73 '^' n expr-quant doCaret
74 '$' n expr-quant doDollar
75 '\' n backslash
76 '|' n term doOrOperator
77 ')' n pop doCloseParen # resume the state that was pushed when the group was opened
78 eof term doPatFinish
79 default errorDeath doRuleError # no row above matched (first match wins): syntax error
80
81
82
83 #
84 # expr-quant We've just finished scanning a term, now look for the optional
85 # trailing quantifier - *, +, ?, *?, etc.
86 #
87 expr-quant:
88 '*' n quant-star
89 '+' n quant-plus
90 '?' n quant-opt
91 '{' n interval-open doIntervalInit # start of {n}, {n,} or {n,m}
92 '(' n open-paren-quant # might be a (?# comment ) sitting between the term and its quantifier
93 default expr-cont # no quantifier here; the character is not consumed
94
95
96 #
97 # expr-cont Expression, continuation. At a point where additional terms are
98 # allowed, but not required. No Quantifiers
99 #
100 expr-cont:
101 '|' n term doOrOperator
102 ')' n pop doCloseParen # resume the state that was pushed when the group was opened
103 default term # no 'n': 'term' re-examines the same character
104
105
106 #
107 # open-paren-quant Special case handling for comments appearing before a quantifier,
108 # e.g. x(?#comment )*
109 # Open parens from expr-quant come here; anything but a (?# comment
110 # branches into the normal parenthesis sequence as quickly as possible.
111 #
112 open-paren-quant:
113 '?' n open-paren-quant2 doSuppressComments
114 default open-paren # plain '(': continue exactly as if open-paren had been reached from 'term'
115
116 open-paren-quant2:
117 '#' n paren-comment ^expr-quant # (?#  - when the comment's ')' pops, a quantifier can still follow
118 default open-paren-extended # some other (?... construct; the character is re-examined there
119
120
121 #
122 # open-paren We've got an open paren. We need to scan further to
123 # determine what kind of group it is - plain (, (?:, (?>, or whatever.
124 #
125 open-paren:
126 '?' n open-paren-extended doSuppressComments
127 default term ^expr-quant doOpenCaptureParen # plain '(': push expr-quant so a quantifier can follow the closing ')'
128
129 open-paren-extended:
130 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
131 '>' n term ^expr-quant doOpenAtomicParen # (?>
132 '=' n term ^expr-cont doOpenLookAhead # (?=  - expr-cont is pushed, so no quantifier is scanned after the ')'
133 '!' n term ^expr-cont doOpenLookAheadNeg # (?!  - expr-cont is pushed, as for (?=
134 '<' n open-paren-lookbehind # (?<  - one of (?<=, (?<! or (?<name>
135 '#' n paren-comment ^term # (?#  - comment; 'term' resumes after its ')'
136 'i' paren-flag doBeginMatchMode # flag characters are not consumed here (no 'n'); paren-flag re-reads them
137 'd' paren-flag doBeginMatchMode
138 'm' paren-flag doBeginMatchMode
139 's' paren-flag doBeginMatchMode
140 'u' paren-flag doBeginMatchMode
141 'w' paren-flag doBeginMatchMode
142 'x' paren-flag doBeginMatchMode
143 '-' paren-flag doBeginMatchMode
144 '(' n errorDeath doConditionalExpr # (?(  - always routed to errorDeath
145 '{' n errorDeath doPerlInline # (?{  - always routed to errorDeath
146 default errorDeath doBadOpenParenType
147
148 open-paren-lookbehind:
149 '=' n term ^expr-cont doOpenLookBehind # (?<=
150 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
151 ascii_letter named-capture doBeginNamedCapture # (?<name  - no 'n': named-capture consumes the first letter
152 default errorDeath doBadOpenParenType # in particular, a capture name cannot start with a digit
153
154
155 #
156 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
157 #
158 paren-comment:
159 ')' n pop # end of comment; return to the state pushed on entry (expr-quant or term)
160 eof errorDeath doMismatchedParenErr # pattern ended inside the comment
161 default n paren-comment # discard comment text
162
163 #
164 # paren-flag Scanned the start of a (?idmsuwx-idmsuwx flag setting
165 #
166 paren-flag:
167 'i' n paren-flag doMatchMode
168 'd' n paren-flag doMatchMode
169 'm' n paren-flag doMatchMode
170 's' n paren-flag doMatchMode
171 'u' n paren-flag doMatchMode
172 'w' n paren-flag doMatchMode
173 'x' n paren-flag doMatchMode
174 '-' n paren-flag doMatchMode
175 ')' n term doSetMatchMode # (?flags)  - nothing is pushed or popped
176 ':' n term ^expr-quant doMatchModeParen # (?flags: ... )  - pushes expr-quant, like the other quantifiable groups
177 default errorDeath doBadModeFlag
178
179 #
180 # named-capture (?<name> ... ), position currently on the name.
181 #
182 named-capture:
183 ascii_letter n named-capture doContinueNamedCapture
184 digit_char n named-capture doContinueNamedCapture
185 '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
186 default errorDeath doBadNamedCapture # only ASCII letters and digits are accepted inside the name
187
188 #
189 # quant-star Scanning a '*' quantifier. Need to look ahead to decide
190 # between plain '*', '*?', '*+'
191 #
192 quant-star:
193 '?' n expr-cont doNGStar # *?
194 '+' n expr-cont doPossessiveStar # *+
195 default expr-cont doStar # *   (the look-ahead character is not consumed)
196
197
198 #
199 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide
200 # between plain '+', '+?', '++'
201 #
202 quant-plus:
203 '?' n expr-cont doNGPlus # +?
204 '+' n expr-cont doPossessivePlus # ++
205 default expr-cont doPlus # +   (the look-ahead character is not consumed)
206
207
208 #
209 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide
210 # between plain '?', '??', '?+'
211 #
212 quant-opt:
213 '?' n expr-cont doNGOpt # ??
214 '+' n expr-cont doPossessiveOpt # ?+
215 default expr-cont doOpt # ?   (the look-ahead character is not consumed)
216
217
218 #
219 # Interval scanning a '{', the opening delimiter for an interval specification
220 # {number} or {min,max} or {min,}
221 #
222 interval-open:
223 digit_char interval-lower # no 'n': the digit itself is consumed by interval-lower
224 default errorDeath doIntervalError # '{' must be followed directly by a digit
225
226 interval-lower:
227 digit_char n interval-lower doIntevalLowerDigit # NOTE(review): "Inteval" is the action name as spelled in this table; presumably the C++ handler uses the same spelling - confirm before renaming
228 ',' n interval-upper
229 '}' n interval-type doIntervalSame # {n}
230 default errorDeath doIntervalError
231
232 interval-upper:
233 digit_char n interval-upper doIntervalUpperDigit
234 '}' n interval-type # {n,} or {n,m}
235 default errorDeath doIntervalError
236
237 interval-type:
238 '?' n expr-cont doNGInterval # {n,m}?
239 '+' n expr-cont doPossessiveInterval # {n,m}+
240 default expr-cont doInterval # {n,m}
241
242
243 #
244 # backslash Backslash. Figure out which of the \thingies we have encountered.
245 # The low level next-char function will have preprocessed
246 # some of them already; those won't come here.
247 backslash:
248 'A' n term doBackslashA # next state is 'term', not expr-quant: no quantifier is scanned after this escape
249 'B' n term doBackslashB
250 'b' n term doBackslashb
251 'd' n expr-quant doBackslashd
252 'D' n expr-quant doBackslashD
253 'G' n term doBackslashG
254 'h' n expr-quant doBackslashh
255 'H' n expr-quant doBackslashH
256 'k' n named-backref # \k<name> named back reference
257 'N' expr-quant doNamedChar # \N{NAME} named char  (no 'n' on the \N, \p, \P rows; presumably the action scans the {...} itself - confirm)
258 'p' expr-quant doProperty # \p{Lu} style property
259 'P' expr-quant doProperty # \P{Lu} style property
260 'R' n expr-quant doBackslashR
261 'Q' n term doEnterQuoteMode
262 'S' n expr-quant doBackslashS
263 's' n expr-quant doBackslashs
264 'v' n expr-quant doBackslashv
265 'V' n expr-quant doBackslashV
266 'W' n expr-quant doBackslashW
267 'w' n expr-quant doBackslashw
268 'X' n expr-quant doBackslashX
269 'Z' n term doBackslashZ
270 'z' n term doBackslashz
271 digit_char n expr-quant doBackRef # Will scan multiple digits
272 eof errorDeath doEscapeError # pattern ended right after the backslash
273 default n expr-quant doEscapedLiteralChar
274
275
276 # named-backref Scanned \k
277 # Leading to \k<captureName>
278 # Failure to get the full sequence is an error.
279 #
280 named-backref:
281 '<' n named-backref-2 doBeginNamedBackRef
282 default errorDeath doBadNamedCapture
283
284 named-backref-2:
285 ascii_letter n named-backref-3 doContinueNamedBackRef # the name must begin with an ASCII letter
286 default errorDeath doBadNamedCapture
287
288 named-backref-3:
289 ascii_letter n named-backref-3 doContinueNamedBackRef
290 digit_char n named-backref-3 doContinueNamedBackRef
291 '>' n expr-quant doCompleteNamedBackRef # \k<name> is complete; a quantifier may follow
292 default errorDeath doBadNamedCapture
293
294
295 #
296 # [set expression] parsing,
297 # All states involved in parsing set expressions have names beginning with "set-"
298 #
299
300 set-open:
301 '^' n set-open2 doSetNegate # [^
302 ':' set-posix doSetPosixProp # [:  - possibly a [:property:]; the outcome is sorted out in set-posix
303 default set-open2
304
305 set-open2:
306 ']' n set-after-lit doSetLiteral # a ']' directly after '[' or '[^' is taken as a literal
307 default set-start
308
309 # set-posix:
310 # Scanned a '[:'. If it really is a [:property:], doSetPosixProp will have
311 # moved the scan to the closing ']'. If it wasn't a property
312 # expression, the scan will still be at the opening ':', which should
313 # be interpreted as a normal set expression.
314 set-posix:
315 ']' n pop doSetEnd # it was a [:property:]; the set is complete
316 ':' set-start # not a property: the ':' is re-examined as ordinary set content
317 default errorDeath doRuleError # should not be possible.
318
319 #
320 # set-start after the [ and special case leading characters (^ and/or ]) but before
321 # everything else. A single '-' or '&' is literal at this point.
322 #
323 set-start:
324 ']' n pop doSetEnd
325 '[' n set-open ^set-after-set doSetBeginUnion # nested set; set-after-set resumes when it closes
326 '\' n set-escape
327 '-' n set-start-dash
328 '&' n set-start-amp
329 default n set-after-lit doSetLiteral # NOTE(review): unlike set-after-lit there is no eof row in this state, so end-of-pattern appears to land here - confirm it is reported as an error
330
331 # set-start-dash Turn "[--" into a syntax error.
332 # "[-x" is good, - and x are literals.
333 #
334 set-start-dash:
335 '-' errorDeath doRuleError
336 default set-after-lit doSetAddDash # add the '-' as a literal; the following character is left for set-after-lit
337
338 # set-start-amp Turn "[&&" into a syntax error.
339 # "[&x" is good, & and x are literals.
340 #
341 set-start-amp:
342 '&' errorDeath doRuleError
343 default set-after-lit doSetAddAmp # add the '&' as a literal; the following character is left for set-after-lit
344
345 #
346 # set-after-lit The last thing scanned was a literal character within a set.
347 # Can be followed by anything. Single '-' or '&' are
348 # literals in this context, not operators.
349 set-after-lit:
350 ']' n pop doSetEnd
351 '[' n set-open ^set-after-set doSetBeginUnion
352 '-' n set-lit-dash # range, "--" operator or literal '-'; decided in set-lit-dash
353 '&' n set-lit-amp # "&&" operator or literal '&'; decided in set-lit-amp
354 '\' n set-escape
355 eof errorDeath doSetNoCloseError # pattern ended before the closing ']'
356 default n set-after-lit doSetLiteral
357
358 set-after-set:
359 ']' n pop doSetEnd
360 '[' n set-open ^set-after-set doSetBeginUnion
361 '-' n set-set-dash # [set]-  : difference operator or literal; decided in set-set-dash
362 '&' n set-set-amp # [set]&  : intersection operator or literal; decided in set-set-amp
363 '\' n set-escape
364 eof errorDeath doSetNoCloseError # pattern ended before the closing ']'
365 default n set-after-lit doSetLiteral
366
367 set-after-range:
368 ']' n pop doSetEnd
369 '[' n set-open ^set-after-set doSetBeginUnion
370 '-' n set-range-dash # a-b- or \w-  : "--" operator or literal '-'; decided in set-range-dash
371 '&' n set-range-amp # "&&" operator or literal '&'; decided in set-range-amp
372 '\' n set-escape
373 eof errorDeath doSetNoCloseError # pattern ended before the closing ']'
374 default n set-after-lit doSetLiteral
375
376
377 # set-after-op
378 # After a -- or &&
379 # It is an error to close a set at this point.
380 #
381 set-after-op:
382 '[' n set-open ^set-after-set doSetBeginUnion
383 ']' errorDeath doSetOpError # operator with no right-hand operand
384 '\' n set-escape
385 default n set-after-lit doSetLiteral # NOTE(review): there is no eof row in this state, so end-of-pattern appears to land here - confirm it is reported as an error
386
387 #
388 # set-set-amp
389 # Have scanned [[set]&
390 # Could be a '&' intersection operator, if a set follows.
391 # Could be the start of a '&&' operator.
392 # Otherwise is a literal.
393 set-set-amp:
394 '[' n set-open ^set-after-set doSetBeginIntersection1 # [set]&[set]
395 '&' n set-after-op doSetIntersection2 # &&
396 default set-after-lit doSetAddAmp # literal '&'; the following character is left for set-after-lit
397
398
399 # set-lit-amp Have scanned "[literals&"
400 # Could be a start of "&&" operator or a literal
401 # In [abc&[def]], the '&' is a literal
402 #
403 set-lit-amp:
404 '&' n set-after-op doSetIntersection2 # &&
405 default set-after-lit doSetAddAmp # literal '&'; the following character is left for set-after-lit
406
407
408 #
409 # set-set-dash
410 # Have scanned [set]-
411 # Could be a '-' difference operator, if a [set] follows.
412 # Could be the start of a '--' operator.
413 # Otherwise is a literal.
414 set-set-dash:
415 '[' n set-open ^set-after-set doSetBeginDifference1 # [set]-[set]
416 '-' n set-after-op doSetDifference2 # --
417 default set-after-lit doSetAddDash # literal '-'; the following character is left for set-after-lit
418
419
420 #
421 # set-range-dash
422 # scanned a-b- or \w-
423 # any set or range like item where the trailing single '-' should
424 # be literal, not a set difference operation.
425 # A trailing "--" is still a difference operator.
426 set-range-dash:
427 '-' n set-after-op doSetDifference2 # --
428 default set-after-lit doSetAddDash # literal '-'; the following character is left for set-after-lit
429
430
431 set-range-amp:
432 '&' n set-after-op doSetIntersection2 # &&
433 default set-after-lit doSetAddAmp # literal '&'; the following character is left for set-after-lit
434
435
436 # set-lit-dash
437 # Have scanned "[literals-" Could be a range or a -- operator or a literal
438 # In [abc-[def]], the '-' is a literal (confirmed with a Java test)
439 # [abc-\p{xx}] the '-' is an error
440 # [abc-] the '-' is a literal
441 # [ab-xy] the '-' is a range
442 #
443 set-lit-dash:
444 '-' n set-after-op doSetDifference2 # --
445 '[' set-after-lit doSetAddDash # literal '-'; the '[' is re-examined by set-after-lit
446 ']' set-after-lit doSetAddDash # literal '-'; the ']' is re-examined by set-after-lit
447 '\' n set-lit-dash-escape
448 default n set-after-range doSetRange # this character is the upper end of the range
449
450 # set-lit-dash-escape
451 #
452 # scanned "[literal-\"
453 # Could be a range, if the \ introduces an escaped literal char or a named char.
454 # Otherwise it is an error.
455 #
456 set-lit-dash-escape:
457 's' errorDeath doSetOpError # a set-valued escape cannot be the end of a range
458 'S' errorDeath doSetOpError
459 'w' errorDeath doSetOpError
460 'W' errorDeath doSetOpError
461 'd' errorDeath doSetOpError
462 'D' errorDeath doSetOpError
463 'N' set-after-range doSetNamedRange # \N{NAME} as the upper end of the range
464 default n set-after-range doSetRange # NOTE(review): p, P, h, H, v, V have no row in this state and so reach this default, although set-escape treats them as set-valued escapes - confirm this is intended
465
466
467 #
468 # set-escape
469 # Common back-slash escape processing within set expressions
470 #
471 set-escape:
472 'p' set-after-set doSetProp # no 'n' on the \p, \P, \N rows; presumably the action scans the {...} itself - confirm
473 'P' set-after-set doSetProp # a property is followed by set-after-set, the same state that follows a nested [set]
474 'N' set-after-lit doSetNamedChar # a named char is followed by set-after-lit, like any other literal
475 's' n set-after-range doSetBackslash_s
476 'S' n set-after-range doSetBackslash_S
477 'w' n set-after-range doSetBackslash_w
478 'W' n set-after-range doSetBackslash_W
479 'd' n set-after-range doSetBackslash_d
480 'D' n set-after-range doSetBackslash_D
481 'h' n set-after-range doSetBackslash_h
482 'H' n set-after-range doSetBackslash_H
483 'v' n set-after-range doSetBackslash_v
484 'V' n set-after-range doSetBackslash_V
485 default n set-after-lit doSetLiteralEscaped
486
487 #
488 # set-finish
489 # Have just encountered the final ']' that completes a [set], and
490 # arrived here via a pop. From here, we exit the set parsing world, and go
491 # back to generic regular expression parsing.
492 #
493 set-finish:
494 default expr-quant doSetFinish # no 'n': the character after the ']' is examined by expr-quant, so a quantifier may follow the set
495
496
497 #
498 # errorDeath. This state is specified as the next state whenever a syntax error
499 # in the pattern is detected. Barring bugs, the state machine will never
500 # actually get here, but will stop because of the action associated with the error.
501 # But, just in case, this state asks the state machine to exit.
502 errorDeath:
503 default n errorDeath doExit
504
505