]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/regexcst.txt
ICU-62107.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / regexcst.txt
... / ...
CommitLineData
1# Copyright (C) 2016 and later: Unicode, Inc. and others.
2# License & terms of use: http://www.unicode.org/copyright.html
3#*****************************************************************************
4#
5# Copyright (C) 2002-2015, International Business Machines Corporation and others.
6# All Rights Reserved.
7#
8#*****************************************************************************
9#
10# file: regexcst.txt
11# ICU Regular Expression Parser State Table
12#
13# This state table is used when reading and parsing a regular expression pattern.
14# The pattern parser uses a state machine; the data in this file define the
15# state transitions that occur for each input character.
16#
17# *** This file defines the regex pattern grammar. This is it.
18# *** The determination of what is accepted is here.
19#
20# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
21# that are then built with the rule parser.
22#
23
24#
25# Here is the syntax of the state definitions in this file:
26#
27#
28#StateName:
29# input-char n next-state ^push-state action
30# input-char n next-state ^push-state action
31# | | | | |
32# | | | | |--- action to be performed by state machine
33# | | | | See function RegexCompile::doParseActions()
34# | | | |
35# | | | |--- Push this named state onto the state stack.
36# | | | Later, when next state is specified as "pop",
37# | | | the pushed state will become the current state.
38# | | |
39# | | |--- Transition to this state if the current input character matches the input
40# | | character or char class in the left hand column. "pop" causes the next
41# | | state to be popped from the state stack.
42# | |
43# | |--- When making the state transition specified on this line, advance to the next
44# | character from the input only if 'n' appears here.
45# |
46# |--- Character or named character classes to test for. If the current character being scanned
47# matches, perform the actions and go to the state specified on this line.
48# The input character is tested sequentially, in the order written. The characters and
49# character classes tested for do not need to be mutually exclusive. The first match wins.
50#
51
52
53
54
55#
56# start state, scan position is at the beginning of the pattern.
57#
58start:
59 default term doPatStart
60
61
62
63
64#
65# term. At a position where we can accept the start of most items in a pattern.
66#
67term:
68 quoted n expr-quant doLiteralChar
69 rule_char n expr-quant doLiteralChar
70 '[' n set-open ^set-finish doSetBegin
71 '(' n open-paren
72 '.' n expr-quant doDotAny
73 '^' n expr-quant doCaret
74 '$' n expr-quant doDollar
75 '\' n backslash
76 '|' n term doOrOperator
77 ')' n pop doCloseParen
78 eof term doPatFinish
79 default errorDeath doRuleError
80
81
82
83#
84# expr-quant We've just finished scanning a term, now look for the optional
85# trailing quantifier - *, +, ?, *?, etc.
86#
87expr-quant:
88 '*' n quant-star
89 '+' n quant-plus
90 '?' n quant-opt
91 '{' n interval-open doIntervalInit
92 '(' n open-paren-quant
93 default expr-cont
94
95
96#
97# expr-cont Expression, continuation. At a point where additional terms are
98# allowed, but not required. No Quantifiers
99#
100expr-cont:
101 '|' n term doOrOperator
102 ')' n pop doCloseParen
103 default term
104
105
106#
107# open-paren-quant Special case handling for comments appearing before a quantifier,
108# e.g. x(?#comment )*
109# Open parens from expr-quant come here; anything but a (?# comment
110# branches into the normal parenthesis sequence as quickly as possible.
111#
112open-paren-quant:
113 '?' n open-paren-quant2 doSuppressComments
114 default open-paren
115
116open-paren-quant2:
117 '#' n paren-comment ^expr-quant
118 default open-paren-extended
119
120
121#
122# open-paren We've got an open paren. We need to scan further to
123# determine what kind of group it is - plain (, (?:, (?>, or whatever.
124#
125open-paren:
126 '?' n open-paren-extended doSuppressComments
127 default term ^expr-quant doOpenCaptureParen
128
129open-paren-extended:
130 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
131 '>' n term ^expr-quant doOpenAtomicParen # (?>
132 '=' n term ^expr-cont doOpenLookAhead # (?=
133 '!' n term ^expr-cont doOpenLookAheadNeg # (?!
134 '<' n open-paren-lookbehind
135 '#' n paren-comment ^term
136 'i' paren-flag doBeginMatchMode
137 'd' paren-flag doBeginMatchMode
138 'm' paren-flag doBeginMatchMode
139 's' paren-flag doBeginMatchMode
140 'u' paren-flag doBeginMatchMode
141 'w' paren-flag doBeginMatchMode
142 'x' paren-flag doBeginMatchMode
143 '-' paren-flag doBeginMatchMode
144 '(' n errorDeath doConditionalExpr
145 '{' n errorDeath doPerlInline
146 default errorDeath doBadOpenParenType
147
148open-paren-lookbehind:
149 '=' n term ^expr-cont doOpenLookBehind # (?<=
150 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
151 ascii_letter named-capture doBeginNamedCapture # (?<name
152 default errorDeath doBadOpenParenType
153
154
155#
156# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
157#
158paren-comment:
159 ')' n pop
160 eof errorDeath doMismatchedParenErr
161 default n paren-comment
162
163#
164# paren-flag Scanned a (?ismx-ismx flag setting
165#
166paren-flag:
167 'i' n paren-flag doMatchMode
168 'd' n paren-flag doMatchMode
169 'm' n paren-flag doMatchMode
170 's' n paren-flag doMatchMode
171 'u' n paren-flag doMatchMode
172 'w' n paren-flag doMatchMode
173 'x' n paren-flag doMatchMode
174 '-' n paren-flag doMatchMode
175 ')' n term doSetMatchMode
176 ':' n term ^expr-quant doMatchModeParen
177 default errorDeath doBadModeFlag
178
179#
180# named-capture (?<name> ... ), position currently on the name.
181#
182named-capture:
183 ascii_letter n named-capture doContinueNamedCapture
184 digit_char n named-capture doContinueNamedCapture
185 '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
186 default errorDeath doBadNamedCapture
187
188#
189# quant-star Scanning a '*' quantifier. Need to look ahead to decide
190# between plain '*', '*?', '*+'
191#
192quant-star:
193 '?' n expr-cont doNGStar # *?
194 '+' n expr-cont doPossessiveStar # *+
195 default expr-cont doStar
196
197
198#
199# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
200# between plain '+', '+?', '++'
201#
202quant-plus:
203 '?' n expr-cont doNGPlus # +?
204 '+' n expr-cont doPossessivePlus # ++
205 default expr-cont doPlus # +
206
207
208#
209# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
210# between plain '?', '??', '?+'
211#
212quant-opt:
213 '?' n expr-cont doNGOpt # ??
214 '+' n expr-cont doPossessiveOpt # ?+
215 default expr-cont doOpt # ?
216
217
218#
219# Interval scanning a '{', the opening delimiter for an interval specification
220# {number} or {min, max} or {min,}
221#
222interval-open:
223 digit_char interval-lower
224 default errorDeath doIntervalError
225
226interval-lower:
227 digit_char n interval-lower doIntevalLowerDigit
228 ',' n interval-upper
229 '}' n interval-type doIntervalSame # {n}
230 default errorDeath doIntervalError
231
232interval-upper:
233 digit_char n interval-upper doIntervalUpperDigit
234 '}' n interval-type
235 default errorDeath doIntervalError
236
237interval-type:
238 '?' n expr-cont doNGInterval # {n,m}?
239 '+' n expr-cont doPossessiveInterval # {n,m}+
240 default expr-cont doInterval # {n,m}
241
242
243#
244# backslash Backslash. Figure out which of the \thingies we have encountered.
245# The low level next-char function will have preprocessed
246# some of them already; those won't come here.
247backslash:
248 'A' n term doBackslashA
249 'B' n term doBackslashB
250 'b' n term doBackslashb
251 'd' n expr-quant doBackslashd
252 'D' n expr-quant doBackslashD
253 'G' n term doBackslashG
254 'h' n expr-quant doBackslashh
255 'H' n expr-quant doBackslashH
256 'k' n named-backref
257 'N' expr-quant doNamedChar # \N{NAME} named char
258 'p' expr-quant doProperty # \p{Lu} style property
259 'P' expr-quant doProperty
260 'R' n expr-quant doBackslashR
261 'Q' n term doEnterQuoteMode
262 'S' n expr-quant doBackslashS
263 's' n expr-quant doBackslashs
264 'v' n expr-quant doBackslashv
265 'V' n expr-quant doBackslashV
266 'W' n expr-quant doBackslashW
267 'w' n expr-quant doBackslashw
268 'X' n expr-quant doBackslashX
269 'Z' n term doBackslashZ
270 'z' n term doBackslashz
271 digit_char n expr-quant doBackRef # Will scan multiple digits
272 eof errorDeath doEscapeError
273 default n expr-quant doEscapedLiteralChar
274
275
276# named-backref Scanned \k
277# Leading to \k<captureName>
278# Failure to get the full sequence is an error.
279#
280named-backref:
281 '<' n named-backref-2 doBeginNamedBackRef
282 default errorDeath doBadNamedCapture
283
284named-backref-2:
285 ascii_letter n named-backref-3 doContinueNamedBackRef
286 default errorDeath doBadNamedCapture
287
288named-backref-3:
289 ascii_letter n named-backref-3 doContinueNamedBackRef
290 digit_char n named-backref-3 doContinueNamedBackRef
291 '>' n expr-quant doCompleteNamedBackRef
292 default errorDeath doBadNamedCapture
293
294
295#
296# [set expression] parsing,
297# All states involved in parsing set expressions have names beginning with "set-"
298#
299
300set-open:
301 '^' n set-open2 doSetNegate
302 ':' set-posix doSetPosixProp
303 default set-open2
304
305set-open2:
306 ']' n set-after-lit doSetLiteral
307 default set-start
308
309# set-posix:
310# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
311# moved the scan to the closing ']'. If it wasn't a property
312# expression, the scan will still be at the opening ':', which should
313# be interpreted as a normal set expression.
314set-posix:
315 ']' n pop doSetEnd
316 ':' set-start
317 default errorDeath doRuleError # should not be possible.
318
319#
320# set-start after the [ and special case leading characters (^ and/or ]) but before
321# everything else. A '-' is literal at this point.
322#
323set-start:
324 ']' n pop doSetEnd
325 '[' n set-open ^set-after-set doSetBeginUnion
326 '\' n set-escape
327 '-' n set-start-dash
328 '&' n set-start-amp
329 default n set-after-lit doSetLiteral
330
331# set-start-dash Turn "[--" into a syntax error.
332# "[-x" is good, - and x are literals.
333#
334set-start-dash:
335 '-' errorDeath doRuleError
336 default set-after-lit doSetAddDash
337
338# set-start-amp Turn "[&&" into a syntax error.
339# "[&x" is good, & and x are literals.
340#
341set-start-amp:
342 '&' errorDeath doRuleError
343 default set-after-lit doSetAddAmp
344
345#
346# set-after-lit The last thing scanned was a literal character within a set.
347# Can be followed by anything. Single '-' or '&' are
348# literals in this context, not operators.
349set-after-lit:
350 ']' n pop doSetEnd
351 '[' n set-open ^set-after-set doSetBeginUnion
352 '-' n set-lit-dash
353 '&' n set-lit-amp
354 '\' n set-escape
355 eof errorDeath doSetNoCloseError
356 default n set-after-lit doSetLiteral
357
358set-after-set:
359 ']' n pop doSetEnd
360 '[' n set-open ^set-after-set doSetBeginUnion
361 '-' n set-set-dash
362 '&' n set-set-amp
363 '\' n set-escape
364 eof errorDeath doSetNoCloseError
365 default n set-after-lit doSetLiteral
366
367set-after-range:
368 ']' n pop doSetEnd
369 '[' n set-open ^set-after-set doSetBeginUnion
370 '-' n set-range-dash
371 '&' n set-range-amp
372 '\' n set-escape
373 eof errorDeath doSetNoCloseError
374 default n set-after-lit doSetLiteral
375
376
377# set-after-op
378# After a -- or &&
379# It is an error to close a set at this point.
380#
381set-after-op:
382 '[' n set-open ^set-after-set doSetBeginUnion
383 ']' errorDeath doSetOpError
384 '\' n set-escape
385 default n set-after-lit doSetLiteral
386
387#
388# set-set-amp
389# Have scanned [[set]&
390# Could be a '&' intersection operator, if a set follows.
391# Could be the start of a '&&' operator.
392# Otherwise is a literal.
393set-set-amp:
394 '[' n set-open ^set-after-set doSetBeginIntersection1
395 '&' n set-after-op doSetIntersection2
396 default set-after-lit doSetAddAmp
397
398
399# set-lit-amp Have scanned "[literals&"
400# Could be a start of "&&" operator or a literal
401# In [abc&[def]], the '&' is a literal
402#
403set-lit-amp:
404 '&' n set-after-op doSetIntersection2
405 default set-after-lit doSetAddAmp
406
407
408#
409# set-set-dash
410# Have scanned [set]-
411# Could be a '-' difference operator, if a [set] follows.
412# Could be the start of a '--' operator.
413# Otherwise is a literal.
414set-set-dash:
415 '[' n set-open ^set-after-set doSetBeginDifference1
416 '-' n set-after-op doSetDifference2
417 default set-after-lit doSetAddDash
418
419
420#
421# set-range-dash
422# scanned a-b- or \w-
423# any set or range like item where the trailing single '-' should
424# be literal, not a set difference operation.
425# A trailing "--" is still a difference operator.
426set-range-dash:
427 '-' n set-after-op doSetDifference2
428 default set-after-lit doSetAddDash
429
430
431set-range-amp:
432 '&' n set-after-op doSetIntersection2
433 default set-after-lit doSetAddAmp
434
435
436# set-lit-dash
437# Have scanned "[literals-" Could be a range or a -- operator or a literal
438# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
439# [abc-\p{xx} the '-' is an error
440# [abc-] the '-' is a literal
441# [ab-xy] the '-' is a range
442#
443set-lit-dash:
444 '-' n set-after-op doSetDifference2
445 '[' set-after-lit doSetAddDash
446 ']' set-after-lit doSetAddDash
447 '\' n set-lit-dash-escape
448 default n set-after-range doSetRange
449
450# set-lit-dash-escape
451#
452# scanned "[literal-\"
453# Could be a range, if the \ introduces an escaped literal char or a named char.
454# Otherwise it is an error.
455#
456set-lit-dash-escape:
457 's' errorDeath doSetOpError
458 'S' errorDeath doSetOpError
459 'w' errorDeath doSetOpError
460 'W' errorDeath doSetOpError
461 'd' errorDeath doSetOpError
462 'D' errorDeath doSetOpError
463 'N' set-after-range doSetNamedRange
464 default n set-after-range doSetRange
465
466
467#
468# set-escape
469# Common back-slash escape processing within set expressions
470#
471set-escape:
472 'p' set-after-set doSetProp
473 'P' set-after-set doSetProp
474 'N' set-after-lit doSetNamedChar
475 's' n set-after-range doSetBackslash_s
476 'S' n set-after-range doSetBackslash_S
477 'w' n set-after-range doSetBackslash_w
478 'W' n set-after-range doSetBackslash_W
479 'd' n set-after-range doSetBackslash_d
480 'D' n set-after-range doSetBackslash_D
481 'h' n set-after-range doSetBackslash_h
482 'H' n set-after-range doSetBackslash_H
483 'v' n set-after-range doSetBackslash_v
484 'V' n set-after-range doSetBackslash_V
485 default n set-after-lit doSetLiteralEscaped
486
487#
488# set-finish
489# Have just encountered the final ']' that completes a [set], and
490# arrived here via a pop. From here, we exit the set parsing world, and go
491# back to generic regular expression parsing.
492#
493set-finish:
494 default expr-quant doSetFinish
495
496
497#
498# errorDeath. This state is specified as the next state whenever a syntax error
499# in the source pattern is detected. Barring bugs, the state machine will never
500# actually get here, but will stop because of the action associated with the error.
501# But, just in case, this state asks the state machine to exit.
502errorDeath:
503 default n errorDeath doExit
504
505