]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/i18n/regexcst.txt
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / i18n / regexcst.txt
... / ...
CommitLineData
1
2#*****************************************************************************
3#
4# Copyright (C) 2002-2015, International Business Machines Corporation and others.
5# All Rights Reserved.
6#
7#*****************************************************************************
8#
9# file: regexcst.txt
10# ICU Regular Expression Parser State Table
11#
12# This state table is used when reading and parsing a regular expression pattern
13# The pattern parser uses a state machine; the data in this file define the
14# state transitions that occur for each input character.
15#
16# *** This file defines the regex pattern grammar. This is it.
17# *** The determination of what is accepted is here.
18#
19# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20# that are then built with the rule parser.
21#
22
23#
24# Here is the syntax of the state definitions in this file:
25#
26#
27#StateName:
28# input-char n next-state ^push-state action
29# input-char n next-state ^push-state action
30# | | | | |
31# | | | | |--- action to be performed by state machine
32# | | | | See function RegexCompile::doParseActions()
33# | | | |
34# | | | |--- Push this named state onto the state stack.
35# | | | Later, when next state is specified as "pop",
36# | | | the pushed state will become the current state.
37# | | |
38# | | |--- Transition to this state if the current input character matches the input
39# | | character or char class in the left hand column. "pop" causes the next
40# | | state to be popped from the state stack.
41# | |
42# | |--- When making the state transition specified on this line, advance to the next
43# | character from the input only if 'n' appears here.
44# |
45# |--- Character or named character classes to test for. If the current character being scanned
46# matches, perform the actions and go to the state specified on this line.
47# The input character is tested sequentially, in the order written. The characters and
48# character classes tested for do not need to be mutually exclusive. The first match wins.
49#
50
51
52
53
54#
55# start state, scan position is at the beginning of the pattern.
56#
57start:
58 default term doPatStart
59
60
61
62
63#
64# term. At a position where we can accept the start of most items in a pattern.
65#
66term:
67 quoted n expr-quant doLiteralChar
68 rule_char n expr-quant doLiteralChar
69 '[' n set-open ^set-finish doSetBegin
70 '(' n open-paren
71 '.' n expr-quant doDotAny
72 '^' n expr-quant doCaret
73 '$' n expr-quant doDollar
74 '\' n backslash
75 '|' n term doOrOperator
76 ')' n pop doCloseParen
77 eof term doPatFinish
78 default errorDeath doRuleError
79
80
81
82#
83# expr-quant We've just finished scanning a term, now look for the optional
84# trailing quantifier - *, +, ?, *?, etc.
85#
86expr-quant:
87 '*' n quant-star
88 '+' n quant-plus
89 '?' n quant-opt
90 '{' n interval-open doIntervalInit
91 '(' n open-paren-quant
92 default expr-cont
93
94
95#
96# expr-cont Expression, continuation. At a point where additional terms are
97# allowed, but not required. No Quantifiers
98#
99expr-cont:
100 '|' n term doOrOperator
101 ')' n pop doCloseParen
102 default term
103
104
105#
106# open-paren-quant Special case handling for comments appearing before a quantifier,
107# e.g. x(?#comment )*
108# Open parens from expr-quant come here; anything but a (?# comment
109# branches into the normal parenthesis sequence as quickly as possible.
110#
111open-paren-quant:
112 '?' n open-paren-quant2 doSuppressComments
113 default open-paren
114
115open-paren-quant2:
116 '#' n paren-comment ^expr-quant
117 default open-paren-extended
118
119
120#
121# open-paren We've got an open paren. We need to scan further to
122# determine what kind of group it is - plain (, (?:, (?>, or whatever.
123#
124open-paren:
125 '?' n open-paren-extended doSuppressComments
126 default term ^expr-quant doOpenCaptureParen
127
128open-paren-extended:
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
130 '>' n term ^expr-quant doOpenAtomicParen # (?>
131 '=' n term ^expr-cont doOpenLookAhead # (?=
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?!
133 '<' n open-paren-lookbehind
134 '#' n paren-comment ^term
135 'i' paren-flag doBeginMatchMode
136 'd' paren-flag doBeginMatchMode
137 'm' paren-flag doBeginMatchMode
138 's' paren-flag doBeginMatchMode
139 'u' paren-flag doBeginMatchMode
140 'w' paren-flag doBeginMatchMode
141 'x' paren-flag doBeginMatchMode
142 '-' paren-flag doBeginMatchMode
143 '(' n errorDeath doConditionalExpr
144 '{' n errorDeath doPerlInline
145 default errorDeath doBadOpenParenType
146
147open-paren-lookbehind:
148 '=' n term ^expr-cont doOpenLookBehind # (?<=
149 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
150 ascii_letter named-capture doBeginNamedCapture # (?<name
151 default errorDeath doBadOpenParenType
152
153
154#
155# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
156#
157paren-comment:
158 ')' n pop
159 eof errorDeath doMismatchedParenErr
160 default n paren-comment
161
162#
163# paren-flag Scanned a (?ismx-ismx flag setting
164#
165paren-flag:
166 'i' n paren-flag doMatchMode
167 'd' n paren-flag doMatchMode
168 'm' n paren-flag doMatchMode
169 's' n paren-flag doMatchMode
170 'u' n paren-flag doMatchMode
171 'w' n paren-flag doMatchMode
172 'x' n paren-flag doMatchMode
173 '-' n paren-flag doMatchMode
174 ')' n term doSetMatchMode
175 ':' n term ^expr-quant doMatchModeParen
176 default errorDeath doBadModeFlag
177
178#
179# named-capture (?<name> ... ), position currently on the name.
180#
181named-capture:
182 ascii_letter n named-capture doContinueNamedCapture
183 digit_char n named-capture doContinueNamedCapture
184 '>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
185 default errorDeath doBadNamedCapture
186
187#
188# quant-star Scanning a '*' quantifier. Need to look ahead to decide
189# between plain '*', '*?', '*+'
190#
191quant-star:
192 '?' n expr-cont doNGStar # *?
193 '+' n expr-cont doPossessiveStar # *+
194 default expr-cont doStar
195
196
197#
198# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
199# between plain '+', '+?', '++'
200#
201quant-plus:
202 '?' n expr-cont doNGPlus # +?
203 '+' n expr-cont doPossessivePlus # ++
204 default expr-cont doPlus # +
205
206
207#
208# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
209# between plain '?', '??', '?+'
210#
211quant-opt:
212 '?' n expr-cont doNGOpt # ??
213 '+' n expr-cont doPossessiveOpt # ?+
214 default expr-cont doOpt # ?
215
216
217#
218# Interval scanning a '{', the opening delimiter for an interval specification
219# {number} or {min, max} or {min,}
220#
221interval-open:
222 digit_char interval-lower
223 default errorDeath doIntervalError
224
225interval-lower:
226 digit_char n interval-lower doIntevalLowerDigit
227 ',' n interval-upper
228 '}' n interval-type doIntervalSame # {n}
229 default errorDeath doIntervalError
230
231interval-upper:
232 digit_char n interval-upper doIntervalUpperDigit
233 '}' n interval-type
234 default errorDeath doIntervalError
235
236interval-type:
237 '?' n expr-cont doNGInterval # {n,m}?
238 '+' n expr-cont doPossessiveInterval # {n,m}+
239 default expr-cont doInterval # {n,m}
240
241
242#
243# backslash # Backslash. Figure out which of the \thingies we have encountered.
244# The low level next-char function will have preprocessed
245# some of them already; those won't come here.
246backslash:
247 'A' n term doBackslashA
248 'B' n term doBackslashB
249 'b' n term doBackslashb
250 'd' n expr-quant doBackslashd
251 'D' n expr-quant doBackslashD
252 'G' n term doBackslashG
253 'h' n expr-quant doBackslashh
254 'H' n expr-quant doBackslashH
255 'k' n named-backref
256 'N' expr-quant doNamedChar # \N{NAME} named char
257 'p' expr-quant doProperty # \p{Lu} style property
258 'P' expr-quant doProperty
259 'R' n expr-quant doBackslashR
260 'Q' n term doEnterQuoteMode
261 'S' n expr-quant doBackslashS
262 's' n expr-quant doBackslashs
263 'v' n expr-quant doBackslashv
264 'V' n expr-quant doBackslashV
265 'W' n expr-quant doBackslashW
266 'w' n expr-quant doBackslashw
267 'X' n expr-quant doBackslashX
268 'Z' n term doBackslashZ
269 'z' n term doBackslashz
270 digit_char n expr-quant doBackRef # Will scan multiple digits
271 eof errorDeath doEscapeError
272 default n expr-quant doEscapedLiteralChar
273
274
275# named-backref Scanned \k
276# Leading to \k<captureName>
277# Failure to get the full sequence is an error.
278#
279named-backref:
280 '<' n named-backref-2 doBeginNamedBackRef
281 default errorDeath doBadNamedCapture
282
283named-backref-2:
284 ascii_letter n named-backref-3 doContinueNamedBackRef
285 default errorDeath doBadNamedCapture
286
287named-backref-3:
288 ascii_letter n named-backref-3 doContinueNamedBackRef
289 digit_char n named-backref-3 doContinueNamedBackRef
290 '>' n expr-quant doCompleteNamedBackRef
291 default errorDeath doBadNamedCapture
292
293
294#
295# [set expression] parsing,
296# All states involved in parsing set expressions have names beginning with "set-"
297#
298
299set-open:
300 '^' n set-open2 doSetNegate
301 ':' set-posix doSetPosixProp
302 default set-open2
303
304set-open2:
305 ']' n set-after-lit doSetLiteral
306 default set-start
307
308# set-posix:
309# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
310# moved the scan to the closing ']'. If it wasn't a property
311# expression, the scan will still be at the opening ':', which should
312# be interpreted as a normal set expression.
313set-posix:
314 ']' n pop doSetEnd
315 ':' set-start
316 default errorDeath doRuleError # should not be possible.
317
318#
319# set-start after the [ and special case leading characters (^ and/or ]) but before
320# everything else. A '-' is literal at this point.
321#
322set-start:
323 ']' n pop doSetEnd
324 '[' n set-open ^set-after-set doSetBeginUnion
325 '\' n set-escape
326 '-' n set-start-dash
327 '&' n set-start-amp
328 default n set-after-lit doSetLiteral
329
330# set-start-dash Turn "[--" into a syntax error.
331# "[-x" is good, - and x are literals.
332#
333set-start-dash:
334 '-' errorDeath doRuleError
335 default set-after-lit doSetAddDash
336
337# set-start-amp Turn "[&&" into a syntax error.
338# "[&x" is good, & and x are literals.
339#
340set-start-amp:
341 '&' errorDeath doRuleError
342 default set-after-lit doSetAddAmp
343
344#
345# set-after-lit The last thing scanned was a literal character within a set.
346# Can be followed by anything. Single '-' or '&' are
347# literals in this context, not operators.
348set-after-lit:
349 ']' n pop doSetEnd
350 '[' n set-open ^set-after-set doSetBeginUnion
351 '-' n set-lit-dash
352 '&' n set-lit-amp
353 '\' n set-escape
354 eof errorDeath doSetNoCloseError
355 default n set-after-lit doSetLiteral
356
357set-after-set:
358 ']' n pop doSetEnd
359 '[' n set-open ^set-after-set doSetBeginUnion
360 '-' n set-set-dash
361 '&' n set-set-amp
362 '\' n set-escape
363 eof errorDeath doSetNoCloseError
364 default n set-after-lit doSetLiteral
365
366set-after-range:
367 ']' n pop doSetEnd
368 '[' n set-open ^set-after-set doSetBeginUnion
369 '-' n set-range-dash
370 '&' n set-range-amp
371 '\' n set-escape
372 eof errorDeath doSetNoCloseError
373 default n set-after-lit doSetLiteral
374
375
376# set-after-op
377# After a -- or &&
378# It is an error to close a set at this point.
379#
380set-after-op:
381 '[' n set-open ^set-after-set doSetBeginUnion
382 ']' errorDeath doSetOpError
383 '\' n set-escape
384 default n set-after-lit doSetLiteral
385
386#
387# set-set-amp
388# Have scanned [[set]&
389# Could be a '&' intersection operator, if a set follows.
390# Could be the start of a '&&' operator.
391# Otherwise is a literal.
392set-set-amp:
393 '[' n set-open ^set-after-set doSetBeginIntersection1
394 '&' n set-after-op doSetIntersection2
395 default set-after-lit doSetAddAmp
396
397
398# set-lit-amp Have scanned "[literals&"
399# Could be a start of "&&" operator or a literal
400# In [abc&[def]], the '&' is a literal
401#
402set-lit-amp:
403 '&' n set-after-op doSetIntersection2
404 default set-after-lit doSetAddAmp
405
406
407#
408# set-set-dash
409# Have scanned [set]-
410# Could be a '-' difference operator, if a [set] follows.
411# Could be the start of a '--' operator.
412# Otherwise is a literal.
413set-set-dash:
414 '[' n set-open ^set-after-set doSetBeginDifference1
415 '-' n set-after-op doSetDifference2
416 default set-after-lit doSetAddDash
417
418
419#
420# set-range-dash
421# scanned a-b- or \w-
422# any set or range like item where the trailing single '-' should
423# be literal, not a set difference operation.
424# A trailing "--" is still a difference operator.
425set-range-dash:
426 '-' n set-after-op doSetDifference2
427 default set-after-lit doSetAddDash
428
429
430set-range-amp:
431 '&' n set-after-op doSetIntersection2
432 default set-after-lit doSetAddAmp
433
434
435# set-lit-dash
436# Have scanned "[literals-" Could be a range or a -- operator or a literal
437# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
438# [abc-\p{xx} the '-' is an error
439# [abc-] the '-' is a literal
440# [ab-xy] the '-' is a range
441#
442set-lit-dash:
443 '-' n set-after-op doSetDifference2
444 '[' set-after-lit doSetAddDash
445 ']' set-after-lit doSetAddDash
446 '\' n set-lit-dash-escape
447 default n set-after-range doSetRange
448
449# set-lit-dash-escape
450#
451# scanned "[literal-\"
452# Could be a range, if the \ introduces an escaped literal char or a named char.
453# Otherwise it is an error.
454#
455set-lit-dash-escape:
456 's' errorDeath doSetOpError
457 'S' errorDeath doSetOpError
458 'w' errorDeath doSetOpError
459 'W' errorDeath doSetOpError
460 'd' errorDeath doSetOpError
461 'D' errorDeath doSetOpError
462 'N' set-after-range doSetNamedRange
463 default n set-after-range doSetRange
464
465
466#
467# set-escape
468# Common back-slash escape processing within set expressions
469#
470set-escape:
471 'p' set-after-set doSetProp
472 'P' set-after-set doSetProp
473 'N' set-after-lit doSetNamedChar
474 's' n set-after-range doSetBackslash_s
475 'S' n set-after-range doSetBackslash_S
476 'w' n set-after-range doSetBackslash_w
477 'W' n set-after-range doSetBackslash_W
478 'd' n set-after-range doSetBackslash_d
479 'D' n set-after-range doSetBackslash_D
480 'h' n set-after-range doSetBackslash_h
481 'H' n set-after-range doSetBackslash_H
482 'v' n set-after-range doSetBackslash_v
483 'V' n set-after-range doSetBackslash_V
484 default n set-after-lit doSetLiteralEscaped
485
486#
487# set-finish
488# Have just encountered the final ']' that completes a [set], and
489# arrived here via a pop. From here, we exit the set parsing world, and go
490# back to generic regular expression parsing.
491#
492set-finish:
493 default expr-quant doSetFinish
494
495
496#
497# errorDeath. This state is specified as the next state whenever a syntax error
498# in the source rules is detected. Barring bugs, the state machine will never
499# actually get here, but will stop because of the action associated with the error.
500# But, just in case, this state asks the state machine to exit.
501errorDeath:
502 default n errorDeath doExit
503
504