]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/regexcst.txt
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / regexcst.txt
CommitLineData
b75a7d8f
A
1
2#*****************************************************************************
3#
4# Copyright (C) 2002-2003, International Business Machines Corporation and others.
5# All Rights Reserved.
6#
7#*****************************************************************************
8#
9# file: regexcst.txt
10# ICU Regular Expression Parser State Table
11#
12# This state table is used when reading and parsing a regular expression pattern
13# The pattern parser uses a state machine; the data in this file define the
14# state transitions that occur for each input character.
15#
16# *** This file defines the regex pattern grammar. This is it.
17# *** The determination of what is accepted is here.
18#
19# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20# that are then built with the rule parser.
21#
22
23#
24# Here is the syntax of the state definitions in this file:
25#
26#
27#StateName:
28# input-char n next-state ^push-state action
29# input-char n next-state ^push-state action
30# | | | | |
31# | | | | |--- action to be performed by state machine
32# | | | | See function RegexCompile::doParseActions()
33# | | | |
34# | | | |--- Push this named state onto the state stack.
35# | | | Later, when next state is specified as "pop",
36# | | | the pushed state will become the current state.
37# | | |
38# | | |--- Transition to this state if the current input character matches the input
39# | | character or char class in the left hand column. "pop" causes the next
40# | | state to be popped from the state stack.
41# | |
42# | |--- When making the state transition specified on this line, advance to the next
43# | character from the input only if 'n' appears here.
44# |
45# |--- Character or named character classes to test for. If the current character being scanned
46# matches, perform the actions and go to the state specified on this line.
47# The input character is tested sequentially, in the order written. The characters and
48# character classes tested for do not need to be mutually exclusive. The first match wins.
49#
50
51
52
53
54#
55# start state, scan position is at the beginning of the pattern.
56#
57start:
58 default term doPatStart # no 'n': nothing is consumed; the first pattern character is examined in term
59
60
61
62
63#
64# term. At a position where we can accept the start of most items in a pattern.
65#
66term:
67 quoted n expr-quant doLiteralChar
68 rule_char n expr-quant doLiteralChar
69 '[' n expr-quant doScanUnicodeSet
70 '(' n open-paren # kind of group is decided in open-paren
71 '.' n expr-quant doDotAny
72 '^' n term doCaret # next state is term, not expr-quant: no quantifier is looked for
73 '$' n term doDollar # next state is term, not expr-quant: no quantifier is looked for
74 '\' n backslash # the escape letter is decoded in backslash
75 '|' n term doOrOperator
76 ')' n pop doCloseParen # resume in the state that was pushed when the group was opened
77 eof term doPatFinish # no 'n': nothing is consumed at end of pattern
78 default errorDeath doRuleError # any other character cannot start a term
79
80
81
82#
83# expr-quant We've just finished scanning a term, now look for the optional
84# trailing quantifier - *, +, ?, *?, etc.
85#
86expr-quant:
87 '*' n quant-star # look ahead for *? or *+
88 '+' n quant-plus # look ahead for +? or ++
89 '?' n quant-opt # look ahead for ?? or ?+
90 '{' n interval-open doIntervalInit
91 '(' n open-paren-quant # may be a (?# comment sitting between the term and its quantifier
92 default expr-cont # no quantifier; character is not consumed and is re-tested in expr-cont
93
94
95#
96# expr-cont Expression, continuation. At a point where additional terms are
97# allowed, but not required. No Quantifiers
98#
99expr-cont:
100 '|' n term doOrOperator
101 ')' n pop doCloseParen # resume in the state that was pushed when the group was opened
102 default term # no 'n': the character is re-tested in term as the start of a new term
103
104
105#
106# open-paren-quant Special case handling for comments appearing before a quantifier,
107# e.g. x(?#comment )*
108# Open parens from expr-quant come here; anything but a (?# comment
109# branches into the normal parenthesis sequence as quickly as possible.
110#
111open-paren-quant:
112 '?' n open-paren-quant2 doSuppressComments # saw (? ; next character tells whether it is a comment
113 default open-paren # not (? ; character is not consumed, open-paren re-tests it
114
115open-paren-quant2:
116 '#' n paren-comment ^expr-quant
117 default open-paren-extended # not a comment; character is not consumed, open-paren-extended re-tests it
# The '#' row above pushes expr-quant, so when paren-comment pops at the closing ')'
# a quantifier following the comment is still accepted.
118
119
120#
121# open-paren We've got an open paren. We need to scan further to
122# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
123#
124open-paren:
125 '?' n open-paren-extended doSuppressComments # (? ; group type is decided in open-paren-extended
126 default term ^expr-quant doOpenCaptureParen # plain ( ; character not consumed; pops to expr-quant at the ')'
127
128open-paren-extended:
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
130 '>' n term ^expr-quant doOpenAtomicParen # (?>
131 '=' n term ^expr-cont doOpenLookAhead # (?=   pops to expr-cont, so no quantifier is looked for after the group
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?!   pops to expr-cont, so no quantifier is looked for after the group
133 '<' n open-paren-lookbehind # (?<   expects = or ! next
134 '#' n paren-comment ^term
135 'i' paren-flag doBeginMatchMode # no 'n': the flag character is re-tested in paren-flag
136 'm' paren-flag doBeginMatchMode # no 'n': the flag character is re-tested in paren-flag
137 's' paren-flag doBeginMatchMode # no 'n': the flag character is re-tested in paren-flag
138 'x' paren-flag doBeginMatchMode # no 'n': the flag character is re-tested in paren-flag
139 '-' paren-flag doBeginMatchMode # no 'n': the '-' is re-tested in paren-flag
140 '(' n errorDeath doConditionalExpr # (?(   always transitions to errorDeath
141 '{' n errorDeath doPerlInline # (?{   always transitions to errorDeath
142 default errorDeath doBadOpenParenType
# The (?# row above pushes term, so parsing resumes in term after the comment's ')'.
143
144open-paren-lookbehind:
145 '=' n term ^expr-cont doOpenLookBehind # (?<=
146 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
147 default errorDeath doBadOpenParenType # (?< followed by anything other than = or !
148
149
150#
151# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
152# TODO: should parens nest here? Check what perl does.
153#
154paren-comment:
155 ')' n pop # end of comment; resume in the state pushed by the transition that entered here
156 eof errorDeath doMismatchedParenErr # pattern ended before the closing ')'
157 default n paren-comment # consume and ignore one character of comment text
158
159#
160# paren-flag Scanned a (?ismx-ismx flag setting
161#
162paren-flag:
163 'i' n paren-flag doMatchMode
164 'm' n paren-flag doMatchMode
165 's' n paren-flag doMatchMode
166 'x' n paren-flag doMatchMode
167 '-' n paren-flag doMatchMode
168 ')' n term doSetMatchMode # (?flags)   nothing is pushed; parsing continues in term
169 ':' n term ^expr-quant doMatchModeParen # (?flags:   opens a group; pops to expr-quant at its ')'
170 default errorDeath # NOTE(review): only errorDeath transition in this table with no action - confirm an error is actually reported
171
172
173#
174# quant-star Scanning a '*' quantifier. Need to look ahead to decide
175# between plain '*', '*?', '*+'
176#
177quant-star:
178 '?' n expr-cont doNGStar # *?
179 '+' n expr-cont doPossessiveStar # *+
180 default expr-cont doStar # *   (following character is not consumed)
181
182
183#
184# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
185# between plain '+', '+?', '++'
186#
187quant-plus:
188 '?' n expr-cont doNGPlus # +?
189 '+' n expr-cont doPossessivePlus # ++
190 default expr-cont doPlus # +   (following character is not consumed)
191
192
193#
194# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
195# between plain '?', '??', '?+'
196#
197quant-opt:
198 '?' n expr-cont doNGOpt # ??
199 '+' n expr-cont doPossessiveOpt # ?+
200 default expr-cont doOpt # ?   (following character is not consumed)
201
202
203#
204# Interval scanning a '{', the opening delimiter for an interval specification
205# {number} or {min, max} or {min, }
206#
207interval-open:
208 white_space n interval-open # TODO: is white space allowed here in non-free mode?
209 digit_char interval-lower # no 'n': the first digit is consumed by interval-lower
210 default errorDeath doIntervalError # '{' (plus optional white space) must be followed by a digit
211
212interval-lower:
213 digit_char n interval-lower doIntevalLowerDigit # NOTE(review): "Inteval" spelling is part of the action name; do not correct it here alone
214 ',' n interval-upper # {n,   upper bound (possibly empty) follows
215 '}' n interval-type doIntervalSame # {n}
216 default errorDeath doIntervalError
217
218interval-upper:
219 digit_char n interval-upper doIntervalUpperDigit
220 '}' n interval-type # {n,} or {n,m}   '}' is accepted with or without upper-bound digits
221 default errorDeath doIntervalError
222
223interval-type:
224 '?' n expr-cont doNGInterval # {n,m}?
225 '+' n expr-cont doPossessiveInterval # {n,m}+
226 default expr-cont doInterval # {n,m}   (following character is not consumed)
227
228
229#
230# backslash # Backslash. Figure out which of the \thingies we have encountered.
231# The low level next-char function will have preprocessed
232# some of them already; those won't come here.
233backslash:
234 'A' n term doBackslashA
235 'B' n term doBackslashB
236 'b' n term doBackslashb
237 'd' n expr-quant doBackslashd
238 'D' n expr-quant doBackslashD
239 'G' n term doBackslashG
240 'N' expr-quant doProperty # \N{NAME} named char   (no 'n': the letter is not consumed by the state machine)
241 'p' expr-quant doProperty # \p{Lu} style property   (no 'n': the letter is not consumed by the state machine)
242 'P' expr-quant doProperty # no 'n', same as 'p' above
243 'Q' n term doEnterQuoteMode
244 'S' n expr-quant doBackslashS
245 's' n expr-quant doBackslashs
246 'W' n expr-quant doBackslashW
247 'w' n expr-quant doBackslashw
248 'X' n expr-quant doBackslashX
249 'Z' n term doBackslashZ
250 'z' n term doBackslashz
251 '0' n expr-quant doOctal # must stay above digit_char: first match wins
252 digit_char n expr-quant doBackRef # Will scan multiple digits
253 eof errorDeath doEscapeError # pattern ended right after the backslash
254 default n expr-quant doLiteralChar # Escaped literal char.
255
256
257#
258# errorDeath. This state is specified as the next state whenever a syntax error
259# in the pattern is detected. Barring bugs, the state machine will never
260# actually get here, but will stop because of the action associated with the error.
261# But, just in case, this state asks the state machine to exit.
262errorDeath:
263 default n errorDeath doExit # matches every input; stays in errorDeath and runs doExit
264
265