]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/regexcst.txt
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / regexcst.txt
1
2 #*****************************************************************************
3 #
4 # Copyright (C) 2002-2003, International Business Machines Corporation and others.
5 # All Rights Reserved.
6 #
7 #*****************************************************************************
8 #
9 # file: regexcst.txt
10 # ICU Regular Expression Parser State Table
11 #
12 # This state table is used when reading and parsing a regular expression pattern
13 # The pattern parser uses a state machine; the data in this file define the
14 # state transitions that occur for each input character.
15 #
16 # *** This file defines the regex pattern grammar. This is it.
17 # *** The determination of what is accepted is here.
18 #
19 # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
20 # that are then built with the rule parser.
21 #
22
23 #
24 # Here is the syntax of the state definitions in this file:
25 #
26 #
27 #StateName:
28 # input-char n next-state ^push-state action
29 # input-char n next-state ^push-state action
30 # | | | | |
31 # | | | | |--- action to be performed by state machine
32 # | | | | See function RegexCompile::doParseActions()
33 # | | | |
34 # | | | |--- Push this named state onto the state stack.
35 # | | | Later, when next state is specified as "pop",
36 # | | | the pushed state will become the current state.
37 # | | |
38 # | | |--- Transition to this state if the current input character matches the input
39 # | | character or char class in the left hand column. "pop" causes the next
40 # | | state to be popped from the state stack.
41 # | |
42 # | |--- When making the state transition specified on this line, advance to the next
43 # | character from the input only if 'n' appears here.
44 # |
45 # |--- Character or named character classes to test for. If the current character being scanned
46 # matches, perform the actions and go to the state specified on this line.
47 # The input character is tested sequentially, in the order written. The characters and
48 # character classes tested for do not need to be mutually exclusive. The first match wins.
49 #
50
51
52
53
54 #
55 # start state, scan position is at the beginning of the pattern.
56 #
57 start:
58 default term doPatStart # any input: perform doPatStart and go to term; no 'n', so the character is not consumed
59
60
61
62
63 #
64 # term. At a position where we can accept the start of most items in a pattern.
65 #
66 term:
67 quoted n expr-quant doLiteralChar # quoted char is a literal; then look for a quantifier
68 rule_char n expr-quant doLiteralChar # ordinary pattern char is a literal; then look for a quantifier
69 '[' n expr-quant doScanUnicodeSet # '[' opens a set; then look for a quantifier
70 '(' n open-paren # the kind of paren is decided in open-paren
71 '.' n expr-quant doDotAny # then look for a quantifier
72 '^' n term doCaret # stays in term, so expr-quant is not visited
73 '$' n term doDollar # stays in term, so expr-quant is not visited
74 '\' n backslash # the escape is classified in the backslash state
75 '|' n term doOrOperator # alternation; another term follows
76 ')' n pop doCloseParen # next state is whatever was pushed when the paren was opened
77 eof term doPatFinish # end of pattern; presumably doPatFinish stops the state machine - confirm
78 default errorDeath doRuleError # nothing else may start a term
79
80
81
82 #
83 # expr-quant We've just finished scanning a term, now look for the optional
84 # trailing quantifier - *, +, ?, *?, etc.
85 #
86 expr-quant:
87 '*' n quant-star # look ahead for *? or *+
88 '+' n quant-plus # look ahead for +? or ++
89 '?' n quant-opt # look ahead for ?? or ?+
90 '{' n interval-open doIntervalInit # start of an interval specification
91 '(' n open-paren-quant # may be a (?# comment placed before the quantifier
92 default expr-cont # no quantifier; the character is not consumed
93
94
95 #
96 # expr-cont Expression, continuation. At a point where additional terms are
97 # allowed, but not required. No Quantifiers
98 #
99 expr-cont:
100 '|' n term doOrOperator # alternation; another term follows
101 ')' n pop doCloseParen # next state is whatever was pushed when the paren was opened
102 default term # start of another term; the character is not consumed
103
104
105 #
106 # open-paren-quant Special case handling for comments appearing before a quantifier,
107 # e.g. x(?#comment )*
108 # Open parens from expr-quant come here; anything but a (?# comment
109 # branches into the normal parenthesis sequence as quickly as possible.
110 #
111 open-paren-quant:
112 '?' n open-paren-quant2 doSuppressComments # "(?" seen; open-paren-quant2 checks for '#'
113 default open-paren # plain "(": normal paren handling; the character is not consumed
114
115 open-paren-quant2:
116 '#' n paren-comment ^expr-quant # (?# comment; expr-quant is pushed so a quantifier after the comment is still seen
117 default open-paren-extended # not a comment: normal "(?" handling; the character is not consumed
118
119
120 #
121 # open-paren We've got an open paren. We need to scan further to
122 # determine what kind of group it is - plain (, (?:, (?>, or whatever.
123 #
124 open-paren:
125 '?' n open-paren-extended doSuppressComments # "(?": the extended form is decided in open-paren-extended
126 default term ^expr-quant doOpenCaptureParen # plain "(": the character is not consumed; expr-quant is pushed for the matching ')'
127
128 open-paren-extended:
129 ':' n term ^expr-quant doOpenNonCaptureParen # (?:
130 '>' n term ^expr-quant doOpenAtomicParen # (?>
131 '=' n term ^expr-cont doOpenLookAhead # (?=   expr-cont is pushed, so no quantifier is looked for after the ')'
132 '!' n term ^expr-cont doOpenLookAheadNeg # (?!   expr-cont is pushed, so no quantifier is looked for after the ')'
133 '<' n open-paren-lookbehind # (?<
134 '#' n paren-comment ^term # (?#   term is pushed, so parsing resumes in term after the comment
135 'i' paren-flag doBeginMatchMode # flag chars have no 'n': the character is left for paren-flag to consume
136 'm' paren-flag doBeginMatchMode
137 's' paren-flag doBeginMatchMode
138 'w' paren-flag doBeginMatchMode
139 'x' paren-flag doBeginMatchMode
140 '-' paren-flag doBeginMatchMode
141 '(' n errorDeath doConditionalExpr # (?(   goes to errorDeath
142 '{' n errorDeath doPerlInline # (?{   goes to errorDeath
143 default errorDeath doBadOpenParenType # any other character after "(?" is an error
144
145 open-paren-lookbehind:
146 '=' n term ^expr-cont doOpenLookBehind # (?<=
147 '!' n term ^expr-cont doOpenLookBehindNeg # (?<!
148 default errorDeath doBadOpenParenType # any other character after "(?<" is an error
149
150
151 #
152 # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
153 # TODO: should parens nest here? Check what perl does.
154 #
155 paren-comment:
156 ')' n pop # end of comment; next state is the one pushed on entry
157 eof errorDeath doMismatchedParenErr # pattern ended inside the comment
158 default n paren-comment # consume comment text
159
160 #
161 # paren-flag Scanned a (?ismx-ismx flag setting
162 #
163 paren-flag:
164 'i' n paren-flag doMatchMode # each flag char is consumed here and the state loops
165 'm' n paren-flag doMatchMode
166 's' n paren-flag doMatchMode
167 'w' n paren-flag doMatchMode
168 'x' n paren-flag doMatchMode
169 '-' n paren-flag doMatchMode
170 ')' n term doSetMatchMode # (?flags)   nothing is pushed; continue in term
171 ':' n term ^expr-quant doMatchModeParen # (?flags:   expr-quant is pushed for the matching ')'
172 default errorDeath doBadModeFlag # any other character in the flag list is an error
173
174
175 #
176 # quant-star Scanning a '*' quantifier. Need to look ahead to decide
177 # between plain '*', '*?', '*+'
178 #
179 quant-star:
180 '?' n expr-cont doNGStar # *?
181 '+' n expr-cont doPossessiveStar # *+
182 default expr-cont doStar # *   the following character is not consumed
183
184
185 #
186 # quant-plus Scanning a '+' quantifier. Need to look ahead to decide
187 # between plain '+', '+?', '++'
188 #
189 quant-plus:
190 '?' n expr-cont doNGPlus # +?
191 '+' n expr-cont doPossessivePlus # ++
192 default expr-cont doPlus # +   the following character is not consumed
193
194
195 #
196 # quant-opt Scanning a '?' quantifier. Need to look ahead to decide
197 # between plain '?', '??', '?+'
198 #
199 quant-opt:
200 '?' n expr-cont doNGOpt # ??
201 '+' n expr-cont doPossessiveOpt # ?+
202 default expr-cont doOpt # ?   the following character is not consumed
203
204
205 #
206 # Interval scanning a '{', the opening delimiter for an interval specification
207 # {number} or {min, max} or {min, }
208 #
209 interval-open:
210 white_space n interval-open # TODO: is white space allowed here in non-free mode?
211 digit_char interval-lower # no 'n': the first digit is left for interval-lower to consume
212 default errorDeath doIntervalError # '{' (and any white space) must be followed by a digit
213
214 interval-lower:
215 digit_char n interval-lower doIntevalLowerDigit # NOTE(review): action name is spelled "Inteval"; presumably it must match the name used by the C++ action code - confirm before renaming
216 ',' n interval-upper # {n,
217 '}' n interval-type doIntervalSame # {n}
218 default errorDeath doIntervalError # anything else, white space included, is an error here
219
220 interval-upper:
221 digit_char n interval-upper doIntervalUpperDigit
222 '}' n interval-type # {n,m} or {n,}   no upper-bound digit is required before '}'
223 default errorDeath doIntervalError # anything else, white space included, is an error here
224
225 interval-type:
226 '?' n expr-cont doNGInterval # {n,m}?
227 '+' n expr-cont doPossessiveInterval # {n,m}+
228 default expr-cont doInterval # {n,m}   the following character is not consumed
229
230
231 #
232 # backslash Backslash. Figure out which of the \thingies we have encountered.
233 # The low level next-char function will have preprocessed
234 # some of them already; those won't come here.
235 backslash:
236 'A' n term doBackslashA # entries whose next state is term skip the quantifier check in expr-quant
237 'B' n term doBackslashB
238 'b' n term doBackslashb
239 'd' n expr-quant doBackslashd # entries whose next state is expr-quant may be followed by a quantifier
240 'D' n expr-quant doBackslashD
241 'G' n term doBackslashG
242 'N' expr-quant doProperty # \N{NAME} named char
243 'p' expr-quant doProperty # \p{Lu} style property
244 'P' expr-quant doProperty # \P   like 'N' and 'p', no 'n': the letter is not consumed by this transition
245 'Q' n term doEnterQuoteMode # \Q
246 'S' n expr-quant doBackslashS
247 's' n expr-quant doBackslashs
248 'W' n expr-quant doBackslashW
249 'w' n expr-quant doBackslashw
250 'X' n expr-quant doBackslashX
251 'Z' n term doBackslashZ
252 'z' n term doBackslashz
253 digit_char n expr-quant doBackRef # Will scan multiple digits
254 eof errorDeath doEscapeError # pattern ended right after the backslash
255 default n expr-quant doLiteralChar # Escaped literal char.
256
257
258 #
259 # errorDeath. This state is specified as the next state whenever a syntax error
260 # in the source pattern is detected. Barring bugs, the state machine will never
261 # actually get here, but will stop because of the action associated with the error.
262 # But, just in case, this state asks the state machine to exit.
263 errorDeath:
264 default n errorDeath doExit # consume the character and stay here; doExit asks the state machine to exit
265
266