]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | |
2 | #***************************************************************************** | |
3 | # | |
4 | # Copyright (C) 2002-2003, International Business Machines Corporation and others. | |
5 | # All Rights Reserved. | |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: regexcst.txt | |
10 | # ICU Regular Expression Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a regular expression pattern | |
13 | # The pattern parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the regex pattern grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
20 | # that are then built with the rule parser. | |
21 | # | |
22 | ||
23 | # | |
24 | # Here is the syntax of the state definitions in this file: | |
25 | # | |
26 | # | |
27 | #StateName: | |
28 | # input-char n next-state ^push-state action | |
29 | # input-char n next-state ^push-state action | |
30 | # | | | | | | |
31 | # | | | | |--- action to be performed by state machine | |
32 | # | | | | See function RegexCompile::doParseActions() | |
33 | # | | | | | |
34 | # | | | |--- Push this named state onto the state stack. | |
35 | # | | | Later, when next state is specified as "pop", | |
36 | # | | | the pushed state will become the current state. | |
37 | # | | | | |
38 | # | | |--- Transition to this state if the current input character matches the input | |
39 | # | | character or char class in the left hand column. "pop" causes the next | |
40 | # | | state to be popped from the state stack. | |
41 | # | | | |
42 | # | |--- When making the state transition specified on this line, advance to the next | |
43 | # | character from the input only if 'n' appears here. | |
44 | # | | |
45 | # |--- Character or named character classes to test for. If the current character being scanned | |
46 | # matches, perform the actions and go to the state specified on this line. | |
47 | # The input character is tested sequentially, in the order written. The characters and | |
48 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
49 | # | |
50 | ||
51 | ||
52 | ||
53 | ||
54 | # | |
55 | # start state, scan position is at the beginning of the pattern. | |
56 | # | |
57 | start: | |
58 | default term doPatStart | |
59 | ||
60 | ||
61 | ||
62 | ||
63 | # | |
64 | # term. At a position where we can accept the start of most items in a pattern. | |
65 | # | |
66 | term: | |
67 | quoted n expr-quant doLiteralChar | |
68 | rule_char n expr-quant doLiteralChar | |
69 | '[' n expr-quant doScanUnicodeSet | |
70 | '(' n open-paren | |
71 | '.' n expr-quant doDotAny | |
72 | '^' n term doCaret | |
73 | '$' n term doDollar | |
74 | '\' n backslash | |
75 | '|' n term doOrOperator | |
76 | ')' n pop doCloseParen | |
77 | eof term doPatFinish | |
78 | default errorDeath doRuleError | |
79 | ||
80 | ||
81 | ||
82 | # | |
83 | # expr-quant We've just finished scanning a term, now look for the optional | |
84 | # trailing quantifier - *, +, ?, *?, etc. | |
85 | # | |
86 | expr-quant: | |
87 | '*' n quant-star | |
88 | '+' n quant-plus | |
89 | '?' n quant-opt | |
90 | '{' n interval-open doIntervalInit | |
91 | '(' n open-paren-quant | |
92 | default expr-cont | |
93 | ||
94 | ||
95 | # | |
96 | # expr-cont Expression, continuation. At a point where additional terms are | |
97 | # allowed, but not required. No Quantifiers | |
98 | # | |
99 | expr-cont: | |
100 | '|' n term doOrOperator | |
101 | ')' n pop doCloseParen | |
102 | default term | |
103 | ||
104 | ||
105 | # | |
106 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
107 | # e.g. x(?#comment )* | |
108 | # Open parens from expr-quant come here; anything but a (?# comment | |
109 | # branches into the normal parenthesis sequence as quickly as possible. | |
110 | # | |
111 | open-paren-quant: | |
112 | '?' n open-paren-quant2 doSuppressComments | |
113 | default open-paren | |
114 | ||
115 | open-paren-quant2: | |
116 | '#' n paren-comment ^expr-quant | |
117 | default open-paren-extended | |
118 | ||
119 | ||
120 | # | |
121 | # open-paren We've got an open paren. We need to scan further to | |
122 | # determine what kind of group it is - plain (, (?:, (?>, or whatever. | |
123 | # | |
124 | open-paren: | |
125 | '?' n open-paren-extended doSuppressComments | |
126 | default term ^expr-quant doOpenCaptureParen | |
127 | ||
128 | open-paren-extended: | |
129 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
130 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
131 | '=' n term ^expr-cont doOpenLookAhead # (?= | |
132 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! | |
133 | '<' n open-paren-lookbehind | |
134 | '#' n paren-comment ^term | |
135 | 'i' paren-flag doBeginMatchMode | |
136 | 'm' paren-flag doBeginMatchMode | |
137 | 's' paren-flag doBeginMatchMode | |
374ca955 | 138 | 'w' paren-flag doBeginMatchMode |
b75a7d8f A |
139 | 'x' paren-flag doBeginMatchMode |
140 | '-' paren-flag doBeginMatchMode | |
141 | '(' n errorDeath doConditionalExpr | |
142 | '{' n errorDeath doPerlInline | |
143 | default errorDeath doBadOpenParenType | |
144 | ||
145 | open-paren-lookbehind: | |
146 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
147 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
148 | default errorDeath doBadOpenParenType | |
149 | ||
150 | ||
151 | # | |
152 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' | |
153 | # TODO: should parens nest here? Check what perl does. | |
154 | # | |
155 | paren-comment: | |
156 | ')' n pop | |
157 | eof errorDeath doMismatchedParenErr | |
158 | default n paren-comment | |
159 | ||
160 | # | |
161 | # paren-flag Scanned a (?imswx-imswx flag setting | |
162 | # | |
163 | paren-flag: | |
164 | 'i' n paren-flag doMatchMode | |
165 | 'm' n paren-flag doMatchMode | |
166 | 's' n paren-flag doMatchMode | |
374ca955 | 167 | 'w' n paren-flag doMatchMode |
b75a7d8f A |
168 | 'x' n paren-flag doMatchMode |
169 | '-' n paren-flag doMatchMode | |
170 | ')' n term doSetMatchMode | |
171 | ':' n term ^expr-quant doMatchModeParen | |
374ca955 | 172 | default errorDeath doBadModeFlag |
b75a7d8f A |
173 | |
174 | ||
175 | # | |
176 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
177 | # between plain '*', '*?', '*+' | |
178 | # | |
179 | quant-star: | |
180 | '?' n expr-cont doNGStar # *? | |
181 | '+' n expr-cont doPossessiveStar # *+ | |
182 | default expr-cont doStar | |
183 | ||
184 | ||
185 | # | |
186 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
187 | # between plain '+', '+?', '++' | |
188 | # | |
189 | quant-plus: | |
190 | '?' n expr-cont doNGPlus # +? | |
191 | '+' n expr-cont doPossessivePlus # ++ | |
192 | default expr-cont doPlus | |
193 | ||
194 | ||
195 | # | |
196 | # quant-opt Scanning a '?' quantifier. Need to look ahead to decide | |
197 | # between plain '?', '??', '?+' | |
198 | # | |
199 | quant-opt: | |
200 | '?' n expr-cont doNGOpt # ?? | |
201 | '+' n expr-cont doPossessiveOpt # ?+ | |
202 | default expr-cont doOpt # ? | |
203 | ||
204 | ||
205 | # | |
206 | # Interval scanning a '{', the opening delimiter for an interval specification | |
207 | # {number} or {min, max} or {min, } | |
208 | # | |
209 | interval-open: | |
210 | white_space n interval-open # TODO: is white space allowed here in non-free mode? | |
211 | digit_char interval-lower | |
212 | default errorDeath doIntervalError | |
213 | ||
214 | interval-lower: | |
215 | digit_char n interval-lower doIntevalLowerDigit | |
216 | ',' n interval-upper | |
217 | '}' n interval-type doIntervalSame # {n} | |
218 | default errorDeath doIntervalError | |
219 | ||
220 | interval-upper: | |
221 | digit_char n interval-upper doIntervalUpperDigit | |
222 | '}' n interval-type | |
223 | default errorDeath doIntervalError | |
224 | ||
225 | interval-type: | |
226 | '?' n expr-cont doNGInterval # {n,m}? | |
227 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
228 | default expr-cont doInterval # {n,m} | |
229 | ||
230 | ||
231 | # | |
232 | # backslash # Backslash. Figure out which of the \thingies we have encountered. | |
233 | # The low level next-char function will have preprocessed | |
234 | # some of them already; those won't come here. | |
235 | backslash: | |
236 | 'A' n term doBackslashA | |
237 | 'B' n term doBackslashB | |
238 | 'b' n term doBackslashb | |
239 | 'd' n expr-quant doBackslashd | |
240 | 'D' n expr-quant doBackslashD | |
241 | 'G' n term doBackslashG | |
242 | 'N' expr-quant doProperty # \N{NAME} named char | |
243 | 'p' expr-quant doProperty # \p{Lu} style property | |
244 | 'P' expr-quant doProperty | |
245 | 'Q' n term doEnterQuoteMode | |
246 | 'S' n expr-quant doBackslashS | |
247 | 's' n expr-quant doBackslashs | |
248 | 'W' n expr-quant doBackslashW | |
249 | 'w' n expr-quant doBackslashw | |
250 | 'X' n expr-quant doBackslashX | |
251 | 'Z' n term doBackslashZ | |
252 | 'z' n term doBackslashz | |
b75a7d8f A |
253 | digit_char n expr-quant doBackRef # Will scan multiple digits |
254 | eof errorDeath doEscapeError | |
255 | default n expr-quant doLiteralChar # Escaped literal char. | |
256 | ||
257 | ||
258 | # | |
259 | # errorDeath. This state is specified as the next state whenever a syntax error | |
260 | # in the source rules is detected. Barring bugs, the state machine will never | |
261 | # actually get here, but will stop because of the action associated with the error. | |
262 | # But, just in case, this state asks the state machine to exit. | |
263 | errorDeath: | |
264 | default n errorDeath doExit | |
265 | ||
266 |