]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | |
2 | #***************************************************************************** | |
3 | # | |
4 | # Copyright (C) 2002-2003, International Business Machines Corporation and others. | |
5 | # All Rights Reserved. | |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: regexcst.txt | |
10 | # ICU Regular Expression Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a regular expression pattern | |
13 | # The pattern parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the regex pattern grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "regexcst.pl" to produce initialized C arrays | |
20 | # that are then built with the pattern parser. | |
21 | # | |
22 | ||
23 | # | |
24 | # Here is the syntax of the state definitions in this file: | |
25 | # | |
26 | # | |
27 | #StateName: | |
28 | # input-char n next-state ^push-state action | |
29 | # input-char n next-state ^push-state action | |
30 | # | | | | | | |
31 | # | | | | |--- action to be performed by state machine | |
32 | # | | | | See function RegexCompile::doParseActions() | |
33 | # | | | | | |
34 | # | | | |--- Push this named state onto the state stack. | |
35 | # | | | Later, when next state is specified as "pop", | |
36 | # | | | the pushed state will become the current state. | |
37 | # | | | | |
38 | # | | |--- Transition to this state if the current input character matches the input | |
39 | # | | character or char class in the left hand column. "pop" causes the next | |
40 | # | | state to be popped from the state stack. | |
41 | # | | | |
42 | # | |--- When making the state transition specified on this line, advance to the next | |
43 | # | character from the input only if 'n' appears here. | |
44 | # | | |
45 | # |--- Character or named character classes to test for. If the current character being scanned | |
46 | # matches, perform the actions and go to the state specified on this line. | |
47 | # The input character is tested sequentially, in the order written. The characters and | |
48 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
49 | # | |
50 | ||
51 | ||
52 | ||
53 | ||
54 | # | |
55 | # start Initial state. The scan position is at the beginning of the pattern. | |
56 | # | |
57 | start: | |
58 | default term doPatStart # Always taken; no input character is consumed. | |
59 | ||
60 | ||
61 | ||
62 | ||
63 | # | |
64 | # term. At a position where we can accept the start of most items in a pattern. | |
65 | # | |
66 | term: | |
67 | quoted n expr-quant doLiteralChar | |
68 | rule_char n expr-quant doLiteralChar | |
69 | '[' n expr-quant doScanUnicodeSet | |
70 | '(' n open-paren | |
71 | '.' n expr-quant doDotAny | |
72 | '^' n term doCaret # Stays in term: no quantifier is looked for | |
73 | '$' n term doDollar # Stays in term: no quantifier is looked for | |
74 | '\' n backslash | |
75 | '|' n term doOrOperator | |
76 | ')' n pop doCloseParen # Next state comes from the state stack | |
77 | eof term doPatFinish # End of pattern; the input is not advanced | |
78 | default errorDeath doRuleError | |
79 | ||
80 | ||
81 | ||
82 | # | |
83 | # expr-quant We've just finished scanning a term, now look for the optional | |
84 | # trailing quantifier - *, +, ?, *?, etc. | |
85 | # | |
86 | expr-quant: | |
87 | '*' n quant-star # *, *? or *+ | |
88 | '+' n quant-plus # +, +? or ++ | |
89 | '?' n quant-opt # ?, ?? or ?+ | |
90 | '{' n interval-open doIntervalInit # {n}, {n,} or {n,m} | |
91 | '(' n open-paren-quant # Possibly a (?# comment in front of a quantifier | |
92 | default expr-cont # No quantifier here; no input is consumed | |
93 | ||
94 | ||
95 | # | |
96 | # expr-cont Expression, continuation. At a point where additional terms are | |
97 | # allowed, but not required. No quantifiers are tested for here. | |
98 | # | |
99 | expr-cont: | |
100 | '|' n term doOrOperator | |
101 | ')' n pop doCloseParen # Next state comes from the state stack | |
102 | default term # Anything else is handed to term; no input is consumed | |
103 | ||
104 | ||
105 | # | |
106 | # open-paren-quant Special case handling for comments appearing before a quantifier, | |
107 | # e.g. x(?#comment )* | |
108 | # Open parens from expr-quant come here; anything but a (?# comment | |
109 | # branches into the normal parenthesis sequence as quickly as possible. | |
110 | # | |
111 | open-paren-quant: | |
112 | '?' n open-paren-quant2 doSuppressComments # (? - one more character is needed to decide | |
113 | default open-paren # Not (? - rejoin the normal open-paren handling without consuming input | |
114 | ||
115 | open-paren-quant2: | |
116 | '#' n paren-comment ^expr-quant # (?# comment; expr-quant is pushed so the quantifier search resumes after it | |
117 | default open-paren-extended # Any other (? form: normal extended-paren handling | |
118 | ||
119 | ||
120 | # | |
121 | # open-paren We've got an open paren. We need to scan further to | |
122 | # determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever. | |
123 | # | |
124 | open-paren: | |
125 | '?' n open-paren-extended doSuppressComments | |
126 | default term ^expr-quant doOpenCaptureParen # plain ( | |
127 | ||
128 | open-paren-extended: | |
129 | ':' n term ^expr-quant doOpenNonCaptureParen # (?: | |
130 | '>' n term ^expr-quant doOpenAtomicParen # (?> | |
131 | '=' n term ^expr-cont doOpenLookAhead # (?= pushes expr-cont rather than expr-quant | |
132 | '!' n term ^expr-cont doOpenLookAheadNeg # (?! pushes expr-cont rather than expr-quant | |
133 | '<' n open-paren-lookbehind # (?< | |
134 | '#' n paren-comment ^term # (?# | |
135 | 'i' paren-flag doBeginMatchMode # (?i The flag character is not consumed here; paren-flag tests it again | |
136 | 'm' paren-flag doBeginMatchMode # (?m | |
137 | 's' paren-flag doBeginMatchMode # (?s | |
138 | 'x' paren-flag doBeginMatchMode # (?x | |
139 | '-' paren-flag doBeginMatchMode # (?- | |
140 | '(' n errorDeath doConditionalExpr # (?( goes to errorDeath | |
141 | '{' n errorDeath doPerlInline # (?{ goes to errorDeath | |
142 | default errorDeath doBadOpenParenType | |
143 | ||
144 | open-paren-lookbehind: | |
145 | '=' n term ^expr-cont doOpenLookBehind # (?<= | |
146 | '!' n term ^expr-cont doOpenLookBehindNeg # (?<! | |
147 | default errorDeath doBadOpenParenType # (?< followed by anything else | |
148 | ||
149 | ||
150 | # | |
151 | # paren-comment We've got a (?# ... ) style comment. Eat pattern text until we get to the ')'. | |
152 | # TODO: should parens nest here? Check what perl does. | |
153 | # | |
154 | paren-comment: | |
155 | ')' n pop # End of comment; next state comes from the state stack | |
156 | eof errorDeath doMismatchedParenErr # Pattern ended inside the comment | |
157 | default n paren-comment # Skip one character of comment text | |
158 | ||
159 | # | |
160 | # paren-flag Scanned a (?ismx-ismx flag setting | |
161 | # | |
162 | paren-flag: | |
163 | 'i' n paren-flag doMatchMode | |
164 | 'm' n paren-flag doMatchMode | |
165 | 's' n paren-flag doMatchMode | |
166 | 'x' n paren-flag doMatchMode | |
167 | '-' n paren-flag doMatchMode | |
168 | ')' n term doSetMatchMode # (?ismx-ismx) | |
169 | ':' n term ^expr-quant doMatchModeParen # (?ismx-ismx: | |
170 | default errorDeath # NOTE(review): the only error transition in this file that names no action - confirm this is intended | |
171 | ||
172 | ||
173 | # | |
174 | # quant-star Scanning a '*' quantifier. Need to look ahead to decide | |
175 | # between plain '*', '*?', '*+' | |
176 | # | |
177 | quant-star: | |
178 | '?' n expr-cont doNGStar # *? | |
179 | '+' n expr-cont doPossessiveStar # *+ | |
180 | default expr-cont doStar # * The lookahead character is not consumed | |
181 | ||
182 | ||
183 | # | |
184 | # quant-plus Scanning a '+' quantifier. Need to look ahead to decide | |
185 | # between plain '+', '+?', '++' | |
186 | # | |
187 | quant-plus: | |
188 | '?' n expr-cont doNGPlus # +? | |
189 | '+' n expr-cont doPossessivePlus # ++ | |
190 | default expr-cont doPlus # + The lookahead character is not consumed | |
191 | ||
192 | ||
193 | # | |
194 | # quant-opt Scanning a '?' quantifier. Need to look ahead one character to decide | |
195 | # between plain '?', '??', '?+' | |
196 | # | |
197 | quant-opt: | |
198 | '?' n expr-cont doNGOpt # ?? | |
199 | '+' n expr-cont doPossessiveOpt # ?+ | |
200 | default expr-cont doOpt # ? The lookahead character is not consumed | |
201 | ||
202 | ||
203 | # | |
204 | # interval-open Scanned a '{', the opening delimiter for an interval specification | |
205 | # {number} or {min, max} or {min, } | |
206 | # | |
207 | interval-open: | |
208 | white_space n interval-open # TODO: is white space allowed here in non-free mode? | |
209 | digit_char interval-lower # The first digit is not consumed here; interval-lower tests it again | |
210 | default errorDeath doIntervalError | |
211 | ||
212 | interval-lower: | |
213 | digit_char n interval-lower doIntevalLowerDigit # NOTE(review): action name is spelled "Inteval"; presumably it must match the parser's action name, so it is left as is | |
214 | ',' n interval-upper # {n, | |
215 | '}' n interval-type doIntervalSame # {n} | |
216 | default errorDeath doIntervalError | |
217 | ||
218 | interval-upper: | |
219 | digit_char n interval-upper doIntervalUpperDigit | |
220 | '}' n interval-type # {n,} or {n,m} | |
221 | default errorDeath doIntervalError | |
222 | ||
223 | interval-type: | |
224 | '?' n expr-cont doNGInterval # {n,m}? | |
225 | '+' n expr-cont doPossessiveInterval # {n,m}+ | |
226 | default expr-cont doInterval # {n,m} | |
227 | ||
228 | ||
229 | # | |
230 | # backslash # Backslash. Figure out which of the \thingies we have encountered. | |
231 | # The low level next-char function will have preprocessed | |
232 | # some of them already; those won't come here. | |
233 | backslash: | |
234 | 'A' n term doBackslashA | |
235 | 'B' n term doBackslashB | |
236 | 'b' n term doBackslashb | |
237 | 'd' n expr-quant doBackslashd | |
238 | 'D' n expr-quant doBackslashD | |
239 | 'G' n term doBackslashG | |
240 | 'N' expr-quant doProperty # \N{NAME} named char | |
241 | 'p' expr-quant doProperty # \p{Lu} style property | |
242 | 'P' expr-quant doProperty # Like 'N' and 'p' above, the letter is not consumed by this transition | |
243 | 'Q' n term doEnterQuoteMode | |
244 | 'S' n expr-quant doBackslashS | |
245 | 's' n expr-quant doBackslashs | |
246 | 'W' n expr-quant doBackslashW | |
247 | 'w' n expr-quant doBackslashw | |
248 | 'X' n expr-quant doBackslashX | |
249 | 'Z' n term doBackslashZ | |
250 | 'z' n term doBackslashz | |
251 | '0' n expr-quant doOctal # Tested before digit_char; the first matching line wins | |
252 | digit_char n expr-quant doBackRef # Will scan multiple digits | |
253 | eof errorDeath doEscapeError # Pattern ended right after the backslash | |
254 | default n expr-quant doLiteralChar # Escaped literal char. | |
255 | ||
256 | ||
257 | # | |
258 | # errorDeath. This state is specified as the next state whenever a syntax error | |
259 | # in the source pattern is detected. Barring bugs, the state machine will never | |
260 | # actually get here, but will stop because of the action associated with the error. | |
261 | # But, just in case, this state asks the state machine to exit. | |
262 | errorDeath: | |
263 | default n errorDeath doExit | |
264 | ||
265 |