]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | |
2 | #***************************************************************************** | |
3 | # | |
374ca955 | 4 | # Copyright (C) 2002-2003, International Business Machines Corporation and others. |
b75a7d8f A |
5 | # All Rights Reserved. |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: rbbirpt.txt | |
10 | # ICU Break Iterator Rule Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a set of RBBI rules | |
13 | # The rule parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the RBBI rule grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays | |
20 | # that are then built with the rule parser. | |
21 | # | |
22 | ||
23 | # | |
24 | # Here is the syntax of the state definitions in this file: | |
25 | # | |
26 | # | |
27 | #StateName: | |
28 | # input-char n next-state ^push-state action | |
29 | # input-char n next-state ^push-state action | |
30 | # | | | | | | |
31 | # | | | | |--- action to be performed by state machine | |
32 | # | | | | See function RBBIRuleScanner::doParseActions() | |
33 | # | | | | | |
34 | # | | | |--- Push this named state onto the state stack. | |
35 | # | | | Later, when next state is specified as "pop", | |
36 | # | | | the pushed state will become the current state. | |
37 | # | | | | |
38 | # | | |--- Transition to this state if the current input character matches the input | |
39 | # | | character or char class in the left hand column. "pop" causes the next | |
40 | # | | state to be popped from the state stack. | |
41 | # | | | |
42 | # | |--- When making the state transition specified on this line, advance to the next | |
43 | # | character from the input only if 'n' appears here. | |
44 | # | | |
45 | # |--- Character or named character classes to test for. If the current character being scanned | |
46 | # matches, perform the actions and go to the state specified on this line. | |
47 | # The input character is tested sequentially, in the order written. The characters and | |
48 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
49 | # | |
50 | ||
51 | ||
52 | ||
53 | ||
54 | # | |
55 | # start state, scan position is at the beginning of the rules file, or in between two rules. | |
56 | # | |
# Whitespace and empty-rule ';' are consumed and the machine stays in start.
# '!' is consumed and handed to rev-option; eof transitions to exit.
# '$' is left unconsumed for scan-var-name, with assign-or-rule pushed as the return state.
# An escaped character, or anything else, begins a rule: the character is left unconsumed,
# doExprStart runs, and term takes over with break-rule-end pushed as the return state.
57 | start: | |
58 | escaped term ^break-rule-end doExprStart | |
59 | white_space n start | |
60 | '$' scan-var-name ^assign-or-rule doExprStart | |
374ca955 | 61 | '!' n rev-option |
b75a7d8f A |
62 | ';' n start # ignore empty rules. |
63 | eof exit | |
64 | default term ^break-rule-end doExprStart | |
65 | ||
66 | # | |
67 | # break-rule-end: Returned from doing a break-rule expression. | |
68 | # | |
# Only whitespace (skipped) or a terminating ';' may appear here. The ';' is consumed,
# doEndOfRule runs, and scanning resumes in start. Any other character runs doRuleError
# and goes to errorDeath.
69 | break-rule-end: | |
70 | ';' n start doEndOfRule | |
71 | white_space n break-rule-end | |
72 | default errorDeath doRuleError | |
73 | ||
74 | ||
75 | # | |
374ca955 A |
76 | # ! We've just scanned a '!', indicating either a !!key word flag or a |
77 | # !Reverse rule. | |
b75a7d8f | 78 | # |
374ca955 A |
# A second '!' is consumed and leads to option-scan1 (the "!!" form).
# Any other character is left unconsumed: doReverseDir runs, break-rule-end is pushed,
# and reverse-rule takes over.
79 | rev-option: |
80 | '!' n option-scan1 | |
81 | default reverse-rule ^break-rule-end doReverseDir | |
82 | ||
# First character after "!!": must be a name_start_char, which is consumed with
# doOptionStart. Anything else runs doRuleError and goes to errorDeath.
83 | option-scan1: | |
84 | name_start_char n option-scan2 doOptionStart | |
85 | default errorDeath doRuleError | |
86 | ||
# Consumes the remaining name_char characters of the option name. The first character
# that is not a name_char is left unconsumed, doOptionEnd runs, and option-scan3 follows.
87 | option-scan2: | |
88 | name_char n option-scan2 | |
89 | default option-scan3 doOptionEnd | |
90 | ||
# After the option name: whitespace is skipped, a ';' is consumed and returns to start.
# Any other character runs doRuleError and goes to errorDeath.
91 | option-scan3: | |
92 | ';' n start | |
93 | white_space n option-scan3 | |
94 | default errorDeath doRuleError | |
95 | ||
96 | ||
b75a7d8f A |
# Unconditionally starts a rule expression: nothing is consumed, doExprStart runs,
# break-rule-end is pushed, and term takes over.
# NOTE(review): the rev-option default row, which is the only visible way into this state,
# also pushes break-rule-end, so two copies appear to be pushed for a reverse rule
# while only one is popped when the expression ends. Confirm this is intended.
97 | reverse-rule: |
98 | default term ^break-rule-end doExprStart | |
99 | ||
100 | ||
101 | # | |
102 | # term. Eat through a single rule character, or a composite thing, which | |
103 | # could be a parenthesized expression, a variable name, or a Unicode Set. | |
104 | # | |
# Leading whitespace is skipped.
# Single-character terms (escaped, rule_char, '.') are consumed and go straight to expr-mod.
# '[' and '$' are left unconsumed for scan-unicode-set / scan-var-name, with the state to
# return to (expr-mod / term-var-ref) pushed first.
# '(' is consumed, doLParen runs, expr-mod is pushed, and term is re-entered for the
# parenthesized expression.
# Anything else runs doRuleError and goes to errorDeath.
105 | term: | |
106 | escaped n expr-mod doRuleChar | |
107 | white_space n term | |
108 | rule_char n expr-mod doRuleChar | |
109 | '[' scan-unicode-set ^expr-mod | |
110 | '(' n term ^expr-mod doLParen | |
111 | '$' scan-var-name ^term-var-ref | |
112 | '.' n expr-mod doDotAny | |
113 | default errorDeath doRuleError | |
114 | ||
115 | ||
116 | ||
117 | # | |
118 | # term-var-ref We've just finished scanning a reference to a $variable. | |
119 | # Check that the variable was defined. | |
120 | # The variable name scanning is in common with assignment statements, | |
121 | # so the check can't be done there. | |
# Single unconditional row: runs doCheckVarDef and continues in expr-mod without
# consuming any input.
122 | term-var-ref: | |
123 | default expr-mod doCheckVarDef | |
124 | ||
125 | ||
126 | # | |
127 | # expr-mod We've just finished scanning a term, now look for the optional | |
128 | # trailing '*', '?', '+' | |
129 | # | |
# Whitespace is skipped. A '*', '+' or '?' is consumed and its unary-operator action runs.
# Any other character is left unconsumed with no action. Every non-whitespace path
# continues in expr-cont.
130 | expr-mod: | |
131 | white_space n expr-mod | |
132 | '*' n expr-cont doUnaryOpStar | |
133 | '+' n expr-cont doUnaryOpPlus | |
134 | '?' n expr-cont doUnaryOpQuestion | |
135 | default expr-cont | |
136 | ||
137 | ||
138 | # | |
139 | # expr-cont Expression, continuation. At a point where additional terms are | |
140 | # allowed, but not required. | |
141 | # | |
# Whitespace is skipped.
# A character that can begin another term (escaped, rule_char, '[', '(', '$', '.') runs
# doExprCatOperator and is left unconsumed so that term re-scans it.
# '/' also runs doExprCatOperator and is left unconsumed for look-ahead.
# '{' is consumed, runs doExprCatOperator, and starts tag scanning in tag-open.
# '|' is consumed, runs doExprOrOperator, and returns to term.
# ')' is consumed, runs doExprRParen, and pops the pushed return state.
# Anything else ends the expression: doExprFinished runs and the return state is popped
# without consuming the character.
142 | expr-cont: | |
143 | escaped term doExprCatOperator | |
144 | white_space n expr-cont | |
145 | rule_char term doExprCatOperator | |
146 | '[' term doExprCatOperator | |
147 | '(' term doExprCatOperator | |
148 | '$' term doExprCatOperator | |
149 | '.' term doExprCatOperator | |
150 | '/' look-ahead doExprCatOperator | |
151 | '{' n tag-open doExprCatOperator | |
152 | '|' n term doExprOrOperator | |
153 | ')' n pop doExprRParen | |
154 | default pop doExprFinished | |
155 | ||
156 | ||
157 | # | |
158 | # look-ahead Scanning a '/', which identifies a break point, assuming that the | |
159 | # remainder of the expression matches. | |
160 | # | |
161 | # Generate a parse tree as if this was a special kind of input symbol | |
162 | # appearing in an otherwise normal concatenation expression. | |
163 | # | |
# The '/' that was left unconsumed by the previous state is consumed here, doSlash runs,
# and scanning continues in expr-cont-no-slash.
# The default row goes to errorDeath and names no action.
164 | look-ahead: | |
165 | '/' n expr-cont-no-slash doSlash | |
166 | default errorDeath | |
167 | ||
168 | ||
169 | # | |
170 | # expr-cont-no-slash Expression, continuation. At a point where additional terms are | |
171 | # allowed, but not required. Just like | |
172 | # expr-cont, above, except that no '/' | |
173 | # look-ahead symbol is permitted. | |
174 | # | |
# Entered right after a '/' has been consumed by look-ahead.
# Whitespace is skipped and the machine stays in THIS state. Looping to expr-cont instead
# would let a single blank after the '/' re-enable the '/' and '{' rows, which this
# state exists to exclude, and would make acceptance depend on whitespace.
# A character that can begin another term (escaped, rule_char, '[', '(', '$', '.') runs
# doExprCatOperator and is left unconsumed so that term re-scans it.
# '|' is consumed, runs doExprOrOperator, and returns to term.
# ')' is consumed, runs doExprRParen, and pops the pushed return state.
# Anything else, including '/' and '{', ends the expression: doExprFinished runs and the
# return state is popped without consuming the character.
175 | expr-cont-no-slash: | |
176 | escaped term doExprCatOperator | |
177 | white_space n expr-cont-no-slash | |
178 | rule_char term doExprCatOperator | |
179 | '[' term doExprCatOperator | |
180 | '(' term doExprCatOperator | |
181 | '$' term doExprCatOperator | |
182 | '.' term doExprCatOperator | |
183 | '|' n term doExprOrOperator | |
184 | ')' n pop doExprRParen | |
185 | default pop doExprFinished | |
186 | ||
187 | ||
188 | # | |
189 | # tags scanning a '{', the opening delimiter for a tag that identifies | |
190 | # the kind of match. Scan the whole {dddd} tag, where d=digit | |
191 | # | |
# Whitespace after the '{' is skipped. The first digit is left unconsumed: doStartTagValue
# runs and tag-value takes over. Anything else runs doTagExpectedError and goes to errorDeath.
192 | tag-open: | |
193 | white_space n tag-open | |
194 | digit_char tag-value doStartTagValue | |
195 | default errorDeath doTagExpectedError | |
196 | ||
# Each digit is consumed with doTagDigit. Whitespace (consumed) or '}' (left unconsumed)
# ends the digit run and hands off to tag-close. Anything else runs doTagExpectedError
# and goes to errorDeath.
197 | tag-value: | |
198 | white_space n tag-close | |
199 | '}' tag-close | |
200 | digit_char n tag-value doTagDigit | |
201 | default errorDeath doTagExpectedError | |
202 | ||
# Whitespace before the closing brace is skipped. The '}' is consumed, doTagValue runs,
# and scanning continues in expr-cont-no-tag. Anything else runs doTagExpectedError and
# goes to errorDeath.
203 | tag-close: | |
204 | white_space n tag-close | |
205 | '}' n expr-cont-no-tag doTagValue | |
206 | default errorDeath doTagExpectedError | |
207 | ||
208 | ||
209 | ||
210 | # | |
211 | # expr-cont-no-tag Expression, continuation. At a point where additional terms are | |
212 | # allowed, but not required. Just like | |
213 | # expr-cont, above, except that no "{ddd}" | |
214 | # tagging is permitted. | |
215 | # | |
# Entered after a complete {ddd} tag. Whitespace is skipped and the machine stays in this state.
# A character that can begin another term (escaped, rule_char, '[', '(', '$', '.') runs
# doExprCatOperator and is left unconsumed so that term re-scans it.
# '/' runs doExprCatOperator and is left unconsumed for look-ahead.
# '|' is consumed, runs doExprOrOperator, and returns to term.
# ')' is consumed, runs doExprRParen, and pops the pushed return state.
# Anything else, including another '{', ends the expression: doExprFinished runs and the
# return state is popped without consuming the character.
216 | expr-cont-no-tag: | |
217 | escaped term doExprCatOperator | |
218 | white_space n expr-cont-no-tag | |
219 | rule_char term doExprCatOperator | |
220 | '[' term doExprCatOperator | |
221 | '(' term doExprCatOperator | |
222 | '$' term doExprCatOperator | |
223 | '.' term doExprCatOperator | |
224 | '/' look-ahead doExprCatOperator | |
225 | '|' n term doExprOrOperator | |
226 | ')' n pop doExprRParen | |
227 | default pop doExprFinished | |
228 | ||
229 | ||
230 | ||
231 | ||
232 | # | |
233 | # Variable Name Scanning. | |
234 | # | |
235 | # The state that branched to here must have pushed a return state | |
236 | # to go to after completion of the variable name scanning. | |
237 | # | |
238 | # The current input character must be the $ that introduces the name. | |
239 | # The $ is consumed here rather than in the state that first detected it | |
240 | # so that the doStartVariableName action only needs to happen in one | |
241 | # place (here), and the other states don't need to worry about it. | |
242 | # | |
# The introducing '$' is consumed, doStartVariableName runs, and scan-var-start follows.
# The default row goes to errorDeath and names no action.
243 | scan-var-name: | |
244 | '$' n scan-var-start doStartVariableName | |
245 | default errorDeath | |
246 | ||
247 | ||
# The character after '$' must be a name_start_char, which is consumed. Anything else
# runs doVariableNameExpectedErr and goes to errorDeath.
248 | scan-var-start: | |
249 | name_start_char n scan-var-body | |
250 | default errorDeath doVariableNameExpectedErr | |
251 | ||
# Consumes the remaining name_char characters of the variable name. The first character
# that is not a name_char is left unconsumed, doEndVariableName runs, and the state
# pushed by the caller is popped.
252 | scan-var-body: | |
253 | name_char n scan-var-body | |
254 | default pop doEndVariableName | |
255 | ||
256 | ||
257 | ||
258 | # | |
259 | # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class. | |
260 | # Within the RBBI parser, after finding the first character | |
261 | # of a Unicode Set, we just hand the rule input at that | |
262 | # point off to the Unicode Set constructor, then pick | |
263 | # up parsing after the close of the set. | |
264 | # | |
265 | # The action for this state invokes the UnicodeSet parser. | |
266 | # | |
# '[', 'p' and 'P' are each marked 'n', run doScanUnicodeSet, and pop the state pushed
# by the caller. The default row goes to errorDeath and names no action.
# NOTE(review): among the states visible in this file only term branches here, and only
# on '['; the 'p' / 'P' rows look unreachable from those states. Confirm before relying
# on them.
267 | scan-unicode-set: | |
268 | '[' n pop doScanUnicodeSet | |
269 | 'p' n pop doScanUnicodeSet | |
270 | 'P' n pop doScanUnicodeSet | |
271 | default errorDeath | |
272 | ||
273 | ||
274 | ||
275 | ||
276 | ||
277 | ||
278 | ||
279 | # | |
280 | # assign-or-rule. A $variable was encountered at the start of something, could be | |
281 | # either an assignment statement or a rule, depending on whether an '=' | |
282 | # follows the variable name. We get to this state when the variable name | |
283 | # scanning does a return. | |
284 | # | |
# Whitespace after the variable name is skipped.
# '=' is consumed, doStartAssign runs, assign-end is pushed, and term scans the right-hand side.
# Any other character is left unconsumed: break-rule-end is pushed and term-var-ref
# treats the variable as the first term of a rule.
285 | assign-or-rule: | |
286 | white_space n assign-or-rule | |
287 | '=' n term ^assign-end doStartAssign # variable was target of assignment | |
288 | default term-var-ref ^break-rule-end # variable was a term in a rule | |
289 | ||
290 | ||
291 | ||
292 | # | |
293 | # assign-end This state is entered when the end of the expression on the | |
294 | # right hand side of an assignment is found. We get here via | |
295 | # a pop; this state is pushed when the '=' in an assignment is found. | |
296 | # | |
297 | # The only thing allowed at this point is a ';'. The RHS of an | |
298 | # assignment must look like a rule expression, and we come here | |
299 | # when what is being scanned no longer looks like an expression. | |
300 | # | |
# A ';' is consumed, doEndAssign runs, and scanning resumes in start. Any other character
# runs doRuleErrorAssignExpr and goes to errorDeath. There is no whitespace row in this state.
301 | assign-end: | |
302 | ';' n start doEndAssign | |
303 | default errorDeath doRuleErrorAssignExpr | |
304 | ||
305 | ||
306 | ||
307 | # | |
308 | # errorDeath. This state is specified as the next state whenever a syntax error | |
309 | # in the source rules is detected. Barring bugs, the state machine will never | |
310 | # actually get here, but will stop because of the action associated with the error. | |
311 | # But, just in case, this state asks the state machine to exit. | |
# Single unconditional row: advances the input, runs doExit, and stays in errorDeath.
312 | errorDeath: | |
313 | default n errorDeath doExit | |
314 | ||
315 |