]>
Commit | Line | Data |
---|---|---|
1 | ||
2 | #***************************************************************************** | |
3 | # | |
4 | # Copyright (C) 2002-2016, International Business Machines Corporation and others. | |
5 | # All Rights Reserved. | |
6 | # | |
7 | #***************************************************************************** | |
8 | # | |
9 | # file: rbbirpt.txt | |
10 | # ICU Break Iterator Rule Parser State Table | |
11 | # | |
12 | # This state table is used when reading and parsing a set of RBBI rules. | |
13 | # The rule parser uses a state machine; the data in this file define the | |
14 | # state transitions that occur for each input character. | |
15 | # | |
16 | # *** This file defines the RBBI rule grammar. This is it. | |
17 | # *** The determination of what is accepted is here. | |
18 | # | |
19 | # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays | |
20 | # that are then built with the rule parser. | |
21 | # | |
22 | # perl rbbicst.pl < rbbirpt.txt > rbbirpt.h | |
23 | ||
24 | # | |
25 | # Here is the syntax of the state definitions in this file: | |
26 | # | |
27 | # | |
28 | #StateName: | |
29 | # input-char n next-state ^push-state action | |
30 | # input-char n next-state ^push-state action | |
31 | # | | | | | | |
32 | # | | | | |--- action to be performed by state machine | |
33 | # | | | | See function RBBIRuleScanner::doParseActions() | |
34 | # | | | | | |
35 | # | | | |--- Push this named state onto the state stack. | |
36 | # | | | Later, when next state is specified as "pop", | |
37 | # | | | the pushed state will become the current state. | |
38 | # | | | | |
39 | # | | |--- Transition to this state if the current input character matches the input | |
40 | # | | character or char class in the left hand column. "pop" causes the next | |
41 | # | | state to be popped from the state stack. | |
42 | # | | | |
43 | # | |--- When making the state transition specified on this line, advance to the next | |
44 | # | character from the input only if 'n' appears here. | |
45 | # | | |
46 | # |--- Character or named character classes to test for. If the current character being scanned | |
47 | # matches, perform the actions and go to the state specified on this line. | |
48 | # The input character is tested sequentially, in the order written. The characters and | |
49 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
50 | # | |
51 | ||
52 | ||
53 | ||
54 | ||
55 | # | |
56 | # start state, scan position is at the beginning of the rules file, or in between two rules. | |
57 | # | |
58 | start: | |
59 | escaped term ^break-rule-end doExprStart # rule begins with an escaped char; not consumed here, term re-reads it | |
60 | white_space n start # skip white space between rules | |
61 | '^' n start-after-caret ^break-rule-end doNoChain # leading '^' inhibits chaining into this rule | |
62 | '$' scan-var-name ^assign-or-rule doExprStart # '$' is left for scan-var-name to consume; assign-or-rule decides afterwards | |
63 | '!' n rev-option # either a !!option or a !reverse rule | |
64 | ';' n start # ignore empty rules. | |
65 | eof exit | |
66 | default term ^break-rule-end doExprStart # anything else is handed to term without being consumed | |
67 | ||
68 | # | |
69 | # break-rule-end: Returned from doing a break-rule expression. | |
70 | # | |
71 | break-rule-end: | |
72 | ';' n start doEndOfRule # rule is terminated; go back and look for the next one | |
73 | white_space n break-rule-end # white space before the ';' is skipped | |
74 | default errorDeath doRuleError # anything other than ';' after a complete rule expression | |
75 | ||
76 | # | |
77 | # start of a rule, after having seen a '^' (inhibits rule chain in). | |
78 | # Similar to the main 'start' state in most respects, except | |
79 | # - empty rule is an error. | |
80 | # - A second '^' is an error. | |
81 | # | |
82 | start-after-caret: | |
83 | escaped term doExprStart # no push needed: the '^' row in 'start' already pushed break-rule-end | |
84 | white_space n start-after-caret | |
85 | '^' errorDeath doRuleError # two '^'s | |
86 | '$' scan-var-name ^term-var-ref doExprStart # returns to term-var-ref, so the variable is treated as a reference only | |
87 | ';' errorDeath doRuleError # ^ ; | |
88 | eof errorDeath doRuleError # input ended right after the '^' | |
89 | default term doExprStart # not consumed here; term decides whether the character is legal | |
90 | ||
91 | # | |
92 | # ! We've just scanned a '!', indicating either a !!key word flag or a | |
93 | # !Reverse rule. | |
94 | # | |
95 | rev-option: | |
96 | '!' n option-scan1 # second '!': a !!keyword follows | |
97 | default reverse-rule ^break-rule-end doReverseDir # single '!': what follows is a reverse rule | |
98 | ||
99 | option-scan1: | |
100 | name_start_char n option-scan2 doOptionStart # first character of the keyword after "!!" | |
101 | default errorDeath doRuleError # "!!" was not followed by a name | |
102 | ||
103 | option-scan2: | |
104 | name_char n option-scan2 # consume the remaining characters of the keyword | |
105 | default option-scan3 doOptionEnd # first non-name character ends the keyword; it is not consumed here | |
106 | ||
107 | option-scan3: | |
108 | ';' n start # option statement is terminated | |
109 | white_space n option-scan3 | |
110 | default errorDeath doRuleError # only white space and ';' may follow the keyword | |
111 | ||
112 | ||
113 | reverse-rule: | |
114 | default term doExprStart # rev-option already pushed break-rule-end on the way here; pushing it again would leave a stale entry on the state stack after the rule's single pop | |
115 | ||
116 | ||
117 | # | |
118 | # term. Eat through a single rule character, or a composite thing, which | |
119 | # could be a parenthesized expression, a variable name, or a Unicode Set. | |
120 | # | |
121 | term: | |
122 | escaped n expr-mod doRuleChar # an escaped character is a single-character term | |
123 | white_space n term # white space before a term is skipped | |
124 | rule_char n expr-mod doRuleChar | |
125 | '[' scan-unicode-set ^expr-mod # '[' is consumed by scan-unicode-set, which pops back to expr-mod | |
126 | '(' n term ^expr-mod doLParen # nested expression; expr-mod is the pushed return state | |
127 | '$' scan-var-name ^term-var-ref # variable reference; term-var-ref runs once the name has been scanned | |
128 | '.' n expr-mod doDotAny | |
129 | default errorDeath doRuleError # no term can begin with this character | |
130 | ||
131 | ||
132 | ||
133 | # | |
134 | # term-var-ref We've just finished scanning a reference to a $variable. | |
135 | # Check that the variable was defined. | |
136 | # The variable name scanning is in common with assignment statements, | |
137 | # so the check can't be done there. | |
138 | term-var-ref: | |
139 | default expr-mod doCheckVarDef # consumes nothing; runs the check, then continues as after any other term | |
140 | ||
141 | ||
142 | # | |
143 | # expr-mod We've just finished scanning a term, now look for the optional | |
144 | # trailing '*', '?', '+' | |
145 | # | |
146 | expr-mod: | |
147 | white_space n expr-mod # white space between a term and its modifier is skipped | |
148 | '*' n expr-cont doUnaryOpStar | |
149 | '+' n expr-cont doUnaryOpPlus | |
150 | '?' n expr-cont doUnaryOpQuestion | |
151 | default expr-cont # no modifier present; the character is left for expr-cont | |
152 | ||
153 | ||
154 | # | |
155 | # expr-cont Expression, continuation. At a point where additional terms are | |
156 | # allowed, but not required. | |
157 | # | |
158 | expr-cont: | |
159 | escaped term doExprCatOperator # another term follows; it is not consumed here, term re-reads it | |
160 | white_space n expr-cont | |
161 | rule_char term doExprCatOperator | |
162 | '[' term doExprCatOperator | |
163 | '(' term doExprCatOperator | |
164 | '$' term doExprCatOperator | |
165 | '.' term doExprCatOperator | |
166 | '/' look-ahead doExprCatOperator # the '/' itself is consumed in look-ahead | |
167 | '{' n tag-open doExprCatOperator # start of a {ddd} tag | |
168 | '|' n term doExprOrOperator | |
169 | ')' n pop doExprRParen # return to the state on top of the state stack | |
170 | default pop doExprFinished # character cannot continue the expression; it is left for the popped state | |
171 | ||
172 | ||
173 | # | |
174 | # look-ahead Scanning a '/', which identifies a break point, assuming that the | |
175 | # remainder of the expression matches. | |
176 | # | |
177 | # Generate a parse tree as if this was a special kind of input symbol | |
178 | # appearing in an otherwise normal concatenation expression. | |
179 | # | |
180 | look-ahead: | |
181 | '/' n expr-cont-no-slash doSlash # consume the '/'; the following state does not list '/' | |
182 | default errorDeath # not expected: the rows in this table that lead here do so only on an unconsumed '/' | |
183 | ||
184 | ||
185 | # | |
186 | # expr-cont-no-slash Expression, continuation. At a point where additional terms are | |
187 | # allowed, but not required. Just like | |
188 | # expr-cont, above, except that no '/' | |
189 | # look-ahead symbol is permitted. | |
190 | # | |
191 | expr-cont-no-slash: | |
192 | escaped term doExprCatOperator # another term follows; it is not consumed here, term re-reads it | |
193 | white_space n expr-cont-no-slash # stay in this state: going to expr-cont would allow a second '/' after white space | |
194 | rule_char term doExprCatOperator | |
195 | '[' term doExprCatOperator | |
196 | '(' term doExprCatOperator | |
197 | '$' term doExprCatOperator | |
198 | '.' term doExprCatOperator | |
199 | '|' n term doExprOrOperator | |
200 | ')' n pop doExprRParen # return to the state on top of the state stack | |
201 | default pop doExprFinished # '/' has no row of its own in this state, so it ends up here | |
202 | ||
203 | ||
204 | # | |
205 | # tags scanning a '{', the opening delimiter for a tag that identifies | |
206 | # the kind of match. Scan the whole {dddd} tag, where d=digit | |
207 | # | |
208 | tag-open: | |
209 | white_space n tag-open # white space after '{' is skipped | |
210 | digit_char tag-value doStartTagValue # first digit is not consumed here; tag-value consumes it | |
211 | default errorDeath doTagExpectedError # '{' was not followed by a digit | |
212 | ||
213 | tag-value: | |
214 | white_space n tag-close # white space ends the run of digits | |
215 | '}' tag-close # '}' is left for tag-close to consume | |
216 | digit_char n tag-value doTagDigit | |
217 | default errorDeath doTagExpectedError | |
218 | ||
219 | tag-close: | |
220 | white_space n tag-close | |
221 | '}' n expr-cont-no-tag doTagValue # tag is complete; the next state has no row for '{' | |
222 | default errorDeath doTagExpectedError # includes digits resuming after white space | |
223 | ||
224 | ||
225 | ||
226 | # | |
227 | # expr-cont-no-tag Expression, continuation. At a point where additional terms are | |
228 | # allowed, but not required. Just like | |
229 | # expr-cont, above, except that no "{ddd}" | |
230 | # tagging is permitted. | |
231 | # | |
232 | expr-cont-no-tag: | |
233 | escaped term doExprCatOperator # another term follows; it is not consumed here, term re-reads it | |
234 | white_space n expr-cont-no-tag # stays in this state, so the no-tag restriction survives white space | |
235 | rule_char term doExprCatOperator | |
236 | '[' term doExprCatOperator | |
237 | '(' term doExprCatOperator | |
238 | '$' term doExprCatOperator | |
239 | '.' term doExprCatOperator | |
240 | '/' look-ahead doExprCatOperator # the '/' itself is consumed in look-ahead | |
241 | '|' n term doExprOrOperator | |
242 | ')' n pop doExprRParen # return to the state on top of the state stack | |
243 | default pop doExprFinished # '{' has no row of its own in this state, so it ends up here | |
244 | ||
245 | ||
246 | ||
247 | ||
248 | # | |
249 | # Variable Name Scanning. | |
250 | # | |
251 | # The state that branched to here must have pushed a return state | |
252 | # to go to after completion of the variable name scanning. | |
253 | # | |
254 | # The current input character must be the $ that introduces the name. | |
255 | # The $ is consumed here rather than in the state that first detected it | |
256 | # so that the doStartVariableName action only needs to happen in one | |
257 | # place (here), and the other states don't need to worry about it. | |
258 | # | |
259 | scan-var-name: | |
260 | '$' n scan-var-start doStartVariableName # consume the '$' that introduces the name | |
261 | default errorDeath # not expected: the rows in this table that lead here do so only on an unconsumed '$' | |
262 | ||
263 | ||
264 | scan-var-start: | |
265 | name_start_char n scan-var-body # first character of the variable name | |
266 | default errorDeath doVariableNameExpectedErr # '$' was not followed by a name | |
267 | ||
268 | scan-var-body: | |
269 | name_char n scan-var-body # consume the remaining characters of the name | |
270 | default pop doEndVariableName # first non-name character ends the name; return to the pushed state without consuming it | |
271 | ||
272 | ||
273 | ||
274 | # | |
275 | # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class. | |
276 | # Within the RBBI parser, after finding the first character | |
277 | # of a Unicode Set, we just hand the rule input at that | |
278 | # point off to the Unicode Set constructor, then pick | |
279 | # up parsing after the close of the set. | |
280 | # | |
281 | # The action for this state invokes the UnicodeSet parser. | |
282 | # | |
283 | scan-unicode-set: | |
284 | '[' n pop doScanUnicodeSet # return to the pushed state once the action has run | |
285 | 'p' n pop doScanUnicodeSet # NOTE(review): in this table the state is only entered from term on '['; confirm whether the 'p' and 'P' rows are still reachable | |
286 | 'P' n pop doScanUnicodeSet | |
287 | default errorDeath | |
288 | ||
289 | ||
290 | ||
291 | ||
292 | ||
293 | ||
294 | ||
295 | # | |
296 | # assign-or-rule. A $variable was encountered at the start of something, could be | |
297 | # either an assignment statement or a rule, depending on whether an '=' | |
298 | # follows the variable name. We get to this state when the variable name | |
299 | # scanning does a return. | |
300 | # | |
301 | assign-or-rule: | |
302 | white_space n assign-or-rule # white space between the variable name and a possible '=' is skipped | |
303 | '=' n term ^assign-end doStartAssign # variable was target of assignment | |
304 | default term-var-ref ^break-rule-end # variable was a term in a rule | |
305 | ||
306 | ||
307 | ||
308 | # | |
309 | # assign-end This state is entered when the end of the expression on the | |
310 | # right hand side of an assignment is found. We get here via | |
311 | # a pop; this state is pushed when the '=' in an assignment is found. | |
312 | # | |
313 | # The only thing allowed at this point is a ';'. The RHS of an | |
314 | # assignment must look like a rule expression, and we come here | |
315 | # when what is being scanned no longer looks like an expression. | |
316 | # | |
317 | assign-end: | |
318 | ';' n start doEndAssign # assignment is terminated; go back and look for the next statement | |
319 | default errorDeath doRuleErrorAssignExpr # no white-space row here: anything other than ';' is an error | |
320 | ||
321 | ||
322 | ||
323 | # | |
324 | # errorDeath. This state is specified as the next state whenever a syntax error | |
325 | # in the source rules is detected. Barring bugs, the state machine will never | |
326 | # actually get here, but will stop because of the action associated with the error. | |
327 | # But, just in case, this state asks the state machine to exit. | |
328 | errorDeath: | |
329 | default n errorDeath doExit # safety net: matches any character and asks the state machine to exit | |
330 | ||
331 |