]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | |
2 | #***************************************************************************** | |
f3c0d7a5 A |
3 | # |
4 | # Copyright (C) 2016 and later: Unicode, Inc. and others. | |
5 | # License & terms of use: http://www.unicode.org/copyright.html#License | |
6 | # | |
7 | #***************************************************************************** | |
8 | #***************************************************************************** | |
b75a7d8f | 9 | # |
2ca993e8 | 10 | # Copyright (C) 2002-2016, International Business Machines Corporation and others. |
b75a7d8f A |
11 | # All Rights Reserved. |
12 | # | |
13 | #***************************************************************************** | |
14 | # | |
15 | # file: rbbirpt.txt | |
16 | # ICU Break Iterator Rule Parser State Table | |
17 | # | |
18 | # This state table is used when reading and parsing a set of RBBI rules | |
19 | # The rule parser uses a state machine; the data in this file define the | |
20 | # state transitions that occur for each input character. | |
21 | # | |
22 | # *** This file defines the RBBI rule grammar. This is it. | |
23 | # *** The determination of what is accepted is here. | |
24 | # | |
25 | # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays | |
26 | # that are then built with the rule parser. | |
27 | # | |
2ca993e8 | 28 | # perl rbbicst.pl < rbbirpt.txt > rbbirpt.h |
b75a7d8f A |
29 | |
30 | # | |
31 | # Here is the syntax of the state definitions in this file: | |
32 | # | |
33 | # | |
34 | #StateName: | |
35 | # input-char n next-state ^push-state action | |
36 | # input-char n next-state ^push-state action | |
37 | # | | | | | | |
38 | # | | | | |--- action to be performed by state machine | |
39 | # | | | | See function RBBIRuleScanner::doParseActions() | |
40 | # | | | | | |
41 | # | | | |--- Push this named state onto the state stack. | |
42 | # | | | Later, when next state is specified as "pop", | |
43 | # | | | the pushed state will become the current state. | |
44 | # | | | | |
45 | # | | |--- Transition to this state if the current input character matches the input | |
46 | # | | character or char class in the left hand column. "pop" causes the next | |
47 | # | | state to be popped from the state stack. | |
48 | # | | | |
49 | # | |--- When making the state transition specified on this line, advance to the next | |
50 | # | character from the input only if 'n' appears here. | |
51 | # | | |
52 | # |--- Character or named character classes to test for. If the current character being scanned | |
53 | # matches, perform the actions and go to the state specified on this line. | |
54 | # The input character is tested sequentially, in the order written. The characters and | |
55 | # character classes tested for do not need to be mutually exclusive. The first match wins. | |
56 | # | |
57 | ||
58 | ||
59 | ||
60 | ||
61 | # | |
62 | # start state, scan position is at the beginning of the rules file, or in between two rules. | |
63 | # | |
64 | start: | |
65 | escaped term ^break-rule-end doExprStart # rule starts with an escaped char; left for 'term' to consume | |
66 | white_space n start # skip white space between rules | |
2ca993e8 | 67 | '^' n start-after-caret ^break-rule-end doNoChain # leading '^' inhibits chaining into this rule |
b75a7d8f | 68 | '$' scan-var-name ^assign-or-rule doExprStart # '$' is consumed by scan-var-name, which pops to assign-or-rule |
374ca955 | 69 | '!' n rev-option # start of a !!option or a !reverse rule |
b75a7d8f A |
70 | ';' n start # ignore empty rules. |
71 | eof exit # no more input | |
72 | default term ^break-rule-end doExprStart # anything else begins a rule expression | |
73 | ||
74 | # | |
75 | # break-rule-end: Returned from doing a break-rule expression. | |
76 | # | |
77 | break-rule-end: | |
78 | ';' n start doEndOfRule # ';' terminates the rule; back to 'start' for the next one | |
79 | white_space n break-rule-end # skip white space before the ';' | |
80 | default errorDeath doRuleError # anything else after the expression is a syntax error | |
81 | ||
2ca993e8 A |
82 | # |
83 | # start of a rule, after having seen a '^' (inhibits rule chain in). | |
84 | # Similar to the main 'start' state in most respects, except | |
85 | # - empty rule is an error. | |
86 | # - A second '^' is an error. | |
87 | # | |
88 | start-after-caret: | |
89 | escaped term doExprStart # no push on this row: 'start' pushed break-rule-end on its '^' row | |
90 | white_space n start-after-caret # skip white space after the '^' | |
91 | '^' errorDeath doRuleError # two '^'s | |
92 | '$' scan-var-name ^term-var-ref doExprStart # returns to term-var-ref, not assign-or-rule | |
93 | ';' errorDeath doRuleError # ^ ; | |
94 | eof errorDeath doRuleError # input ends right after the '^' | |
95 | default term doExprStart # anything else begins the rule expression | |
96 | ||
b75a7d8f | 97 | # |
374ca955 A |
98 | # ! We've just scanned a '!', indicating either a !!key word flag or a |
99 | # !Reverse rule. | |
b75a7d8f | 100 | # |
374ca955 A |
101 | rev-option: | |
102 | '!' n option-scan1 # second '!': a !!keyword option follows | |
103 | default reverse-rule ^break-rule-end doReverseDir # single '!': a !Reverse rule; character is not consumed | |
104 | ||
105 | option-scan1: | |
106 | name_start_char n option-scan2 doOptionStart # first character of the keyword that follows '!!' | |
107 | default errorDeath doRuleError # '!!' not followed by a name start character | |
108 | ||
109 | option-scan2: | |
110 | name_char n option-scan2 # consume the remaining characters of the keyword | |
111 | default option-scan3 doOptionEnd # first non-name character ends the keyword; it is not consumed here | |
112 | ||
113 | option-scan3: | |
114 | ';' n start # ';' ends the option statement | |
115 | white_space n option-scan3 # white space is allowed before the ';' | |
116 | default errorDeath doRuleError # anything else after the keyword is an error | |
117 | ||
118 | ||
b75a7d8f A |
119 | reverse-rule: |
120 | default term ^break-rule-end doExprStart # NOTE(review): rev-option pushed break-rule-end before coming here, so it is pushed twice - confirm the extra stack entry is intended | |
121 | ||
122 | ||
123 | # | |
124 | # term. Eat through a single rule character, or a composite thing, which | |
125 | # could be a parenthesized expression, a variable name, or a Unicode Set. | |
126 | # | |
127 | term: | |
128 | escaped n expr-mod doRuleChar # a single escaped character is a complete term | |
129 | white_space n term # skip white space before the term | |
130 | rule_char n expr-mod doRuleChar # a single rule character is a complete term | |
131 | '[' scan-unicode-set ^expr-mod # '[' is left for scan-unicode-set to consume; it pops to expr-mod | |
132 | '(' n term ^expr-mod doLParen # parenthesized sub-expression; expr-mod resumes once it pops | |
133 | '$' scan-var-name ^term-var-ref # '$' is consumed by scan-var-name, which pops to term-var-ref | |
134 | '.' n expr-mod doDotAny # '.' is a complete term | |
135 | default errorDeath doRuleError # character cannot start a term | |
136 | ||
137 | ||
138 | ||
139 | # | |
140 | # term-var-ref We've just finished scanning a reference to a $variable. | |
141 | # Check that the variable was defined. | |
142 | # The variable name scanning is in common with assignment statements, | |
143 | # so the check can't be done there. | |
144 | term-var-ref: | |
145 | default expr-mod doCheckVarDef # consumes no input; runs the check, then looks for a trailing operator | |
146 | ||
147 | ||
148 | # | |
149 | # expr-mod We've just finished scanning a term, now look for the optional | |
150 | # trailing '*', '?', '+' | |
151 | # | |
152 | expr-mod: | |
153 | white_space n expr-mod # white space may separate a term from its operator | |
154 | '*' n expr-cont doUnaryOpStar # trailing '*' | |
155 | '+' n expr-cont doUnaryOpPlus # trailing '+' | |
156 | '?' n expr-cont doUnaryOpQuestion # trailing '?' | |
157 | default expr-cont # no trailing operator; character is left for expr-cont | |
158 | ||
159 | ||
160 | # | |
161 | # expr-cont Expression, continuation. At a point where additional terms are | |
162 | # allowed, but not required. | |
163 | # | |
164 | expr-cont: | |
165 | escaped term doExprCatOperator # another term follows; character is left for 'term' | |
166 | white_space n expr-cont # skip white space between terms | |
167 | rule_char term doExprCatOperator # another term follows; character is left for 'term' | |
168 | '[' term doExprCatOperator # set term follows; '[' is left for 'term' | |
169 | '(' term doExprCatOperator # parenthesized term follows; '(' is left for 'term' | |
170 | '$' term doExprCatOperator # variable term follows; '$' is left for 'term' | |
171 | '.' term doExprCatOperator # '.' term follows; left for 'term' | |
172 | '/' look-ahead doExprCatOperator # '/' is left for look-ahead to consume | |
173 | '{' n tag-open doExprCatOperator # start of a {ddd} tag | |
174 | '|' n term doExprOrOperator # OR operator; a term must follow | |
175 | ')' n pop doExprRParen # close of a parenthesized expression; return to the pushed state | |
176 | default pop doExprFinished # expression is over; character is left for the popped state | |
177 | ||
178 | ||
179 | # | |
180 | # look-ahead Scanning a '/', which identifies a break point, assuming that the | |
181 | # remainder of the expression matches. | |
182 | # | |
183 | # Generate a parse tree as if this was a special kind of input symbol | |
184 | # appearing in an otherwise normal concatenation expression. | |
185 | # | |
186 | look-ahead: | |
187 | '/' n expr-cont-no-slash doSlash # consume the '/'; a second '/' may not follow directly | |
188 | default errorDeath # rows that lead here matched '/' without consuming it, so this row should not be reached | |
189 | ||
190 | ||
191 | # | |
192 | # expr-cont-no-slash Expression, continuation. At a point where additional terms are | |
193 | # allowed, but not required. Just like | |
194 | # expr-cont, above, except that no '/' | |
195 | # look-ahead symbol is permitted. | |
196 | # | |
197 | expr-cont-no-slash: | |
198 | escaped term doExprCatOperator # another term follows; character is left for 'term' | |
199 | white_space n expr-cont-no-slash # stay in this state: skipping white space must not re-enable '/' | |
200 | rule_char term doExprCatOperator # another term follows; character is left for 'term' | |
201 | '[' term doExprCatOperator # set term follows; '[' is left for 'term' | |
202 | '(' term doExprCatOperator # parenthesized term follows; '(' is left for 'term' | |
203 | '$' term doExprCatOperator # variable term follows; '$' is left for 'term' | |
204 | '.' term doExprCatOperator # '.' term follows; left for 'term' | |
205 | '|' n term doExprOrOperator # OR operator; a term must follow | |
206 | ')' n pop doExprRParen # close of a parenthesized expression; return to the pushed state | |
207 | default pop doExprFinished # expression is over ('/' lands here too); character is left for the popped state | |
208 | ||
209 | ||
210 | # | |
211 | # tags scanning a '{', the opening delimiter for a tag that identifies | |
212 | # the kind of match. Scan the whole {dddd} tag, where d=digit | |
213 | # | |
214 | tag-open: | |
215 | white_space n tag-open # white space is allowed after the '{' | |
216 | digit_char tag-value doStartTagValue # first digit is left for tag-value to consume | |
217 | default errorDeath doTagExpectedError # '{' not followed by a digit | |
218 | ||
219 | tag-value: | |
220 | white_space n tag-close # white space ends the digits; tag-close then requires the '}' | |
221 | '}' tag-close # '}' is left for tag-close to consume | |
222 | digit_char n tag-value doTagDigit # next digit of the tag value | |
223 | default errorDeath doTagExpectedError # not a digit, white space or '}' | |
224 | ||
225 | tag-close: | |
226 | white_space n tag-close # white space is allowed before the '}' | |
227 | '}' n expr-cont-no-tag doTagValue # end of the {ddd} tag; another tag may not follow directly | |
228 | default errorDeath doTagExpectedError # missing closing '}' | |
229 | ||
230 | ||
231 | ||
232 | # | |
233 | # expr-cont-no-tag Expression, continuation. At a point where additional terms are | |
234 | # allowed, but not required. Just like | |
235 | # expr-cont, above, except that no "{ddd}" | |
236 | # tagging is permitted. | |
237 | # | |
238 | expr-cont-no-tag: | |
239 | escaped term doExprCatOperator # another term follows; character is left for 'term' | |
240 | white_space n expr-cont-no-tag # skip white space; '{' stays disallowed | |
241 | rule_char term doExprCatOperator # another term follows; character is left for 'term' | |
242 | '[' term doExprCatOperator # set term follows; '[' is left for 'term' | |
243 | '(' term doExprCatOperator # parenthesized term follows; '(' is left for 'term' | |
244 | '$' term doExprCatOperator # variable term follows; '$' is left for 'term' | |
245 | '.' term doExprCatOperator # '.' term follows; left for 'term' | |
246 | '/' look-ahead doExprCatOperator # '/' is left for look-ahead to consume | |
247 | '|' n term doExprOrOperator # OR operator; a term must follow | |
248 | ')' n pop doExprRParen # close of a parenthesized expression; return to the pushed state | |
249 | default pop doExprFinished # expression is over ('{' lands here too); character is left for the popped state | |
250 | ||
251 | ||
252 | ||
253 | ||
254 | # | |
255 | # Variable Name Scanning. | |
256 | # | |
257 | # The state that branched to here must have pushed a return state | |
258 | # to go to after completion of the variable name scanning. | |
259 | # | |
260 | # The current input character must be the $ that introduces the name. | |
261 | # The $ is consumed here rather than in the state that first detected it | |
262 | # so that the doStartVariableName action only needs to happen in one | |
263 | # place (here), and the other states don't need to worry about it. | |
264 | # | |
265 | scan-var-name: | |
266 | '$' n scan-var-start doStartVariableName # consume the '$' that introduces the name | |
267 | default errorDeath # every visible transition into this state is on '$', so this row should not be reached | |
268 | ||
269 | ||
270 | scan-var-start: | |
271 | name_start_char n scan-var-body # first character of the variable name | |
272 | default errorDeath doVariableNameExpectedErr # '$' not followed by a name start character | |
273 | ||
274 | scan-var-body: | |
275 | name_char n scan-var-body # consume the remaining characters of the name | |
276 | default pop doEndVariableName # name is over; character is left for the state pushed by the caller | |
277 | ||
278 | ||
279 | ||
280 | # | |
281 | # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class. | |
282 | # Within the RBBI parser, after finding the first character | |
283 | # of a Unicode Set, we just hand the rule input at that | |
284 | # point off to the Unicode Set constructor, then pick | |
285 | # up parsing after the close of the set. | |
286 | # | |
287 | # The action for this state invokes the UnicodeSet parser. | |
288 | # | |
289 | scan-unicode-set: | |
290 | '[' n pop doScanUnicodeSet # action invokes the UnicodeSet parser; then return to the pushed state | |
291 | 'p' n pop doScanUnicodeSet # NOTE(review): the only visible transition into this state is on '[' (from 'term') - confirm this row is reachable | |
292 | 'P' n pop doScanUnicodeSet # NOTE(review): same reachability question as the 'p' row | |
293 | default errorDeath # not the start of a set | |
294 | ||
295 | ||
296 | ||
297 | ||
298 | ||
299 | ||
300 | ||
301 | # | |
302 | # assign-or-rule. A $variable was encountered at the start of something, could be | |
303 | # either an assignment statement or a rule, depending on whether an '=' | |
304 | # follows the variable name. We get to this state when the variable name | |
305 | # scanning does a return. | |
306 | # | |
307 | assign-or-rule: | |
308 | white_space n assign-or-rule # skip white space after the variable name | |
309 | '=' n term ^assign-end doStartAssign # variable was target of assignment | |
310 | default term-var-ref ^break-rule-end # variable was a term in a rule | |
311 | ||
312 | ||
313 | ||
314 | # | |
315 | # assign-end This state is entered when the end of the expression on the | |
316 | # right hand side of an assignment is found. We get here via | |
317 | # a pop; this state is pushed when the '=' in an assignment is found. | |
318 | # | |
319 | # The only thing allowed at this point is a ';'. The RHS of an | |
320 | # assignment must look like a rule expression, and we come here | |
321 | # when what is being scanned no longer looks like an expression. | |
322 | # | |
323 | assign-end: | |
324 | ';' n start doEndAssign # ';' completes the assignment; back to 'start' | |
325 | default errorDeath doRuleErrorAssignExpr # right hand side was not followed by ';' | |
326 | ||
327 | ||
328 | ||
329 | # | |
330 | # errorDeath. This state is specified as the next state whenever a syntax error | |
331 | # in the source rules is detected. Barring bugs, the state machine will never | |
332 | # actually get here, but will stop because of the action associated with the error. | |
333 | # But, just in case, this state asks the state machine to exit. | |
334 | errorDeath: | |
335 | default n errorDeath doExit # safety net: consume the character and request exit on any input | |
336 | ||
337 |