]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbirpt.txt
ICU-57132.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbirpt.txt
1
2 #*****************************************************************************
3 #
4 # Copyright (C) 2002-2016, International Business Machines Corporation and others.
5 # All Rights Reserved.
6 #
7 #*****************************************************************************
8 #
9 # file: rbbirpt.txt
10 # ICU Break Iterator Rule Parser State Table
11 #
12 # This state table is used when reading and parsing a set of RBBI rules.
13 # The rule parser uses a state machine; the data in this file define the
14 # state transitions that occur for each input character.
15 #
16 # *** This file defines the RBBI rule grammar. This is it.
17 # *** The determination of what is accepted is here.
18 #
19 # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
20 # that are then built with the rule parser.
21 #
22 # perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
23
24 #
25 # Here is the syntax of the state definitions in this file:
26 #
27 #
28 #StateName:
29 #   input-char   n next-state           ^push-state     action
30 #   input-char   n next-state           ^push-state     action
31 #       |        |   |                      |             |
32 #       |        |   |                      |             |--- action to be performed by state machine
33 #       |        |   |                      |                  See function RBBIRuleScanner::doParseActions()
34 #       |        |   |                      |
35 #       |        |   |                      |--- Push this named state onto the state stack.
36 #       |        |   |                           Later, when next state is specified as "pop",
37 #       |        |   |                           the pushed state will become the current state.
38 #       |        |   |
39 #       |        |   |--- Transition to this state if the current input character matches the input
40 #       |        |        character or char class in the left hand column.  "pop" causes the next
41 #       |        |        state to be popped from the state stack.
42 #       |        |
43 #       |        |--- When making the state transition specified on this line, advance to the next
44 #       |             character from the input only if 'n' appears here.
45 #       |
46 #       |--- Character or named character classes to test for.  If the current character being scanned
47 #            matches, perform the actions and go to the state specified on this line.
48 #            The input character is tested sequentially, in the order written.  The characters and
49 #            character classes tested for do not need to be mutually exclusive.  The first match wins.
50 #
51
52
53
54
55 #
56 # start state, scan position is at the beginning of the rules file, or in between two rules.
57 #
58 start:
59 escaped term ^break-rule-end doExprStart  # an escaped char begins a rule; it is left for 'term', and break-rule-end is pushed as the return state
60 white_space n start  # skip white space before / between rules
61 '^' n start-after-caret ^break-rule-end doNoChain  # consume a leading '^' (inhibits rule chain in); break-rule-end is pushed here for the rule that follows
62 '$' scan-var-name ^assign-or-rule doExprStart  # '$' is left for scan-var-name to consume; assign-or-rule then decides assignment vs. rule
63 '!' n rev-option  # consume '!'; rev-option distinguishes a !!option from a !reverse rule
64 ';' n start # ignore empty rules.
65 eof exit  # end of input; 'exit' is not a state defined in this file - presumably a built-in stop state, confirm in the scanner
66 default term ^break-rule-end doExprStart  # anything else starts a rule expression; the char is not consumed here, 'term' examines it
67
68 #
69 # break-rule-end: Returned from doing a break-rule expression.
70 #
71 break-rule-end:
72 ';' n start doEndOfRule  # ';' terminates the rule: consume it and return to 'start'
73 white_space n break-rule-end  # skip white space in front of the terminating ';'
74 default errorDeath doRuleError  # anything other than ';' after a complete rule expression is an error
75
76 #
77 # start of a rule, after having seen a '^' (inhibits rule chain in).
78 # Similar to the main 'start' state in most respects, except
79 # - empty rule is an error.
80 # - A second '^' is an error.
81 #
82 start-after-caret:
83 escaped term doExprStart  # nothing is pushed in this state: the '^' row of 'start' already pushed break-rule-end
84 white_space n start-after-caret  # skip white space after the '^'
85 '^' errorDeath doRuleError # two '^'s
86 '$' scan-var-name ^term-var-ref doExprStart  # returns to term-var-ref (not assign-or-rule): the variable is treated as a term of the rule
87 ';' errorDeath doRuleError # ^ ;
88 eof errorDeath doRuleError  # input ended right after the '^'
89 default term doExprStart  # anything else starts the rule expression; the char is left for 'term'
90
91 #
92 # ! We've just scanned a '!', indicating either a !!key word flag or a
93 # !Reverse rule.
94 #
95 rev-option:
96 '!' n option-scan1  # a second '!' was found: "!!" introduces a key word option
97 default reverse-rule ^break-rule-end doReverseDir  # single '!': a !Reverse rule; the current char is left for reverse-rule
98
99 option-scan1:
100 name_start_char n option-scan2 doOptionStart  # entered after "!!": consume the first character of the option name
101 default errorDeath doRuleError  # "!!" not followed by a name-start character is an error
102
103 option-scan2:
104 name_char n option-scan2  # consume the remaining characters of the option name
105 default option-scan3 doOptionEnd  # the first non-name character ends the name; it is not consumed here
106
107 option-scan3:
108 ';' n start  # the option statement is terminated by ';'; back to 'start'
109 white_space n option-scan3  # white space is allowed between the option name and the ';'
110 default errorDeath doRuleError  # anything else after the option name is an error
111
112
113 reverse-rule:
114 default term ^break-rule-end doExprStart  # unconditional: begin the rule expression. NOTE(review): rev-option also pushed break-rule-end on the way here, so it is pushed twice for one rule - confirm this is intended
115
116
117 #
118 # term. Eat through a single rule character, or a composite thing, which
119 # could be a parenthesized expression, a variable name, or a Unicode Set.
120 #
121 term:
122 escaped n expr-mod doRuleChar  # an escaped character is a single-character term
123 white_space n term  # skip white space in front of the term
124 rule_char n expr-mod doRuleChar  # an ordinary rule character is a single-character term
125 '[' scan-unicode-set ^expr-mod  # Unicode Set: '[' is left for scan-unicode-set, which pops back to expr-mod
126 '(' n term ^expr-mod doLParen  # parenthesized expression: scan the inner term; expr-mod is the return state for the later pop
127 '$' scan-var-name ^term-var-ref  # variable reference: '$' is left for scan-var-name, which pops to term-var-ref
128 '.' n expr-mod doDotAny  # '.' is a term of its own
129 default errorDeath doRuleError  # the current character cannot start a term
130
131
132
133 #
134 # term-var-ref We've just finished scanning a reference to a $variable.
135 # Check that the variable was defined.
136 # The variable name scanning is in common with assignment statements,
137 # so the check can't be done there.
138 term-var-ref:
139 default expr-mod doCheckVarDef  # unconditional, consumes nothing: run the variable check, then continue in expr-mod as after any other term
140
141
142 #
143 # expr-mod We've just finished scanning a term, now look for the optional
144 # trailing '*', '?', '+'
145 #
146 expr-mod:
147 white_space n expr-mod  # skip white space between the term and an optional trailing operator
148 '*' n expr-cont doUnaryOpStar  # trailing '*' on the term just scanned
149 '+' n expr-cont doUnaryOpPlus  # trailing '+' on the term just scanned
150 '?' n expr-cont doUnaryOpQuestion  # trailing '?' on the term just scanned
151 default expr-cont  # no trailing operator; the current char is not consumed and is examined again by expr-cont
152
153
154 #
155 # expr-cont Expression, continuation. At a point where additional terms are
156 # allowed, but not required.
157 #
158 expr-cont:
159 escaped term doExprCatOperator  # another term follows directly; the char is left for 'term'
160 white_space n expr-cont  # skip white space between terms
161 rule_char term doExprCatOperator  # another term follows directly; the char is left for 'term'
162 '[' term doExprCatOperator  # a Unicode Set term follows; '[' is left for 'term'
163 '(' term doExprCatOperator  # a parenthesized term follows; '(' is left for 'term'
164 '$' term doExprCatOperator  # a variable reference follows; '$' is left for 'term'
165 '.' term doExprCatOperator  # a '.' term follows; left for 'term'
166 '/' look-ahead doExprCatOperator  # break point marker; the '/' itself is consumed in look-ahead
167 '{' n tag-open doExprCatOperator  # consume '{', the opening delimiter of a {dddd} tag
168 '|' n term doExprOrOperator  # consume '|'; a term must follow
169 ')' n pop doExprRParen  # consume ')' and return to the state on top of the stack (expr-mod when it was pushed by the '(' row of 'term')
170 default pop doExprFinished  # no further term: the expression is finished; char not consumed, return to the pushed state
171
172
173 #
174 # look-ahead Scanning a '/', which identifies a break point, assuming that the
175 # remainder of the expression matches.
176 #
177 # Generate a parse tree as if this was a special kind of input symbol
178 # appearing in an otherwise normal concatenation expression.
179 #
180 look-ahead:
181 '/' n expr-cont-no-slash doSlash  # consume the '/'; continue in the variant of expr-cont that has no '/' row
182 default errorDeath  # within this file the state is only entered from rows that matched '/', so this row is not expected to be taken; no error action is attached
183
184
185 #
186 # expr-cont-no-slash Expression, continuation. At a point where additional terms are
187 # allowed, but not required. Just like
188 # expr-cont, above, except that no '/'
189 # look-ahead symbol is permitted.
190 #
191 expr-cont-no-slash:
192 escaped term doExprCatOperator  # another term follows directly; the char is left for 'term'
193 white_space n expr-cont-no-slash  # stay in this state (was: expr-cont) so that white space after a '/' does not re-enable the '/' and '{' rows of expr-cont
194 rule_char term doExprCatOperator  # another term follows directly; the char is left for 'term'
195 '[' term doExprCatOperator  # a Unicode Set term follows; '[' is left for 'term'
196 '(' term doExprCatOperator  # a parenthesized term follows; '(' is left for 'term'
197 '$' term doExprCatOperator  # a variable reference follows; '$' is left for 'term'
198 '.' term doExprCatOperator  # a '.' term follows; left for 'term'
199 '|' n term doExprOrOperator  # consume '|'; a term must follow
200 ')' n pop doExprRParen  # consume ')' and return to the state on top of the stack
201 default pop doExprFinished  # no further term (this includes '/' and '{', which have no row here): expression finished, return to the pushed state
202
203
204 #
205 # tags scanning a '{', the opening delimiter for a tag that identifies
206 # the kind of match. Scan the whole {dddd} tag, where d=digit
207 #
208 tag-open:
209 white_space n tag-open  # white space is allowed between '{' and the first digit
210 digit_char tag-value doStartTagValue  # first digit found; it is not consumed here, tag-value consumes it
211 default errorDeath doTagExpectedError  # '{' must be followed by a digit
212
213 tag-value:
214 white_space n tag-close  # white space ends the digits; from here on only '}' is accepted
215 '}' tag-close  # end of the digits; '}' is left for tag-close to consume
216 digit_char n tag-value doTagDigit  # consume one digit of the tag value
217 default errorDeath doTagExpectedError  # anything else inside the tag is an error
218
219 tag-close:
220 white_space n tag-close  # skip white space in front of the closing '}'
221 '}' n expr-cont-no-tag doTagValue  # consume '}'; continue in the variant of expr-cont that has no '{' row
222 default errorDeath doTagExpectedError  # the tag was not closed with '}'
223
224
225
226 #
227 # expr-cont-no-tag Expression, continuation. At a point where additional terms are
228 # allowed, but not required. Just like
229 # expr-cont, above, except that no "{ddd}"
230 # tagging is permitted.
231 #
232 expr-cont-no-tag:
233 escaped term doExprCatOperator  # another term follows directly; the char is left for 'term'
234 white_space n expr-cont-no-tag  # skip white space, staying in this state so that a '{' is still not accepted afterwards
235 rule_char term doExprCatOperator  # another term follows directly; the char is left for 'term'
236 '[' term doExprCatOperator  # a Unicode Set term follows; '[' is left for 'term'
237 '(' term doExprCatOperator  # a parenthesized term follows; '(' is left for 'term'
238 '$' term doExprCatOperator  # a variable reference follows; '$' is left for 'term'
239 '.' term doExprCatOperator  # a '.' term follows; left for 'term'
240 '/' look-ahead doExprCatOperator  # break point marker; the '/' itself is consumed in look-ahead
241 '|' n term doExprOrOperator  # consume '|'; a term must follow
242 ')' n pop doExprRParen  # consume ')' and return to the state on top of the stack
243 default pop doExprFinished  # no further term (this includes '{', which has no row here): expression finished, return to the pushed state
244
245
246
247
248 #
249 # Variable Name Scanning.
250 #
251 # The state that branched to here must have pushed a return state
252 # to go to after completion of the variable name scanning.
253 #
254 # The current input character must be the $ that introduces the name.
255 # The $ is consumed here rather than in the state that first detected it
256 # so that the doStartVariableName action only needs to happen in one
257 # place (here), and the other states don't need to worry about it.
258 #
259 scan-var-name:
260 '$' n scan-var-start doStartVariableName  # consume the '$' that introduces the name
261 default errorDeath  # every row in this file that branches here matched '$', so this row is not expected to be taken; no error action is attached
262
263
264 scan-var-start:
265 name_start_char n scan-var-body  # consume the first character of the variable name
266 default errorDeath doVariableNameExpectedErr  # '$' was not followed by a name-start character
267
268 scan-var-body:
269 name_char n scan-var-body  # consume the remaining characters of the variable name
270 default pop doEndVariableName  # the first non-name character ends the name (not consumed); return to the state the caller pushed
271
272
273
274 #
275 # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.
276 # Within the RBBI parser, after finding the first character
277 # of a Unicode Set, we just hand the rule input at that
278 # point off to the Unicode Set constructor, then pick
279 # up parsing after the close of the set.
280 #
281 # The action for this state invokes the UnicodeSet parser.
282 #
283 scan-unicode-set:
284 '[' n pop doScanUnicodeSet  # first character of a set: the action invokes the UnicodeSet parser, then return to the pushed state
285 'p' n pop doScanUnicodeSet  # NOTE(review): no state in this file branches here on 'p'; presumably meant for \p{...} sets - confirm
286 'P' n pop doScanUnicodeSet  # NOTE(review): as for 'p'; presumably meant for \P{...} sets - confirm
287 default errorDeath  # the only row in this file that branches here matched '[', so this row is not expected to be taken; no error action is attached
288
289
290
291
292
293
294
295 #
296 # assign-or-rule. A $variable was encountered at the start of something, could be
297 # either an assignment statement or a rule, depending on whether an '='
298 # follows the variable name. We get to this state when the variable name
299 # scanning does a return.
300 #
301 assign-or-rule:
302 white_space n assign-or-rule  # skip white space between the variable name and a possible '='
303 '=' n term ^assign-end doStartAssign # variable was target of assignment; the right hand side is scanned as an expression and pops to assign-end
304 default term-var-ref ^break-rule-end # variable was a term in a rule; break-rule-end is pushed as the return state for the rest of the rule
305
306
307
308 #
309 # assign-end This state is entered when the end of the expression on the
310 # right hand side of an assignment is found. We get here via
311 # a pop; this state is pushed when the '=' in an assignment is found.
312 #
313 # The only thing allowed at this point is a ';'. The RHS of an
314 # assignment must look like a rule expression, and we come here
315 # when what is being scanned no longer looks like an expression.
316 #
317 assign-end:
318 ';' n start doEndAssign  # ';' terminates the assignment: consume it and return to 'start'
319 default errorDeath doRuleErrorAssignExpr  # no white_space row here: the expr-cont* states that pop to this state test white_space before their default pop row
320
321
322
323 #
324 # errorDeath. This state is specified as the next state whenever a syntax error
325 # in the source rules is detected. Barring bugs, the state machine will never
326 # actually get here, but will stop because of the action associated with the error.
327 # But, just in case, this state asks the state machine to exit.
328 errorDeath:
329 default n errorDeath doExit  # matches any input: consume it, stay in this state, and run doExit to ask the state machine to stop
330
331