]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbirpt.txt
ICU-59131.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbirpt.txt
1
2 #*****************************************************************************
3 #
4 # Copyright (C) 2016 and later: Unicode, Inc. and others.
5 # License & terms of use: http://www.unicode.org/copyright.html#License
6 #
7 #*****************************************************************************
8 #*****************************************************************************
9 #
10 # Copyright (C) 2002-2016, International Business Machines Corporation and others.
11 # All Rights Reserved.
12 #
13 #*****************************************************************************
14 #
15 # file: rbbirpt.txt
16 # ICU Break Iterator Rule Parser State Table
17 #
18 # This state table is used when reading and parsing a set of RBBI rules.
19 # The rule parser uses a state machine; the data in this file define the
20 # state transitions that occur for each input character.
21 #
22 # *** This file defines the RBBI rule grammar. This is it.
23 # *** The determination of what is accepted is here.
24 #
25 # This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
26 # that are then built with the rule parser.
27 #
28 # perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
29
30 #
31 # Here is the syntax of the state definitions in this file:
32 #
33 #
34 #StateName:
35 # input-char n next-state ^push-state action
36 # input-char n next-state ^push-state action
37 # | | | | |
38 # | | | | |--- action to be performed by state machine
39 # | | | | See function RBBIRuleScanner::doParseActions()
40 # | | | |
41 # | | | |--- Push this named state onto the state stack.
42 # | | | Later, when next state is specified as "pop",
43 # | | | the pushed state will become the current state.
44 # | | |
45 # | | |--- Transition to this state if the current input character matches the input
46 # | | character or char class in the left hand column. "pop" causes the next
47 # | | state to be popped from the state stack.
48 # | |
49 # | |--- When making the state transition specified on this line, advance to the next
50 # | character from the input only if 'n' appears here.
51 # |
52 # |--- Character or named character classes to test for. If the current character being scanned
53 # matches, perform the actions and go to the state specified on this line.
54 # The input character is tested sequentially, in the order written. The characters and
55 # character classes tested for do not need to be mutually exclusive. The first match wins.
56 #
57
58
59
60
61 #
62 # start state, scan position is at the beginning of the rules file, or in between two rules.
63 #
64 start:
65 escaped term ^break-rule-end doExprStart # escaped char begins a rule; it is left for 'term', and break-rule-end is pushed as the return state
66 white_space n start # consume white space between rules
67 '^' n start-after-caret ^break-rule-end doNoChain # consume '^'; break-rule-end is pushed here, so the rows of start-after-caret that go to 'term' do not push it again
68 '$' scan-var-name ^assign-or-rule doExprStart # '$' is not consumed here; scan-var-name consumes it and later pops to assign-or-rule
69 '!' n rev-option # consume '!'; rev-option looks at the character that follows it
70 ';' n start # ignore empty rules.
71 eof exit # at end of input the next state is 'exit'
72 default term ^break-rule-end doExprStart # any other character begins a rule expression, same transition as the 'escaped' row
73
74 #
75 # break-rule-end: Returned from doing a break-rule expression.
76 #
77 break-rule-end:
78 ';' n start doEndOfRule # ';' ends the rule; consume it and go back to 'start'
79 white_space n break-rule-end # consume white space before the ';'
80 default errorDeath doRuleError # anything other than ';' or white space here is an error
81
82 #
83 # start of a rule, after having seen a '^' (inhibits rule chain in).
84 # Similar to the main 'start' state in most respects, except
85 # - empty rule is an error.
86 # - A second '^' is an error.
87 #
88 start-after-caret:
89 escaped term doExprStart # escaped char begins the expression; nothing is pushed on this row
90 white_space n start-after-caret # consume white space after the '^'
91 '^' errorDeath doRuleError # two '^'s
92 '$' scan-var-name ^term-var-ref doExprStart # '$' is left for scan-var-name, which later pops to term-var-ref (no assignment path from this state)
93 ';' errorDeath doRuleError # ^ ;
94 eof errorDeath doRuleError # end of input right after '^' is an error
95 default term doExprStart # any other character begins the expression; nothing is pushed on this row
96
97 #
98 # ! We've just scanned a '!', indicating either a !!key word flag or a
99 # !Reverse rule.
100 #
101 rev-option:
102 '!' n option-scan1 # second '!': consume it and scan the key word that follows
103 default reverse-rule ^break-rule-end doReverseDir # single '!': continue at reverse-rule without consuming input; break-rule-end is pushed
104
105 option-scan1:
106 name_start_char n option-scan2 doOptionStart # first character of the key word after "!!"; consume it
107 default errorDeath doRuleError # "!!" was not followed by a name start character
108
109 option-scan2:
110 name_char n option-scan2 # consume the remaining characters of the key word
111 default option-scan3 doOptionEnd # first non-name character ends the key word; it is not consumed here
112
113 option-scan3:
114 ';' n start # ';' closes the option statement; consume it and go back to 'start'
115 white_space n option-scan3 # consume white space between the key word and the ';'
116 default errorDeath doRuleError # anything else after the key word is an error
117
118
119 reverse-rule:
120 default term ^break-rule-end doExprStart # unconditional: begin an expression at 'term' without consuming input. NOTE(review): rev-option also pushes break-rule-end on its way here, so it is pushed twice on this path - confirm that is intended
121
122
123 #
124 # term. Eat through a single rule character, or a composite thing, which
125 # could be a parenthesized expression, a variable name, or a Unicode Set.
126 #
127 term:
128 escaped n expr-mod doRuleChar # an escaped character is a single-character term; consume it
129 white_space n term # consume white space before the term
130 rule_char n expr-mod doRuleChar # an ordinary rule character is a single-character term; consume it
131 '[' scan-unicode-set ^expr-mod # '[' is left for scan-unicode-set, which pops to expr-mod when done
132 '(' n term ^expr-mod doLParen # consume '('; expr-mod is pushed so scanning resumes there when the nested expression pops
133 '$' scan-var-name ^term-var-ref # '$' is left for scan-var-name, which pops to term-var-ref when done
134 '.' n expr-mod doDotAny # consume '.'
135 default errorDeath doRuleError # no row above accepts this character as the start of a term
136
137
138
139 #
140 # term-var-ref We've just finished scanning a reference to a $variable.
141 # Check that the variable was defined.
142 # The variable name scanning is in common with assignment statements,
143 # so the check can't be done there.
144 term-var-ref:
145 default expr-mod doCheckVarDef # unconditional: run doCheckVarDef and continue at expr-mod without consuming input
146
147
148 #
149 # expr-mod We've just finished scanning a term, now look for the optional
150 # trailing '*', '?', '+'
151 #
152 expr-mod:
153 white_space n expr-mod # consume white space between the term and a possible operator
154 '*' n expr-cont doUnaryOpStar # consume '*'
155 '+' n expr-cont doUnaryOpPlus # consume '+'
156 '?' n expr-cont doUnaryOpQuestion # consume '?'
157 default expr-cont # no trailing operator; the character is left for expr-cont
158
159
160 #
161 # expr-cont Expression, continuation. At a point where additional terms are
162 # allowed, but not required.
163 #
164 expr-cont:
165 escaped term doExprCatOperator # another term follows: run the concatenation action and leave the character for 'term'
166 white_space n expr-cont # consume white space between terms
167 rule_char term doExprCatOperator # same as the 'escaped' row
168 '[' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '['
169 '(' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '('
170 '$' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '$'
171 '.' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '.'
172 '/' look-ahead doExprCatOperator # '/' is left for look-ahead to consume
173 '{' n tag-open doExprCatOperator # consume '{' and scan the tag
174 '|' n term doExprOrOperator # consume '|'; a term is expected next
175 ')' n pop doExprRParen # consume ')' and return to the state on top of the state stack
176 default pop doExprFinished # any other character ends the expression; it is left for the popped state
177
178
179 #
180 # look-ahead Scanning a '/', which identifies a break point, assuming that the
181 # remainder of the expression matches.
182 #
183 # Generate a parse tree as if this was a special kind of input symbol
184 # appearing in an otherwise normal concatenation expression.
185 #
186 look-ahead:
187 '/' n expr-cont-no-slash doSlash # consume the '/'; the next state has no '/' row
188 default errorDeath # no action is named on this row. NOTE(review): the transitions into this state visible in this file are all taken on '/', so this row looks unreachable - confirm
189
190
191 #
192 # expr-cont-no-slash Expression, continuation. At a point where additional terms are
193 # allowed, but not required. Just like
194 # expr-cont, above, except that no '/'
195 # look-ahead symbol is permitted.
196 #
197 expr-cont-no-slash:
198 escaped term doExprCatOperator # another term follows: run the concatenation action and leave the character for 'term'
199 white_space n expr-cont-no-slash # consume white space and stay in this state; going to expr-cont here would let a '/' or '{' through after white space
200 rule_char term doExprCatOperator # same as the 'escaped' row
201 '[' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '['
202 '(' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '('
203 '$' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '$'
204 '.' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '.'
205 '|' n term doExprOrOperator # consume '|'; a term is expected next
206 ')' n pop doExprRParen # consume ')' and return to the state on top of the state stack
207 default pop doExprFinished # any other character (including '/') ends the expression; it is left for the popped state
208
209
210 #
211 # tags scanning a '{', the opening delimiter for a tag that identifies
212 # the kind of match. Scan the whole {dddd} tag, where d=digit
213 #
214 tag-open:
215 white_space n tag-open # consume white space after the '{'
216 digit_char tag-value doStartTagValue # first digit is not consumed here; tag-value consumes it
217 default errorDeath doTagExpectedError # '{' was not followed by a digit
218
219 tag-value:
220 white_space n tag-close # white space ends the digits; consume it and look for the '}'
221 '}' tag-close # '}' is left for tag-close to consume
222 digit_char n tag-value doTagDigit # consume one digit of the tag value
223 default errorDeath doTagExpectedError # anything else inside the tag is an error
224
225 tag-close:
226 white_space n tag-close # consume white space before the '}'
227 '}' n expr-cont-no-tag doTagValue # consume '}'; the next state has no '{' row
228 default errorDeath doTagExpectedError # anything other than '}' or white space here is an error
229
230
231
232 #
233 # expr-cont-no-tag Expression, continuation. At a point where additional terms are
234 # allowed, but not required. Just like
235 # expr-cont, above, except that no "{ddd}"
236 # tagging is permitted.
237 #
238 expr-cont-no-tag:
239 escaped term doExprCatOperator # another term follows: run the concatenation action and leave the character for 'term'
240 white_space n expr-cont-no-tag # consume white space and stay in this state
241 rule_char term doExprCatOperator # same as the 'escaped' row
242 '[' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '['
243 '(' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '('
244 '$' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '$'
245 '.' term doExprCatOperator # same as the 'escaped' row; 'term' handles the '.'
246 '/' look-ahead doExprCatOperator # '/' is left for look-ahead to consume
247 '|' n term doExprOrOperator # consume '|'; a term is expected next
248 ')' n pop doExprRParen # consume ')' and return to the state on top of the state stack
249 default pop doExprFinished # any other character (including '{') ends the expression; it is left for the popped state
250
251
252
253
254 #
255 # Variable Name Scanning.
256 #
257 # The state that branched to here must have pushed a return state
258 # to go to after completion of the variable name scanning.
259 #
260 # The current input character must be the $ that introduces the name.
261 # The $ is consumed here rather than in the state that first detected it
262 # so that the doStartVariableName action only needs to happen in one
263 # place (here), and the other states don't need to worry about it.
264 #
265 scan-var-name:
266 '$' n scan-var-start doStartVariableName # consume the '$' that introduces the name
267 default errorDeath # no action is named on this row; taken only if the current character is not '$'
268
269
270 scan-var-start:
271 name_start_char n scan-var-body # consume the first character of the variable name
272 default errorDeath doVariableNameExpectedErr # '$' was not followed by a name start character
273
274 scan-var-body:
275 name_char n scan-var-body # consume the remaining characters of the variable name
276 default pop doEndVariableName # first non-name character ends the name; it is left for the state popped from the state stack
277
278
279
280 #
281 # scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.
282 # Within the RBBI parser, after finding the first character
283 # of a Unicode Set, we just hand the rule input at that
284 # point off to the Unicode Set constructor, then pick
285 # up parsing after the close of the set.
286 #
287 # The action for this state invokes the UnicodeSet parser.
288 #
289 scan-unicode-set:
290 '[' n pop doScanUnicodeSet # run doScanUnicodeSet, then return to the state on top of the state stack
291 'p' n pop doScanUnicodeSet # same transition for 'p'. NOTE(review): the only transition into this state visible in this file is taken on '[' - confirm how 'p' is reached
292 'P' n pop doScanUnicodeSet # same transition for 'P'; see the note on the 'p' row
293 default errorDeath # no action is named on this row
294
295
296
297
298
299
300
301 #
302 # assign-or-rule. A $variable was encountered at the start of something, could be
303 # either an assignment statement or a rule, depending on whether an '='
304 # follows the variable name. We get to this state when the variable name
305 # scanning does a return.
306 #
307 assign-or-rule:
308 white_space n assign-or-rule # consume white space after the variable name
309 '=' n term ^assign-end doStartAssign # variable was target of assignment; consume '=' and push assign-end as the return state for the right hand side
310 default term-var-ref ^break-rule-end # variable was a term in a rule; the character is not consumed and break-rule-end is pushed as the return state
311
312
313
314 #
315 # assign-end This state is entered when the end of the expression on the
316 # right hand side of an assignment is found. We get here via
317 # a pop; this state is pushed when the '=' in an assignment is found.
318 #
319 # The only thing allowed at this point is a ';'. The RHS of an
320 # assignment must look like a rule expression, and we come here
321 # when what is being scanned no longer looks like an expression.
322 #
323 assign-end:
324 ';' n start doEndAssign # ';' ends the assignment; consume it and go back to 'start'
325 default errorDeath doRuleErrorAssignExpr # anything other than ';' here is an error (note: there is no white_space row in this state)
326
327
328
329 #
330 # errorDeath. This state is specified as the next state whenever a syntax error
331 # in the source rules is detected. Barring bugs, the state machine will never
332 # actually get here, but will stop because of the action associated with the error.
333 # But, just in case, this state asks the state machine to exit.
334 errorDeath:
335 default n errorDeath doExit # unconditional: consume a character, stay in this state and run doExit
336
337