]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbirpt.txt
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / common / rbbirpt.txt
CommitLineData
b75a7d8f
A
1
2#*****************************************************************************
3#
374ca955 4# Copyright (C) 2002-2003, International Business Machines Corporation and others.
b75a7d8f
A
5# All Rights Reserved.
6#
7#*****************************************************************************
8#
9# file: rbbirpt.txt
10# ICU Break Iterator Rule Parser State Table
11#
12# This state table is used when reading and parsing a set of RBBI rules.
13# The rule parser uses a state machine; the data in this file define the
14# state transitions that occur for each input character.
15#
16# *** This file defines the RBBI rule grammar. This is it.
17# *** The determination of what is accepted is here.
18#
19# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
20# that are then built with the rule parser.
21#
22
23#
24# Here is the syntax of the state definitions in this file:
25#
26#
27#StateName:
28# input-char n next-state ^push-state action
29# input-char n next-state ^push-state action
30# | | | | |
31# | | | | |--- action to be performed by state machine
32# | | | | See function RBBIRuleScanner::doParseActions()
33# | | | |
34# | | | |--- Push this named state onto the state stack.
35# | | | Later, when next state is specified as "pop",
36# | | | the pushed state will become the current state.
37# | | |
38# | | |--- Transition to this state if the current input character matches the input
39# | | character or char class in the left hand column. "pop" causes the next
40# | | state to be popped from the state stack.
41# | |
42# | |--- When making the state transition specified on this line, advance to the next
43# | character from the input only if 'n' appears here.
44# |
45# |--- Character or named character classes to test for. If the current character being scanned
46# matches, perform the actions and go to the state specified on this line.
47# The input character is tested sequentially, in the order written. The characters and
48# character classes tested for do not need to be mutually exclusive. The first match wins.
49#
50
51
52
53
54#
55# start state, scan position is at the beginning of the rules file, or in between two rules.
56#
57start:
58 escaped term ^break-rule-end doExprStart  # escaped char starts a rule; it is left for term to consume; return to break-rule-end
59 white_space n start  # skip white space between rules
60 '$' scan-var-name ^assign-or-rule doExprStart  # '$' is left for scan-var-name to consume; return to assign-or-rule
374ca955 61 '!' n rev-option  # consume '!'; rev-option decides between a "!!" option and a reverse rule
b75a7d8f
A
62 ';' n start # ignore empty rules.
63 eof exit  # NOTE(review): "exit" has no state definition in this table - presumably built into the parser; confirm
64 default term ^break-rule-end doExprStart  # any other char starts a rule; it is left for term to classify
65
66#
67# break-rule-end: Returned from doing a break-rule expression.
68#
69break-rule-end:
70 ';' n start doEndOfRule  # consume the terminating ';' and go back to start for the next rule
71 white_space n break-rule-end  # skip white space before the ';'
72 default errorDeath doRuleError  # any other char: take the doRuleError action and go to errorDeath
73
74
75#
374ca955
A
76# ! We've just scanned a '!', indicating either a !!key word flag or a
77# !Reverse rule.
b75a7d8f 78#
374ca955
A
79rev-option:
80 '!' n option-scan1  # second '!': this is a "!!" option; consume it and scan the option name
81 default reverse-rule doReverseDir  # single '!': reverse rule; char not consumed. No push here: reverse-rule pushes break-rule-end itself, and pushing it in both places left one stale state-stack entry per reverse rule
82
83option-scan1:
84 name_start_char n option-scan2 doOptionStart  # consume the first character of the option name
85 default errorDeath doRuleError  # "!!" not followed by a name-start character
86
87option-scan2:
88 name_char n option-scan2  # consume the rest of the option name
89 default option-scan3 doOptionEnd  # first non-name char ends the name; it is not consumed here
90
91option-scan3:
92 ';' n start  # consume the ';' that ends the option statement
93 white_space n option-scan3  # white space is allowed before the ';'
94 default errorDeath doRuleError  # any other char after the option name
95
96
b75a7d8f
A
97reverse-rule:
98 default term ^break-rule-end doExprStart  # single unconditional row: no input consumed; push break-rule-end and scan the rule body as a term
99
100
101#
102# term. Eat through a single rule character, or a composite thing, which
103# could be a parenthesized expression, a variable name, or a Unicode Set.
104#
105term:
106 escaped n expr-mod doRuleChar  # consume an escaped char
107 white_space n term  # skip white space before the term
108 rule_char n expr-mod doRuleChar  # consume an ordinary rule char
109 '[' scan-unicode-set ^expr-mod  # '[' is not consumed here; scan-unicode-set pops back to expr-mod
110 '(' n term ^expr-mod doLParen  # consume '('; scan the nested expression; the matching ')' row pops back to expr-mod
111 '$' scan-var-name ^term-var-ref  # '$' is not consumed here; after the name is scanned, pop to term-var-ref
112 '.' n expr-mod doDotAny  # consume '.'
113 default errorDeath doRuleError  # none of the above: this char cannot start a term
114
115
116
117#
118# term-var-ref We've just finished scanning a reference to a $variable.
119# Check that the variable was defined.
120# The variable name scanning is in common with assignment statements,
121# so the check can't be done there.
122term-var-ref:
123 default expr-mod doCheckVarDef  # single unconditional row: no input consumed; take doCheckVarDef, then continue as after any other term
124
125
126#
127# expr-mod We've just finished scanning a term, now look for the optional
128# trailing '*', '?', '+'
129#
130expr-mod:
131 white_space n expr-mod  # white space may separate a term from its trailing operator
132 '*' n expr-cont doUnaryOpStar  # consume '*'
133 '+' n expr-cont doUnaryOpPlus  # consume '+'
134 '?' n expr-cont doUnaryOpQuestion  # consume '?'
135 default expr-cont  # no trailing operator: char not consumed, no action
136
137
138#
139# expr-cont Expression, continuation. At a point where additional terms are
140# allowed, but not required.
141#
142expr-cont:
143 escaped term doExprCatOperator  # another term follows; the char is not consumed, term re-examines it
144 white_space n expr-cont  # skip white space between terms
145 rule_char term doExprCatOperator  # as above: char left for term
146 '[' term doExprCatOperator  # as above: char left for term
147 '(' term doExprCatOperator  # as above: char left for term
148 '$' term doExprCatOperator  # as above: char left for term
149 '.' term doExprCatOperator  # as above: char left for term
150 '/' look-ahead doExprCatOperator  # '/' is not consumed here; look-ahead consumes it
151 '{' n tag-open doExprCatOperator  # consume '{' and scan a {digits} tag
152 '|' n term doExprOrOperator  # consume '|'; the next term begins an alternative
153 ')' n pop doExprRParen  # consume ')'; return to the state on top of the state stack
154 default pop doExprFinished  # nothing that continues an expression: char not consumed; return to the pushed state
155
156
157#
158# look-ahead Scanning a '/', which identifies a break point, assuming that the
159# remainder of the expression matches.
160#
161# Generate a parse tree as if this was a special kind of input symbol
162# appearing in an otherwise normal concatenation expression.
163#
164look-ahead:
165 '/' n expr-cont-no-slash doSlash  # consume the '/'; continue in the state that does not offer another '/' row
166 default errorDeath  # no error action on this row; every transition into look-ahead visible in this table is taken with '/' as the current char
167
168
169#
170# expr-cont-no-slash Expression, continuation. At a point where additional terms are
171# allowed, but not required. Just like
172# expr-cont, above, except that no '/'
173# look-ahead symbol is permitted.
174#
175expr-cont-no-slash:
176 escaped term doExprCatOperator  # another term follows; the char is not consumed, term re-examines it
177 white_space n expr-cont-no-slash  # skip white space but stay in this state (was expr-cont, which re-enabled '/' and '{' after a single space)
178 rule_char term doExprCatOperator  # as above: char left for term
179 '[' term doExprCatOperator  # as above: char left for term
180 '(' term doExprCatOperator  # as above: char left for term
181 '$' term doExprCatOperator  # as above: char left for term
182 '.' term doExprCatOperator  # as above: char left for term
183 '|' n term doExprOrOperator  # consume '|'; the next term begins an alternative
184 ')' n pop doExprRParen  # consume ')'; return to the state on top of the state stack
185 default pop doExprFinished  # anything else, including '/' and '{': char not consumed; return to the pushed state
186
187
188#
189# tags scanning a '{', the opening delimiter for a tag that identifies
190# the kind of match. Scan the whole {dddd} tag, where d=digit
191#
192tag-open:
193 white_space n tag-open  # white space is allowed after the '{'
194 digit_char tag-value doStartTagValue  # first digit is not consumed here; tag-value consumes it
195 default errorDeath doTagExpectedError  # '{' not followed by a digit
196
197tag-value:
198 white_space n tag-close  # white space ends the run of digits
199 '}' tag-close  # '}' is not consumed here; tag-close consumes it
200 digit_char n tag-value doTagDigit  # consume one digit
201 default errorDeath doTagExpectedError  # any other char inside the tag
202
203tag-close:
204 white_space n tag-close  # white space is allowed before the '}'
205 '}' n expr-cont-no-tag doTagValue  # consume '}'; continue in the state that does not offer another '{' row
206 default errorDeath doTagExpectedError  # anything other than '}' after the digits
207
208
209
210#
211# expr-cont-no-tag Expression, continuation. At a point where additional terms are
212# allowed, but not required. Just like
213# expr-cont, above, except that no "{ddd}"
214# tagging is permitted.
215#
216expr-cont-no-tag:
217 escaped term doExprCatOperator  # another term follows; the char is not consumed, term re-examines it
218 white_space n expr-cont-no-tag  # skip white space, staying in this state
219 rule_char term doExprCatOperator  # as above: char left for term
220 '[' term doExprCatOperator  # as above: char left for term
221 '(' term doExprCatOperator  # as above: char left for term
222 '$' term doExprCatOperator  # as above: char left for term
223 '.' term doExprCatOperator  # as above: char left for term
224 '/' look-ahead doExprCatOperator  # '/' is not consumed here; look-ahead consumes it
225 '|' n term doExprOrOperator  # consume '|'; the next term begins an alternative
226 ')' n pop doExprRParen  # consume ')'; return to the state on top of the state stack
227 default pop doExprFinished  # anything else, including '{': char not consumed; return to the pushed state
228
229
230
231
232#
233# Variable Name Scanning.
234#
235# The state that branched to here must have pushed a return state
236# to go to after completion of the variable name scanning.
237#
238# The current input character must be the $ that introduces the name.
239# The $ is consumed here rather than in the state that first detected it
240# so that the doStartVariableName action only needs to happen in one
241# place (here), and the other states don't need to worry about it.
242#
243scan-var-name:
244 '$' n scan-var-start doStartVariableName  # consume the '$' that introduces the name
245 default errorDeath  # no error action on this row; every transition into this state visible in this table is taken with '$' as the current char
246
247
248scan-var-start:
249 name_start_char n scan-var-body  # consume the first character of the variable name
250 default errorDeath doVariableNameExpectedErr  # '$' not followed by a name-start character
251
252scan-var-body:
253 name_char n scan-var-body  # consume the rest of the variable name
254 default pop doEndVariableName  # first non-name char ends the name; it is not consumed; return to the state the caller pushed
255
256
257
258#
259# scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.
260# Within the RBBI parser, after finding the first character
261# of a Unicode Set, we just hand the rule input at that
262# point off to the Unicode Set constructor, then pick
263# up parsing after the close of the set.
264#
265# The action for this state invokes the UnicodeSet parser.
266#
267scan-unicode-set:
268 '[' n pop doScanUnicodeSet  # take doScanUnicodeSet, then return to the state the caller pushed (term pushes expr-mod)
269 'p' n pop doScanUnicodeSet  # NOTE(review): the only transition into this state visible in this table is term's '[' row, so it is unclear how 'p' is reached - confirm
270 'P' n pop doScanUnicodeSet  # NOTE(review): same question as the 'p' row - confirm
271 default errorDeath  # no error action on this row
272
273
274
275
276
277
278
279#
280# assign-or-rule. A $variable was encountered at the start of something, could be
281# either an assignment statement or a rule, depending on whether an '='
282# follows the variable name. We get to this state when the variable name
283# scanning does a return.
284#
285assign-or-rule:
286 white_space n assign-or-rule  # white space is allowed between the variable name and '='
287 '=' n term ^assign-end doStartAssign # variable was target of assignment; consume '=', scan the right hand side as a term, return to assign-end
288 default term-var-ref ^break-rule-end # variable was a term in a rule; char not consumed, return to break-rule-end when the rule expression ends
289
290
291
292#
293# assign-end This state is entered when the end of the expression on the
294# right hand side of an assignment is found. We get here via
295# a pop; this state is pushed when the '=' in an assignment is found.
296#
297# The only thing allowed at this point is a ';'. The RHS of an
298# assignment must look like a rule expression, and we come here
299# when what is being scanned no longer looks like an expression.
300#
301assign-end:
302 ';' n start doEndAssign  # consume the ';' that ends the assignment and go back to start
303 default errorDeath doRuleErrorAssignExpr  # anything other than ';' after the right hand side; note that no white_space row exists in this state
304
305
306
307#
308# errorDeath. This state is specified as the next state whenever a syntax error
309# in the source rules is detected. Barring bugs, the state machine will never
310# actually get here, but will stop because of the action associated with the error.
311# But, just in case, this state asks the state machine to exit.
312errorDeath:
313 default n errorDeath doExit  # single unconditional row: consume the char, stay in errorDeath, and take the doExit action
314
315