[apple/icu.git] / icuSources / common / rbbirpt.txt


#*****************************************************************************
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  rbbirpt.txt
#  ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules.
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action    
#   input-char           n next-state           ^push-state     action    
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            


#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
#     Decides what kind of statement comes next.  Only white space, '!' and ';' are
#     consumed here; any other character is left in place for the next state to re-test.
#     Rows that begin an expression push break-rule-end (or assign-or-rule for '$') so
#     that a later "pop" returns to the right follow-up state.
#     NOTE(review): "exit" is not defined as a state in this file; presumably it is
#     handled specially by the table generator or the scanner -- confirm.
#
start:
    escaped                term                  ^break-rule-end    doExprStart
    white_space          n start
    '$'                    scan-var-name         ^assign-or-rule    doExprStart
    '!'                  n rev-option
    ';'                  n start                                                  # ignore empty rules.
    eof                    exit
    default                term                  ^break-rule-end    doExprStart
    
#
#  break-rule-end:  Returned from doing a break-rule expression.
#                   This state is never named as a next-state; it is only pushed by the
#                   states that begin a rule, and becomes current when the expression
#                   scan pops.  White space is skipped, a ';' ends the rule and returns
#                   to start, and any other character is a rule error.
#
break-rule-end:
    ';'	                 n start                                    doEndOfRule
    white_space          n break-rule-end
    default                errorDeath                               doRuleError
     

#
#   !               We've just scanned a '!', indicating either a !!key word flag or a
#                   !Reverse rule.
#
#                   A second '!' is consumed and starts option key word scanning.  Anything
#                   else begins a reverse rule; that character is left in place for
#                   reverse-rule to examine.
#
#                   No state is pushed on the reverse-rule row: reverse-rule itself pushes
#                   break-rule-end before scanning the expression.  Pushing it here as well
#                   would leave one extra entry on the state stack for every reverse rule,
#                   because only one "pop" happens at the end of the expression.
#
rev-option:
    '!'                  n option-scan1
    default                reverse-rule                             doReverseDir
    
#
#  option-scan1    Both '!' characters of a "!!" prefix have been consumed.  The option
#                  key word must begin with a name_start_char, which is consumed here;
#                  anything else is a rule error.
#
option-scan1:
    name_start_char      n option-scan2                             doOptionStart
    default                errorDeath                               doRuleError
    
#
#  option-scan2    Consume the remaining name_char characters of the option key word.
#                  The first character that is not a name_char ends the key word; it is
#                  left in place for option-scan3.
#
option-scan2:
    name_char            n option-scan2
    default                option-scan3                             doOptionEnd
    
#
#  option-scan3    After the option key word: skip white space, then require the
#                  terminating ';', which returns to start.  No action is attached to
#                  the ';' row.  Any other character is a rule error.
#
option-scan3:
    ';'                  n start
    white_space          n option-scan3
    default                errorDeath                               doRuleError
    

#
#  reverse-rule    Entered from rev-option when a single '!' was followed by something
#                  other than a second '!'.  Without consuming any input, push
#                  break-rule-end and hand the expression to term, in the same way that
#                  the default row of the start state does for an ordinary rule.
#
reverse-rule:
    default                term                   ^break-rule-end   doExprStart
    
    
#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
#         Single characters and '.' are consumed here and go straight to expr-mod.
#         '[' and '$' are left in place for their own scanning states, with the state
#         to resume in pushed first.  '(' is consumed and starts a nested term; expr-mod
#         is pushed so that it becomes current when the nested expression pops.
#         "escaped" is listed ahead of rule_char, and the first matching row wins.
#
term:
    escaped              n expr-mod                                 doRuleChar
    white_space          n term
    rule_char            n expr-mod                                 doRuleChar
    '['                    scan-unicode-set      ^expr-mod
    '('                  n term                  ^expr-mod          doLParen
    '$'                    scan-var-name         ^term-var-ref
    '.'                  n expr-mod                                 doDotAny
    default                errorDeath                               doRuleError
    
    
#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
#
#                 Reached either by a pop after variable name scanning (term pushes this
#                 state before going to scan-var-name), or directly from assign-or-rule.
#                 No input is consumed here; scanning continues in expr-mod.
#
term-var-ref:
    default                expr-mod                                 doCheckVarDef
    
    
#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
#                 White space before the operator is skipped.  An operator is consumed
#                 and its action run; any other character is left in place.  Either way
#                 scanning continues in expr-cont, so at most one operator is taken here.
#
expr-mod:
    white_space          n  expr-mod
    '*'                  n  expr-cont                               doUnaryOpStar
    '+'                  n  expr-cont                               doUnaryOpPlus
    '?'                  n  expr-cont                               doUnaryOpQuestion
    default                 expr-cont 
    
    
#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
#                 A character that can begin another term is NOT consumed: the
#                 concatenation action runs and term re-examines the same character.
#                 '/' is likewise left in place for look-ahead.  '{', '|' and ')' are
#                 consumed here.  Anything else ends the expression: the state pushed
#                 by whoever started it is popped, and the character is left for it.
#
expr-cont:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished
    

#
#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                 remainder of the expression matches.
#
#                 Generate a parse tree as if this was a special kind of input symbol
#                 appearing in an otherwise normal concatenation expression.
#
#                 The states that branch here do so on '/' without consuming it, so the
#                 '/' row is the one expected to match; it consumes the character.  The
#                 default row has no error action attached and simply goes to errorDeath.
#
look-ahead:
    '/'                   n expr-cont-no-slash                      doSlash
    default                 errorDeath


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
#                        Entered from look-ahead immediately after a '/' has been consumed.
#                        White space loops back to this state (not to expr-cont), so that
#                        the ban on a second '/' still holds when spaces follow the first.
#                        '{' is listed explicitly so that a tag following the '/' is
#                        accepted with or without white space in between.
#                        A '/' seen here falls through to the default row, which pops.
#
expr-cont-no-slash:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-slash
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished


#
#   tags             scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
#
#                    tag-open is entered with the '{' already consumed.  Leading white
#                    space is skipped.  The first digit is NOT consumed here; it is left
#                    for tag-value after doStartTagValue runs.  Anything other than a
#                    digit (including an immediate '}') is reported as a tag error.
#
tag-open:
    white_space          n  tag-open
    digit_char              tag-value                               doStartTagValue
    default                 errorDeath                              doTagExpectedError
    
#
#  tag-value         Consume the digits of the tag, running doTagDigit for each one.
#                    White space ends the digits (it is consumed); a '}' ends them too
#                    but is left in place for tag-close to consume.  Because white space
#                    moves on to tag-close, digits may not resume after a space.
#
tag-value:
    white_space          n  tag-close
    '}'                     tag-close
    digit_char           n  tag-value                               doTagDigit
    default                 errorDeath                              doTagExpectedError
    
#
#  tag-close         After the tag digits: skip white space, then require and consume
#                    the closing '}'.  Scanning continues in expr-cont-no-tag.
#                    Any other character is reported as a tag error.
#
tag-close:
    white_space          n  tag-close
    '}'                  n  expr-cont-no-tag                        doTagValue
    default                 errorDeath                              doTagExpectedError
    
    
#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
#                      Entered from tag-close after a complete tag.  There is no '{' row,
#                      so a '{' seen here falls through to the default row and pops.
#                      White space loops back to this state, so the restriction still
#                      applies after intervening spaces.
#
expr-cont-no-tag:
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-tag
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished
    
    
#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
#                    Since callers branch here on '$' without consuming it, the default
#                    row is not expected to match; it carries no error action.
#
scan-var-name:
   '$'                  n scan-var-start                            doStartVariableName
   default                errorDeath


#
#  scan-var-start    The '$' has been consumed.  The next character must be a
#                    name_start_char, which is consumed; otherwise a variable name
#                    was expected and the error action is run.
#
scan-var-start:
    name_start_char      n scan-var-body
    default                errorDeath                               doVariableNameExpectedErr
    
#
#  scan-var-body     Consume the remaining name_char characters of the variable name.
#                    The first character that is not a name_char ends the name; it is
#                    left in place and the return state pushed by the caller is popped.
#
scan-var-body:
    name_char            n scan-var-body
    default                pop                                      doEndVariableName
    
    
#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
#                     The caller must have pushed the state to resume in; every
#                     successful row pops it.
#                     NOTE(review): within this file only term branches here, and only
#                     on '['.  Nothing routes 'p' or 'P' to this state; presumably those
#                     rows are meant for \p / \P property sets -- confirm.
#
scan-unicode-set:
    '['                   n pop                                      doScanUnicodeSet
    'p'                   n pop                                      doScanUnicodeSet
    'P'                   n pop                                      doScanUnicodeSet
    default		    errorDeath 
    
    
#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
#                    The start state pushes this state before branching to scan-var-name.
#                    White space after the name is skipped.  '=' is consumed and the right
#                    hand side is scanned as an expression, with assign-end pushed as the
#                    state to return to.  Otherwise the character is left in place,
#                    break-rule-end is pushed, and the variable is treated as the first
#                    term of a rule.
#
assign-or-rule:
    white_space          n assign-or-rule
    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule


#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
#                    There is no white_space row here; the expression states skip
#                    white space themselves before popping to this state.
#
assign-end:
    ';'                  n start                                    doEndAssign
    default                errorDeath                               doRuleErrorAssignExpr
    
    
#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
#
#               The single row matches any character, consumes it, runs doExit and
#               stays in this state.  Note that a few rows elsewhere in this table
#               branch here without any error action of their own.
#
errorDeath:
    default              n errorDeath                               doExit
Commit	Line	Data
b75a7d8f A	1
	2	#*****************************************************************************
	3	#
374ca955	4	# Copyright (C) 2002-2003, International Business Machines Corporation and others.
b75a7d8f A	5	# All Rights Reserved.
	6	#
	7	#*****************************************************************************
	8	#
	9	# file: rbbirpt.txt
	10	# ICU Break Iterator Rule Parser State Table
	11	#
	12	# This state table is used when reading and parsing a set of RBBI rules
	13	# The rule parser uses a state machine; the data in this file define the
	14	# state transitions that occur for each input character.
	15	#
	16	# *** This file defines the RBBI rule grammar. This is it.
	17	# *** The determination of what is accepted is here.
	18	#
	19	# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
	20	# that are then built with the rule parser.
	21	#
	22
	23	#
	24	# Here is the syntax of the state definitions in this file:
	25	#
	26	#
	27	#StateName:
	28	# input-char n next-state ^push-state action
	29	# input-char n next-state ^push-state action
	30	# \| \| \| \| \|
	31	# \| \| \| \| \|--- action to be performed by state machine
	32	# \| \| \| \| See function RBBIRuleScanner::doParseActions()
	33	# \| \| \| \|
	34	# \| \| \| \|--- Push this named state onto the state stack.
	35	# \| \| \| Later, when next state is specified as "pop",
	36	# \| \| \| the pushed state will become the current state.
	37	# \| \| \|
	38	# \| \| \|--- Transition to this state if the current input character matches the input
	39	# \| \| character or char class in the left hand column. "pop" causes the next
	40	# \| \| state to be popped from the state stack.
	41	# \| \|
	42	# \| \|--- When making the state transition specified on this line, advance to the next
	43	# \| character from the input only if 'n' appears here.
	44	# \|
	45	# \|--- Character or named character classes to test for. If the current character being scanned
	46	# matches, peform the actions and go to the state specified on this line.
	47	# The input character is tested sequentally, in the order written. The characters and
	48	# character classes tested for do not need to be mutually exclusive. The first match wins.
	49	#
	50
	51
	52
	53
	54	#
	55	# start state, scan position is at the beginning of the rules file, or in between two rules.
	56	#
	57	start:
	58	escaped term ^break-rule-end doExprStart
	59	white_space n start
	60	'$' scan-var-name ^assign-or-rule doExprStart
374ca955	61	'!' n rev-option
b75a7d8f A	62	';' n start # ignore empty rules.
	63	eof exit
	64	default term ^break-rule-end doExprStart
	65
	66	#
	67	# break-rule-end: Returned from doing a break-rule expression.
	68	#
	69	break-rule-end:
	70	';' n start doEndOfRule
	71	white_space n break-rule-end
	72	default errorDeath doRuleError
	73
	74
	75	#
374ca955 A	76	# ! We've just scanned a '!', indicating either a !!key word flag or a
374ca955 A	77	# !Reverse rule.
b75a7d8f	78	#
374ca955 A	79	rev-option:
	80	'!' n option-scan1
	81	default reverse-rule ^break-rule-end doReverseDir
	82
	83	option-scan1:
	84	name_start_char n option-scan2 doOptionStart
	85	default errorDeath doRuleError
	86
	87	option-scan2:
	88	name_char n option-scan2
	89	default option-scan3 doOptionEnd
	90
	91	option-scan3:
	92	';' n start
	93	white_space n option-scan3
	94	default errorDeath doRuleError
	95
	96
b75a7d8f A	97	reverse-rule:
	98	default term ^break-rule-end doExprStart
	99
	100
	101	#
	102	# term. Eat through a single rule character, or a composite thing, which
	103	# could be a parenthesized expression, a variable name, or a Unicode Set.
	104	#
	105	term:
	106	escaped n expr-mod doRuleChar
	107	white_space n term
	108	rule_char n expr-mod doRuleChar
	109	'[' scan-unicode-set ^expr-mod
	110	'(' n term ^expr-mod doLParen
	111	'$' scan-var-name ^term-var-ref
	112	'.' n expr-mod doDotAny
	113	default errorDeath doRuleError
	114
	115
	116
	117	#
	118	# term-var-ref We've just finished scanning a reference to a $variable.
	119	# Check that the variable was defined.
	120	# The variable name scanning is in common with assignment statements,
	121	# so the check can't be done there.
	122	term-var-ref:
	123	default expr-mod doCheckVarDef
	124
	125
	126	#
	127	# expr-mod We've just finished scanning a term, now look for the optional
	128	# trailing '*', '?', '+'
	129	#
	130	expr-mod:
	131	white_space n expr-mod
	132	'*' n expr-cont doUnaryOpStar
	133	'+' n expr-cont doUnaryOpPlus
	134	'?' n expr-cont doUnaryOpQuestion
	135	default expr-cont
	136
	137
	138	#
	139	# expr-cont Expression, continuation. At a point where additional terms are
	140	# allowed, but not required.
	141	#
	142	expr-cont:
	143	escaped term doExprCatOperator
	144	white_space n expr-cont
	145	rule_char term doExprCatOperator
	146	'[' term doExprCatOperator
	147	'(' term doExprCatOperator
	148	'$' term doExprCatOperator
	149	'.' term doExprCatOperator
	150	'/' look-ahead doExprCatOperator
	151	'{' n tag-open doExprCatOperator
	152	'\|' n term doExprOrOperator
	153	')' n pop doExprRParen
	154	default pop doExprFinished
	155
	156
	157	#
	158	# look-ahead Scanning a '/', which identifies a break point, assuming that the
	159	# remainder of the expression matches.
	160	#
161	# Generate a parse tree as if this was a special kind of input symbol
162	# appearing in an otherwise normal concatenation expression.
163	#
164	look-ahead:
165	'/' n expr-cont-no-slash doSlash
166	default errorDeath
167
168
169	#
170	# expr-cont-no-slash Expression, continuation. At a point where additional terms are
171	# allowed, but not required. Just like
172	# expr-cont, above, except that no '/'
173	# look-ahead symbol is permitted.
174	#
175	expr-cont-no-slash:
176	escaped term doExprCatOperator
177	white_space n expr-cont
178	rule_char term doExprCatOperator
179	'[' term doExprCatOperator
180	'(' term doExprCatOperator
181	'$' term doExprCatOperator
182	'.' term doExprCatOperator
183	'\|' n term doExprOrOperator
184	')' n pop doExprRParen
185	default pop doExprFinished
186
187
188	#
189	# tags scanning a '{', the opening delimiter for a tag that identifies
190	# the kind of match. Scan the whole {dddd} tag, where d=digit
191	#
192	tag-open:
193	white_space n tag-open
194	digit_char tag-value doStartTagValue
195	default errorDeath doTagExpectedError
196
197	tag-value:
198	white_space n tag-close
199	'}' tag-close
200	digit_char n tag-value doTagDigit
201	default errorDeath doTagExpectedError
202
203	tag-close:
204	white_space n tag-close
205	'}' n expr-cont-no-tag doTagValue
206	default errorDeath doTagExpectedError
207
208
209
210	#
211	# expr-cont-no-tag Expression, continuation. At a point where additional terms are
212	# allowed, but not required. Just like
213	# expr-cont, above, except that no "{ddd}"
214	# tagging is permitted.
215	#
216	expr-cont-no-tag:
217	escaped term doExprCatOperator
218	white_space n expr-cont-no-tag
219	rule_char term doExprCatOperator
220	'[' term doExprCatOperator
221	'(' term doExprCatOperator
222	'$' term doExprCatOperator
223	'.' term doExprCatOperator
224	'/' look-ahead doExprCatOperator
225	'\|' n term doExprOrOperator
226	')' n pop doExprRParen
227	default pop doExprFinished
228
229
230
231
232	#
233	# Variable Name Scanning.
234	#
235	# The state that branched to here must have pushed a return state
236	# to go to after completion of the variable name scanning.
237	#
238	# The current input character must be the $ that introduces the name.
239	# The $ is consummed here rather than in the state that first detected it
240	# so that the doStartVariableName action only needs to happen in one
241	# place (here), and the other states don't need to worry about it.
242	#
243	scan-var-name:
244	'$' n scan-var-start doStartVariableName
245	default errorDeath
246
247
248	scan-var-start:
249	name_start_char n scan-var-body
250	default errorDeath doVariableNameExpectedErr
251
252	scan-var-body:
253	name_char n scan-var-body
254	default pop doEndVariableName
255
256
257
258	#
259	# scan-unicode-set Unicode Sets are parsed by the the UnicodeSet class.
260	# Within the RBBI parser, after finding the first character
261	# of a Unicode Set, we just hand the rule input at that
262	# point of to the Unicode Set constructor, then pick
263	# up parsing after the close of the set.
264	#
265	# The action for this state invokes the UnicodeSet parser.
266	#
267	scan-unicode-set:
268	'[' n pop doScanUnicodeSet
269	'p' n pop doScanUnicodeSet
270	'P' n pop doScanUnicodeSet
271	default errorDeath
272
273
274
275
276
277
278
279	#
280	# assign-or-rule. A $variable was encountered at the start of something, could be
281	# either an assignment statement or a rule, depending on whether an '='
282	# follows the variable name. We get to this state when the variable name
283	# scanning does a return.
284	#
285	assign-or-rule:
286	white_space n assign-or-rule
287	'=' n term ^assign-end doStartAssign # variable was target of assignment
288	default term-var-ref ^break-rule-end # variable was a term in a rule
289
290
291
292	#
293	# assign-end This state is entered when the end of the expression on the
294	# right hand side of an assignment is found. We get here via
295	# a pop; this state is pushed when the '=' in an assignment is found.
296	#
297	# The only thing allowed at this point is a ';'. The RHS of an
298	# assignment must look like a rule expression, and we come here
299	# when what is being scanned no longer looks like an expression.
300	#
301	assign-end:
302	';' n start doEndAssign
303	default errorDeath doRuleErrorAssignExpr
304
305
306
307	#
308	# errorDeath. This state is specified as the next state whenever a syntax error
309	# in the source rules is detected. Barring bugs, the state machine will never
310	# actually get here, but will stop because of the action associated with the error.
311	# But, just in case, this state asks the state machine to exit.
312	errorDeath:
313	default n errorDeath doExit
314
315