[apple/icu.git] / icuSources / common / rbbirpt.txt


#*****************************************************************************
#
#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  rbbirpt.txt
#  ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action    
#   input-char           n next-state           ^push-state     action    
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            


#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
    escaped                term                  ^break-rule-end    doExprStart       # rule begins with an escaped char; not consumed here, 'term' re-reads it
    white_space          n start                                                      # skip white space between rules
    '^'                  n start-after-caret     ^break-rule-end    doNoChain         # consume '^'; the return state break-rule-end is pushed here
    '$'                    scan-var-name         ^assign-or-rule    doExprStart       # '$' is left for scan-var-name; assign-or-rule is the return state
    '!'                  n rev-option                                                 # consume '!'; rev-option checks for a second '!'
    ';'                  n start                                                  # ignore empty rules.
    eof                    exit                                                       # end of input; next state is 'exit'
    default                term                  ^break-rule-end    doExprStart       # anything else is handed to 'term' unconsumed
    
#
#  break-rule-end:  Returned from doing a break-rule expression.
#
break-rule-end:
    ';'	                 n start                                    doEndOfRule    # ';' ends the rule; go back to 'start' (no pop)
    white_space          n break-rule-end                                          # skip white space before the ';'
    default                errorDeath                               doRuleError    # anything other than ';' after the expression is an error
     
#
# start of a rule, after having seen a '^' (inhibits rule chain in).
#     Similar to the main 'start' state in most respects, except
#          - empty rule is an error.
#          - A second '^' is an error.
#
start-after-caret:
    escaped                term                                     doExprStart    # no push: 'start' already pushed break-rule-end on the '^' row
    white_space          n start-after-caret                                       # skip white space after the '^'
    '^'                    errorDeath                               doRuleError    # two '^'s
    '$'                    scan-var-name         ^term-var-ref      doExprStart    # name scan returns to term-var-ref rather than assign-or-rule
    ';'                    errorDeath                               doRuleError    # ^ ;
    eof                    errorDeath                               doRuleError    # '^' followed by end of input
    default                term                                     doExprStart    # no push, same reason as the 'escaped' row
 
#
#   !               We've just scanned a '!', indicating either a !!key word flag or a
#                   !Reverse rule.
#
rev-option:
    '!'                  n option-scan1                                            # second '!' consumed: this is a !!keyword
    default                reverse-rule           ^break-rule-end   doReverseDir   # single '!': current char is left for reverse-rule
    
option-scan1:
    name_start_char      n option-scan2                             doOptionStart  # first character of the keyword after "!!"
    default                errorDeath                               doRuleError    # "!!" not followed by a name-start character
    
option-scan2:
    name_char            n option-scan2                                            # consume the remaining characters of the keyword
    default                option-scan3                             doOptionEnd    # first non-name char ends the keyword; it is not consumed
    
option-scan3:
    ';'                  n start                                                   # keyword statement complete; back to 'start'
    white_space          n option-scan3                                            # white space allowed between the keyword and ';'
    default                errorDeath                               doRuleError    # only white space and ';' may follow the keyword
    

#
#  reverse-rule   Entered from rev-option after a single '!'.  rev-option has already
#                 pushed break-rule-end as the return state for the rule expression, so
#                 nothing is pushed here.  (Pushing it a second time would leave an entry
#                 on the state stack that is never popped: the expression pops once, and
#                 break-rule-end goes back to 'start' without a pop.)
#                 The current character is not consumed; 'term' re-reads it.
#
reverse-rule:
    default                term                                     doExprStart
    
    
#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    escaped              n expr-mod                                 doRuleChar     # escaped character taken as a term
    white_space          n term                                                    # white space before a term is skipped
    rule_char            n expr-mod                                 doRuleChar     # ordinary rule character taken as a term
    '['                    scan-unicode-set      ^expr-mod                         # '[' is left in place for scan-unicode-set; return to expr-mod
    '('                  n term                  ^expr-mod          doLParen       # nested expression; the pop at its end resumes at expr-mod
    '$'                    scan-var-name         ^term-var-ref                     # '$' is consumed by scan-var-name, then term-var-ref runs
    '.'                  n expr-mod                                 doDotAny
    default                errorDeath                               doRuleError    # current char cannot start a term
    
    
#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
term-var-ref:
    default                expr-mod                                 doCheckVarDef  # single unconditional row; no input is consumed
    
    
#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
expr-mod:
    white_space          n  expr-mod                                               # white space may separate a term from its operator
    '*'                  n  expr-cont                               doUnaryOpStar
    '+'                  n  expr-cont                               doUnaryOpPlus
    '?'                  n  expr-cont                               doUnaryOpQuestion
    default                 expr-cont                                              # no operator; the current char is re-examined by expr-cont
    
    
#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
expr-cont:
    escaped                 term                                    doExprCatOperator    # rows that go to 'term' do not consume the char; 'term' re-reads it
    white_space          n  expr-cont                                                    # skip white space between terms
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator    # the '/' itself is consumed by look-ahead
    '{'                  n  tag-open                                doExprCatOperator    # '{' consumed; tag-open scans the rest of the {ddd} tag
    '|'                  n  term                                    doExprOrOperator     # '|' consumed; a term must follow
    ')'                  n  pop                                     doExprRParen         # ')' consumed; return to the most recently pushed state
    default                 pop                                     doExprFinished       # anything else ends the expression; char is left for the popped state
    

#
#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                 remainder of the expression matches.
#
#                 Generate a parse tree as if this was a special kind of input symbol
#                 appearing in an otherwise normal concatenation expression.
#
look-ahead:
    '/'                   n expr-cont-no-slash                      doSlash        # consume the '/'; a second '/' may not follow immediately
    default                 errorDeath                                             # no error action here; NOTE(review): looks unreachable, the visible entries to this state all arrive on '/'


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
#  Note: the white_space row loops back to this state (as expr-cont-no-tag does for its own
#        white space) so that the "no '/'" restriction still applies after intervening
#        white space.  Routing white space to expr-cont would accept "/ /" and "/ {ddd}"
#        while rejecting "//" and "/{ddd}".
expr-cont-no-slash:
    escaped                 term                                    doExprCatOperator    # rows that go to 'term' do not consume the char; 'term' re-reads it
    white_space          n  expr-cont-no-slash                                           # skip white space, staying in the no-slash state
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '|'                  n  term                                    doExprOrOperator     # '|' consumed; a term must follow
    ')'                  n  pop                                     doExprRParen         # ')' consumed; return to the most recently pushed state
    default                 pop                                     doExprFinished       # includes '/' and '{', which have no row in this state


#
#   tags             scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
#
tag-open:
    white_space          n  tag-open                                                     # white space allowed after the '{'
    digit_char              tag-value                               doStartTagValue      # first digit is not consumed here; tag-value consumes it
    default                 errorDeath                              doTagExpectedError   # '{' not followed by a digit
    
tag-value:
    white_space          n  tag-close                                                    # white space ends the digits; tag-close then accepts only '}'
    '}'                     tag-close                                                    # '}' is left for tag-close to consume
    digit_char           n  tag-value                               doTagDigit           # consume one digit and stay here
    default                 errorDeath                              doTagExpectedError   # non-digit inside the tag
    
tag-close:
    white_space          n  tag-close                                                    # white space allowed before the '}'
    '}'                  n  expr-cont-no-tag                        doTagValue           # tag complete; continue where a second tag is not accepted
    default                 errorDeath                              doTagExpectedError   # missing '}'
    
    
#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
expr-cont-no-tag:
    escaped                 term                                    doExprCatOperator    # rows that go to 'term' do not consume the char; 'term' re-reads it
    white_space          n  expr-cont-no-tag                                             # stays in this state, so the no-tag restriction survives white space
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator    # the '/' itself is consumed by look-ahead
    '|'                  n  term                                    doExprOrOperator     # '|' consumed; a term must follow
    ')'                  n  pop                                     doExprRParen         # ')' consumed; return to the most recently pushed state
    default                 pop                                     doExprFinished       # '{' has no row here, so it falls through to this default
    
    
#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
   '$'                  n scan-var-start                            doStartVariableName   # consume the '$' that introduces the name
   default                errorDeath                                                      # no error action here; NOTE(review): looks unreachable, the visible entries to this state all arrive on '$'


scan-var-start:
    name_start_char      n scan-var-body                                                  # first character of the name after '$'
    default                errorDeath                               doVariableNameExpectedErr   # '$' not followed by a name-start character
    
scan-var-body:
    name_char            n scan-var-body                                           # consume the remaining characters of the name
    default                pop                                      doEndVariableName   # first non-name char ends the name; it is left for the popped state
    
    
#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    '['                   n pop                                      doScanUnicodeSet    # return to the state pushed by whoever branched here
    'p'                   n pop                                      doScanUnicodeSet    # NOTE(review): the only visible entry to this state (term, on '[') cannot arrive with 'p' or 'P'
    'P'                   n pop                                      doScanUnicodeSet
    default		    errorDeath                                                       # no error action on this row
    
    
#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    white_space          n assign-or-rule                                                   # white space allowed between the variable name and '='
    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule


#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    ';'                  n start                                    doEndAssign    # ';' ends the assignment; back to 'start'
    default                errorDeath                               doRuleErrorAssignExpr   # no white_space row here; the expr-cont states consume white space before popping
    
    
#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    default              n errorDeath                               doExit         # any input: advance, stay in this state, and request exit
Commit	Line	Data
b75a7d8f A	1
	2	#*****************************************************************************
	3	#
2ca993e8	4	# Copyright (C) 2002-2016, International Business Machines Corporation and others.
b75a7d8f A	5	# All Rights Reserved.
	6	#
	7	#*****************************************************************************
	8	#
	9	# file: rbbirpt.txt
	10	# ICU Break Iterator Rule Parser State Table
	11	#
	12	# This state table is used when reading and parsing a set of RBBI rules
	13	# The rule parser uses a state machine; the data in this file define the
	14	# state transitions that occur for each input character.
	15	#
	16	# *** This file defines the RBBI rule grammar. This is it.
	17	# *** The determination of what is accepted is here.
	18	#
	19	# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
	20	# that are then built with the rule parser.
	21	#
2ca993e8	22	# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
b75a7d8f A	23
	24	#
	25	# Here is the syntax of the state definitions in this file:
	26	#
	27	#
	28	#StateName:
	29	# input-char n next-state ^push-state action
	30	# input-char n next-state ^push-state action
	31	# \| \| \| \| \|
	32	# \| \| \| \| \|--- action to be performed by state machine
	33	# \| \| \| \| See function RBBIRuleScanner::doParseActions()
	34	# \| \| \| \|
	35	# \| \| \| \|--- Push this named state onto the state stack.
	36	# \| \| \| Later, when next state is specified as "pop",
	37	# \| \| \| the pushed state will become the current state.
	38	# \| \| \|
	39	# \| \| \|--- Transition to this state if the current input character matches the input
	40	# \| \| character or char class in the left hand column. "pop" causes the next
	41	# \| \| state to be popped from the state stack.
	42	# \| \|
	43	# \| \|--- When making the state transition specified on this line, advance to the next
	44	# \| character from the input only if 'n' appears here.
	45	# \|
	46	# \|--- Character or named character classes to test for. If the current character being scanned
	47	# matches, peform the actions and go to the state specified on this line.
	48	# The input character is tested sequentally, in the order written. The characters and
	49	# character classes tested for do not need to be mutually exclusive. The first match wins.
	50	#
	51
	52
	53
	54
	55	#
	56	# start state, scan position is at the beginning of the rules file, or in between two rules.
	57	#
	58	start:
	59	escaped term ^break-rule-end doExprStart
	60	white_space n start
2ca993e8	61	'^' n start-after-caret ^break-rule-end doNoChain
b75a7d8f	62	'$' scan-var-name ^assign-or-rule doExprStart
374ca955	63	'!' n rev-option
b75a7d8f A	64	';' n start # ignore empty rules.
	65	eof exit
	66	default term ^break-rule-end doExprStart
	67
	68	#
	69	# break-rule-end: Returned from doing a break-rule expression.
	70	#
	71	break-rule-end:
	72	';' n start doEndOfRule
	73	white_space n break-rule-end
	74	default errorDeath doRuleError
	75
2ca993e8 A	76	#
	77	# start of a rule, after having seen a '^' (inhibits rule chain in).
	78	# Similar to the main 'start' state in most respects, except
	79	# - empty rule is an error.
	80	# - A second '^' is an error.
	81	#
	82	start-after-caret:
	83	escaped term doExprStart
	84	white_space n start-after-caret
	85	'^' errorDeath doRuleError # two '^'s
	86	'$' scan-var-name ^term-var-ref doExprStart
	87	';' errorDeath doRuleError # ^ ;
	88	eof errorDeath doRuleError
	89	default term doExprStart
	90
b75a7d8f	91	#
374ca955 A	92	# ! We've just scanned a '!', indicating either a !!key word flag or a
374ca955 A	93	# !Reverse rule.
b75a7d8f	94	#
374ca955 A	95	rev-option:
	96	'!' n option-scan1
	97	default reverse-rule ^break-rule-end doReverseDir
	98
	99	option-scan1:
	100	name_start_char n option-scan2 doOptionStart
	101	default errorDeath doRuleError
	102
	103	option-scan2:
	104	name_char n option-scan2
	105	default option-scan3 doOptionEnd
	106
	107	option-scan3:
	108	';' n start
	109	white_space n option-scan3
	110	default errorDeath doRuleError
	111
	112
b75a7d8f A	113	reverse-rule:
	114	default term ^break-rule-end doExprStart
	115
	116
	117	#
	118	# term. Eat through a single rule character, or a composite thing, which
	119	# could be a parenthesized expression, a variable name, or a Unicode Set.
	120	#
	121	term:
	122	escaped n expr-mod doRuleChar
	123	white_space n term
	124	rule_char n expr-mod doRuleChar
	125	'[' scan-unicode-set ^expr-mod
	126	'(' n term ^expr-mod doLParen
	127	'$' scan-var-name ^term-var-ref
	128	'.' n expr-mod doDotAny
	129	default errorDeath doRuleError
	130
	131
	132
	133	#
	134	# term-var-ref We've just finished scanning a reference to a $variable.
	135	# Check that the variable was defined.
	136	# The variable name scanning is in common with assignment statements,
	137	# so the check can't be done there.
	138	term-var-ref:
	139	default expr-mod doCheckVarDef
	140
	141
	142	#
	143	# expr-mod We've just finished scanning a term, now look for the optional
	144	# trailing '*', '?', '+'
	145	#
	146	expr-mod:
	147	white_space n expr-mod
	148	'*' n expr-cont doUnaryOpStar
	149	'+' n expr-cont doUnaryOpPlus
	150	'?' n expr-cont doUnaryOpQuestion
	151	default expr-cont
	152
	153
	154	#
	155	# expr-cont Expression, continuation. At a point where additional terms are
	156	# allowed, but not required.
	157	#
	158	expr-cont:
	159	escaped term doExprCatOperator
	160	white_space n expr-cont
	161	rule_char term doExprCatOperator
	162	'[' term doExprCatOperator
	163	'(' term doExprCatOperator
	164	'$' term doExprCatOperator
	165	'.' term doExprCatOperator
	166	'/' look-ahead doExprCatOperator
	167	'{' n tag-open doExprCatOperator
	168	'\|' n term doExprOrOperator
	169	')' n pop doExprRParen
	170	default pop doExprFinished
	171
	172
	173	#
	174	# look-ahead Scanning a '/', which identifies a break point, assuming that the
	175	# remainder of the expression matches.
	176	#
177	# Generate a parse tree as if this was a special kind of input symbol
178	# appearing in an otherwise normal concatenation expression.
179	#
180	look-ahead:
181	'/' n expr-cont-no-slash doSlash
182	default errorDeath
183
184
185	#
186	# expr-cont-no-slash Expression, continuation. At a point where additional terms are
187	# allowed, but not required. Just like
188	# expr-cont, above, except that no '/'
189	# look-ahead symbol is permitted.
190	#
191	expr-cont-no-slash:
192	escaped term doExprCatOperator
193	white_space n expr-cont
194	rule_char term doExprCatOperator
195	'[' term doExprCatOperator
196	'(' term doExprCatOperator
197	'$' term doExprCatOperator
198	'.' term doExprCatOperator
199	'\|' n term doExprOrOperator
200	')' n pop doExprRParen
201	default pop doExprFinished
202
203
204	#
205	# tags scanning a '{', the opening delimiter for a tag that identifies
206	# the kind of match. Scan the whole {dddd} tag, where d=digit
207	#
208	tag-open:
209	white_space n tag-open
210	digit_char tag-value doStartTagValue
211	default errorDeath doTagExpectedError
212
213	tag-value:
214	white_space n tag-close
215	'}' tag-close
216	digit_char n tag-value doTagDigit
217	default errorDeath doTagExpectedError
218
219	tag-close:
220	white_space n tag-close
221	'}' n expr-cont-no-tag doTagValue
222	default errorDeath doTagExpectedError
223
224
225
226	#
227	# expr-cont-no-tag Expression, continuation. At a point where additional terms are
228	# allowed, but not required. Just like
229	# expr-cont, above, except that no "{ddd}"
230	# tagging is permitted.
231	#
232	expr-cont-no-tag:
233	escaped term doExprCatOperator
234	white_space n expr-cont-no-tag
235	rule_char term doExprCatOperator
236	'[' term doExprCatOperator
237	'(' term doExprCatOperator
238	'$' term doExprCatOperator
239	'.' term doExprCatOperator
240	'/' look-ahead doExprCatOperator
241	'\|' n term doExprOrOperator
242	')' n pop doExprRParen
243	default pop doExprFinished
244
245
246
247
248	#
249	# Variable Name Scanning.
250	#
251	# The state that branched to here must have pushed a return state
252	# to go to after completion of the variable name scanning.
253	#
254	# The current input character must be the $ that introduces the name.
255	# The $ is consummed here rather than in the state that first detected it
256	# so that the doStartVariableName action only needs to happen in one
257	# place (here), and the other states don't need to worry about it.
258	#
259	scan-var-name:
260	'$' n scan-var-start doStartVariableName
261	default errorDeath
262
263
264	scan-var-start:
265	name_start_char n scan-var-body
266	default errorDeath doVariableNameExpectedErr
267
268	scan-var-body:
269	name_char n scan-var-body
270	default pop doEndVariableName
271
272
273
274	#
275	# scan-unicode-set Unicode Sets are parsed by the the UnicodeSet class.
276	# Within the RBBI parser, after finding the first character
277	# of a Unicode Set, we just hand the rule input at that
278	# point of to the Unicode Set constructor, then pick
279	# up parsing after the close of the set.
280	#
281	# The action for this state invokes the UnicodeSet parser.
282	#
283	scan-unicode-set:
284	'[' n pop doScanUnicodeSet
285	'p' n pop doScanUnicodeSet
286	'P' n pop doScanUnicodeSet
287	default errorDeath
288
289
290
291
292
293
294
295	#
296	# assign-or-rule. A $variable was encountered at the start of something, could be
297	# either an assignment statement or a rule, depending on whether an '='
298	# follows the variable name. We get to this state when the variable name
299	# scanning does a return.
300	#
301	assign-or-rule:
302	white_space n assign-or-rule
303	'=' n term ^assign-end doStartAssign # variable was target of assignment
304	default term-var-ref ^break-rule-end # variable was a term in a rule
305
306
307
308	#
309	# assign-end This state is entered when the end of the expression on the
310	# right hand side of an assignment is found. We get here via
311	# a pop; this state is pushed when the '=' in an assignment is found.
312	#
313	# The only thing allowed at this point is a ';'. The RHS of an
314	# assignment must look like a rule expression, and we come here
315	# when what is being scanned no longer looks like an expression.
316	#
317	assign-end:
318	';' n start doEndAssign
319	default errorDeath doRuleErrorAssignExpr
320
321
322
323	#
324	# errorDeath. This state is specified as the next state whenever a syntax error
325	# in the source rules is detected. Barring bugs, the state machine will never
326	# actually get here, but will stop because of the action associated with the error.
327	# But, just in case, this state asks the state machine to exit.
328	errorDeath:
329	default n errorDeath doExit
330
331