[apple/icu.git] / icuSources / common / rbbirpt.txt


#*****************************************************************************
#
#   Copyright (C) 2016 and later: Unicode, Inc. and others.
#   License & terms of use: http://www.unicode.org/copyright.html#License
#
#*****************************************************************************
#*****************************************************************************
#
#   Copyright (C) 2002-2016, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  rbbirpt.txt
#  ICU Break Iterator Rule Parser State Table
#
#     This state table is used when reading and parsing a set of RBBI rules
#     The rule parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the RBBI rule grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
#     that are then built with the rule parser.
#
#    perl rbbicst.pl    < rbbirpt.txt > rbbirpt.h

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action    
#   input-char           n next-state           ^push-state     action    
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            


#
#  start state, scan position is at the beginning of the rules file, or in between two rules.
#
start:
    # Entry state.  Rows are tried top to bottom; the first match wins.
    #   escaped / default : begin an ordinary rule.  break-rule-end is pushed as the return
    #                       state and the character is left in place (no 'n') for 'term'.
    #   white_space       : consumed and ignored.
    #   '^'               : consumed; break-rule-end is pushed and start-after-caret takes over.
    #   '$'               : left in place for scan-var-name, which returns to assign-or-rule.
    #   '!'               : consumed; rev-option decides between a "!!" option and a "!" rule.
    #   eof               : go to 'exit' (presumably the generator's built-in terminal state --
    #                       TODO confirm in rbbicst.pl).
    escaped                term                  ^break-rule-end    doExprStart                       
    white_space          n start                     
    '^'                  n start-after-caret     ^break-rule-end    doNoChain
    '$'                    scan-var-name         ^assign-or-rule    doExprStart
    '!'                  n rev-option                             
    ';'                  n start                                                  # ignore empty rules.
    eof                    exit              
    default                term                  ^break-rule-end    doExprStart
    
#
#  break-rule-end:  Returned from doing a break-rule expression.
#
break-rule-end:
    # Return state pushed by the rule-starting rows.  Only optional white space followed
    # by ';' is accepted: the ';' is consumed, doEndOfRule runs and scanning resumes at 'start'.
    # Any other character takes the doRuleError path.
    ';'	                 n start                                    doEndOfRule
    white_space          n break-rule-end
    default                errorDeath                               doRuleError
     
#
# start of a rule, after having seen a '^' (inhibits rule chain in).
#     Similar to the main 'start' state in most respects, except
#          - empty rule is an error.
#          - A second '^' is an error.
#
start-after-caret:
    # The escaped/default rows push no return state here: the '^' row of 'start' already
    # pushed break-rule-end before entering this state.
    # The '$' row pushes term-var-ref as the return state for scan-var-name, so a leading
    # variable is always treated as a reference (no assign-or-rule detour after '^').
    escaped                term                                     doExprStart
    white_space          n start-after-caret
    '^'                    errorDeath                               doRuleError    # two '^'s
    '$'                    scan-var-name         ^term-var-ref      doExprStart
    ';'                    errorDeath                               doRuleError    # ^ ;
    eof                    errorDeath                               doRuleError
    default                term                                     doExprStart
 
#
#   !               We've just scanned a '!', indicating either a !!key word flag or a
#                   !Reverse rule.
#
rev-option:
    # Entered after 'start' has consumed one '!'.
    #   '!'     : a second '!' is consumed; option-scan1 then reads the option keyword.
    #   default : a single '!'.  doReverseDir runs, break-rule-end is pushed, and the
    #             current character is left in place for reverse-rule.
    '!'                  n option-scan1   
    default                reverse-rule           ^break-rule-end   doReverseDir
    
option-scan1:
    # First character of a "!!" option keyword.  It must be a name_start_char; it is
    # consumed and doOptionStart runs.  Anything else takes the doRuleError path.
    name_start_char      n option-scan2                             doOptionStart
    default                errorDeath                               doRuleError
    
option-scan2:
    # Remaining characters of the option keyword.  name_char characters are consumed one
    # at a time; the first other character is left in place, doOptionEnd runs, and
    # option-scan3 checks the terminator.
    name_char            n option-scan2
    default                option-scan3                             doOptionEnd
    
option-scan3:
    # Terminator of a "!!" option: optional white space, then a ';' which is consumed
    # before returning to 'start'.  Anything else takes the doRuleError path.
    ';'                  n start 
    white_space          n option-scan3 
    default                errorDeath                               doRuleError 
    

reverse-rule:
    # Start of the expression of a "!" rule.  Unconditionally runs doExprStart, pushes
    # break-rule-end and continues at 'term' without consuming input.
    # NOTE(review): the default row of rev-option, the only visible way into this state,
    # has already pushed break-rule-end, so two copies are on the state stack while only
    # one pop follows the expression -- confirm whether the scanner resets the stack
    # between rules or whether one of the two pushes is redundant.
    default                term                   ^break-rule-end   doExprStart
    
    
#
#  term.  Eat through a single rule character, or a composite thing, which
#         could be a parenthesized expression, a variable name, or a Unicode Set.
#
term:
    # Scans exactly one term; every successful path ends up in expr-mod.
    #   escaped / rule_char : consumed, doRuleChar runs.
    #   '['                 : left in place; scan-unicode-set handles it and returns to expr-mod.
    #   '('                 : consumed, doLParen runs; expr-mod is pushed so that the pop made
    #                         at the matching ')' continues there, then a nested term is scanned.
    #   '$'                 : left in place; scan-var-name handles it and returns to term-var-ref.
    #   '.'                 : consumed, doDotAny runs.
    # A term is mandatory in this state: any other character takes the doRuleError path.
    escaped              n expr-mod                                 doRuleChar
    white_space          n term
    rule_char            n expr-mod                                 doRuleChar
    '['                    scan-unicode-set      ^expr-mod
    '('                  n term                  ^expr-mod          doLParen
    '$'                    scan-var-name         ^term-var-ref
    '.'                  n expr-mod                                 doDotAny
    default                errorDeath                               doRuleError
    
    
#
#  term-var-ref   We've just finished scanning a reference to a $variable.
#                 Check that the variable was defined.
#                 The variable name scanning is in common with assignment statements,
#                 so the check can't be done there.
term-var-ref:
    # Pass-through state: whatever the current character is, run doCheckVarDef without
    # consuming input and continue at expr-mod.
    default                expr-mod                                 doCheckVarDef
    
    
#
#   expr-mod      We've just finished scanning a term, now look for the optional
#                 trailing '*', '?', '+'
#
expr-mod:
    # White space before the operator is skipped.  '*', '+' and '?' are consumed and run
    # the corresponding doUnaryOp* action; any other character is left in place with no
    # action.  Every non-white-space path continues at expr-cont.
    white_space          n  expr-mod
    '*'                  n  expr-cont                               doUnaryOpStar
    '+'                  n  expr-cont                               doUnaryOpPlus
    '?'                  n  expr-cont                               doUnaryOpQuestion
    default                 expr-cont 
    
    
#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.
#
expr-cont:
    #   escaped, rule_char, '[', '(', '$', '.' : a character that can begin another term.
    #            It is NOT consumed; doExprCatOperator runs and 'term' re-reads the same character.
    #   '/' : left in place for look-ahead, after doExprCatOperator.
    #   '{' : consumed; doExprCatOperator runs and tag-open scans the {ddd} tag.
    #   '|' : consumed; doExprOrOperator runs and another term is then required.
    #   ')' : consumed; doExprRParen runs and the return state is popped (expr-mod when it
    #         was pushed by the '(' row of 'term').
    #   default : the expression is over.  doExprFinished runs and the return state is
    #         popped without consuming the character.
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '{'                  n  tag-open                                doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished
    

#
#   look-ahead    Scanning a '/', which identifies a break point, assuming that the
#                 remainder of the expression matches.
#
#                 Generate a parse tree as if this was a special kind of input symbol
#                 appearing in an otherwise normal concatenation expression.
#
look-ahead:
    # Entered with the '/' still unconsumed (the '/' rows that lead here have no 'n').
    # The '/' is consumed here, doSlash runs, and scanning continues at expr-cont-no-slash.
    # The default row names no action.  It looks unreachable, because every visible
    # transition into this state is made on '/' -- NOTE(review): confirm.
    '/'                   n expr-cont-no-slash                      doSlash
    default                 errorDeath


#
#  expr-cont-no-slash    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no '/'
#                                            look-ahead symbol is permitted.
#
expr-cont-no-slash:
    # Entered from look-ahead immediately after a '/' has been consumed.
    #   escaped, rule_char, '[', '(', '$', '.' : a character that can begin another term.
    #            It is NOT consumed; doExprCatOperator runs and 'term' re-reads the same character.
    #   '|' : consumed; doExprOrOperator runs and another term is then required.
    #   ')' : consumed; doExprRParen runs and the return state is popped.
    #   default : doExprFinished runs and the return state is popped without consuming input.
    # There are deliberately no '/' or '{' rows, so both fall through to the default row.
    escaped                 term                                    doExprCatOperator
    # Skipping white space must loop on THIS state.  Going to expr-cont here would
    # re-enable its '/' and '{' rows, so "/ /" would be accepted while "//" is rejected.
    # expr-cont-no-tag loops on itself in the same way.
    white_space          n  expr-cont-no-slash
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished


#
#   tags             scanning a '{', the opening delimiter for a tag that identifies
#                    the kind of match.  Scan the whole {dddd} tag, where d=digit
#
tag-open:
    # Entered after the '{' has been consumed.  Leading white space is skipped.
    # The first digit is NOT consumed here (no 'n'): doStartTagValue runs and tag-value
    # then reads that same digit.  Anything else takes the doTagExpectedError path.
    white_space          n  tag-open
    digit_char              tag-value                               doStartTagValue
    default                 errorDeath                              doTagExpectedError
    
tag-value:
    # Digits are consumed one at a time, each running doTagDigit.  The digit run ends at
    # white space (consumed) or at '}' (left in place for tag-close).
    # Anything else takes the doTagExpectedError path.
    white_space          n  tag-close
    '}'                     tag-close
    digit_char           n  tag-value                               doTagDigit
    default                 errorDeath                              doTagExpectedError
    
tag-close:
    # White space after the digits is skipped.  The closing '}' is consumed, doTagValue
    # runs, and the expression continues at expr-cont-no-tag.
    # Anything else takes the doTagExpectedError path.
    white_space          n  tag-close
    '}'                  n  expr-cont-no-tag                        doTagValue
    default                 errorDeath                              doTagExpectedError
    
    
#
#  expr-cont-no-tag    Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  Just like
#                                            expr-cont, above, except that no "{ddd}"
#                                            tagging is permitted.
#
expr-cont-no-tag:
    # Entered from tag-close after a complete {ddd} tag.  White space loops on this state,
    # so the "no tag" restriction survives skipped blanks.
    #   escaped, rule_char, '[', '(', '$', '.' : a character that can begin another term.
    #            It is NOT consumed; doExprCatOperator runs and 'term' re-reads the same character.
    #   '/' : left in place for look-ahead, after doExprCatOperator.
    #   '|' : consumed; doExprOrOperator runs and another term is then required.
    #   ')' : consumed; doExprRParen runs and the return state is popped.
    #   default : doExprFinished runs and the return state is popped without consuming input.
    # There is no '{' row, so a second tag falls through to the default row.
    escaped                 term                                    doExprCatOperator
    white_space          n  expr-cont-no-tag
    rule_char               term                                    doExprCatOperator
    '['                     term                                    doExprCatOperator
    '('                     term                                    doExprCatOperator
    '$'                     term                                    doExprCatOperator
    '.'                     term                                    doExprCatOperator
    '/'                     look-ahead                              doExprCatOperator
    '|'                  n  term                                    doExprOrOperator
    ')'                  n  pop                                     doExprRParen
    default                 pop                                     doExprFinished
    
    
#
#   Variable Name Scanning.
#
#                    The state that branched to here must have pushed a return state
#                    to go to after completion of the variable name scanning.
#
#                    The current input character must be the $ that introduces the name.
#                    The $ is consumed here rather than in the state that first detected it
#                    so that the doStartVariableName action only needs to happen in one
#                    place (here), and the other states don't need to worry about it.
#
scan-var-name:
   # The '$' is consumed here and doStartVariableName runs.
   # The default row names no action; every visible transition into this state is made
   # on '$', so it is not expected to fire.
   '$'                  n scan-var-start                            doStartVariableName
   default                errorDeath


scan-var-start:
    # The character right after '$' must be a name_start_char; it is consumed.
    # Anything else takes the doVariableNameExpectedErr path.
    name_start_char      n scan-var-body
    default                errorDeath                               doVariableNameExpectedErr
    
scan-var-body:
    # name_char characters are consumed one at a time.  The first other character is left
    # in place, doEndVariableName runs, and the return state pushed by the state that
    # branched to scan-var-name is popped.
    name_char            n scan-var-body
    default                pop                                      doEndVariableName
    
    
#
#  scan-unicode-set   Unicode Sets are parsed by the UnicodeSet class.
#                     Within the RBBI parser, after finding the first character
#                     of a Unicode Set, we just hand the rule input at that
#                     point off to the Unicode Set constructor, then pick
#                     up parsing after the close of the set.
#
#                     The action for this state invokes the UnicodeSet parser.
#
scan-unicode-set:
    # Each accepting row runs doScanUnicodeSet and then pops the return state.
    # Within this file the only transition into this state is the '[' row of 'term'.
    # NOTE(review): no visible state sends 'p' or 'P' here -- presumably these rows exist
    # for \p{...} / \P{...} property sets; confirm whether they are still reachable.
    # The default row names no action.
    '['                   n pop                                      doScanUnicodeSet
    'p'                   n pop                                      doScanUnicodeSet
    'P'                   n pop                                      doScanUnicodeSet
    default		    errorDeath 
    
    
#
#  assign-or-rule.   A $variable was encountered at the start of something, could be
#                    either an assignment statement or a rule, depending on whether an '='
#                    follows the variable name.  We get to this state when the variable name
#                    scanning does a return.
#
assign-or-rule:
    # Reached by the pop at the end of scan-var-body when the '$' row of 'start' pushed
    # this state.  White space after the variable name is skipped.
    #   '='     : consumed; doStartAssign runs, assign-end is pushed as the return state
    #             and the right-hand side is scanned starting at 'term'.
    #   default : the character is left in place; break-rule-end is pushed and
    #             term-var-ref treats the variable as the first term of a rule.
    white_space          n assign-or-rule
    '='                  n term                  ^assign-end        doStartAssign   # variable was target of assignment
    default                term-var-ref          ^break-rule-end                    # variable was a term in a rule


#
#  assign-end        This state is entered when the end of the expression on the
#                    right hand side of an assignment is found.  We get here via
#                    a pop; this state is pushed when the '=' in an assignment is found.
#
#                    The only thing allowed at this point is a ';'.  The RHS of an
#                    assignment must look like a rule expression, and we come here
#                    when what is being scanned no longer looks like an expression.
#
assign-end:
    # The ';' is consumed, doEndAssign runs and scanning resumes at 'start'.
    # This state has no white_space row, so any other character -- white space
    # included -- takes the doRuleErrorAssignExpr path.
    ';'                  n start                                    doEndAssign
    default                errorDeath                               doRuleErrorAssignExpr
    
    
#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the source rules is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
errorDeath:
    # Safety net: for any character, consume it, run doExit and stay in this state.
    default              n errorDeath                               doExit
Commit	Line	Data
b75a7d8f A	1
b75a7d8f A	2	#*****************************************************************************
f3c0d7a5 A	3	#
	4	# Copyright (C) 2016 and later: Unicode, Inc. and others.
	5	# License & terms of use: http://www.unicode.org/copyright.html#License
	6	#
	7	#*****************************************************************************
	8	#*****************************************************************************
b75a7d8f	9	#
2ca993e8	10	# Copyright (C) 2002-2016, International Business Machines Corporation and others.
b75a7d8f A	11	# All Rights Reserved.
	12	#
	13	#*****************************************************************************
	14	#
	15	# file: rbbirpt.txt
	16	# ICU Break Iterator Rule Parser State Table
	17	#
	18	# This state table is used when reading and parsing a set of RBBI rules
	19	# The rule parser uses a state machine; the data in this file define the
	20	# state transitions that occur for each input character.
	21	#
	22	# *** This file defines the RBBI rule grammar. This is it.
	23	# *** The determination of what is accepted is here.
	24	#
	25	# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
	26	# that are then built with the rule parser.
	27	#
2ca993e8	28	# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
b75a7d8f A	29
	30	#
	31	# Here is the syntax of the state definitions in this file:
	32	#
	33	#
	34	#StateName:
	35	# input-char n next-state ^push-state action
	36	# input-char n next-state ^push-state action
	37	# \| \| \| \| \|
	38	# \| \| \| \| \|--- action to be performed by state machine
	39	# \| \| \| \| See function RBBIRuleScanner::doParseActions()
	40	# \| \| \| \|
	41	# \| \| \| \|--- Push this named state onto the state stack.
	42	# \| \| \| Later, when next state is specified as "pop",
	43	# \| \| \| the pushed state will become the current state.
	44	# \| \| \|
	45	# \| \| \|--- Transition to this state if the current input character matches the input
	46	# \| \| character or char class in the left hand column. "pop" causes the next
	47	# \| \| state to be popped from the state stack.
	48	# \| \|
	49	# \| \|--- When making the state transition specified on this line, advance to the next
	50	# \| character from the input only if 'n' appears here.
	51	# \|
	52	# \|--- Character or named character classes to test for. If the current character being scanned
	53	# matches, peform the actions and go to the state specified on this line.
	54	# The input character is tested sequentally, in the order written. The characters and
	55	# character classes tested for do not need to be mutually exclusive. The first match wins.
	56	#
	57
	58
	59
	60
	61	#
	62	# start state, scan position is at the beginning of the rules file, or in between two rules.
	63	#
	64	start:
	65	escaped term ^break-rule-end doExprStart
	66	white_space n start
2ca993e8	67	'^' n start-after-caret ^break-rule-end doNoChain
b75a7d8f	68	'$' scan-var-name ^assign-or-rule doExprStart
374ca955	69	'!' n rev-option
b75a7d8f A	70	';' n start # ignore empty rules.
	71	eof exit
	72	default term ^break-rule-end doExprStart
	73
	74	#
	75	# break-rule-end: Returned from doing a break-rule expression.
	76	#
	77	break-rule-end:
	78	';' n start doEndOfRule
	79	white_space n break-rule-end
	80	default errorDeath doRuleError
	81
2ca993e8 A	82	#
	83	# start of a rule, after having seen a '^' (inhibits rule chain in).
	84	# Similar to the main 'start' state in most respects, except
	85	# - empty rule is an error.
	86	# - A second '^' is an error.
	87	#
	88	start-after-caret:
	89	escaped term doExprStart
	90	white_space n start-after-caret
	91	'^' errorDeath doRuleError # two '^'s
	92	'$' scan-var-name ^term-var-ref doExprStart
	93	';' errorDeath doRuleError # ^ ;
	94	eof errorDeath doRuleError
	95	default term doExprStart
	96
b75a7d8f	97	#
374ca955 A	98	# ! We've just scanned a '!', indicating either a !!key word flag or a
374ca955 A	99	# !Reverse rule.
b75a7d8f	100	#
374ca955 A	101	rev-option:
	102	'!' n option-scan1
	103	default reverse-rule ^break-rule-end doReverseDir
	104
	105	option-scan1:
	106	name_start_char n option-scan2 doOptionStart
	107	default errorDeath doRuleError
	108
	109	option-scan2:
	110	name_char n option-scan2
	111	default option-scan3 doOptionEnd
	112
	113	option-scan3:
	114	';' n start
	115	white_space n option-scan3
	116	default errorDeath doRuleError
	117
	118
b75a7d8f A	119	reverse-rule:
	120	default term ^break-rule-end doExprStart
	121
	122
	123	#
	124	# term. Eat through a single rule character, or a composite thing, which
	125	# could be a parenthesized expression, a variable name, or a Unicode Set.
	126	#
	127	term:
	128	escaped n expr-mod doRuleChar
	129	white_space n term
	130	rule_char n expr-mod doRuleChar
	131	'[' scan-unicode-set ^expr-mod
	132	'(' n term ^expr-mod doLParen
	133	'$' scan-var-name ^term-var-ref
	134	'.' n expr-mod doDotAny
	135	default errorDeath doRuleError
	136
	137
	138
	139	#
	140	# term-var-ref We've just finished scanning a reference to a $variable.
	141	# Check that the variable was defined.
	142	# The variable name scanning is in common with assignment statements,
	143	# so the check can't be done there.
	144	term-var-ref:
	145	default expr-mod doCheckVarDef
	146
	147
	148	#
	149	# expr-mod We've just finished scanning a term, now look for the optional
	150	# trailing '*', '?', '+'
	151	#
	152	expr-mod:
	153	white_space n expr-mod
	154	'*' n expr-cont doUnaryOpStar
	155	'+' n expr-cont doUnaryOpPlus
	156	'?' n expr-cont doUnaryOpQuestion
	157	default expr-cont
	158
	159
	160	#
	161	# expr-cont Expression, continuation. At a point where additional terms are
	162	# allowed, but not required.
	163	#
	164	expr-cont:
	165	escaped term doExprCatOperator
	166	white_space n expr-cont
	167	rule_char term doExprCatOperator
	168	'[' term doExprCatOperator
	169	'(' term doExprCatOperator
	170	'$' term doExprCatOperator
	171	'.' term doExprCatOperator
	172	'/' look-ahead doExprCatOperator
	173	'{' n tag-open doExprCatOperator
	174	'\|' n term doExprOrOperator
	175	')' n pop doExprRParen
	176	default pop doExprFinished
	177
	178
	179	#
	180	# look-ahead Scanning a '/', which identifies a break point, assuming that the
	181	# remainder of the expression matches.
	182	#
183	# Generate a parse tree as if this was a special kind of input symbol
184	# appearing in an otherwise normal concatenation expression.
185	#
186	look-ahead:
187	'/' n expr-cont-no-slash doSlash
188	default errorDeath
189
190
191	#
192	# expr-cont-no-slash Expression, continuation. At a point where additional terms are
193	# allowed, but not required. Just like
194	# expr-cont, above, except that no '/'
195	# look-ahead symbol is permitted.
196	#
197	expr-cont-no-slash:
198	escaped term doExprCatOperator
199	white_space n expr-cont
200	rule_char term doExprCatOperator
201	'[' term doExprCatOperator
202	'(' term doExprCatOperator
203	'$' term doExprCatOperator
204	'.' term doExprCatOperator
205	'\|' n term doExprOrOperator
206	')' n pop doExprRParen
207	default pop doExprFinished
208
209
210	#
211	# tags scanning a '{', the opening delimiter for a tag that identifies
212	# the kind of match. Scan the whole {dddd} tag, where d=digit
213	#
214	tag-open:
215	white_space n tag-open
216	digit_char tag-value doStartTagValue
217	default errorDeath doTagExpectedError
218
219	tag-value:
220	white_space n tag-close
221	'}' tag-close
222	digit_char n tag-value doTagDigit
223	default errorDeath doTagExpectedError
224
225	tag-close:
226	white_space n tag-close
227	'}' n expr-cont-no-tag doTagValue
228	default errorDeath doTagExpectedError
229
230
231
232	#
233	# expr-cont-no-tag Expression, continuation. At a point where additional terms are
234	# allowed, but not required. Just like
235	# expr-cont, above, except that no "{ddd}"
236	# tagging is permitted.
237	#
238	expr-cont-no-tag:
239	escaped term doExprCatOperator
240	white_space n expr-cont-no-tag
241	rule_char term doExprCatOperator
242	'[' term doExprCatOperator
243	'(' term doExprCatOperator
244	'$' term doExprCatOperator
245	'.' term doExprCatOperator
246	'/' look-ahead doExprCatOperator
247	'\|' n term doExprOrOperator
248	')' n pop doExprRParen
249	default pop doExprFinished
250
251
252
253
254	#
255	# Variable Name Scanning.
256	#
257	# The state that branched to here must have pushed a return state
258	# to go to after completion of the variable name scanning.
259	#
260	# The current input character must be the $ that introduces the name.
261	# The $ is consummed here rather than in the state that first detected it
262	# so that the doStartVariableName action only needs to happen in one
263	# place (here), and the other states don't need to worry about it.
264	#
265	scan-var-name:
266	'$' n scan-var-start doStartVariableName
267	default errorDeath
268
269
270	scan-var-start:
271	name_start_char n scan-var-body
272	default errorDeath doVariableNameExpectedErr
273
274	scan-var-body:
275	name_char n scan-var-body
276	default pop doEndVariableName
277
278
279
280	#
281	# scan-unicode-set Unicode Sets are parsed by the the UnicodeSet class.
282	# Within the RBBI parser, after finding the first character
283	# of a Unicode Set, we just hand the rule input at that
284	# point of to the Unicode Set constructor, then pick
285	# up parsing after the close of the set.
286	#
287	# The action for this state invokes the UnicodeSet parser.
288	#
289	scan-unicode-set:
290	'[' n pop doScanUnicodeSet
291	'p' n pop doScanUnicodeSet
292	'P' n pop doScanUnicodeSet
293	default errorDeath
294
295
296
297
298
299
300
301	#
302	# assign-or-rule. A $variable was encountered at the start of something, could be
303	# either an assignment statement or a rule, depending on whether an '='
304	# follows the variable name. We get to this state when the variable name
305	# scanning does a return.
306	#
307	assign-or-rule:
308	white_space n assign-or-rule
309	'=' n term ^assign-end doStartAssign # variable was target of assignment
310	default term-var-ref ^break-rule-end # variable was a term in a rule
311
312
313
314	#
315	# assign-end This state is entered when the end of the expression on the
316	# right hand side of an assignment is found. We get here via
317	# a pop; this state is pushed when the '=' in an assignment is found.
318	#
319	# The only thing allowed at this point is a ';'. The RHS of an
320	# assignment must look like a rule expression, and we come here
321	# when what is being scanned no longer looks like an expression.
322	#
323	assign-end:
324	';' n start doEndAssign
325	default errorDeath doRuleErrorAssignExpr
326
327
328
329	#
330	# errorDeath. This state is specified as the next state whenever a syntax error
331	# in the source rules is detected. Barring bugs, the state machine will never
332	# actually get here, but will stop because of the action associated with the error.
333	# But, just in case, this state asks the state machine to exit.
334	errorDeath:
335	default n errorDeath doExit
336
337