[apple/icu.git] / icuSources / i18n / regexcst.txt


#*****************************************************************************
#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#   All Rights Reserved.
#
#*****************************************************************************
#
#  file:  regexcst.txt
#  ICU Regular Expression Parser State Table
#
#     This state table is used when reading and parsing a regular expression pattern
#     The pattern parser uses a state machine; the data in this file define the
#     state transitions that occur for each input character.
#
#     *** This file defines the regex pattern grammar.   This is it.
#     *** The determination of what is accepted is here.
#
#     This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
#     that are then built into the regular expression pattern compiler.
#

#
# Here is the syntax of the state definitions in this file:
#
#
#StateName:
#   input-char           n next-state           ^push-state     action    
#   input-char           n next-state           ^push-state     action    
#       |                |   |                      |             |
#       |                |   |                      |             |--- action to be performed by state machine
#       |                |   |                      |                  See function RegexCompile::doParseActions()
#       |                |   |                      |
#       |                |   |                      |--- Push this named state onto the state stack.
#       |                |   |                           Later, when next state is specified as "pop",
#       |                |   |                           the pushed state will become the current state.
#       |                |   |
#       |                |   |--- Transition to this state if the current input character matches the input
#       |                |        character or char class in the left hand column.  "pop" causes the next
#       |                |        state to be popped from the state stack.
#       |                |
#       |                |--- When making the state transition specified on this line, advance to the next
#       |                     character from the input only if 'n' appears here.
#       |
#       |--- Character or named character classes to test for.  If the current character being scanned
#            matches, perform the actions and go to the state specified on this line.
#            The input character is tested sequentially, in the order written.  The characters and
#            character classes tested for do not need to be mutually exclusive.  The first match wins.
#            


#
#  start     Initial state; the scan position is at the beginning of the pattern.
#            Whatever the first input is, doPatStart is performed and control moves
#            to term.  The input is not consumed (no 'n' in the second column), so
#            term sees the same character.
#
start:
   default                 term                                     doPatStart
    

#
#  term.  At a position where we can accept the start of most items in a pattern.
#
#         Rows are tested top to bottom and the first match wins, so the named
#         input classes quoted and rule_char are tried before the single-character
#         rows below them.
#
#         Items that can take a quantifier continue in expr-quant.  '^' and '$'
#         loop back to term.  '(' and '\' need more input to classify and are
#         handed to open-paren and backslash.  ')' takes its next state from the
#         state stack.  eof performs doPatFinish; any input not matched by an
#         earlier row is an error.
#
term:
    quoted               n expr-quant                               doLiteralChar
    rule_char            n expr-quant                               doLiteralChar
    '['                  n expr-quant                               doScanUnicodeSet
    '('                  n open-paren                     
    '.'                  n expr-quant                               doDotAny
    '^'                  n term                                     doCaret
    '$'                  n term                                     doDollar
    '\'                  n backslash
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    eof	                   term                                     doPatFinish
    default                errorDeath                               doRuleError
    

#
#   expr-quant    We've just finished scanning a term, now look for the optional
#                 trailing quantifier - *, +, ?, *?,  etc.
#
#                 The quantifier's first character is consumed here; the quant-* and
#                 interval-* states then look at what follows it.
#
#                 '(' goes to open-paren-quant rather than open-paren, so that a
#                 (?# comment ) sitting between a term and its quantifier can be skipped
#                 and the quantifier still found.
#
#                 Any other input is passed to expr-cont without being consumed.
#
expr-quant:
    '*'                  n  quant-star                       
    '+'                  n  quant-plus                              
    '?'                  n  quant-opt     
    '{'                  n  interval-open                          doIntervalInit
    '('                  n  open-paren-quant
    default                 expr-cont 
    
    
#
#  expr-cont      Expression, continuation.  At a point where additional terms are
#                                            allowed, but not required.  No Quantifiers
#
#                 '|' and ')' are handled exactly as they are in term.  Any other input
#                 is handed to term unconsumed and with no action.
#
expr-cont:
    '|'                  n  term                                    doOrOperator
    ')'                  n  pop                                     doCloseParen
    default                 term                                    
    

#
#   open-paren-quant   Special case handling for comments appearing before a quantifier,
#                        e.g.   x(?#comment )*
#                      Open parens from expr-quant come here; anything but a (?# comment
#                      branches into the normal parenthesis sequence as quickly as possible.
#
#                      The '(' was already consumed by expr-quant.  A following '?' is
#                      consumed here; any other character is left for open-paren.
#
open-paren-quant:
    '?'                  n  open-paren-quant2                      doSuppressComments
    default                 open-paren
    
#
#   open-paren-quant2  "(?" has been scanned and the '(' followed a term.
#                      '#' starts a comment.  expr-quant is pushed, so when the comment's
#                      ')' pops the stack, scanning resumes by looking for the term's
#                      quantifier.
#                      Anything else is an ordinary "(?" construct and is left for
#                      open-paren-extended to classify.
#
open-paren-quant2:
    '#'                  n  paren-comment   ^expr-quant
    default                 open-paren-extended
    
 
#
#   open-paren    We've got an open paren.  We need to scan further to
#                 determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
#
#                 "(?" continues in open-paren-extended.
#                 A plain '(' performs doOpenCaptureParen.  The character after it is left
#                 unconsumed for term, and expr-quant is pushed as the state to resume
#                 in when a later ')' pops the stack.
#
open-paren:
    '?'                  n  open-paren-extended                     doSuppressComments
    default                 term            ^expr-quant             doOpenCaptureParen
    
#
#   open-paren-extended   "(?" has been scanned; the next character selects the construct.
#
#                         (?: and (?> push expr-quant, so a quantifier is looked for after
#                         the closing paren.  (?= and (?! push expr-cont, which does not
#                         look for a quantifier.
#
#                         (?# pushes term: the comment is skipped and scanning resumes
#                         as if at the start of a new term.
#
#                         The flag letters and '-' are not consumed here (no 'n');
#                         paren-flag reads the same character again.
#
#                         "(?(" and "(?{" perform their action and go to errorDeath.
#
open-paren-extended:
    ':'                  n  term            ^expr-quant             doOpenNonCaptureParen  #  (?:
    '>'                  n  term            ^expr-quant             doOpenAtomicParen      #  (?>
    '='                  n  term            ^expr-cont              doOpenLookAhead        #  (?=
    '!'                  n  term            ^expr-cont              doOpenLookAheadNeg     #  (?!
    '<'                  n  open-paren-lookbehind
    '#'                  n  paren-comment   ^term
    'i'                     paren-flag                              doBeginMatchMode
    'm'                     paren-flag                              doBeginMatchMode
    's'                     paren-flag                              doBeginMatchMode
    'w'                     paren-flag                              doBeginMatchMode
    'x'                     paren-flag                              doBeginMatchMode
    '-'                     paren-flag                              doBeginMatchMode
    '('                  n  errorDeath                              doConditionalExpr
    '{'                  n  errorDeath                              doPerlInline
    default                 errorDeath                              doBadOpenParenType
    
#
#   open-paren-lookbehind   "(?<" has been scanned.  Only '=' and '!' are accepted next.
#                           Both push expr-cont, as the look-ahead rows in
#                           open-paren-extended do.  Anything else is an error.
#
open-paren-lookbehind:
    '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
    '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
    default                 errorDeath                              doBadOpenParenType
    

#
#   paren-comment    We've got a (?# ... )  style comment.  Eat pattern text till we get to the ')'
#                    TODO:  should parens nest here?  Check what perl does.
#
#                    As written, the first ')' ends the comment; there is no nesting.
#                    The ')' pops the state pushed by whichever row entered this state:
#                    term (from open-paren-extended) or expr-quant (from open-paren-quant2).
#                    Reaching eof before a ')' is a mismatched-paren error.
#
paren-comment:
    ')'                  n  pop
    eof		                errorDeath                              doMismatchedParenErr
    default              n  paren-comment

#
#  paren-flag    Scanning a (?imswx-imswx  flag setting.
#
#                open-paren-extended left the first flag character unconsumed, so every
#                flag letter and '-' is consumed here, with one doMatchMode for each.
#                  ')'  ends a bare flag setting, (?flags), and continues in term.
#                  ':'  opens a (?flags: ... ) group; expr-quant is pushed as the state
#                       to resume in when a later ')' pops the stack.
#                Any other character is a bad mode flag.
#
paren-flag:
    'i'                  n  paren-flag                              doMatchMode
    'm'                  n  paren-flag                              doMatchMode
    's'                  n  paren-flag                              doMatchMode
    'w'                  n  paren-flag                              doMatchMode
    'x'                  n  paren-flag                              doMatchMode
    '-'                  n  paren-flag                              doMatchMode
    ')'                  n  term                                    doSetMatchMode
    ':'                  n  term              ^expr-quant           doMatchModeParen
    default                 errorDeath                              doBadModeFlag
    
    
#
#  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
#                 between plain '*', '*?', '*+'
#
#                 The '*' itself was consumed by expr-quant.  A '?' or '+' suffix is
#                 consumed here; any other character selects plain '*' and is left
#                 unconsumed for expr-cont.
#
quant-star:
     '?'                 n  expr-cont                               doNGStar               #  *?
     '+'                 n  expr-cont                               doPossessiveStar       #  *+
     default                expr-cont                               doStar


#
#  quant-plus     Scanning a '+' quantifier.  Need to look ahead to decide
#                 between plain '+', '+?', '++'
#
#                 The '+' itself was consumed by expr-quant.  A '?' or '+' suffix is
#                 consumed here; any other character selects plain '+' and is left
#                 unconsumed for expr-cont.
#
quant-plus:
     '?'                 n  expr-cont                               doNGPlus               #  +?
     '+'                 n  expr-cont                               doPossessivePlus       #  ++
     default                expr-cont                               doPlus


#
#  quant-opt  Scanning a '?' quantifier.  Need to look ahead to decide
#                  between plain '?', '??', '?+'
#
#             The '?' itself was consumed by expr-quant.  A '?' or '+' suffix is
#             consumed here; any other character selects plain '?' and is left
#             unconsumed for expr-cont.
#
quant-opt:
     '?'                 n  expr-cont                               doNGOpt                 #  ??
     '+'                 n  expr-cont                               doPossessiveOpt         #  ?+
     default                expr-cont                               doOpt                   #  ?


#
#   Interval         scanning a '{', the opening delimiter for an interval specification
#                                   {number} or {min, max} or {min, }
#
#                    The '{' was consumed by expr-quant, which also performed doIntervalInit.
#                    White space directly after the '{' is skipped.  The first digit is not
#                    consumed here; interval-lower consumes it, so that its digit action
#                    runs for every digit of the lower bound.  Anything else is an error.
#
#                    NOTE(review): white space is accepted only in this state.
#                    interval-lower and interval-upper have no white_space row, so the
#                    blanks shown after ',' in the forms above would reach their default
#                    (error) rows unless white space is removed before it gets to this
#                    table - confirm.
#
interval-open:
    white_space          n  interval-open                                  # TODO:  is white space allowed here in non-free mode?
    digit_char              interval-lower                          
    default                 errorDeath                              doIntervalError
    
#
#   interval-lower   Scanning the digits of the lower bound.
#                      ','  moves on to the upper bound.
#                      '}'  ends a single-number interval, {n}; doIntervalSame is performed.
#
#                    NOTE(review): the action name "doIntevalLowerDigit" is spelled without
#                    the 'r'.  Presumably the code implementing the actions uses the same
#                    spelling, so it must not be corrected in this file alone - confirm.
#
interval-lower:
    digit_char           n  interval-lower                          doIntevalLowerDigit
    ','			         n  interval-upper
    '}'                  n  interval-type                           doIntervalSame             # {n}
    default                 errorDeath                              doIntervalError

#
#   interval-upper   Scanning the digits of the upper bound, after the ','.
#                    A '}' is accepted even when no digit has been seen, which is the
#                    open-ended {min,} form.  No action is attached to the '}' row.
#
interval-upper:
    digit_char           n  interval-upper                          doIntervalUpperDigit
    '}'                  n  interval-type
    default                 errorDeath                              doIntervalError
    
#
#   interval-type    The closing '}' of an interval has been consumed.  Look ahead to decide
#                    between the plain, '?' suffixed and '+' suffixed forms, in the same way
#                    the quant-star, quant-plus and quant-opt states do.
#
interval-type:
    '?'                  n  expr-cont                               doNGInterval                # {n,m}?
    '+'                  n  expr-cont                               doPossessiveInterval        # {n,m}+
    default                 expr-cont                               doInterval                  # {n,m}
    
    
#
#  backslash        Backslash.  Figure out which of the \thingies we have encountered.
#                               The low level next-char function will have preprocessed
#                               some of them already; those won't come here.
#
#                   Rows that continue in term (\A \B \b \G \Q \Z \z) are not followed by a
#                   quantifier check; rows that continue in expr-quant are.
#
#                   \N, \p and \P do not consume their letter here (no 'n').  Presumably
#                   doProperty scans the letter and the {...} that follows - confirm.
#
#                   digit_char, eof and default must stay below the letter rows: the first
#                   matching row wins, and default would otherwise hide everything after it.
#
backslash:
   'A'                   n  term                                    doBackslashA
   'B'                   n  term                                    doBackslashB
   'b'                   n  term                                    doBackslashb
   'd'                   n  expr-quant                              doBackslashd
   'D'                   n  expr-quant                              doBackslashD
   'G'                   n  term                                    doBackslashG
   'N'                      expr-quant                              doProperty       #   \N{NAME}  named char
   'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
   'P'                      expr-quant                              doProperty
   'Q'                   n  term                                    doEnterQuoteMode
   'S'                   n  expr-quant                              doBackslashS
   's'                   n  expr-quant                              doBackslashs
   'W'                   n  expr-quant                              doBackslashW
   'w'                   n  expr-quant                              doBackslashw
   'X'                   n  expr-quant                              doBackslashX
   'Z'                   n  term                                    doBackslashZ
   'z'                   n  term                                    doBackslashz
   digit_char	         n  expr-quant                              doBackRef         #  Will scan multiple digits
   eof                      errorDeath                              doEscapeError
   default               n  expr-quant		                    doLiteralChar     #  Escaped literal char.		       

    
#
# errorDeath.   This state is specified as the next state whenever a syntax error
#               in the pattern is detected.  Barring bugs, the state machine will never
#               actually get here, but will stop because of the action associated with the error.
#               But, just in case, this state asks the state machine to exit.
#
#               The single row consumes the input and stays in errorDeath, performing
#               doExit each time.
#
errorDeath:
    default              n errorDeath                               doExit
Commit	Line	Data
b75a7d8f A	1
	2	#*****************************************************************************
	3	#
	4	# Copyright (C) 2002-2003, International Business Machines Corporation and others.
	5	# All Rights Reserved.
	6	#
	7	#*****************************************************************************
	8	#
	9	# file: regexcst.txt
	10	# ICU Regular Expression Parser State Table
	11	#
	12	# This state table is used when reading and parsing a regular expression pattern
	13	# The pattern parser uses a state machine; the data in this file define the
	14	# state transitions that occur for each input character.
	15	#
	16	# *** This file defines the regex pattern grammar. This is it.
	17	# *** The determination of what is accepted is here.
	18	#
	19	# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
	20	# that are then built with the rule parser.
	21	#
	22
	23	#
	24	# Here is the syntax of the state definitions in this file:
	25	#
	26	#
	27	#StateName:
	28	# input-char n next-state ^push-state action
	29	# input-char n next-state ^push-state action
	30	# \| \| \| \| \|
	31	# \| \| \| \| \|--- action to be performed by state machine
	32	# \| \| \| \| See function RBBIRuleScanner::doParseActions()
	33	# \| \| \| \|
	34	# \| \| \| \|--- Push this named state onto the state stack.
	35	# \| \| \| Later, when next state is specified as "pop",
	36	# \| \| \| the pushed state will become the current state.
	37	# \| \| \|
	38	# \| \| \|--- Transition to this state if the current input character matches the input
	39	# \| \| character or char class in the left hand column. "pop" causes the next
	40	# \| \| state to be popped from the state stack.
	41	# \| \|
	42	# \| \|--- When making the state transition specified on this line, advance to the next
	43	# \| character from the input only if 'n' appears here.
	44	# \|
	45	# \|--- Character or named character classes to test for. If the current character being scanned
	46	# matches, peform the actions and go to the state specified on this line.
	47	# The input character is tested sequentally, in the order written. The characters and
	48	# character classes tested for do not need to be mutually exclusive. The first match wins.
	49	#
	50
	51
	52
	53
	54	#
	55	# start state, scan position is at the beginning of the pattern.
	56	#
	57	start:
	58	default term doPatStart
	59
	60
	61
	62
	63	#
	64	# term. At a position where we can accept the start most items in a pattern.
65	#
66	term:
67	quoted n expr-quant doLiteralChar
68	rule_char n expr-quant doLiteralChar
69	'[' n expr-quant doScanUnicodeSet
70	'(' n open-paren
71	'.' n expr-quant doDotAny
72	'^' n term doCaret
73	'$' n term doDollar
74	'\' n backslash
75	'\|' n term doOrOperator
76	')' n pop doCloseParen
77	eof term doPatFinish
78	default errorDeath doRuleError
79
80
81
82	#
83	# expr-quant We've just finished scanning a term, now look for the optional
84	# trailing quantifier - , +, ?, ?, etc.
85	#
86	expr-quant:
87	'*' n quant-star
88	'+' n quant-plus
89	'?' n quant-opt
90	'{' n interval-open doIntervalInit
91	'(' n open-paren-quant
92	default expr-cont
93
94
95	#
96	# expr-cont Expression, continuation. At a point where additional terms are
97	# allowed, but not required. No Quantifiers
98	#
99	expr-cont:
100	'\|' n term doOrOperator
101	')' n pop doCloseParen
102	default term
103
104
105	#
106	# open-paren-quant Special case handling for comments appearing before a quantifier,
107	# e.g. x(?#comment )*
108	# Open parens from expr-quant come here; anything but a (?# comment
109	# branches into the normal parenthesis sequence as quickly as possible.
110	#
111	open-paren-quant:
112	'?' n open-paren-quant2 doSuppressComments
113	default open-paren
114
115	open-paren-quant2:
116	'#' n paren-comment ^expr-quant
117	default open-paren-extended
118
119
120	#
121	# open-paren We've got an open paren. We need to scan further to
122	# determine what kind of quantifier it is - plain (, (?:, (?>, or whatever.
123	#
124	open-paren:
125	'?' n open-paren-extended doSuppressComments
126	default term ^expr-quant doOpenCaptureParen
127
128	open-paren-extended:
129	':' n term ^expr-quant doOpenNonCaptureParen # (?:
130	'>' n term ^expr-quant doOpenAtomicParen # (?>
131	'=' n term ^expr-cont doOpenLookAhead # (?=
132	'!' n term ^expr-cont doOpenLookAheadNeg # (?!
133	'<' n open-paren-lookbehind
134	'#' n paren-comment ^term
135	'i' paren-flag doBeginMatchMode
136	'm' paren-flag doBeginMatchMode
137	's' paren-flag doBeginMatchMode
374ca955	138	'w' paren-flag doBeginMatchMode
b75a7d8f A	139	'x' paren-flag doBeginMatchMode
	140	'-' paren-flag doBeginMatchMode
	141	'(' n errorDeath doConditionalExpr
	142	'{' n errorDeath doPerlInline
	143	default errorDeath doBadOpenParenType
	144
	145	open-paren-lookbehind:
	146	'=' n term ^expr-cont doOpenLookBehind # (?<=
	147	'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
	148	default errorDeath doBadOpenParenType
	149
	150
	151	#
	152	# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
	153	# TODO: should parens nest here? Check what perl does.
	154	#
	155	paren-comment:
	156	')' n pop
	157	eof errorDeath doMismatchedParenErr
	158	default n paren-comment
	159
	160	#
	161	# paren-flag Scanned a (?ismx-ismx flag setting
	162	#
	163	paren-flag:
	164	'i' n paren-flag doMatchMode
	165	'm' n paren-flag doMatchMode
	166	's' n paren-flag doMatchMode
374ca955	167	'w' n paren-flag doMatchMode
b75a7d8f A	168	'x' n paren-flag doMatchMode
	169	'-' n paren-flag doMatchMode
	170	')' n term doSetMatchMode
	171	':' n term ^expr-quant doMatchModeParen
374ca955	172	default errorDeath doBadModeFlag
b75a7d8f A	173
	174
	175	#
	176	# quant-star Scanning a '*' quantifier. Need to look ahead to decide
	177	# between plain '', '?', '*+'
	178	#
	179	quant-star:
	180	'?' n expr-cont doNGStar # *?
	181	'+' n expr-cont doPossessiveStar # *+
	182	default expr-cont doStar
	183
	184
	185	#
	186	# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
	187	# between plain '+', '+?', '++'
	188	#
	189	quant-plus:
	190	'?' n expr-cont doNGPlus # *?
	191	'+' n expr-cont doPossessivePlus # *+
	192	default expr-cont doPlus
	193
	194
	195	#
	196	# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
	197	# between plain '?', '??', '?+'
	198	#
	199	quant-opt:
	200	'?' n expr-cont doNGOpt # ??
	201	'+' n expr-cont doPossessiveOpt # ?+
	202	default expr-cont doOpt # ?
	203
	204
	205	#
	206	# Interval scanning a '{', the opening delimiter for an interval specification
	207	# {number} or {min, max} or {min, }
	208	#
	209	interval-open:
	210	white_space n interval-open # TODO: is white space allowed here in non-free mode?
	211	digit_char interval-lower
	212	default errorDeath doIntervalError
	213
	214	interval-lower:
	215	digit_char n interval-lower doIntevalLowerDigit
	216	',' n interval-upper
	217	'}' n interval-type doIntervalSame # {n}
	218	default errorDeath doIntervalError
	219
	220	interval-upper:
	221	digit_char n interval-upper doIntervalUpperDigit
	222	'}' n interval-type
	223	default errorDeath doIntervalError
	224
	225	interval-type:
	226	'?' n expr-cont doNGInterval # {n,m}?
	227	'+' n expr-cont doPossessiveInterval # {n,m}+
	228	default expr-cont doInterval # {m,n}
	229
	230
	231	#
	232	# backslash # Backslash. Figure out which of the \thingies we have encountered.
	233	# The low level next-char function will have preprocessed
	234	# some of them already; those won't come here.
	235	backslash:
	236	'A' n term doBackslashA
237	'B' n term doBackslashB
238	'b' n term doBackslashb
239	'd' n expr-quant doBackslashd
240	'D' n expr-quant doBackslashD
241	'G' n term doBackslashG
242	'N' expr-quant doProperty # \N{NAME} named char
243	'p' expr-quant doProperty # \p{Lu} style property
244	'P' expr-quant doProperty
245	'Q' n term doEnterQuoteMode
246	'S' n expr-quant doBackslashS
247	's' n expr-quant doBackslashs
248	'W' n expr-quant doBackslashW
249	'w' n expr-quant doBackslashw
250	'X' n expr-quant doBackslashX
251	'Z' n term doBackslashZ
252	'z' n term doBackslashz
b75a7d8f A	253	digit_char n expr-quant doBackRef # Will scan multiple digits
	254	eof errorDeath doEscapeError
	255	default n expr-quant doLiteralChar # Escaped literal char.
	256
	257
	258	#
	259	# errorDeath. This state is specified as the next state whenever a syntax error
	260	# in the source rules is detected. Barring bugs, the state machine will never
	261	# actually get here, but will stop because of the action associated with the error.
	262	# But, just in case, this state asks the state machine to exit.
	263	errorDeath:
	264	default n errorDeath doExit
	265
	266