git.saurik.com Git - apple/icu.git/blame_incremental

Commit	Line	Data
	1	# Copyright (C) 2016 and later: Unicode, Inc. and others.
	2	# License & terms of use: http://www.unicode.org/copyright.html
	3	#*****************************************************************************
	4	#
	5	# Copyright (C) 2002-2015, International Business Machines Corporation and others.
	6	# All Rights Reserved.
	7	#
	8	#*****************************************************************************
	9	#
	10	# file: regexcst.txt
	11	# ICU Regular Expression Parser State Table
	12	#
	13	# This state table is used when reading and parsing a regular expression pattern
	14	# The pattern parser uses a state machine; the data in this file define the
	15	# state transitions that occur for each input character.
	16	#
	17	# *** This file defines the regex pattern grammar. This is it.
	18	# *** The determination of what is accepted is here.
	19	#
	20	# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays
	21	# that are then built with the rule parser.
	22	#
	23
	24	#
	25	# Here is the syntax of the state definitions in this file:
	26	#
	27	#
	28	#StateName:
	29	# input-char n next-state ^push-state action
	30	# input-char n next-state ^push-state action
	31	# | | | | |
	32	# | | | | |--- action to be performed by state machine
	33	# | | | | See function RBBIRuleScanner::doParseActions()
	34	# | | | |
	35	# | | | |--- Push this named state onto the state stack.
	36	# | | | Later, when next state is specified as "pop",
	37	# | | | the pushed state will become the current state.
	38	# | | |
	39	# | | |--- Transition to this state if the current input character matches the input
	40	# | | character or char class in the left hand column. "pop" causes the next
	41	# | | state to be popped from the state stack.
	42	# | |
	43	# | |--- When making the state transition specified on this line, advance to the next
	44	# | character from the input only if 'n' appears here.
	45	# |
	46	# |--- Character or named character classes to test for. If the current character being scanned
	47	# matches, perform the actions and go to the state specified on this line.
	48	# The input character is tested sequentially, in the order written. The characters and
	49	# character classes tested for do not need to be mutually exclusive. The first match wins.
	50	#
	51
	52
	53
	54
	55	#
	56	# start state, scan position is at the beginning of the pattern.
	57	#
	58	start:
	59	default term doPatStart
	60
	61
	62
	63
	64	#
	65	# term. At a position where we can accept the start of most items in a pattern.
	66	#
	67	term:
	68	quoted n expr-quant doLiteralChar
	69	rule_char n expr-quant doLiteralChar
	70	'[' n set-open ^set-finish doSetBegin
	71	'(' n open-paren
	72	'.' n expr-quant doDotAny
	73	'^' n expr-quant doCaret
	74	'$' n expr-quant doDollar
	75	'\' n backslash
	76	'|' n term doOrOperator
	77	')' n pop doCloseParen
	78	eof term doPatFinish
	79	default errorDeath doRuleError
	80
	81
	82
	83	#
	84	# expr-quant We've just finished scanning a term, now look for the optional
	85	# trailing quantifier - *, +, ?, *?, etc.
	86	#
	87	expr-quant:
	88	'*' n quant-star
	89	'+' n quant-plus
	90	'?' n quant-opt
	91	'{' n interval-open doIntervalInit
	92	'(' n open-paren-quant
	93	default expr-cont
	94
	95
	96	#
	97	# expr-cont Expression, continuation. At a point where additional terms are
	98	# allowed, but not required. No Quantifiers
	99	#
	100	expr-cont:
	101	'|' n term doOrOperator
	102	')' n pop doCloseParen
	103	default term
	104
	105
	106	#
	107	# open-paren-quant Special case handling for comments appearing before a quantifier,
	108	# e.g. x(?#comment )*
	109	# Open parens from expr-quant come here; anything but a (?# comment
	110	# branches into the normal parenthesis sequence as quickly as possible.
	111	#
	112	open-paren-quant:
	113	'?' n open-paren-quant2 doSuppressComments
	114	default open-paren
	115
	116	open-paren-quant2:
	117	'#' n paren-comment ^expr-quant
	118	default open-paren-extended
	119
	120
	121	#
	122	# open-paren We've got an open paren. We need to scan further to
	123	# determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.
	124	#
	125	open-paren:
	126	'?' n open-paren-extended doSuppressComments
	127	default term ^expr-quant doOpenCaptureParen
	128
	129	open-paren-extended:
	130	':' n term ^expr-quant doOpenNonCaptureParen # (?:
	131	'>' n term ^expr-quant doOpenAtomicParen # (?>
	132	'=' n term ^expr-cont doOpenLookAhead # (?=
	133	'!' n term ^expr-cont doOpenLookAheadNeg # (?!
	134	'<' n open-paren-lookbehind
	135	'#' n paren-comment ^term
	136	'i' paren-flag doBeginMatchMode
	137	'd' paren-flag doBeginMatchMode
	138	'm' paren-flag doBeginMatchMode
	139	's' paren-flag doBeginMatchMode
	140	'u' paren-flag doBeginMatchMode
	141	'w' paren-flag doBeginMatchMode
	142	'x' paren-flag doBeginMatchMode
	143	'-' paren-flag doBeginMatchMode
	144	'(' n errorDeath doConditionalExpr
	145	'{' n errorDeath doPerlInline
	146	default errorDeath doBadOpenParenType
	147
	148	open-paren-lookbehind:
	149	'=' n term ^expr-cont doOpenLookBehind # (?<=
	150	'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
	151	ascii_letter named-capture doBeginNamedCapture # (?<name
	152	default errorDeath doBadOpenParenType
	153
	154
	155	#
	156	# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
	157	#
	158	paren-comment:
	159	')' n pop
	160	eof errorDeath doMismatchedParenErr
	161	default n paren-comment
	162
	163	#
	164	# paren-flag Scanned a (?ismx-ismx flag setting
	165	#
	166	paren-flag:
	167	'i' n paren-flag doMatchMode
	168	'd' n paren-flag doMatchMode
	169	'm' n paren-flag doMatchMode
	170	's' n paren-flag doMatchMode
	171	'u' n paren-flag doMatchMode
	172	'w' n paren-flag doMatchMode
	173	'x' n paren-flag doMatchMode
	174	'-' n paren-flag doMatchMode
	175	')' n term doSetMatchMode
	176	':' n term ^expr-quant doMatchModeParen
	177	default errorDeath doBadModeFlag
	178
	179	#
	180	# named-capture (?<name> ... ), position currently on the name.
	181	#
	182	named-capture:
	183	ascii_letter n named-capture doContinueNamedCapture
	184	digit_char n named-capture doContinueNamedCapture
	185	'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.
	186	default errorDeath doBadNamedCapture
	187
	188	#
	189	# quant-star Scanning a '*' quantifier. Need to look ahead to decide
	190	# between plain '*', '*?', '*+'
	191	#
	192	quant-star:
	193	'?' n expr-cont doNGStar # *?
	194	'+' n expr-cont doPossessiveStar # *+
	195	default expr-cont doStar
	196
	197
	198	#
	199	# quant-plus Scanning a '+' quantifier. Need to look ahead to decide
	200	# between plain '+', '+?', '++'
	201	#
	202	quant-plus:
	203	'?' n expr-cont doNGPlus # +?
	204	'+' n expr-cont doPossessivePlus # ++
	205	default expr-cont doPlus
	206
	207
	208	#
	209	# quant-opt Scanning a '?' quantifier. Need to look ahead to decide
	210	# between plain '?', '??', '?+'
	211	#
	212	quant-opt:
	213	'?' n expr-cont doNGOpt # ??
	214	'+' n expr-cont doPossessiveOpt # ?+
	215	default expr-cont doOpt # ?
	216
	217
	218	#
	219	# Interval scanning a '{', the opening delimiter for an interval specification
	220	# {number} or {min, max} or {min,}
	221	#
	222	interval-open:
	223	digit_char interval-lower
	224	default errorDeath doIntervalError
	225
	226	interval-lower:
	227	digit_char n interval-lower doIntevalLowerDigit
	228	',' n interval-upper
	229	'}' n interval-type doIntervalSame # {n}
	230	default errorDeath doIntervalError
	231
	232	interval-upper:
	233	digit_char n interval-upper doIntervalUpperDigit
	234	'}' n interval-type
	235	default errorDeath doIntervalError
	236
	237	interval-type:
	238	'?' n expr-cont doNGInterval # {n,m}?
	239	'+' n expr-cont doPossessiveInterval # {n,m}+
	240	default expr-cont doInterval # {m,n}
	241
	242
	243	#
	244	# backslash # Backslash. Figure out which of the \thingies we have encountered.
	245	# The low level next-char function will have preprocessed
	246	# some of them already; those won't come here.
	247	backslash:
	248	'A' n term doBackslashA
	249	'B' n term doBackslashB
	250	'b' n term doBackslashb
	251	'd' n expr-quant doBackslashd
	252	'D' n expr-quant doBackslashD
	253	'G' n term doBackslashG
	254	'h' n expr-quant doBackslashh
	255	'H' n expr-quant doBackslashH
	256	'k' n named-backref
	257	'N' expr-quant doNamedChar # \N{NAME} named char
	258	'p' expr-quant doProperty # \p{Lu} style property
	259	'P' expr-quant doProperty
	260	'R' n expr-quant doBackslashR
	261	'Q' n term doEnterQuoteMode
	262	'S' n expr-quant doBackslashS
	263	's' n expr-quant doBackslashs
	264	'v' n expr-quant doBackslashv
	265	'V' n expr-quant doBackslashV
	266	'W' n expr-quant doBackslashW
	267	'w' n expr-quant doBackslashw
	268	'X' n expr-quant doBackslashX
	269	'Z' n term doBackslashZ
	270	'z' n term doBackslashz
	271	digit_char n expr-quant doBackRef # Will scan multiple digits
	272	eof errorDeath doEscapeError
	273	default n expr-quant doEscapedLiteralChar
	274
	275
	276	# named-backref Scanned \k
	277	# Leading to \k<captureName>
	278	# Failure to get the full sequence is an error.
	279	#
	280	named-backref:
	281	'<' n named-backref-2 doBeginNamedBackRef
	282	default errorDeath doBadNamedCapture
	283
	284	named-backref-2:
	285	ascii_letter n named-backref-3 doContinueNamedBackRef
	286	default errorDeath doBadNamedCapture
	287
	288	named-backref-3:
	289	ascii_letter n named-backref-3 doContinueNamedBackRef
	290	digit_char n named-backref-3 doContinueNamedBackRef
	291	'>' n expr-quant doCompleteNamedBackRef
	292	default errorDeath doBadNamedCapture
	293
	294
	295	#
	296	# [set expression] parsing,
	297	# All states involved in parsing set expressions have names beginning with "set-"
	298	#
	299
	300	set-open:
	301	'^' n set-open2 doSetNegate
	302	':' set-posix doSetPosixProp
	303	default set-open2
	304
	305	set-open2:
	306	']' n set-after-lit doSetLiteral
	307	default set-start
	308
	309	# set-posix:
	310	# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
	311	# moved the scan to the closing ']'. If it wasn't a property
	312	# expression, the scan will still be at the opening ':', which should
	313	# be interpreted as a normal set expression.
	314	set-posix:
	315	']' n pop doSetEnd
	316	':' set-start
	317	default errorDeath doRuleError # should not be possible.
	318
	319	#
	320	# set-start after the [ and special case leading characters (^ and/or ]) but before
	321	# everything else. A '-' is literal at this point.
	322	#
	323	set-start:
	324	']' n pop doSetEnd
	325	'[' n set-open ^set-after-set doSetBeginUnion
	326	'\' n set-escape
	327	'-' n set-start-dash
	328	'&' n set-start-amp
	329	default n set-after-lit doSetLiteral
	330
	331	# set-start-dash Turn "[--" into a syntax error.
	332	# "[-x" is good, - and x are literals.
	333	#
	334	set-start-dash:
	335	'-' errorDeath doRuleError
	336	default set-after-lit doSetAddDash
	337
	338	# set-start-amp Turn "[&&" into a syntax error.
	339	# "[&x" is good, & and x are literals.
	340	#
	341	set-start-amp:
	342	'&' errorDeath doRuleError
	343	default set-after-lit doSetAddAmp
	344
	345	#
	346	# set-after-lit The last thing scanned was a literal character within a set.
	347	# Can be followed by anything. Single '-' or '&' are
	348	# literals in this context, not operators.
	349	set-after-lit:
	350	']' n pop doSetEnd
	351	'[' n set-open ^set-after-set doSetBeginUnion
	352	'-' n set-lit-dash
	353	'&' n set-lit-amp
	354	'\' n set-escape
	355	eof errorDeath doSetNoCloseError
	356	default n set-after-lit doSetLiteral
	357
	358	set-after-set:
	359	']' n pop doSetEnd
	360	'[' n set-open ^set-after-set doSetBeginUnion
	361	'-' n set-set-dash
	362	'&' n set-set-amp
	363	'\' n set-escape
	364	eof errorDeath doSetNoCloseError
	365	default n set-after-lit doSetLiteral
	366
	367	set-after-range:
	368	']' n pop doSetEnd
	369	'[' n set-open ^set-after-set doSetBeginUnion
	370	'-' n set-range-dash
	371	'&' n set-range-amp
	372	'\' n set-escape
	373	eof errorDeath doSetNoCloseError
	374	default n set-after-lit doSetLiteral
	375
	376
	377	# set-after-op
	378	# After a -- or &&
	379	# It is an error to close a set at this point.
	380	#
	381	set-after-op:
	382	'[' n set-open ^set-after-set doSetBeginUnion
	383	']' errorDeath doSetOpError
	384	'\' n set-escape
	385	default n set-after-lit doSetLiteral
	386
	387	#
	388	# set-set-amp
	389	# Have scanned [[set]&
	390	# Could be a '&' intersection operator, if a set follows.
	391	# Could be the start of a '&&' operator.
	392	# Otherwise is a literal.
	393	set-set-amp:
	394	'[' n set-open ^set-after-set doSetBeginIntersection1
	395	'&' n set-after-op doSetIntersection2
	396	default set-after-lit doSetAddAmp
	397
	398
	399	# set-lit-amp Have scanned "[literals&"
	400	# Could be a start of "&&" operator or a literal
	401	# In [abc&[def]], the '&' is a literal
	402	#
	403	set-lit-amp:
	404	'&' n set-after-op doSetIntersection2
	405	default set-after-lit doSetAddAmp
	406
	407
	408	#
	409	# set-set-dash
	410	# Have scanned [set]-
	411	# Could be a '-' difference operator, if a [set] follows.
	412	# Could be the start of a '--' operator.
	413	# Otherwise is a literal.
	414	set-set-dash:
	415	'[' n set-open ^set-after-set doSetBeginDifference1
	416	'-' n set-after-op doSetDifference2
	417	default set-after-lit doSetAddDash
	418
	419
	420	#
	421	# set-range-dash
	422	# scanned a-b- or \w-
	423	# any set or range like item where the trailing single '-' should
	424	# be literal, not a set difference operation.
	425	# A trailing "--" is still a difference operator.
	426	set-range-dash:
	427	'-' n set-after-op doSetDifference2
	428	default set-after-lit doSetAddDash
	429
	430
	431	set-range-amp:
	432	'&' n set-after-op doSetIntersection2
	433	default set-after-lit doSetAddAmp
	434
	435
	436	# set-lit-dash
	437	# Have scanned "[literals-" Could be a range or a -- operator or a literal
	438	# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
	439	# [abc-\p{xx}] the '-' is an error
	440	# [abc-] the '-' is a literal
	441	# [ab-xy] the '-' is a range
	442	#
	443	set-lit-dash:
	444	'-' n set-after-op doSetDifference2
	445	'[' set-after-lit doSetAddDash
	446	']' set-after-lit doSetAddDash
	447	'\' n set-lit-dash-escape
	448	default n set-after-range doSetRange
	449
	450	# set-lit-dash-escape
	451	#
	452	# scanned "[literal-\"
	453	# Could be a range, if the \ introduces an escaped literal char or a named char.
	454	# Otherwise it is an error.
	455	#
	456	set-lit-dash-escape:
	457	's' errorDeath doSetOpError
	458	'S' errorDeath doSetOpError
	459	'w' errorDeath doSetOpError
	460	'W' errorDeath doSetOpError
	461	'd' errorDeath doSetOpError
	462	'D' errorDeath doSetOpError
	463	'N' set-after-range doSetNamedRange
	464	default n set-after-range doSetRange
	465
	466
	467	#
	468	# set-escape
	469	# Common back-slash escape processing within set expressions
	470	#
	471	set-escape:
	472	'p' set-after-set doSetProp
	473	'P' set-after-set doSetProp
	474	'N' set-after-lit doSetNamedChar
	475	's' n set-after-range doSetBackslash_s
	476	'S' n set-after-range doSetBackslash_S
	477	'w' n set-after-range doSetBackslash_w
	478	'W' n set-after-range doSetBackslash_W
	479	'd' n set-after-range doSetBackslash_d
	480	'D' n set-after-range doSetBackslash_D
	481	'h' n set-after-range doSetBackslash_h
	482	'H' n set-after-range doSetBackslash_H
	483	'v' n set-after-range doSetBackslash_v
	484	'V' n set-after-range doSetBackslash_V
	485	default n set-after-lit doSetLiteralEscaped
	486
	487	#
	488	# set-finish
	489	# Have just encountered the final ']' that completes a [set], and
	490	# arrived here via a pop. From here, we exit the set parsing world, and go
	491	# back to generic regular expression parsing.
	492	#
	493	set-finish:
	494	default expr-quant doSetFinish
	495
	496
	497	#
	498	# errorDeath. This state is specified as the next state whenever a syntax error
	499	# in the source rules is detected. Barring bugs, the state machine will never
	500	# actually get here, but will stop because of the action associated with the error.

1

2

# License & terms of use: http://www.unicode.org/copyright.html

3

#*****************************************************************************

#

#

#*****************************************************************************

9

#

10

# file: regexcst.txt

11

# ICU Regular Expression Parser State Table

12

#

13

# This state table is used when reading and parsing a regular expression pattern

14

# The pattern parser uses a state machine; the data in this file define the

15

# state transitions that occur for each input character.

16

#

17

# *** This file defines the regex pattern grammar. This is it.

18

# *** The determination of what is accepted is here.

19

#

20

# This file is processed by a perl script "regexcst.pl" to produce initialized C arrays

21

# that are then built with the rule parser.

#

#

# Here is the syntax of the state definitions in this file:

#

#

#StateName:

# input-char n next-state ^push-state action

30

# input-char n next-state ^push-state action

31

# | | | | |

32

# | | | | |--- action to be performed by state machine

33

# | | | | See function RBBIRuleScanner::doParseActions()

34

# | | | |

35

# | | | |--- Push this named state onto the state stack.

36

# | | | Later, when next state is specified as "pop",

37

# | | | the pushed state will become the current state.

38

# | | |

39

# | | |--- Transition to this state if the current input character matches the input

40

# | | character or char class in the left hand column. "pop" causes the next

41

# | | state to be popped from the state stack.

42

# | |

43

# | |--- When making the state transition specified on this line, advance to the next

44

# | character from the input only if 'n' appears here.

45

# |

46

# |--- Character or named character classes to test for. If the current character being scanned

47

# matches, perform the actions and go to the state specified on this line.

48

# The input character is tested sequentially, in the order written. The characters and

49

# character classes tested for do not need to be mutually exclusive. The first match wins.

#

#

# start state, scan position is at the beginning of the pattern.

57

#

58

start:

59

default term doPatStart

#

# term. At a position where we can accept the start of most items in a pattern.

66

#

67

term:

68

quoted n expr-quant doLiteralChar

69

rule_char n expr-quant doLiteralChar

70

'[' n set-open ^set-finish doSetBegin

71

'(' n open-paren

72

'.' n expr-quant doDotAny

73

'^' n expr-quant doCaret

74

'$' n expr-quant doDollar

75

'\' n backslash

76

'|' n term doOrOperator

77

')' n pop doCloseParen

78

eof term doPatFinish

79

default errorDeath doRuleError

#

# expr-quant We've just finished scanning a term, now look for the optional

85

# trailing quantifier - *, +, ?, *?, etc.

#

expr-quant:

'*' n quant-star

'+' n quant-plus

'?' n quant-opt

'{' n interval-open doIntervalInit

92

'(' n open-paren-quant

default expr-cont

#

# expr-cont Expression, continuation. At a point where additional terms are

98

# allowed, but not required. No Quantifiers

99

#

100

expr-cont:

101

'|' n term doOrOperator

102

')' n pop doCloseParen

default term

#

# open-paren-quant Special case handling for comments appearing before a quantifier,

108

# e.g. x(?#comment )*

109

# Open parens from expr-quant come here; anything but a (?# comment

110

# branches into the normal parenthesis sequence as quickly as possible.

111

#

112

open-paren-quant:

113

'?' n open-paren-quant2 doSuppressComments

default open-paren

open-paren-quant2:

'#' n paren-comment ^expr-quant

118

default open-paren-extended

#

# open-paren We've got an open paren. We need to scan further to

123

# determine what kind of parenthesis it is - plain (, (?:, (?>, or whatever.

124

#

125

open-paren:

126

'?' n open-paren-extended doSuppressComments

127

default term ^expr-quant doOpenCaptureParen

128

129

open-paren-extended:

130

':' n term ^expr-quant doOpenNonCaptureParen # (?:

131

'>' n term ^expr-quant doOpenAtomicParen # (?>

132

'=' n term ^expr-cont doOpenLookAhead # (?=

133

'!' n term ^expr-cont doOpenLookAheadNeg # (?!

134

'<' n open-paren-lookbehind

135

'#' n paren-comment ^term

136

'i' paren-flag doBeginMatchMode

137

'd' paren-flag doBeginMatchMode

138

'm' paren-flag doBeginMatchMode

139

's' paren-flag doBeginMatchMode

140

'u' paren-flag doBeginMatchMode

141

'w' paren-flag doBeginMatchMode

142

'x' paren-flag doBeginMatchMode

143

'-' paren-flag doBeginMatchMode

144

'(' n errorDeath doConditionalExpr

145

'{' n errorDeath doPerlInline

146

default errorDeath doBadOpenParenType

147

148

open-paren-lookbehind:

149

'=' n term ^expr-cont doOpenLookBehind # (?<=

150

'!' n term ^expr-cont doOpenLookBehindNeg # (?<!

151

ascii_letter named-capture doBeginNamedCapture # (?<name

152

default errorDeath doBadOpenParenType

#

# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'

#

paren-comment:

')' n pop

eof errorDeath doMismatchedParenErr

161

default n paren-comment

162

163

#

164

# paren-flag Scanned a (?ismx-ismx flag setting

165

#

166

paren-flag:

167

'i' n paren-flag doMatchMode

168

'd' n paren-flag doMatchMode

169

'm' n paren-flag doMatchMode

170

's' n paren-flag doMatchMode

171

'u' n paren-flag doMatchMode

172

'w' n paren-flag doMatchMode

173

'x' n paren-flag doMatchMode

174

'-' n paren-flag doMatchMode

175

')' n term doSetMatchMode

176

':' n term ^expr-quant doMatchModeParen

177

default errorDeath doBadModeFlag

178

179

#

180

# named-capture (?<name> ... ), position currently on the name.

181

#

182

named-capture:

183

ascii_letter n named-capture doContinueNamedCapture

184

digit_char n named-capture doContinueNamedCapture

185

'>' n term ^expr-quant doOpenCaptureParen # common w non-named capture.

186

default errorDeath doBadNamedCapture

187

188

#

189

# quant-star Scanning a '*' quantifier. Need to look ahead to decide

190

# between plain '*', '*?', '*+'

191

#

192

quant-star:

193

'?' n expr-cont doNGStar # *?

194

'+' n expr-cont doPossessiveStar # *+

195

default expr-cont doStar

#

# quant-plus Scanning a '+' quantifier. Need to look ahead to decide

200

# between plain '+', '+?', '++'

201

#

202

quant-plus:

203

'?' n expr-cont doNGPlus # +?

204

'+' n expr-cont doPossessivePlus # ++

205

default expr-cont doPlus

#

# quant-opt Scanning a '?' quantifier. Need to look ahead to decide

210

# between plain '?', '??', '?+'

211

#

212

quant-opt:

213

'?' n expr-cont doNGOpt # ??

214

'+' n expr-cont doPossessiveOpt # ?+

215

default expr-cont doOpt # ?

#

# Interval scanning a '{', the opening delimiter for an interval specification

220

# {number} or {min, max} or {min,}

221

#

222

interval-open:

223

digit_char interval-lower

224

default errorDeath doIntervalError

225

226

interval-lower:

227

digit_char n interval-lower doIntevalLowerDigit

228

',' n interval-upper

229

'}' n interval-type doIntervalSame # {n}

230

default errorDeath doIntervalError

231

232

interval-upper:

233

digit_char n interval-upper doIntervalUpperDigit

234

'}' n interval-type

235

default errorDeath doIntervalError

236

237

interval-type:

238

'?' n expr-cont doNGInterval # {n,m}?

239

'+' n expr-cont doPossessiveInterval # {n,m}+

240

default expr-cont doInterval # {m,n}

#

# backslash # Backslash. Figure out which of the \thingies we have encountered.

245

# The low level next-char function will have preprocessed

246

# some of them already; those won't come here.

247

backslash:

248

'A' n term doBackslashA

249

'B' n term doBackslashB

250

'b' n term doBackslashb

251

'd' n expr-quant doBackslashd

252

'D' n expr-quant doBackslashD

253

'G' n term doBackslashG

254

'h' n expr-quant doBackslashh

255

'H' n expr-quant doBackslashH

256

'k' n named-backref

257

'N' expr-quant doNamedChar # \N{NAME} named char

258

'p' expr-quant doProperty # \p{Lu} style property

259

'P' expr-quant doProperty

260

'R' n expr-quant doBackslashR

261

'Q' n term doEnterQuoteMode

262

'S' n expr-quant doBackslashS

263

's' n expr-quant doBackslashs

264

'v' n expr-quant doBackslashv

265

'V' n expr-quant doBackslashV

266

'W' n expr-quant doBackslashW

267

'w' n expr-quant doBackslashw

268

'X' n expr-quant doBackslashX

269

'Z' n term doBackslashZ

270

'z' n term doBackslashz

271

digit_char n expr-quant doBackRef # Will scan multiple digits

272

eof errorDeath doEscapeError

273

default n expr-quant doEscapedLiteralChar

274

275

276

# named-backref Scanned \k

277

# Leading to \k<captureName>

278

# Failure to get the full sequence is an error.

279

#

280

named-backref:

281

'<' n named-backref-2 doBeginNamedBackRef

282

default errorDeath doBadNamedCapture

283

284

named-backref-2:

285

ascii_letter n named-backref-3 doContinueNamedBackRef

286

default errorDeath doBadNamedCapture

287

288

named-backref-3:

289

ascii_letter n named-backref-3 doContinueNamedBackRef

290

digit_char n named-backref-3 doContinueNamedBackRef

291

'>' n expr-quant doCompleteNamedBackRef

292

default errorDeath doBadNamedCapture

#

# [set expression] parsing,

297

# All states involved in parsing set expressions have names beginning with "set-"

#

set-open:

'^' n set-open2 doSetNegate

302

':' set-posix doSetPosixProp

default set-open2

set-open2:

']' n set-after-lit doSetLiteral

default set-start

# set-posix:

# scanned a '[:' If it really is a [:property:], doSetPosixProp will have

311

# moved the scan to the closing ']'. If it wasn't a property

312

# expression, the scan will still be at the opening ':', which should

313

# be interpreted as a normal set expression.

set-posix:

']' n pop doSetEnd

':' set-start

default errorDeath doRuleError # should not be possible.

318

319

#

320

# set-start after the [ and special case leading characters (^ and/or ]) but before

321

# everything else. A '-' is literal at this point.

#

set-start:

']' n pop doSetEnd

'[' n set-open ^set-after-set doSetBeginUnion

'\' n set-escape

'-' n set-start-dash

'&' n set-start-amp

default n set-after-lit doSetLiteral

330

331

# set-start-dash Turn "[--" into a syntax error.

332

# "[-x" is good, - and x are literals.

333

#

334

set-start-dash:

335

'-' errorDeath doRuleError

336

default set-after-lit doSetAddDash

337

338

# set-start-amp Turn "[&&" into a syntax error.

339

# "[&x" is good, & and x are literals.

340

#

341

set-start-amp:

342

'&' errorDeath doRuleError

343

default set-after-lit doSetAddAmp

344

345

#

346

# set-after-lit The last thing scanned was a literal character within a set.

347

# Can be followed by anything. Single '-' or '&' are

348

# literals in this context, not operators.

349

set-after-lit:

350

']' n pop doSetEnd

351

'[' n set-open ^set-after-set doSetBeginUnion

'-' n set-lit-dash

'&' n set-lit-amp

'\' n set-escape

eof errorDeath doSetNoCloseError

356

default n set-after-lit doSetLiteral

set-after-set:

']' n pop doSetEnd

'[' n set-open ^set-after-set doSetBeginUnion

'-' n set-set-dash

'&' n set-set-amp

'\' n set-escape

eof errorDeath doSetNoCloseError

365

default n set-after-lit doSetLiteral

set-after-range:

']' n pop doSetEnd

'[' n set-open ^set-after-set doSetBeginUnion

'-' n set-range-dash

'&' n set-range-amp

'\' n set-escape

eof errorDeath doSetNoCloseError

374

default n set-after-lit doSetLiteral

# set-after-op

# After a -- or &&

# It is an error to close a set at this point.

380

#

381

set-after-op:

382

'[' n set-open ^set-after-set doSetBeginUnion

383

']' errorDeath doSetOpError

384

'\' n set-escape

385

default n set-after-lit doSetLiteral

#

# set-set-amp

# Have scanned [[set]&

390

# Could be a '&' intersection operator, if a set follows.

391

# Could be the start of a '&&' operator.

392

# Otherwise is a literal.

393

set-set-amp:

394

'[' n set-open ^set-after-set doSetBeginIntersection1

395

'&' n set-after-op doSetIntersection2

396

default set-after-lit doSetAddAmp

397

398

399

# set-lit-amp Have scanned "[literals&"

400

# Could be a start of "&&" operator or a literal

401

# In [abc&[def]], the '&' is a literal

402

#

403

set-lit-amp:

404

'&' n set-after-op doSetIntersection2

405

default set-after-lit doSetAddAmp

#

# set-set-dash

# Have scanned [set]-

411

# Could be a '-' difference operator, if a [set] follows.

412

# Could be the start of a '--' operator.

413

# Otherwise is a literal.

414

set-set-dash:

415

'[' n set-open ^set-after-set doSetBeginDifference1

416

'-' n set-after-op doSetDifference2

417

default set-after-lit doSetAddDash

#

# set-range-dash

# scanned a-b- or \w-

423

# any set or range like item where the trailing single '-' should

424

# be literal, not a set difference operation.

425

# A trailing "--" is still a difference operator.

426

set-range-dash:

427

'-' n set-after-op doSetDifference2

428

default set-after-lit doSetAddDash

set-range-amp:

'&' n set-after-op doSetIntersection2

433

default set-after-lit doSetAddAmp

# set-lit-dash

# Have scanned "[literals-" Could be a range or a -- operator or a literal

438

# In [abc-[def]], the '-' is a literal (confirmed with a Java test)

439

# [abc-\p{xx}] the '-' is an error

440

# [abc-] the '-' is a literal

441

# [ab-xy] the '-' is a range

442

#

443

set-lit-dash:

444

'-' n set-after-op doSetDifference2

445

'[' set-after-lit doSetAddDash

446

']' set-after-lit doSetAddDash

447

'\' n set-lit-dash-escape

448

default n set-after-range doSetRange

449

450

# set-lit-dash-escape

451

#

452

# scanned "[literal-\"

453

# Could be a range, if the \ introduces an escaped literal char or a named char.

454

# Otherwise it is an error.

455

#

456

set-lit-dash-escape:

457

's' errorDeath doSetOpError

458

'S' errorDeath doSetOpError

459

'w' errorDeath doSetOpError

460

'W' errorDeath doSetOpError

461

'd' errorDeath doSetOpError

462

'D' errorDeath doSetOpError

463

'N' set-after-range doSetNamedRange

464

default n set-after-range doSetRange

#

# set-escape

# Common back-slash escape processing within set expressions

470

#

471

set-escape:

472

'p' set-after-set doSetProp

473

'P' set-after-set doSetProp

474

'N' set-after-lit doSetNamedChar

475

's' n set-after-range doSetBackslash_s

476

'S' n set-after-range doSetBackslash_S

477

'w' n set-after-range doSetBackslash_w

478

'W' n set-after-range doSetBackslash_W

479

'd' n set-after-range doSetBackslash_d

480

'D' n set-after-range doSetBackslash_D

481

'h' n set-after-range doSetBackslash_h

482

'H' n set-after-range doSetBackslash_H

483

'v' n set-after-range doSetBackslash_v

484

'V' n set-after-range doSetBackslash_V

485

default n set-after-lit doSetLiteralEscaped

#

# set-finish

# Have just encountered the final ']' that completes a [set], and

490

# arrived here via a pop. From here, we exit the set parsing world, and go

491

# back to generic regular expression parsing.

492

#

493

set-finish:

494

default expr-quant doSetFinish

#

# errorDeath. This state is specified as the next state whenever a syntax error

499

# in the source rules is detected. Barring bugs, the state machine will never

500

# actually get here, but will stop because of the action associated with the error.

501

# But, just in case, this state asks the state machine to exit.

502

errorDeath:

503

default n errorDeath doExit

504

505