git.saurik.com Git - apple/icu.git/blame_incremental

Commit	Line	Data
	1
	2	#*****************************************************************************
	3	#
	4	# Copyright (C) 2002-2016, International Business Machines Corporation and others.
	5	# All Rights Reserved.
	6	#
	7	#*****************************************************************************
	8	#
	9	# file: rbbirpt.txt
	10	# ICU Break Iterator Rule Parser State Table
	11	#
	12	# This state table is used when reading and parsing a set of RBBI rules
	13	# The rule parser uses a state machine; the data in this file define the
	14	# state transitions that occur for each input character.
	15	#
	16	# *** This file defines the RBBI rule grammar. This is it.
	17	# *** The determination of what is accepted is here.
	18	#
	19	# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays
	20	# that are then built with the rule parser.
	21	#
	22	# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h
	23
	24	#
	25	# Here is the syntax of the state definitions in this file:
	26	#
	27	#
	28	#StateName:
	29	#   input-char           n next-state           ^push-state     action
	30	#   input-char           n next-state           ^push-state     action
	31	#       |                |   |                      |             |
	32	#       |                |   |                      |             |--- action to be performed by state machine
	33	#       |                |   |                      |                  See function RBBIRuleScanner::doParseActions()
	34	#       |                |   |                      |
	35	#       |                |   |                      |--- Push this named state onto the state stack.
	36	#       |                |   |                           Later, when next state is specified as "pop",
	37	#       |                |   |                           the pushed state will become the current state.
	38	#       |                |   |
	39	#       |                |   |--- Transition to this state if the current input character matches the input
	40	#       |                |        character or char class in the left hand column. "pop" causes the next
	41	#       |                |        state to be popped from the state stack.
	42	#       |                |
	43	#       |                |--- When making the state transition specified on this line, advance to the next
	44	#       |                     character from the input only if 'n' appears here.
	45	#       |
	46	#       |--- Character or named character classes to test for. If the current character being scanned
	47	#            matches, perform the actions and go to the state specified on this line.
	48	#            The input character is tested sequentially, in the order written. The characters and
	49	#            character classes tested for do not need to be mutually exclusive. The first match wins.
	50	#
	51
	52
	53
	54
	55	#
	56	# start state, scan position is at the beginning of the rules file, or in between two rules.
	57	#
	58	start:
	59	escaped term ^break-rule-end doExprStart
	60	white_space n start
	61	'^' n start-after-caret ^break-rule-end doNoChain
	62	'$' scan-var-name ^assign-or-rule doExprStart
	63	'!' n rev-option
	64	';' n start # ignore empty rules.
	65	eof exit
	66	default term ^break-rule-end doExprStart
	67
	68	#
	69	# break-rule-end: Returned from doing a break-rule expression.
	70	#
	71	break-rule-end:
	72	';' n start doEndOfRule
	73	white_space n break-rule-end
	74	default errorDeath doRuleError
	75
	76	#
	77	# start of a rule, after having seen a '^' (inhibits rule chain in).
	78	# Similar to the main 'start' state in most respects, except
	79	# - empty rule is an error.
	80	# - A second '^' is an error.
	81	#
	82	start-after-caret:
	83	escaped term doExprStart
	84	white_space n start-after-caret
	85	'^' errorDeath doRuleError # two '^'s
	86	'$' scan-var-name ^term-var-ref doExprStart
	87	';' errorDeath doRuleError # ^ ;
	88	eof errorDeath doRuleError
	89	default term doExprStart
	90
	91	#
	92	# ! We've just scanned a '!', indicating either a !!key word flag or a
	93	# !Reverse rule.
	94	#
	95	rev-option:
	96	'!' n option-scan1
	97	default reverse-rule ^break-rule-end doReverseDir
	98
	99	option-scan1:
	100	name_start_char n option-scan2 doOptionStart
	101	default errorDeath doRuleError
	102
	103	option-scan2:
	104	name_char n option-scan2
	105	default option-scan3 doOptionEnd
	106
	107	option-scan3:
	108	';' n start
	109	white_space n option-scan3
	110	default errorDeath doRuleError
	111
	112
	113	reverse-rule:
	114	default term ^break-rule-end doExprStart
	115
	116
	117	#
	118	# term. Eat through a single rule character, or a composite thing, which
	119	# could be a parenthesized expression, a variable name, or a Unicode Set.
	120	#
	121	term:
	122	escaped n expr-mod doRuleChar
	123	white_space n term
	124	rule_char n expr-mod doRuleChar
	125	'[' scan-unicode-set ^expr-mod
	126	'(' n term ^expr-mod doLParen
	127	'$' scan-var-name ^term-var-ref
	128	'.' n expr-mod doDotAny
	129	default errorDeath doRuleError
	130
	131
	132
	133	#
	134	# term-var-ref We've just finished scanning a reference to a $variable.
	135	# Check that the variable was defined.
	136	# The variable name scanning is in common with assignment statements,
	137	# so the check can't be done there.
	138	term-var-ref:
	139	default expr-mod doCheckVarDef
	140
	141
	142	#
	143	# expr-mod We've just finished scanning a term, now look for the optional
	144	# trailing '*', '?', '+'
	145	#
	146	expr-mod:
	147	white_space n expr-mod
	148	'*' n expr-cont doUnaryOpStar
	149	'+' n expr-cont doUnaryOpPlus
	150	'?' n expr-cont doUnaryOpQuestion
	151	default expr-cont
	152
	153
	154	#
	155	# expr-cont Expression, continuation. At a point where additional terms are
	156	# allowed, but not required.
	157	#
	158	expr-cont:
	159	escaped term doExprCatOperator
	160	white_space n expr-cont
	161	rule_char term doExprCatOperator
	162	'[' term doExprCatOperator
	163	'(' term doExprCatOperator
	164	'$' term doExprCatOperator
	165	'.' term doExprCatOperator
	166	'/' look-ahead doExprCatOperator
	167	'{' n tag-open doExprCatOperator
	168	'|' n term doExprOrOperator
	169	')' n pop doExprRParen
	170	default pop doExprFinished
	171
	172
	173	#
	174	# look-ahead Scanning a '/', which identifies a break point, assuming that the
	175	# remainder of the expression matches.
	176	#
	177	# Generate a parse tree as if this was a special kind of input symbol
	178	# appearing in an otherwise normal concatenation expression.
	179	#
	180	look-ahead:
	181	'/' n expr-cont-no-slash doSlash
	182	default errorDeath
	183
	184
	185	#
	186	# expr-cont-no-slash Expression, continuation. At a point where additional terms are
	187	# allowed, but not required. Just like
	188	# expr-cont, above, except that no '/'
	189	# look-ahead symbol is permitted.
	190	#
	191	expr-cont-no-slash:
	192	escaped term doExprCatOperator
	193	white_space n expr-cont
	194	rule_char term doExprCatOperator
	195	'[' term doExprCatOperator
	196	'(' term doExprCatOperator
	197	'$' term doExprCatOperator
	198	'.' term doExprCatOperator
	199	'|' n term doExprOrOperator
	200	')' n pop doExprRParen
	201	default pop doExprFinished
	202
	203
	204	#
	205	# tags scanning a '{', the opening delimiter for a tag that identifies
	206	# the kind of match. Scan the whole {dddd} tag, where d=digit
	207	#
	208	tag-open:
	209	white_space n tag-open
	210	digit_char tag-value doStartTagValue
	211	default errorDeath doTagExpectedError
	212
	213	tag-value:
	214	white_space n tag-close
	215	'}' tag-close
	216	digit_char n tag-value doTagDigit
	217	default errorDeath doTagExpectedError
	218
	219	tag-close:
	220	white_space n tag-close
	221	'}' n expr-cont-no-tag doTagValue
	222	default errorDeath doTagExpectedError
	223
	224
	225
	226	#
	227	# expr-cont-no-tag Expression, continuation. At a point where additional terms are
	228	# allowed, but not required. Just like
	229	# expr-cont, above, except that no "{ddd}"
	230	# tagging is permitted.
	231	#
	232	expr-cont-no-tag:
	233	escaped term doExprCatOperator
	234	white_space n expr-cont-no-tag
	235	rule_char term doExprCatOperator
	236	'[' term doExprCatOperator
	237	'(' term doExprCatOperator
	238	'$' term doExprCatOperator
	239	'.' term doExprCatOperator
	240	'/' look-ahead doExprCatOperator
	241	'|' n term doExprOrOperator
	242	')' n pop doExprRParen
	243	default pop doExprFinished
	244
	245
	246
	247
	248	#
	249	# Variable Name Scanning.
	250	#
	251	# The state that branched to here must have pushed a return state
	252	# to go to after completion of the variable name scanning.
	253	#
	254	# The current input character must be the $ that introduces the name.
	255	# The $ is consumed here rather than in the state that first detected it
	256	# so that the doStartVariableName action only needs to happen in one
	257	# place (here), and the other states don't need to worry about it.
	258	#
	259	scan-var-name:
	260	'$' n scan-var-start doStartVariableName
	261	default errorDeath
	262
	263
	264	scan-var-start:
	265	name_start_char n scan-var-body
	266	default errorDeath doVariableNameExpectedErr
	267
	268	scan-var-body:
	269	name_char n scan-var-body
	270	default pop doEndVariableName
	271
	272
	273
	274	#
	275	# scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.
	276	# Within the RBBI parser, after finding the first character
	277	# of a Unicode Set, we just hand the rule input at that
	278	# point off to the Unicode Set constructor, then pick
	279	# up parsing after the close of the set.
	280	#
	281	# The action for this state invokes the UnicodeSet parser.
	282	#
	283	scan-unicode-set:
	284	'[' n pop doScanUnicodeSet
	285	'p' n pop doScanUnicodeSet
	286	'P' n pop doScanUnicodeSet
	287	default errorDeath
	288
	289
	290
	291
	292
	293
	294
	295	#
	296	# assign-or-rule. A $variable was encountered at the start of something, could be
	297	# either an assignment statement or a rule, depending on whether an '='
	298	# follows the variable name. We get to this state when the variable name
	299	# scanning does a return.
	300	#
	301	assign-or-rule:
	302	white_space n assign-or-rule
	303	'=' n term ^assign-end doStartAssign # variable was target of assignment
	304	default term-var-ref ^break-rule-end # variable was a term in a rule
	305
	306
	307
	308	#
	309	# assign-end This state is entered when the end of the expression on the
	310	# right hand side of an assignment is found. We get here via
	311	# a pop; this state is pushed when the '=' in an assignment is found.
	312	#
	313	# The only thing allowed at this point is a ';'. The RHS of an
	314	# assignment must look like a rule expression, and we come here
	315	# when what is being scanned no longer looks like an expression.
	316	#
	317	assign-end:
	318	';' n start doEndAssign
	319	default errorDeath doRuleErrorAssignExpr
	320
	321
	322
	323	#
	324	# errorDeath. This state is specified as the next state whenever a syntax error
	325	# in the source rules is detected. Barring bugs, the state machine will never
	326	# actually get here, but will stop because of the action associated with the error.
	327	# But, just in case, this state asks the state machine to exit.
	328	errorDeath:
	329	default n errorDeath doExit
	330
	331

1

2

#*****************************************************************************

#

#

#*****************************************************************************

8

#

9

# file: rbbirpt.txt

10

# ICU Break Iterator Rule Parser State Table

11

#

12

# This state table is used when reading and parsing a set of RBBI rules

13

# The rule parser uses a state machine; the data in this file define the

14

# state transitions that occur for each input character.

15

#

16

# *** This file defines the RBBI rule grammar. This is it.

17

# *** The determination of what is accepted is here.

18

#

19

# This file is processed by a perl script "rbbicst.pl" to produce initialized C arrays

20

# that are then built with the rule parser.

21

#

22

# perl rbbicst.pl < rbbirpt.txt > rbbirpt.h

23

24

#

25

# Here is the syntax of the state definitions in this file:

#

#

#StateName:

# input-char n next-state ^push-state action

30

# input-char n next-state ^push-state action

31

# | | | | |

32

# | | | | |--- action to be performed by state machine

33

# | | | | See function RBBIRuleScanner::doParseActions()

34

# | | | |

35

# | | | |--- Push this named state onto the state stack.

36

# | | | Later, when next state is specified as "pop",

37

# | | | the pushed state will become the current state.

38

# | | |

39

# | | |--- Transition to this state if the current input character matches the input

40

# | | character or char class in the left hand column. "pop" causes the next

41

# | | state to be popped from the state stack.

42

# | |

43

# | |--- When making the state transition specified on this line, advance to the next

44

# | character from the input only if 'n' appears here.

45

# |

46

# |--- Character or named character classes to test for. If the current character being scanned

47

# matches, perform the actions and go to the state specified on this line.

48

# The input character is tested sequentially, in the order written. The characters and

49

# character classes tested for do not need to be mutually exclusive. The first match wins.

#

#

# start state, scan position is at the beginning of the rules file, or in between two rules.

57

#

58

start:

59

escaped term ^break-rule-end doExprStart

60

white_space n start

61

'^' n start-after-caret ^break-rule-end doNoChain

62

'$' scan-var-name ^assign-or-rule doExprStart

63

'!' n rev-option

64

';' n start # ignore empty rules.

65

eof exit

66

default term ^break-rule-end doExprStart

67

68

#

69

# break-rule-end: Returned from doing a break-rule expression.

70

#

71

break-rule-end:

72

';' n start doEndOfRule

73

white_space n break-rule-end

74

default errorDeath doRuleError

75

76

#

77

# start of a rule, after having seen a '^' (inhibits rule chain in).

78

# Similar to the main 'start' state in most respects, except

79

# - empty rule is an error.

80

# - A second '^' is an error.

81

#

82

start-after-caret:

83

escaped term doExprStart

84

white_space n start-after-caret

85

'^' errorDeath doRuleError # two '^'s

86

'$' scan-var-name ^term-var-ref doExprStart

87

';' errorDeath doRuleError # ^ ;

88

eof errorDeath doRuleError

89

default term doExprStart

90

91

#

92

# ! We've just scanned a '!', indicating either a !!key word flag or a

# !Reverse rule.

#

rev-option:

'!' n option-scan1

default reverse-rule ^break-rule-end doReverseDir

98

99

option-scan1:

100

name_start_char n option-scan2 doOptionStart

101

default errorDeath doRuleError

102

103

option-scan2:

104

name_char n option-scan2

105

default option-scan3 doOptionEnd

option-scan3:

';' n start

white_space n option-scan3

110

default errorDeath doRuleError

reverse-rule:

default term ^break-rule-end doExprStart

#

# term. Eat through a single rule character, or a composite thing, which

119

# could be a parenthesized expression, a variable name, or a Unicode Set.

120

#

121

term:

122

escaped n expr-mod doRuleChar

123

white_space n term

124

rule_char n expr-mod doRuleChar

125

'[' scan-unicode-set ^expr-mod

126

'(' n term ^expr-mod doLParen

127

'$' scan-var-name ^term-var-ref

128

'.' n expr-mod doDotAny

129

default errorDeath doRuleError

#

# term-var-ref We've just finished scanning a reference to a $variable.

135

# Check that the variable was defined.

136

# The variable name scanning is in common with assignment statements,

137

# so the check can't be done there.

138

term-var-ref:

139

default expr-mod doCheckVarDef

#

# expr-mod We've just finished scanning a term, now look for the optional

144

# trailing '*', '?', '+'

145

#

146

expr-mod:

147

white_space n expr-mod

148

'*' n expr-cont doUnaryOpStar

149

'+' n expr-cont doUnaryOpPlus

150

'?' n expr-cont doUnaryOpQuestion

default expr-cont

#

# expr-cont Expression, continuation. At a point where additional terms are

156

# allowed, but not required.

157

#

158

expr-cont:

159

escaped term doExprCatOperator

160

white_space n expr-cont

161

rule_char term doExprCatOperator

162

'[' term doExprCatOperator

163

'(' term doExprCatOperator

164

'$' term doExprCatOperator

165

'.' term doExprCatOperator

166

'/' look-ahead doExprCatOperator

167

'{' n tag-open doExprCatOperator

168

'|' n term doExprOrOperator

169

')' n pop doExprRParen

170

default pop doExprFinished

#

# look-ahead Scanning a '/', which identifies a break point, assuming that the

175

# remainder of the expression matches.

176

#

177

# Generate a parse tree as if this was a special kind of input symbol

178

# appearing in an otherwise normal concatenation expression.

179

#

180

look-ahead:

181

'/' n expr-cont-no-slash doSlash

default errorDeath

#

# expr-cont-no-slash Expression, continuation. At a point where additional terms are

187

# allowed, but not required. Just like

188

# expr-cont, above, except that no '/'

189

# look-ahead symbol is permitted.

190

#

191

expr-cont-no-slash:

192

escaped term doExprCatOperator

193

white_space n expr-cont

194

rule_char term doExprCatOperator

195

'[' term doExprCatOperator

196

'(' term doExprCatOperator

197

'$' term doExprCatOperator

198

'.' term doExprCatOperator

199

'|' n term doExprOrOperator

200

')' n pop doExprRParen

201

default pop doExprFinished

#

# tags scanning a '{', the opening delimiter for a tag that identifies

206

# the kind of match. Scan the whole {dddd} tag, where d=digit

207

#

208

tag-open:

209

white_space n tag-open

210

digit_char tag-value doStartTagValue

211

default errorDeath doTagExpectedError

212

213

tag-value:

214

white_space n tag-close

215

'}' tag-close

216

digit_char n tag-value doTagDigit

217

default errorDeath doTagExpectedError

218

219

tag-close:

220

white_space n tag-close

221

'}' n expr-cont-no-tag doTagValue

222

default errorDeath doTagExpectedError

#

# expr-cont-no-tag Expression, continuation. At a point where additional terms are

228

# allowed, but not required. Just like

229

# expr-cont, above, except that no "{ddd}"

230

# tagging is permitted.

231

#

232

expr-cont-no-tag:

233

escaped term doExprCatOperator

234

white_space n expr-cont-no-tag

235

rule_char term doExprCatOperator

236

'[' term doExprCatOperator

237

'(' term doExprCatOperator

238

'$' term doExprCatOperator

239

'.' term doExprCatOperator

240

'/' look-ahead doExprCatOperator

241

'|' n term doExprOrOperator

242

')' n pop doExprRParen

243

default pop doExprFinished

#

# Variable Name Scanning.

250

#

251

# The state that branched to here must have pushed a return state

252

# to go to after completion of the variable name scanning.

253

#

254

# The current input character must be the $ that introduces the name.

255

# The $ is consumed here rather than in the state that first detected it

256

# so that the doStartVariableName action only needs to happen in one

257

# place (here), and the other states don't need to worry about it.

258

#

259

scan-var-name:

260

'$' n scan-var-start doStartVariableName

default errorDeath

scan-var-start:

name_start_char n scan-var-body

266

default errorDeath doVariableNameExpectedErr

267

268

scan-var-body:

269

name_char n scan-var-body

270

default pop doEndVariableName

#

# scan-unicode-set Unicode Sets are parsed by the UnicodeSet class.

276

# Within the RBBI parser, after finding the first character

277

# of a Unicode Set, we just hand the rule input at that

278

# point off to the Unicode Set constructor, then pick

279

# up parsing after the close of the set.

280

#

281

# The action for this state invokes the UnicodeSet parser.

282

#

283

scan-unicode-set:

284

'[' n pop doScanUnicodeSet

285

'p' n pop doScanUnicodeSet

286

'P' n pop doScanUnicodeSet

default errorDeath

#

# assign-or-rule. A $variable was encountered at the start of something, could be

297

# either an assignment statement or a rule, depending on whether an '='

298

# follows the variable name. We get to this state when the variable name

299

# scanning does a return.

300

#

301

assign-or-rule:

302

white_space n assign-or-rule

303

'=' n term ^assign-end doStartAssign # variable was target of assignment

304

default term-var-ref ^break-rule-end # variable was a term in a rule

#

# assign-end This state is entered when the end of the expression on the

310

# right hand side of an assignment is found. We get here via

311

# a pop; this state is pushed when the '=' in an assignment is found.

312

#

313

# The only thing allowed at this point is a ';'. The RHS of an

314

# assignment must look like a rule expression, and we come here

315

# when what is being scanned no longer looks like an expression.

316

#

317

assign-end:

318

';' n start doEndAssign

319

default errorDeath doRuleErrorAssignExpr

#

# errorDeath. This state is specified as the next state whenever a syntax error

325

# in the source rules is detected. Barring bugs, the state machine will never

326

# actually get here, but will stop because of the action associated with the error.

327

# But, just in case, this state asks the state machine to exit.

328

errorDeath:

329

default n errorDeath doExit

330

331