]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbitblb.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
729e4ab9 3* Copyright (c) 2002-2009, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5**********************************************************************
6*/
73c04bcf
A
7//
8// rbbitblb.cpp
9//
10
b75a7d8f
A
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/unistr.h"
17#include "rbbitblb.h"
18#include "rbbirb.h"
19#include "rbbisetb.h"
20#include "rbbidata.h"
21#include "cstring.h"
22#include "uassert.h"
73c04bcf 23#include "cmemory.h"
b75a7d8f
A
24
25U_NAMESPACE_BEGIN
26
27RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
28 fTree(*rootNode) {
374ca955
A
29 fRB = rb;
30 fStatus = fRB->fStatus;
31 UErrorCode status = U_ZERO_ERROR;
32 fDStates = new UVector(status);
33 if (U_FAILURE(*fStatus)) {
34 return;
35 }
36 if (U_FAILURE(status)) {
37 *fStatus = status;
38 return;
39 }
40 if (fDStates == NULL) {
41 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
42 }
b75a7d8f
A
43}
44
45
46
47RBBITableBuilder::~RBBITableBuilder() {
48 int i;
49 for (i=0; i<fDStates->size(); i++) {
50 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
51 }
52 delete fDStates;
53}
54
55
56//-----------------------------------------------------------------------------
57//
// RBBITableBuilder::build - This is the main function for building the DFA state transition
59// table from the RBBI rules parse tree.
60//
61//-----------------------------------------------------------------------------
62void RBBITableBuilder::build() {
63
64 if (U_FAILURE(*fStatus)) {
65 return;
66 }
67
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
70 if (fTree==NULL) {
71 return;
72 }
73
74 //
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
77 //
78 fTree = fTree->flattenVariables();
73c04bcf 79#ifdef RBBI_DEBUG
b75a7d8f 80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
374ca955 81 RBBIDebugPuts("Parse tree after flattening variable references.");
b75a7d8f
A
82 fTree->printTree(TRUE);
83 }
73c04bcf
A
84#endif
85
86 //
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
91 //
92 if (fRB->fSetBuilder->sawBOF()) {
93 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
46f4442e
A
95 // Delete and exit if memory allocation failed.
96 if (bofTop == NULL || bofLeaf == NULL) {
97 *fStatus = U_MEMORY_ALLOCATION_ERROR;
98 delete bofTop;
99 delete bofLeaf;
100 return;
101 }
73c04bcf
A
102 bofTop->fLeftChild = bofLeaf;
103 bofTop->fRightChild = fTree;
104 bofLeaf->fParent = bofTop;
105 bofLeaf->fVal = 2; // Reserved value for {bof}.
106 fTree = bofTop;
107 }
b75a7d8f
A
108
109 //
110 // Add a unique right-end marker to the expression.
111 // Appears as a cat-node, left child being the original tree,
112 // right child being the end marker.
113 //
114 RBBINode *cn = new RBBINode(RBBINode::opCat);
46f4442e
A
115 // Exit if memory allocation failed.
116 if (cn == NULL) {
117 *fStatus = U_MEMORY_ALLOCATION_ERROR;
118 return;
119 }
b75a7d8f
A
120 cn->fLeftChild = fTree;
121 fTree->fParent = cn;
122 cn->fRightChild = new RBBINode(RBBINode::endMark);
46f4442e
A
123 // Delete and exit if memory allocation failed.
124 if (cn->fRightChild == NULL) {
125 *fStatus = U_MEMORY_ALLOCATION_ERROR;
126 delete cn;
127 return;
128 }
b75a7d8f
A
129 cn->fRightChild->fParent = cn;
130 fTree = cn;
131
132 //
133 // Replace all references to UnicodeSets with the tree for the equivalent
134 // expression.
135 //
136 fTree->flattenSets();
73c04bcf 137#ifdef RBBI_DEBUG
b75a7d8f 138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
374ca955 139 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
b75a7d8f
A
140 fTree->printTree(TRUE);
141 }
73c04bcf 142#endif
b75a7d8f
A
143
144
145 //
146 // calculate the functions nullable, firstpos, lastpos and followpos on
147 // nodes in the parse tree.
148 // See the alogrithm description in Aho.
149 // Understanding how this works by looking at the code alone will be
150 // nearly impossible.
151 //
152 calcNullable(fTree);
153 calcFirstPos(fTree);
154 calcLastPos(fTree);
155 calcFollowPos(fTree);
156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
374ca955 157 RBBIDebugPuts("\n");
b75a7d8f
A
158 printPosSets(fTree);
159 }
160
374ca955
A
161 //
162 // For "chained" rules, modify the followPos sets
163 //
164 if (fRB->fChainRules) {
165 calcChainedFollowPos(fTree);
166 }
167
73c04bcf
A
168 //
169 // BOF (start of input) test fixup.
170 //
171 if (fRB->fSetBuilder->sawBOF()) {
172 bofFixup();
173 }
174
b75a7d8f
A
175 //
176 // Build the DFA state transition tables.
177 //
178 buildStateTable();
179 flagAcceptingStates();
180 flagLookAheadStates();
181 flagTaggedStates();
b75a7d8f 182
374ca955
A
183 //
184 // Update the global table of rule status {tag} values
185 // The rule builder has a global vector of status values that are common
186 // for all tables. Merge the ones from this table into the global set.
187 //
188 mergeRuleStatusVals();
189
190 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
b75a7d8f
A
191}
192
193
194
195//-----------------------------------------------------------------------------
196//
197// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198//
199//-----------------------------------------------------------------------------
200void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237}
238
239
240
241
242//-----------------------------------------------------------------------------
243//
244// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245//
246//-----------------------------------------------------------------------------
247void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
73c04bcf
A
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
b75a7d8f
A
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284}
285
286
287
288//-----------------------------------------------------------------------------
289//
290// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291//
292//-----------------------------------------------------------------------------
293void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
73c04bcf
A
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
b75a7d8f
A
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330}
331
332
333
334//-----------------------------------------------------------------------------
335//
336// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337//
338//-----------------------------------------------------------------------------
339void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376}
377
378
374ca955
A
379//-----------------------------------------------------------------------------
380//
381// calcChainedFollowPos. Modify the previously calculated followPos sets
382// to implement rule chaining. NOT described by Aho
383//
384//-----------------------------------------------------------------------------
385void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
386
387 UVector endMarkerNodes(*fStatus);
388 UVector leafNodes(*fStatus);
389 int32_t i;
390
391 if (U_FAILURE(*fStatus)) {
392 return;
393 }
394
395 // get a list of all endmarker nodes.
396 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
397
73c04bcf 398 // get a list all leaf nodes
374ca955
A
399 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
400 if (U_FAILURE(*fStatus)) {
401 return;
402 }
403
73c04bcf
A
404 // Get all nodes that can be the start a match, which is FirstPosition()
405 // of the portion of the tree corresponding to user-written rules.
406 // See the tree description in bofFixup().
407 RBBINode *userRuleRoot = tree;
408 if (fRB->fSetBuilder->sawBOF()) {
409 userRuleRoot = tree->fLeftChild->fRightChild;
410 }
411 U_ASSERT(userRuleRoot != NULL);
412 UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
374ca955
A
413
414
415 // Iteratate over all leaf nodes,
416 //
417 int32_t endNodeIx;
418 int32_t startNodeIx;
419
420 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
421 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
422 RBBINode *endNode = NULL;
423
424 // Identify leaf nodes that correspond to overall rule match positions.
425 // These include an endMarkerNode in their followPos sets.
426 for (i=0; i<endMarkerNodes.size(); i++) {
427 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
428 endNode = tNode;
429 break;
430 }
431 }
432 if (endNode == NULL) {
433 // node wasn't an end node. Try again with the next.
434 continue;
435 }
436
437 // We've got a node that can end a match.
438
439 // Line Break Specific hack: If this node's val correspond to the $CM char class,
440 // don't chain from it.
441 // TODO: Add rule syntax for this behavior, get specifics out of here and
442 // into the rule file.
b331163b 443 if (fRB->fLBCMNoChain || fRB->fRINoChain) {
374ca955 444 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
73c04bcf
A
445 if (c != -1) {
446 // c == -1 occurs with sets containing only the {eof} marker string.
b331163b
A
447 if (fRB->fLBCMNoChain) {
448 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
449 if (cLBProp == U_LB_COMBINING_MARK) {
450 continue;
451 }
452 }
453 if (fRB->fRINoChain) {
454 UGraphemeClusterBreak cGBProp = (UGraphemeClusterBreak)u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
455 if (cGBProp == U_GCB_REGIONAL_INDICATOR) {
456 continue;
457 }
73c04bcf 458 }
374ca955
A
459 }
460 }
461
462
463 // Now iterate over the nodes that can start a match, looking for ones
464 // with the same char class as our ending node.
465 RBBINode *startNode;
466 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
467 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
468 if (startNode->fType != RBBINode::leafChar) {
469 continue;
470 }
471
472 if (endNode->fVal == startNode->fVal) {
473 // The end val (character class) of one possible match is the
474 // same as the start of another.
475
476 // Add all nodes from the followPos of the start node to the
477 // followPos set of the end node, which will have the effect of
478 // letting matches transition from a match state at endNode
479 // to the second char of a match starting with startNode.
480 setAdd(endNode->fFollowPos, startNode->fFollowPos);
481 }
482 }
483 }
484}
485
486
73c04bcf
A
487//-----------------------------------------------------------------------------
488//
489// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
// Do a swizzle similar to chaining, modifying the followPos set of
// the bofNode to include the followPos nodes from other {bof} nodes
492// scattered through the tree.
493//
494// This function has much in common with calcChainedFollowPos().
495//
496//-----------------------------------------------------------------------------
497void RBBITableBuilder::bofFixup() {
498
499 if (U_FAILURE(*fStatus)) {
500 return;
501 }
502
503 // The parse tree looks like this ...
504 // fTree root ---> <cat>
505 // / \ .
506 // <cat> <#end node>
507 // / \ .
508 // <bofNode> rest
509 // of tree
510 //
511 // We will be adding things to the followPos set of the <bofNode>
512 //
513 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
514 U_ASSERT(bofNode->fType == RBBINode::leafChar);
515 U_ASSERT(bofNode->fVal == 2);
516
517 // Get all nodes that can be the start a match of the user-written rules
518 // (excluding the fake bofNode)
519 // We want the nodes that can start a match in the
520 // part labeled "rest of tree"
521 //
522 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
523
524 RBBINode *startNode;
525 int startNodeIx;
526 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
527 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
528 if (startNode->fType != RBBINode::leafChar) {
529 continue;
530 }
531
532 if (startNode->fVal == bofNode->fVal) {
533 // We found a leaf node corresponding to a {bof} that was
534 // explicitly written into a rule.
535 // Add everything from the followPos set of this node to the
536 // followPos set of the fake bofNode at the start of the tree.
537 //
538 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
539 }
540 }
541}
542
b75a7d8f
A
543//-----------------------------------------------------------------------------
544//
545// buildStateTable() Determine the set of runtime DFA states and the
546// transition tables for these states, by the algorithm
547// of fig. 3.44 in Aho.
548//
// Most of the comments are quotes of Aho's pseudo-code.
550//
551//-----------------------------------------------------------------------------
552void RBBITableBuilder::buildStateTable() {
374ca955
A
553 if (U_FAILURE(*fStatus)) {
554 return;
555 }
46f4442e
A
556 RBBIStateDescriptor *failState;
557 // Set it to NULL to avoid uninitialized warning
558 RBBIStateDescriptor *initialState = NULL;
b75a7d8f
A
559 //
560 // Add a dummy state 0 - the stop state. Not from Aho.
561 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
46f4442e
A
562 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
563 if (failState == NULL) {
564 *fStatus = U_MEMORY_ALLOCATION_ERROR;
565 goto ExitBuildSTdeleteall;
566 }
b75a7d8f 567 failState->fPositions = new UVector(*fStatus);
46f4442e
A
568 if (failState->fPositions == NULL) {
569 *fStatus = U_MEMORY_ALLOCATION_ERROR;
570 }
571 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
572 goto ExitBuildSTdeleteall;
374ca955 573 }
b75a7d8f 574 fDStates->addElement(failState, *fStatus);
374ca955 575 if (U_FAILURE(*fStatus)) {
46f4442e 576 goto ExitBuildSTdeleteall;
374ca955 577 }
b75a7d8f
A
578
579 // initially, the only unmarked state in Dstates is firstpos(root),
580 // where toot is the root of the syntax tree for (r)#;
46f4442e
A
581 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
582 if (initialState == NULL) {
583 *fStatus = U_MEMORY_ALLOCATION_ERROR;
584 }
374ca955 585 if (U_FAILURE(*fStatus)) {
46f4442e 586 goto ExitBuildSTdeleteall;
374ca955 587 }
b75a7d8f 588 initialState->fPositions = new UVector(*fStatus);
46f4442e
A
589 if (initialState->fPositions == NULL) {
590 *fStatus = U_MEMORY_ALLOCATION_ERROR;
591 }
374ca955 592 if (U_FAILURE(*fStatus)) {
46f4442e 593 goto ExitBuildSTdeleteall;
374ca955 594 }
b75a7d8f
A
595 setAdd(initialState->fPositions, fTree->fFirstPosSet);
596 fDStates->addElement(initialState, *fStatus);
374ca955 597 if (U_FAILURE(*fStatus)) {
46f4442e 598 goto ExitBuildSTdeleteall;
374ca955 599 }
b75a7d8f
A
600
601 // while there is an unmarked state T in Dstates do begin
602 for (;;) {
603 RBBIStateDescriptor *T = NULL;
604 int32_t tx;
605 for (tx=1; tx<fDStates->size(); tx++) {
606 RBBIStateDescriptor *temp;
607 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
608 if (temp->fMarked == FALSE) {
609 T = temp;
610 break;
611 }
612 }
613 if (T == NULL) {
614 break;
615 }
616
617 // mark T;
618 T->fMarked = TRUE;
619
620 // for each input symbol a do begin
621 int32_t a;
622 for (a = 1; a<=lastInputSymbol; a++) {
623 // let U be the set of positions that are in followpos(p)
624 // for some position p in T
625 // such that the symbol at position p is a;
626 UVector *U = NULL;
627 RBBINode *p;
628 int32_t px;
629 for (px=0; px<T->fPositions->size(); px++) {
630 p = (RBBINode *)T->fPositions->elementAt(px);
631 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
632 if (U == NULL) {
633 U = new UVector(*fStatus);
46f4442e
A
634 if (U == NULL) {
635 *fStatus = U_MEMORY_ALLOCATION_ERROR;
636 goto ExitBuildSTdeleteall;
637 }
b75a7d8f
A
638 }
639 setAdd(U, p->fFollowPos);
640 }
641 }
642
643 // if U is not empty and not in DStates then
644 int32_t ux = 0;
645 UBool UinDstates = FALSE;
646 if (U != NULL) {
647 U_ASSERT(U->size() > 0);
648 int ix;
649 for (ix=0; ix<fDStates->size(); ix++) {
650 RBBIStateDescriptor *temp2;
651 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
652 if (setEquals(U, temp2->fPositions)) {
653 delete U;
654 U = temp2->fPositions;
655 ux = ix;
656 UinDstates = TRUE;
657 break;
658 }
659 }
660
661 // Add U as an unmarked state to Dstates
662 if (!UinDstates)
663 {
664 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
46f4442e
A
665 if (newState == NULL) {
666 *fStatus = U_MEMORY_ALLOCATION_ERROR;
667 }
374ca955 668 if (U_FAILURE(*fStatus)) {
46f4442e 669 goto ExitBuildSTdeleteall;
374ca955 670 }
b75a7d8f
A
671 newState->fPositions = U;
672 fDStates->addElement(newState, *fStatus);
374ca955
A
673 if (U_FAILURE(*fStatus)) {
674 return;
675 }
b75a7d8f
A
676 ux = fDStates->size()-1;
677 }
678
679 // Dtran[T, a] := U;
680 T->fDtran->setElementAt(ux, a);
681 }
682 }
683 }
46f4442e
A
684 return;
685 // delete local pointers only if error occured.
686ExitBuildSTdeleteall:
687 delete initialState;
688 delete failState;
b75a7d8f
A
689}
690
691
692
693//-----------------------------------------------------------------------------
694//
695// flagAcceptingStates Identify accepting states.
696// First get a list of all of the end marker nodes.
697// Then, for each state s,
698// if s contains one of the end marker nodes in its list of tree positions then
699// s is an accepting state.
700//
701//-----------------------------------------------------------------------------
702void RBBITableBuilder::flagAcceptingStates() {
374ca955
A
703 if (U_FAILURE(*fStatus)) {
704 return;
705 }
b75a7d8f
A
706 UVector endMarkerNodes(*fStatus);
707 RBBINode *endMarker;
708 int32_t i;
709 int32_t n;
710
374ca955
A
711 if (U_FAILURE(*fStatus)) {
712 return;
713 }
714
b75a7d8f 715 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
374ca955
A
716 if (U_FAILURE(*fStatus)) {
717 return;
718 }
b75a7d8f
A
719
720 for (i=0; i<endMarkerNodes.size(); i++) {
721 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
722 for (n=0; n<fDStates->size(); n++) {
723 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
724 if (sd->fPositions->indexOf(endMarker) >= 0) {
725 // Any non-zero value for fAccepting means this is an accepting node.
726 // The value is what will be returned to the user as the break status.
727 // If no other value was specified, force it to -1.
73c04bcf
A
728
729 if (sd->fAccepting==0) {
730 // State hasn't been marked as accepting yet. Do it now.
731 sd->fAccepting = endMarker->fVal;
732 if (sd->fAccepting == 0) {
733 sd->fAccepting = -1;
734 }
735 }
736 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
737 // Both lookahead and non-lookahead accepting for this state.
738 // Favor the look-ahead. Expedient for line break.
739 // TODO: need a more elegant resolution for conflicting rules.
740 sd->fAccepting = endMarker->fVal;
b75a7d8f 741 }
73c04bcf
A
742 // implicit else:
743 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
b75a7d8f
A
744
745 // If the end marker node is from a look-ahead rule, set
746 // the fLookAhead field or this state also.
747 if (endMarker->fLookAheadEnd) {
73c04bcf
A
748 // TODO: don't change value if already set?
749 // TODO: allow for more than one active look-ahead rule in engine.
750 // Make value here an index to a side array in engine?
b75a7d8f
A
751 sd->fLookAhead = sd->fAccepting;
752 }
753 }
754 }
755 }
756}
757
758
759//-----------------------------------------------------------------------------
760//
761// flagLookAheadStates Very similar to flagAcceptingStates, above.
762//
763//-----------------------------------------------------------------------------
764void RBBITableBuilder::flagLookAheadStates() {
374ca955
A
765 if (U_FAILURE(*fStatus)) {
766 return;
767 }
b75a7d8f
A
768 UVector lookAheadNodes(*fStatus);
769 RBBINode *lookAheadNode;
770 int32_t i;
771 int32_t n;
772
773 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
374ca955
A
774 if (U_FAILURE(*fStatus)) {
775 return;
776 }
b75a7d8f
A
777 for (i=0; i<lookAheadNodes.size(); i++) {
778 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
779
780 for (n=0; n<fDStates->size(); n++) {
781 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
782 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
783 sd->fLookAhead = lookAheadNode->fVal;
784 }
785 }
786 }
787}
788
789
790
791
792//-----------------------------------------------------------------------------
793//
794// flagTaggedStates
795//
796//-----------------------------------------------------------------------------
797void RBBITableBuilder::flagTaggedStates() {
374ca955
A
798 if (U_FAILURE(*fStatus)) {
799 return;
800 }
b75a7d8f
A
801 UVector tagNodes(*fStatus);
802 RBBINode *tagNode;
803 int32_t i;
804 int32_t n;
805
374ca955
A
806 if (U_FAILURE(*fStatus)) {
807 return;
808 }
b75a7d8f 809 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
374ca955
A
810 if (U_FAILURE(*fStatus)) {
811 return;
812 }
b75a7d8f
A
813 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
814 tagNode = (RBBINode *)tagNodes.elementAt(i);
73c04bcf 815
b75a7d8f
A
816 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
817 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
818 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
374ca955 819 sortedAdd(&sd->fTagVals, tagNode->fVal);
b75a7d8f
A
820 }
821 }
822 }
823}
374ca955
A
824
825
826
827
828//-----------------------------------------------------------------------------
829//
830// mergeRuleStatusVals
831//
832// Update the global table of rule status {tag} values
833// The rule builder has a global vector of status values that are common
834// for all tables. Merge the ones from this table into the global set.
835//
836//-----------------------------------------------------------------------------
837void RBBITableBuilder::mergeRuleStatusVals() {
838 //
839 // The basic outline of what happens here is this...
840 //
841 // for each state in this state table
842 // if the status tag list for this state is in the global statuses list
843 // record where and
844 // continue with the next state
845 // else
846 // add the tag list for this state to the global list.
847 //
848 int i;
849 int n;
850
851 // Pre-set a single tag of {0} into the table.
852 // We will need this as a default, for rule sets with no explicit tagging.
853 if (fRB->fRuleStatusVals->size() == 0) {
854 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
855 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
856 }
73c04bcf
A
857
858 // For each state
859 for (n=0; n<fDStates->size(); n++) {
374ca955
A
860 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
861 UVector *thisStatesTagValues = sd->fTagVals;
862 if (thisStatesTagValues == NULL) {
863 // No tag values are explicitly associated with this state.
864 // Set the default tag value.
865 sd->fTagsIdx = 0;
866 continue;
867 }
868
869 // There are tag(s) associated with this state.
870 // fTagsIdx will be the index into the global tag list for this state's tag values.
871 // Initial value of -1 flags that we haven't got it set yet.
872 sd->fTagsIdx = -1;
873 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
874 int32_t nextTagGroupStart = 0;
73c04bcf 875
374ca955
A
876 // Loop runs once per group of tags in the global list
877 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
878 thisTagGroupStart = nextTagGroupStart;
879 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
880 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
881 // The number of tags for this state is different from
882 // the number of tags in this group from the global list.
883 // Continue with the next group from the global list.
884 continue;
885 }
886 // The lengths match, go ahead and compare the actual tag values
887 // between this state and the group from the global list.
888 for (i=0; i<thisStatesTagValues->size(); i++) {
73c04bcf 889 if (thisStatesTagValues->elementAti(i) !=
374ca955 890 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
73c04bcf 891 // Mismatch.
374ca955
A
892 break;
893 }
894 }
73c04bcf 895
374ca955
A
896 if (i == thisStatesTagValues->size()) {
897 // We found a set of tag values in the global list that match
898 // those for this state. Use them.
899 sd->fTagsIdx = thisTagGroupStart;
73c04bcf 900 break;
374ca955
A
901 }
902 }
73c04bcf 903
374ca955
A
904 if (sd->fTagsIdx == -1) {
905 // No suitable entry in the global tag list already. Add one
906 sd->fTagsIdx = fRB->fRuleStatusVals->size();
907 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
908 for (i=0; i<thisStatesTagValues->size(); i++) {
909 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
910 }
911 }
912 }
913}
914
915
916
917
918
919
920
921//-----------------------------------------------------------------------------
922//
923// sortedAdd Add a value to a vector of sorted values (ints).
924// Do not replicate entries; if the value is already there, do not
925// add a second one.
926// Lazily create the vector if it does not already exist.
927//
928//-----------------------------------------------------------------------------
929void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
930 int32_t i;
931
932 if (*vector == NULL) {
933 *vector = new UVector(*fStatus);
934 }
935 if (*vector == NULL || U_FAILURE(*fStatus)) {
936 return;
937 }
938 UVector *vec = *vector;
939 int32_t vSize = vec->size();
940 for (i=0; i<vSize; i++) {
941 int32_t valAtI = vec->elementAti(i);
942 if (valAtI == val) {
943 // The value is already in the vector. Don't add it again.
944 return;
945 }
946 if (valAtI > val) {
947 break;
948 }
949 }
950 vec->insertElementAt(val, i, *fStatus);
b75a7d8f
A
951}
952
953
954
955//-----------------------------------------------------------------------------
956//
957// setAdd Set operation on UVector
958// dest = dest union source
73c04bcf 959// Elements may only appear once and must be sorted.
b75a7d8f
A
960//
961//-----------------------------------------------------------------------------
962void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
46f4442e
A
963 int32_t destOriginalSize = dest->size();
964 int32_t sourceSize = source->size();
73c04bcf 965 int32_t di = 0;
729e4ab9
A
966 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
967 void **destPtr, **sourcePtr;
73c04bcf 968 void **destLim, **sourceLim;
b75a7d8f 969
729e4ab9
A
970 if (destOriginalSize > destArray.getCapacity()) {
971 if (destArray.resize(destOriginalSize) == NULL) {
972 return;
973 }
73c04bcf 974 }
729e4ab9
A
975 destPtr = destArray.getAlias();
976 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
73c04bcf 977
729e4ab9
A
978 if (sourceSize > sourceArray.getCapacity()) {
979 if (sourceArray.resize(sourceSize) == NULL) {
980 return;
73c04bcf 981 }
73c04bcf 982 }
729e4ab9
A
983 sourcePtr = sourceArray.getAlias();
984 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
73c04bcf
A
985
986 // Avoid multiple "get element" calls by getting the contents into arrays
729e4ab9
A
987 (void) dest->toArray(destPtr);
988 (void) source->toArray(sourcePtr);
73c04bcf 989
46f4442e 990 dest->setSize(sourceSize+destOriginalSize, *fStatus);
73c04bcf 991
729e4ab9
A
992 while (sourcePtr < sourceLim && destPtr < destLim) {
993 if (*destPtr == *sourcePtr) {
994 dest->setElementAt(*sourcePtr++, di++);
995 destPtr++;
73c04bcf 996 }
46f4442e
A
997 // This check is required for machines with segmented memory, like i5/OS.
998 // Direct pointer comparison is not recommended.
729e4ab9
A
999 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1000 dest->setElementAt(*destPtr++, di++);
46f4442e 1001 }
729e4ab9
A
1002 else { /* *sourcePtr < *destPtr */
1003 dest->setElementAt(*sourcePtr++, di++);
b75a7d8f 1004 }
73c04bcf
A
1005 }
1006
1007 // At most one of these two cleanup loops will execute
729e4ab9
A
1008 while (destPtr < destLim) {
1009 dest->setElementAt(*destPtr++, di++);
73c04bcf 1010 }
729e4ab9
A
1011 while (sourcePtr < sourceLim) {
1012 dest->setElementAt(*sourcePtr++, di++);
73c04bcf
A
1013 }
1014
46f4442e 1015 dest->setSize(di, *fStatus);
b75a7d8f
A
1016}
1017
1018
374ca955 1019
b75a7d8f
A
1020//-----------------------------------------------------------------------------
1021//
//   setEquals    Set operation on UVector.
1023// Compare for equality.
73c04bcf 1024// Elements must be sorted.
b75a7d8f
A
1025//
1026//-----------------------------------------------------------------------------
1027UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
73c04bcf 1028 return a->equals(*b);
b75a7d8f
A
1029}
1030
1031
1032//-----------------------------------------------------------------------------
1033//
1034// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1035// for each node in the tree.
1036//
1037//-----------------------------------------------------------------------------
b75a7d8f 1038#ifdef RBBI_DEBUG
374ca955 1039void RBBITableBuilder::printPosSets(RBBINode *n) {
b75a7d8f
A
1040 if (n==NULL) {
1041 return;
1042 }
374ca955 1043 n->printNode();
b75a7d8f
A
1044 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1045
1046 RBBIDebugPrintf(" firstpos: ");
1047 printSet(n->fFirstPosSet);
1048
1049 RBBIDebugPrintf(" lastpos: ");
1050 printSet(n->fLastPosSet);
1051
1052 RBBIDebugPrintf(" followpos: ");
1053 printSet(n->fFollowPos);
1054
1055 printPosSets(n->fLeftChild);
1056 printPosSets(n->fRightChild);
b75a7d8f 1057}
374ca955 1058#endif
b75a7d8f
A
1059
1060
1061
1062//-----------------------------------------------------------------------------
1063//
1064// getTableSize() Calculate the size of the runtime form of this
1065// state transition table.
1066//
1067//-----------------------------------------------------------------------------
374ca955 1068int32_t RBBITableBuilder::getTableSize() const {
b75a7d8f
A
1069 int32_t size = 0;
1070 int32_t numRows;
1071 int32_t numCols;
1072 int32_t rowSize;
1073
1074 if (fTree == NULL) {
1075 return 0;
1076 }
1077
1078 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
1079
1080 numRows = fDStates->size();
1081 numCols = fRB->fSetBuilder->getNumCharCategories();
1082
1083 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1084 // Therefore we subtract two from numCols when determining
1085 // how much storage to add to a row for the total columns.
1086 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1087 size += numRows * rowSize;
1088 return size;
1089}
1090
1091
1092
1093//-----------------------------------------------------------------------------
1094//
1095// exportTable() export the state transition table in the format required
1096// by the runtime engine. getTableSize() bytes of memory
1097// must be available at the output address "where".
1098//
1099//-----------------------------------------------------------------------------
1100void RBBITableBuilder::exportTable(void *where) {
1101 RBBIStateTable *table = (RBBIStateTable *)where;
1102 uint32_t state;
1103 int col;
1104
1105 if (U_FAILURE(*fStatus) || fTree == NULL) {
1106 return;
1107 }
1108
1109 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1110 fDStates->size() > 0x7fff) {
1111 *fStatus = U_BRK_INTERNAL_ERROR;
1112 return;
1113 }
1114
1115 table->fRowLen = sizeof(RBBIStateTableRow) +
1116 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1117 table->fNumStates = fDStates->size();
374ca955
A
1118 table->fFlags = 0;
1119 if (fRB->fLookAheadHardBreak) {
1120 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1121 }
73c04bcf
A
1122 if (fRB->fSetBuilder->sawBOF()) {
1123 table->fFlags |= RBBI_BOF_REQUIRED;
1124 }
374ca955 1125 table->fReserved = 0;
b75a7d8f
A
1126
1127 for (state=0; state<table->fNumStates; state++) {
1128 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1129 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1130 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1131 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1132 row->fAccepting = (int16_t)sd->fAccepting;
1133 row->fLookAhead = (int16_t)sd->fLookAhead;
374ca955 1134 row->fTagIdx = (int16_t)sd->fTagsIdx;
b75a7d8f
A
1135 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1136 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1137 }
1138 }
1139}
1140
1141
1142
1143//-----------------------------------------------------------------------------
1144//
1145// printSet Debug function. Print the contents of a UVector
1146//
1147//-----------------------------------------------------------------------------
b75a7d8f 1148#ifdef RBBI_DEBUG
374ca955 1149void RBBITableBuilder::printSet(UVector *s) {
b75a7d8f
A
1150 int32_t i;
1151 for (i=0; i<s->size(); i++) {
1152 void *v = s->elementAt(i);
1153 RBBIDebugPrintf("%10p", v);
1154 }
1155 RBBIDebugPrintf("\n");
b75a7d8f 1156}
374ca955 1157#endif
b75a7d8f
A
1158
1159
1160//-----------------------------------------------------------------------------
1161//
1162// printStates Debug Function. Dump the fully constructed state transition table.
1163//
1164//-----------------------------------------------------------------------------
b75a7d8f 1165#ifdef RBBI_DEBUG
374ca955 1166void RBBITableBuilder::printStates() {
b75a7d8f
A
1167 int c; // input "character"
1168 int n; // state number
1169
1170 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1171 RBBIDebugPrintf(" | Acc LA Tag");
374ca955
A
1172 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1173 RBBIDebugPrintf(" %2d", c);
1174 }
b75a7d8f
A
1175 RBBIDebugPrintf("\n");
1176 RBBIDebugPrintf(" |---------------");
374ca955
A
1177 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1178 RBBIDebugPrintf("---");
1179 }
b75a7d8f
A
1180 RBBIDebugPrintf("\n");
1181
1182 for (n=0; n<fDStates->size(); n++) {
1183 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1184 RBBIDebugPrintf(" %3d | " , n);
374ca955 1185 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
b75a7d8f
A
1186 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1187 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1188 }
1189 RBBIDebugPrintf("\n");
1190 }
1191 RBBIDebugPrintf("\n\n");
b75a7d8f 1192}
374ca955 1193#endif
b75a7d8f
A
1194
1195
1196
374ca955
A
1197//-----------------------------------------------------------------------------
1198//
1199// printRuleStatusTable Debug Function. Dump the common rule status table
1200//
1201//-----------------------------------------------------------------------------
1202#ifdef RBBI_DEBUG
1203void RBBITableBuilder::printRuleStatusTable() {
1204 int32_t thisRecord = 0;
1205 int32_t nextRecord = 0;
1206 int i;
1207 UVector *tbl = fRB->fRuleStatusVals;
1208
1209 RBBIDebugPrintf("index | tags \n");
1210 RBBIDebugPrintf("-------------------\n");
73c04bcf 1211
374ca955
A
1212 while (nextRecord < tbl->size()) {
1213 thisRecord = nextRecord;
1214 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1215 RBBIDebugPrintf("%4d ", thisRecord);
1216 for (i=thisRecord+1; i<nextRecord; i++) {
1217 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1218 }
1219 RBBIDebugPrintf("\n");
1220 }
1221 RBBIDebugPrintf("\n\n");
1222}
1223#endif
b75a7d8f
A
1224
1225
1226//-----------------------------------------------------------------------------
1227//
1228// RBBIStateDescriptor Methods. This is a very struct-like class
1229// Most access is directly to the fields.
1230//
1231//-----------------------------------------------------------------------------
1232
1233RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1234 fMarked = FALSE;
1235 fAccepting = 0;
1236 fLookAhead = 0;
374ca955
A
1237 fTagsIdx = 0;
1238 fTagVals = NULL;
b75a7d8f
A
1239 fPositions = NULL;
1240 fDtran = NULL;
73c04bcf 1241
374ca955 1242 fDtran = new UVector(lastInputSymbol+1, *fStatus);
b75a7d8f
A
1243 if (U_FAILURE(*fStatus)) {
1244 return;
1245 }
b75a7d8f
A
1246 if (fDtran == NULL) {
1247 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1248 return;
1249 }
46f4442e 1250 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
b75a7d8f
A
1251 // It is indexed by input symbols, and will
1252 // hold the next state number for each
1253 // symbol.
1254}
1255
1256
1257RBBIStateDescriptor::~RBBIStateDescriptor() {
1258 delete fPositions;
1259 delete fDtran;
374ca955 1260 delete fTagVals;
b75a7d8f
A
1261 fPositions = NULL;
1262 fDtran = NULL;
374ca955 1263 fTagVals = NULL;
b75a7d8f
A
1264}
1265
1266U_NAMESPACE_END
1267
1268#endif /* #if !UCONFIG_NO_BREAK_ITERATION */