]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbitblb.cpp
ICU-57163.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2002-2016, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7 //
8 // rbbitblb.cpp
9 //
10
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/unistr.h"
17 #include "rbbitblb.h"
18 #include "rbbirb.h"
19 #include "rbbisetb.h"
20 #include "rbbidata.h"
21 #include "cstring.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24
25 U_NAMESPACE_BEGIN
26
27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
28 fTree(*rootNode) {
29 fRB = rb;
30 fStatus = fRB->fStatus;
31 UErrorCode status = U_ZERO_ERROR;
32 fDStates = new UVector(status);
33 if (U_FAILURE(*fStatus)) {
34 return;
35 }
36 if (U_FAILURE(status)) {
37 *fStatus = status;
38 return;
39 }
40 if (fDStates == NULL) {
41 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
42 }
43 }
44
45
46
47 RBBITableBuilder::~RBBITableBuilder() {
48 int i;
49 for (i=0; i<fDStates->size(); i++) {
50 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
51 }
52 delete fDStates;
53 }
54
55
56 //-----------------------------------------------------------------------------
57 //
58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
59 // table from the RBBI rules parse tree.
60 //
61 //-----------------------------------------------------------------------------
62 void RBBITableBuilder::build() {
63
64 if (U_FAILURE(*fStatus)) {
65 return;
66 }
67
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
70 if (fTree==NULL) {
71 return;
72 }
73
74 //
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
77 //
78 fTree = fTree->flattenVariables();
79 #ifdef RBBI_DEBUG
80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
81 RBBIDebugPuts("\nParse tree after flattening variable references.");
82 fTree->printTree(TRUE);
83 }
84 #endif
85
86 //
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
91 //
92 if (fRB->fSetBuilder->sawBOF()) {
93 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
95 // Delete and exit if memory allocation failed.
96 if (bofTop == NULL || bofLeaf == NULL) {
97 *fStatus = U_MEMORY_ALLOCATION_ERROR;
98 delete bofTop;
99 delete bofLeaf;
100 return;
101 }
102 bofTop->fLeftChild = bofLeaf;
103 bofTop->fRightChild = fTree;
104 bofLeaf->fParent = bofTop;
105 bofLeaf->fVal = 2; // Reserved value for {bof}.
106 fTree = bofTop;
107 }
108
109 //
110 // Add a unique right-end marker to the expression.
111 // Appears as a cat-node, left child being the original tree,
112 // right child being the end marker.
113 //
114 RBBINode *cn = new RBBINode(RBBINode::opCat);
115 // Exit if memory allocation failed.
116 if (cn == NULL) {
117 *fStatus = U_MEMORY_ALLOCATION_ERROR;
118 return;
119 }
120 cn->fLeftChild = fTree;
121 fTree->fParent = cn;
122 cn->fRightChild = new RBBINode(RBBINode::endMark);
123 // Delete and exit if memory allocation failed.
124 if (cn->fRightChild == NULL) {
125 *fStatus = U_MEMORY_ALLOCATION_ERROR;
126 delete cn;
127 return;
128 }
129 cn->fRightChild->fParent = cn;
130 fTree = cn;
131
132 //
133 // Replace all references to UnicodeSets with the tree for the equivalent
134 // expression.
135 //
136 fTree->flattenSets();
137 #ifdef RBBI_DEBUG
138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
139 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
140 fTree->printTree(TRUE);
141 }
142 #endif
143
144
145 //
146 // calculate the functions nullable, firstpos, lastpos and followpos on
147 // nodes in the parse tree.
148 // See the alogrithm description in Aho.
149 // Understanding how this works by looking at the code alone will be
150 // nearly impossible.
151 //
152 calcNullable(fTree);
153 calcFirstPos(fTree);
154 calcLastPos(fTree);
155 calcFollowPos(fTree);
156 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
157 RBBIDebugPuts("\n");
158 printPosSets(fTree);
159 }
160
161 //
162 // For "chained" rules, modify the followPos sets
163 //
164 if (fRB->fChainRules) {
165 calcChainedFollowPos(fTree);
166 }
167
168 //
169 // BOF (start of input) test fixup.
170 //
171 if (fRB->fSetBuilder->sawBOF()) {
172 bofFixup();
173 }
174
175 //
176 // Build the DFA state transition tables.
177 //
178 buildStateTable();
179 flagAcceptingStates();
180 flagLookAheadStates();
181 flagTaggedStates();
182
183 //
184 // Update the global table of rule status {tag} values
185 // The rule builder has a global vector of status values that are common
186 // for all tables. Merge the ones from this table into the global set.
187 //
188 mergeRuleStatusVals();
189
190 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
191 }
192
193
194
195 //-----------------------------------------------------------------------------
196 //
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198 //
199 //-----------------------------------------------------------------------------
200 void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237 }
238
239
240
241
242 //-----------------------------------------------------------------------------
243 //
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245 //
246 //-----------------------------------------------------------------------------
247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284 }
285
286
287
288 //-----------------------------------------------------------------------------
289 //
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291 //
292 //-----------------------------------------------------------------------------
293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330 }
331
332
333
334 //-----------------------------------------------------------------------------
335 //
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337 //
338 //-----------------------------------------------------------------------------
339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376 }
377
378 //-----------------------------------------------------------------------------
379 //
380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381 // as roots of a rule to a destination vector.
382 //
383 //-----------------------------------------------------------------------------
384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385 if (node == NULL || U_FAILURE(*fStatus)) {
386 return;
387 }
388 if (node->fRuleRoot) {
389 dest->addElement(node, *fStatus);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
392 return;
393 }
394 addRuleRootNodes(dest, node->fLeftChild);
395 addRuleRootNodes(dest, node->fRightChild);
396 }
397
398 //-----------------------------------------------------------------------------
399 //
400 // calcChainedFollowPos. Modify the previously calculated followPos sets
401 // to implement rule chaining. NOT described by Aho
402 //
403 //-----------------------------------------------------------------------------
404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405
406 UVector endMarkerNodes(*fStatus);
407 UVector leafNodes(*fStatus);
408 int32_t i;
409
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list of all endmarker nodes.
415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416
417 // get a list all leaf nodes
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
429
430 UVector matchStartNodes(*fStatus);
431 for (int i=0; i<ruleRootNodes.size(); ++i) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
440
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 RBBINode *endNode = NULL;
447
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i=0; i<endMarkerNodes.size(); i++) {
451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452 endNode = tNode;
453 break;
454 }
455 }
456 if (endNode == NULL) {
457 // node wasn't an end node. Try again with the next.
458 continue;
459 }
460
461 // We've got a node that can end a match.
462
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
467 if (fRB->fLBCMNoChain || fRB->fRINoChain) {
468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
469 if (c != -1) {
470 // c == -1 occurs with sets containing only the {eof} marker string.
471 if (fRB->fLBCMNoChain) {
472 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
473 if (cLBProp == U_LB_COMBINING_MARK) {
474 continue;
475 }
476 }
477 if (fRB->fRINoChain) {
478 UGraphemeClusterBreak cGBProp = (UGraphemeClusterBreak)u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
479 if (cGBProp == U_GCB_REGIONAL_INDICATOR) {
480 continue;
481 }
482 }
483 }
484 }
485
486
487 // Now iterate over the nodes that can start a match, looking for ones
488 // with the same char class as our ending node.
489 RBBINode *startNode;
490 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
491 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
492 if (startNode->fType != RBBINode::leafChar) {
493 continue;
494 }
495
496 if (endNode->fVal == startNode->fVal) {
497 // The end val (character class) of one possible match is the
498 // same as the start of another.
499
500 // Add all nodes from the followPos of the start node to the
501 // followPos set of the end node, which will have the effect of
502 // letting matches transition from a match state at endNode
503 // to the second char of a match starting with startNode.
504 setAdd(endNode->fFollowPos, startNode->fFollowPos);
505 }
506 }
507 }
508 }
509
510
511 //-----------------------------------------------------------------------------
512 //
513 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
514 // Do an swizzle similar to chaining, modifying the followPos set of
515 // the bofNode to include the followPos nodes from other {bot} nodes
516 // scattered through the tree.
517 //
518 // This function has much in common with calcChainedFollowPos().
519 //
520 //-----------------------------------------------------------------------------
521 void RBBITableBuilder::bofFixup() {
522
523 if (U_FAILURE(*fStatus)) {
524 return;
525 }
526
527 // The parse tree looks like this ...
528 // fTree root ---> <cat>
529 // / \ .
530 // <cat> <#end node>
531 // / \ .
532 // <bofNode> rest
533 // of tree
534 //
535 // We will be adding things to the followPos set of the <bofNode>
536 //
537 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
538 U_ASSERT(bofNode->fType == RBBINode::leafChar);
539 U_ASSERT(bofNode->fVal == 2);
540
541 // Get all nodes that can be the start a match of the user-written rules
542 // (excluding the fake bofNode)
543 // We want the nodes that can start a match in the
544 // part labeled "rest of tree"
545 //
546 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
547
548 RBBINode *startNode;
549 int startNodeIx;
550 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
551 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
552 if (startNode->fType != RBBINode::leafChar) {
553 continue;
554 }
555
556 if (startNode->fVal == bofNode->fVal) {
557 // We found a leaf node corresponding to a {bof} that was
558 // explicitly written into a rule.
559 // Add everything from the followPos set of this node to the
560 // followPos set of the fake bofNode at the start of the tree.
561 //
562 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
563 }
564 }
565 }
566
567 //-----------------------------------------------------------------------------
568 //
569 // buildStateTable() Determine the set of runtime DFA states and the
570 // transition tables for these states, by the algorithm
571 // of fig. 3.44 in Aho.
572 //
573 // Most of the comments are quotes of Aho's psuedo-code.
574 //
575 //-----------------------------------------------------------------------------
576 void RBBITableBuilder::buildStateTable() {
577 if (U_FAILURE(*fStatus)) {
578 return;
579 }
580 RBBIStateDescriptor *failState;
581 // Set it to NULL to avoid uninitialized warning
582 RBBIStateDescriptor *initialState = NULL;
583 //
584 // Add a dummy state 0 - the stop state. Not from Aho.
585 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
586 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
587 if (failState == NULL) {
588 *fStatus = U_MEMORY_ALLOCATION_ERROR;
589 goto ExitBuildSTdeleteall;
590 }
591 failState->fPositions = new UVector(*fStatus);
592 if (failState->fPositions == NULL) {
593 *fStatus = U_MEMORY_ALLOCATION_ERROR;
594 }
595 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
596 goto ExitBuildSTdeleteall;
597 }
598 fDStates->addElement(failState, *fStatus);
599 if (U_FAILURE(*fStatus)) {
600 goto ExitBuildSTdeleteall;
601 }
602
603 // initially, the only unmarked state in Dstates is firstpos(root),
604 // where toot is the root of the syntax tree for (r)#;
605 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
606 if (initialState == NULL) {
607 *fStatus = U_MEMORY_ALLOCATION_ERROR;
608 }
609 if (U_FAILURE(*fStatus)) {
610 goto ExitBuildSTdeleteall;
611 }
612 initialState->fPositions = new UVector(*fStatus);
613 if (initialState->fPositions == NULL) {
614 *fStatus = U_MEMORY_ALLOCATION_ERROR;
615 }
616 if (U_FAILURE(*fStatus)) {
617 goto ExitBuildSTdeleteall;
618 }
619 setAdd(initialState->fPositions, fTree->fFirstPosSet);
620 fDStates->addElement(initialState, *fStatus);
621 if (U_FAILURE(*fStatus)) {
622 goto ExitBuildSTdeleteall;
623 }
624
625 // while there is an unmarked state T in Dstates do begin
626 for (;;) {
627 RBBIStateDescriptor *T = NULL;
628 int32_t tx;
629 for (tx=1; tx<fDStates->size(); tx++) {
630 RBBIStateDescriptor *temp;
631 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
632 if (temp->fMarked == FALSE) {
633 T = temp;
634 break;
635 }
636 }
637 if (T == NULL) {
638 break;
639 }
640
641 // mark T;
642 T->fMarked = TRUE;
643
644 // for each input symbol a do begin
645 int32_t a;
646 for (a = 1; a<=lastInputSymbol; a++) {
647 // let U be the set of positions that are in followpos(p)
648 // for some position p in T
649 // such that the symbol at position p is a;
650 UVector *U = NULL;
651 RBBINode *p;
652 int32_t px;
653 for (px=0; px<T->fPositions->size(); px++) {
654 p = (RBBINode *)T->fPositions->elementAt(px);
655 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
656 if (U == NULL) {
657 U = new UVector(*fStatus);
658 if (U == NULL) {
659 *fStatus = U_MEMORY_ALLOCATION_ERROR;
660 goto ExitBuildSTdeleteall;
661 }
662 }
663 setAdd(U, p->fFollowPos);
664 }
665 }
666
667 // if U is not empty and not in DStates then
668 int32_t ux = 0;
669 UBool UinDstates = FALSE;
670 if (U != NULL) {
671 U_ASSERT(U->size() > 0);
672 int ix;
673 for (ix=0; ix<fDStates->size(); ix++) {
674 RBBIStateDescriptor *temp2;
675 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
676 if (setEquals(U, temp2->fPositions)) {
677 delete U;
678 U = temp2->fPositions;
679 ux = ix;
680 UinDstates = TRUE;
681 break;
682 }
683 }
684
685 // Add U as an unmarked state to Dstates
686 if (!UinDstates)
687 {
688 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
689 if (newState == NULL) {
690 *fStatus = U_MEMORY_ALLOCATION_ERROR;
691 }
692 if (U_FAILURE(*fStatus)) {
693 goto ExitBuildSTdeleteall;
694 }
695 newState->fPositions = U;
696 fDStates->addElement(newState, *fStatus);
697 if (U_FAILURE(*fStatus)) {
698 return;
699 }
700 ux = fDStates->size()-1;
701 }
702
703 // Dtran[T, a] := U;
704 T->fDtran->setElementAt(ux, a);
705 }
706 }
707 }
708 return;
709 // delete local pointers only if error occured.
710 ExitBuildSTdeleteall:
711 delete initialState;
712 delete failState;
713 }
714
715
716
717 //-----------------------------------------------------------------------------
718 //
719 // flagAcceptingStates Identify accepting states.
720 // First get a list of all of the end marker nodes.
721 // Then, for each state s,
722 // if s contains one of the end marker nodes in its list of tree positions then
723 // s is an accepting state.
724 //
725 //-----------------------------------------------------------------------------
726 void RBBITableBuilder::flagAcceptingStates() {
727 if (U_FAILURE(*fStatus)) {
728 return;
729 }
730 UVector endMarkerNodes(*fStatus);
731 RBBINode *endMarker;
732 int32_t i;
733 int32_t n;
734
735 if (U_FAILURE(*fStatus)) {
736 return;
737 }
738
739 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
740 if (U_FAILURE(*fStatus)) {
741 return;
742 }
743
744 for (i=0; i<endMarkerNodes.size(); i++) {
745 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
746 for (n=0; n<fDStates->size(); n++) {
747 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
748 if (sd->fPositions->indexOf(endMarker) >= 0) {
749 // Any non-zero value for fAccepting means this is an accepting node.
750 // The value is what will be returned to the user as the break status.
751 // If no other value was specified, force it to -1.
752
753 if (sd->fAccepting==0) {
754 // State hasn't been marked as accepting yet. Do it now.
755 sd->fAccepting = endMarker->fVal;
756 if (sd->fAccepting == 0) {
757 sd->fAccepting = -1;
758 }
759 }
760 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
761 // Both lookahead and non-lookahead accepting for this state.
762 // Favor the look-ahead. Expedient for line break.
763 // TODO: need a more elegant resolution for conflicting rules.
764 sd->fAccepting = endMarker->fVal;
765 }
766 // implicit else:
767 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
768
769 // If the end marker node is from a look-ahead rule, set
770 // the fLookAhead field or this state also.
771 if (endMarker->fLookAheadEnd) {
772 // TODO: don't change value if already set?
773 // TODO: allow for more than one active look-ahead rule in engine.
774 // Make value here an index to a side array in engine?
775 sd->fLookAhead = sd->fAccepting;
776 }
777 }
778 }
779 }
780 }
781
782
783 //-----------------------------------------------------------------------------
784 //
785 // flagLookAheadStates Very similar to flagAcceptingStates, above.
786 //
787 //-----------------------------------------------------------------------------
788 void RBBITableBuilder::flagLookAheadStates() {
789 if (U_FAILURE(*fStatus)) {
790 return;
791 }
792 UVector lookAheadNodes(*fStatus);
793 RBBINode *lookAheadNode;
794 int32_t i;
795 int32_t n;
796
797 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
798 if (U_FAILURE(*fStatus)) {
799 return;
800 }
801 for (i=0; i<lookAheadNodes.size(); i++) {
802 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
803
804 for (n=0; n<fDStates->size(); n++) {
805 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
806 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
807 sd->fLookAhead = lookAheadNode->fVal;
808 }
809 }
810 }
811 }
812
813
814
815
816 //-----------------------------------------------------------------------------
817 //
818 // flagTaggedStates
819 //
820 //-----------------------------------------------------------------------------
821 void RBBITableBuilder::flagTaggedStates() {
822 if (U_FAILURE(*fStatus)) {
823 return;
824 }
825 UVector tagNodes(*fStatus);
826 RBBINode *tagNode;
827 int32_t i;
828 int32_t n;
829
830 if (U_FAILURE(*fStatus)) {
831 return;
832 }
833 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
834 if (U_FAILURE(*fStatus)) {
835 return;
836 }
837 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
838 tagNode = (RBBINode *)tagNodes.elementAt(i);
839
840 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
841 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
842 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
843 sortedAdd(&sd->fTagVals, tagNode->fVal);
844 }
845 }
846 }
847 }
848
849
850
851
852 //-----------------------------------------------------------------------------
853 //
854 // mergeRuleStatusVals
855 //
856 // Update the global table of rule status {tag} values
857 // The rule builder has a global vector of status values that are common
858 // for all tables. Merge the ones from this table into the global set.
859 //
860 //-----------------------------------------------------------------------------
861 void RBBITableBuilder::mergeRuleStatusVals() {
862 //
863 // The basic outline of what happens here is this...
864 //
865 // for each state in this state table
866 // if the status tag list for this state is in the global statuses list
867 // record where and
868 // continue with the next state
869 // else
870 // add the tag list for this state to the global list.
871 //
872 int i;
873 int n;
874
875 // Pre-set a single tag of {0} into the table.
876 // We will need this as a default, for rule sets with no explicit tagging.
877 if (fRB->fRuleStatusVals->size() == 0) {
878 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
879 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
880 }
881
882 // For each state
883 for (n=0; n<fDStates->size(); n++) {
884 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
885 UVector *thisStatesTagValues = sd->fTagVals;
886 if (thisStatesTagValues == NULL) {
887 // No tag values are explicitly associated with this state.
888 // Set the default tag value.
889 sd->fTagsIdx = 0;
890 continue;
891 }
892
893 // There are tag(s) associated with this state.
894 // fTagsIdx will be the index into the global tag list for this state's tag values.
895 // Initial value of -1 flags that we haven't got it set yet.
896 sd->fTagsIdx = -1;
897 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
898 int32_t nextTagGroupStart = 0;
899
900 // Loop runs once per group of tags in the global list
901 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
902 thisTagGroupStart = nextTagGroupStart;
903 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
904 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
905 // The number of tags for this state is different from
906 // the number of tags in this group from the global list.
907 // Continue with the next group from the global list.
908 continue;
909 }
910 // The lengths match, go ahead and compare the actual tag values
911 // between this state and the group from the global list.
912 for (i=0; i<thisStatesTagValues->size(); i++) {
913 if (thisStatesTagValues->elementAti(i) !=
914 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
915 // Mismatch.
916 break;
917 }
918 }
919
920 if (i == thisStatesTagValues->size()) {
921 // We found a set of tag values in the global list that match
922 // those for this state. Use them.
923 sd->fTagsIdx = thisTagGroupStart;
924 break;
925 }
926 }
927
928 if (sd->fTagsIdx == -1) {
929 // No suitable entry in the global tag list already. Add one
930 sd->fTagsIdx = fRB->fRuleStatusVals->size();
931 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
932 for (i=0; i<thisStatesTagValues->size(); i++) {
933 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
934 }
935 }
936 }
937 }
938
939
940
941
942
943
944
945 //-----------------------------------------------------------------------------
946 //
947 // sortedAdd Add a value to a vector of sorted values (ints).
948 // Do not replicate entries; if the value is already there, do not
949 // add a second one.
950 // Lazily create the vector if it does not already exist.
951 //
952 //-----------------------------------------------------------------------------
953 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
954 int32_t i;
955
956 if (*vector == NULL) {
957 *vector = new UVector(*fStatus);
958 }
959 if (*vector == NULL || U_FAILURE(*fStatus)) {
960 return;
961 }
962 UVector *vec = *vector;
963 int32_t vSize = vec->size();
964 for (i=0; i<vSize; i++) {
965 int32_t valAtI = vec->elementAti(i);
966 if (valAtI == val) {
967 // The value is already in the vector. Don't add it again.
968 return;
969 }
970 if (valAtI > val) {
971 break;
972 }
973 }
974 vec->insertElementAt(val, i, *fStatus);
975 }
976
977
978
979 //-----------------------------------------------------------------------------
980 //
981 // setAdd Set operation on UVector
982 // dest = dest union source
983 // Elements may only appear once and must be sorted.
984 //
985 //-----------------------------------------------------------------------------
986 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
987 int32_t destOriginalSize = dest->size();
988 int32_t sourceSize = source->size();
989 int32_t di = 0;
990 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
991 void **destPtr, **sourcePtr;
992 void **destLim, **sourceLim;
993
994 if (destOriginalSize > destArray.getCapacity()) {
995 if (destArray.resize(destOriginalSize) == NULL) {
996 return;
997 }
998 }
999 destPtr = destArray.getAlias();
1000 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1001
1002 if (sourceSize > sourceArray.getCapacity()) {
1003 if (sourceArray.resize(sourceSize) == NULL) {
1004 return;
1005 }
1006 }
1007 sourcePtr = sourceArray.getAlias();
1008 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1009
1010 // Avoid multiple "get element" calls by getting the contents into arrays
1011 (void) dest->toArray(destPtr);
1012 (void) source->toArray(sourcePtr);
1013
1014 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1015
1016 while (sourcePtr < sourceLim && destPtr < destLim) {
1017 if (*destPtr == *sourcePtr) {
1018 dest->setElementAt(*sourcePtr++, di++);
1019 destPtr++;
1020 }
1021 // This check is required for machines with segmented memory, like i5/OS.
1022 // Direct pointer comparison is not recommended.
1023 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1024 dest->setElementAt(*destPtr++, di++);
1025 }
1026 else { /* *sourcePtr < *destPtr */
1027 dest->setElementAt(*sourcePtr++, di++);
1028 }
1029 }
1030
1031 // At most one of these two cleanup loops will execute
1032 while (destPtr < destLim) {
1033 dest->setElementAt(*destPtr++, di++);
1034 }
1035 while (sourcePtr < sourceLim) {
1036 dest->setElementAt(*sourcePtr++, di++);
1037 }
1038
1039 dest->setSize(di, *fStatus);
1040 }
1041
1042
1043
1044 //-----------------------------------------------------------------------------
1045 //
1046 // setEqual Set operation on UVector.
1047 // Compare for equality.
1048 // Elements must be sorted.
1049 //
1050 //-----------------------------------------------------------------------------
1051 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1052 return a->equals(*b);
1053 }
1054
1055
1056 //-----------------------------------------------------------------------------
1057 //
1058 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1059 // for each node in the tree.
1060 //
1061 //-----------------------------------------------------------------------------
1062 #ifdef RBBI_DEBUG
1063 void RBBITableBuilder::printPosSets(RBBINode *n) {
1064 if (n==NULL) {
1065 return;
1066 }
1067 printf("\n");
1068 RBBINode::printNodeHeader();
1069 n->printNode();
1070 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1071
1072 RBBIDebugPrintf(" firstpos: ");
1073 printSet(n->fFirstPosSet);
1074
1075 RBBIDebugPrintf(" lastpos: ");
1076 printSet(n->fLastPosSet);
1077
1078 RBBIDebugPrintf(" followpos: ");
1079 printSet(n->fFollowPos);
1080
1081 printPosSets(n->fLeftChild);
1082 printPosSets(n->fRightChild);
1083 }
1084 #endif
1085
1086
1087
1088 //-----------------------------------------------------------------------------
1089 //
1090 // getTableSize() Calculate the size of the runtime form of this
1091 // state transition table.
1092 //
1093 //-----------------------------------------------------------------------------
1094 int32_t RBBITableBuilder::getTableSize() const {
1095 int32_t size = 0;
1096 int32_t numRows;
1097 int32_t numCols;
1098 int32_t rowSize;
1099
1100 if (fTree == NULL) {
1101 return 0;
1102 }
1103
1104 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
1105
1106 numRows = fDStates->size();
1107 numCols = fRB->fSetBuilder->getNumCharCategories();
1108
1109 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1110 // Therefore we subtract two from numCols when determining
1111 // how much storage to add to a row for the total columns.
1112 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1113 size += numRows * rowSize;
1114 return size;
1115 }
1116
1117
1118
1119 //-----------------------------------------------------------------------------
1120 //
1121 // exportTable() export the state transition table in the format required
1122 // by the runtime engine. getTableSize() bytes of memory
1123 // must be available at the output address "where".
1124 //
1125 //-----------------------------------------------------------------------------
1126 void RBBITableBuilder::exportTable(void *where) {
1127 RBBIStateTable *table = (RBBIStateTable *)where;
1128 uint32_t state;
1129 int col;
1130
1131 if (U_FAILURE(*fStatus) || fTree == NULL) {
1132 return;
1133 }
1134
1135 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1136 fDStates->size() > 0x7fff) {
1137 *fStatus = U_BRK_INTERNAL_ERROR;
1138 return;
1139 }
1140
1141 table->fRowLen = sizeof(RBBIStateTableRow) +
1142 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1143 table->fNumStates = fDStates->size();
1144 table->fFlags = 0;
1145 if (fRB->fLookAheadHardBreak) {
1146 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1147 }
1148 if (fRB->fSetBuilder->sawBOF()) {
1149 table->fFlags |= RBBI_BOF_REQUIRED;
1150 }
1151 table->fReserved = 0;
1152
1153 for (state=0; state<table->fNumStates; state++) {
1154 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1155 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1156 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1157 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1158 row->fAccepting = (int16_t)sd->fAccepting;
1159 row->fLookAhead = (int16_t)sd->fLookAhead;
1160 row->fTagIdx = (int16_t)sd->fTagsIdx;
1161 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1162 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1163 }
1164 }
1165 }
1166
1167
1168
1169 //-----------------------------------------------------------------------------
1170 //
1171 // printSet Debug function. Print the contents of a UVector
1172 //
1173 //-----------------------------------------------------------------------------
1174 #ifdef RBBI_DEBUG
1175 void RBBITableBuilder::printSet(UVector *s) {
1176 int32_t i;
1177 for (i=0; i<s->size(); i++) {
1178 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1179 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1180 }
1181 RBBIDebugPrintf("\n");
1182 }
1183 #endif
1184
1185
1186 //-----------------------------------------------------------------------------
1187 //
1188 // printStates Debug Function. Dump the fully constructed state transition table.
1189 //
1190 //-----------------------------------------------------------------------------
1191 #ifdef RBBI_DEBUG
1192 void RBBITableBuilder::printStates() {
1193 int c; // input "character"
1194 int n; // state number
1195
1196 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1197 RBBIDebugPrintf(" | Acc LA Tag");
1198 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1199 RBBIDebugPrintf(" %2d", c);
1200 }
1201 RBBIDebugPrintf("\n");
1202 RBBIDebugPrintf(" |---------------");
1203 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1204 RBBIDebugPrintf("---");
1205 }
1206 RBBIDebugPrintf("\n");
1207
1208 for (n=0; n<fDStates->size(); n++) {
1209 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1210 RBBIDebugPrintf(" %3d | " , n);
1211 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1212 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1213 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1214 }
1215 RBBIDebugPrintf("\n");
1216 }
1217 RBBIDebugPrintf("\n\n");
1218 }
1219 #endif
1220
1221
1222
1223 //-----------------------------------------------------------------------------
1224 //
1225 // printRuleStatusTable Debug Function. Dump the common rule status table
1226 //
1227 //-----------------------------------------------------------------------------
1228 #ifdef RBBI_DEBUG
1229 void RBBITableBuilder::printRuleStatusTable() {
1230 int32_t thisRecord = 0;
1231 int32_t nextRecord = 0;
1232 int i;
1233 UVector *tbl = fRB->fRuleStatusVals;
1234
1235 RBBIDebugPrintf("index | tags \n");
1236 RBBIDebugPrintf("-------------------\n");
1237
1238 while (nextRecord < tbl->size()) {
1239 thisRecord = nextRecord;
1240 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1241 RBBIDebugPrintf("%4d ", thisRecord);
1242 for (i=thisRecord+1; i<nextRecord; i++) {
1243 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1244 }
1245 RBBIDebugPrintf("\n");
1246 }
1247 RBBIDebugPrintf("\n\n");
1248 }
1249 #endif
1250
1251
1252 //-----------------------------------------------------------------------------
1253 //
1254 // RBBIStateDescriptor Methods. This is a very struct-like class
1255 // Most access is directly to the fields.
1256 //
1257 //-----------------------------------------------------------------------------
1258
1259 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1260 fMarked = FALSE;
1261 fAccepting = 0;
1262 fLookAhead = 0;
1263 fTagsIdx = 0;
1264 fTagVals = NULL;
1265 fPositions = NULL;
1266 fDtran = NULL;
1267
1268 fDtran = new UVector(lastInputSymbol+1, *fStatus);
1269 if (U_FAILURE(*fStatus)) {
1270 return;
1271 }
1272 if (fDtran == NULL) {
1273 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1274 return;
1275 }
1276 fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
1277 // It is indexed by input symbols, and will
1278 // hold the next state number for each
1279 // symbol.
1280 }
1281
1282
1283 RBBIStateDescriptor::~RBBIStateDescriptor() {
1284 delete fPositions;
1285 delete fDtran;
1286 delete fTagVals;
1287 fPositions = NULL;
1288 fDtran = NULL;
1289 fTagVals = NULL;
1290 }
1291
1292 U_NAMESPACE_END
1293
1294 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */