]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbitblb.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
1 /*
2 **********************************************************************
3 * Copyright (c) 2002-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7 //
8 // rbbitblb.cpp
9 //
10
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_BREAK_ITERATION
15
16 #include "unicode/unistr.h"
17 #include "rbbitblb.h"
18 #include "rbbirb.h"
19 #include "rbbisetb.h"
20 #include "rbbidata.h"
21 #include "cstring.h"
22 #include "uassert.h"
23 #include "cmemory.h"
24
25 U_NAMESPACE_BEGIN
26
27 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
28 fTree(*rootNode) {
29 fRB = rb;
30 fStatus = fRB->fStatus;
31 UErrorCode status = U_ZERO_ERROR;
32 fDStates = new UVector(status);
33 if (U_FAILURE(*fStatus)) {
34 return;
35 }
36 if (U_FAILURE(status)) {
37 *fStatus = status;
38 return;
39 }
40 if (fDStates == NULL) {
41 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
42 }
43 }
44
45
46
47 RBBITableBuilder::~RBBITableBuilder() {
48 int i;
49 for (i=0; i<fDStates->size(); i++) {
50 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
51 }
52 delete fDStates;
53 }
54
55
56 //-----------------------------------------------------------------------------
57 //
58 // RBBITableBuilder::build - This is the main function for building the DFA state transtion
59 // table from the RBBI rules parse tree.
60 //
61 //-----------------------------------------------------------------------------
62 void RBBITableBuilder::build() {
63
64 if (U_FAILURE(*fStatus)) {
65 return;
66 }
67
68 // If there were no rules, just return. This situation can easily arise
69 // for the reverse rules.
70 if (fTree==NULL) {
71 return;
72 }
73
74 //
75 // Walk through the tree, replacing any references to $variables with a copy of the
76 // parse tree for the substition expression.
77 //
78 fTree = fTree->flattenVariables();
79 #ifdef RBBI_DEBUG
80 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
81 RBBIDebugPuts("Parse tree after flattening variable references.");
82 fTree->printTree(TRUE);
83 }
84 #endif
85
86 //
87 // If the rules contained any references to {bof}
88 // add a {bof} <cat> <former root of tree> to the
89 // tree. Means that all matches must start out with the
90 // {bof} fake character.
91 //
92 if (fRB->fSetBuilder->sawBOF()) {
93 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
94 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
95 bofTop->fLeftChild = bofLeaf;
96 bofTop->fRightChild = fTree;
97 bofLeaf->fParent = bofTop;
98 bofLeaf->fVal = 2; // Reserved value for {bof}.
99 fTree = bofTop;
100 }
101
102 //
103 // Add a unique right-end marker to the expression.
104 // Appears as a cat-node, left child being the original tree,
105 // right child being the end marker.
106 //
107 RBBINode *cn = new RBBINode(RBBINode::opCat);
108 cn->fLeftChild = fTree;
109 fTree->fParent = cn;
110 cn->fRightChild = new RBBINode(RBBINode::endMark);
111 cn->fRightChild->fParent = cn;
112 fTree = cn;
113
114 //
115 // Replace all references to UnicodeSets with the tree for the equivalent
116 // expression.
117 //
118 fTree->flattenSets();
119 #ifdef RBBI_DEBUG
120 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
121 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
122 fTree->printTree(TRUE);
123 }
124 #endif
125
126
127 //
128 // calculate the functions nullable, firstpos, lastpos and followpos on
129 // nodes in the parse tree.
130 // See the alogrithm description in Aho.
131 // Understanding how this works by looking at the code alone will be
132 // nearly impossible.
133 //
134 calcNullable(fTree);
135 calcFirstPos(fTree);
136 calcLastPos(fTree);
137 calcFollowPos(fTree);
138 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
139 RBBIDebugPuts("\n");
140 printPosSets(fTree);
141 }
142
143 //
144 // For "chained" rules, modify the followPos sets
145 //
146 if (fRB->fChainRules) {
147 calcChainedFollowPos(fTree);
148 }
149
150 //
151 // BOF (start of input) test fixup.
152 //
153 if (fRB->fSetBuilder->sawBOF()) {
154 bofFixup();
155 }
156
157 //
158 // Build the DFA state transition tables.
159 //
160 buildStateTable();
161 flagAcceptingStates();
162 flagLookAheadStates();
163 flagTaggedStates();
164
165 //
166 // Update the global table of rule status {tag} values
167 // The rule builder has a global vector of status values that are common
168 // for all tables. Merge the ones from this table into the global set.
169 //
170 mergeRuleStatusVals();
171
172 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
173 }
174
175
176
177 //-----------------------------------------------------------------------------
178 //
179 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
180 //
181 //-----------------------------------------------------------------------------
182 void RBBITableBuilder::calcNullable(RBBINode *n) {
183 if (n == NULL) {
184 return;
185 }
186 if (n->fType == RBBINode::setRef ||
187 n->fType == RBBINode::endMark ) {
188 // These are non-empty leaf node types.
189 n->fNullable = FALSE;
190 return;
191 }
192
193 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
194 // Lookahead marker node. It's a leaf, so no recursion on children.
195 // It's nullable because it does not match any literal text from the input stream.
196 n->fNullable = TRUE;
197 return;
198 }
199
200
201 // The node is not a leaf.
202 // Calculate nullable on its children.
203 calcNullable(n->fLeftChild);
204 calcNullable(n->fRightChild);
205
206 // Apply functions from table 3.40 in Aho
207 if (n->fType == RBBINode::opOr) {
208 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
209 }
210 else if (n->fType == RBBINode::opCat) {
211 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
212 }
213 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
214 n->fNullable = TRUE;
215 }
216 else {
217 n->fNullable = FALSE;
218 }
219 }
220
221
222
223
224 //-----------------------------------------------------------------------------
225 //
226 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
227 //
228 //-----------------------------------------------------------------------------
229 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
230 if (n == NULL) {
231 return;
232 }
233 if (n->fType == RBBINode::leafChar ||
234 n->fType == RBBINode::endMark ||
235 n->fType == RBBINode::lookAhead ||
236 n->fType == RBBINode::tag) {
237 // These are non-empty leaf node types.
238 // Note: In order to maintain the sort invariant on the set,
239 // this function should only be called on a node whose set is
240 // empty to start with.
241 n->fFirstPosSet->addElement(n, *fStatus);
242 return;
243 }
244
245 // The node is not a leaf.
246 // Calculate firstPos on its children.
247 calcFirstPos(n->fLeftChild);
248 calcFirstPos(n->fRightChild);
249
250 // Apply functions from table 3.40 in Aho
251 if (n->fType == RBBINode::opOr) {
252 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
253 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
254 }
255 else if (n->fType == RBBINode::opCat) {
256 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
257 if (n->fLeftChild->fNullable) {
258 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
259 }
260 }
261 else if (n->fType == RBBINode::opStar ||
262 n->fType == RBBINode::opQuestion ||
263 n->fType == RBBINode::opPlus) {
264 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
265 }
266 }
267
268
269
270 //-----------------------------------------------------------------------------
271 //
272 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
273 //
274 //-----------------------------------------------------------------------------
275 void RBBITableBuilder::calcLastPos(RBBINode *n) {
276 if (n == NULL) {
277 return;
278 }
279 if (n->fType == RBBINode::leafChar ||
280 n->fType == RBBINode::endMark ||
281 n->fType == RBBINode::lookAhead ||
282 n->fType == RBBINode::tag) {
283 // These are non-empty leaf node types.
284 // Note: In order to maintain the sort invariant on the set,
285 // this function should only be called on a node whose set is
286 // empty to start with.
287 n->fLastPosSet->addElement(n, *fStatus);
288 return;
289 }
290
291 // The node is not a leaf.
292 // Calculate lastPos on its children.
293 calcLastPos(n->fLeftChild);
294 calcLastPos(n->fRightChild);
295
296 // Apply functions from table 3.40 in Aho
297 if (n->fType == RBBINode::opOr) {
298 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
299 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
300 }
301 else if (n->fType == RBBINode::opCat) {
302 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
303 if (n->fRightChild->fNullable) {
304 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
305 }
306 }
307 else if (n->fType == RBBINode::opStar ||
308 n->fType == RBBINode::opQuestion ||
309 n->fType == RBBINode::opPlus) {
310 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
311 }
312 }
313
314
315
316 //-----------------------------------------------------------------------------
317 //
318 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
319 //
320 //-----------------------------------------------------------------------------
321 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
322 if (n == NULL ||
323 n->fType == RBBINode::leafChar ||
324 n->fType == RBBINode::endMark) {
325 return;
326 }
327
328 calcFollowPos(n->fLeftChild);
329 calcFollowPos(n->fRightChild);
330
331 // Aho rule #1
332 if (n->fType == RBBINode::opCat) {
333 RBBINode *i; // is 'i' in Aho's description
334 uint32_t ix;
335
336 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
337
338 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
339 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
340 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
341 }
342 }
343
344 // Aho rule #2
345 if (n->fType == RBBINode::opStar ||
346 n->fType == RBBINode::opPlus) {
347 RBBINode *i; // again, n and i are the names from Aho's description.
348 uint32_t ix;
349
350 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
351 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
352 setAdd(i->fFollowPos, n->fFirstPosSet);
353 }
354 }
355
356
357
358 }
359
360
361 //-----------------------------------------------------------------------------
362 //
363 // calcChainedFollowPos. Modify the previously calculated followPos sets
364 // to implement rule chaining. NOT described by Aho
365 //
366 //-----------------------------------------------------------------------------
367 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
368
369 UVector endMarkerNodes(*fStatus);
370 UVector leafNodes(*fStatus);
371 int32_t i;
372
373 if (U_FAILURE(*fStatus)) {
374 return;
375 }
376
377 // get a list of all endmarker nodes.
378 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
379
380 // get a list all leaf nodes
381 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
382 if (U_FAILURE(*fStatus)) {
383 return;
384 }
385
386 // Get all nodes that can be the start a match, which is FirstPosition()
387 // of the portion of the tree corresponding to user-written rules.
388 // See the tree description in bofFixup().
389 RBBINode *userRuleRoot = tree;
390 if (fRB->fSetBuilder->sawBOF()) {
391 userRuleRoot = tree->fLeftChild->fRightChild;
392 }
393 U_ASSERT(userRuleRoot != NULL);
394 UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
395
396
397 // Iteratate over all leaf nodes,
398 //
399 int32_t endNodeIx;
400 int32_t startNodeIx;
401
402 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
403 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
404 RBBINode *endNode = NULL;
405
406 // Identify leaf nodes that correspond to overall rule match positions.
407 // These include an endMarkerNode in their followPos sets.
408 for (i=0; i<endMarkerNodes.size(); i++) {
409 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
410 endNode = tNode;
411 break;
412 }
413 }
414 if (endNode == NULL) {
415 // node wasn't an end node. Try again with the next.
416 continue;
417 }
418
419 // We've got a node that can end a match.
420
421 // Line Break Specific hack: If this node's val correspond to the $CM char class,
422 // don't chain from it.
423 // TODO: Add rule syntax for this behavior, get specifics out of here and
424 // into the rule file.
425 if (fRB->fLBCMNoChain) {
426 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
427 if (c != -1) {
428 // c == -1 occurs with sets containing only the {eof} marker string.
429 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
430 if (cLBProp == U_LB_COMBINING_MARK) {
431 continue;
432 }
433 }
434 }
435
436
437 // Now iterate over the nodes that can start a match, looking for ones
438 // with the same char class as our ending node.
439 RBBINode *startNode;
440 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
441 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
442 if (startNode->fType != RBBINode::leafChar) {
443 continue;
444 }
445
446 if (endNode->fVal == startNode->fVal) {
447 // The end val (character class) of one possible match is the
448 // same as the start of another.
449
450 // Add all nodes from the followPos of the start node to the
451 // followPos set of the end node, which will have the effect of
452 // letting matches transition from a match state at endNode
453 // to the second char of a match starting with startNode.
454 setAdd(endNode->fFollowPos, startNode->fFollowPos);
455 }
456 }
457 }
458 }
459
460
461 //-----------------------------------------------------------------------------
462 //
463 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
464 // Do an swizzle similar to chaining, modifying the followPos set of
465 // the bofNode to include the followPos nodes from other {bot} nodes
466 // scattered through the tree.
467 //
468 // This function has much in common with calcChainedFollowPos().
469 //
470 //-----------------------------------------------------------------------------
471 void RBBITableBuilder::bofFixup() {
472
473 if (U_FAILURE(*fStatus)) {
474 return;
475 }
476
477 // The parse tree looks like this ...
478 // fTree root ---> <cat>
479 // / \ .
480 // <cat> <#end node>
481 // / \ .
482 // <bofNode> rest
483 // of tree
484 //
485 // We will be adding things to the followPos set of the <bofNode>
486 //
487 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
488 U_ASSERT(bofNode->fType == RBBINode::leafChar);
489 U_ASSERT(bofNode->fVal == 2);
490
491 // Get all nodes that can be the start a match of the user-written rules
492 // (excluding the fake bofNode)
493 // We want the nodes that can start a match in the
494 // part labeled "rest of tree"
495 //
496 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
497
498 RBBINode *startNode;
499 int startNodeIx;
500 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
501 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
502 if (startNode->fType != RBBINode::leafChar) {
503 continue;
504 }
505
506 if (startNode->fVal == bofNode->fVal) {
507 // We found a leaf node corresponding to a {bof} that was
508 // explicitly written into a rule.
509 // Add everything from the followPos set of this node to the
510 // followPos set of the fake bofNode at the start of the tree.
511 //
512 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
513 }
514 }
515 }
516
517 //-----------------------------------------------------------------------------
518 //
519 // buildStateTable() Determine the set of runtime DFA states and the
520 // transition tables for these states, by the algorithm
521 // of fig. 3.44 in Aho.
522 //
523 // Most of the comments are quotes of Aho's psuedo-code.
524 //
525 //-----------------------------------------------------------------------------
526 void RBBITableBuilder::buildStateTable() {
527 if (U_FAILURE(*fStatus)) {
528 return;
529 }
530 //
531 // Add a dummy state 0 - the stop state. Not from Aho.
532 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
533 RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
534 failState->fPositions = new UVector(*fStatus);
535 if (U_FAILURE(*fStatus)) {
536 return;
537 }
538 fDStates->addElement(failState, *fStatus);
539 if (U_FAILURE(*fStatus)) {
540 return;
541 }
542
543 // initially, the only unmarked state in Dstates is firstpos(root),
544 // where toot is the root of the syntax tree for (r)#;
545 RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
546 if (U_FAILURE(*fStatus)) {
547 return;
548 }
549 initialState->fPositions = new UVector(*fStatus);
550 if (U_FAILURE(*fStatus)) {
551 return;
552 }
553 setAdd(initialState->fPositions, fTree->fFirstPosSet);
554 fDStates->addElement(initialState, *fStatus);
555 if (U_FAILURE(*fStatus)) {
556 return;
557 }
558
559 // while there is an unmarked state T in Dstates do begin
560 for (;;) {
561 RBBIStateDescriptor *T = NULL;
562 int32_t tx;
563 for (tx=1; tx<fDStates->size(); tx++) {
564 RBBIStateDescriptor *temp;
565 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
566 if (temp->fMarked == FALSE) {
567 T = temp;
568 break;
569 }
570 }
571 if (T == NULL) {
572 break;
573 }
574
575 // mark T;
576 T->fMarked = TRUE;
577
578 // for each input symbol a do begin
579 int32_t a;
580 for (a = 1; a<=lastInputSymbol; a++) {
581 // let U be the set of positions that are in followpos(p)
582 // for some position p in T
583 // such that the symbol at position p is a;
584 UVector *U = NULL;
585 RBBINode *p;
586 int32_t px;
587 for (px=0; px<T->fPositions->size(); px++) {
588 p = (RBBINode *)T->fPositions->elementAt(px);
589 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
590 if (U == NULL) {
591 U = new UVector(*fStatus);
592 }
593 setAdd(U, p->fFollowPos);
594 }
595 }
596
597 // if U is not empty and not in DStates then
598 int32_t ux = 0;
599 UBool UinDstates = FALSE;
600 if (U != NULL) {
601 U_ASSERT(U->size() > 0);
602 int ix;
603 for (ix=0; ix<fDStates->size(); ix++) {
604 RBBIStateDescriptor *temp2;
605 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
606 if (setEquals(U, temp2->fPositions)) {
607 delete U;
608 U = temp2->fPositions;
609 ux = ix;
610 UinDstates = TRUE;
611 break;
612 }
613 }
614
615 // Add U as an unmarked state to Dstates
616 if (!UinDstates)
617 {
618 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
619 if (U_FAILURE(*fStatus)) {
620 return;
621 }
622 newState->fPositions = U;
623 fDStates->addElement(newState, *fStatus);
624 if (U_FAILURE(*fStatus)) {
625 return;
626 }
627 ux = fDStates->size()-1;
628 }
629
630 // Dtran[T, a] := U;
631 T->fDtran->setElementAt(ux, a);
632 }
633 }
634 }
635 }
636
637
638
639 //-----------------------------------------------------------------------------
640 //
641 // flagAcceptingStates Identify accepting states.
642 // First get a list of all of the end marker nodes.
643 // Then, for each state s,
644 // if s contains one of the end marker nodes in its list of tree positions then
645 // s is an accepting state.
646 //
647 //-----------------------------------------------------------------------------
648 void RBBITableBuilder::flagAcceptingStates() {
649 if (U_FAILURE(*fStatus)) {
650 return;
651 }
652 UVector endMarkerNodes(*fStatus);
653 RBBINode *endMarker;
654 int32_t i;
655 int32_t n;
656
657 if (U_FAILURE(*fStatus)) {
658 return;
659 }
660
661 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
662 if (U_FAILURE(*fStatus)) {
663 return;
664 }
665
666 for (i=0; i<endMarkerNodes.size(); i++) {
667 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
668 for (n=0; n<fDStates->size(); n++) {
669 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
670 if (sd->fPositions->indexOf(endMarker) >= 0) {
671 // Any non-zero value for fAccepting means this is an accepting node.
672 // The value is what will be returned to the user as the break status.
673 // If no other value was specified, force it to -1.
674
675 if (sd->fAccepting==0) {
676 // State hasn't been marked as accepting yet. Do it now.
677 sd->fAccepting = endMarker->fVal;
678 if (sd->fAccepting == 0) {
679 sd->fAccepting = -1;
680 }
681 }
682 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
683 // Both lookahead and non-lookahead accepting for this state.
684 // Favor the look-ahead. Expedient for line break.
685 // TODO: need a more elegant resolution for conflicting rules.
686 sd->fAccepting = endMarker->fVal;
687 }
688 // implicit else:
689 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
690
691 // If the end marker node is from a look-ahead rule, set
692 // the fLookAhead field or this state also.
693 if (endMarker->fLookAheadEnd) {
694 // TODO: don't change value if already set?
695 // TODO: allow for more than one active look-ahead rule in engine.
696 // Make value here an index to a side array in engine?
697 sd->fLookAhead = sd->fAccepting;
698 }
699 }
700 }
701 }
702 }
703
704
705 //-----------------------------------------------------------------------------
706 //
707 // flagLookAheadStates Very similar to flagAcceptingStates, above.
708 //
709 //-----------------------------------------------------------------------------
710 void RBBITableBuilder::flagLookAheadStates() {
711 if (U_FAILURE(*fStatus)) {
712 return;
713 }
714 UVector lookAheadNodes(*fStatus);
715 RBBINode *lookAheadNode;
716 int32_t i;
717 int32_t n;
718
719 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
720 if (U_FAILURE(*fStatus)) {
721 return;
722 }
723 for (i=0; i<lookAheadNodes.size(); i++) {
724 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
725
726 for (n=0; n<fDStates->size(); n++) {
727 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
728 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
729 sd->fLookAhead = lookAheadNode->fVal;
730 }
731 }
732 }
733 }
734
735
736
737
738 //-----------------------------------------------------------------------------
739 //
740 // flagTaggedStates
741 //
742 //-----------------------------------------------------------------------------
743 void RBBITableBuilder::flagTaggedStates() {
744 if (U_FAILURE(*fStatus)) {
745 return;
746 }
747 UVector tagNodes(*fStatus);
748 RBBINode *tagNode;
749 int32_t i;
750 int32_t n;
751
752 if (U_FAILURE(*fStatus)) {
753 return;
754 }
755 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
756 if (U_FAILURE(*fStatus)) {
757 return;
758 }
759 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
760 tagNode = (RBBINode *)tagNodes.elementAt(i);
761
762 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
763 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
764 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
765 sortedAdd(&sd->fTagVals, tagNode->fVal);
766 }
767 }
768 }
769 }
770
771
772
773
774 //-----------------------------------------------------------------------------
775 //
776 // mergeRuleStatusVals
777 //
778 // Update the global table of rule status {tag} values
779 // The rule builder has a global vector of status values that are common
780 // for all tables. Merge the ones from this table into the global set.
781 //
782 //-----------------------------------------------------------------------------
783 void RBBITableBuilder::mergeRuleStatusVals() {
784 //
785 // The basic outline of what happens here is this...
786 //
787 // for each state in this state table
788 // if the status tag list for this state is in the global statuses list
789 // record where and
790 // continue with the next state
791 // else
792 // add the tag list for this state to the global list.
793 //
794 int i;
795 int n;
796
797 // Pre-set a single tag of {0} into the table.
798 // We will need this as a default, for rule sets with no explicit tagging.
799 if (fRB->fRuleStatusVals->size() == 0) {
800 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
801 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
802 }
803
804 // For each state
805 for (n=0; n<fDStates->size(); n++) {
806 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
807 UVector *thisStatesTagValues = sd->fTagVals;
808 if (thisStatesTagValues == NULL) {
809 // No tag values are explicitly associated with this state.
810 // Set the default tag value.
811 sd->fTagsIdx = 0;
812 continue;
813 }
814
815 // There are tag(s) associated with this state.
816 // fTagsIdx will be the index into the global tag list for this state's tag values.
817 // Initial value of -1 flags that we haven't got it set yet.
818 sd->fTagsIdx = -1;
819 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
820 int32_t nextTagGroupStart = 0;
821
822 // Loop runs once per group of tags in the global list
823 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
824 thisTagGroupStart = nextTagGroupStart;
825 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
826 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
827 // The number of tags for this state is different from
828 // the number of tags in this group from the global list.
829 // Continue with the next group from the global list.
830 continue;
831 }
832 // The lengths match, go ahead and compare the actual tag values
833 // between this state and the group from the global list.
834 for (i=0; i<thisStatesTagValues->size(); i++) {
835 if (thisStatesTagValues->elementAti(i) !=
836 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
837 // Mismatch.
838 break;
839 }
840 }
841
842 if (i == thisStatesTagValues->size()) {
843 // We found a set of tag values in the global list that match
844 // those for this state. Use them.
845 sd->fTagsIdx = thisTagGroupStart;
846 break;
847 }
848 }
849
850 if (sd->fTagsIdx == -1) {
851 // No suitable entry in the global tag list already. Add one
852 sd->fTagsIdx = fRB->fRuleStatusVals->size();
853 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
854 for (i=0; i<thisStatesTagValues->size(); i++) {
855 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
856 }
857 }
858 }
859 }
860
861
862
863
864
865
866
867 //-----------------------------------------------------------------------------
868 //
869 // sortedAdd Add a value to a vector of sorted values (ints).
870 // Do not replicate entries; if the value is already there, do not
871 // add a second one.
872 // Lazily create the vector if it does not already exist.
873 //
874 //-----------------------------------------------------------------------------
875 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
876 int32_t i;
877
878 if (*vector == NULL) {
879 *vector = new UVector(*fStatus);
880 }
881 if (*vector == NULL || U_FAILURE(*fStatus)) {
882 return;
883 }
884 UVector *vec = *vector;
885 int32_t vSize = vec->size();
886 for (i=0; i<vSize; i++) {
887 int32_t valAtI = vec->elementAti(i);
888 if (valAtI == val) {
889 // The value is already in the vector. Don't add it again.
890 return;
891 }
892 if (valAtI > val) {
893 break;
894 }
895 }
896 vec->insertElementAt(val, i, *fStatus);
897 }
898
899
900
901 //-----------------------------------------------------------------------------
902 //
903 // setAdd Set operation on UVector
904 // dest = dest union source
905 // Elements may only appear once and must be sorted.
906 //
907 //-----------------------------------------------------------------------------
908 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
909 int destOriginalSize = dest->size();
910 int sourceSize = source->size();
911 int32_t di = 0;
912 void *(destS[16]), *(sourceS[16]); // Handle small cases without malloc
913 void **destH = 0, **sourceH = 0;
914 void **destBuff, **sourceBuff;
915 void **destLim, **sourceLim;
916
917 if (destOriginalSize > sizeof(destS)/sizeof(destS[0])) {
918 destH = (void **)uprv_malloc(sizeof(void *) * destOriginalSize);
919 destBuff = destH;
920 }
921 else {
922 destBuff = destS;
923 }
924 if (destBuff == 0) {
925 return;
926 }
927 destLim = destBuff + destOriginalSize;
928
929 if (sourceSize > sizeof(sourceS)/sizeof(sourceS[0])) {
930 sourceH = (void **)uprv_malloc(sizeof(void *) * sourceSize);
931 sourceBuff = sourceH;
932 }
933 else {
934 sourceBuff = sourceS;
935 }
936 if (sourceBuff == 0) {
937 if (destH) {
938 uprv_free(destH);
939 }
940 return;
941 }
942 sourceLim = sourceBuff + sourceSize;
943
944 // Avoid multiple "get element" calls by getting the contents into arrays
945 (void) dest->toArray(destBuff);
946 (void) source->toArray(sourceBuff);
947
948 dest->setSize(sourceSize+destOriginalSize);
949
950 while (sourceBuff < sourceLim && destBuff < destLim) {
951 if (*destBuff < *sourceBuff) {
952 dest->setElementAt(*destBuff++, di++);
953 }
954 else if (*sourceBuff < *destBuff) {
955 dest->setElementAt(*sourceBuff++, di++);
956 }
957 else {
958 dest->setElementAt(*sourceBuff++, di++);
959 destBuff++;
960 }
961 }
962
963 // At most one of these two cleanup loops will execute
964 while (destBuff < destLim) {
965 dest->setElementAt(*destBuff++, di++);
966 }
967 while (sourceBuff < sourceLim) {
968 dest->setElementAt(*sourceBuff++, di++);
969 }
970
971 dest->setSize(di);
972 if (destH) {
973 uprv_free(destH);
974 }
975 if (sourceH) {
976 uprv_free(sourceH);
977 }
978 }
979
980
981
982 //-----------------------------------------------------------------------------
983 //
984 // setEqual Set operation on UVector.
985 // Compare for equality.
986 // Elements must be sorted.
987 //
988 //-----------------------------------------------------------------------------
989 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
990 return a->equals(*b);
991 }
992
993
994 //-----------------------------------------------------------------------------
995 //
996 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
997 // for each node in the tree.
998 //
999 //-----------------------------------------------------------------------------
1000 #ifdef RBBI_DEBUG
1001 void RBBITableBuilder::printPosSets(RBBINode *n) {
1002 if (n==NULL) {
1003 return;
1004 }
1005 n->printNode();
1006 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1007
1008 RBBIDebugPrintf(" firstpos: ");
1009 printSet(n->fFirstPosSet);
1010
1011 RBBIDebugPrintf(" lastpos: ");
1012 printSet(n->fLastPosSet);
1013
1014 RBBIDebugPrintf(" followpos: ");
1015 printSet(n->fFollowPos);
1016
1017 printPosSets(n->fLeftChild);
1018 printPosSets(n->fRightChild);
1019 }
1020 #endif
1021
1022
1023
1024 //-----------------------------------------------------------------------------
1025 //
1026 // getTableSize() Calculate the size of the runtime form of this
1027 // state transition table.
1028 //
1029 //-----------------------------------------------------------------------------
1030 int32_t RBBITableBuilder::getTableSize() const {
1031 int32_t size = 0;
1032 int32_t numRows;
1033 int32_t numCols;
1034 int32_t rowSize;
1035
1036 if (fTree == NULL) {
1037 return 0;
1038 }
1039
1040 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
1041
1042 numRows = fDStates->size();
1043 numCols = fRB->fSetBuilder->getNumCharCategories();
1044
1045 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1046 // Therefore we subtract two from numCols when determining
1047 // how much storage to add to a row for the total columns.
1048 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1049 size += numRows * rowSize;
1050 return size;
1051 }
1052
1053
1054
1055 //-----------------------------------------------------------------------------
1056 //
1057 // exportTable() export the state transition table in the format required
1058 // by the runtime engine. getTableSize() bytes of memory
1059 // must be available at the output address "where".
1060 //
1061 //-----------------------------------------------------------------------------
1062 void RBBITableBuilder::exportTable(void *where) {
1063 RBBIStateTable *table = (RBBIStateTable *)where;
1064 uint32_t state;
1065 int col;
1066
1067 if (U_FAILURE(*fStatus) || fTree == NULL) {
1068 return;
1069 }
1070
1071 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1072 fDStates->size() > 0x7fff) {
1073 *fStatus = U_BRK_INTERNAL_ERROR;
1074 return;
1075 }
1076
1077 table->fRowLen = sizeof(RBBIStateTableRow) +
1078 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1079 table->fNumStates = fDStates->size();
1080 table->fFlags = 0;
1081 if (fRB->fLookAheadHardBreak) {
1082 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1083 }
1084 if (fRB->fSetBuilder->sawBOF()) {
1085 table->fFlags |= RBBI_BOF_REQUIRED;
1086 }
1087 table->fReserved = 0;
1088
1089 for (state=0; state<table->fNumStates; state++) {
1090 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1091 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1092 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1093 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1094 row->fAccepting = (int16_t)sd->fAccepting;
1095 row->fLookAhead = (int16_t)sd->fLookAhead;
1096 row->fTagIdx = (int16_t)sd->fTagsIdx;
1097 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1098 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1099 }
1100 }
1101 }
1102
1103
1104
1105 //-----------------------------------------------------------------------------
1106 //
1107 // printSet Debug function. Print the contents of a UVector
1108 //
1109 //-----------------------------------------------------------------------------
1110 #ifdef RBBI_DEBUG
1111 void RBBITableBuilder::printSet(UVector *s) {
1112 int32_t i;
1113 for (i=0; i<s->size(); i++) {
1114 void *v = s->elementAt(i);
1115 RBBIDebugPrintf("%10p", v);
1116 }
1117 RBBIDebugPrintf("\n");
1118 }
1119 #endif
1120
1121
1122 //-----------------------------------------------------------------------------
1123 //
1124 // printStates Debug Function. Dump the fully constructed state transition table.
1125 //
1126 //-----------------------------------------------------------------------------
1127 #ifdef RBBI_DEBUG
1128 void RBBITableBuilder::printStates() {
1129 int c; // input "character"
1130 int n; // state number
1131
1132 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1133 RBBIDebugPrintf(" | Acc LA Tag");
1134 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1135 RBBIDebugPrintf(" %2d", c);
1136 }
1137 RBBIDebugPrintf("\n");
1138 RBBIDebugPrintf(" |---------------");
1139 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1140 RBBIDebugPrintf("---");
1141 }
1142 RBBIDebugPrintf("\n");
1143
1144 for (n=0; n<fDStates->size(); n++) {
1145 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1146 RBBIDebugPrintf(" %3d | " , n);
1147 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1148 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1149 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1150 }
1151 RBBIDebugPrintf("\n");
1152 }
1153 RBBIDebugPrintf("\n\n");
1154 }
1155 #endif
1156
1157
1158
1159 //-----------------------------------------------------------------------------
1160 //
1161 // printRuleStatusTable Debug Function. Dump the common rule status table
1162 //
1163 //-----------------------------------------------------------------------------
1164 #ifdef RBBI_DEBUG
1165 void RBBITableBuilder::printRuleStatusTable() {
1166 int32_t thisRecord = 0;
1167 int32_t nextRecord = 0;
1168 int i;
1169 UVector *tbl = fRB->fRuleStatusVals;
1170
1171 RBBIDebugPrintf("index | tags \n");
1172 RBBIDebugPrintf("-------------------\n");
1173
1174 while (nextRecord < tbl->size()) {
1175 thisRecord = nextRecord;
1176 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1177 RBBIDebugPrintf("%4d ", thisRecord);
1178 for (i=thisRecord+1; i<nextRecord; i++) {
1179 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1180 }
1181 RBBIDebugPrintf("\n");
1182 }
1183 RBBIDebugPrintf("\n\n");
1184 }
1185 #endif
1186
1187
1188 //-----------------------------------------------------------------------------
1189 //
1190 // RBBIStateDescriptor Methods. This is a very struct-like class
1191 // Most access is directly to the fields.
1192 //
1193 //-----------------------------------------------------------------------------
1194
1195 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1196 fMarked = FALSE;
1197 fAccepting = 0;
1198 fLookAhead = 0;
1199 fTagsIdx = 0;
1200 fTagVals = NULL;
1201 fPositions = NULL;
1202 fDtran = NULL;
1203
1204 fDtran = new UVector(lastInputSymbol+1, *fStatus);
1205 if (U_FAILURE(*fStatus)) {
1206 return;
1207 }
1208 if (fDtran == NULL) {
1209 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1210 return;
1211 }
1212 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1213 // It is indexed by input symbols, and will
1214 // hold the next state number for each
1215 // symbol.
1216 }
1217
1218
1219 RBBIStateDescriptor::~RBBIStateDescriptor() {
1220 delete fPositions;
1221 delete fDtran;
1222 delete fTagVals;
1223 fPositions = NULL;
1224 fDtran = NULL;
1225 fTagVals = NULL;
1226 }
1227
1228 U_NAMESPACE_END
1229
1230 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */