]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbitblb57.cpp
ICU-62108.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb57.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include "unicode/unistr.h"
16 #include "rbbitblb57.h"
17 #include "rbbirb57.h"
18 #include "rbbisetb57.h"
19 #include "rbbidata57.h"
20 #include "cstring.h"
21 #include "uassert.h"
22 #include "cmemory.h"
23
24 U_NAMESPACE_BEGIN
25
26 RBBITableBuilder57::RBBITableBuilder57(RBBIRuleBuilder57 *rb, RBBINode **rootNode) :
27 fTree(*rootNode) {
28 fRB = rb;
29 fStatus = fRB->fStatus;
30 UErrorCode status = U_ZERO_ERROR;
31 fDStates = new UVector(status);
32 if (U_FAILURE(*fStatus)) {
33 return;
34 }
35 if (U_FAILURE(status)) {
36 *fStatus = status;
37 return;
38 }
39 if (fDStates == NULL) {
40 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
41 }
42 }
43
44
45
46 RBBITableBuilder57::~RBBITableBuilder57() {
47 int i;
48 for (i=0; i<fDStates->size(); i++) {
49 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
50 }
51 delete fDStates;
52 }
53
54
55 //-----------------------------------------------------------------------------
56 //
57 // RBBITableBuilder57::build - This is the main function for building the DFA state transtion
58 // table from the RBBI rules parse tree.
59 //
60 //-----------------------------------------------------------------------------
61 void RBBITableBuilder57::build() {
62
63 if (U_FAILURE(*fStatus)) {
64 return;
65 }
66
67 // If there were no rules, just return. This situation can easily arise
68 // for the reverse rules.
69 if (fTree==NULL) {
70 return;
71 }
72
73 //
74 // Walk through the tree, replacing any references to $variables with a copy of the
75 // parse tree for the substition expression.
76 //
77 fTree = fTree->flattenVariables();
78 #ifdef RBBI_DEBUG
79 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
80 RBBIDebugPuts("\nParse tree after flattening variable references.");
81 fTree->printTree(TRUE);
82 }
83 #endif
84
85 //
86 // If the rules contained any references to {bof}
87 // add a {bof} <cat> <former root of tree> to the
88 // tree. Means that all matches must start out with the
89 // {bof} fake character.
90 //
91 if (fRB->fSetBuilder->sawBOF()) {
92 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
93 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
94 // Delete and exit if memory allocation failed.
95 if (bofTop == NULL || bofLeaf == NULL) {
96 *fStatus = U_MEMORY_ALLOCATION_ERROR;
97 delete bofTop;
98 delete bofLeaf;
99 return;
100 }
101 bofTop->fLeftChild = bofLeaf;
102 bofTop->fRightChild = fTree;
103 bofLeaf->fParent = bofTop;
104 bofLeaf->fVal = 2; // Reserved value for {bof}.
105 fTree = bofTop;
106 }
107
108 //
109 // Add a unique right-end marker to the expression.
110 // Appears as a cat-node, left child being the original tree,
111 // right child being the end marker.
112 //
113 RBBINode *cn = new RBBINode(RBBINode::opCat);
114 // Exit if memory allocation failed.
115 if (cn == NULL) {
116 *fStatus = U_MEMORY_ALLOCATION_ERROR;
117 return;
118 }
119 cn->fLeftChild = fTree;
120 fTree->fParent = cn;
121 cn->fRightChild = new RBBINode(RBBINode::endMark);
122 // Delete and exit if memory allocation failed.
123 if (cn->fRightChild == NULL) {
124 *fStatus = U_MEMORY_ALLOCATION_ERROR;
125 delete cn;
126 return;
127 }
128 cn->fRightChild->fParent = cn;
129 fTree = cn;
130
131 //
132 // Replace all references to UnicodeSets with the tree for the equivalent
133 // expression.
134 //
135 fTree->flattenSets();
136 #ifdef RBBI_DEBUG
137 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
138 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
139 fTree->printTree(TRUE);
140 }
141 #endif
142
143
144 //
145 // calculate the functions nullable, firstpos, lastpos and followpos on
146 // nodes in the parse tree.
147 // See the alogrithm description in Aho.
148 // Understanding how this works by looking at the code alone will be
149 // nearly impossible.
150 //
151 calcNullable(fTree);
152 calcFirstPos(fTree);
153 calcLastPos(fTree);
154 calcFollowPos(fTree);
155 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
156 RBBIDebugPuts("\n");
157 printPosSets(fTree);
158 }
159
160 //
161 // For "chained" rules, modify the followPos sets
162 //
163 if (fRB->fChainRules) {
164 calcChainedFollowPos(fTree);
165 }
166
167 //
168 // BOF (start of input) test fixup.
169 //
170 if (fRB->fSetBuilder->sawBOF()) {
171 bofFixup();
172 }
173
174 //
175 // Build the DFA state transition tables.
176 //
177 buildStateTable();
178 flagAcceptingStates();
179 flagLookAheadStates();
180 flagTaggedStates();
181
182 //
183 // Update the global table of rule status {tag} values
184 // The rule builder has a global vector of status values that are common
185 // for all tables. Merge the ones from this table into the global set.
186 //
187 mergeRuleStatusVals();
188
189 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
190 }
191
192
193
194 //-----------------------------------------------------------------------------
195 //
196 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
197 //
198 //-----------------------------------------------------------------------------
199 void RBBITableBuilder57::calcNullable(RBBINode *n) {
200 if (n == NULL) {
201 return;
202 }
203 if (n->fType == RBBINode::setRef ||
204 n->fType == RBBINode::endMark ) {
205 // These are non-empty leaf node types.
206 n->fNullable = FALSE;
207 return;
208 }
209
210 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
211 // Lookahead marker node. It's a leaf, so no recursion on children.
212 // It's nullable because it does not match any literal text from the input stream.
213 n->fNullable = TRUE;
214 return;
215 }
216
217
218 // The node is not a leaf.
219 // Calculate nullable on its children.
220 calcNullable(n->fLeftChild);
221 calcNullable(n->fRightChild);
222
223 // Apply functions from table 3.40 in Aho
224 if (n->fType == RBBINode::opOr) {
225 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
226 }
227 else if (n->fType == RBBINode::opCat) {
228 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
229 }
230 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
231 n->fNullable = TRUE;
232 }
233 else {
234 n->fNullable = FALSE;
235 }
236 }
237
238
239
240
241 //-----------------------------------------------------------------------------
242 //
243 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
244 //
245 //-----------------------------------------------------------------------------
246 void RBBITableBuilder57::calcFirstPos(RBBINode *n) {
247 if (n == NULL) {
248 return;
249 }
250 if (n->fType == RBBINode::leafChar ||
251 n->fType == RBBINode::endMark ||
252 n->fType == RBBINode::lookAhead ||
253 n->fType == RBBINode::tag) {
254 // These are non-empty leaf node types.
255 // Note: In order to maintain the sort invariant on the set,
256 // this function should only be called on a node whose set is
257 // empty to start with.
258 n->fFirstPosSet->addElement(n, *fStatus);
259 return;
260 }
261
262 // The node is not a leaf.
263 // Calculate firstPos on its children.
264 calcFirstPos(n->fLeftChild);
265 calcFirstPos(n->fRightChild);
266
267 // Apply functions from table 3.40 in Aho
268 if (n->fType == RBBINode::opOr) {
269 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
270 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
271 }
272 else if (n->fType == RBBINode::opCat) {
273 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
274 if (n->fLeftChild->fNullable) {
275 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
276 }
277 }
278 else if (n->fType == RBBINode::opStar ||
279 n->fType == RBBINode::opQuestion ||
280 n->fType == RBBINode::opPlus) {
281 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
282 }
283 }
284
285
286
287 //-----------------------------------------------------------------------------
288 //
289 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
290 //
291 //-----------------------------------------------------------------------------
292 void RBBITableBuilder57::calcLastPos(RBBINode *n) {
293 if (n == NULL) {
294 return;
295 }
296 if (n->fType == RBBINode::leafChar ||
297 n->fType == RBBINode::endMark ||
298 n->fType == RBBINode::lookAhead ||
299 n->fType == RBBINode::tag) {
300 // These are non-empty leaf node types.
301 // Note: In order to maintain the sort invariant on the set,
302 // this function should only be called on a node whose set is
303 // empty to start with.
304 n->fLastPosSet->addElement(n, *fStatus);
305 return;
306 }
307
308 // The node is not a leaf.
309 // Calculate lastPos on its children.
310 calcLastPos(n->fLeftChild);
311 calcLastPos(n->fRightChild);
312
313 // Apply functions from table 3.40 in Aho
314 if (n->fType == RBBINode::opOr) {
315 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
316 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
317 }
318 else if (n->fType == RBBINode::opCat) {
319 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
320 if (n->fRightChild->fNullable) {
321 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322 }
323 }
324 else if (n->fType == RBBINode::opStar ||
325 n->fType == RBBINode::opQuestion ||
326 n->fType == RBBINode::opPlus) {
327 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328 }
329 }
330
331
332
333 //-----------------------------------------------------------------------------
334 //
335 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
336 //
337 //-----------------------------------------------------------------------------
338 void RBBITableBuilder57::calcFollowPos(RBBINode *n) {
339 if (n == NULL ||
340 n->fType == RBBINode::leafChar ||
341 n->fType == RBBINode::endMark) {
342 return;
343 }
344
345 calcFollowPos(n->fLeftChild);
346 calcFollowPos(n->fRightChild);
347
348 // Aho rule #1
349 if (n->fType == RBBINode::opCat) {
350 RBBINode *i; // is 'i' in Aho's description
351 uint32_t ix;
352
353 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
354
355 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
356 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
357 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
358 }
359 }
360
361 // Aho rule #2
362 if (n->fType == RBBINode::opStar ||
363 n->fType == RBBINode::opPlus) {
364 RBBINode *i; // again, n and i are the names from Aho's description.
365 uint32_t ix;
366
367 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
368 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
369 setAdd(i->fFollowPos, n->fFirstPosSet);
370 }
371 }
372
373
374
375 }
376
377 //-----------------------------------------------------------------------------
378 //
379 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
380 // as roots of a rule to a destination vector.
381 //
382 //-----------------------------------------------------------------------------
383 void RBBITableBuilder57::addRuleRootNodes(UVector *dest, RBBINode *node) {
384 if (node == NULL || U_FAILURE(*fStatus)) {
385 return;
386 }
387 if (node->fRuleRoot) {
388 dest->addElement(node, *fStatus);
389 // Note: rules cannot nest. If we found a rule start node,
390 // no child node can also be a start node.
391 return;
392 }
393 addRuleRootNodes(dest, node->fLeftChild);
394 addRuleRootNodes(dest, node->fRightChild);
395 }
396
397 //-----------------------------------------------------------------------------
398 //
399 // calcChainedFollowPos. Modify the previously calculated followPos sets
400 // to implement rule chaining. NOT described by Aho
401 //
402 //-----------------------------------------------------------------------------
403 void RBBITableBuilder57::calcChainedFollowPos(RBBINode *tree) {
404
405 UVector endMarkerNodes(*fStatus);
406 UVector leafNodes(*fStatus);
407 int32_t i;
408
409 if (U_FAILURE(*fStatus)) {
410 return;
411 }
412
413 // get a list of all endmarker nodes.
414 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
415
416 // get a list all leaf nodes
417 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
418 if (U_FAILURE(*fStatus)) {
419 return;
420 }
421
422 // Collect all leaf nodes that can start matches for rules
423 // with inbound chaining enabled, which is the union of the
424 // firstPosition sets from each of the rule root nodes.
425
426 UVector ruleRootNodes(*fStatus);
427 addRuleRootNodes(&ruleRootNodes, tree);
428
429 UVector matchStartNodes(*fStatus);
430 for (int i=0; i<ruleRootNodes.size(); ++i) {
431 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
432 if (node->fChainIn) {
433 setAdd(&matchStartNodes, node->fFirstPosSet);
434 }
435 }
436 if (U_FAILURE(*fStatus)) {
437 return;
438 }
439
440 int32_t endNodeIx;
441 int32_t startNodeIx;
442
443 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
444 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
445 RBBINode *endNode = NULL;
446
447 // Identify leaf nodes that correspond to overall rule match positions.
448 // These include an endMarkerNode in their followPos sets.
449 for (i=0; i<endMarkerNodes.size(); i++) {
450 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
451 endNode = tNode;
452 break;
453 }
454 }
455 if (endNode == NULL) {
456 // node wasn't an end node. Try again with the next.
457 continue;
458 }
459
460 // We've got a node that can end a match.
461
462 // Line Break Specific hack: If this node's val correspond to the $CM char class,
463 // don't chain from it.
464 // TODO: Add rule syntax for this behavior, get specifics out of here and
465 // into the rule file.
466 if (fRB->fLBCMNoChain || fRB->fRINoChain) {
467 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
468 if (c != -1) {
469 // c == -1 occurs with sets containing only the {eof} marker string.
470 if (fRB->fLBCMNoChain) {
471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472 if (cLBProp == U_LB_COMBINING_MARK) {
473 continue;
474 }
475 }
476 if (fRB->fRINoChain) {
477 UGraphemeClusterBreak cGBProp = (UGraphemeClusterBreak)u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
478 if (cGBProp == U_GCB_REGIONAL_INDICATOR) {
479 continue;
480 }
481 }
482 }
483 }
484
485
486 // Now iterate over the nodes that can start a match, looking for ones
487 // with the same char class as our ending node.
488 RBBINode *startNode;
489 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
490 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
491 if (startNode->fType != RBBINode::leafChar) {
492 continue;
493 }
494
495 if (endNode->fVal == startNode->fVal) {
496 // The end val (character class) of one possible match is the
497 // same as the start of another.
498
499 // Add all nodes from the followPos of the start node to the
500 // followPos set of the end node, which will have the effect of
501 // letting matches transition from a match state at endNode
502 // to the second char of a match starting with startNode.
503 setAdd(endNode->fFollowPos, startNode->fFollowPos);
504 }
505 }
506 }
507 }
508
509
510 //-----------------------------------------------------------------------------
511 //
512 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
513 // Do an swizzle similar to chaining, modifying the followPos set of
514 // the bofNode to include the followPos nodes from other {bot} nodes
515 // scattered through the tree.
516 //
517 // This function has much in common with calcChainedFollowPos().
518 //
519 //-----------------------------------------------------------------------------
520 void RBBITableBuilder57::bofFixup() {
521
522 if (U_FAILURE(*fStatus)) {
523 return;
524 }
525
526 // The parse tree looks like this ...
527 // fTree root ---> <cat>
528 // / \ .
529 // <cat> <#end node>
530 // / \ .
531 // <bofNode> rest
532 // of tree
533 //
534 // We will be adding things to the followPos set of the <bofNode>
535 //
536 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
537 U_ASSERT(bofNode->fType == RBBINode::leafChar);
538 U_ASSERT(bofNode->fVal == 2);
539
540 // Get all nodes that can be the start a match of the user-written rules
541 // (excluding the fake bofNode)
542 // We want the nodes that can start a match in the
543 // part labeled "rest of tree"
544 //
545 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
546
547 RBBINode *startNode;
548 int startNodeIx;
549 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
550 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
551 if (startNode->fType != RBBINode::leafChar) {
552 continue;
553 }
554
555 if (startNode->fVal == bofNode->fVal) {
556 // We found a leaf node corresponding to a {bof} that was
557 // explicitly written into a rule.
558 // Add everything from the followPos set of this node to the
559 // followPos set of the fake bofNode at the start of the tree.
560 //
561 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
562 }
563 }
564 }
565
566 //-----------------------------------------------------------------------------
567 //
568 // buildStateTable() Determine the set of runtime DFA states and the
569 // transition tables for these states, by the algorithm
570 // of fig. 3.44 in Aho.
571 //
572 // Most of the comments are quotes of Aho's psuedo-code.
573 //
574 //-----------------------------------------------------------------------------
575 void RBBITableBuilder57::buildStateTable() {
576 if (U_FAILURE(*fStatus)) {
577 return;
578 }
579 RBBIStateDescriptor *failState;
580 // Set it to NULL to avoid uninitialized warning
581 RBBIStateDescriptor *initialState = NULL;
582 //
583 // Add a dummy state 0 - the stop state. Not from Aho.
584 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
585 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
586 if (failState == NULL) {
587 *fStatus = U_MEMORY_ALLOCATION_ERROR;
588 goto ExitBuildSTdeleteall;
589 }
590 failState->fPositions = new UVector(*fStatus);
591 if (failState->fPositions == NULL) {
592 *fStatus = U_MEMORY_ALLOCATION_ERROR;
593 }
594 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
595 goto ExitBuildSTdeleteall;
596 }
597 fDStates->addElement(failState, *fStatus);
598 if (U_FAILURE(*fStatus)) {
599 goto ExitBuildSTdeleteall;
600 }
601
602 // initially, the only unmarked state in Dstates is firstpos(root),
603 // where toot is the root of the syntax tree for (r)#;
604 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
605 if (initialState == NULL) {
606 *fStatus = U_MEMORY_ALLOCATION_ERROR;
607 }
608 if (U_FAILURE(*fStatus)) {
609 goto ExitBuildSTdeleteall;
610 }
611 initialState->fPositions = new UVector(*fStatus);
612 if (initialState->fPositions == NULL) {
613 *fStatus = U_MEMORY_ALLOCATION_ERROR;
614 }
615 if (U_FAILURE(*fStatus)) {
616 goto ExitBuildSTdeleteall;
617 }
618 setAdd(initialState->fPositions, fTree->fFirstPosSet);
619 fDStates->addElement(initialState, *fStatus);
620 if (U_FAILURE(*fStatus)) {
621 goto ExitBuildSTdeleteall;
622 }
623
624 // while there is an unmarked state T in Dstates do begin
625 for (;;) {
626 RBBIStateDescriptor *T = NULL;
627 int32_t tx;
628 for (tx=1; tx<fDStates->size(); tx++) {
629 RBBIStateDescriptor *temp;
630 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
631 if (temp->fMarked == FALSE) {
632 T = temp;
633 break;
634 }
635 }
636 if (T == NULL) {
637 break;
638 }
639
640 // mark T;
641 T->fMarked = TRUE;
642
643 // for each input symbol a do begin
644 int32_t a;
645 for (a = 1; a<=lastInputSymbol; a++) {
646 // let U be the set of positions that are in followpos(p)
647 // for some position p in T
648 // such that the symbol at position p is a;
649 UVector *U = NULL;
650 RBBINode *p;
651 int32_t px;
652 for (px=0; px<T->fPositions->size(); px++) {
653 p = (RBBINode *)T->fPositions->elementAt(px);
654 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
655 if (U == NULL) {
656 U = new UVector(*fStatus);
657 if (U == NULL) {
658 *fStatus = U_MEMORY_ALLOCATION_ERROR;
659 goto ExitBuildSTdeleteall;
660 }
661 }
662 setAdd(U, p->fFollowPos);
663 }
664 }
665
666 // if U is not empty and not in DStates then
667 int32_t ux = 0;
668 UBool UinDstates = FALSE;
669 if (U != NULL) {
670 U_ASSERT(U->size() > 0);
671 int ix;
672 for (ix=0; ix<fDStates->size(); ix++) {
673 RBBIStateDescriptor *temp2;
674 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
675 if (setEquals(U, temp2->fPositions)) {
676 delete U;
677 U = temp2->fPositions;
678 ux = ix;
679 UinDstates = TRUE;
680 break;
681 }
682 }
683
684 // Add U as an unmarked state to Dstates
685 if (!UinDstates)
686 {
687 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
688 if (newState == NULL) {
689 *fStatus = U_MEMORY_ALLOCATION_ERROR;
690 }
691 if (U_FAILURE(*fStatus)) {
692 goto ExitBuildSTdeleteall;
693 }
694 newState->fPositions = U;
695 fDStates->addElement(newState, *fStatus);
696 if (U_FAILURE(*fStatus)) {
697 return;
698 }
699 ux = fDStates->size()-1;
700 }
701
702 // Dtran[T, a] := U;
703 T->fDtran->setElementAt(ux, a);
704 }
705 }
706 }
707 return;
708 // delete local pointers only if error occured.
709 ExitBuildSTdeleteall:
710 delete initialState;
711 delete failState;
712 }
713
714
715
716 //-----------------------------------------------------------------------------
717 //
718 // flagAcceptingStates Identify accepting states.
719 // First get a list of all of the end marker nodes.
720 // Then, for each state s,
721 // if s contains one of the end marker nodes in its list of tree positions then
722 // s is an accepting state.
723 //
724 //-----------------------------------------------------------------------------
725 void RBBITableBuilder57::flagAcceptingStates() {
726 if (U_FAILURE(*fStatus)) {
727 return;
728 }
729 UVector endMarkerNodes(*fStatus);
730 RBBINode *endMarker;
731 int32_t i;
732 int32_t n;
733
734 if (U_FAILURE(*fStatus)) {
735 return;
736 }
737
738 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
739 if (U_FAILURE(*fStatus)) {
740 return;
741 }
742
743 for (i=0; i<endMarkerNodes.size(); i++) {
744 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
745 for (n=0; n<fDStates->size(); n++) {
746 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
747 if (sd->fPositions->indexOf(endMarker) >= 0) {
748 // Any non-zero value for fAccepting means this is an accepting node.
749 // The value is what will be returned to the user as the break status.
750 // If no other value was specified, force it to -1.
751
752 if (sd->fAccepting==0) {
753 // State hasn't been marked as accepting yet. Do it now.
754 sd->fAccepting = endMarker->fVal;
755 if (sd->fAccepting == 0) {
756 sd->fAccepting = -1;
757 }
758 }
759 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
760 // Both lookahead and non-lookahead accepting for this state.
761 // Favor the look-ahead. Expedient for line break.
762 // TODO: need a more elegant resolution for conflicting rules.
763 sd->fAccepting = endMarker->fVal;
764 }
765 // implicit else:
766 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
767
768 // If the end marker node is from a look-ahead rule, set
769 // the fLookAhead field or this state also.
770 if (endMarker->fLookAheadEnd) {
771 // TODO: don't change value if already set?
772 // TODO: allow for more than one active look-ahead rule in engine.
773 // Make value here an index to a side array in engine?
774 sd->fLookAhead = sd->fAccepting;
775 }
776 }
777 }
778 }
779 }
780
781
782 //-----------------------------------------------------------------------------
783 //
784 // flagLookAheadStates Very similar to flagAcceptingStates, above.
785 //
786 //-----------------------------------------------------------------------------
787 void RBBITableBuilder57::flagLookAheadStates() {
788 if (U_FAILURE(*fStatus)) {
789 return;
790 }
791 UVector lookAheadNodes(*fStatus);
792 RBBINode *lookAheadNode;
793 int32_t i;
794 int32_t n;
795
796 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
797 if (U_FAILURE(*fStatus)) {
798 return;
799 }
800 for (i=0; i<lookAheadNodes.size(); i++) {
801 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
802
803 for (n=0; n<fDStates->size(); n++) {
804 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
805 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
806 sd->fLookAhead = lookAheadNode->fVal;
807 }
808 }
809 }
810 }
811
812
813
814
815 //-----------------------------------------------------------------------------
816 //
817 // flagTaggedStates
818 //
819 //-----------------------------------------------------------------------------
820 void RBBITableBuilder57::flagTaggedStates() {
821 if (U_FAILURE(*fStatus)) {
822 return;
823 }
824 UVector tagNodes(*fStatus);
825 RBBINode *tagNode;
826 int32_t i;
827 int32_t n;
828
829 if (U_FAILURE(*fStatus)) {
830 return;
831 }
832 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
833 if (U_FAILURE(*fStatus)) {
834 return;
835 }
836 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
837 tagNode = (RBBINode *)tagNodes.elementAt(i);
838
839 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
840 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
841 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
842 sortedAdd(&sd->fTagVals, tagNode->fVal);
843 }
844 }
845 }
846 }
847
848
849
850
851 //-----------------------------------------------------------------------------
852 //
853 // mergeRuleStatusVals
854 //
855 // Update the global table of rule status {tag} values
856 // The rule builder has a global vector of status values that are common
857 // for all tables. Merge the ones from this table into the global set.
858 //
859 //-----------------------------------------------------------------------------
860 void RBBITableBuilder57::mergeRuleStatusVals() {
861 //
862 // The basic outline of what happens here is this...
863 //
864 // for each state in this state table
865 // if the status tag list for this state is in the global statuses list
866 // record where and
867 // continue with the next state
868 // else
869 // add the tag list for this state to the global list.
870 //
871 int i;
872 int n;
873
874 // Pre-set a single tag of {0} into the table.
875 // We will need this as a default, for rule sets with no explicit tagging.
876 if (fRB->fRuleStatusVals->size() == 0) {
877 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
878 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
879 }
880
881 // For each state
882 for (n=0; n<fDStates->size(); n++) {
883 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
884 UVector *thisStatesTagValues = sd->fTagVals;
885 if (thisStatesTagValues == NULL) {
886 // No tag values are explicitly associated with this state.
887 // Set the default tag value.
888 sd->fTagsIdx = 0;
889 continue;
890 }
891
892 // There are tag(s) associated with this state.
893 // fTagsIdx will be the index into the global tag list for this state's tag values.
894 // Initial value of -1 flags that we haven't got it set yet.
895 sd->fTagsIdx = -1;
896 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
897 int32_t nextTagGroupStart = 0;
898
899 // Loop runs once per group of tags in the global list
900 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
901 thisTagGroupStart = nextTagGroupStart;
902 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
903 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
904 // The number of tags for this state is different from
905 // the number of tags in this group from the global list.
906 // Continue with the next group from the global list.
907 continue;
908 }
909 // The lengths match, go ahead and compare the actual tag values
910 // between this state and the group from the global list.
911 for (i=0; i<thisStatesTagValues->size(); i++) {
912 if (thisStatesTagValues->elementAti(i) !=
913 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
914 // Mismatch.
915 break;
916 }
917 }
918
919 if (i == thisStatesTagValues->size()) {
920 // We found a set of tag values in the global list that match
921 // those for this state. Use them.
922 sd->fTagsIdx = thisTagGroupStart;
923 break;
924 }
925 }
926
927 if (sd->fTagsIdx == -1) {
928 // No suitable entry in the global tag list already. Add one
929 sd->fTagsIdx = fRB->fRuleStatusVals->size();
930 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
931 for (i=0; i<thisStatesTagValues->size(); i++) {
932 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
933 }
934 }
935 }
936 }
937
938
939
940
941
942
943
944 //-----------------------------------------------------------------------------
945 //
946 // sortedAdd Add a value to a vector of sorted values (ints).
947 // Do not replicate entries; if the value is already there, do not
948 // add a second one.
949 // Lazily create the vector if it does not already exist.
950 //
951 //-----------------------------------------------------------------------------
952 void RBBITableBuilder57::sortedAdd(UVector **vector, int32_t val) {
953 int32_t i;
954
955 if (*vector == NULL) {
956 *vector = new UVector(*fStatus);
957 }
958 if (*vector == NULL || U_FAILURE(*fStatus)) {
959 return;
960 }
961 UVector *vec = *vector;
962 int32_t vSize = vec->size();
963 for (i=0; i<vSize; i++) {
964 int32_t valAtI = vec->elementAti(i);
965 if (valAtI == val) {
966 // The value is already in the vector. Don't add it again.
967 return;
968 }
969 if (valAtI > val) {
970 break;
971 }
972 }
973 vec->insertElementAt(val, i, *fStatus);
974 }
975
976
977
978 //-----------------------------------------------------------------------------
979 //
980 // setAdd Set operation on UVector
981 // dest = dest union source
982 // Elements may only appear once and must be sorted.
983 //
984 //-----------------------------------------------------------------------------
985 void RBBITableBuilder57::setAdd(UVector *dest, UVector *source) {
986 int32_t destOriginalSize = dest->size();
987 int32_t sourceSize = source->size();
988 int32_t di = 0;
989 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
990 void **destPtr, **sourcePtr;
991 void **destLim, **sourceLim;
992
993 if (destOriginalSize > destArray.getCapacity()) {
994 if (destArray.resize(destOriginalSize) == NULL) {
995 return;
996 }
997 }
998 destPtr = destArray.getAlias();
999 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1000
1001 if (sourceSize > sourceArray.getCapacity()) {
1002 if (sourceArray.resize(sourceSize) == NULL) {
1003 return;
1004 }
1005 }
1006 sourcePtr = sourceArray.getAlias();
1007 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1008
1009 // Avoid multiple "get element" calls by getting the contents into arrays
1010 (void) dest->toArray(destPtr);
1011 (void) source->toArray(sourcePtr);
1012
1013 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1014
1015 while (sourcePtr < sourceLim && destPtr < destLim) {
1016 if (*destPtr == *sourcePtr) {
1017 dest->setElementAt(*sourcePtr++, di++);
1018 destPtr++;
1019 }
1020 // This check is required for machines with segmented memory, like i5/OS.
1021 // Direct pointer comparison is not recommended.
1022 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1023 dest->setElementAt(*destPtr++, di++);
1024 }
1025 else { /* *sourcePtr < *destPtr */
1026 dest->setElementAt(*sourcePtr++, di++);
1027 }
1028 }
1029
1030 // At most one of these two cleanup loops will execute
1031 while (destPtr < destLim) {
1032 dest->setElementAt(*destPtr++, di++);
1033 }
1034 while (sourcePtr < sourceLim) {
1035 dest->setElementAt(*sourcePtr++, di++);
1036 }
1037
1038 dest->setSize(di, *fStatus);
1039 }
1040
1041
1042
1043 //-----------------------------------------------------------------------------
1044 //
1045 // setEqual Set operation on UVector.
1046 // Compare for equality.
1047 // Elements must be sorted.
1048 //
1049 //-----------------------------------------------------------------------------
1050 UBool RBBITableBuilder57::setEquals(UVector *a, UVector *b) {
1051 return a->equals(*b);
1052 }
1053
1054
//-----------------------------------------------------------------------------
//
//   printPosSets   Debug function.  Print the node itself followed by its
//                  Nullable flag and its firstpos, lastpos and followpos sets,
//                  then recurse into the left and the right child.
//                  A NULL node prints nothing.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder57::printPosSets(RBBINode *n) {
    if (n != NULL) {
        printf("\n");
        RBBINode::printNodeHeader();
        n->printNode();

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf(" Nullable: %s\n", nullableText);

        RBBIDebugPrintf(" firstpos: ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf(" lastpos: ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf(" followpos: ");
        printSet(n->fFollowPos);

        // Left subtree first, then right subtree.
        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
}
#endif
1084
1085
1086
1087 //-----------------------------------------------------------------------------
1088 //
1089 // getTableSize() Calculate the size of the runtime form of this
1090 // state transition table.
1091 //
1092 //-----------------------------------------------------------------------------
1093 int32_t RBBITableBuilder57::getTableSize() const {
1094 int32_t size = 0;
1095 int32_t numRows;
1096 int32_t numCols;
1097 int32_t rowSize;
1098
1099 if (fTree == NULL) {
1100 return 0;
1101 }
1102
1103 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
1104
1105 numRows = fDStates->size();
1106 numCols = fRB->fSetBuilder->getNumCharCategories();
1107
1108 // Note The declaration of RBBIStateTableRow is for a table of two columns.
1109 // Therefore we subtract two from numCols when determining
1110 // how much storage to add to a row for the total columns.
1111 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1112 size += numRows * rowSize;
1113 return size;
1114 }
1115
1116
1117
1118 //-----------------------------------------------------------------------------
1119 //
1120 // exportTable() export the state transition table in the format required
1121 // by the runtime engine. getTableSize() bytes of memory
1122 // must be available at the output address "where".
1123 //
1124 //-----------------------------------------------------------------------------
1125 void RBBITableBuilder57::exportTable(void *where) {
1126 RBBIStateTable *table = (RBBIStateTable *)where;
1127 uint32_t state;
1128 int col;
1129
1130 if (U_FAILURE(*fStatus) || fTree == NULL) {
1131 return;
1132 }
1133
1134 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1135 fDStates->size() > 0x7fff) {
1136 *fStatus = U_BRK_INTERNAL_ERROR;
1137 return;
1138 }
1139
1140 table->fRowLen = sizeof(RBBIStateTableRow) +
1141 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1142 table->fNumStates = fDStates->size();
1143 table->fFlags = 0;
1144 if (fRB->fLookAheadHardBreak) {
1145 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1146 }
1147 if (fRB->fSetBuilder->sawBOF()) {
1148 table->fFlags |= RBBI_BOF_REQUIRED;
1149 }
1150 table->fReserved = 0;
1151
1152 for (state=0; state<table->fNumStates; state++) {
1153 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1154 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1155 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1156 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1157 row->fAccepting = (int16_t)sd->fAccepting;
1158 row->fLookAhead = (int16_t)sd->fLookAhead;
1159 row->fTagIdx = (int16_t)sd->fTagsIdx;
1160 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1161 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1162 }
1163 }
1164 }
1165
1166
1167
//-----------------------------------------------------------------------------
//
//   printSet    Debug function.  Print the serial number of every RBBINode
//               held in the UVector on one line; a NULL element prints as -1.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder57::printSet(UVector *s) {
    const int32_t count = s->size();
    int32_t idx = 0;
    while (idx < count) {
        const RBBINode *node = static_cast<const RBBINode *>(s->elementAt(idx));
        RBBIDebugPrintf("%5d", node==NULL? -1 : node->fSerialNum);
        ++idx;
    }
    RBBIDebugPrintf("\n");
}
#endif
1183
1184
//-----------------------------------------------------------------------------
//
//   printStates    Debug Function.  Dump the fully constructed state transition
//                  table: a two-line heading, then one line per state showing
//                  its accepting value, look-ahead value, tags index and the
//                  next state for each character category.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder57::printStates() {
    // Heading: column titles, one column per input category.
    RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
    RBBIDebugPrintf(" | Acc LA Tag");
    for (int cat = 0; cat < fRB->fSetBuilder->getNumCharCategories(); cat++) {
        RBBIDebugPrintf(" %2d", cat);
    }
    RBBIDebugPrintf("\n");

    // Heading: separator line.
    RBBIDebugPrintf(" |---------------");
    for (int cat = 0; cat < fRB->fSetBuilder->getNumCharCategories(); cat++) {
        RBBIDebugPrintf("---");
    }
    RBBIDebugPrintf("\n");

    // Body: one line per state.
    for (int stateNum = 0; stateNum < fDStates->size(); stateNum++) {
        RBBIStateDescriptor *desc =
                static_cast<RBBIStateDescriptor *>(fDStates->elementAt(stateNum));
        RBBIDebugPrintf(" %3d | " , stateNum);
        RBBIDebugPrintf("%3d %3d %5d ", desc->fAccepting, desc->fLookAhead, desc->fTagsIdx);
        for (int cat = 0; cat < fRB->fSetBuilder->getNumCharCategories(); cat++) {
            RBBIDebugPrintf(" %2d", desc->fDtran->elementAti(cat));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
}
#endif
1219
1220
1221
//-----------------------------------------------------------------------------
//
//   printRuleStatusTable    Debug Function.  Dump the common rule status table.
//                           The table is walked as a sequence of records; the
//                           first value of a record is the number of values
//                           that follow it.  Each output line shows the index
//                           of a record and then the values that follow its
//                           count.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder57::printRuleStatusTable() {
    UVector *statusVals = fRB->fRuleStatusVals;

    RBBIDebugPrintf("index | tags \n");
    RBBIDebugPrintf("-------------------\n");

    int32_t recordStart = 0;
    for (int32_t nextStart = 0; nextStart < statusVals->size(); ) {
        recordStart = nextStart;
        // Skip over the count itself plus the values it announces.
        nextStart = recordStart + statusVals->elementAti(recordStart) + 1;

        RBBIDebugPrintf("%4d ", recordStart);
        for (int pos = recordStart + 1; pos < nextStart; pos++) {
            RBBIDebugPrintf(" %5d", statusVals->elementAti(pos));
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n\n");
}
#endif
1249
1250
1251 //-----------------------------------------------------------------------------
1252 //
1253 // RBBIStateDescriptor - in standard rbbitblb.cpp
1254 //
1255 //-----------------------------------------------------------------------------
1256
1257 U_NAMESPACE_END
1258
1259 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */