]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbitblb.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbisetb.h"
22 #include "rbbidata.h"
23 #include "cstring.h"
24 #include "uassert.h"
25 #include "uvectr32.h"
26 #include "cmemory.h"
27
28 U_NAMESPACE_BEGIN
29
30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
31 fRB(rb),
32 fTree(*rootNode),
33 fStatus(&status),
34 fDStates(nullptr),
35 fSafeTable(nullptr) {
36 if (U_FAILURE(status)) {
37 return;
38 }
39 // fDStates is UVector<RBBIStateDescriptor *>
40 fDStates = new UVector(status);
41 if (U_SUCCESS(status) && fDStates == nullptr ) {
42 status = U_MEMORY_ALLOCATION_ERROR;
43 }
44 }
45
46
47
48 RBBITableBuilder::~RBBITableBuilder() {
49 int i;
50 for (i=0; i<fDStates->size(); i++) {
51 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
52 }
53 delete fDStates;
54 delete fSafeTable;
55 }
56
57
58 //-----------------------------------------------------------------------------
59 //
60 // RBBITableBuilder::buildForwardTable - This is the main function for building
61 // the DFA state transition table from the RBBI rules parse tree.
62 //
63 //-----------------------------------------------------------------------------
64 void RBBITableBuilder::buildForwardTable() {
65
66 if (U_FAILURE(*fStatus)) {
67 return;
68 }
69
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
72 if (fTree==NULL) {
73 return;
74 }
75
76 //
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
79 //
80 fTree = fTree->flattenVariables();
81 #ifdef RBBI_DEBUG
82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83 RBBIDebugPuts("\nParse tree after flattening variable references.");
84 RBBINode::printTree(fTree, TRUE);
85 }
86 #endif
87
88 //
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
93 //
94 if (fRB->fSetBuilder->sawBOF()) {
95 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
97 // Delete and exit if memory allocation failed.
98 if (bofTop == NULL || bofLeaf == NULL) {
99 *fStatus = U_MEMORY_ALLOCATION_ERROR;
100 delete bofTop;
101 delete bofLeaf;
102 return;
103 }
104 bofTop->fLeftChild = bofLeaf;
105 bofTop->fRightChild = fTree;
106 bofLeaf->fParent = bofTop;
107 bofLeaf->fVal = 2; // Reserved value for {bof}.
108 fTree = bofTop;
109 }
110
111 //
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
115 //
116 RBBINode *cn = new RBBINode(RBBINode::opCat);
117 // Exit if memory allocation failed.
118 if (cn == NULL) {
119 *fStatus = U_MEMORY_ALLOCATION_ERROR;
120 return;
121 }
122 cn->fLeftChild = fTree;
123 fTree->fParent = cn;
124 cn->fRightChild = new RBBINode(RBBINode::endMark);
125 // Delete and exit if memory allocation failed.
126 if (cn->fRightChild == NULL) {
127 *fStatus = U_MEMORY_ALLOCATION_ERROR;
128 delete cn;
129 return;
130 }
131 cn->fRightChild->fParent = cn;
132 fTree = cn;
133
134 //
135 // Replace all references to UnicodeSets with the tree for the equivalent
136 // expression.
137 //
138 fTree->flattenSets();
139 #ifdef RBBI_DEBUG
140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142 RBBINode::printTree(fTree, TRUE);
143 }
144 #endif
145
146
147 //
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
153 //
154 calcNullable(fTree);
155 calcFirstPos(fTree);
156 calcLastPos(fTree);
157 calcFollowPos(fTree);
158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159 RBBIDebugPuts("\n");
160 printPosSets(fTree);
161 }
162
163 //
164 // For "chained" rules, modify the followPos sets
165 //
166 if (fRB->fChainRules) {
167 calcChainedFollowPos(fTree);
168 }
169
170 //
171 // BOF (start of input) test fixup.
172 //
173 if (fRB->fSetBuilder->sawBOF()) {
174 bofFixup();
175 }
176
177 //
178 // Build the DFA state transition tables.
179 //
180 buildStateTable();
181 flagAcceptingStates();
182 flagLookAheadStates();
183 flagTaggedStates();
184
185 //
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
189 //
190 mergeRuleStatusVals();
191 }
192
193
194
195 //-----------------------------------------------------------------------------
196 //
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198 //
199 //-----------------------------------------------------------------------------
200 void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237 }
238
239
240
241
242 //-----------------------------------------------------------------------------
243 //
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245 //
246 //-----------------------------------------------------------------------------
247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284 }
285
286
287
288 //-----------------------------------------------------------------------------
289 //
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291 //
292 //-----------------------------------------------------------------------------
293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330 }
331
332
333
334 //-----------------------------------------------------------------------------
335 //
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337 //
338 //-----------------------------------------------------------------------------
339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376 }
377
378 //-----------------------------------------------------------------------------
379 //
380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381 // as roots of a rule to a destination vector.
382 //
383 //-----------------------------------------------------------------------------
384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385 if (node == NULL || U_FAILURE(*fStatus)) {
386 return;
387 }
388 if (node->fRuleRoot) {
389 dest->addElement(node, *fStatus);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
392 return;
393 }
394 addRuleRootNodes(dest, node->fLeftChild);
395 addRuleRootNodes(dest, node->fRightChild);
396 }
397
398 //-----------------------------------------------------------------------------
399 //
400 // calcChainedFollowPos. Modify the previously calculated followPos sets
401 // to implement rule chaining. NOT described by Aho
402 //
403 //-----------------------------------------------------------------------------
404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405
406 UVector endMarkerNodes(*fStatus);
407 UVector leafNodes(*fStatus);
408 int32_t i;
409
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list of all endmarker nodes.
415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416
417 // get a list all leaf nodes
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
429
430 UVector matchStartNodes(*fStatus);
431 for (int i=0; i<ruleRootNodes.size(); ++i) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
440
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 RBBINode *endNode = NULL;
447
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i=0; i<endMarkerNodes.size(); i++) {
451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452 endNode = tNode;
453 break;
454 }
455 }
456 if (endNode == NULL) {
457 // node wasn't an end node. Try again with the next.
458 continue;
459 }
460
461 // We've got a node that can end a match.
462
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
467 if (fRB->fLBCMNoChain) {
468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
469 if (c != -1) {
470 // c == -1 occurs with sets containing only the {eof} marker string.
471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472 if (cLBProp == U_LB_COMBINING_MARK) {
473 continue;
474 }
475 }
476 }
477
478
479 // Now iterate over the nodes that can start a match, looking for ones
480 // with the same char class as our ending node.
481 RBBINode *startNode;
482 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
483 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
484 if (startNode->fType != RBBINode::leafChar) {
485 continue;
486 }
487
488 if (endNode->fVal == startNode->fVal) {
489 // The end val (character class) of one possible match is the
490 // same as the start of another.
491
492 // Add all nodes from the followPos of the start node to the
493 // followPos set of the end node, which will have the effect of
494 // letting matches transition from a match state at endNode
495 // to the second char of a match starting with startNode.
496 setAdd(endNode->fFollowPos, startNode->fFollowPos);
497 }
498 }
499 }
500 }
501
502
503 //-----------------------------------------------------------------------------
504 //
505 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
506 // Do an swizzle similar to chaining, modifying the followPos set of
507 // the bofNode to include the followPos nodes from other {bot} nodes
508 // scattered through the tree.
509 //
510 // This function has much in common with calcChainedFollowPos().
511 //
512 //-----------------------------------------------------------------------------
513 void RBBITableBuilder::bofFixup() {
514
515 if (U_FAILURE(*fStatus)) {
516 return;
517 }
518
519 // The parse tree looks like this ...
520 // fTree root ---> <cat>
521 // / \ .
522 // <cat> <#end node>
523 // / \ .
524 // <bofNode> rest
525 // of tree
526 //
527 // We will be adding things to the followPos set of the <bofNode>
528 //
529 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
530 U_ASSERT(bofNode->fType == RBBINode::leafChar);
531 U_ASSERT(bofNode->fVal == 2);
532
533 // Get all nodes that can be the start a match of the user-written rules
534 // (excluding the fake bofNode)
535 // We want the nodes that can start a match in the
536 // part labeled "rest of tree"
537 //
538 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539
540 RBBINode *startNode;
541 int startNodeIx;
542 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544 if (startNode->fType != RBBINode::leafChar) {
545 continue;
546 }
547
548 if (startNode->fVal == bofNode->fVal) {
549 // We found a leaf node corresponding to a {bof} that was
550 // explicitly written into a rule.
551 // Add everything from the followPos set of this node to the
552 // followPos set of the fake bofNode at the start of the tree.
553 //
554 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555 }
556 }
557 }
558
559 //-----------------------------------------------------------------------------
560 //
561 // buildStateTable() Determine the set of runtime DFA states and the
562 // transition tables for these states, by the algorithm
563 // of fig. 3.44 in Aho.
564 //
565 // Most of the comments are quotes of Aho's psuedo-code.
566 //
567 //-----------------------------------------------------------------------------
568 void RBBITableBuilder::buildStateTable() {
569 if (U_FAILURE(*fStatus)) {
570 return;
571 }
572 RBBIStateDescriptor *failState;
573 // Set it to NULL to avoid uninitialized warning
574 RBBIStateDescriptor *initialState = NULL;
575 //
576 // Add a dummy state 0 - the stop state. Not from Aho.
577 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
578 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579 if (failState == NULL) {
580 *fStatus = U_MEMORY_ALLOCATION_ERROR;
581 goto ExitBuildSTdeleteall;
582 }
583 failState->fPositions = new UVector(*fStatus);
584 if (failState->fPositions == NULL) {
585 *fStatus = U_MEMORY_ALLOCATION_ERROR;
586 }
587 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588 goto ExitBuildSTdeleteall;
589 }
590 fDStates->addElement(failState, *fStatus);
591 if (U_FAILURE(*fStatus)) {
592 goto ExitBuildSTdeleteall;
593 }
594
595 // initially, the only unmarked state in Dstates is firstpos(root),
596 // where toot is the root of the syntax tree for (r)#;
597 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598 if (initialState == NULL) {
599 *fStatus = U_MEMORY_ALLOCATION_ERROR;
600 }
601 if (U_FAILURE(*fStatus)) {
602 goto ExitBuildSTdeleteall;
603 }
604 initialState->fPositions = new UVector(*fStatus);
605 if (initialState->fPositions == NULL) {
606 *fStatus = U_MEMORY_ALLOCATION_ERROR;
607 }
608 if (U_FAILURE(*fStatus)) {
609 goto ExitBuildSTdeleteall;
610 }
611 setAdd(initialState->fPositions, fTree->fFirstPosSet);
612 fDStates->addElement(initialState, *fStatus);
613 if (U_FAILURE(*fStatus)) {
614 goto ExitBuildSTdeleteall;
615 }
616
617 // while there is an unmarked state T in Dstates do begin
618 for (;;) {
619 RBBIStateDescriptor *T = NULL;
620 int32_t tx;
621 for (tx=1; tx<fDStates->size(); tx++) {
622 RBBIStateDescriptor *temp;
623 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624 if (temp->fMarked == FALSE) {
625 T = temp;
626 break;
627 }
628 }
629 if (T == NULL) {
630 break;
631 }
632
633 // mark T;
634 T->fMarked = TRUE;
635
636 // for each input symbol a do begin
637 int32_t a;
638 for (a = 1; a<=lastInputSymbol; a++) {
639 // let U be the set of positions that are in followpos(p)
640 // for some position p in T
641 // such that the symbol at position p is a;
642 UVector *U = NULL;
643 RBBINode *p;
644 int32_t px;
645 for (px=0; px<T->fPositions->size(); px++) {
646 p = (RBBINode *)T->fPositions->elementAt(px);
647 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
648 if (U == NULL) {
649 U = new UVector(*fStatus);
650 if (U == NULL) {
651 *fStatus = U_MEMORY_ALLOCATION_ERROR;
652 goto ExitBuildSTdeleteall;
653 }
654 }
655 setAdd(U, p->fFollowPos);
656 }
657 }
658
659 // if U is not empty and not in DStates then
660 int32_t ux = 0;
661 UBool UinDstates = FALSE;
662 if (U != NULL) {
663 U_ASSERT(U->size() > 0);
664 int ix;
665 for (ix=0; ix<fDStates->size(); ix++) {
666 RBBIStateDescriptor *temp2;
667 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668 if (setEquals(U, temp2->fPositions)) {
669 delete U;
670 U = temp2->fPositions;
671 ux = ix;
672 UinDstates = TRUE;
673 break;
674 }
675 }
676
677 // Add U as an unmarked state to Dstates
678 if (!UinDstates)
679 {
680 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
681 if (newState == NULL) {
682 *fStatus = U_MEMORY_ALLOCATION_ERROR;
683 }
684 if (U_FAILURE(*fStatus)) {
685 goto ExitBuildSTdeleteall;
686 }
687 newState->fPositions = U;
688 fDStates->addElement(newState, *fStatus);
689 if (U_FAILURE(*fStatus)) {
690 return;
691 }
692 ux = fDStates->size()-1;
693 }
694
695 // Dtran[T, a] := U;
696 T->fDtran->setElementAt(ux, a);
697 }
698 }
699 }
700 return;
701 // delete local pointers only if error occured.
702 ExitBuildSTdeleteall:
703 delete initialState;
704 delete failState;
705 }
706
707
708
709 //-----------------------------------------------------------------------------
710 //
711 // flagAcceptingStates Identify accepting states.
712 // First get a list of all of the end marker nodes.
713 // Then, for each state s,
714 // if s contains one of the end marker nodes in its list of tree positions then
715 // s is an accepting state.
716 //
717 //-----------------------------------------------------------------------------
718 void RBBITableBuilder::flagAcceptingStates() {
719 if (U_FAILURE(*fStatus)) {
720 return;
721 }
722 UVector endMarkerNodes(*fStatus);
723 RBBINode *endMarker;
724 int32_t i;
725 int32_t n;
726
727 if (U_FAILURE(*fStatus)) {
728 return;
729 }
730
731 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
732 if (U_FAILURE(*fStatus)) {
733 return;
734 }
735
736 for (i=0; i<endMarkerNodes.size(); i++) {
737 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738 for (n=0; n<fDStates->size(); n++) {
739 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740 if (sd->fPositions->indexOf(endMarker) >= 0) {
741 // Any non-zero value for fAccepting means this is an accepting node.
742 // The value is what will be returned to the user as the break status.
743 // If no other value was specified, force it to -1.
744
745 if (sd->fAccepting==0) {
746 // State hasn't been marked as accepting yet. Do it now.
747 sd->fAccepting = endMarker->fVal;
748 if (sd->fAccepting == 0) {
749 sd->fAccepting = -1;
750 }
751 }
752 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753 // Both lookahead and non-lookahead accepting for this state.
754 // Favor the look-ahead. Expedient for line break.
755 // TODO: need a more elegant resolution for conflicting rules.
756 sd->fAccepting = endMarker->fVal;
757 }
758 // implicit else:
759 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
760
761 // If the end marker node is from a look-ahead rule, set
762 // the fLookAhead field for this state also.
763 if (endMarker->fLookAheadEnd) {
764 // TODO: don't change value if already set?
765 // TODO: allow for more than one active look-ahead rule in engine.
766 // Make value here an index to a side array in engine?
767 sd->fLookAhead = sd->fAccepting;
768 }
769 }
770 }
771 }
772 }
773
774
775 //-----------------------------------------------------------------------------
776 //
777 // flagLookAheadStates Very similar to flagAcceptingStates, above.
778 //
779 //-----------------------------------------------------------------------------
780 void RBBITableBuilder::flagLookAheadStates() {
781 if (U_FAILURE(*fStatus)) {
782 return;
783 }
784 UVector lookAheadNodes(*fStatus);
785 RBBINode *lookAheadNode;
786 int32_t i;
787 int32_t n;
788
789 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
790 if (U_FAILURE(*fStatus)) {
791 return;
792 }
793 for (i=0; i<lookAheadNodes.size(); i++) {
794 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795
796 for (n=0; n<fDStates->size(); n++) {
797 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799 sd->fLookAhead = lookAheadNode->fVal;
800 }
801 }
802 }
803 }
804
805
806
807
808 //-----------------------------------------------------------------------------
809 //
810 // flagTaggedStates
811 //
812 //-----------------------------------------------------------------------------
813 void RBBITableBuilder::flagTaggedStates() {
814 if (U_FAILURE(*fStatus)) {
815 return;
816 }
817 UVector tagNodes(*fStatus);
818 RBBINode *tagNode;
819 int32_t i;
820 int32_t n;
821
822 if (U_FAILURE(*fStatus)) {
823 return;
824 }
825 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
826 if (U_FAILURE(*fStatus)) {
827 return;
828 }
829 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
830 tagNode = (RBBINode *)tagNodes.elementAt(i);
831
832 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
833 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
835 sortedAdd(&sd->fTagVals, tagNode->fVal);
836 }
837 }
838 }
839 }
840
841
842
843
844 //-----------------------------------------------------------------------------
845 //
846 // mergeRuleStatusVals
847 //
848 // Update the global table of rule status {tag} values
849 // The rule builder has a global vector of status values that are common
850 // for all tables. Merge the ones from this table into the global set.
851 //
852 //-----------------------------------------------------------------------------
853 void RBBITableBuilder::mergeRuleStatusVals() {
854 //
855 // The basic outline of what happens here is this...
856 //
857 // for each state in this state table
858 // if the status tag list for this state is in the global statuses list
859 // record where and
860 // continue with the next state
861 // else
862 // add the tag list for this state to the global list.
863 //
864 int i;
865 int n;
866
867 // Pre-set a single tag of {0} into the table.
868 // We will need this as a default, for rule sets with no explicit tagging.
869 if (fRB->fRuleStatusVals->size() == 0) {
870 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
871 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
872 }
873
874 // For each state
875 for (n=0; n<fDStates->size(); n++) {
876 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877 UVector *thisStatesTagValues = sd->fTagVals;
878 if (thisStatesTagValues == NULL) {
879 // No tag values are explicitly associated with this state.
880 // Set the default tag value.
881 sd->fTagsIdx = 0;
882 continue;
883 }
884
885 // There are tag(s) associated with this state.
886 // fTagsIdx will be the index into the global tag list for this state's tag values.
887 // Initial value of -1 flags that we haven't got it set yet.
888 sd->fTagsIdx = -1;
889 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
890 int32_t nextTagGroupStart = 0;
891
892 // Loop runs once per group of tags in the global list
893 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894 thisTagGroupStart = nextTagGroupStart;
895 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897 // The number of tags for this state is different from
898 // the number of tags in this group from the global list.
899 // Continue with the next group from the global list.
900 continue;
901 }
902 // The lengths match, go ahead and compare the actual tag values
903 // between this state and the group from the global list.
904 for (i=0; i<thisStatesTagValues->size(); i++) {
905 if (thisStatesTagValues->elementAti(i) !=
906 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
907 // Mismatch.
908 break;
909 }
910 }
911
912 if (i == thisStatesTagValues->size()) {
913 // We found a set of tag values in the global list that match
914 // those for this state. Use them.
915 sd->fTagsIdx = thisTagGroupStart;
916 break;
917 }
918 }
919
920 if (sd->fTagsIdx == -1) {
921 // No suitable entry in the global tag list already. Add one
922 sd->fTagsIdx = fRB->fRuleStatusVals->size();
923 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924 for (i=0; i<thisStatesTagValues->size(); i++) {
925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926 }
927 }
928 }
929 }
930
931
932
933
934
935
936
937 //-----------------------------------------------------------------------------
938 //
939 // sortedAdd Add a value to a vector of sorted values (ints).
940 // Do not replicate entries; if the value is already there, do not
941 // add a second one.
942 // Lazily create the vector if it does not already exist.
943 //
944 //-----------------------------------------------------------------------------
945 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
946 int32_t i;
947
948 if (*vector == NULL) {
949 *vector = new UVector(*fStatus);
950 }
951 if (*vector == NULL || U_FAILURE(*fStatus)) {
952 return;
953 }
954 UVector *vec = *vector;
955 int32_t vSize = vec->size();
956 for (i=0; i<vSize; i++) {
957 int32_t valAtI = vec->elementAti(i);
958 if (valAtI == val) {
959 // The value is already in the vector. Don't add it again.
960 return;
961 }
962 if (valAtI > val) {
963 break;
964 }
965 }
966 vec->insertElementAt(val, i, *fStatus);
967 }
968
969
970
971 //-----------------------------------------------------------------------------
972 //
973 // setAdd Set operation on UVector
974 // dest = dest union source
975 // Elements may only appear once and must be sorted.
976 //
977 //-----------------------------------------------------------------------------
978 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
979 int32_t destOriginalSize = dest->size();
980 int32_t sourceSize = source->size();
981 int32_t di = 0;
982 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
983 void **destPtr, **sourcePtr;
984 void **destLim, **sourceLim;
985
986 if (destOriginalSize > destArray.getCapacity()) {
987 if (destArray.resize(destOriginalSize) == NULL) {
988 return;
989 }
990 }
991 destPtr = destArray.getAlias();
992 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
993
994 if (sourceSize > sourceArray.getCapacity()) {
995 if (sourceArray.resize(sourceSize) == NULL) {
996 return;
997 }
998 }
999 sourcePtr = sourceArray.getAlias();
1000 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1001
1002 // Avoid multiple "get element" calls by getting the contents into arrays
1003 (void) dest->toArray(destPtr);
1004 (void) source->toArray(sourcePtr);
1005
1006 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1007
1008 while (sourcePtr < sourceLim && destPtr < destLim) {
1009 if (*destPtr == *sourcePtr) {
1010 dest->setElementAt(*sourcePtr++, di++);
1011 destPtr++;
1012 }
1013 // This check is required for machines with segmented memory, like i5/OS.
1014 // Direct pointer comparison is not recommended.
1015 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1016 dest->setElementAt(*destPtr++, di++);
1017 }
1018 else { /* *sourcePtr < *destPtr */
1019 dest->setElementAt(*sourcePtr++, di++);
1020 }
1021 }
1022
1023 // At most one of these two cleanup loops will execute
1024 while (destPtr < destLim) {
1025 dest->setElementAt(*destPtr++, di++);
1026 }
1027 while (sourcePtr < sourceLim) {
1028 dest->setElementAt(*sourcePtr++, di++);
1029 }
1030
1031 dest->setSize(di, *fStatus);
1032 }
1033
1034
1035
1036 //-----------------------------------------------------------------------------
1037 //
1038 // setEqual Set operation on UVector.
1039 // Compare for equality.
1040 // Elements must be sorted.
1041 //
1042 //-----------------------------------------------------------------------------
1043 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1044 return a->equals(*b);
1045 }
1046
1047
1048 //-----------------------------------------------------------------------------
1049 //
1050 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1051 // for each node in the tree.
1052 //
1053 //-----------------------------------------------------------------------------
1054 #ifdef RBBI_DEBUG
1055 void RBBITableBuilder::printPosSets(RBBINode *n) {
1056 if (n==NULL) {
1057 return;
1058 }
1059 printf("\n");
1060 RBBINode::printNodeHeader();
1061 RBBINode::printNode(n);
1062 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1063
1064 RBBIDebugPrintf(" firstpos: ");
1065 printSet(n->fFirstPosSet);
1066
1067 RBBIDebugPrintf(" lastpos: ");
1068 printSet(n->fLastPosSet);
1069
1070 RBBIDebugPrintf(" followpos: ");
1071 printSet(n->fFollowPos);
1072
1073 printPosSets(n->fLeftChild);
1074 printPosSets(n->fRightChild);
1075 }
1076 #endif
1077
1078 //
1079 // findDuplCharClassFrom()
1080 //
1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1082 int32_t numStates = fDStates->size();
1083 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1084
1085 uint16_t table_base;
1086 uint16_t table_dupl;
1087 for (; categories->first < numCols-1; categories->first++) {
1088 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1089 for (int32_t state=0; state<numStates; state++) {
1090 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1091 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1092 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1093 if (table_base != table_dupl) {
1094 break;
1095 }
1096 }
1097 if (table_base == table_dupl) {
1098 return true;
1099 }
1100 }
1101 }
1102 return false;
1103 }
1104
1105
1106 //
1107 // removeColumn()
1108 //
1109 void RBBITableBuilder::removeColumn(int32_t column) {
1110 int32_t numStates = fDStates->size();
1111 for (int32_t state=0; state<numStates; state++) {
1112 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1113 U_ASSERT(column < sd->fDtran->size());
1114 sd->fDtran->removeElementAt(column);
1115 }
1116 }
1117
1118 /*
1119 * findDuplicateState
1120 */
1121 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1122 int32_t numStates = fDStates->size();
1123 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1124
1125 for (; states->first<numStates-1; states->first++) {
1126 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1127 for (states->second=states->first+1; states->second<numStates; states->second++) {
1128 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1129 if (firstSD->fAccepting != duplSD->fAccepting ||
1130 firstSD->fLookAhead != duplSD->fLookAhead ||
1131 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1132 continue;
1133 }
1134 bool rowsMatch = true;
1135 for (int32_t col=0; col < numCols; ++col) {
1136 int32_t firstVal = firstSD->fDtran->elementAti(col);
1137 int32_t duplVal = duplSD->fDtran->elementAti(col);
1138 if (!((firstVal == duplVal) ||
1139 ((firstVal == states->first || firstVal == states->second) &&
1140 (duplVal == states->first || duplVal == states->second)))) {
1141 rowsMatch = false;
1142 break;
1143 }
1144 }
1145 if (rowsMatch) {
1146 return true;
1147 }
1148 }
1149 }
1150 return false;
1151 }
1152
1153
1154 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1155 int32_t numStates = fSafeTable->size();
1156
1157 for (; states->first<numStates-1; states->first++) {
1158 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1159 for (states->second=states->first+1; states->second<numStates; states->second++) {
1160 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1161 bool rowsMatch = true;
1162 int32_t numCols = firstRow->length();
1163 for (int32_t col=0; col < numCols; ++col) {
1164 int32_t firstVal = firstRow->charAt(col);
1165 int32_t duplVal = duplRow->charAt(col);
1166 if (!((firstVal == duplVal) ||
1167 ((firstVal == states->first || firstVal == states->second) &&
1168 (duplVal == states->first || duplVal == states->second)))) {
1169 rowsMatch = false;
1170 break;
1171 }
1172 }
1173 if (rowsMatch) {
1174 return true;
1175 }
1176 }
1177 }
1178 return false;
1179 }
1180
1181
1182 void RBBITableBuilder::removeState(IntPair duplStates) {
1183 const int32_t keepState = duplStates.first;
1184 const int32_t duplState = duplStates.second;
1185 U_ASSERT(keepState < duplState);
1186 U_ASSERT(duplState < fDStates->size());
1187
1188 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1189 fDStates->removeElementAt(duplState);
1190 delete duplSD;
1191
1192 int32_t numStates = fDStates->size();
1193 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1194 for (int32_t state=0; state<numStates; ++state) {
1195 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1196 for (int32_t col=0; col<numCols; col++) {
1197 int32_t existingVal = sd->fDtran->elementAti(col);
1198 int32_t newVal = existingVal;
1199 if (existingVal == duplState) {
1200 newVal = keepState;
1201 } else if (existingVal > duplState) {
1202 newVal = existingVal - 1;
1203 }
1204 sd->fDtran->setElementAt(newVal, col);
1205 }
1206 if (sd->fAccepting == duplState) {
1207 sd->fAccepting = keepState;
1208 } else if (sd->fAccepting > duplState) {
1209 sd->fAccepting--;
1210 }
1211 if (sd->fLookAhead == duplState) {
1212 sd->fLookAhead = keepState;
1213 } else if (sd->fLookAhead > duplState) {
1214 sd->fLookAhead--;
1215 }
1216 }
1217 }
1218
1219 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1220 const int32_t keepState = duplStates.first;
1221 const int32_t duplState = duplStates.second;
1222 U_ASSERT(keepState < duplState);
1223 U_ASSERT(duplState < fSafeTable->size());
1224
1225 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1226 // and will auto-delete the removed element.
1227 int32_t numStates = fSafeTable->size();
1228 for (int32_t state=0; state<numStates; ++state) {
1229 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1230 int32_t numCols = sd->length();
1231 for (int32_t col=0; col<numCols; col++) {
1232 int32_t existingVal = sd->charAt(col);
1233 int32_t newVal = existingVal;
1234 if (existingVal == duplState) {
1235 newVal = keepState;
1236 } else if (existingVal > duplState) {
1237 newVal = existingVal - 1;
1238 }
1239 sd->setCharAt(col, newVal);
1240 }
1241 }
1242 }
1243
1244
1245 /*
1246 * RemoveDuplicateStates
1247 */
1248 void RBBITableBuilder::removeDuplicateStates() {
1249 IntPair dupls = {3, 0};
1250 while (findDuplicateState(&dupls)) {
1251 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1252 removeState(dupls);
1253 }
1254 }
1255
1256
1257 //-----------------------------------------------------------------------------
1258 //
1259 // getTableSize() Calculate the size of the runtime form of this
1260 // state transition table.
1261 //
1262 //-----------------------------------------------------------------------------
1263 int32_t RBBITableBuilder::getTableSize() const {
1264 int32_t size = 0;
1265 int32_t numRows;
1266 int32_t numCols;
1267 int32_t rowSize;
1268
1269 if (fTree == NULL) {
1270 return 0;
1271 }
1272
1273 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1274
1275 numRows = fDStates->size();
1276 numCols = fRB->fSetBuilder->getNumCharCategories();
1277
1278 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1279 size += numRows * rowSize;
1280 return size;
1281 }
1282
1283
1284 //-----------------------------------------------------------------------------
1285 //
1286 // exportTable() export the state transition table in the format required
1287 // by the runtime engine. getTableSize() bytes of memory
1288 // must be available at the output address "where".
1289 //
1290 //-----------------------------------------------------------------------------
1291 void RBBITableBuilder::exportTable(void *where) {
1292 RBBIStateTable *table = (RBBIStateTable *)where;
1293 uint32_t state;
1294 int col;
1295
1296 if (U_FAILURE(*fStatus) || fTree == NULL) {
1297 return;
1298 }
1299
1300 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1301 if (catCount > 0x7fff ||
1302 fDStates->size() > 0x7fff) {
1303 *fStatus = U_BRK_INTERNAL_ERROR;
1304 return;
1305 }
1306
1307 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1308 table->fNumStates = fDStates->size();
1309 table->fFlags = 0;
1310 if (fRB->fLookAheadHardBreak) {
1311 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1312 }
1313 if (fRB->fSetBuilder->sawBOF()) {
1314 table->fFlags |= RBBI_BOF_REQUIRED;
1315 }
1316 table->fReserved = 0;
1317
1318 for (state=0; state<table->fNumStates; state++) {
1319 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1320 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1321 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1322 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1323 row->fAccepting = (int16_t)sd->fAccepting;
1324 row->fLookAhead = (int16_t)sd->fLookAhead;
1325 row->fTagIdx = (int16_t)sd->fTagsIdx;
1326 for (col=0; col<catCount; col++) {
1327 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1328 }
1329 }
1330 }
1331
1332
1333 /**
1334 * Synthesize a safe state table from the main state table.
1335 */
1336 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1337 // The safe table creation has three steps:
1338
1339 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1340 // following the pair do not depend on context or state before the pair. To test
1341 // whether a pair is safe, run it through the main forward state table, starting
1342 // from each state. If the the final state is the same, no matter what the starting state,
1343 // the pair is safe.
1344 //
1345 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1346 // forward table, with a column for each input character [class], and a row for
1347 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1348 // create an additional state for each input character category; being in
1349 // one of these states means that the character has been seen, and is potentially
1350 // the first of a pair. In each of these rows, the entry for the second character
1351 // of a safe pair is set to the stop state (0), indicating that a match was found.
1352 // All other table entries are set to the state corresponding the current input
1353 // character, allowing that charcter to be the of a start following pair.
1354 //
1355 // Because the safe rules are to be run in reverse, moving backwards in the text,
1356 // the first and second pair categories are swapped when building the table.
1357 //
1358 // 3. Compress the table. There are typically many rows (states) that are
1359 // equivalent - that have zeroes (match completed) in the same columns -
1360 // and can be folded together.
1361
1362 // Each safe pair is stored as two UChars in the safePair string.
1363 UnicodeString safePairs;
1364
1365 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1366 int32_t numStates = fDStates->size();
1367
1368 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1369 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1370 int32_t wantedEndState = -1;
1371 int32_t endState = 0;
1372 for (int32_t startState = 1; startState < numStates; ++startState) {
1373 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1374 int32_t s2 = startStateD->fDtran->elementAti(c1);
1375 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1376 endState = s2StateD->fDtran->elementAti(c2);
1377 if (wantedEndState < 0) {
1378 wantedEndState = endState;
1379 } else {
1380 if (wantedEndState != endState) {
1381 break;
1382 }
1383 }
1384 }
1385 if (wantedEndState == endState) {
1386 safePairs.append((char16_t)c1);
1387 safePairs.append((char16_t)c2);
1388 // printf("(%d, %d) ", c1, c2);
1389 }
1390 }
1391 // printf("\n");
1392 }
1393
1394 // Populate the initial safe table.
1395 // The table as a whole is UVector<UnicodeString>
1396 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1397 // Row 0 is the stop state.
1398 // Row 1 is the start sate.
1399 // Row 2 and beyond are other states, initially one per char class, but
1400 // after initial construction, many of the states will be combined, compacting the table.
1401 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1402 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1403
1404 U_ASSERT(fSafeTable == nullptr);
1405 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1406 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1407 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1408 }
1409
1410 // From the start state, each input char class transitions to the state for that input.
1411 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1412 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1413 // Note: +2 for the start & stop state.
1414 startState.setCharAt(charClass, charClass+2);
1415 }
1416
1417 // Initially make every other state table row look like the start state row,
1418 for (int32_t row=2; row<numCharClasses+2; ++row) {
1419 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1420 rowState = startState; // UnicodeString assignment, copies contents.
1421 }
1422
1423 // Run through the safe pairs, set the next state to zero when pair has been seen.
1424 // Zero being the stop state, meaning we found a safe point.
1425 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1426 int32_t c1 = safePairs.charAt(pairIdx);
1427 int32_t c2 = safePairs.charAt(pairIdx + 1);
1428
1429 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1430 rowState.setCharAt(c1, 0);
1431 }
1432
1433 // Remove duplicate or redundant rows from the table.
1434 IntPair states = {1, 0};
1435 while (findDuplicateSafeState(&states)) {
1436 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1437 removeSafeState(states);
1438 }
1439 }
1440
1441
1442 //-----------------------------------------------------------------------------
1443 //
1444 // getSafeTableSize() Calculate the size of the runtime form of this
1445 // safe state table.
1446 //
1447 //-----------------------------------------------------------------------------
1448 int32_t RBBITableBuilder::getSafeTableSize() const {
1449 int32_t size = 0;
1450 int32_t numRows;
1451 int32_t numCols;
1452 int32_t rowSize;
1453
1454 if (fSafeTable == nullptr) {
1455 return 0;
1456 }
1457
1458 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1459
1460 numRows = fSafeTable->size();
1461 numCols = fRB->fSetBuilder->getNumCharCategories();
1462
1463 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1464 size += numRows * rowSize;
1465 return size;
1466 }
1467
1468
1469 //-----------------------------------------------------------------------------
1470 //
1471 // exportSafeTable() export the state transition table in the format required
1472 // by the runtime engine. getTableSize() bytes of memory
1473 // must be available at the output address "where".
1474 //
1475 //-----------------------------------------------------------------------------
1476 void RBBITableBuilder::exportSafeTable(void *where) {
1477 RBBIStateTable *table = (RBBIStateTable *)where;
1478 uint32_t state;
1479 int col;
1480
1481 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1482 return;
1483 }
1484
1485 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1486 if (catCount > 0x7fff ||
1487 fSafeTable->size() > 0x7fff) {
1488 *fStatus = U_BRK_INTERNAL_ERROR;
1489 return;
1490 }
1491
1492 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1493 table->fNumStates = fSafeTable->size();
1494 table->fFlags = 0;
1495 table->fReserved = 0;
1496
1497 for (state=0; state<table->fNumStates; state++) {
1498 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1499 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1500 row->fAccepting = 0;
1501 row->fLookAhead = 0;
1502 row->fTagIdx = 0;
1503 row->fReserved = 0;
1504 for (col=0; col<catCount; col++) {
1505 row->fNextState[col] = rowString->charAt(col);
1506 }
1507 }
1508 }
1509
1510
1511
1512
1513 //-----------------------------------------------------------------------------
1514 //
1515 // printSet Debug function. Print the contents of a UVector
1516 //
1517 //-----------------------------------------------------------------------------
1518 #ifdef RBBI_DEBUG
1519 void RBBITableBuilder::printSet(UVector *s) {
1520 int32_t i;
1521 for (i=0; i<s->size(); i++) {
1522 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1523 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1524 }
1525 RBBIDebugPrintf("\n");
1526 }
1527 #endif
1528
1529
1530 //-----------------------------------------------------------------------------
1531 //
1532 // printStates Debug Function. Dump the fully constructed state transition table.
1533 //
1534 //-----------------------------------------------------------------------------
1535 #ifdef RBBI_DEBUG
1536 void RBBITableBuilder::printStates() {
1537 int c; // input "character"
1538 int n; // state number
1539
1540 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1541 RBBIDebugPrintf(" | Acc LA Tag");
1542 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1543 RBBIDebugPrintf(" %2d", c);
1544 }
1545 RBBIDebugPrintf("\n");
1546 RBBIDebugPrintf(" |---------------");
1547 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548 RBBIDebugPrintf("---");
1549 }
1550 RBBIDebugPrintf("\n");
1551
1552 for (n=0; n<fDStates->size(); n++) {
1553 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1554 RBBIDebugPrintf(" %3d | " , n);
1555 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1556 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1557 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1558 }
1559 RBBIDebugPrintf("\n");
1560 }
1561 RBBIDebugPrintf("\n\n");
1562 }
1563 #endif
1564
1565
1566 //-----------------------------------------------------------------------------
1567 //
1568 // printSafeTable Debug Function. Dump the fully constructed safe table.
1569 //
1570 //-----------------------------------------------------------------------------
1571 #ifdef RBBI_DEBUG
1572 void RBBITableBuilder::printReverseTable() {
1573 int c; // input "character"
1574 int n; // state number
1575
1576 RBBIDebugPrintf(" Safe Reverse Table \n");
1577 if (fSafeTable == nullptr) {
1578 RBBIDebugPrintf(" --- nullptr ---\n");
1579 return;
1580 }
1581 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1582 RBBIDebugPrintf(" | Acc LA Tag");
1583 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1584 RBBIDebugPrintf(" %2d", c);
1585 }
1586 RBBIDebugPrintf("\n");
1587 RBBIDebugPrintf(" |---------------");
1588 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589 RBBIDebugPrintf("---");
1590 }
1591 RBBIDebugPrintf("\n");
1592
1593 for (n=0; n<fSafeTable->size(); n++) {
1594 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1595 RBBIDebugPrintf(" %3d | " , n);
1596 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1597 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1598 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1599 }
1600 RBBIDebugPrintf("\n");
1601 }
1602 RBBIDebugPrintf("\n\n");
1603 }
1604 #endif
1605
1606
1607
1608 //-----------------------------------------------------------------------------
1609 //
1610 // printRuleStatusTable Debug Function. Dump the common rule status table
1611 //
1612 //-----------------------------------------------------------------------------
1613 #ifdef RBBI_DEBUG
1614 void RBBITableBuilder::printRuleStatusTable() {
1615 int32_t thisRecord = 0;
1616 int32_t nextRecord = 0;
1617 int i;
1618 UVector *tbl = fRB->fRuleStatusVals;
1619
1620 RBBIDebugPrintf("index | tags \n");
1621 RBBIDebugPrintf("-------------------\n");
1622
1623 while (nextRecord < tbl->size()) {
1624 thisRecord = nextRecord;
1625 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1626 RBBIDebugPrintf("%4d ", thisRecord);
1627 for (i=thisRecord+1; i<nextRecord; i++) {
1628 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1629 }
1630 RBBIDebugPrintf("\n");
1631 }
1632 RBBIDebugPrintf("\n\n");
1633 }
1634 #endif
1635
1636
1637 //-----------------------------------------------------------------------------
1638 //
1639 // RBBIStateDescriptor Methods. This is a very struct-like class
1640 // Most access is directly to the fields.
1641 //
1642 //-----------------------------------------------------------------------------
1643
1644 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1645 fMarked = FALSE;
1646 fAccepting = 0;
1647 fLookAhead = 0;
1648 fTagsIdx = 0;
1649 fTagVals = NULL;
1650 fPositions = NULL;
1651 fDtran = NULL;
1652
1653 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1654 if (U_FAILURE(*fStatus)) {
1655 return;
1656 }
1657 if (fDtran == NULL) {
1658 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1659 return;
1660 }
1661 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1662 // It is indexed by input symbols, and will
1663 // hold the next state number for each
1664 // symbol.
1665 }
1666
1667
1668 RBBIStateDescriptor::~RBBIStateDescriptor() {
1669 delete fPositions;
1670 delete fDtran;
1671 delete fTagVals;
1672 fPositions = NULL;
1673 fDtran = NULL;
1674 fTagVals = NULL;
1675 }
1676
1677 U_NAMESPACE_END
1678
1679 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */