]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbitblb.cpp
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (c) 2002-2016, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*/
73c04bcf
A
9//
10// rbbitblb.cpp
11//
12
b75a7d8f
A
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/unistr.h"
19#include "rbbitblb.h"
20#include "rbbirb.h"
21#include "rbbisetb.h"
22#include "rbbidata.h"
23#include "cstring.h"
24#include "uassert.h"
0f5d89e8 25#include "uvectr32.h"
73c04bcf 26#include "cmemory.h"
b75a7d8f
A
27
28U_NAMESPACE_BEGIN
29
0f5d89e8
A
30RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
31 fRB(rb),
32 fTree(*rootNode),
33 fStatus(&status),
34 fDStates(nullptr),
35 fSafeTable(nullptr) {
374ca955 36 if (U_FAILURE(status)) {
374ca955
A
37 return;
38 }
0f5d89e8
A
39 // fDStates is UVector<RBBIStateDescriptor *>
40 fDStates = new UVector(status);
41 if (U_SUCCESS(status) && fDStates == nullptr ) {
42 status = U_MEMORY_ALLOCATION_ERROR;
374ca955 43 }
b75a7d8f
A
44}
45
46
47
48RBBITableBuilder::~RBBITableBuilder() {
49 int i;
50 for (i=0; i<fDStates->size(); i++) {
51 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
52 }
0f5d89e8
A
53 delete fDStates;
54 delete fSafeTable;
b75a7d8f
A
55}
56
57
58//-----------------------------------------------------------------------------
59//
0f5d89e8
A
60// RBBITableBuilder::buildForwardTable - This is the main function for building
61// the DFA state transition table from the RBBI rules parse tree.
b75a7d8f
A
62//
63//-----------------------------------------------------------------------------
0f5d89e8 64void RBBITableBuilder::buildForwardTable() {
b75a7d8f
A
65
66 if (U_FAILURE(*fStatus)) {
67 return;
68 }
69
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
72 if (fTree==NULL) {
73 return;
74 }
75
76 //
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
79 //
80 fTree = fTree->flattenVariables();
73c04bcf 81#ifdef RBBI_DEBUG
b75a7d8f 82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
2ca993e8 83 RBBIDebugPuts("\nParse tree after flattening variable references.");
f3c0d7a5 84 RBBINode::printTree(fTree, TRUE);
b75a7d8f 85 }
73c04bcf
A
86#endif
87
88 //
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
93 //
94 if (fRB->fSetBuilder->sawBOF()) {
95 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
46f4442e
A
97 // Delete and exit if memory allocation failed.
98 if (bofTop == NULL || bofLeaf == NULL) {
99 *fStatus = U_MEMORY_ALLOCATION_ERROR;
100 delete bofTop;
101 delete bofLeaf;
102 return;
103 }
73c04bcf
A
104 bofTop->fLeftChild = bofLeaf;
105 bofTop->fRightChild = fTree;
106 bofLeaf->fParent = bofTop;
107 bofLeaf->fVal = 2; // Reserved value for {bof}.
108 fTree = bofTop;
109 }
b75a7d8f
A
110
111 //
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
115 //
116 RBBINode *cn = new RBBINode(RBBINode::opCat);
46f4442e
A
117 // Exit if memory allocation failed.
118 if (cn == NULL) {
119 *fStatus = U_MEMORY_ALLOCATION_ERROR;
120 return;
121 }
b75a7d8f
A
122 cn->fLeftChild = fTree;
123 fTree->fParent = cn;
124 cn->fRightChild = new RBBINode(RBBINode::endMark);
46f4442e
A
125 // Delete and exit if memory allocation failed.
126 if (cn->fRightChild == NULL) {
127 *fStatus = U_MEMORY_ALLOCATION_ERROR;
128 delete cn;
129 return;
130 }
b75a7d8f
A
131 cn->fRightChild->fParent = cn;
132 fTree = cn;
133
134 //
135 // Replace all references to UnicodeSets with the tree for the equivalent
136 // expression.
137 //
138 fTree->flattenSets();
73c04bcf 139#ifdef RBBI_DEBUG
b75a7d8f 140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
2ca993e8 141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
f3c0d7a5 142 RBBINode::printTree(fTree, TRUE);
b75a7d8f 143 }
73c04bcf 144#endif
b75a7d8f
A
145
146
147 //
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
153 //
154 calcNullable(fTree);
155 calcFirstPos(fTree);
156 calcLastPos(fTree);
157 calcFollowPos(fTree);
158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
374ca955 159 RBBIDebugPuts("\n");
b75a7d8f
A
160 printPosSets(fTree);
161 }
162
374ca955
A
163 //
164 // For "chained" rules, modify the followPos sets
165 //
166 if (fRB->fChainRules) {
167 calcChainedFollowPos(fTree);
168 }
169
73c04bcf
A
170 //
171 // BOF (start of input) test fixup.
172 //
173 if (fRB->fSetBuilder->sawBOF()) {
174 bofFixup();
175 }
176
b75a7d8f
A
177 //
178 // Build the DFA state transition tables.
179 //
180 buildStateTable();
181 flagAcceptingStates();
182 flagLookAheadStates();
183 flagTaggedStates();
b75a7d8f 184
374ca955
A
185 //
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
189 //
190 mergeRuleStatusVals();
b75a7d8f
A
191}
192
193
194
195//-----------------------------------------------------------------------------
196//
197// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198//
199//-----------------------------------------------------------------------------
200void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237}
238
239
240
241
242//-----------------------------------------------------------------------------
243//
244// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245//
246//-----------------------------------------------------------------------------
247void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
73c04bcf
A
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
b75a7d8f
A
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284}
285
286
287
288//-----------------------------------------------------------------------------
289//
290// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291//
292//-----------------------------------------------------------------------------
293void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
73c04bcf
A
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
b75a7d8f
A
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330}
331
332
333
334//-----------------------------------------------------------------------------
335//
336// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337//
338//-----------------------------------------------------------------------------
339void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376}
377
2ca993e8
A
378//-----------------------------------------------------------------------------
379//
380// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381// as roots of a rule to a destination vector.
382//
383//-----------------------------------------------------------------------------
384void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385 if (node == NULL || U_FAILURE(*fStatus)) {
386 return;
387 }
388 if (node->fRuleRoot) {
389 dest->addElement(node, *fStatus);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
392 return;
393 }
394 addRuleRootNodes(dest, node->fLeftChild);
395 addRuleRootNodes(dest, node->fRightChild);
396}
b75a7d8f 397
374ca955
A
398//-----------------------------------------------------------------------------
399//
400// calcChainedFollowPos. Modify the previously calculated followPos sets
401// to implement rule chaining. NOT described by Aho
402//
403//-----------------------------------------------------------------------------
404void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405
406 UVector endMarkerNodes(*fStatus);
407 UVector leafNodes(*fStatus);
408 int32_t i;
409
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list of all endmarker nodes.
415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416
73c04bcf 417 // get a list all leaf nodes
374ca955
A
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
2ca993e8
A
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
374ca955 429
2ca993e8 430 UVector matchStartNodes(*fStatus);
3d1f044b
A
431 for (int j=0; j<ruleRootNodes.size(); ++j) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
2ca993e8
A
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
374ca955 440
374ca955
A
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 RBBINode *endNode = NULL;
447
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i=0; i<endMarkerNodes.size(); i++) {
451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452 endNode = tNode;
453 break;
454 }
455 }
456 if (endNode == NULL) {
457 // node wasn't an end node. Try again with the next.
458 continue;
459 }
460
461 // We've got a node that can end a match.
462
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
f3c0d7a5 467 if (fRB->fLBCMNoChain) {
374ca955 468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
73c04bcf
A
469 if (c != -1) {
470 // c == -1 occurs with sets containing only the {eof} marker string.
f3c0d7a5
A
471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472 if (cLBProp == U_LB_COMBINING_MARK) {
473 continue;
73c04bcf 474 }
374ca955
A
475 }
476 }
477
478
479 // Now iterate over the nodes that can start a match, looking for ones
480 // with the same char class as our ending node.
481 RBBINode *startNode;
2ca993e8
A
482 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
483 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
374ca955
A
484 if (startNode->fType != RBBINode::leafChar) {
485 continue;
486 }
487
488 if (endNode->fVal == startNode->fVal) {
489 // The end val (character class) of one possible match is the
490 // same as the start of another.
491
492 // Add all nodes from the followPos of the start node to the
493 // followPos set of the end node, which will have the effect of
494 // letting matches transition from a match state at endNode
495 // to the second char of a match starting with startNode.
496 setAdd(endNode->fFollowPos, startNode->fFollowPos);
497 }
498 }
499 }
500}
501
502
73c04bcf
A
503//-----------------------------------------------------------------------------
504//
505// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
506// Do an swizzle similar to chaining, modifying the followPos set of
507// the bofNode to include the followPos nodes from other {bot} nodes
508// scattered through the tree.
509//
510// This function has much in common with calcChainedFollowPos().
511//
512//-----------------------------------------------------------------------------
513void RBBITableBuilder::bofFixup() {
514
515 if (U_FAILURE(*fStatus)) {
516 return;
517 }
518
519 // The parse tree looks like this ...
520 // fTree root ---> <cat>
521 // / \ .
522 // <cat> <#end node>
523 // / \ .
524 // <bofNode> rest
525 // of tree
526 //
527 // We will be adding things to the followPos set of the <bofNode>
528 //
529 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
530 U_ASSERT(bofNode->fType == RBBINode::leafChar);
531 U_ASSERT(bofNode->fVal == 2);
532
533 // Get all nodes that can be the start a match of the user-written rules
534 // (excluding the fake bofNode)
535 // We want the nodes that can start a match in the
536 // part labeled "rest of tree"
537 //
538 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539
540 RBBINode *startNode;
541 int startNodeIx;
542 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544 if (startNode->fType != RBBINode::leafChar) {
545 continue;
546 }
547
548 if (startNode->fVal == bofNode->fVal) {
549 // We found a leaf node corresponding to a {bof} that was
550 // explicitly written into a rule.
551 // Add everything from the followPos set of this node to the
552 // followPos set of the fake bofNode at the start of the tree.
553 //
554 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555 }
556 }
557}
558
b75a7d8f
A
559//-----------------------------------------------------------------------------
560//
561// buildStateTable() Determine the set of runtime DFA states and the
562// transition tables for these states, by the algorithm
563// of fig. 3.44 in Aho.
564//
565// Most of the comments are quotes of Aho's psuedo-code.
566//
567//-----------------------------------------------------------------------------
568void RBBITableBuilder::buildStateTable() {
374ca955
A
569 if (U_FAILURE(*fStatus)) {
570 return;
571 }
46f4442e
A
572 RBBIStateDescriptor *failState;
573 // Set it to NULL to avoid uninitialized warning
574 RBBIStateDescriptor *initialState = NULL;
b75a7d8f
A
575 //
576 // Add a dummy state 0 - the stop state. Not from Aho.
577 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
46f4442e
A
578 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579 if (failState == NULL) {
580 *fStatus = U_MEMORY_ALLOCATION_ERROR;
581 goto ExitBuildSTdeleteall;
582 }
b75a7d8f 583 failState->fPositions = new UVector(*fStatus);
46f4442e
A
584 if (failState->fPositions == NULL) {
585 *fStatus = U_MEMORY_ALLOCATION_ERROR;
586 }
587 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588 goto ExitBuildSTdeleteall;
374ca955 589 }
b75a7d8f 590 fDStates->addElement(failState, *fStatus);
374ca955 591 if (U_FAILURE(*fStatus)) {
46f4442e 592 goto ExitBuildSTdeleteall;
374ca955 593 }
b75a7d8f
A
594
595 // initially, the only unmarked state in Dstates is firstpos(root),
596 // where toot is the root of the syntax tree for (r)#;
46f4442e
A
597 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598 if (initialState == NULL) {
599 *fStatus = U_MEMORY_ALLOCATION_ERROR;
600 }
374ca955 601 if (U_FAILURE(*fStatus)) {
46f4442e 602 goto ExitBuildSTdeleteall;
374ca955 603 }
b75a7d8f 604 initialState->fPositions = new UVector(*fStatus);
46f4442e
A
605 if (initialState->fPositions == NULL) {
606 *fStatus = U_MEMORY_ALLOCATION_ERROR;
607 }
374ca955 608 if (U_FAILURE(*fStatus)) {
46f4442e 609 goto ExitBuildSTdeleteall;
374ca955 610 }
b75a7d8f
A
611 setAdd(initialState->fPositions, fTree->fFirstPosSet);
612 fDStates->addElement(initialState, *fStatus);
374ca955 613 if (U_FAILURE(*fStatus)) {
46f4442e 614 goto ExitBuildSTdeleteall;
374ca955 615 }
b75a7d8f
A
616
617 // while there is an unmarked state T in Dstates do begin
618 for (;;) {
619 RBBIStateDescriptor *T = NULL;
620 int32_t tx;
621 for (tx=1; tx<fDStates->size(); tx++) {
622 RBBIStateDescriptor *temp;
623 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624 if (temp->fMarked == FALSE) {
625 T = temp;
626 break;
627 }
628 }
629 if (T == NULL) {
630 break;
631 }
632
633 // mark T;
634 T->fMarked = TRUE;
635
636 // for each input symbol a do begin
637 int32_t a;
638 for (a = 1; a<=lastInputSymbol; a++) {
639 // let U be the set of positions that are in followpos(p)
640 // for some position p in T
641 // such that the symbol at position p is a;
642 UVector *U = NULL;
643 RBBINode *p;
644 int32_t px;
645 for (px=0; px<T->fPositions->size(); px++) {
646 p = (RBBINode *)T->fPositions->elementAt(px);
647 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
648 if (U == NULL) {
649 U = new UVector(*fStatus);
46f4442e
A
650 if (U == NULL) {
651 *fStatus = U_MEMORY_ALLOCATION_ERROR;
652 goto ExitBuildSTdeleteall;
653 }
b75a7d8f
A
654 }
655 setAdd(U, p->fFollowPos);
656 }
657 }
658
659 // if U is not empty and not in DStates then
660 int32_t ux = 0;
661 UBool UinDstates = FALSE;
662 if (U != NULL) {
663 U_ASSERT(U->size() > 0);
664 int ix;
665 for (ix=0; ix<fDStates->size(); ix++) {
666 RBBIStateDescriptor *temp2;
667 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668 if (setEquals(U, temp2->fPositions)) {
669 delete U;
670 U = temp2->fPositions;
671 ux = ix;
672 UinDstates = TRUE;
673 break;
674 }
675 }
676
677 // Add U as an unmarked state to Dstates
678 if (!UinDstates)
679 {
680 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
46f4442e
A
681 if (newState == NULL) {
682 *fStatus = U_MEMORY_ALLOCATION_ERROR;
683 }
374ca955 684 if (U_FAILURE(*fStatus)) {
46f4442e 685 goto ExitBuildSTdeleteall;
374ca955 686 }
b75a7d8f
A
687 newState->fPositions = U;
688 fDStates->addElement(newState, *fStatus);
374ca955
A
689 if (U_FAILURE(*fStatus)) {
690 return;
691 }
b75a7d8f
A
692 ux = fDStates->size()-1;
693 }
694
695 // Dtran[T, a] := U;
696 T->fDtran->setElementAt(ux, a);
697 }
698 }
699 }
46f4442e
A
700 return;
701 // delete local pointers only if error occured.
702ExitBuildSTdeleteall:
703 delete initialState;
704 delete failState;
b75a7d8f
A
705}
706
707
708
709//-----------------------------------------------------------------------------
710//
711// flagAcceptingStates Identify accepting states.
712// First get a list of all of the end marker nodes.
713// Then, for each state s,
714// if s contains one of the end marker nodes in its list of tree positions then
715// s is an accepting state.
716//
717//-----------------------------------------------------------------------------
718void RBBITableBuilder::flagAcceptingStates() {
374ca955
A
719 if (U_FAILURE(*fStatus)) {
720 return;
721 }
b75a7d8f
A
722 UVector endMarkerNodes(*fStatus);
723 RBBINode *endMarker;
724 int32_t i;
725 int32_t n;
726
374ca955
A
727 if (U_FAILURE(*fStatus)) {
728 return;
729 }
730
b75a7d8f 731 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
374ca955
A
732 if (U_FAILURE(*fStatus)) {
733 return;
734 }
b75a7d8f
A
735
736 for (i=0; i<endMarkerNodes.size(); i++) {
737 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738 for (n=0; n<fDStates->size(); n++) {
739 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740 if (sd->fPositions->indexOf(endMarker) >= 0) {
741 // Any non-zero value for fAccepting means this is an accepting node.
742 // The value is what will be returned to the user as the break status.
743 // If no other value was specified, force it to -1.
73c04bcf
A
744
745 if (sd->fAccepting==0) {
746 // State hasn't been marked as accepting yet. Do it now.
747 sd->fAccepting = endMarker->fVal;
748 if (sd->fAccepting == 0) {
749 sd->fAccepting = -1;
750 }
751 }
752 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753 // Both lookahead and non-lookahead accepting for this state.
754 // Favor the look-ahead. Expedient for line break.
755 // TODO: need a more elegant resolution for conflicting rules.
756 sd->fAccepting = endMarker->fVal;
b75a7d8f 757 }
73c04bcf
A
758 // implicit else:
759 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
b75a7d8f
A
760
761 // If the end marker node is from a look-ahead rule, set
0f5d89e8 762 // the fLookAhead field for this state also.
b75a7d8f 763 if (endMarker->fLookAheadEnd) {
73c04bcf
A
764 // TODO: don't change value if already set?
765 // TODO: allow for more than one active look-ahead rule in engine.
766 // Make value here an index to a side array in engine?
b75a7d8f
A
767 sd->fLookAhead = sd->fAccepting;
768 }
769 }
770 }
771 }
772}
773
774
775//-----------------------------------------------------------------------------
776//
777// flagLookAheadStates Very similar to flagAcceptingStates, above.
778//
779//-----------------------------------------------------------------------------
780void RBBITableBuilder::flagLookAheadStates() {
374ca955
A
781 if (U_FAILURE(*fStatus)) {
782 return;
783 }
b75a7d8f
A
784 UVector lookAheadNodes(*fStatus);
785 RBBINode *lookAheadNode;
786 int32_t i;
787 int32_t n;
788
789 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
374ca955
A
790 if (U_FAILURE(*fStatus)) {
791 return;
792 }
b75a7d8f
A
793 for (i=0; i<lookAheadNodes.size(); i++) {
794 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795
796 for (n=0; n<fDStates->size(); n++) {
797 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799 sd->fLookAhead = lookAheadNode->fVal;
800 }
801 }
802 }
803}
804
805
806
807
808//-----------------------------------------------------------------------------
809//
810// flagTaggedStates
811//
812//-----------------------------------------------------------------------------
813void RBBITableBuilder::flagTaggedStates() {
374ca955
A
814 if (U_FAILURE(*fStatus)) {
815 return;
816 }
b75a7d8f
A
817 UVector tagNodes(*fStatus);
818 RBBINode *tagNode;
819 int32_t i;
820 int32_t n;
821
374ca955
A
822 if (U_FAILURE(*fStatus)) {
823 return;
824 }
b75a7d8f 825 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
374ca955
A
826 if (U_FAILURE(*fStatus)) {
827 return;
828 }
b75a7d8f
A
829 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
830 tagNode = (RBBINode *)tagNodes.elementAt(i);
73c04bcf 831
b75a7d8f
A
832 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
833 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
374ca955 835 sortedAdd(&sd->fTagVals, tagNode->fVal);
b75a7d8f
A
836 }
837 }
838 }
839}
374ca955
A
840
841
842
843
844//-----------------------------------------------------------------------------
845//
846// mergeRuleStatusVals
847//
848// Update the global table of rule status {tag} values
849// The rule builder has a global vector of status values that are common
850// for all tables. Merge the ones from this table into the global set.
851//
852//-----------------------------------------------------------------------------
853void RBBITableBuilder::mergeRuleStatusVals() {
854 //
855 // The basic outline of what happens here is this...
856 //
857 // for each state in this state table
858 // if the status tag list for this state is in the global statuses list
859 // record where and
860 // continue with the next state
861 // else
862 // add the tag list for this state to the global list.
863 //
864 int i;
865 int n;
866
867 // Pre-set a single tag of {0} into the table.
868 // We will need this as a default, for rule sets with no explicit tagging.
869 if (fRB->fRuleStatusVals->size() == 0) {
870 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
871 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
872 }
73c04bcf
A
873
874 // For each state
875 for (n=0; n<fDStates->size(); n++) {
374ca955
A
876 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877 UVector *thisStatesTagValues = sd->fTagVals;
878 if (thisStatesTagValues == NULL) {
879 // No tag values are explicitly associated with this state.
880 // Set the default tag value.
881 sd->fTagsIdx = 0;
882 continue;
883 }
884
885 // There are tag(s) associated with this state.
886 // fTagsIdx will be the index into the global tag list for this state's tag values.
887 // Initial value of -1 flags that we haven't got it set yet.
888 sd->fTagsIdx = -1;
889 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
890 int32_t nextTagGroupStart = 0;
73c04bcf 891
374ca955
A
892 // Loop runs once per group of tags in the global list
893 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894 thisTagGroupStart = nextTagGroupStart;
895 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897 // The number of tags for this state is different from
898 // the number of tags in this group from the global list.
899 // Continue with the next group from the global list.
900 continue;
901 }
902 // The lengths match, go ahead and compare the actual tag values
903 // between this state and the group from the global list.
904 for (i=0; i<thisStatesTagValues->size(); i++) {
73c04bcf 905 if (thisStatesTagValues->elementAti(i) !=
374ca955 906 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
73c04bcf 907 // Mismatch.
374ca955
A
908 break;
909 }
910 }
73c04bcf 911
374ca955
A
912 if (i == thisStatesTagValues->size()) {
913 // We found a set of tag values in the global list that match
914 // those for this state. Use them.
915 sd->fTagsIdx = thisTagGroupStart;
73c04bcf 916 break;
374ca955
A
917 }
918 }
73c04bcf 919
374ca955
A
920 if (sd->fTagsIdx == -1) {
921 // No suitable entry in the global tag list already. Add one
922 sd->fTagsIdx = fRB->fRuleStatusVals->size();
923 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924 for (i=0; i<thisStatesTagValues->size(); i++) {
925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926 }
927 }
928 }
929}
930
931
932
933
934
935
936
937//-----------------------------------------------------------------------------
938//
939// sortedAdd Add a value to a vector of sorted values (ints).
940// Do not replicate entries; if the value is already there, do not
941// add a second one.
942// Lazily create the vector if it does not already exist.
943//
944//-----------------------------------------------------------------------------
945void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
946 int32_t i;
947
948 if (*vector == NULL) {
949 *vector = new UVector(*fStatus);
950 }
951 if (*vector == NULL || U_FAILURE(*fStatus)) {
952 return;
953 }
954 UVector *vec = *vector;
955 int32_t vSize = vec->size();
956 for (i=0; i<vSize; i++) {
957 int32_t valAtI = vec->elementAti(i);
958 if (valAtI == val) {
959 // The value is already in the vector. Don't add it again.
960 return;
961 }
962 if (valAtI > val) {
963 break;
964 }
965 }
966 vec->insertElementAt(val, i, *fStatus);
b75a7d8f
A
967}
968
969
970
971//-----------------------------------------------------------------------------
972//
973// setAdd Set operation on UVector
974// dest = dest union source
73c04bcf 975// Elements may only appear once and must be sorted.
b75a7d8f
A
976//
977//-----------------------------------------------------------------------------
978void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
46f4442e
A
979 int32_t destOriginalSize = dest->size();
980 int32_t sourceSize = source->size();
73c04bcf 981 int32_t di = 0;
729e4ab9
A
982 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
983 void **destPtr, **sourcePtr;
73c04bcf 984 void **destLim, **sourceLim;
b75a7d8f 985
729e4ab9
A
986 if (destOriginalSize > destArray.getCapacity()) {
987 if (destArray.resize(destOriginalSize) == NULL) {
988 return;
989 }
73c04bcf 990 }
729e4ab9
A
991 destPtr = destArray.getAlias();
992 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
73c04bcf 993
729e4ab9
A
994 if (sourceSize > sourceArray.getCapacity()) {
995 if (sourceArray.resize(sourceSize) == NULL) {
996 return;
73c04bcf 997 }
73c04bcf 998 }
729e4ab9
A
999 sourcePtr = sourceArray.getAlias();
1000 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
73c04bcf
A
1001
1002 // Avoid multiple "get element" calls by getting the contents into arrays
729e4ab9
A
1003 (void) dest->toArray(destPtr);
1004 (void) source->toArray(sourcePtr);
73c04bcf 1005
46f4442e 1006 dest->setSize(sourceSize+destOriginalSize, *fStatus);
73c04bcf 1007
729e4ab9
A
1008 while (sourcePtr < sourceLim && destPtr < destLim) {
1009 if (*destPtr == *sourcePtr) {
1010 dest->setElementAt(*sourcePtr++, di++);
1011 destPtr++;
73c04bcf 1012 }
46f4442e
A
1013 // This check is required for machines with segmented memory, like i5/OS.
1014 // Direct pointer comparison is not recommended.
729e4ab9
A
1015 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1016 dest->setElementAt(*destPtr++, di++);
46f4442e 1017 }
729e4ab9
A
1018 else { /* *sourcePtr < *destPtr */
1019 dest->setElementAt(*sourcePtr++, di++);
b75a7d8f 1020 }
73c04bcf
A
1021 }
1022
1023 // At most one of these two cleanup loops will execute
729e4ab9
A
1024 while (destPtr < destLim) {
1025 dest->setElementAt(*destPtr++, di++);
73c04bcf 1026 }
729e4ab9
A
1027 while (sourcePtr < sourceLim) {
1028 dest->setElementAt(*sourcePtr++, di++);
73c04bcf
A
1029 }
1030
46f4442e 1031 dest->setSize(di, *fStatus);
b75a7d8f
A
1032}
1033
1034
374ca955 1035
b75a7d8f
A
1036//-----------------------------------------------------------------------------
1037//
1038// setEqual Set operation on UVector.
1039// Compare for equality.
73c04bcf 1040// Elements must be sorted.
b75a7d8f
A
1041//
1042//-----------------------------------------------------------------------------
1043UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
73c04bcf 1044 return a->equals(*b);
b75a7d8f
A
1045}
1046
1047
1048//-----------------------------------------------------------------------------
1049//
1050// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1051// for each node in the tree.
1052//
1053//-----------------------------------------------------------------------------
b75a7d8f 1054#ifdef RBBI_DEBUG
374ca955 1055void RBBITableBuilder::printPosSets(RBBINode *n) {
b75a7d8f
A
1056 if (n==NULL) {
1057 return;
1058 }
2ca993e8
A
1059 printf("\n");
1060 RBBINode::printNodeHeader();
f3c0d7a5 1061 RBBINode::printNode(n);
b75a7d8f
A
1062 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1063
1064 RBBIDebugPrintf(" firstpos: ");
1065 printSet(n->fFirstPosSet);
1066
1067 RBBIDebugPrintf(" lastpos: ");
1068 printSet(n->fLastPosSet);
1069
1070 RBBIDebugPrintf(" followpos: ");
1071 printSet(n->fFollowPos);
1072
1073 printPosSets(n->fLeftChild);
1074 printPosSets(n->fRightChild);
b75a7d8f 1075}
374ca955 1076#endif
b75a7d8f 1077
0f5d89e8
A
1078//
1079// findDuplCharClassFrom()
1080//
1081bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1082 int32_t numStates = fDStates->size();
1083 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1084
0f5d89e8
A
1085 for (; categories->first < numCols-1; categories->first++) {
1086 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
3d1f044b
A
1087 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1088 uint16_t table_base = 0;
1089 uint16_t table_dupl = 1;
1090 for (int32_t state=0; state<numStates; state++) {
1091 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1092 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1093 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1094 if (table_base != table_dupl) {
1095 break;
1096 }
1097 }
1098 if (table_base == table_dupl) {
1099 return true;
1100 }
0f5d89e8
A
1101 }
1102 }
1103 return false;
1104}
1105
1106
1107//
1108// removeColumn()
1109//
1110void RBBITableBuilder::removeColumn(int32_t column) {
1111 int32_t numStates = fDStates->size();
1112 for (int32_t state=0; state<numStates; state++) {
1113 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1114 U_ASSERT(column < sd->fDtran->size());
1115 sd->fDtran->removeElementAt(column);
1116 }
1117}
1118
1119/*
1120 * findDuplicateState
1121 */
1122bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1123 int32_t numStates = fDStates->size();
1124 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1125
1126 for (; states->first<numStates-1; states->first++) {
1127 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1128 for (states->second=states->first+1; states->second<numStates; states->second++) {
1129 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1130 if (firstSD->fAccepting != duplSD->fAccepting ||
1131 firstSD->fLookAhead != duplSD->fLookAhead ||
1132 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1133 continue;
1134 }
1135 bool rowsMatch = true;
1136 for (int32_t col=0; col < numCols; ++col) {
1137 int32_t firstVal = firstSD->fDtran->elementAti(col);
1138 int32_t duplVal = duplSD->fDtran->elementAti(col);
1139 if (!((firstVal == duplVal) ||
1140 ((firstVal == states->first || firstVal == states->second) &&
1141 (duplVal == states->first || duplVal == states->second)))) {
1142 rowsMatch = false;
1143 break;
1144 }
1145 }
1146 if (rowsMatch) {
1147 return true;
1148 }
1149 }
1150 }
1151 return false;
1152}
1153
1154
1155bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1156 int32_t numStates = fSafeTable->size();
1157
1158 for (; states->first<numStates-1; states->first++) {
1159 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1160 for (states->second=states->first+1; states->second<numStates; states->second++) {
1161 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1162 bool rowsMatch = true;
1163 int32_t numCols = firstRow->length();
1164 for (int32_t col=0; col < numCols; ++col) {
1165 int32_t firstVal = firstRow->charAt(col);
1166 int32_t duplVal = duplRow->charAt(col);
1167 if (!((firstVal == duplVal) ||
1168 ((firstVal == states->first || firstVal == states->second) &&
1169 (duplVal == states->first || duplVal == states->second)))) {
1170 rowsMatch = false;
1171 break;
1172 }
1173 }
1174 if (rowsMatch) {
1175 return true;
1176 }
1177 }
1178 }
1179 return false;
1180}
1181
1182
1183void RBBITableBuilder::removeState(IntPair duplStates) {
1184 const int32_t keepState = duplStates.first;
1185 const int32_t duplState = duplStates.second;
1186 U_ASSERT(keepState < duplState);
1187 U_ASSERT(duplState < fDStates->size());
1188
1189 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1190 fDStates->removeElementAt(duplState);
1191 delete duplSD;
1192
1193 int32_t numStates = fDStates->size();
1194 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1195 for (int32_t state=0; state<numStates; ++state) {
1196 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1197 for (int32_t col=0; col<numCols; col++) {
1198 int32_t existingVal = sd->fDtran->elementAti(col);
1199 int32_t newVal = existingVal;
1200 if (existingVal == duplState) {
1201 newVal = keepState;
1202 } else if (existingVal > duplState) {
1203 newVal = existingVal - 1;
1204 }
1205 sd->fDtran->setElementAt(newVal, col);
1206 }
1207 if (sd->fAccepting == duplState) {
1208 sd->fAccepting = keepState;
1209 } else if (sd->fAccepting > duplState) {
1210 sd->fAccepting--;
1211 }
1212 if (sd->fLookAhead == duplState) {
1213 sd->fLookAhead = keepState;
1214 } else if (sd->fLookAhead > duplState) {
1215 sd->fLookAhead--;
1216 }
1217 }
1218}
1219
1220void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1221 const int32_t keepState = duplStates.first;
1222 const int32_t duplState = duplStates.second;
1223 U_ASSERT(keepState < duplState);
1224 U_ASSERT(duplState < fSafeTable->size());
1225
1226 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1227 // and will auto-delete the removed element.
1228 int32_t numStates = fSafeTable->size();
1229 for (int32_t state=0; state<numStates; ++state) {
1230 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1231 int32_t numCols = sd->length();
1232 for (int32_t col=0; col<numCols; col++) {
1233 int32_t existingVal = sd->charAt(col);
1234 int32_t newVal = existingVal;
1235 if (existingVal == duplState) {
1236 newVal = keepState;
1237 } else if (existingVal > duplState) {
1238 newVal = existingVal - 1;
1239 }
3d1f044b 1240 sd->setCharAt(col, static_cast<char16_t>(newVal));
0f5d89e8
A
1241 }
1242 }
1243}
1244
1245
1246/*
1247 * RemoveDuplicateStates
1248 */
3d1f044b 1249int32_t RBBITableBuilder::removeDuplicateStates() {
0f5d89e8 1250 IntPair dupls = {3, 0};
3d1f044b
A
1251 int32_t numStatesRemoved = 0;
1252
0f5d89e8
A
1253 while (findDuplicateState(&dupls)) {
1254 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1255 removeState(dupls);
3d1f044b 1256 ++numStatesRemoved;
0f5d89e8 1257 }
3d1f044b 1258 return numStatesRemoved;
0f5d89e8 1259}
b75a7d8f
A
1260
1261
1262//-----------------------------------------------------------------------------
1263//
1264// getTableSize() Calculate the size of the runtime form of this
1265// state transition table.
1266//
1267//-----------------------------------------------------------------------------
374ca955 1268int32_t RBBITableBuilder::getTableSize() const {
b75a7d8f
A
1269 int32_t size = 0;
1270 int32_t numRows;
1271 int32_t numCols;
1272 int32_t rowSize;
1273
1274 if (fTree == NULL) {
1275 return 0;
1276 }
1277
0f5d89e8 1278 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
b75a7d8f
A
1279
1280 numRows = fDStates->size();
1281 numCols = fRB->fSetBuilder->getNumCharCategories();
1282
0f5d89e8 1283 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
b75a7d8f
A
1284 size += numRows * rowSize;
1285 return size;
1286}
1287
1288
b75a7d8f
A
1289//-----------------------------------------------------------------------------
1290//
1291// exportTable() export the state transition table in the format required
1292// by the runtime engine. getTableSize() bytes of memory
1293// must be available at the output address "where".
1294//
1295//-----------------------------------------------------------------------------
1296void RBBITableBuilder::exportTable(void *where) {
1297 RBBIStateTable *table = (RBBIStateTable *)where;
1298 uint32_t state;
1299 int col;
1300
1301 if (U_FAILURE(*fStatus) || fTree == NULL) {
1302 return;
1303 }
1304
0f5d89e8
A
1305 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1306 if (catCount > 0x7fff ||
b75a7d8f
A
1307 fDStates->size() > 0x7fff) {
1308 *fStatus = U_BRK_INTERNAL_ERROR;
1309 return;
1310 }
1311
0f5d89e8 1312 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
b75a7d8f 1313 table->fNumStates = fDStates->size();
374ca955
A
1314 table->fFlags = 0;
1315 if (fRB->fLookAheadHardBreak) {
1316 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1317 }
73c04bcf
A
1318 if (fRB->fSetBuilder->sawBOF()) {
1319 table->fFlags |= RBBI_BOF_REQUIRED;
1320 }
374ca955 1321 table->fReserved = 0;
b75a7d8f
A
1322
1323 for (state=0; state<table->fNumStates; state++) {
1324 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1325 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1326 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1327 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1328 row->fAccepting = (int16_t)sd->fAccepting;
1329 row->fLookAhead = (int16_t)sd->fLookAhead;
374ca955 1330 row->fTagIdx = (int16_t)sd->fTagsIdx;
0f5d89e8 1331 for (col=0; col<catCount; col++) {
b75a7d8f
A
1332 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1333 }
1334 }
1335}
1336
1337
0f5d89e8
A
1338/**
1339 * Synthesize a safe state table from the main state table.
1340 */
1341void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1342 // The safe table creation has three steps:
1343
1344 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1345 // following the pair do not depend on context or state before the pair. To test
1346 // whether a pair is safe, run it through the main forward state table, starting
1347 // from each state. If the the final state is the same, no matter what the starting state,
1348 // the pair is safe.
1349 //
1350 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1351 // forward table, with a column for each input character [class], and a row for
1352 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1353 // create an additional state for each input character category; being in
1354 // one of these states means that the character has been seen, and is potentially
1355 // the first of a pair. In each of these rows, the entry for the second character
1356 // of a safe pair is set to the stop state (0), indicating that a match was found.
1357 // All other table entries are set to the state corresponding the current input
1358 // character, allowing that charcter to be the of a start following pair.
1359 //
1360 // Because the safe rules are to be run in reverse, moving backwards in the text,
1361 // the first and second pair categories are swapped when building the table.
1362 //
1363 // 3. Compress the table. There are typically many rows (states) that are
1364 // equivalent - that have zeroes (match completed) in the same columns -
1365 // and can be folded together.
1366
1367 // Each safe pair is stored as two UChars in the safePair string.
1368 UnicodeString safePairs;
1369
1370 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1371 int32_t numStates = fDStates->size();
1372
1373 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1374 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1375 int32_t wantedEndState = -1;
1376 int32_t endState = 0;
1377 for (int32_t startState = 1; startState < numStates; ++startState) {
1378 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1379 int32_t s2 = startStateD->fDtran->elementAti(c1);
1380 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1381 endState = s2StateD->fDtran->elementAti(c2);
1382 if (wantedEndState < 0) {
1383 wantedEndState = endState;
1384 } else {
1385 if (wantedEndState != endState) {
1386 break;
1387 }
1388 }
1389 }
1390 if (wantedEndState == endState) {
1391 safePairs.append((char16_t)c1);
1392 safePairs.append((char16_t)c2);
1393 // printf("(%d, %d) ", c1, c2);
1394 }
1395 }
1396 // printf("\n");
1397 }
1398
1399 // Populate the initial safe table.
1400 // The table as a whole is UVector<UnicodeString>
1401 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1402 // Row 0 is the stop state.
1403 // Row 1 is the start sate.
1404 // Row 2 and beyond are other states, initially one per char class, but
1405 // after initial construction, many of the states will be combined, compacting the table.
1406 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1407 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1408
1409 U_ASSERT(fSafeTable == nullptr);
1410 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1411 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1412 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1413 }
1414
1415 // From the start state, each input char class transitions to the state for that input.
1416 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1417 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1418 // Note: +2 for the start & stop state.
3d1f044b 1419 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
0f5d89e8
A
1420 }
1421
1422 // Initially make every other state table row look like the start state row,
1423 for (int32_t row=2; row<numCharClasses+2; ++row) {
1424 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1425 rowState = startState; // UnicodeString assignment, copies contents.
1426 }
1427
1428 // Run through the safe pairs, set the next state to zero when pair has been seen.
1429 // Zero being the stop state, meaning we found a safe point.
1430 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1431 int32_t c1 = safePairs.charAt(pairIdx);
1432 int32_t c2 = safePairs.charAt(pairIdx + 1);
1433
1434 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1435 rowState.setCharAt(c1, 0);
1436 }
1437
1438 // Remove duplicate or redundant rows from the table.
1439 IntPair states = {1, 0};
1440 while (findDuplicateSafeState(&states)) {
1441 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1442 removeSafeState(states);
1443 }
1444}
1445
1446
1447//-----------------------------------------------------------------------------
1448//
1449// getSafeTableSize() Calculate the size of the runtime form of this
1450// safe state table.
1451//
1452//-----------------------------------------------------------------------------
1453int32_t RBBITableBuilder::getSafeTableSize() const {
1454 int32_t size = 0;
1455 int32_t numRows;
1456 int32_t numCols;
1457 int32_t rowSize;
1458
1459 if (fSafeTable == nullptr) {
1460 return 0;
1461 }
1462
1463 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1464
1465 numRows = fSafeTable->size();
1466 numCols = fRB->fSetBuilder->getNumCharCategories();
1467
1468 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1469 size += numRows * rowSize;
1470 return size;
1471}
1472
1473
1474//-----------------------------------------------------------------------------
1475//
1476// exportSafeTable() export the state transition table in the format required
1477// by the runtime engine. getTableSize() bytes of memory
1478// must be available at the output address "where".
1479//
1480//-----------------------------------------------------------------------------
1481void RBBITableBuilder::exportSafeTable(void *where) {
1482 RBBIStateTable *table = (RBBIStateTable *)where;
1483 uint32_t state;
1484 int col;
1485
1486 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1487 return;
1488 }
1489
1490 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1491 if (catCount > 0x7fff ||
1492 fSafeTable->size() > 0x7fff) {
1493 *fStatus = U_BRK_INTERNAL_ERROR;
1494 return;
1495 }
1496
1497 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1498 table->fNumStates = fSafeTable->size();
1499 table->fFlags = 0;
1500 table->fReserved = 0;
1501
1502 for (state=0; state<table->fNumStates; state++) {
1503 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1504 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1505 row->fAccepting = 0;
1506 row->fLookAhead = 0;
1507 row->fTagIdx = 0;
1508 row->fReserved = 0;
1509 for (col=0; col<catCount; col++) {
1510 row->fNextState[col] = rowString->charAt(col);
1511 }
1512 }
1513}
1514
1515
1516
b75a7d8f
A
1517
1518//-----------------------------------------------------------------------------
1519//
1520// printSet Debug function. Print the contents of a UVector
1521//
1522//-----------------------------------------------------------------------------
b75a7d8f 1523#ifdef RBBI_DEBUG
374ca955 1524void RBBITableBuilder::printSet(UVector *s) {
b75a7d8f
A
1525 int32_t i;
1526 for (i=0; i<s->size(); i++) {
2ca993e8
A
1527 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1528 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
b75a7d8f
A
1529 }
1530 RBBIDebugPrintf("\n");
b75a7d8f 1531}
374ca955 1532#endif
b75a7d8f
A
1533
1534
1535//-----------------------------------------------------------------------------
1536//
1537// printStates Debug Function. Dump the fully constructed state transition table.
1538//
1539//-----------------------------------------------------------------------------
b75a7d8f 1540#ifdef RBBI_DEBUG
374ca955 1541void RBBITableBuilder::printStates() {
b75a7d8f
A
1542 int c; // input "character"
1543 int n; // state number
1544
1545 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1546 RBBIDebugPrintf(" | Acc LA Tag");
374ca955
A
1547 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548 RBBIDebugPrintf(" %2d", c);
1549 }
b75a7d8f
A
1550 RBBIDebugPrintf("\n");
1551 RBBIDebugPrintf(" |---------------");
374ca955
A
1552 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1553 RBBIDebugPrintf("---");
1554 }
b75a7d8f
A
1555 RBBIDebugPrintf("\n");
1556
1557 for (n=0; n<fDStates->size(); n++) {
1558 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1559 RBBIDebugPrintf(" %3d | " , n);
374ca955 1560 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
b75a7d8f
A
1561 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1562 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1563 }
1564 RBBIDebugPrintf("\n");
1565 }
1566 RBBIDebugPrintf("\n\n");
b75a7d8f 1567}
374ca955 1568#endif
b75a7d8f
A
1569
1570
0f5d89e8
A
1571//-----------------------------------------------------------------------------
1572//
1573// printSafeTable Debug Function. Dump the fully constructed safe table.
1574//
1575//-----------------------------------------------------------------------------
1576#ifdef RBBI_DEBUG
1577void RBBITableBuilder::printReverseTable() {
1578 int c; // input "character"
1579 int n; // state number
1580
1581 RBBIDebugPrintf(" Safe Reverse Table \n");
1582 if (fSafeTable == nullptr) {
1583 RBBIDebugPrintf(" --- nullptr ---\n");
1584 return;
1585 }
1586 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1587 RBBIDebugPrintf(" | Acc LA Tag");
1588 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589 RBBIDebugPrintf(" %2d", c);
1590 }
1591 RBBIDebugPrintf("\n");
1592 RBBIDebugPrintf(" |---------------");
1593 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1594 RBBIDebugPrintf("---");
1595 }
1596 RBBIDebugPrintf("\n");
1597
1598 for (n=0; n<fSafeTable->size(); n++) {
1599 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1600 RBBIDebugPrintf(" %3d | " , n);
1601 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1602 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1603 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1604 }
1605 RBBIDebugPrintf("\n");
1606 }
1607 RBBIDebugPrintf("\n\n");
1608}
1609#endif
1610
1611
b75a7d8f 1612
374ca955
A
1613//-----------------------------------------------------------------------------
1614//
1615// printRuleStatusTable Debug Function. Dump the common rule status table
1616//
1617//-----------------------------------------------------------------------------
1618#ifdef RBBI_DEBUG
1619void RBBITableBuilder::printRuleStatusTable() {
1620 int32_t thisRecord = 0;
1621 int32_t nextRecord = 0;
1622 int i;
1623 UVector *tbl = fRB->fRuleStatusVals;
1624
1625 RBBIDebugPrintf("index | tags \n");
1626 RBBIDebugPrintf("-------------------\n");
73c04bcf 1627
374ca955
A
1628 while (nextRecord < tbl->size()) {
1629 thisRecord = nextRecord;
1630 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1631 RBBIDebugPrintf("%4d ", thisRecord);
1632 for (i=thisRecord+1; i<nextRecord; i++) {
1633 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1634 }
1635 RBBIDebugPrintf("\n");
1636 }
1637 RBBIDebugPrintf("\n\n");
1638}
1639#endif
b75a7d8f
A
1640
1641
1642//-----------------------------------------------------------------------------
1643//
1644// RBBIStateDescriptor Methods. This is a very struct-like class
1645// Most access is directly to the fields.
1646//
1647//-----------------------------------------------------------------------------
1648
1649RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1650 fMarked = FALSE;
1651 fAccepting = 0;
1652 fLookAhead = 0;
374ca955
A
1653 fTagsIdx = 0;
1654 fTagVals = NULL;
b75a7d8f
A
1655 fPositions = NULL;
1656 fDtran = NULL;
73c04bcf 1657
0f5d89e8 1658 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
b75a7d8f
A
1659 if (U_FAILURE(*fStatus)) {
1660 return;
1661 }
b75a7d8f
A
1662 if (fDtran == NULL) {
1663 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1664 return;
1665 }
0f5d89e8 1666 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
b75a7d8f
A
1667 // It is indexed by input symbols, and will
1668 // hold the next state number for each
1669 // symbol.
1670}
1671
1672
1673RBBIStateDescriptor::~RBBIStateDescriptor() {
1674 delete fPositions;
1675 delete fDtran;
374ca955 1676 delete fTagVals;
b75a7d8f
A
1677 fPositions = NULL;
1678 fDtran = NULL;
374ca955 1679 fTagVals = NULL;
b75a7d8f
A
1680}
1681
1682U_NAMESPACE_END
1683
1684#endif /* #if !UCONFIG_NO_BREAK_ITERATION */