]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbitblb.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/*
4**********************************************************************
2ca993e8 5* Copyright (c) 2002-2016, International Business Machines
b75a7d8f
A
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*/
73c04bcf
A
9//
10// rbbitblb.cpp
11//
12
b75a7d8f
A
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/unistr.h"
19#include "rbbitblb.h"
20#include "rbbirb.h"
21#include "rbbisetb.h"
22#include "rbbidata.h"
23#include "cstring.h"
24#include "uassert.h"
0f5d89e8 25#include "uvectr32.h"
73c04bcf 26#include "cmemory.h"
b75a7d8f
A
27
28U_NAMESPACE_BEGIN
29
0f5d89e8
A
30RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
31 fRB(rb),
32 fTree(*rootNode),
33 fStatus(&status),
34 fDStates(nullptr),
35 fSafeTable(nullptr) {
374ca955 36 if (U_FAILURE(status)) {
374ca955
A
37 return;
38 }
0f5d89e8
A
39 // fDStates is UVector<RBBIStateDescriptor *>
40 fDStates = new UVector(status);
41 if (U_SUCCESS(status) && fDStates == nullptr ) {
42 status = U_MEMORY_ALLOCATION_ERROR;
374ca955 43 }
b75a7d8f
A
44}
45
46
47
48RBBITableBuilder::~RBBITableBuilder() {
49 int i;
50 for (i=0; i<fDStates->size(); i++) {
51 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
52 }
0f5d89e8
A
53 delete fDStates;
54 delete fSafeTable;
b75a7d8f
A
55}
56
57
58//-----------------------------------------------------------------------------
59//
0f5d89e8
A
60// RBBITableBuilder::buildForwardTable - This is the main function for building
61// the DFA state transition table from the RBBI rules parse tree.
b75a7d8f
A
62//
63//-----------------------------------------------------------------------------
0f5d89e8 64void RBBITableBuilder::buildForwardTable() {
b75a7d8f
A
65
66 if (U_FAILURE(*fStatus)) {
67 return;
68 }
69
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
72 if (fTree==NULL) {
73 return;
74 }
75
76 //
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
79 //
80 fTree = fTree->flattenVariables();
73c04bcf 81#ifdef RBBI_DEBUG
b75a7d8f 82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
2ca993e8 83 RBBIDebugPuts("\nParse tree after flattening variable references.");
f3c0d7a5 84 RBBINode::printTree(fTree, TRUE);
b75a7d8f 85 }
73c04bcf
A
86#endif
87
88 //
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
93 //
94 if (fRB->fSetBuilder->sawBOF()) {
95 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
46f4442e
A
97 // Delete and exit if memory allocation failed.
98 if (bofTop == NULL || bofLeaf == NULL) {
99 *fStatus = U_MEMORY_ALLOCATION_ERROR;
100 delete bofTop;
101 delete bofLeaf;
102 return;
103 }
73c04bcf
A
104 bofTop->fLeftChild = bofLeaf;
105 bofTop->fRightChild = fTree;
106 bofLeaf->fParent = bofTop;
107 bofLeaf->fVal = 2; // Reserved value for {bof}.
108 fTree = bofTop;
109 }
b75a7d8f
A
110
111 //
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
115 //
116 RBBINode *cn = new RBBINode(RBBINode::opCat);
46f4442e
A
117 // Exit if memory allocation failed.
118 if (cn == NULL) {
119 *fStatus = U_MEMORY_ALLOCATION_ERROR;
120 return;
121 }
b75a7d8f
A
122 cn->fLeftChild = fTree;
123 fTree->fParent = cn;
124 cn->fRightChild = new RBBINode(RBBINode::endMark);
46f4442e
A
125 // Delete and exit if memory allocation failed.
126 if (cn->fRightChild == NULL) {
127 *fStatus = U_MEMORY_ALLOCATION_ERROR;
128 delete cn;
129 return;
130 }
b75a7d8f
A
131 cn->fRightChild->fParent = cn;
132 fTree = cn;
133
134 //
135 // Replace all references to UnicodeSets with the tree for the equivalent
136 // expression.
137 //
138 fTree->flattenSets();
73c04bcf 139#ifdef RBBI_DEBUG
b75a7d8f 140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
2ca993e8 141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
f3c0d7a5 142 RBBINode::printTree(fTree, TRUE);
b75a7d8f 143 }
73c04bcf 144#endif
b75a7d8f
A
145
146
147 //
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
153 //
154 calcNullable(fTree);
155 calcFirstPos(fTree);
156 calcLastPos(fTree);
157 calcFollowPos(fTree);
158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
374ca955 159 RBBIDebugPuts("\n");
b75a7d8f
A
160 printPosSets(fTree);
161 }
162
374ca955
A
163 //
164 // For "chained" rules, modify the followPos sets
165 //
166 if (fRB->fChainRules) {
167 calcChainedFollowPos(fTree);
168 }
169
73c04bcf
A
170 //
171 // BOF (start of input) test fixup.
172 //
173 if (fRB->fSetBuilder->sawBOF()) {
174 bofFixup();
175 }
176
b75a7d8f
A
177 //
178 // Build the DFA state transition tables.
179 //
180 buildStateTable();
181 flagAcceptingStates();
182 flagLookAheadStates();
183 flagTaggedStates();
b75a7d8f 184
374ca955
A
185 //
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
189 //
190 mergeRuleStatusVals();
b75a7d8f
A
191}
192
193
194
195//-----------------------------------------------------------------------------
196//
197// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198//
199//-----------------------------------------------------------------------------
200void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237}
238
239
240
241
242//-----------------------------------------------------------------------------
243//
244// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245//
246//-----------------------------------------------------------------------------
247void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
73c04bcf
A
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
b75a7d8f
A
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284}
285
286
287
288//-----------------------------------------------------------------------------
289//
290// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291//
292//-----------------------------------------------------------------------------
293void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
73c04bcf
A
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
b75a7d8f
A
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330}
331
332
333
334//-----------------------------------------------------------------------------
335//
336// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337//
338//-----------------------------------------------------------------------------
339void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376}
377
2ca993e8
A
378//-----------------------------------------------------------------------------
379//
380// addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381// as roots of a rule to a destination vector.
382//
383//-----------------------------------------------------------------------------
384void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385 if (node == NULL || U_FAILURE(*fStatus)) {
386 return;
387 }
388 if (node->fRuleRoot) {
389 dest->addElement(node, *fStatus);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
392 return;
393 }
394 addRuleRootNodes(dest, node->fLeftChild);
395 addRuleRootNodes(dest, node->fRightChild);
396}
b75a7d8f 397
374ca955
A
398//-----------------------------------------------------------------------------
399//
400// calcChainedFollowPos. Modify the previously calculated followPos sets
401// to implement rule chaining. NOT described by Aho
402//
403//-----------------------------------------------------------------------------
404void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405
406 UVector endMarkerNodes(*fStatus);
407 UVector leafNodes(*fStatus);
408 int32_t i;
409
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list of all endmarker nodes.
415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416
73c04bcf 417 // get a list all leaf nodes
374ca955
A
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
2ca993e8
A
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
374ca955 429
2ca993e8
A
430 UVector matchStartNodes(*fStatus);
431 for (int i=0; i<ruleRootNodes.size(); ++i) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
374ca955 440
374ca955
A
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 RBBINode *endNode = NULL;
447
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i=0; i<endMarkerNodes.size(); i++) {
451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452 endNode = tNode;
453 break;
454 }
455 }
456 if (endNode == NULL) {
457 // node wasn't an end node. Try again with the next.
458 continue;
459 }
460
461 // We've got a node that can end a match.
462
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
f3c0d7a5 467 if (fRB->fLBCMNoChain) {
374ca955 468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
73c04bcf
A
469 if (c != -1) {
470 // c == -1 occurs with sets containing only the {eof} marker string.
f3c0d7a5
A
471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472 if (cLBProp == U_LB_COMBINING_MARK) {
473 continue;
73c04bcf 474 }
374ca955
A
475 }
476 }
477
478
479 // Now iterate over the nodes that can start a match, looking for ones
480 // with the same char class as our ending node.
481 RBBINode *startNode;
2ca993e8
A
482 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
483 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
374ca955
A
484 if (startNode->fType != RBBINode::leafChar) {
485 continue;
486 }
487
488 if (endNode->fVal == startNode->fVal) {
489 // The end val (character class) of one possible match is the
490 // same as the start of another.
491
492 // Add all nodes from the followPos of the start node to the
493 // followPos set of the end node, which will have the effect of
494 // letting matches transition from a match state at endNode
495 // to the second char of a match starting with startNode.
496 setAdd(endNode->fFollowPos, startNode->fFollowPos);
497 }
498 }
499 }
500}
501
502
73c04bcf
A
503//-----------------------------------------------------------------------------
504//
505// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
506// Do an swizzle similar to chaining, modifying the followPos set of
507// the bofNode to include the followPos nodes from other {bot} nodes
508// scattered through the tree.
509//
510// This function has much in common with calcChainedFollowPos().
511//
512//-----------------------------------------------------------------------------
513void RBBITableBuilder::bofFixup() {
514
515 if (U_FAILURE(*fStatus)) {
516 return;
517 }
518
519 // The parse tree looks like this ...
520 // fTree root ---> <cat>
521 // / \ .
522 // <cat> <#end node>
523 // / \ .
524 // <bofNode> rest
525 // of tree
526 //
527 // We will be adding things to the followPos set of the <bofNode>
528 //
529 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
530 U_ASSERT(bofNode->fType == RBBINode::leafChar);
531 U_ASSERT(bofNode->fVal == 2);
532
533 // Get all nodes that can be the start a match of the user-written rules
534 // (excluding the fake bofNode)
535 // We want the nodes that can start a match in the
536 // part labeled "rest of tree"
537 //
538 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539
540 RBBINode *startNode;
541 int startNodeIx;
542 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544 if (startNode->fType != RBBINode::leafChar) {
545 continue;
546 }
547
548 if (startNode->fVal == bofNode->fVal) {
549 // We found a leaf node corresponding to a {bof} that was
550 // explicitly written into a rule.
551 // Add everything from the followPos set of this node to the
552 // followPos set of the fake bofNode at the start of the tree.
553 //
554 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555 }
556 }
557}
558
b75a7d8f
A
559//-----------------------------------------------------------------------------
560//
561// buildStateTable() Determine the set of runtime DFA states and the
562// transition tables for these states, by the algorithm
563// of fig. 3.44 in Aho.
564//
565// Most of the comments are quotes of Aho's psuedo-code.
566//
567//-----------------------------------------------------------------------------
568void RBBITableBuilder::buildStateTable() {
374ca955
A
569 if (U_FAILURE(*fStatus)) {
570 return;
571 }
46f4442e
A
572 RBBIStateDescriptor *failState;
573 // Set it to NULL to avoid uninitialized warning
574 RBBIStateDescriptor *initialState = NULL;
b75a7d8f
A
575 //
576 // Add a dummy state 0 - the stop state. Not from Aho.
577 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
46f4442e
A
578 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579 if (failState == NULL) {
580 *fStatus = U_MEMORY_ALLOCATION_ERROR;
581 goto ExitBuildSTdeleteall;
582 }
b75a7d8f 583 failState->fPositions = new UVector(*fStatus);
46f4442e
A
584 if (failState->fPositions == NULL) {
585 *fStatus = U_MEMORY_ALLOCATION_ERROR;
586 }
587 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588 goto ExitBuildSTdeleteall;
374ca955 589 }
b75a7d8f 590 fDStates->addElement(failState, *fStatus);
374ca955 591 if (U_FAILURE(*fStatus)) {
46f4442e 592 goto ExitBuildSTdeleteall;
374ca955 593 }
b75a7d8f
A
594
595 // initially, the only unmarked state in Dstates is firstpos(root),
596 // where toot is the root of the syntax tree for (r)#;
46f4442e
A
597 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598 if (initialState == NULL) {
599 *fStatus = U_MEMORY_ALLOCATION_ERROR;
600 }
374ca955 601 if (U_FAILURE(*fStatus)) {
46f4442e 602 goto ExitBuildSTdeleteall;
374ca955 603 }
b75a7d8f 604 initialState->fPositions = new UVector(*fStatus);
46f4442e
A
605 if (initialState->fPositions == NULL) {
606 *fStatus = U_MEMORY_ALLOCATION_ERROR;
607 }
374ca955 608 if (U_FAILURE(*fStatus)) {
46f4442e 609 goto ExitBuildSTdeleteall;
374ca955 610 }
b75a7d8f
A
611 setAdd(initialState->fPositions, fTree->fFirstPosSet);
612 fDStates->addElement(initialState, *fStatus);
374ca955 613 if (U_FAILURE(*fStatus)) {
46f4442e 614 goto ExitBuildSTdeleteall;
374ca955 615 }
b75a7d8f
A
616
617 // while there is an unmarked state T in Dstates do begin
618 for (;;) {
619 RBBIStateDescriptor *T = NULL;
620 int32_t tx;
621 for (tx=1; tx<fDStates->size(); tx++) {
622 RBBIStateDescriptor *temp;
623 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624 if (temp->fMarked == FALSE) {
625 T = temp;
626 break;
627 }
628 }
629 if (T == NULL) {
630 break;
631 }
632
633 // mark T;
634 T->fMarked = TRUE;
635
636 // for each input symbol a do begin
637 int32_t a;
638 for (a = 1; a<=lastInputSymbol; a++) {
639 // let U be the set of positions that are in followpos(p)
640 // for some position p in T
641 // such that the symbol at position p is a;
642 UVector *U = NULL;
643 RBBINode *p;
644 int32_t px;
645 for (px=0; px<T->fPositions->size(); px++) {
646 p = (RBBINode *)T->fPositions->elementAt(px);
647 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
648 if (U == NULL) {
649 U = new UVector(*fStatus);
46f4442e
A
650 if (U == NULL) {
651 *fStatus = U_MEMORY_ALLOCATION_ERROR;
652 goto ExitBuildSTdeleteall;
653 }
b75a7d8f
A
654 }
655 setAdd(U, p->fFollowPos);
656 }
657 }
658
659 // if U is not empty and not in DStates then
660 int32_t ux = 0;
661 UBool UinDstates = FALSE;
662 if (U != NULL) {
663 U_ASSERT(U->size() > 0);
664 int ix;
665 for (ix=0; ix<fDStates->size(); ix++) {
666 RBBIStateDescriptor *temp2;
667 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668 if (setEquals(U, temp2->fPositions)) {
669 delete U;
670 U = temp2->fPositions;
671 ux = ix;
672 UinDstates = TRUE;
673 break;
674 }
675 }
676
677 // Add U as an unmarked state to Dstates
678 if (!UinDstates)
679 {
680 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
46f4442e
A
681 if (newState == NULL) {
682 *fStatus = U_MEMORY_ALLOCATION_ERROR;
683 }
374ca955 684 if (U_FAILURE(*fStatus)) {
46f4442e 685 goto ExitBuildSTdeleteall;
374ca955 686 }
b75a7d8f
A
687 newState->fPositions = U;
688 fDStates->addElement(newState, *fStatus);
374ca955
A
689 if (U_FAILURE(*fStatus)) {
690 return;
691 }
b75a7d8f
A
692 ux = fDStates->size()-1;
693 }
694
695 // Dtran[T, a] := U;
696 T->fDtran->setElementAt(ux, a);
697 }
698 }
699 }
46f4442e
A
700 return;
701 // delete local pointers only if error occured.
702ExitBuildSTdeleteall:
703 delete initialState;
704 delete failState;
b75a7d8f
A
705}
706
707
708
709//-----------------------------------------------------------------------------
710//
711// flagAcceptingStates Identify accepting states.
712// First get a list of all of the end marker nodes.
713// Then, for each state s,
714// if s contains one of the end marker nodes in its list of tree positions then
715// s is an accepting state.
716//
717//-----------------------------------------------------------------------------
718void RBBITableBuilder::flagAcceptingStates() {
374ca955
A
719 if (U_FAILURE(*fStatus)) {
720 return;
721 }
b75a7d8f
A
722 UVector endMarkerNodes(*fStatus);
723 RBBINode *endMarker;
724 int32_t i;
725 int32_t n;
726
374ca955
A
727 if (U_FAILURE(*fStatus)) {
728 return;
729 }
730
b75a7d8f 731 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
374ca955
A
732 if (U_FAILURE(*fStatus)) {
733 return;
734 }
b75a7d8f
A
735
736 for (i=0; i<endMarkerNodes.size(); i++) {
737 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738 for (n=0; n<fDStates->size(); n++) {
739 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740 if (sd->fPositions->indexOf(endMarker) >= 0) {
741 // Any non-zero value for fAccepting means this is an accepting node.
742 // The value is what will be returned to the user as the break status.
743 // If no other value was specified, force it to -1.
73c04bcf
A
744
745 if (sd->fAccepting==0) {
746 // State hasn't been marked as accepting yet. Do it now.
747 sd->fAccepting = endMarker->fVal;
748 if (sd->fAccepting == 0) {
749 sd->fAccepting = -1;
750 }
751 }
752 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753 // Both lookahead and non-lookahead accepting for this state.
754 // Favor the look-ahead. Expedient for line break.
755 // TODO: need a more elegant resolution for conflicting rules.
756 sd->fAccepting = endMarker->fVal;
b75a7d8f 757 }
73c04bcf
A
758 // implicit else:
759 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
b75a7d8f
A
760
761 // If the end marker node is from a look-ahead rule, set
0f5d89e8 762 // the fLookAhead field for this state also.
b75a7d8f 763 if (endMarker->fLookAheadEnd) {
73c04bcf
A
764 // TODO: don't change value if already set?
765 // TODO: allow for more than one active look-ahead rule in engine.
766 // Make value here an index to a side array in engine?
b75a7d8f
A
767 sd->fLookAhead = sd->fAccepting;
768 }
769 }
770 }
771 }
772}
773
774
775//-----------------------------------------------------------------------------
776//
777// flagLookAheadStates Very similar to flagAcceptingStates, above.
778//
779//-----------------------------------------------------------------------------
780void RBBITableBuilder::flagLookAheadStates() {
374ca955
A
781 if (U_FAILURE(*fStatus)) {
782 return;
783 }
b75a7d8f
A
784 UVector lookAheadNodes(*fStatus);
785 RBBINode *lookAheadNode;
786 int32_t i;
787 int32_t n;
788
789 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
374ca955
A
790 if (U_FAILURE(*fStatus)) {
791 return;
792 }
b75a7d8f
A
793 for (i=0; i<lookAheadNodes.size(); i++) {
794 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795
796 for (n=0; n<fDStates->size(); n++) {
797 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799 sd->fLookAhead = lookAheadNode->fVal;
800 }
801 }
802 }
803}
804
805
806
807
808//-----------------------------------------------------------------------------
809//
810// flagTaggedStates
811//
812//-----------------------------------------------------------------------------
813void RBBITableBuilder::flagTaggedStates() {
374ca955
A
814 if (U_FAILURE(*fStatus)) {
815 return;
816 }
b75a7d8f
A
817 UVector tagNodes(*fStatus);
818 RBBINode *tagNode;
819 int32_t i;
820 int32_t n;
821
374ca955
A
822 if (U_FAILURE(*fStatus)) {
823 return;
824 }
b75a7d8f 825 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
374ca955
A
826 if (U_FAILURE(*fStatus)) {
827 return;
828 }
b75a7d8f
A
829 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
830 tagNode = (RBBINode *)tagNodes.elementAt(i);
73c04bcf 831
b75a7d8f
A
832 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
833 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
374ca955 835 sortedAdd(&sd->fTagVals, tagNode->fVal);
b75a7d8f
A
836 }
837 }
838 }
839}
374ca955
A
840
841
842
843
844//-----------------------------------------------------------------------------
845//
846// mergeRuleStatusVals
847//
848// Update the global table of rule status {tag} values
849// The rule builder has a global vector of status values that are common
850// for all tables. Merge the ones from this table into the global set.
851//
852//-----------------------------------------------------------------------------
853void RBBITableBuilder::mergeRuleStatusVals() {
854 //
855 // The basic outline of what happens here is this...
856 //
857 // for each state in this state table
858 // if the status tag list for this state is in the global statuses list
859 // record where and
860 // continue with the next state
861 // else
862 // add the tag list for this state to the global list.
863 //
864 int i;
865 int n;
866
867 // Pre-set a single tag of {0} into the table.
868 // We will need this as a default, for rule sets with no explicit tagging.
869 if (fRB->fRuleStatusVals->size() == 0) {
870 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
871 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
872 }
73c04bcf
A
873
874 // For each state
875 for (n=0; n<fDStates->size(); n++) {
374ca955
A
876 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877 UVector *thisStatesTagValues = sd->fTagVals;
878 if (thisStatesTagValues == NULL) {
879 // No tag values are explicitly associated with this state.
880 // Set the default tag value.
881 sd->fTagsIdx = 0;
882 continue;
883 }
884
885 // There are tag(s) associated with this state.
886 // fTagsIdx will be the index into the global tag list for this state's tag values.
887 // Initial value of -1 flags that we haven't got it set yet.
888 sd->fTagsIdx = -1;
889 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
890 int32_t nextTagGroupStart = 0;
73c04bcf 891
374ca955
A
892 // Loop runs once per group of tags in the global list
893 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894 thisTagGroupStart = nextTagGroupStart;
895 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897 // The number of tags for this state is different from
898 // the number of tags in this group from the global list.
899 // Continue with the next group from the global list.
900 continue;
901 }
902 // The lengths match, go ahead and compare the actual tag values
903 // between this state and the group from the global list.
904 for (i=0; i<thisStatesTagValues->size(); i++) {
73c04bcf 905 if (thisStatesTagValues->elementAti(i) !=
374ca955 906 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
73c04bcf 907 // Mismatch.
374ca955
A
908 break;
909 }
910 }
73c04bcf 911
374ca955
A
912 if (i == thisStatesTagValues->size()) {
913 // We found a set of tag values in the global list that match
914 // those for this state. Use them.
915 sd->fTagsIdx = thisTagGroupStart;
73c04bcf 916 break;
374ca955
A
917 }
918 }
73c04bcf 919
374ca955
A
920 if (sd->fTagsIdx == -1) {
921 // No suitable entry in the global tag list already. Add one
922 sd->fTagsIdx = fRB->fRuleStatusVals->size();
923 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924 for (i=0; i<thisStatesTagValues->size(); i++) {
925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926 }
927 }
928 }
929}
930
931
932
933
934
935
936
937//-----------------------------------------------------------------------------
938//
939// sortedAdd Add a value to a vector of sorted values (ints).
940// Do not replicate entries; if the value is already there, do not
941// add a second one.
942// Lazily create the vector if it does not already exist.
943//
944//-----------------------------------------------------------------------------
945void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
946 int32_t i;
947
948 if (*vector == NULL) {
949 *vector = new UVector(*fStatus);
950 }
951 if (*vector == NULL || U_FAILURE(*fStatus)) {
952 return;
953 }
954 UVector *vec = *vector;
955 int32_t vSize = vec->size();
956 for (i=0; i<vSize; i++) {
957 int32_t valAtI = vec->elementAti(i);
958 if (valAtI == val) {
959 // The value is already in the vector. Don't add it again.
960 return;
961 }
962 if (valAtI > val) {
963 break;
964 }
965 }
966 vec->insertElementAt(val, i, *fStatus);
b75a7d8f
A
967}
968
969
970
971//-----------------------------------------------------------------------------
972//
973// setAdd Set operation on UVector
974// dest = dest union source
73c04bcf 975// Elements may only appear once and must be sorted.
b75a7d8f
A
976//
977//-----------------------------------------------------------------------------
978void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
46f4442e
A
979 int32_t destOriginalSize = dest->size();
980 int32_t sourceSize = source->size();
73c04bcf 981 int32_t di = 0;
729e4ab9
A
982 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
983 void **destPtr, **sourcePtr;
73c04bcf 984 void **destLim, **sourceLim;
b75a7d8f 985
729e4ab9
A
986 if (destOriginalSize > destArray.getCapacity()) {
987 if (destArray.resize(destOriginalSize) == NULL) {
988 return;
989 }
73c04bcf 990 }
729e4ab9
A
991 destPtr = destArray.getAlias();
992 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
73c04bcf 993
729e4ab9
A
994 if (sourceSize > sourceArray.getCapacity()) {
995 if (sourceArray.resize(sourceSize) == NULL) {
996 return;
73c04bcf 997 }
73c04bcf 998 }
729e4ab9
A
999 sourcePtr = sourceArray.getAlias();
1000 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
73c04bcf
A
1001
1002 // Avoid multiple "get element" calls by getting the contents into arrays
729e4ab9
A
1003 (void) dest->toArray(destPtr);
1004 (void) source->toArray(sourcePtr);
73c04bcf 1005
46f4442e 1006 dest->setSize(sourceSize+destOriginalSize, *fStatus);
73c04bcf 1007
729e4ab9
A
1008 while (sourcePtr < sourceLim && destPtr < destLim) {
1009 if (*destPtr == *sourcePtr) {
1010 dest->setElementAt(*sourcePtr++, di++);
1011 destPtr++;
73c04bcf 1012 }
46f4442e
A
1013 // This check is required for machines with segmented memory, like i5/OS.
1014 // Direct pointer comparison is not recommended.
729e4ab9
A
1015 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1016 dest->setElementAt(*destPtr++, di++);
46f4442e 1017 }
729e4ab9
A
1018 else { /* *sourcePtr < *destPtr */
1019 dest->setElementAt(*sourcePtr++, di++);
b75a7d8f 1020 }
73c04bcf
A
1021 }
1022
1023 // At most one of these two cleanup loops will execute
729e4ab9
A
1024 while (destPtr < destLim) {
1025 dest->setElementAt(*destPtr++, di++);
73c04bcf 1026 }
729e4ab9
A
1027 while (sourcePtr < sourceLim) {
1028 dest->setElementAt(*sourcePtr++, di++);
73c04bcf
A
1029 }
1030
46f4442e 1031 dest->setSize(di, *fStatus);
b75a7d8f
A
1032}
1033
1034
374ca955 1035
b75a7d8f
A
1036//-----------------------------------------------------------------------------
1037//
1038// setEqual Set operation on UVector.
1039// Compare for equality.
73c04bcf 1040// Elements must be sorted.
b75a7d8f
A
1041//
1042//-----------------------------------------------------------------------------
1043UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
73c04bcf 1044 return a->equals(*b);
b75a7d8f
A
1045}
1046
1047
1048//-----------------------------------------------------------------------------
1049//
1050// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1051// for each node in the tree.
1052//
1053//-----------------------------------------------------------------------------
b75a7d8f 1054#ifdef RBBI_DEBUG
374ca955 1055void RBBITableBuilder::printPosSets(RBBINode *n) {
b75a7d8f
A
1056 if (n==NULL) {
1057 return;
1058 }
2ca993e8
A
1059 printf("\n");
1060 RBBINode::printNodeHeader();
f3c0d7a5 1061 RBBINode::printNode(n);
b75a7d8f
A
1062 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1063
1064 RBBIDebugPrintf(" firstpos: ");
1065 printSet(n->fFirstPosSet);
1066
1067 RBBIDebugPrintf(" lastpos: ");
1068 printSet(n->fLastPosSet);
1069
1070 RBBIDebugPrintf(" followpos: ");
1071 printSet(n->fFollowPos);
1072
1073 printPosSets(n->fLeftChild);
1074 printPosSets(n->fRightChild);
b75a7d8f 1075}
374ca955 1076#endif
b75a7d8f 1077
0f5d89e8
A
1078//
1079// findDuplCharClassFrom()
1080//
1081bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1082 int32_t numStates = fDStates->size();
1083 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1084
1085 uint16_t table_base;
1086 uint16_t table_dupl;
1087 for (; categories->first < numCols-1; categories->first++) {
1088 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1089 for (int32_t state=0; state<numStates; state++) {
1090 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1091 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1092 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1093 if (table_base != table_dupl) {
1094 break;
1095 }
1096 }
1097 if (table_base == table_dupl) {
1098 return true;
1099 }
1100 }
1101 }
1102 return false;
1103}
1104
1105
1106//
1107// removeColumn()
1108//
1109void RBBITableBuilder::removeColumn(int32_t column) {
1110 int32_t numStates = fDStates->size();
1111 for (int32_t state=0; state<numStates; state++) {
1112 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1113 U_ASSERT(column < sd->fDtran->size());
1114 sd->fDtran->removeElementAt(column);
1115 }
1116}
1117
1118/*
1119 * findDuplicateState
1120 */
1121bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1122 int32_t numStates = fDStates->size();
1123 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1124
1125 for (; states->first<numStates-1; states->first++) {
1126 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1127 for (states->second=states->first+1; states->second<numStates; states->second++) {
1128 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1129 if (firstSD->fAccepting != duplSD->fAccepting ||
1130 firstSD->fLookAhead != duplSD->fLookAhead ||
1131 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1132 continue;
1133 }
1134 bool rowsMatch = true;
1135 for (int32_t col=0; col < numCols; ++col) {
1136 int32_t firstVal = firstSD->fDtran->elementAti(col);
1137 int32_t duplVal = duplSD->fDtran->elementAti(col);
1138 if (!((firstVal == duplVal) ||
1139 ((firstVal == states->first || firstVal == states->second) &&
1140 (duplVal == states->first || duplVal == states->second)))) {
1141 rowsMatch = false;
1142 break;
1143 }
1144 }
1145 if (rowsMatch) {
1146 return true;
1147 }
1148 }
1149 }
1150 return false;
1151}
1152
1153
1154bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1155 int32_t numStates = fSafeTable->size();
1156
1157 for (; states->first<numStates-1; states->first++) {
1158 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1159 for (states->second=states->first+1; states->second<numStates; states->second++) {
1160 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1161 bool rowsMatch = true;
1162 int32_t numCols = firstRow->length();
1163 for (int32_t col=0; col < numCols; ++col) {
1164 int32_t firstVal = firstRow->charAt(col);
1165 int32_t duplVal = duplRow->charAt(col);
1166 if (!((firstVal == duplVal) ||
1167 ((firstVal == states->first || firstVal == states->second) &&
1168 (duplVal == states->first || duplVal == states->second)))) {
1169 rowsMatch = false;
1170 break;
1171 }
1172 }
1173 if (rowsMatch) {
1174 return true;
1175 }
1176 }
1177 }
1178 return false;
1179}
1180
1181
1182void RBBITableBuilder::removeState(IntPair duplStates) {
1183 const int32_t keepState = duplStates.first;
1184 const int32_t duplState = duplStates.second;
1185 U_ASSERT(keepState < duplState);
1186 U_ASSERT(duplState < fDStates->size());
1187
1188 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1189 fDStates->removeElementAt(duplState);
1190 delete duplSD;
1191
1192 int32_t numStates = fDStates->size();
1193 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1194 for (int32_t state=0; state<numStates; ++state) {
1195 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1196 for (int32_t col=0; col<numCols; col++) {
1197 int32_t existingVal = sd->fDtran->elementAti(col);
1198 int32_t newVal = existingVal;
1199 if (existingVal == duplState) {
1200 newVal = keepState;
1201 } else if (existingVal > duplState) {
1202 newVal = existingVal - 1;
1203 }
1204 sd->fDtran->setElementAt(newVal, col);
1205 }
1206 if (sd->fAccepting == duplState) {
1207 sd->fAccepting = keepState;
1208 } else if (sd->fAccepting > duplState) {
1209 sd->fAccepting--;
1210 }
1211 if (sd->fLookAhead == duplState) {
1212 sd->fLookAhead = keepState;
1213 } else if (sd->fLookAhead > duplState) {
1214 sd->fLookAhead--;
1215 }
1216 }
1217}
1218
1219void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1220 const int32_t keepState = duplStates.first;
1221 const int32_t duplState = duplStates.second;
1222 U_ASSERT(keepState < duplState);
1223 U_ASSERT(duplState < fSafeTable->size());
1224
1225 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1226 // and will auto-delete the removed element.
1227 int32_t numStates = fSafeTable->size();
1228 for (int32_t state=0; state<numStates; ++state) {
1229 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1230 int32_t numCols = sd->length();
1231 for (int32_t col=0; col<numCols; col++) {
1232 int32_t existingVal = sd->charAt(col);
1233 int32_t newVal = existingVal;
1234 if (existingVal == duplState) {
1235 newVal = keepState;
1236 } else if (existingVal > duplState) {
1237 newVal = existingVal - 1;
1238 }
1239 sd->setCharAt(col, newVal);
1240 }
1241 }
1242}
1243
1244
1245/*
1246 * RemoveDuplicateStates
1247 */
1248void RBBITableBuilder::removeDuplicateStates() {
1249 IntPair dupls = {3, 0};
1250 while (findDuplicateState(&dupls)) {
1251 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1252 removeState(dupls);
1253 }
1254}
b75a7d8f
A
1255
1256
1257//-----------------------------------------------------------------------------
1258//
1259// getTableSize() Calculate the size of the runtime form of this
1260// state transition table.
1261//
1262//-----------------------------------------------------------------------------
374ca955 1263int32_t RBBITableBuilder::getTableSize() const {
b75a7d8f
A
1264 int32_t size = 0;
1265 int32_t numRows;
1266 int32_t numCols;
1267 int32_t rowSize;
1268
1269 if (fTree == NULL) {
1270 return 0;
1271 }
1272
0f5d89e8 1273 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
b75a7d8f
A
1274
1275 numRows = fDStates->size();
1276 numCols = fRB->fSetBuilder->getNumCharCategories();
1277
0f5d89e8 1278 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
b75a7d8f
A
1279 size += numRows * rowSize;
1280 return size;
1281}
1282
1283
b75a7d8f
A
1284//-----------------------------------------------------------------------------
1285//
1286// exportTable() export the state transition table in the format required
1287// by the runtime engine. getTableSize() bytes of memory
1288// must be available at the output address "where".
1289//
1290//-----------------------------------------------------------------------------
1291void RBBITableBuilder::exportTable(void *where) {
1292 RBBIStateTable *table = (RBBIStateTable *)where;
1293 uint32_t state;
1294 int col;
1295
1296 if (U_FAILURE(*fStatus) || fTree == NULL) {
1297 return;
1298 }
1299
0f5d89e8
A
1300 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1301 if (catCount > 0x7fff ||
b75a7d8f
A
1302 fDStates->size() > 0x7fff) {
1303 *fStatus = U_BRK_INTERNAL_ERROR;
1304 return;
1305 }
1306
0f5d89e8 1307 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
b75a7d8f 1308 table->fNumStates = fDStates->size();
374ca955
A
1309 table->fFlags = 0;
1310 if (fRB->fLookAheadHardBreak) {
1311 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1312 }
73c04bcf
A
1313 if (fRB->fSetBuilder->sawBOF()) {
1314 table->fFlags |= RBBI_BOF_REQUIRED;
1315 }
374ca955 1316 table->fReserved = 0;
b75a7d8f
A
1317
1318 for (state=0; state<table->fNumStates; state++) {
1319 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1320 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1321 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1322 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1323 row->fAccepting = (int16_t)sd->fAccepting;
1324 row->fLookAhead = (int16_t)sd->fLookAhead;
374ca955 1325 row->fTagIdx = (int16_t)sd->fTagsIdx;
0f5d89e8 1326 for (col=0; col<catCount; col++) {
b75a7d8f
A
1327 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1328 }
1329 }
1330}
1331
1332
0f5d89e8
A
1333/**
1334 * Synthesize a safe state table from the main state table.
1335 */
1336void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1337 // The safe table creation has three steps:
1338
1339 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1340 // following the pair do not depend on context or state before the pair. To test
1341 // whether a pair is safe, run it through the main forward state table, starting
1342 // from each state. If the the final state is the same, no matter what the starting state,
1343 // the pair is safe.
1344 //
1345 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1346 // forward table, with a column for each input character [class], and a row for
1347 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1348 // create an additional state for each input character category; being in
1349 // one of these states means that the character has been seen, and is potentially
1350 // the first of a pair. In each of these rows, the entry for the second character
1351 // of a safe pair is set to the stop state (0), indicating that a match was found.
1352 // All other table entries are set to the state corresponding the current input
1353 // character, allowing that charcter to be the of a start following pair.
1354 //
1355 // Because the safe rules are to be run in reverse, moving backwards in the text,
1356 // the first and second pair categories are swapped when building the table.
1357 //
1358 // 3. Compress the table. There are typically many rows (states) that are
1359 // equivalent - that have zeroes (match completed) in the same columns -
1360 // and can be folded together.
1361
1362 // Each safe pair is stored as two UChars in the safePair string.
1363 UnicodeString safePairs;
1364
1365 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1366 int32_t numStates = fDStates->size();
1367
1368 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1369 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1370 int32_t wantedEndState = -1;
1371 int32_t endState = 0;
1372 for (int32_t startState = 1; startState < numStates; ++startState) {
1373 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1374 int32_t s2 = startStateD->fDtran->elementAti(c1);
1375 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1376 endState = s2StateD->fDtran->elementAti(c2);
1377 if (wantedEndState < 0) {
1378 wantedEndState = endState;
1379 } else {
1380 if (wantedEndState != endState) {
1381 break;
1382 }
1383 }
1384 }
1385 if (wantedEndState == endState) {
1386 safePairs.append((char16_t)c1);
1387 safePairs.append((char16_t)c2);
1388 // printf("(%d, %d) ", c1, c2);
1389 }
1390 }
1391 // printf("\n");
1392 }
1393
1394 // Populate the initial safe table.
1395 // The table as a whole is UVector<UnicodeString>
1396 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1397 // Row 0 is the stop state.
1398 // Row 1 is the start sate.
1399 // Row 2 and beyond are other states, initially one per char class, but
1400 // after initial construction, many of the states will be combined, compacting the table.
1401 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1402 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1403
1404 U_ASSERT(fSafeTable == nullptr);
1405 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1406 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1407 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1408 }
1409
1410 // From the start state, each input char class transitions to the state for that input.
1411 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1412 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1413 // Note: +2 for the start & stop state.
1414 startState.setCharAt(charClass, charClass+2);
1415 }
1416
1417 // Initially make every other state table row look like the start state row,
1418 for (int32_t row=2; row<numCharClasses+2; ++row) {
1419 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1420 rowState = startState; // UnicodeString assignment, copies contents.
1421 }
1422
1423 // Run through the safe pairs, set the next state to zero when pair has been seen.
1424 // Zero being the stop state, meaning we found a safe point.
1425 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1426 int32_t c1 = safePairs.charAt(pairIdx);
1427 int32_t c2 = safePairs.charAt(pairIdx + 1);
1428
1429 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1430 rowState.setCharAt(c1, 0);
1431 }
1432
1433 // Remove duplicate or redundant rows from the table.
1434 IntPair states = {1, 0};
1435 while (findDuplicateSafeState(&states)) {
1436 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1437 removeSafeState(states);
1438 }
1439}
1440
1441
1442//-----------------------------------------------------------------------------
1443//
1444// getSafeTableSize() Calculate the size of the runtime form of this
1445// safe state table.
1446//
1447//-----------------------------------------------------------------------------
1448int32_t RBBITableBuilder::getSafeTableSize() const {
1449 int32_t size = 0;
1450 int32_t numRows;
1451 int32_t numCols;
1452 int32_t rowSize;
1453
1454 if (fSafeTable == nullptr) {
1455 return 0;
1456 }
1457
1458 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1459
1460 numRows = fSafeTable->size();
1461 numCols = fRB->fSetBuilder->getNumCharCategories();
1462
1463 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1464 size += numRows * rowSize;
1465 return size;
1466}
1467
1468
1469//-----------------------------------------------------------------------------
1470//
1471// exportSafeTable() export the state transition table in the format required
1472// by the runtime engine. getTableSize() bytes of memory
1473// must be available at the output address "where".
1474//
1475//-----------------------------------------------------------------------------
1476void RBBITableBuilder::exportSafeTable(void *where) {
1477 RBBIStateTable *table = (RBBIStateTable *)where;
1478 uint32_t state;
1479 int col;
1480
1481 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1482 return;
1483 }
1484
1485 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1486 if (catCount > 0x7fff ||
1487 fSafeTable->size() > 0x7fff) {
1488 *fStatus = U_BRK_INTERNAL_ERROR;
1489 return;
1490 }
1491
1492 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1493 table->fNumStates = fSafeTable->size();
1494 table->fFlags = 0;
1495 table->fReserved = 0;
1496
1497 for (state=0; state<table->fNumStates; state++) {
1498 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1499 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1500 row->fAccepting = 0;
1501 row->fLookAhead = 0;
1502 row->fTagIdx = 0;
1503 row->fReserved = 0;
1504 for (col=0; col<catCount; col++) {
1505 row->fNextState[col] = rowString->charAt(col);
1506 }
1507 }
1508}
1509
1510
1511
b75a7d8f
A
1512
1513//-----------------------------------------------------------------------------
1514//
1515// printSet Debug function. Print the contents of a UVector
1516//
1517//-----------------------------------------------------------------------------
b75a7d8f 1518#ifdef RBBI_DEBUG
374ca955 1519void RBBITableBuilder::printSet(UVector *s) {
b75a7d8f
A
1520 int32_t i;
1521 for (i=0; i<s->size(); i++) {
2ca993e8
A
1522 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1523 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
b75a7d8f
A
1524 }
1525 RBBIDebugPrintf("\n");
b75a7d8f 1526}
374ca955 1527#endif
b75a7d8f
A
1528
1529
1530//-----------------------------------------------------------------------------
1531//
1532// printStates Debug Function. Dump the fully constructed state transition table.
1533//
1534//-----------------------------------------------------------------------------
b75a7d8f 1535#ifdef RBBI_DEBUG
374ca955 1536void RBBITableBuilder::printStates() {
b75a7d8f
A
1537 int c; // input "character"
1538 int n; // state number
1539
1540 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1541 RBBIDebugPrintf(" | Acc LA Tag");
374ca955
A
1542 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1543 RBBIDebugPrintf(" %2d", c);
1544 }
b75a7d8f
A
1545 RBBIDebugPrintf("\n");
1546 RBBIDebugPrintf(" |---------------");
374ca955
A
1547 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548 RBBIDebugPrintf("---");
1549 }
b75a7d8f
A
1550 RBBIDebugPrintf("\n");
1551
1552 for (n=0; n<fDStates->size(); n++) {
1553 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1554 RBBIDebugPrintf(" %3d | " , n);
374ca955 1555 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
b75a7d8f
A
1556 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1557 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1558 }
1559 RBBIDebugPrintf("\n");
1560 }
1561 RBBIDebugPrintf("\n\n");
b75a7d8f 1562}
374ca955 1563#endif
b75a7d8f
A
1564
1565
0f5d89e8
A
1566//-----------------------------------------------------------------------------
1567//
1568// printSafeTable Debug Function. Dump the fully constructed safe table.
1569//
1570//-----------------------------------------------------------------------------
1571#ifdef RBBI_DEBUG
1572void RBBITableBuilder::printReverseTable() {
1573 int c; // input "character"
1574 int n; // state number
1575
1576 RBBIDebugPrintf(" Safe Reverse Table \n");
1577 if (fSafeTable == nullptr) {
1578 RBBIDebugPrintf(" --- nullptr ---\n");
1579 return;
1580 }
1581 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1582 RBBIDebugPrintf(" | Acc LA Tag");
1583 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1584 RBBIDebugPrintf(" %2d", c);
1585 }
1586 RBBIDebugPrintf("\n");
1587 RBBIDebugPrintf(" |---------------");
1588 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589 RBBIDebugPrintf("---");
1590 }
1591 RBBIDebugPrintf("\n");
1592
1593 for (n=0; n<fSafeTable->size(); n++) {
1594 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1595 RBBIDebugPrintf(" %3d | " , n);
1596 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1597 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1598 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1599 }
1600 RBBIDebugPrintf("\n");
1601 }
1602 RBBIDebugPrintf("\n\n");
1603}
1604#endif
1605
1606
b75a7d8f 1607
374ca955
A
1608//-----------------------------------------------------------------------------
1609//
1610// printRuleStatusTable Debug Function. Dump the common rule status table
1611//
1612//-----------------------------------------------------------------------------
1613#ifdef RBBI_DEBUG
1614void RBBITableBuilder::printRuleStatusTable() {
1615 int32_t thisRecord = 0;
1616 int32_t nextRecord = 0;
1617 int i;
1618 UVector *tbl = fRB->fRuleStatusVals;
1619
1620 RBBIDebugPrintf("index | tags \n");
1621 RBBIDebugPrintf("-------------------\n");
73c04bcf 1622
374ca955
A
1623 while (nextRecord < tbl->size()) {
1624 thisRecord = nextRecord;
1625 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1626 RBBIDebugPrintf("%4d ", thisRecord);
1627 for (i=thisRecord+1; i<nextRecord; i++) {
1628 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1629 }
1630 RBBIDebugPrintf("\n");
1631 }
1632 RBBIDebugPrintf("\n\n");
1633}
1634#endif
b75a7d8f
A
1635
1636
1637//-----------------------------------------------------------------------------
1638//
1639// RBBIStateDescriptor Methods. This is a very struct-like class
1640// Most access is directly to the fields.
1641//
1642//-----------------------------------------------------------------------------
1643
1644RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1645 fMarked = FALSE;
1646 fAccepting = 0;
1647 fLookAhead = 0;
374ca955
A
1648 fTagsIdx = 0;
1649 fTagVals = NULL;
b75a7d8f
A
1650 fPositions = NULL;
1651 fDtran = NULL;
73c04bcf 1652
0f5d89e8 1653 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
b75a7d8f
A
1654 if (U_FAILURE(*fStatus)) {
1655 return;
1656 }
b75a7d8f
A
1657 if (fDtran == NULL) {
1658 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1659 return;
1660 }
0f5d89e8 1661 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
b75a7d8f
A
1662 // It is indexed by input symbols, and will
1663 // hold the next state number for each
1664 // symbol.
1665}
1666
1667
1668RBBIStateDescriptor::~RBBIStateDescriptor() {
1669 delete fPositions;
1670 delete fDtran;
374ca955 1671 delete fTagVals;
b75a7d8f
A
1672 fPositions = NULL;
1673 fDtran = NULL;
374ca955 1674 fTagVals = NULL;
b75a7d8f
A
1675}
1676
1677U_NAMESPACE_END
1678
1679#endif /* #if !UCONFIG_NO_BREAK_ITERATION */