]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbitblb.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / rbbitblb.cpp
CommitLineData
b75a7d8f
A
1//
2// rbbitblb.cpp
3//
4
5/*
6**********************************************************************
374ca955 7* Copyright (c) 2002-2004, International Business Machines
b75a7d8f
A
8* Corporation and others. All Rights Reserved.
9**********************************************************************
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/unistr.h"
17#include "rbbitblb.h"
18#include "rbbirb.h"
19#include "rbbisetb.h"
20#include "rbbidata.h"
21#include "cstring.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
27 fTree(*rootNode) {
374ca955
A
28 fRB = rb;
29 fStatus = fRB->fStatus;
30 UErrorCode status = U_ZERO_ERROR;
31 fDStates = new UVector(status);
32 if (U_FAILURE(*fStatus)) {
33 return;
34 }
35 if (U_FAILURE(status)) {
36 *fStatus = status;
37 return;
38 }
39 if (fDStates == NULL) {
40 *fStatus = U_MEMORY_ALLOCATION_ERROR;;
41 }
b75a7d8f
A
42}
43
44
45
46RBBITableBuilder::~RBBITableBuilder() {
47 int i;
48 for (i=0; i<fDStates->size(); i++) {
49 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
50 }
51 delete fDStates;
52}
53
54
55//-----------------------------------------------------------------------------
56//
57// RBBITableBuilder::build - This is the main function for building the DFA state transtion
58// table from the RBBI rules parse tree.
59//
60//-----------------------------------------------------------------------------
61void RBBITableBuilder::build() {
62
63 if (U_FAILURE(*fStatus)) {
64 return;
65 }
66
67 // If there were no rules, just return. This situation can easily arise
68 // for the reverse rules.
69 if (fTree==NULL) {
70 return;
71 }
72
73 //
74 // Walk through the tree, replacing any references to $variables with a copy of the
75 // parse tree for the substition expression.
76 //
77 fTree = fTree->flattenVariables();
78 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
374ca955 79 RBBIDebugPuts("Parse tree after flattening variable references.");
b75a7d8f
A
80 fTree->printTree(TRUE);
81 }
82
83 //
84 // Add a unique right-end marker to the expression.
85 // Appears as a cat-node, left child being the original tree,
86 // right child being the end marker.
87 //
88 RBBINode *cn = new RBBINode(RBBINode::opCat);
89 cn->fLeftChild = fTree;
90 fTree->fParent = cn;
91 cn->fRightChild = new RBBINode(RBBINode::endMark);
92 cn->fRightChild->fParent = cn;
93 fTree = cn;
94
95 //
96 // Replace all references to UnicodeSets with the tree for the equivalent
97 // expression.
98 //
99 fTree->flattenSets();
100 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
374ca955 101 RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
b75a7d8f
A
102 fTree->printTree(TRUE);
103 }
104
105
106 //
107 // calculate the functions nullable, firstpos, lastpos and followpos on
108 // nodes in the parse tree.
109 // See the alogrithm description in Aho.
110 // Understanding how this works by looking at the code alone will be
111 // nearly impossible.
112 //
113 calcNullable(fTree);
114 calcFirstPos(fTree);
115 calcLastPos(fTree);
116 calcFollowPos(fTree);
117 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
374ca955 118 RBBIDebugPuts("\n");
b75a7d8f
A
119 printPosSets(fTree);
120 }
121
374ca955
A
122 //
123 // For "chained" rules, modify the followPos sets
124 //
125 if (fRB->fChainRules) {
126 calcChainedFollowPos(fTree);
127 }
128
b75a7d8f
A
129 //
130 // Build the DFA state transition tables.
131 //
132 buildStateTable();
133 flagAcceptingStates();
134 flagLookAheadStates();
135 flagTaggedStates();
b75a7d8f 136
374ca955
A
137 //
138 // Update the global table of rule status {tag} values
139 // The rule builder has a global vector of status values that are common
140 // for all tables. Merge the ones from this table into the global set.
141 //
142 mergeRuleStatusVals();
143
144 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
b75a7d8f
A
145}
146
147
148
149//-----------------------------------------------------------------------------
150//
151// calcNullable. Impossible to explain succinctly. See Aho, section 3.9
152//
153//-----------------------------------------------------------------------------
154void RBBITableBuilder::calcNullable(RBBINode *n) {
155 if (n == NULL) {
156 return;
157 }
158 if (n->fType == RBBINode::setRef ||
159 n->fType == RBBINode::endMark ) {
160 // These are non-empty leaf node types.
161 n->fNullable = FALSE;
162 return;
163 }
164
165 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
166 // Lookahead marker node. It's a leaf, so no recursion on children.
167 // It's nullable because it does not match any literal text from the input stream.
168 n->fNullable = TRUE;
169 return;
170 }
171
172
173 // The node is not a leaf.
174 // Calculate nullable on its children.
175 calcNullable(n->fLeftChild);
176 calcNullable(n->fRightChild);
177
178 // Apply functions from table 3.40 in Aho
179 if (n->fType == RBBINode::opOr) {
180 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
181 }
182 else if (n->fType == RBBINode::opCat) {
183 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
184 }
185 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
186 n->fNullable = TRUE;
187 }
188 else {
189 n->fNullable = FALSE;
190 }
191}
192
193
194
195
196//-----------------------------------------------------------------------------
197//
198// calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
199//
200//-----------------------------------------------------------------------------
201void RBBITableBuilder::calcFirstPos(RBBINode *n) {
202 if (n == NULL) {
203 return;
204 }
205 if (n->fType == RBBINode::leafChar ||
206 n->fType == RBBINode::endMark ||
207 n->fType == RBBINode::lookAhead ||
208 n->fType == RBBINode::tag) {
209 // These are non-empty leaf node types.
210 n->fFirstPosSet->addElement(n, *fStatus);
211 return;
212 }
213
214 // The node is not a leaf.
215 // Calculate firstPos on its children.
216 calcFirstPos(n->fLeftChild);
217 calcFirstPos(n->fRightChild);
218
219 // Apply functions from table 3.40 in Aho
220 if (n->fType == RBBINode::opOr) {
221 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
222 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
223 }
224 else if (n->fType == RBBINode::opCat) {
225 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
226 if (n->fLeftChild->fNullable) {
227 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
228 }
229 }
230 else if (n->fType == RBBINode::opStar ||
231 n->fType == RBBINode::opQuestion ||
232 n->fType == RBBINode::opPlus) {
233 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
234 }
235}
236
237
238
239//-----------------------------------------------------------------------------
240//
241// calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
242//
243//-----------------------------------------------------------------------------
244void RBBITableBuilder::calcLastPos(RBBINode *n) {
245 if (n == NULL) {
246 return;
247 }
248 if (n->fType == RBBINode::leafChar ||
249 n->fType == RBBINode::endMark ||
250 n->fType == RBBINode::lookAhead ||
251 n->fType == RBBINode::tag) {
252 // These are non-empty leaf node types.
253 n->fLastPosSet->addElement(n, *fStatus);
254 return;
255 }
256
257 // The node is not a leaf.
258 // Calculate lastPos on its children.
259 calcLastPos(n->fLeftChild);
260 calcLastPos(n->fRightChild);
261
262 // Apply functions from table 3.40 in Aho
263 if (n->fType == RBBINode::opOr) {
264 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
265 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
266 }
267 else if (n->fType == RBBINode::opCat) {
268 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
269 if (n->fRightChild->fNullable) {
270 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
271 }
272 }
273 else if (n->fType == RBBINode::opStar ||
274 n->fType == RBBINode::opQuestion ||
275 n->fType == RBBINode::opPlus) {
276 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
277 }
278}
279
280
281
282//-----------------------------------------------------------------------------
283//
284// calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
285//
286//-----------------------------------------------------------------------------
287void RBBITableBuilder::calcFollowPos(RBBINode *n) {
288 if (n == NULL ||
289 n->fType == RBBINode::leafChar ||
290 n->fType == RBBINode::endMark) {
291 return;
292 }
293
294 calcFollowPos(n->fLeftChild);
295 calcFollowPos(n->fRightChild);
296
297 // Aho rule #1
298 if (n->fType == RBBINode::opCat) {
299 RBBINode *i; // is 'i' in Aho's description
300 uint32_t ix;
301
302 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
303
304 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
305 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
306 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
307 }
308 }
309
310 // Aho rule #2
311 if (n->fType == RBBINode::opStar ||
312 n->fType == RBBINode::opPlus) {
313 RBBINode *i; // again, n and i are the names from Aho's description.
314 uint32_t ix;
315
316 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
317 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
318 setAdd(i->fFollowPos, n->fFirstPosSet);
319 }
320 }
321
322
323
324}
325
326
374ca955
A
327//-----------------------------------------------------------------------------
328//
329// calcChainedFollowPos. Modify the previously calculated followPos sets
330// to implement rule chaining. NOT described by Aho
331//
332//-----------------------------------------------------------------------------
333void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
334
335 UVector endMarkerNodes(*fStatus);
336 UVector leafNodes(*fStatus);
337 int32_t i;
338
339 if (U_FAILURE(*fStatus)) {
340 return;
341 }
342
343 // get a list of all endmarker nodes.
344 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
345
346 // get a list all leaf nodes
347 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
348 if (U_FAILURE(*fStatus)) {
349 return;
350 }
351
352 // Get all nodes that can be the start a match, which is FirstPosition(root)
353 UVector *matchStartNodes = tree->fFirstPosSet;
354
355
356 // Iteratate over all leaf nodes,
357 //
358 int32_t endNodeIx;
359 int32_t startNodeIx;
360
361 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
362 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
363 RBBINode *endNode = NULL;
364
365 // Identify leaf nodes that correspond to overall rule match positions.
366 // These include an endMarkerNode in their followPos sets.
367 for (i=0; i<endMarkerNodes.size(); i++) {
368 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
369 endNode = tNode;
370 break;
371 }
372 }
373 if (endNode == NULL) {
374 // node wasn't an end node. Try again with the next.
375 continue;
376 }
377
378 // We've got a node that can end a match.
379
380 // Line Break Specific hack: If this node's val correspond to the $CM char class,
381 // don't chain from it.
382 // TODO: Add rule syntax for this behavior, get specifics out of here and
383 // into the rule file.
384 if (fRB->fLBCMNoChain) {
385 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
386 U_ASSERT(c != -1);
387 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
388 if (cLBProp == U_LB_COMBINING_MARK) {
389 continue;
390 }
391 }
392
393
394 // Now iterate over the nodes that can start a match, looking for ones
395 // with the same char class as our ending node.
396 RBBINode *startNode;
397 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
398 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
399 if (startNode->fType != RBBINode::leafChar) {
400 continue;
401 }
402
403 if (endNode->fVal == startNode->fVal) {
404 // The end val (character class) of one possible match is the
405 // same as the start of another.
406
407 // Add all nodes from the followPos of the start node to the
408 // followPos set of the end node, which will have the effect of
409 // letting matches transition from a match state at endNode
410 // to the second char of a match starting with startNode.
411 setAdd(endNode->fFollowPos, startNode->fFollowPos);
412 }
413 }
414 }
415}
416
417
b75a7d8f
A
418//-----------------------------------------------------------------------------
419//
420// buildStateTable() Determine the set of runtime DFA states and the
421// transition tables for these states, by the algorithm
422// of fig. 3.44 in Aho.
423//
424// Most of the comments are quotes of Aho's psuedo-code.
425//
426//-----------------------------------------------------------------------------
427void RBBITableBuilder::buildStateTable() {
374ca955
A
428 if (U_FAILURE(*fStatus)) {
429 return;
430 }
b75a7d8f
A
431 //
432 // Add a dummy state 0 - the stop state. Not from Aho.
433 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
434 RBBIStateDescriptor *failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
435 failState->fPositions = new UVector(*fStatus);
374ca955
A
436 if (U_FAILURE(*fStatus)) {
437 return;
438 }
b75a7d8f 439 fDStates->addElement(failState, *fStatus);
374ca955
A
440 if (U_FAILURE(*fStatus)) {
441 return;
442 }
b75a7d8f
A
443
444 // initially, the only unmarked state in Dstates is firstpos(root),
445 // where toot is the root of the syntax tree for (r)#;
446 RBBIStateDescriptor *initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
374ca955
A
447 if (U_FAILURE(*fStatus)) {
448 return;
449 }
b75a7d8f 450 initialState->fPositions = new UVector(*fStatus);
374ca955
A
451 if (U_FAILURE(*fStatus)) {
452 return;
453 }
b75a7d8f
A
454 setAdd(initialState->fPositions, fTree->fFirstPosSet);
455 fDStates->addElement(initialState, *fStatus);
374ca955
A
456 if (U_FAILURE(*fStatus)) {
457 return;
458 }
b75a7d8f
A
459
460 // while there is an unmarked state T in Dstates do begin
461 for (;;) {
462 RBBIStateDescriptor *T = NULL;
463 int32_t tx;
464 for (tx=1; tx<fDStates->size(); tx++) {
465 RBBIStateDescriptor *temp;
466 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
467 if (temp->fMarked == FALSE) {
468 T = temp;
469 break;
470 }
471 }
472 if (T == NULL) {
473 break;
474 }
475
476 // mark T;
477 T->fMarked = TRUE;
478
479 // for each input symbol a do begin
480 int32_t a;
481 for (a = 1; a<=lastInputSymbol; a++) {
482 // let U be the set of positions that are in followpos(p)
483 // for some position p in T
484 // such that the symbol at position p is a;
485 UVector *U = NULL;
486 RBBINode *p;
487 int32_t px;
488 for (px=0; px<T->fPositions->size(); px++) {
489 p = (RBBINode *)T->fPositions->elementAt(px);
490 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
491 if (U == NULL) {
492 U = new UVector(*fStatus);
493 }
494 setAdd(U, p->fFollowPos);
495 }
496 }
497
498 // if U is not empty and not in DStates then
499 int32_t ux = 0;
500 UBool UinDstates = FALSE;
501 if (U != NULL) {
502 U_ASSERT(U->size() > 0);
503 int ix;
504 for (ix=0; ix<fDStates->size(); ix++) {
505 RBBIStateDescriptor *temp2;
506 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
507 if (setEquals(U, temp2->fPositions)) {
508 delete U;
509 U = temp2->fPositions;
510 ux = ix;
511 UinDstates = TRUE;
512 break;
513 }
514 }
515
516 // Add U as an unmarked state to Dstates
517 if (!UinDstates)
518 {
519 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
374ca955
A
520 if (U_FAILURE(*fStatus)) {
521 return;
522 }
b75a7d8f
A
523 newState->fPositions = U;
524 fDStates->addElement(newState, *fStatus);
374ca955
A
525 if (U_FAILURE(*fStatus)) {
526 return;
527 }
b75a7d8f
A
528 ux = fDStates->size()-1;
529 }
530
531 // Dtran[T, a] := U;
532 T->fDtran->setElementAt(ux, a);
533 }
534 }
535 }
536}
537
538
539
540//-----------------------------------------------------------------------------
541//
542// flagAcceptingStates Identify accepting states.
543// First get a list of all of the end marker nodes.
544// Then, for each state s,
545// if s contains one of the end marker nodes in its list of tree positions then
546// s is an accepting state.
547//
548//-----------------------------------------------------------------------------
549void RBBITableBuilder::flagAcceptingStates() {
374ca955
A
550 if (U_FAILURE(*fStatus)) {
551 return;
552 }
b75a7d8f
A
553 UVector endMarkerNodes(*fStatus);
554 RBBINode *endMarker;
555 int32_t i;
556 int32_t n;
557
374ca955
A
558 if (U_FAILURE(*fStatus)) {
559 return;
560 }
561
b75a7d8f 562 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
374ca955
A
563 if (U_FAILURE(*fStatus)) {
564 return;
565 }
b75a7d8f
A
566
567 for (i=0; i<endMarkerNodes.size(); i++) {
568 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
569 for (n=0; n<fDStates->size(); n++) {
570 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
571 if (sd->fPositions->indexOf(endMarker) >= 0) {
572 // Any non-zero value for fAccepting means this is an accepting node.
573 // The value is what will be returned to the user as the break status.
574 // If no other value was specified, force it to -1.
575 sd->fAccepting = endMarker->fVal;
576 if (sd->fAccepting == 0) {
577 sd->fAccepting = -1;
578 }
579
580 // If the end marker node is from a look-ahead rule, set
581 // the fLookAhead field or this state also.
582 if (endMarker->fLookAheadEnd) {
583 sd->fLookAhead = sd->fAccepting;
584 }
585 }
586 }
587 }
588}
589
590
591//-----------------------------------------------------------------------------
592//
593// flagLookAheadStates Very similar to flagAcceptingStates, above.
594//
595//-----------------------------------------------------------------------------
596void RBBITableBuilder::flagLookAheadStates() {
374ca955
A
597 if (U_FAILURE(*fStatus)) {
598 return;
599 }
b75a7d8f
A
600 UVector lookAheadNodes(*fStatus);
601 RBBINode *lookAheadNode;
602 int32_t i;
603 int32_t n;
604
605 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
374ca955
A
606 if (U_FAILURE(*fStatus)) {
607 return;
608 }
b75a7d8f
A
609 for (i=0; i<lookAheadNodes.size(); i++) {
610 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
611
612 for (n=0; n<fDStates->size(); n++) {
613 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
614 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
615 sd->fLookAhead = lookAheadNode->fVal;
616 }
617 }
618 }
619}
620
621
622
623
624//-----------------------------------------------------------------------------
625//
626// flagTaggedStates
627//
628//-----------------------------------------------------------------------------
629void RBBITableBuilder::flagTaggedStates() {
374ca955
A
630 if (U_FAILURE(*fStatus)) {
631 return;
632 }
b75a7d8f
A
633 UVector tagNodes(*fStatus);
634 RBBINode *tagNode;
635 int32_t i;
636 int32_t n;
637
374ca955
A
638 if (U_FAILURE(*fStatus)) {
639 return;
640 }
b75a7d8f 641 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
374ca955
A
642 if (U_FAILURE(*fStatus)) {
643 return;
644 }
b75a7d8f
A
645 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
646 tagNode = (RBBINode *)tagNodes.elementAt(i);
374ca955 647
b75a7d8f
A
648 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
649 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
650 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
374ca955 651 sortedAdd(&sd->fTagVals, tagNode->fVal);
b75a7d8f
A
652 }
653 }
654 }
655}
374ca955
A
656
657
658
659
660//-----------------------------------------------------------------------------
661//
662// mergeRuleStatusVals
663//
664// Update the global table of rule status {tag} values
665// The rule builder has a global vector of status values that are common
666// for all tables. Merge the ones from this table into the global set.
667//
668//-----------------------------------------------------------------------------
669void RBBITableBuilder::mergeRuleStatusVals() {
670 //
671 // The basic outline of what happens here is this...
672 //
673 // for each state in this state table
674 // if the status tag list for this state is in the global statuses list
675 // record where and
676 // continue with the next state
677 // else
678 // add the tag list for this state to the global list.
679 //
680 int i;
681 int n;
682
683 // Pre-set a single tag of {0} into the table.
684 // We will need this as a default, for rule sets with no explicit tagging.
685 if (fRB->fRuleStatusVals->size() == 0) {
686 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
687 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
688 }
689
690 // For each state
691 for (n=0; n<fDStates->size(); n++) {
692 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
693 UVector *thisStatesTagValues = sd->fTagVals;
694 if (thisStatesTagValues == NULL) {
695 // No tag values are explicitly associated with this state.
696 // Set the default tag value.
697 sd->fTagsIdx = 0;
698 continue;
699 }
700
701 // There are tag(s) associated with this state.
702 // fTagsIdx will be the index into the global tag list for this state's tag values.
703 // Initial value of -1 flags that we haven't got it set yet.
704 sd->fTagsIdx = -1;
705 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
706 int32_t nextTagGroupStart = 0;
707
708 // Loop runs once per group of tags in the global list
709 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
710 thisTagGroupStart = nextTagGroupStart;
711 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
712 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
713 // The number of tags for this state is different from
714 // the number of tags in this group from the global list.
715 // Continue with the next group from the global list.
716 continue;
717 }
718 // The lengths match, go ahead and compare the actual tag values
719 // between this state and the group from the global list.
720 for (i=0; i<thisStatesTagValues->size(); i++) {
721 if (thisStatesTagValues->elementAti(i) !=
722 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
723 // Mismatch.
724 break;
725 }
726 }
727
728 if (i == thisStatesTagValues->size()) {
729 // We found a set of tag values in the global list that match
730 // those for this state. Use them.
731 sd->fTagsIdx = thisTagGroupStart;
732 break;
733 }
734 }
735
736 if (sd->fTagsIdx == -1) {
737 // No suitable entry in the global tag list already. Add one
738 sd->fTagsIdx = fRB->fRuleStatusVals->size();
739 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
740 for (i=0; i<thisStatesTagValues->size(); i++) {
741 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
742 }
743 }
744 }
745}
746
747
748
749
750
751
752
753//-----------------------------------------------------------------------------
754//
755// sortedAdd Add a value to a vector of sorted values (ints).
756// Do not replicate entries; if the value is already there, do not
757// add a second one.
758// Lazily create the vector if it does not already exist.
759//
760//-----------------------------------------------------------------------------
761void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
762 int32_t i;
763
764 if (*vector == NULL) {
765 *vector = new UVector(*fStatus);
766 }
767 if (*vector == NULL || U_FAILURE(*fStatus)) {
768 return;
769 }
770 UVector *vec = *vector;
771 int32_t vSize = vec->size();
772 for (i=0; i<vSize; i++) {
773 int32_t valAtI = vec->elementAti(i);
774 if (valAtI == val) {
775 // The value is already in the vector. Don't add it again.
776 return;
777 }
778 if (valAtI > val) {
779 break;
780 }
781 }
782 vec->insertElementAt(val, i, *fStatus);
b75a7d8f
A
783}
784
785
786
787//-----------------------------------------------------------------------------
788//
789// setAdd Set operation on UVector
790// dest = dest union source
791// Elements may only appear once. Order is unimportant.
792//
793//-----------------------------------------------------------------------------
794void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
795 int destOriginalSize = dest->size();
796 int sourceSize = source->size();
797 int32_t si, di;
798
374ca955 799 for (si=0; si<sourceSize && U_SUCCESS(*fStatus); si++) {
b75a7d8f
A
800 void *elToAdd = source->elementAt(si);
801 for (di=0; di<destOriginalSize; di++) {
802 if (dest->elementAt(di) == elToAdd) {
803 goto elementAlreadyInDest;
804 }
805 }
806 dest->addElement(elToAdd, *fStatus);
374ca955 807 elementAlreadyInDest: ;
b75a7d8f
A
808 }
809}
810
811
374ca955 812
b75a7d8f
A
813//-----------------------------------------------------------------------------
814//
815// setEqual Set operation on UVector.
816// Compare for equality.
817// Elements may appear only once.
818// Elements may appear in any order.
819//
820//-----------------------------------------------------------------------------
821UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
822 int32_t aSize = a->size();
823 int32_t bSize = b->size();
824
825 if (aSize != bSize) {
826 return FALSE;
827 }
828
829 int32_t ax;
830 int32_t bx;
831 int32_t firstBx = 0;
832 void *aVal;
833 void *bVal = NULL;
834
835 for (ax=0; ax<aSize; ax++) {
836 aVal = a->elementAt(ax);
837 for (bx=firstBx; bx<bSize; bx++) {
838 bVal = b->elementAt(bx);
839 if (aVal == bVal) {
840 if (bx==firstBx) {
841 firstBx++;
842 }
843 break;
844 }
845 }
846 if (aVal != bVal) {
847 return FALSE;
848 }
849 }
850 return TRUE;
851}
852
853
854//-----------------------------------------------------------------------------
855//
856// printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
857// for each node in the tree.
858//
859//-----------------------------------------------------------------------------
b75a7d8f 860#ifdef RBBI_DEBUG
374ca955 861void RBBITableBuilder::printPosSets(RBBINode *n) {
b75a7d8f
A
862 if (n==NULL) {
863 return;
864 }
374ca955 865 n->printNode();
b75a7d8f
A
866 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
867
868 RBBIDebugPrintf(" firstpos: ");
869 printSet(n->fFirstPosSet);
870
871 RBBIDebugPrintf(" lastpos: ");
872 printSet(n->fLastPosSet);
873
874 RBBIDebugPrintf(" followpos: ");
875 printSet(n->fFollowPos);
876
877 printPosSets(n->fLeftChild);
878 printPosSets(n->fRightChild);
b75a7d8f 879}
374ca955 880#endif
b75a7d8f
A
881
882
883
884//-----------------------------------------------------------------------------
885//
886// getTableSize() Calculate the size of the runtime form of this
887// state transition table.
888//
889//-----------------------------------------------------------------------------
374ca955 890int32_t RBBITableBuilder::getTableSize() const {
b75a7d8f
A
891 int32_t size = 0;
892 int32_t numRows;
893 int32_t numCols;
894 int32_t rowSize;
895
896 if (fTree == NULL) {
897 return 0;
898 }
899
900 size = sizeof(RBBIStateTable) - 4; // The header, with no rows to the table.
901
902 numRows = fDStates->size();
903 numCols = fRB->fSetBuilder->getNumCharCategories();
904
905 // Note The declaration of RBBIStateTableRow is for a table of two columns.
906 // Therefore we subtract two from numCols when determining
907 // how much storage to add to a row for the total columns.
908 rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
909 size += numRows * rowSize;
910 return size;
911}
912
913
914
915//-----------------------------------------------------------------------------
916//
917// exportTable() export the state transition table in the format required
918// by the runtime engine. getTableSize() bytes of memory
919// must be available at the output address "where".
920//
921//-----------------------------------------------------------------------------
922void RBBITableBuilder::exportTable(void *where) {
923 RBBIStateTable *table = (RBBIStateTable *)where;
924 uint32_t state;
925 int col;
926
927 if (U_FAILURE(*fStatus) || fTree == NULL) {
928 return;
929 }
930
931 if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
932 fDStates->size() > 0x7fff) {
933 *fStatus = U_BRK_INTERNAL_ERROR;
934 return;
935 }
936
937 table->fRowLen = sizeof(RBBIStateTableRow) +
938 sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
939 table->fNumStates = fDStates->size();
374ca955
A
940 table->fFlags = 0;
941 if (fRB->fLookAheadHardBreak) {
942 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
943 }
944 table->fReserved = 0;
b75a7d8f
A
945
946 for (state=0; state<table->fNumStates; state++) {
947 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
948 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
949 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
950 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
951 row->fAccepting = (int16_t)sd->fAccepting;
952 row->fLookAhead = (int16_t)sd->fLookAhead;
374ca955 953 row->fTagIdx = (int16_t)sd->fTagsIdx;
b75a7d8f
A
954 for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
955 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
956 }
957 }
958}
959
960
961
962//-----------------------------------------------------------------------------
963//
964// printSet Debug function. Print the contents of a UVector
965//
966//-----------------------------------------------------------------------------
b75a7d8f 967#ifdef RBBI_DEBUG
374ca955 968void RBBITableBuilder::printSet(UVector *s) {
b75a7d8f
A
969 int32_t i;
970 for (i=0; i<s->size(); i++) {
971 void *v = s->elementAt(i);
972 RBBIDebugPrintf("%10p", v);
973 }
974 RBBIDebugPrintf("\n");
b75a7d8f 975}
374ca955 976#endif
b75a7d8f
A
977
978
979//-----------------------------------------------------------------------------
980//
981// printStates Debug Function. Dump the fully constructed state transition table.
982//
983//-----------------------------------------------------------------------------
b75a7d8f 984#ifdef RBBI_DEBUG
374ca955 985void RBBITableBuilder::printStates() {
b75a7d8f
A
986 int c; // input "character"
987 int n; // state number
988
989 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
990 RBBIDebugPrintf(" | Acc LA Tag");
374ca955
A
991 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
992 RBBIDebugPrintf(" %2d", c);
993 }
b75a7d8f
A
994 RBBIDebugPrintf("\n");
995 RBBIDebugPrintf(" |---------------");
374ca955
A
996 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
997 RBBIDebugPrintf("---");
998 }
b75a7d8f
A
999 RBBIDebugPrintf("\n");
1000
1001 for (n=0; n<fDStates->size(); n++) {
1002 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1003 RBBIDebugPrintf(" %3d | " , n);
374ca955 1004 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
b75a7d8f
A
1005 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1006 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1007 }
1008 RBBIDebugPrintf("\n");
1009 }
1010 RBBIDebugPrintf("\n\n");
b75a7d8f 1011}
374ca955 1012#endif
b75a7d8f
A
1013
1014
1015
374ca955
A
1016//-----------------------------------------------------------------------------
1017//
1018// printRuleStatusTable Debug Function. Dump the common rule status table
1019//
1020//-----------------------------------------------------------------------------
1021#ifdef RBBI_DEBUG
1022void RBBITableBuilder::printRuleStatusTable() {
1023 int32_t thisRecord = 0;
1024 int32_t nextRecord = 0;
1025 int i;
1026 UVector *tbl = fRB->fRuleStatusVals;
1027
1028 RBBIDebugPrintf("index | tags \n");
1029 RBBIDebugPrintf("-------------------\n");
1030
1031 while (nextRecord < tbl->size()) {
1032 thisRecord = nextRecord;
1033 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1034 RBBIDebugPrintf("%4d ", thisRecord);
1035 for (i=thisRecord+1; i<nextRecord; i++) {
1036 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1037 }
1038 RBBIDebugPrintf("\n");
1039 }
1040 RBBIDebugPrintf("\n\n");
1041}
1042#endif
b75a7d8f
A
1043
1044
1045//-----------------------------------------------------------------------------
1046//
1047// RBBIStateDescriptor Methods. This is a very struct-like class
1048// Most access is directly to the fields.
1049//
1050//-----------------------------------------------------------------------------
1051
1052RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1053 fMarked = FALSE;
1054 fAccepting = 0;
1055 fLookAhead = 0;
374ca955
A
1056 fTagsIdx = 0;
1057 fTagVals = NULL;
b75a7d8f
A
1058 fPositions = NULL;
1059 fDtran = NULL;
374ca955
A
1060
1061 fDtran = new UVector(lastInputSymbol+1, *fStatus);
b75a7d8f
A
1062 if (U_FAILURE(*fStatus)) {
1063 return;
1064 }
b75a7d8f
A
1065 if (fDtran == NULL) {
1066 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1067 return;
1068 }
1069 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1070 // It is indexed by input symbols, and will
1071 // hold the next state number for each
1072 // symbol.
1073}
1074
1075
1076RBBIStateDescriptor::~RBBIStateDescriptor() {
1077 delete fPositions;
1078 delete fDtran;
374ca955 1079 delete fTagVals;
b75a7d8f
A
1080 fPositions = NULL;
1081 fDtran = NULL;
374ca955 1082 fTagVals = NULL;
b75a7d8f
A
1083}
1084
1085U_NAMESPACE_END
1086
1087#endif /* #if !UCONFIG_NO_BREAK_ITERATION */