5 ***************************************************************************
6 * Copyright (C) 2002-2005 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
14 // Starting with the rules parse tree from the scanner,
16 // - Enumerate the set of UnicodeSets that are referenced
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input uniocde sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
31 #include "unicode/utypes.h"
33 #if !UCONFIG_NO_BREAK_ITERATION
35 #include "unicode/uniset.h"
46 //------------------------------------------------------------------------
48 // getFoldedRBBIValue Call-back function used during building of Trie table.
49 // Folding value: just store the offset (16 bits)
50 // if there is any non-0 entry.
51 // (It'd really be nice if the Trie builder would provide a
52 // simple default, so this function could go away from here.)
54 //------------------------------------------------------------------------
55 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
57 static uint32_t U_CALLCONV
58 getFoldedRBBIValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
65 value
=utrie_get32(trie
, start
, &inBlockZero
);
67 start
+=UTRIE_DATA_BLOCK_LENGTH
;
69 return (uint32_t)(offset
|0x8000);
84 //------------------------------------------------------------------------
88 //------------------------------------------------------------------------
89 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder
*rb
)
92 fStatus
= rb
->fStatus
;
101 //------------------------------------------------------------------------
105 //------------------------------------------------------------------------
106 RBBISetBuilder::~RBBISetBuilder()
108 RangeDescriptor
*nextRangeDesc
;
110 // Walk through & delete the linked list of RangeDescriptors
111 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
112 RangeDescriptor
*r
= nextRangeDesc
;
113 nextRangeDesc
= r
->fNext
;
123 //------------------------------------------------------------------------
125 // build Build the list of non-overlapping character ranges
126 // from the Unicode Sets.
128 //------------------------------------------------------------------------
129 void RBBISetBuilder::build() {
131 RangeDescriptor
*rlRange
;
133 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
136 // Initialize the process by creating a single range encompassing all characters
137 // that is in no sets.
139 fRangeList
= new RangeDescriptor(*fStatus
); // will check for status here
140 fRangeList
->fStartChar
= 0;
141 fRangeList
->fEndChar
= 0x10ffff;
143 if (U_FAILURE(*fStatus
)) {
148 // Find the set of non-overlapping ranges of characters
151 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
152 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
153 if (usetNode
==NULL
) {
157 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
158 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
159 int inputSetRangeIndex
= 0;
160 rlRange
= fRangeList
;
163 if (inputSetRangeIndex
>= inputSetRangeCount
) {
166 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
167 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
169 // skip over ranges from the range list that are completely
170 // below the current range from the input unicode set.
171 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
172 rlRange
= rlRange
->fNext
;
175 // If the start of the range from the range list is before with
176 // the start of the range from the unicode set, split the range list range
177 // in two, with one part being before (wholly outside of) the unicode set
178 // and the other containing the rest.
179 // Then continue the loop; the post-split current range will then be skipped
181 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
182 rlRange
->split(inputSetRangeBegin
, *fStatus
);
183 if (U_FAILURE(*fStatus
)) {
189 // Same thing at the end of the ranges...
190 // If the end of the range from the range list doesn't coincide with
191 // the end of the range from the unicode set, split the range list
192 // range in two. The first part of the split range will be
193 // wholly inside the Unicode set.
194 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
195 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
196 if (U_FAILURE(*fStatus
)) {
201 // The current rlRange is now entirely within the UnicodeSet range.
202 // Add this unicode set to the list of sets for this rlRange
203 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
204 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
205 if (U_FAILURE(*fStatus
)) {
210 // Advance over ranges that we are finished with.
211 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
212 inputSetRangeIndex
++;
214 rlRange
= rlRange
->fNext
;
218 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
221 // Group the above ranges, with each group consisting of one or more
222 // ranges that are in exactly the same set of original UnicodeSets.
223 // The groups are numbered, and these group numbers are the set of
224 // input symbols recognized by the run-time state machine.
226 // Numbering: # 0 (state table column 0) is unused.
227 // # 1 is reserved - table column 1 is for end-of-input
228 // # 2 is reserved - table column 2 is for beginning-in-input
229 // # 3 is the first range list.
231 RangeDescriptor
*rlSearchRange
;
232 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
233 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
234 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
235 rlRange
->fNum
= rlSearchRange
->fNum
;
239 if (rlRange
->fNum
== 0) {
241 rlRange
->fNum
= fGroupCount
+2;
242 rlRange
->setDictionaryFlag();
243 addValToSets(rlRange
->fIncludesSets
, fGroupCount
+2);
247 // Handle input sets that contain the special string {eof}.
248 // Column 1 of the state table is reserved for EOF on input.
249 // Column 2 is reserved for before-the-start-input.
250 // (This column can be optimized away later if there are no rule
251 // references to {bof}.)
252 // Add this column value (1 or 2) to the equivalent expression
253 // subtree for each UnicodeSet that contains the string {eof}
254 // Because {bof} and {eof} are not a characters in the normal sense,
255 // they doesn't affect the computation of ranges or TRIE.
256 static const UChar eofUString
[] = {0x65, 0x6f, 0x66, 0};
257 static const UChar bofUString
[] = {0x62, 0x6f, 0x66, 0};
259 UnicodeString
eofString(eofUString
);
260 UnicodeString
bofString(bofUString
);
261 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
262 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
263 if (usetNode
==NULL
) {
266 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
267 if (inputSet
->contains(eofString
)) {
268 addValToSet(usetNode
, 1);
270 if (inputSet
->contains(bofString
)) {
271 addValToSet(usetNode
, 2);
277 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
278 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
281 // Build the Trie table for mapping UChar32 values to the corresponding
282 // range group number
284 fTrie
= utrie_open(NULL
, // Pre-existing trie to be filled in
285 NULL
, // Data array (utrie will allocate one)
286 100000, // Max Data Length
287 0, // Initial value for all code points
288 0, // Lead surrogate unit value
289 TRUE
); // Keep Latin 1 in separately
292 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
293 utrie_setRange32(fTrie
, rlRange
->fStartChar
, rlRange
->fEndChar
+1, rlRange
->fNum
, TRUE
);
299 //-----------------------------------------------------------------------------------
301 // getTrieSize() Return the size that will be required to serialize the Trie.
303 //-----------------------------------------------------------------------------------
304 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
305 fTrieSize
= utrie_serialize(fTrie
,
309 TRUE
, // Reduce to 16 bits
311 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
316 //-----------------------------------------------------------------------------------
318 // serializeTrie() Put the serialized trie at the specified address.
319 // Trust the caller to have given us enough memory.
320 // getTrieSize() MUST be called first.
322 //-----------------------------------------------------------------------------------
323 void RBBISetBuilder::serializeTrie(uint8_t *where
) {
324 utrie_serialize(fTrie
,
326 fTrieSize
, // Capacity
328 TRUE
, // Reduce to 16 bits
332 //------------------------------------------------------------------------
334 // addValToSets Add a runtime-mapped input value to each uset from a
335 // list of uset nodes. (val corresponds to a state table column.)
336 // For each of the original Unicode sets - which correspond
337 // directly to uset nodes - a logically equivalent expression
338 // is constructed in terms of the remapped runtime input
339 // symbol set. This function adds one runtime input symbol to
342 // The "logically equivalent expression" is the tree for an
343 // or-ing together of all of the symbols that go into the set.
345 //------------------------------------------------------------------------
346 void RBBISetBuilder::addValToSets(UVector
*sets
, uint32_t val
) {
349 for (ix
=0; ix
<sets
->size(); ix
++) {
350 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
351 addValToSet(usetNode
, val
);
355 void RBBISetBuilder::addValToSet(RBBINode
*usetNode
, uint32_t val
) {
356 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
357 leafNode
->fVal
= (unsigned short)val
;
358 if (usetNode
->fLeftChild
== NULL
) {
359 usetNode
->fLeftChild
= leafNode
;
360 leafNode
->fParent
= usetNode
;
362 // There are already input symbols present for this set.
363 // Set up an OR node, with the previous stuff as the left child
364 // and the new value as the right child.
365 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
366 orNode
->fLeftChild
= usetNode
->fLeftChild
;
367 orNode
->fRightChild
= leafNode
;
368 orNode
->fLeftChild
->fParent
= orNode
;
369 orNode
->fRightChild
->fParent
= orNode
;
370 usetNode
->fLeftChild
= orNode
;
371 orNode
->fParent
= usetNode
;
376 //------------------------------------------------------------------------
378 // getNumCharCategories
380 //------------------------------------------------------------------------
381 int32_t RBBISetBuilder::getNumCharCategories() const {
382 return fGroupCount
+ 3;
386 //------------------------------------------------------------------------
390 //------------------------------------------------------------------------
391 UBool
RBBISetBuilder::sawBOF() const {
396 //------------------------------------------------------------------------
398 // getFirstChar Given a runtime RBBI character category, find
399 // the first UChar32 that is in the set of chars
401 //------------------------------------------------------------------------
402 UChar32
RBBISetBuilder::getFirstChar(int32_t category
) const {
403 RangeDescriptor
*rlRange
;
404 UChar32 retVal
= (UChar32
)-1;
405 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
406 if (rlRange
->fNum
== category
) {
407 retVal
= rlRange
->fStartChar
;
416 //------------------------------------------------------------------------
418 // printRanges A debugging function.
419 // dump out all of the range definitions.
421 //------------------------------------------------------------------------
423 void RBBISetBuilder::printRanges() {
424 RangeDescriptor
*rlRange
;
427 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
428 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
429 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
431 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
432 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
433 UnicodeString setName
= UNICODE_STRING("anon", 4);
434 RBBINode
*setRef
= usetNode
->fParent
;
435 if (setRef
!= NULL
) {
436 RBBINode
*varRef
= setRef
->fParent
;
437 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
438 setName
= varRef
->fText
;
441 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
443 RBBIDebugPrintf("\n");
449 //------------------------------------------------------------------------
451 // printRangeGroups A debugging function.
452 // dump out all of the range groups.
454 //------------------------------------------------------------------------
456 void RBBISetBuilder::printRangeGroups() {
457 RangeDescriptor
*rlRange
;
458 RangeDescriptor
*tRange
;
460 int lastPrintedGroupNum
= 0;
462 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
463 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
464 int groupNum
= rlRange
->fNum
& 0xbfff;
465 if (groupNum
> lastPrintedGroupNum
) {
466 lastPrintedGroupNum
= groupNum
;
467 RBBIDebugPrintf("%2i ", groupNum
);
469 if (rlRange
->fNum
& 0x4000) { RBBIDebugPrintf(" <DICT> ");}
471 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
472 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
473 UnicodeString setName
= UNICODE_STRING("anon", 4);
474 RBBINode
*setRef
= usetNode
->fParent
;
475 if (setRef
!= NULL
) {
476 RBBINode
*varRef
= setRef
->fParent
;
477 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
478 setName
= varRef
->fText
;
481 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
485 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
486 if (tRange
->fNum
== rlRange
->fNum
) {
488 RBBIDebugPrintf("\n ");
490 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
493 RBBIDebugPrintf("\n");
496 RBBIDebugPrintf("\n");
501 //------------------------------------------------------------------------
503 // printSets A debugging function.
504 // dump out all of the set definitions.
506 //------------------------------------------------------------------------
508 void RBBISetBuilder::printSets() {
511 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
516 UnicodeString setName
;
518 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
519 if (usetNode
== NULL
) {
523 RBBIDebugPrintf("%3d ", i
);
524 setName
= UNICODE_STRING("anonymous", 9);
525 setRef
= usetNode
->fParent
;
526 if (setRef
!= NULL
) {
527 varRef
= setRef
->fParent
;
528 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
529 setName
= varRef
->fText
;
532 RBBI_DEBUG_printUnicodeString(setName
);
533 RBBIDebugPrintf(" ");
534 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
535 RBBIDebugPrintf("\n");
536 if (usetNode
->fLeftChild
!= NULL
) {
537 usetNode
->fLeftChild
->printTree(TRUE
);
540 RBBIDebugPrintf("\n");
546 //-------------------------------------------------------------------------------------
548 // RangeDescriptor copy constructor
550 //-------------------------------------------------------------------------------------
552 RangeDescriptor::RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
) {
555 this->fStartChar
= other
.fStartChar
;
556 this->fEndChar
= other
.fEndChar
;
557 this->fNum
= other
.fNum
;
559 UErrorCode oldstatus
= status
;
560 this->fIncludesSets
= new UVector(status
);
561 if (U_FAILURE(oldstatus
)) {
564 if (U_FAILURE(status
)) {
568 if (this->fIncludesSets
== 0) {
569 status
= U_MEMORY_ALLOCATION_ERROR
;
573 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
574 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
579 //-------------------------------------------------------------------------------------
581 // RangeDesriptor default constructor
583 //-------------------------------------------------------------------------------------
584 RangeDescriptor::RangeDescriptor(UErrorCode
&status
) {
585 this->fStartChar
= 0;
589 UErrorCode oldstatus
= status
;
590 this->fIncludesSets
= new UVector(status
);
591 if (U_FAILURE(oldstatus
)) {
594 if (U_FAILURE(status
)) {
598 if(this->fIncludesSets
== 0) {
599 status
= U_MEMORY_ALLOCATION_ERROR
;
606 //-------------------------------------------------------------------------------------
608 // RangeDesriptor Destructor
610 //-------------------------------------------------------------------------------------
611 RangeDescriptor::~RangeDescriptor() {
612 delete fIncludesSets
;
613 fIncludesSets
= NULL
;
616 //-------------------------------------------------------------------------------------
618 // RangeDesriptor::split()
620 //-------------------------------------------------------------------------------------
621 void RangeDescriptor::split(UChar32 where
, UErrorCode
&status
) {
622 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
623 RangeDescriptor
*nr
= new RangeDescriptor(*this, status
);
624 if (U_FAILURE(status
)) {
629 status
= U_MEMORY_ALLOCATION_ERROR
;
633 // RangeDescriptor copy constructor copies all fields.
634 // Only need to update those that are different after the split.
635 nr
->fStartChar
= where
;
636 this->fEndChar
= where
-1;
637 nr
->fNext
= this->fNext
;
642 //-------------------------------------------------------------------------------------
644 // RangeDescriptor::setDictionaryFlag
646 // Character Category Numbers that include characters from
647 // the original Unicode Set named "dictionary" have bit 14
648 // set to 1. The RBBI runtime engine uses this to trigger
649 // use of the word dictionary.
651 // This function looks through the Unicode Sets that it
652 // (the range) includes, and sets the bit in fNum when
653 // "dictionary" is among them.
655 // TODO: a faster way would be to find the set node for
656 // "dictionary" just once, rather than looking it
657 // up by name every time.
659 //-------------------------------------------------------------------------------------
660 void RangeDescriptor::setDictionaryFlag() {
663 for (i
=0; i
<this->fIncludesSets
->size(); i
++) {
664 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
665 UnicodeString setName
;
666 RBBINode
*setRef
= usetNode
->fParent
;
667 if (setRef
!= NULL
) {
668 RBBINode
*varRef
= setRef
->fParent
;
669 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
670 setName
= varRef
->fText
;
673 if (setName
.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
674 this->fNum
|= 0x4000;
684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */