5 ***************************************************************************
6 * Copyright (C) 2002-2004 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
14 // Starting with the rules parse tree from the scanner,
16 // - Enumerate the set of UnicodeSets that are referenced
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input uniocde sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
31 #include "unicode/utypes.h"
33 #if !UCONFIG_NO_BREAK_ITERATION
35 #include "unicode/uniset.h"
46 //------------------------------------------------------------------------
48 // getFoldedRBBIValue Call-back function used during building of Trie table.
49 // Folding value: just store the offset (16 bits)
50 // if there is any non-0 entry.
51 // (It'd really be nice if the Trie builder would provide a
52 // simple default, so this function could go away from here.)
54 //------------------------------------------------------------------------
55 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
57 static uint32_t U_CALLCONV
58 getFoldedRBBIValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
65 value
=utrie_get32(trie
, start
, &inBlockZero
);
67 start
+=UTRIE_DATA_BLOCK_LENGTH
;
69 return (uint32_t)(offset
|0x8000);
84 //------------------------------------------------------------------------
88 //------------------------------------------------------------------------
89 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder
*rb
)
92 fStatus
= rb
->fStatus
;
100 //------------------------------------------------------------------------
104 //------------------------------------------------------------------------
105 RBBISetBuilder::~RBBISetBuilder()
107 RangeDescriptor
*nextRangeDesc
;
109 // Walk through & delete the linked list of RangeDescriptors
110 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
111 RangeDescriptor
*r
= nextRangeDesc
;
112 nextRangeDesc
= r
->fNext
;
122 //------------------------------------------------------------------------
124 // build Build the list of non-overlapping character ranges
125 // from the Unicode Sets.
127 //------------------------------------------------------------------------
128 void RBBISetBuilder::build() {
130 RangeDescriptor
*rlRange
;
132 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
135 // Initialize the process by creating a single range encompassing all characters
136 // that is in no sets.
138 fRangeList
= new RangeDescriptor(*fStatus
); // will check for status here
139 fRangeList
->fStartChar
= 0;
140 fRangeList
->fEndChar
= 0x10ffff;
142 if (U_FAILURE(*fStatus
)) {
147 // Find the set of non-overlapping ranges of characters
151 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
152 if (usetNode
==NULL
) {
156 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
157 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
158 int inputSetRangeIndex
= 0;
159 rlRange
= fRangeList
;
162 if (inputSetRangeIndex
>= inputSetRangeCount
) {
165 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
166 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
168 // skip over ranges from the range list that are completely
169 // below the current range from the input unicode set.
170 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
171 rlRange
= rlRange
->fNext
;
174 // If the start of the range from the range list is before with
175 // the start of the range from the unicode set, split the range list range
176 // in two, with one part being before (wholly outside of) the unicode set
177 // and the other containing the rest.
178 // Then continue the loop; the post-split current range will then be skipped
180 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
181 rlRange
->split(inputSetRangeBegin
, *fStatus
);
182 if (U_FAILURE(*fStatus
)) {
188 // Same thing at the end of the ranges...
189 // If the end of the range from the range list doesn't coincide with
190 // the end of the range from the unicode set, split the range list
191 // range in two. The first part of the split range will be
192 // wholly inside the Unicode set.
193 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
194 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
195 if (U_FAILURE(*fStatus
)) {
200 // The current rlRange is now entirely within the UnicodeSet range.
201 // Add this unicode set to the list of sets for this rlRange
202 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
203 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
204 if (U_FAILURE(*fStatus
)) {
209 // Advance over ranges that we are finished with.
210 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
211 inputSetRangeIndex
++;
213 rlRange
= rlRange
->fNext
;
217 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
220 // Group the above ranges, with each group consisting of one or more
221 // ranges that are in exactly the same set of original UnicodeSets.
222 // The groups are numbered, and these group numbers are the set of
223 // input symbols recognized by the run-time state machine.
225 RangeDescriptor
*rlSearchRange
;
226 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
227 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
228 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
229 rlRange
->fNum
= rlSearchRange
->fNum
;
233 if (rlRange
->fNum
== 0) {
235 rlRange
->fNum
= fGroupCount
;
236 rlRange
->setDictionaryFlag();
237 addValToSets(rlRange
->fIncludesSets
, fGroupCount
);
241 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
242 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
245 // Build the Trie table for mapping UChar32 values to the corresponding
246 // range group number
248 fTrie
= utrie_open(NULL
, // Pre-existing trie to be filled in
249 NULL
, // Data array (utrie will allocate one)
250 100000, // Max Data Length
251 0, // Initial value for all code points
252 0, // Lead surrogate unit value
253 TRUE
); // Keep Latin 1 in separately
256 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
257 utrie_setRange32(fTrie
, rlRange
->fStartChar
, rlRange
->fEndChar
+1, rlRange
->fNum
, TRUE
);
263 //-----------------------------------------------------------------------------------
265 // getTrieSize() Return the size that will be required to serialize the Trie.
267 //-----------------------------------------------------------------------------------
268 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
269 fTrieSize
= utrie_serialize(fTrie
,
273 TRUE
, // Reduce to 16 bits
275 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
280 //-----------------------------------------------------------------------------------
282 // serializeTrie() Put the serialized trie at the specified address.
283 // Trust the caller to have given us enough memory.
284 // getTrieSize() MUST be called first.
286 //-----------------------------------------------------------------------------------
287 void RBBISetBuilder::serializeTrie(uint8_t *where
) {
288 utrie_serialize(fTrie
,
290 fTrieSize
, // Capacity
292 TRUE
, // Reduce to 16 bits
296 //------------------------------------------------------------------------
298 // addValToSets Add a runtime-mapped input value to each uset from a
299 // list of uset nodes.
300 // For each of the original Unicode sets - which correspond
301 // directly to uset nodes - a logically equivalent expression
302 // is constructed in terms of the remapped runtime input
303 // symbol set. This function adds one runtime input symbol to
306 // The "logically equivalent expression" is the tree for an
307 // or-ing together of all of the symbols that go into the set.
309 //------------------------------------------------------------------------
310 void RBBISetBuilder::addValToSets(UVector
*sets
, uint32_t val
) {
313 for (ix
=0; ix
<sets
->size(); ix
++) {
314 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
315 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
316 leafNode
->fVal
= (unsigned short)val
;
317 if (usetNode
->fLeftChild
== NULL
) {
318 usetNode
->fLeftChild
= leafNode
;
319 leafNode
->fParent
= usetNode
;
321 // There are already input symbols present for this set.
322 // Set up an OR node, with the previous stuff as the left child
323 // and the new value as the right child.
324 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
325 orNode
->fLeftChild
= usetNode
->fLeftChild
;
326 orNode
->fRightChild
= leafNode
;
327 orNode
->fLeftChild
->fParent
= orNode
;
328 orNode
->fRightChild
->fParent
= orNode
;
329 usetNode
->fLeftChild
= orNode
;
330 orNode
->fParent
= usetNode
;
337 //------------------------------------------------------------------------
341 //------------------------------------------------------------------------
342 int32_t RBBISetBuilder::getNumCharCategories() const {
343 return fGroupCount
+ 1;
348 //------------------------------------------------------------------------
350 // getFirstChar Given a runtime RBBI character category, find
351 // the first UChar32 that is in the set of chars
353 //------------------------------------------------------------------------
354 UChar32
RBBISetBuilder::getFirstChar(int32_t category
) const {
355 RangeDescriptor
*rlRange
;
356 UChar32 retVal
= (UChar32
)-1;
357 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
358 if (rlRange
->fNum
== category
) {
359 retVal
= rlRange
->fStartChar
;
368 //------------------------------------------------------------------------
370 // printRanges A debugging function.
371 // dump out all of the range definitions.
373 //------------------------------------------------------------------------
375 void RBBISetBuilder::printRanges() {
376 RangeDescriptor
*rlRange
;
379 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
380 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
381 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
383 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
384 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
385 UnicodeString setName
= UNICODE_STRING("anon", 4);
386 RBBINode
*setRef
= usetNode
->fParent
;
387 if (setRef
!= NULL
) {
388 RBBINode
*varRef
= setRef
->fParent
;
389 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
390 setName
= varRef
->fText
;
393 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
395 RBBIDebugPrintf("\n");
401 //------------------------------------------------------------------------
403 // printRangeGroups A debugging function.
404 // dump out all of the range groups.
406 //------------------------------------------------------------------------
408 void RBBISetBuilder::printRangeGroups() {
409 RangeDescriptor
*rlRange
;
410 RangeDescriptor
*tRange
;
412 int lastPrintedGroupNum
= 0;
414 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
415 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
416 int groupNum
= rlRange
->fNum
& 0xbfff;
417 if (groupNum
> lastPrintedGroupNum
) {
418 lastPrintedGroupNum
= groupNum
;
419 RBBIDebugPrintf("%2i ", groupNum
);
421 if (rlRange
->fNum
& 0x4000) { RBBIDebugPrintf(" <DICT> ");}
423 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
424 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
425 UnicodeString setName
= UNICODE_STRING("anon", 4);
426 RBBINode
*setRef
= usetNode
->fParent
;
427 if (setRef
!= NULL
) {
428 RBBINode
*varRef
= setRef
->fParent
;
429 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
430 setName
= varRef
->fText
;
433 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
437 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
438 if (tRange
->fNum
== rlRange
->fNum
) {
440 RBBIDebugPrintf("\n ");
442 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
445 RBBIDebugPrintf("\n");
448 RBBIDebugPrintf("\n");
453 //------------------------------------------------------------------------
455 // printSets A debugging function.
456 // dump out all of the set definitions.
458 //------------------------------------------------------------------------
460 void RBBISetBuilder::printSets() {
463 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
468 UnicodeString setName
;
470 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
471 if (usetNode
== NULL
) {
475 RBBIDebugPrintf("%3d ", i
);
476 setName
= UNICODE_STRING("anonymous", 9);
477 setRef
= usetNode
->fParent
;
478 if (setRef
!= NULL
) {
479 varRef
= setRef
->fParent
;
480 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
481 setName
= varRef
->fText
;
484 RBBI_DEBUG_printUnicodeString(setName
);
485 RBBIDebugPrintf(" ");
486 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
487 RBBIDebugPrintf("\n");
488 if (usetNode
->fLeftChild
!= NULL
) {
489 usetNode
->fLeftChild
->printTree(TRUE
);
492 RBBIDebugPrintf("\n");
498 //-------------------------------------------------------------------------------------
500 // RangeDescriptor copy constructor
502 //-------------------------------------------------------------------------------------
504 RangeDescriptor::RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
) {
507 this->fStartChar
= other
.fStartChar
;
508 this->fEndChar
= other
.fEndChar
;
509 this->fNum
= other
.fNum
;
511 UErrorCode oldstatus
= status
;
512 this->fIncludesSets
= new UVector(status
);
513 if (U_FAILURE(oldstatus
)) {
516 if (U_FAILURE(status
)) {
520 if (this->fIncludesSets
== 0) {
521 status
= U_MEMORY_ALLOCATION_ERROR
;
525 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
526 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
531 //-------------------------------------------------------------------------------------
533 // RangeDesriptor default constructor
535 //-------------------------------------------------------------------------------------
536 RangeDescriptor::RangeDescriptor(UErrorCode
&status
) {
537 this->fStartChar
= 0;
541 UErrorCode oldstatus
= status
;
542 this->fIncludesSets
= new UVector(status
);
543 if (U_FAILURE(oldstatus
)) {
546 if (U_FAILURE(status
)) {
550 if(this->fIncludesSets
== 0) {
551 status
= U_MEMORY_ALLOCATION_ERROR
;
558 //-------------------------------------------------------------------------------------
560 // RangeDesriptor Destructor
562 //-------------------------------------------------------------------------------------
563 RangeDescriptor::~RangeDescriptor() {
564 delete fIncludesSets
;
565 fIncludesSets
= NULL
;
568 //-------------------------------------------------------------------------------------
570 // RangeDesriptor::split()
572 //-------------------------------------------------------------------------------------
573 void RangeDescriptor::split(UChar32 where
, UErrorCode
&status
) {
574 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
575 RangeDescriptor
*nr
= new RangeDescriptor(*this, status
);
576 if (U_FAILURE(status
)) {
581 status
= U_MEMORY_ALLOCATION_ERROR
;
585 // RangeDescriptor copy constructor copies all fields.
586 // Only need to update those that are different after the split.
587 nr
->fStartChar
= where
;
588 this->fEndChar
= where
-1;
589 nr
->fNext
= this->fNext
;
594 //-------------------------------------------------------------------------------------
596 // RangeDescriptor::setDictionaryFlag
598 // Character Category Numbers that include characters from
599 // the original Unicode Set named "dictionary" have bit 14
600 // set to 1. The RBBI runtime engine uses this to trigger
601 // use of the word dictionary.
603 // This function looks through the Unicode Sets that it
604 // (the range) includes, and sets the bit in fNum when
605 // "dictionary" is among them.
607 // TODO: a faster way would be to find the set node for
608 // "dictionary" just once, rather than looking it
609 // up by name every time.
611 //-------------------------------------------------------------------------------------
612 void RangeDescriptor::setDictionaryFlag() {
615 for (i
=0; i
<this->fIncludesSets
->size(); i
++) {
616 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
617 UnicodeString setName
;
618 RBBINode
*setRef
= usetNode
->fParent
;
619 if (setRef
!= NULL
) {
620 RBBINode
*varRef
= setRef
->fParent
;
621 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
622 setName
= varRef
->fText
;
625 if (setName
.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
626 this->fNum
|= 0x4000;
636 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */