5 ***************************************************************************
6 * Copyright (C) 2002-2008 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
14 // Starting with the rules parse tree from the scanner,
16 // - Enumerate the set of UnicodeSets that are referenced
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input uniocde sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
31 #include "unicode/utypes.h"
33 #if !UCONFIG_NO_BREAK_ITERATION
35 #include "unicode/uniset.h"
46 //------------------------------------------------------------------------
48 // getFoldedRBBIValue Call-back function used during building of Trie table.
49 // Folding value: just store the offset (16 bits)
50 // if there is any non-0 entry.
51 // (It'd really be nice if the Trie builder would provide a
52 // simple default, so this function could go away from here.)
54 //------------------------------------------------------------------------
55 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
57 static uint32_t U_CALLCONV
58 getFoldedRBBIValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
65 value
=utrie_get32(trie
, start
, &inBlockZero
);
67 start
+=UTRIE_DATA_BLOCK_LENGTH
;
69 return (uint32_t)(offset
|0x8000);
84 //------------------------------------------------------------------------
88 //------------------------------------------------------------------------
89 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder
*rb
)
92 fStatus
= rb
->fStatus
;
101 //------------------------------------------------------------------------
105 //------------------------------------------------------------------------
106 RBBISetBuilder::~RBBISetBuilder()
108 RangeDescriptor
*nextRangeDesc
;
110 // Walk through & delete the linked list of RangeDescriptors
111 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
112 RangeDescriptor
*r
= nextRangeDesc
;
113 nextRangeDesc
= r
->fNext
;
123 //------------------------------------------------------------------------
125 // build Build the list of non-overlapping character ranges
126 // from the Unicode Sets.
128 //------------------------------------------------------------------------
129 void RBBISetBuilder::build() {
131 RangeDescriptor
*rlRange
;
133 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
136 // Initialize the process by creating a single range encompassing all characters
137 // that is in no sets.
139 fRangeList
= new RangeDescriptor(*fStatus
); // will check for status here
140 if (fRangeList
== NULL
) {
141 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
144 fRangeList
->fStartChar
= 0;
145 fRangeList
->fEndChar
= 0x10ffff;
147 if (U_FAILURE(*fStatus
)) {
152 // Find the set of non-overlapping ranges of characters
155 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
156 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
157 if (usetNode
==NULL
) {
161 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
162 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
163 int inputSetRangeIndex
= 0;
164 rlRange
= fRangeList
;
167 if (inputSetRangeIndex
>= inputSetRangeCount
) {
170 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
171 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
173 // skip over ranges from the range list that are completely
174 // below the current range from the input unicode set.
175 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
176 rlRange
= rlRange
->fNext
;
179 // If the start of the range from the range list is before with
180 // the start of the range from the unicode set, split the range list range
181 // in two, with one part being before (wholly outside of) the unicode set
182 // and the other containing the rest.
183 // Then continue the loop; the post-split current range will then be skipped
185 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
186 rlRange
->split(inputSetRangeBegin
, *fStatus
);
187 if (U_FAILURE(*fStatus
)) {
193 // Same thing at the end of the ranges...
194 // If the end of the range from the range list doesn't coincide with
195 // the end of the range from the unicode set, split the range list
196 // range in two. The first part of the split range will be
197 // wholly inside the Unicode set.
198 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
199 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
200 if (U_FAILURE(*fStatus
)) {
205 // The current rlRange is now entirely within the UnicodeSet range.
206 // Add this unicode set to the list of sets for this rlRange
207 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
208 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
209 if (U_FAILURE(*fStatus
)) {
214 // Advance over ranges that we are finished with.
215 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
216 inputSetRangeIndex
++;
218 rlRange
= rlRange
->fNext
;
222 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
225 // Group the above ranges, with each group consisting of one or more
226 // ranges that are in exactly the same set of original UnicodeSets.
227 // The groups are numbered, and these group numbers are the set of
228 // input symbols recognized by the run-time state machine.
230 // Numbering: # 0 (state table column 0) is unused.
231 // # 1 is reserved - table column 1 is for end-of-input
232 // # 2 is reserved - table column 2 is for beginning-in-input
233 // # 3 is the first range list.
235 RangeDescriptor
*rlSearchRange
;
236 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
237 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
238 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
239 rlRange
->fNum
= rlSearchRange
->fNum
;
243 if (rlRange
->fNum
== 0) {
245 rlRange
->fNum
= fGroupCount
+2;
246 rlRange
->setDictionaryFlag();
247 addValToSets(rlRange
->fIncludesSets
, fGroupCount
+2);
251 // Handle input sets that contain the special string {eof}.
252 // Column 1 of the state table is reserved for EOF on input.
253 // Column 2 is reserved for before-the-start-input.
254 // (This column can be optimized away later if there are no rule
255 // references to {bof}.)
256 // Add this column value (1 or 2) to the equivalent expression
257 // subtree for each UnicodeSet that contains the string {eof}
258 // Because {bof} and {eof} are not a characters in the normal sense,
259 // they doesn't affect the computation of ranges or TRIE.
260 static const UChar eofUString
[] = {0x65, 0x6f, 0x66, 0};
261 static const UChar bofUString
[] = {0x62, 0x6f, 0x66, 0};
263 UnicodeString
eofString(eofUString
);
264 UnicodeString
bofString(bofUString
);
265 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
266 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
267 if (usetNode
==NULL
) {
270 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
271 if (inputSet
->contains(eofString
)) {
272 addValToSet(usetNode
, 1);
274 if (inputSet
->contains(bofString
)) {
275 addValToSet(usetNode
, 2);
281 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
282 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
285 // Build the Trie table for mapping UChar32 values to the corresponding
286 // range group number
288 fTrie
= utrie_open(NULL
, // Pre-existing trie to be filled in
289 NULL
, // Data array (utrie will allocate one)
290 100000, // Max Data Length
291 0, // Initial value for all code points
292 0, // Lead surrogate unit value
293 TRUE
); // Keep Latin 1 in separately
296 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
297 utrie_setRange32(fTrie
, rlRange
->fStartChar
, rlRange
->fEndChar
+1, rlRange
->fNum
, TRUE
);
303 //-----------------------------------------------------------------------------------
305 // getTrieSize() Return the size that will be required to serialize the Trie.
307 //-----------------------------------------------------------------------------------
308 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
309 fTrieSize
= utrie_serialize(fTrie
,
313 TRUE
, // Reduce to 16 bits
315 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
320 //-----------------------------------------------------------------------------------
322 // serializeTrie() Put the serialized trie at the specified address.
323 // Trust the caller to have given us enough memory.
324 // getTrieSize() MUST be called first.
326 //-----------------------------------------------------------------------------------
327 void RBBISetBuilder::serializeTrie(uint8_t *where
) {
328 utrie_serialize(fTrie
,
330 fTrieSize
, // Capacity
332 TRUE
, // Reduce to 16 bits
336 //------------------------------------------------------------------------
338 // addValToSets Add a runtime-mapped input value to each uset from a
339 // list of uset nodes. (val corresponds to a state table column.)
340 // For each of the original Unicode sets - which correspond
341 // directly to uset nodes - a logically equivalent expression
342 // is constructed in terms of the remapped runtime input
343 // symbol set. This function adds one runtime input symbol to
346 // The "logically equivalent expression" is the tree for an
347 // or-ing together of all of the symbols that go into the set.
349 //------------------------------------------------------------------------
350 void RBBISetBuilder::addValToSets(UVector
*sets
, uint32_t val
) {
353 for (ix
=0; ix
<sets
->size(); ix
++) {
354 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
355 addValToSet(usetNode
, val
);
359 void RBBISetBuilder::addValToSet(RBBINode
*usetNode
, uint32_t val
) {
360 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
361 if (leafNode
== NULL
) {
362 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
365 leafNode
->fVal
= (unsigned short)val
;
366 if (usetNode
->fLeftChild
== NULL
) {
367 usetNode
->fLeftChild
= leafNode
;
368 leafNode
->fParent
= usetNode
;
370 // There are already input symbols present for this set.
371 // Set up an OR node, with the previous stuff as the left child
372 // and the new value as the right child.
373 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
374 if (orNode
== NULL
) {
375 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
378 orNode
->fLeftChild
= usetNode
->fLeftChild
;
379 orNode
->fRightChild
= leafNode
;
380 orNode
->fLeftChild
->fParent
= orNode
;
381 orNode
->fRightChild
->fParent
= orNode
;
382 usetNode
->fLeftChild
= orNode
;
383 orNode
->fParent
= usetNode
;
388 //------------------------------------------------------------------------
390 // getNumCharCategories
392 //------------------------------------------------------------------------
393 int32_t RBBISetBuilder::getNumCharCategories() const {
394 return fGroupCount
+ 3;
398 //------------------------------------------------------------------------
402 //------------------------------------------------------------------------
403 UBool
RBBISetBuilder::sawBOF() const {
408 //------------------------------------------------------------------------
410 // getFirstChar Given a runtime RBBI character category, find
411 // the first UChar32 that is in the set of chars
413 //------------------------------------------------------------------------
414 UChar32
RBBISetBuilder::getFirstChar(int32_t category
) const {
415 RangeDescriptor
*rlRange
;
416 UChar32 retVal
= (UChar32
)-1;
417 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
418 if (rlRange
->fNum
== category
) {
419 retVal
= rlRange
->fStartChar
;
428 //------------------------------------------------------------------------
430 // printRanges A debugging function.
431 // dump out all of the range definitions.
433 //------------------------------------------------------------------------
435 void RBBISetBuilder::printRanges() {
436 RangeDescriptor
*rlRange
;
439 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
440 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
441 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
443 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
444 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
445 UnicodeString setName
= UNICODE_STRING("anon", 4);
446 RBBINode
*setRef
= usetNode
->fParent
;
447 if (setRef
!= NULL
) {
448 RBBINode
*varRef
= setRef
->fParent
;
449 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
450 setName
= varRef
->fText
;
453 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
455 RBBIDebugPrintf("\n");
461 //------------------------------------------------------------------------
463 // printRangeGroups A debugging function.
464 // dump out all of the range groups.
466 //------------------------------------------------------------------------
468 void RBBISetBuilder::printRangeGroups() {
469 RangeDescriptor
*rlRange
;
470 RangeDescriptor
*tRange
;
472 int lastPrintedGroupNum
= 0;
474 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
475 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
476 int groupNum
= rlRange
->fNum
& 0xbfff;
477 if (groupNum
> lastPrintedGroupNum
) {
478 lastPrintedGroupNum
= groupNum
;
479 RBBIDebugPrintf("%2i ", groupNum
);
481 if (rlRange
->fNum
& 0x4000) { RBBIDebugPrintf(" <DICT> ");}
483 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
484 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
485 UnicodeString setName
= UNICODE_STRING("anon", 4);
486 RBBINode
*setRef
= usetNode
->fParent
;
487 if (setRef
!= NULL
) {
488 RBBINode
*varRef
= setRef
->fParent
;
489 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
490 setName
= varRef
->fText
;
493 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
497 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
498 if (tRange
->fNum
== rlRange
->fNum
) {
500 RBBIDebugPrintf("\n ");
502 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
505 RBBIDebugPrintf("\n");
508 RBBIDebugPrintf("\n");
513 //------------------------------------------------------------------------
515 // printSets A debugging function.
516 // dump out all of the set definitions.
518 //------------------------------------------------------------------------
520 void RBBISetBuilder::printSets() {
523 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
528 UnicodeString setName
;
530 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
531 if (usetNode
== NULL
) {
535 RBBIDebugPrintf("%3d ", i
);
536 setName
= UNICODE_STRING("anonymous", 9);
537 setRef
= usetNode
->fParent
;
538 if (setRef
!= NULL
) {
539 varRef
= setRef
->fParent
;
540 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
541 setName
= varRef
->fText
;
544 RBBI_DEBUG_printUnicodeString(setName
);
545 RBBIDebugPrintf(" ");
546 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
547 RBBIDebugPrintf("\n");
548 if (usetNode
->fLeftChild
!= NULL
) {
549 usetNode
->fLeftChild
->printTree(TRUE
);
552 RBBIDebugPrintf("\n");
558 //-------------------------------------------------------------------------------------
560 // RangeDescriptor copy constructor
562 //-------------------------------------------------------------------------------------
564 RangeDescriptor::RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
) {
567 this->fStartChar
= other
.fStartChar
;
568 this->fEndChar
= other
.fEndChar
;
569 this->fNum
= other
.fNum
;
571 UErrorCode oldstatus
= status
;
572 this->fIncludesSets
= new UVector(status
);
573 if (U_FAILURE(oldstatus
)) {
576 if (U_FAILURE(status
)) {
580 if (this->fIncludesSets
== 0) {
581 status
= U_MEMORY_ALLOCATION_ERROR
;
585 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
586 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
591 //-------------------------------------------------------------------------------------
593 // RangeDesriptor default constructor
595 //-------------------------------------------------------------------------------------
596 RangeDescriptor::RangeDescriptor(UErrorCode
&status
) {
597 this->fStartChar
= 0;
601 UErrorCode oldstatus
= status
;
602 this->fIncludesSets
= new UVector(status
);
603 if (U_FAILURE(oldstatus
)) {
606 if (U_FAILURE(status
)) {
610 if(this->fIncludesSets
== 0) {
611 status
= U_MEMORY_ALLOCATION_ERROR
;
618 //-------------------------------------------------------------------------------------
620 // RangeDesriptor Destructor
622 //-------------------------------------------------------------------------------------
623 RangeDescriptor::~RangeDescriptor() {
624 delete fIncludesSets
;
625 fIncludesSets
= NULL
;
628 //-------------------------------------------------------------------------------------
630 // RangeDesriptor::split()
632 //-------------------------------------------------------------------------------------
633 void RangeDescriptor::split(UChar32 where
, UErrorCode
&status
) {
634 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
635 RangeDescriptor
*nr
= new RangeDescriptor(*this, status
);
637 status
= U_MEMORY_ALLOCATION_ERROR
;
640 if (U_FAILURE(status
)) {
644 // RangeDescriptor copy constructor copies all fields.
645 // Only need to update those that are different after the split.
646 nr
->fStartChar
= where
;
647 this->fEndChar
= where
-1;
648 nr
->fNext
= this->fNext
;
653 //-------------------------------------------------------------------------------------
655 // RangeDescriptor::setDictionaryFlag
657 // Character Category Numbers that include characters from
658 // the original Unicode Set named "dictionary" have bit 14
659 // set to 1. The RBBI runtime engine uses this to trigger
660 // use of the word dictionary.
662 // This function looks through the Unicode Sets that it
663 // (the range) includes, and sets the bit in fNum when
664 // "dictionary" is among them.
666 // TODO: a faster way would be to find the set node for
667 // "dictionary" just once, rather than looking it
668 // up by name every time.
670 //-------------------------------------------------------------------------------------
671 void RangeDescriptor::setDictionaryFlag() {
674 for (i
=0; i
<this->fIncludesSets
->size(); i
++) {
675 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
676 UnicodeString setName
;
677 RBBINode
*setRef
= usetNode
->fParent
;
678 if (setRef
!= NULL
) {
679 RBBINode
*varRef
= setRef
->fParent
;
680 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
681 setName
= varRef
->fText
;
684 if (setName
.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
685 this->fNum
|= 0x4000;
695 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */