1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
7 ***************************************************************************
8 * Copyright (C) 2002-2008 International Business Machines Corporation *
9 * and others. All rights reserved. *
10 ***************************************************************************
13 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
14 // (part of the rule building process.)
16 // Starting with the rules parse tree from the scanner,
18 // - Enumerate the set of UnicodeSets that are referenced
20 // - compute a set of non-overlapping character ranges
21 // with all characters within a range belonging to the same
22 // set of input uniocde sets.
23 // - Derive a set of non-overlapping UnicodeSet (like things)
24 // that will correspond to columns in the state table for
25 // the RBBI execution engine. All characters within one
26 // of these sets belong to the same set of the original
27 // UnicodeSets from the user's rules.
28 // - construct the trie table that maps input characters
29 // to the index of the matching non-overlapping set of set from
33 #include "unicode/utypes.h"
35 #if !UCONFIG_NO_BREAK_ITERATION
37 #include "unicode/uniset.h"
49 //------------------------------------------------------------------------
53 //------------------------------------------------------------------------
54 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder
*rb
)
57 fStatus
= rb
->fStatus
;
66 //------------------------------------------------------------------------
70 //------------------------------------------------------------------------
71 RBBISetBuilder::~RBBISetBuilder()
73 RangeDescriptor
*nextRangeDesc
;
75 // Walk through & delete the linked list of RangeDescriptors
76 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
77 RangeDescriptor
*r
= nextRangeDesc
;
78 nextRangeDesc
= r
->fNext
;
88 //------------------------------------------------------------------------
90 // build Build the list of non-overlapping character ranges
91 // from the Unicode Sets.
93 //------------------------------------------------------------------------
94 void RBBISetBuilder::buildRanges() {
96 RangeDescriptor
*rlRange
;
98 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
101 // Initialize the process by creating a single range encompassing all characters
102 // that is in no sets.
104 fRangeList
= new RangeDescriptor(*fStatus
); // will check for status here
105 if (fRangeList
== NULL
) {
106 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
109 fRangeList
->fStartChar
= 0;
110 fRangeList
->fEndChar
= 0x10ffff;
112 if (U_FAILURE(*fStatus
)) {
117 // Find the set of non-overlapping ranges of characters
120 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
121 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
122 if (usetNode
==NULL
) {
126 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
127 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
128 int inputSetRangeIndex
= 0;
129 rlRange
= fRangeList
;
132 if (inputSetRangeIndex
>= inputSetRangeCount
) {
135 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
136 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
138 // skip over ranges from the range list that are completely
139 // below the current range from the input unicode set.
140 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
141 rlRange
= rlRange
->fNext
;
144 // If the start of the range from the range list is before with
145 // the start of the range from the unicode set, split the range list range
146 // in two, with one part being before (wholly outside of) the unicode set
147 // and the other containing the rest.
148 // Then continue the loop; the post-split current range will then be skipped
150 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
151 rlRange
->split(inputSetRangeBegin
, *fStatus
);
152 if (U_FAILURE(*fStatus
)) {
158 // Same thing at the end of the ranges...
159 // If the end of the range from the range list doesn't coincide with
160 // the end of the range from the unicode set, split the range list
161 // range in two. The first part of the split range will be
162 // wholly inside the Unicode set.
163 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
164 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
165 if (U_FAILURE(*fStatus
)) {
170 // The current rlRange is now entirely within the UnicodeSet range.
171 // Add this unicode set to the list of sets for this rlRange
172 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
173 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
174 if (U_FAILURE(*fStatus
)) {
179 // Advance over ranges that we are finished with.
180 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
181 inputSetRangeIndex
++;
183 rlRange
= rlRange
->fNext
;
187 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
190 // Group the above ranges, with each group consisting of one or more
191 // ranges that are in exactly the same set of original UnicodeSets.
192 // The groups are numbered, and these group numbers are the set of
193 // input symbols recognized by the run-time state machine.
195 // Numbering: # 0 (state table column 0) is unused.
196 // # 1 is reserved - table column 1 is for end-of-input
197 // # 2 is reserved - table column 2 is for beginning-in-input
198 // # 3 is the first range list.
200 RangeDescriptor
*rlSearchRange
;
201 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
202 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
203 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
204 rlRange
->fNum
= rlSearchRange
->fNum
;
208 if (rlRange
->fNum
== 0) {
210 rlRange
->fNum
= fGroupCount
+2;
211 rlRange
->setDictionaryFlag();
212 addValToSets(rlRange
->fIncludesSets
, fGroupCount
+2);
216 // Handle input sets that contain the special string {eof}.
217 // Column 1 of the state table is reserved for EOF on input.
218 // Column 2 is reserved for before-the-start-input.
219 // (This column can be optimized away later if there are no rule
220 // references to {bof}.)
221 // Add this column value (1 or 2) to the equivalent expression
222 // subtree for each UnicodeSet that contains the string {eof}
223 // Because {bof} and {eof} are not a characters in the normal sense,
224 // they doesn't affect the computation of ranges or TRIE.
225 static const UChar eofUString
[] = {0x65, 0x6f, 0x66, 0};
226 static const UChar bofUString
[] = {0x62, 0x6f, 0x66, 0};
228 UnicodeString
eofString(eofUString
);
229 UnicodeString
bofString(bofUString
);
230 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
231 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
232 if (usetNode
==NULL
) {
235 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
236 if (inputSet
->contains(eofString
)) {
237 addValToSet(usetNode
, 1);
239 if (inputSet
->contains(bofString
)) {
240 addValToSet(usetNode
, 2);
246 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
247 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
252 // Build the Trie table for mapping UChar32 values to the corresponding
253 // range group number.
255 void RBBISetBuilder::buildTrie() {
256 RangeDescriptor
*rlRange
;
258 fTrie
= utrie2_open(0, // Initial value for all code points.
259 0, // Error value for out-of-range input.
262 for (rlRange
= fRangeList
; rlRange
!=0 && U_SUCCESS(*fStatus
); rlRange
=rlRange
->fNext
) {
263 utrie2_setRange32(fTrie
,
264 rlRange
->fStartChar
, // Range start
265 rlRange
->fEndChar
, // Range end (inclusive)
266 rlRange
->fNum
, // value for range
267 TRUE
, // Overwrite previously written values
273 void RBBISetBuilder::mergeCategories(IntPair categories
) {
274 U_ASSERT(categories
.first
>= 1);
275 U_ASSERT(categories
.second
> categories
.first
);
276 for (RangeDescriptor
*rd
= fRangeList
; rd
!= nullptr; rd
= rd
->fNext
) {
277 int32_t rangeNum
= rd
->fNum
& ~DICT_BIT
;
278 int32_t rangeDict
= rd
->fNum
& DICT_BIT
;
279 if (rangeNum
== categories
.second
) {
280 rd
->fNum
= categories
.first
| rangeDict
;
281 } else if (rangeNum
> categories
.second
) {
289 //-----------------------------------------------------------------------------------
291 // getTrieSize() Return the size that will be required to serialize the Trie.
293 //-----------------------------------------------------------------------------------
294 int32_t RBBISetBuilder::getTrieSize() {
295 if (U_FAILURE(*fStatus
)) {
298 utrie2_freeze(fTrie
, UTRIE2_16_VALUE_BITS
, fStatus
);
299 fTrieSize
= utrie2_serialize(fTrie
,
303 if (*fStatus
== U_BUFFER_OVERFLOW_ERROR
) {
304 *fStatus
= U_ZERO_ERROR
;
306 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
311 //-----------------------------------------------------------------------------------
313 // serializeTrie() Put the serialized trie at the specified address.
314 // Trust the caller to have given us enough memory.
315 // getTrieSize() MUST be called first.
317 //-----------------------------------------------------------------------------------
318 void RBBISetBuilder::serializeTrie(uint8_t *where
) {
319 utrie2_serialize(fTrie
,
321 fTrieSize
, // Capacity
325 //------------------------------------------------------------------------
327 // addValToSets Add a runtime-mapped input value to each uset from a
328 // list of uset nodes. (val corresponds to a state table column.)
329 // For each of the original Unicode sets - which correspond
330 // directly to uset nodes - a logically equivalent expression
331 // is constructed in terms of the remapped runtime input
332 // symbol set. This function adds one runtime input symbol to
335 // The "logically equivalent expression" is the tree for an
336 // or-ing together of all of the symbols that go into the set.
338 //------------------------------------------------------------------------
339 void RBBISetBuilder::addValToSets(UVector
*sets
, uint32_t val
) {
342 for (ix
=0; ix
<sets
->size(); ix
++) {
343 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
344 addValToSet(usetNode
, val
);
348 void RBBISetBuilder::addValToSet(RBBINode
*usetNode
, uint32_t val
) {
349 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
350 if (leafNode
== NULL
) {
351 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
354 leafNode
->fVal
= (unsigned short)val
;
355 if (usetNode
->fLeftChild
== NULL
) {
356 usetNode
->fLeftChild
= leafNode
;
357 leafNode
->fParent
= usetNode
;
359 // There are already input symbols present for this set.
360 // Set up an OR node, with the previous stuff as the left child
361 // and the new value as the right child.
362 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
363 if (orNode
== NULL
) {
364 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
367 orNode
->fLeftChild
= usetNode
->fLeftChild
;
368 orNode
->fRightChild
= leafNode
;
369 orNode
->fLeftChild
->fParent
= orNode
;
370 orNode
->fRightChild
->fParent
= orNode
;
371 usetNode
->fLeftChild
= orNode
;
372 orNode
->fParent
= usetNode
;
377 //------------------------------------------------------------------------
379 // getNumCharCategories
381 //------------------------------------------------------------------------
382 int32_t RBBISetBuilder::getNumCharCategories() const {
383 return fGroupCount
+ 3;
387 //------------------------------------------------------------------------
391 //------------------------------------------------------------------------
392 UBool
RBBISetBuilder::sawBOF() const {
397 //------------------------------------------------------------------------
399 // getFirstChar Given a runtime RBBI character category, find
400 // the first UChar32 that is in the set of chars
402 //------------------------------------------------------------------------
403 UChar32
RBBISetBuilder::getFirstChar(int32_t category
) const {
404 RangeDescriptor
*rlRange
;
405 UChar32 retVal
= (UChar32
)-1;
406 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
407 if (rlRange
->fNum
== category
) {
408 retVal
= rlRange
->fStartChar
;
417 //------------------------------------------------------------------------
419 // printRanges A debugging function.
420 // dump out all of the range definitions.
422 //------------------------------------------------------------------------
424 void RBBISetBuilder::printRanges() {
425 RangeDescriptor
*rlRange
;
428 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
429 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
430 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
432 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
433 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
434 UnicodeString setName
= UNICODE_STRING("anon", 4);
435 RBBINode
*setRef
= usetNode
->fParent
;
436 if (setRef
!= NULL
) {
437 RBBINode
*varRef
= setRef
->fParent
;
438 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
439 setName
= varRef
->fText
;
442 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
444 RBBIDebugPrintf("\n");
450 //------------------------------------------------------------------------
452 // printRangeGroups A debugging function.
453 // dump out all of the range groups.
455 //------------------------------------------------------------------------
457 void RBBISetBuilder::printRangeGroups() {
458 RangeDescriptor
*rlRange
;
459 RangeDescriptor
*tRange
;
461 int lastPrintedGroupNum
= 0;
463 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
464 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
465 int groupNum
= rlRange
->fNum
& 0xbfff;
466 if (groupNum
> lastPrintedGroupNum
) {
467 lastPrintedGroupNum
= groupNum
;
468 RBBIDebugPrintf("%2i ", groupNum
);
470 if (rlRange
->fNum
& DICT_BIT
) { RBBIDebugPrintf(" <DICT> ");}
472 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
473 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
474 UnicodeString setName
= UNICODE_STRING("anon", 4);
475 RBBINode
*setRef
= usetNode
->fParent
;
476 if (setRef
!= NULL
) {
477 RBBINode
*varRef
= setRef
->fParent
;
478 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
479 setName
= varRef
->fText
;
482 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
486 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
487 if (tRange
->fNum
== rlRange
->fNum
) {
489 RBBIDebugPrintf("\n ");
491 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
494 RBBIDebugPrintf("\n");
497 RBBIDebugPrintf("\n");
502 //------------------------------------------------------------------------
504 // printSets A debugging function.
505 // dump out all of the set definitions.
507 //------------------------------------------------------------------------
509 void RBBISetBuilder::printSets() {
512 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
517 UnicodeString setName
;
519 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
520 if (usetNode
== NULL
) {
524 RBBIDebugPrintf("%3d ", i
);
525 setName
= UNICODE_STRING("anonymous", 9);
526 setRef
= usetNode
->fParent
;
527 if (setRef
!= NULL
) {
528 varRef
= setRef
->fParent
;
529 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
530 setName
= varRef
->fText
;
533 RBBI_DEBUG_printUnicodeString(setName
);
534 RBBIDebugPrintf(" ");
535 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
536 RBBIDebugPrintf("\n");
537 if (usetNode
->fLeftChild
!= NULL
) {
538 RBBINode::printTree(usetNode
->fLeftChild
, TRUE
);
541 RBBIDebugPrintf("\n");
547 //-------------------------------------------------------------------------------------
549 // RangeDescriptor copy constructor
551 //-------------------------------------------------------------------------------------
553 RangeDescriptor::RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
) {
556 this->fStartChar
= other
.fStartChar
;
557 this->fEndChar
= other
.fEndChar
;
558 this->fNum
= other
.fNum
;
560 UErrorCode oldstatus
= status
;
561 this->fIncludesSets
= new UVector(status
);
562 if (U_FAILURE(oldstatus
)) {
565 if (U_FAILURE(status
)) {
569 if (this->fIncludesSets
== 0) {
570 status
= U_MEMORY_ALLOCATION_ERROR
;
574 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
575 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
580 //-------------------------------------------------------------------------------------
582 // RangeDesriptor default constructor
584 //-------------------------------------------------------------------------------------
585 RangeDescriptor::RangeDescriptor(UErrorCode
&status
) {
586 this->fStartChar
= 0;
590 UErrorCode oldstatus
= status
;
591 this->fIncludesSets
= new UVector(status
);
592 if (U_FAILURE(oldstatus
)) {
595 if (U_FAILURE(status
)) {
599 if(this->fIncludesSets
== 0) {
600 status
= U_MEMORY_ALLOCATION_ERROR
;
607 //-------------------------------------------------------------------------------------
609 // RangeDesriptor Destructor
611 //-------------------------------------------------------------------------------------
612 RangeDescriptor::~RangeDescriptor() {
613 delete fIncludesSets
;
614 fIncludesSets
= NULL
;
617 //-------------------------------------------------------------------------------------
619 // RangeDesriptor::split()
621 //-------------------------------------------------------------------------------------
622 void RangeDescriptor::split(UChar32 where
, UErrorCode
&status
) {
623 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
624 RangeDescriptor
*nr
= new RangeDescriptor(*this, status
);
626 status
= U_MEMORY_ALLOCATION_ERROR
;
629 if (U_FAILURE(status
)) {
633 // RangeDescriptor copy constructor copies all fields.
634 // Only need to update those that are different after the split.
635 nr
->fStartChar
= where
;
636 this->fEndChar
= where
-1;
637 nr
->fNext
= this->fNext
;
642 //-------------------------------------------------------------------------------------
644 // RangeDescriptor::setDictionaryFlag
646 // Character Category Numbers that include characters from
647 // the original Unicode Set named "dictionary" have bit 14
648 // set to 1. The RBBI runtime engine uses this to trigger
649 // use of the word dictionary.
651 // This function looks through the Unicode Sets that it
652 // (the range) includes, and sets the bit in fNum when
653 // "dictionary" is among them.
655 // TODO: a faster way would be to find the set node for
656 // "dictionary" just once, rather than looking it
657 // up by name every time.
659 //-------------------------------------------------------------------------------------
660 void RangeDescriptor::setDictionaryFlag() {
663 static const char16_t *dictionary
= u
"dictionary";
664 for (i
=0; i
<fIncludesSets
->size(); i
++) {
665 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
666 RBBINode
*setRef
= usetNode
->fParent
;
667 if (setRef
!= nullptr) {
668 RBBINode
*varRef
= setRef
->fParent
;
669 if (varRef
&& varRef
->fType
== RBBINode::varRef
) {
670 const UnicodeString
*setName
= &varRef
->fText
;
671 if (setName
->compare(dictionary
, -1) == 0) {
672 fNum
|= RBBISetBuilder::DICT_BIT
;
684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */