1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
7 ***************************************************************************
8 * Copyright (C) 2002-2008 International Business Machines Corporation *
9 * and others. All rights reserved. *
10 ***************************************************************************
13 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
14 // (part of the rule building process.)
16 // Starting with the rules parse tree from the scanner,
18 // - Enumerate the set of UnicodeSets that are referenced
20 // - compute a set of non-overlapping character ranges
21 // with all characters within a range belonging to the same
22 // set of input uniocde sets.
23 // - Derive a set of non-overlapping UnicodeSet (like things)
24 // that will correspond to columns in the state table for
25 // the RBBI execution engine. All characters within one
26 // of these sets belong to the same set of the original
27 // UnicodeSets from the user's rules.
28 // - construct the trie table that maps input characters
29 // to the index of the matching non-overlapping set of set from
33 #include "unicode/utypes.h"
35 #if !UCONFIG_NO_BREAK_ITERATION
37 #include "unicode/uniset.h"
48 //------------------------------------------------------------------------
50 // getFoldedRBBIValue Call-back function used during building of Trie table.
51 // Folding value: just store the offset (16 bits)
52 // if there is any non-0 entry.
53 // (It'd really be nice if the Trie builder would provide a
54 // simple default, so this function could go away from here.)
56 //------------------------------------------------------------------------
57 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
59 static uint32_t U_CALLCONV
60 getFoldedRBBIValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
67 value
=utrie_get32(trie
, start
, &inBlockZero
);
69 start
+=UTRIE_DATA_BLOCK_LENGTH
;
71 return (uint32_t)(offset
|0x8000);
86 //------------------------------------------------------------------------
90 //------------------------------------------------------------------------
91 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder
*rb
)
94 fStatus
= rb
->fStatus
;
103 //------------------------------------------------------------------------
107 //------------------------------------------------------------------------
108 RBBISetBuilder::~RBBISetBuilder()
110 RangeDescriptor
*nextRangeDesc
;
112 // Walk through & delete the linked list of RangeDescriptors
113 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
114 RangeDescriptor
*r
= nextRangeDesc
;
115 nextRangeDesc
= r
->fNext
;
125 //------------------------------------------------------------------------
127 // build Build the list of non-overlapping character ranges
128 // from the Unicode Sets.
130 //------------------------------------------------------------------------
131 void RBBISetBuilder::build() {
133 RangeDescriptor
*rlRange
;
135 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
138 // Initialize the process by creating a single range encompassing all characters
139 // that is in no sets.
141 fRangeList
= new RangeDescriptor(*fStatus
); // will check for status here
142 if (fRangeList
== NULL
) {
143 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
146 fRangeList
->fStartChar
= 0;
147 fRangeList
->fEndChar
= 0x10ffff;
149 if (U_FAILURE(*fStatus
)) {
154 // Find the set of non-overlapping ranges of characters
157 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
158 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
159 if (usetNode
==NULL
) {
163 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
164 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
165 int inputSetRangeIndex
= 0;
166 rlRange
= fRangeList
;
169 if (inputSetRangeIndex
>= inputSetRangeCount
) {
172 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
173 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
175 // skip over ranges from the range list that are completely
176 // below the current range from the input unicode set.
177 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
178 rlRange
= rlRange
->fNext
;
181 // If the start of the range from the range list is before with
182 // the start of the range from the unicode set, split the range list range
183 // in two, with one part being before (wholly outside of) the unicode set
184 // and the other containing the rest.
185 // Then continue the loop; the post-split current range will then be skipped
187 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
188 rlRange
->split(inputSetRangeBegin
, *fStatus
);
189 if (U_FAILURE(*fStatus
)) {
195 // Same thing at the end of the ranges...
196 // If the end of the range from the range list doesn't coincide with
197 // the end of the range from the unicode set, split the range list
198 // range in two. The first part of the split range will be
199 // wholly inside the Unicode set.
200 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
201 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
202 if (U_FAILURE(*fStatus
)) {
207 // The current rlRange is now entirely within the UnicodeSet range.
208 // Add this unicode set to the list of sets for this rlRange
209 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
210 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
211 if (U_FAILURE(*fStatus
)) {
216 // Advance over ranges that we are finished with.
217 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
218 inputSetRangeIndex
++;
220 rlRange
= rlRange
->fNext
;
224 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
227 // Group the above ranges, with each group consisting of one or more
228 // ranges that are in exactly the same set of original UnicodeSets.
229 // The groups are numbered, and these group numbers are the set of
230 // input symbols recognized by the run-time state machine.
232 // Numbering: # 0 (state table column 0) is unused.
233 // # 1 is reserved - table column 1 is for end-of-input
234 // # 2 is reserved - table column 2 is for beginning-in-input
235 // # 3 is the first range list.
237 RangeDescriptor
*rlSearchRange
;
238 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
239 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
240 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
241 rlRange
->fNum
= rlSearchRange
->fNum
;
245 if (rlRange
->fNum
== 0) {
247 rlRange
->fNum
= fGroupCount
+2;
248 rlRange
->setDictionaryFlag();
249 addValToSets(rlRange
->fIncludesSets
, fGroupCount
+2);
253 // Handle input sets that contain the special string {eof}.
254 // Column 1 of the state table is reserved for EOF on input.
255 // Column 2 is reserved for before-the-start-input.
256 // (This column can be optimized away later if there are no rule
257 // references to {bof}.)
258 // Add this column value (1 or 2) to the equivalent expression
259 // subtree for each UnicodeSet that contains the string {eof}
260 // Because {bof} and {eof} are not a characters in the normal sense,
261 // they doesn't affect the computation of ranges or TRIE.
262 static const UChar eofUString
[] = {0x65, 0x6f, 0x66, 0};
263 static const UChar bofUString
[] = {0x62, 0x6f, 0x66, 0};
265 UnicodeString
eofString(eofUString
);
266 UnicodeString
bofString(bofUString
);
267 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
268 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
269 if (usetNode
==NULL
) {
272 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
273 if (inputSet
->contains(eofString
)) {
274 addValToSet(usetNode
, 1);
276 if (inputSet
->contains(bofString
)) {
277 addValToSet(usetNode
, 2);
283 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
284 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
287 // Build the Trie table for mapping UChar32 values to the corresponding
288 // range group number
290 fTrie
= utrie_open(NULL
, // Pre-existing trie to be filled in
291 NULL
, // Data array (utrie will allocate one)
292 100000, // Max Data Length
293 0, // Initial value for all code points
294 0, // Lead surrogate unit value
295 TRUE
); // Keep Latin 1 in separately
298 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
299 utrie_setRange32(fTrie
, rlRange
->fStartChar
, rlRange
->fEndChar
+1, rlRange
->fNum
, TRUE
);
305 //-----------------------------------------------------------------------------------
307 // getTrieSize() Return the size that will be required to serialize the Trie.
309 //-----------------------------------------------------------------------------------
310 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
311 fTrieSize
= utrie_serialize(fTrie
,
315 TRUE
, // Reduce to 16 bits
317 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
322 //-----------------------------------------------------------------------------------
324 // serializeTrie() Put the serialized trie at the specified address.
325 // Trust the caller to have given us enough memory.
326 // getTrieSize() MUST be called first.
328 //-----------------------------------------------------------------------------------
329 void RBBISetBuilder::serializeTrie(uint8_t *where
) {
330 utrie_serialize(fTrie
,
332 fTrieSize
, // Capacity
334 TRUE
, // Reduce to 16 bits
338 //------------------------------------------------------------------------
340 // addValToSets Add a runtime-mapped input value to each uset from a
341 // list of uset nodes. (val corresponds to a state table column.)
342 // For each of the original Unicode sets - which correspond
343 // directly to uset nodes - a logically equivalent expression
344 // is constructed in terms of the remapped runtime input
345 // symbol set. This function adds one runtime input symbol to
348 // The "logically equivalent expression" is the tree for an
349 // or-ing together of all of the symbols that go into the set.
351 //------------------------------------------------------------------------
352 void RBBISetBuilder::addValToSets(UVector
*sets
, uint32_t val
) {
355 for (ix
=0; ix
<sets
->size(); ix
++) {
356 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
357 addValToSet(usetNode
, val
);
361 void RBBISetBuilder::addValToSet(RBBINode
*usetNode
, uint32_t val
) {
362 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
363 if (leafNode
== NULL
) {
364 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
367 leafNode
->fVal
= (unsigned short)val
;
368 if (usetNode
->fLeftChild
== NULL
) {
369 usetNode
->fLeftChild
= leafNode
;
370 leafNode
->fParent
= usetNode
;
372 // There are already input symbols present for this set.
373 // Set up an OR node, with the previous stuff as the left child
374 // and the new value as the right child.
375 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
376 if (orNode
== NULL
) {
377 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
380 orNode
->fLeftChild
= usetNode
->fLeftChild
;
381 orNode
->fRightChild
= leafNode
;
382 orNode
->fLeftChild
->fParent
= orNode
;
383 orNode
->fRightChild
->fParent
= orNode
;
384 usetNode
->fLeftChild
= orNode
;
385 orNode
->fParent
= usetNode
;
390 //------------------------------------------------------------------------
392 // getNumCharCategories
394 //------------------------------------------------------------------------
395 int32_t RBBISetBuilder::getNumCharCategories() const {
396 return fGroupCount
+ 3;
400 //------------------------------------------------------------------------
404 //------------------------------------------------------------------------
405 UBool
RBBISetBuilder::sawBOF() const {
410 //------------------------------------------------------------------------
412 // getFirstChar Given a runtime RBBI character category, find
413 // the first UChar32 that is in the set of chars
415 //------------------------------------------------------------------------
416 UChar32
RBBISetBuilder::getFirstChar(int32_t category
) const {
417 RangeDescriptor
*rlRange
;
418 UChar32 retVal
= (UChar32
)-1;
419 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
420 if (rlRange
->fNum
== category
) {
421 retVal
= rlRange
->fStartChar
;
430 //------------------------------------------------------------------------
432 // printRanges A debugging function.
433 // dump out all of the range definitions.
435 //------------------------------------------------------------------------
437 void RBBISetBuilder::printRanges() {
438 RangeDescriptor
*rlRange
;
441 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
442 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
443 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
445 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
446 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
447 UnicodeString setName
= UNICODE_STRING("anon", 4);
448 RBBINode
*setRef
= usetNode
->fParent
;
449 if (setRef
!= NULL
) {
450 RBBINode
*varRef
= setRef
->fParent
;
451 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
452 setName
= varRef
->fText
;
455 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
457 RBBIDebugPrintf("\n");
463 //------------------------------------------------------------------------
465 // printRangeGroups A debugging function.
466 // dump out all of the range groups.
468 //------------------------------------------------------------------------
470 void RBBISetBuilder::printRangeGroups() {
471 RangeDescriptor
*rlRange
;
472 RangeDescriptor
*tRange
;
474 int lastPrintedGroupNum
= 0;
476 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
477 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
478 int groupNum
= rlRange
->fNum
& 0xbfff;
479 if (groupNum
> lastPrintedGroupNum
) {
480 lastPrintedGroupNum
= groupNum
;
481 RBBIDebugPrintf("%2i ", groupNum
);
483 if (rlRange
->fNum
& 0x4000) { RBBIDebugPrintf(" <DICT> ");}
485 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
486 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
487 UnicodeString setName
= UNICODE_STRING("anon", 4);
488 RBBINode
*setRef
= usetNode
->fParent
;
489 if (setRef
!= NULL
) {
490 RBBINode
*varRef
= setRef
->fParent
;
491 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
492 setName
= varRef
->fText
;
495 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
499 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
500 if (tRange
->fNum
== rlRange
->fNum
) {
502 RBBIDebugPrintf("\n ");
504 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
507 RBBIDebugPrintf("\n");
510 RBBIDebugPrintf("\n");
515 //------------------------------------------------------------------------
517 // printSets A debugging function.
518 // dump out all of the set definitions.
520 //------------------------------------------------------------------------
522 void RBBISetBuilder::printSets() {
525 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
530 UnicodeString setName
;
532 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
533 if (usetNode
== NULL
) {
537 RBBIDebugPrintf("%3d ", i
);
538 setName
= UNICODE_STRING("anonymous", 9);
539 setRef
= usetNode
->fParent
;
540 if (setRef
!= NULL
) {
541 varRef
= setRef
->fParent
;
542 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
543 setName
= varRef
->fText
;
546 RBBI_DEBUG_printUnicodeString(setName
);
547 RBBIDebugPrintf(" ");
548 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
549 RBBIDebugPrintf("\n");
550 if (usetNode
->fLeftChild
!= NULL
) {
551 RBBINode::printTree(usetNode
->fLeftChild
, TRUE
);
554 RBBIDebugPrintf("\n");
560 //-------------------------------------------------------------------------------------
562 // RangeDescriptor copy constructor
564 //-------------------------------------------------------------------------------------
566 RangeDescriptor::RangeDescriptor(const RangeDescriptor
&other
, UErrorCode
&status
) {
569 this->fStartChar
= other
.fStartChar
;
570 this->fEndChar
= other
.fEndChar
;
571 this->fNum
= other
.fNum
;
573 UErrorCode oldstatus
= status
;
574 this->fIncludesSets
= new UVector(status
);
575 if (U_FAILURE(oldstatus
)) {
578 if (U_FAILURE(status
)) {
582 if (this->fIncludesSets
== 0) {
583 status
= U_MEMORY_ALLOCATION_ERROR
;
587 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
588 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
593 //-------------------------------------------------------------------------------------
595 // RangeDesriptor default constructor
597 //-------------------------------------------------------------------------------------
598 RangeDescriptor::RangeDescriptor(UErrorCode
&status
) {
599 this->fStartChar
= 0;
603 UErrorCode oldstatus
= status
;
604 this->fIncludesSets
= new UVector(status
);
605 if (U_FAILURE(oldstatus
)) {
608 if (U_FAILURE(status
)) {
612 if(this->fIncludesSets
== 0) {
613 status
= U_MEMORY_ALLOCATION_ERROR
;
620 //-------------------------------------------------------------------------------------
622 // RangeDesriptor Destructor
624 //-------------------------------------------------------------------------------------
625 RangeDescriptor::~RangeDescriptor() {
626 delete fIncludesSets
;
627 fIncludesSets
= NULL
;
630 //-------------------------------------------------------------------------------------
632 // RangeDesriptor::split()
634 //-------------------------------------------------------------------------------------
635 void RangeDescriptor::split(UChar32 where
, UErrorCode
&status
) {
636 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
637 RangeDescriptor
*nr
= new RangeDescriptor(*this, status
);
639 status
= U_MEMORY_ALLOCATION_ERROR
;
642 if (U_FAILURE(status
)) {
646 // RangeDescriptor copy constructor copies all fields.
647 // Only need to update those that are different after the split.
648 nr
->fStartChar
= where
;
649 this->fEndChar
= where
-1;
650 nr
->fNext
= this->fNext
;
655 //-------------------------------------------------------------------------------------
657 // RangeDescriptor::setDictionaryFlag
659 // Character Category Numbers that include characters from
660 // the original Unicode Set named "dictionary" have bit 14
661 // set to 1. The RBBI runtime engine uses this to trigger
662 // use of the word dictionary.
664 // This function looks through the Unicode Sets that it
665 // (the range) includes, and sets the bit in fNum when
666 // "dictionary" is among them.
668 // TODO: a faster way would be to find the set node for
669 // "dictionary" just once, rather than looking it
670 // up by name every time.
672 //-------------------------------------------------------------------------------------
673 void RangeDescriptor::setDictionaryFlag() {
676 for (i
=0; i
<this->fIncludesSets
->size(); i
++) {
677 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
678 UnicodeString setName
;
679 RBBINode
*setRef
= usetNode
->fParent
;
680 if (setRef
!= NULL
) {
681 RBBINode
*varRef
= setRef
->fParent
;
682 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
683 setName
= varRef
->fText
;
686 if (setName
.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
687 this->fNum
|= 0x4000;
697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */