1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 ***************************************************************************
6 * Copyright (C) 2002-2008 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 // RBBISetBuilder57 Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
14 // Starting with the rules parse tree from the scanner,
16 // - Enumerate the set of UnicodeSets that are referenced
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input uniocde sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
31 #include "unicode/utypes.h"
33 #if !UCONFIG_NO_BREAK_ITERATION
35 #include "unicode/uniset.h"
43 #include "rbbisetb57.h"
47 //------------------------------------------------------------------------
49 // getFoldedRBBIValue Call-back function used during building of Trie table.
50 // Folding value: just store the offset (16 bits)
51 // if there is any non-0 entry.
52 // (It'd really be nice if the Trie builder would provide a
53 // simple default, so this function could go away from here.)
55 //------------------------------------------------------------------------
56 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
58 static uint32_t U_CALLCONV
59 getFoldedRBBIValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
66 value
=utrie_get32(trie
, start
, &inBlockZero
);
68 start
+=UTRIE_DATA_BLOCK_LENGTH
;
70 return (uint32_t)(offset
|0x8000);
85 //------------------------------------------------------------------------
89 //------------------------------------------------------------------------
90 RBBISetBuilder57::RBBISetBuilder57(RBBIRuleBuilder57
*rb
)
93 fStatus
= rb
->fStatus
;
102 //------------------------------------------------------------------------
106 //------------------------------------------------------------------------
107 RBBISetBuilder57::~RBBISetBuilder57()
109 RangeDescriptor57
*nextRangeDesc
;
111 // Walk through & delete the linked list of RangeDescriptors
112 for (nextRangeDesc
= fRangeList
; nextRangeDesc
!=NULL
;) {
113 RangeDescriptor57
*r
= nextRangeDesc
;
114 nextRangeDesc
= r
->fNext
;
124 //------------------------------------------------------------------------
126 // build Build the list of non-overlapping character ranges
127 // from the Unicode Sets.
129 //------------------------------------------------------------------------
130 void RBBISetBuilder57::build() {
132 RangeDescriptor57
*rlRange
;
134 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "usets")) {printSets();}
137 // Initialize the process by creating a single range encompassing all characters
138 // that is in no sets.
140 fRangeList
= new RangeDescriptor57(*fStatus
); // will check for status here
141 if (fRangeList
== NULL
) {
142 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
145 fRangeList
->fStartChar
= 0;
146 fRangeList
->fEndChar
= 0x10ffff;
148 if (U_FAILURE(*fStatus
)) {
153 // Find the set of non-overlapping ranges of characters
156 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
157 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
158 if (usetNode
==NULL
) {
162 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
163 int32_t inputSetRangeCount
= inputSet
->getRangeCount();
164 int inputSetRangeIndex
= 0;
165 rlRange
= fRangeList
;
168 if (inputSetRangeIndex
>= inputSetRangeCount
) {
171 UChar32 inputSetRangeBegin
= inputSet
->getRangeStart(inputSetRangeIndex
);
172 UChar32 inputSetRangeEnd
= inputSet
->getRangeEnd(inputSetRangeIndex
);
174 // skip over ranges from the range list that are completely
175 // below the current range from the input unicode set.
176 while (rlRange
->fEndChar
< inputSetRangeBegin
) {
177 rlRange
= rlRange
->fNext
;
180 // If the start of the range from the range list is before with
181 // the start of the range from the unicode set, split the range list range
182 // in two, with one part being before (wholly outside of) the unicode set
183 // and the other containing the rest.
184 // Then continue the loop; the post-split current range will then be skipped
186 if (rlRange
->fStartChar
< inputSetRangeBegin
) {
187 rlRange
->split(inputSetRangeBegin
, *fStatus
);
188 if (U_FAILURE(*fStatus
)) {
194 // Same thing at the end of the ranges...
195 // If the end of the range from the range list doesn't coincide with
196 // the end of the range from the unicode set, split the range list
197 // range in two. The first part of the split range will be
198 // wholly inside the Unicode set.
199 if (rlRange
->fEndChar
> inputSetRangeEnd
) {
200 rlRange
->split(inputSetRangeEnd
+1, *fStatus
);
201 if (U_FAILURE(*fStatus
)) {
206 // The current rlRange is now entirely within the UnicodeSet range.
207 // Add this unicode set to the list of sets for this rlRange
208 if (rlRange
->fIncludesSets
->indexOf(usetNode
) == -1) {
209 rlRange
->fIncludesSets
->addElement(usetNode
, *fStatus
);
210 if (U_FAILURE(*fStatus
)) {
215 // Advance over ranges that we are finished with.
216 if (inputSetRangeEnd
== rlRange
->fEndChar
) {
217 inputSetRangeIndex
++;
219 rlRange
= rlRange
->fNext
;
223 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "range")) { printRanges();}
226 // Group the above ranges, with each group consisting of one or more
227 // ranges that are in exactly the same set of original UnicodeSets.
228 // The groups are numbered, and these group numbers are the set of
229 // input symbols recognized by the run-time state machine.
231 // Numbering: # 0 (state table column 0) is unused.
232 // # 1 is reserved - table column 1 is for end-of-input
233 // # 2 is reserved - table column 2 is for beginning-in-input
234 // # 3 is the first range list.
236 RangeDescriptor57
*rlSearchRange
;
237 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
238 for (rlSearchRange
=fRangeList
; rlSearchRange
!= rlRange
; rlSearchRange
=rlSearchRange
->fNext
) {
239 if (rlRange
->fIncludesSets
->equals(*rlSearchRange
->fIncludesSets
)) {
240 rlRange
->fNum
= rlSearchRange
->fNum
;
244 if (rlRange
->fNum
== 0) {
246 rlRange
->fNum
= fGroupCount
+2;
247 rlRange
->setDictionaryFlag();
248 addValToSets(rlRange
->fIncludesSets
, fGroupCount
+2);
252 // Handle input sets that contain the special string {eof}.
253 // Column 1 of the state table is reserved for EOF on input.
254 // Column 2 is reserved for before-the-start-input.
255 // (This column can be optimized away later if there are no rule
256 // references to {bof}.)
257 // Add this column value (1 or 2) to the equivalent expression
258 // subtree for each UnicodeSet that contains the string {eof}
259 // Because {bof} and {eof} are not a characters in the normal sense,
260 // they doesn't affect the computation of ranges or TRIE.
261 static const UChar eofUString
[] = {0x65, 0x6f, 0x66, 0};
262 static const UChar bofUString
[] = {0x62, 0x6f, 0x66, 0};
264 UnicodeString
eofString(eofUString
);
265 UnicodeString
bofString(bofUString
);
266 for (ni
=0; ; ni
++) { // Loop over each of the UnicodeSets encountered in the input rules
267 usetNode
= (RBBINode
*)this->fRB
->fUSetNodes
->elementAt(ni
);
268 if (usetNode
==NULL
) {
271 UnicodeSet
*inputSet
= usetNode
->fInputSet
;
272 if (inputSet
->contains(eofString
)) {
273 addValToSet(usetNode
, 1);
275 if (inputSet
->contains(bofString
)) {
276 addValToSet(usetNode
, 2);
282 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "rgroup")) {printRangeGroups();}
283 if (fRB
->fDebugEnv
&& uprv_strstr(fRB
->fDebugEnv
, "esets")) {printSets();}
286 // Build the Trie table for mapping UChar32 values to the corresponding
287 // range group number
289 fTrie
= utrie_open(NULL
, // Pre-existing trie to be filled in
290 NULL
, // Data array (utrie will allocate one)
291 100000, // Max Data Length
292 0, // Initial value for all code points
293 0, // Lead surrogate unit value
294 TRUE
); // Keep Latin 1 in separately
297 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
298 utrie_setRange32(fTrie
, rlRange
->fStartChar
, rlRange
->fEndChar
+1, rlRange
->fNum
, TRUE
);
303 //-----------------------------------------------------------------------------------
305 // getTrieSize() Return the size that will be required to serialize the Trie.
307 //-----------------------------------------------------------------------------------
308 int32_t RBBISetBuilder57::getTrieSize() /*const*/ {
309 fTrieSize
= utrie_serialize(fTrie
,
313 TRUE
, // Reduce to 16 bits
315 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
320 //-----------------------------------------------------------------------------------
322 // serializeTrie() Put the serialized trie at the specified address.
323 // Trust the caller to have given us enough memory.
324 // getTrieSize() MUST be called first.
326 //-----------------------------------------------------------------------------------
327 void RBBISetBuilder57::serializeTrie(uint8_t *where
) {
328 utrie_serialize(fTrie
,
330 fTrieSize
, // Capacity
332 TRUE
, // Reduce to 16 bits
336 //------------------------------------------------------------------------
338 // addValToSets Add a runtime-mapped input value to each uset from a
339 // list of uset nodes. (val corresponds to a state table column.)
340 // For each of the original Unicode sets - which correspond
341 // directly to uset nodes - a logically equivalent expression
342 // is constructed in terms of the remapped runtime input
343 // symbol set. This function adds one runtime input symbol to
346 // The "logically equivalent expression" is the tree for an
347 // or-ing together of all of the symbols that go into the set.
349 //------------------------------------------------------------------------
350 void RBBISetBuilder57::addValToSets(UVector
*sets
, uint32_t val
) {
353 for (ix
=0; ix
<sets
->size(); ix
++) {
354 RBBINode
*usetNode
= (RBBINode
*)sets
->elementAt(ix
);
355 addValToSet(usetNode
, val
);
359 void RBBISetBuilder57::addValToSet(RBBINode
*usetNode
, uint32_t val
) {
360 RBBINode
*leafNode
= new RBBINode(RBBINode::leafChar
);
361 if (leafNode
== NULL
) {
362 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
365 leafNode
->fVal
= (unsigned short)val
;
366 if (usetNode
->fLeftChild
== NULL
) {
367 usetNode
->fLeftChild
= leafNode
;
368 leafNode
->fParent
= usetNode
;
370 // There are already input symbols present for this set.
371 // Set up an OR node, with the previous stuff as the left child
372 // and the new value as the right child.
373 RBBINode
*orNode
= new RBBINode(RBBINode::opOr
);
374 if (orNode
== NULL
) {
375 *fStatus
= U_MEMORY_ALLOCATION_ERROR
;
378 orNode
->fLeftChild
= usetNode
->fLeftChild
;
379 orNode
->fRightChild
= leafNode
;
380 orNode
->fLeftChild
->fParent
= orNode
;
381 orNode
->fRightChild
->fParent
= orNode
;
382 usetNode
->fLeftChild
= orNode
;
383 orNode
->fParent
= usetNode
;
388 //------------------------------------------------------------------------
390 // getNumCharCategories
392 //------------------------------------------------------------------------
393 int32_t RBBISetBuilder57::getNumCharCategories() const {
394 return fGroupCount
+ 3;
398 //------------------------------------------------------------------------
402 //------------------------------------------------------------------------
403 UBool
RBBISetBuilder57::sawBOF() const {
408 //------------------------------------------------------------------------
410 // getFirstChar Given a runtime RBBI character category, find
411 // the first UChar32 that is in the set of chars
413 //------------------------------------------------------------------------
414 UChar32
RBBISetBuilder57::getFirstChar(int32_t category
) const {
415 RangeDescriptor57
*rlRange
;
416 UChar32 retVal
= (UChar32
)-1;
417 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
418 if (rlRange
->fNum
== category
) {
419 retVal
= rlRange
->fStartChar
;
428 //------------------------------------------------------------------------
430 // printRanges A debugging function.
431 // dump out all of the range definitions.
433 //------------------------------------------------------------------------
435 void RBBISetBuilder57::printRanges() {
436 RangeDescriptor57
*rlRange
;
439 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
440 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
441 RBBIDebugPrintf("%2i %4x-%4x ", rlRange
->fNum
, rlRange
->fStartChar
, rlRange
->fEndChar
);
443 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
444 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
445 UnicodeString setName
= UNICODE_STRING("anon", 4);
446 RBBINode
*setRef
= usetNode
->fParent
;
447 if (setRef
!= NULL
) {
448 RBBINode
*varRef
= setRef
->fParent
;
449 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
450 setName
= varRef
->fText
;
453 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
455 RBBIDebugPrintf("\n");
461 //------------------------------------------------------------------------
463 // printRangeGroups A debugging function.
464 // dump out all of the range groups.
466 //------------------------------------------------------------------------
468 void RBBISetBuilder57::printRangeGroups() {
469 RangeDescriptor57
*rlRange
;
470 RangeDescriptor57
*tRange
;
472 int lastPrintedGroupNum
= 0;
474 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
475 for (rlRange
= fRangeList
; rlRange
!=0; rlRange
=rlRange
->fNext
) {
476 int groupNum
= rlRange
->fNum
& 0xbfff;
477 if (groupNum
> lastPrintedGroupNum
) {
478 lastPrintedGroupNum
= groupNum
;
479 RBBIDebugPrintf("%2i ", groupNum
);
481 if (rlRange
->fNum
& 0x4000) { RBBIDebugPrintf(" <DICT> ");}
483 for (i
=0; i
<rlRange
->fIncludesSets
->size(); i
++) {
484 RBBINode
*usetNode
= (RBBINode
*)rlRange
->fIncludesSets
->elementAt(i
);
485 UnicodeString setName
= UNICODE_STRING("anon", 4);
486 RBBINode
*setRef
= usetNode
->fParent
;
487 if (setRef
!= NULL
) {
488 RBBINode
*varRef
= setRef
->fParent
;
489 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
490 setName
= varRef
->fText
;
493 RBBI_DEBUG_printUnicodeString(setName
); RBBIDebugPrintf(" ");
497 for (tRange
= rlRange
; tRange
!= 0; tRange
= tRange
->fNext
) {
498 if (tRange
->fNum
== rlRange
->fNum
) {
500 RBBIDebugPrintf("\n ");
502 RBBIDebugPrintf(" %05x-%05x", tRange
->fStartChar
, tRange
->fEndChar
);
505 RBBIDebugPrintf("\n");
508 RBBIDebugPrintf("\n");
513 //------------------------------------------------------------------------
515 // printSets A debugging function.
516 // dump out all of the set definitions.
518 //------------------------------------------------------------------------
520 void RBBISetBuilder57::printSets() {
523 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
528 UnicodeString setName
;
530 usetNode
= (RBBINode
*)fRB
->fUSetNodes
->elementAt(i
);
531 if (usetNode
== NULL
) {
535 RBBIDebugPrintf("%3d ", i
);
536 setName
= UNICODE_STRING("anonymous", 9);
537 setRef
= usetNode
->fParent
;
538 if (setRef
!= NULL
) {
539 varRef
= setRef
->fParent
;
540 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
541 setName
= varRef
->fText
;
544 RBBI_DEBUG_printUnicodeString(setName
);
545 RBBIDebugPrintf(" ");
546 RBBI_DEBUG_printUnicodeString(usetNode
->fText
);
547 RBBIDebugPrintf("\n");
548 if (usetNode
->fLeftChild
!= NULL
) {
549 RBBINode::printTree(usetNode
->fLeftChild
, TRUE
);
552 RBBIDebugPrintf("\n");
558 //-------------------------------------------------------------------------------------
560 // RangeDescriptor57 copy constructor
562 //-------------------------------------------------------------------------------------
564 RangeDescriptor57::RangeDescriptor57(const RangeDescriptor57
&other
, UErrorCode
&status
) {
567 this->fStartChar
= other
.fStartChar
;
568 this->fEndChar
= other
.fEndChar
;
569 this->fNum
= other
.fNum
;
571 UErrorCode oldstatus
= status
;
572 this->fIncludesSets
= new UVector(status
);
573 if (U_FAILURE(oldstatus
)) {
576 if (U_FAILURE(status
)) {
580 if (this->fIncludesSets
== 0) {
581 status
= U_MEMORY_ALLOCATION_ERROR
;
585 for (i
=0; i
<other
.fIncludesSets
->size(); i
++) {
586 this->fIncludesSets
->addElement(other
.fIncludesSets
->elementAt(i
), status
);
591 //-------------------------------------------------------------------------------------
593 // RangeDesriptor default constructor
595 //-------------------------------------------------------------------------------------
596 RangeDescriptor57::RangeDescriptor57(UErrorCode
&status
) {
597 this->fStartChar
= 0;
601 UErrorCode oldstatus
= status
;
602 this->fIncludesSets
= new UVector(status
);
603 if (U_FAILURE(oldstatus
)) {
606 if (U_FAILURE(status
)) {
610 if(this->fIncludesSets
== 0) {
611 status
= U_MEMORY_ALLOCATION_ERROR
;
618 //-------------------------------------------------------------------------------------
620 // RangeDesriptor Destructor
622 //-------------------------------------------------------------------------------------
623 RangeDescriptor57::~RangeDescriptor57() {
624 delete fIncludesSets
;
625 fIncludesSets
= NULL
;
628 //-------------------------------------------------------------------------------------
630 // RangeDesriptor::split()
632 //-------------------------------------------------------------------------------------
633 void RangeDescriptor57::split(UChar32 where
, UErrorCode
&status
) {
634 U_ASSERT(where
>fStartChar
&& where
<=fEndChar
);
635 RangeDescriptor57
*nr
= new RangeDescriptor57(*this, status
);
637 status
= U_MEMORY_ALLOCATION_ERROR
;
640 if (U_FAILURE(status
)) {
644 // RangeDescriptor57 copy constructor copies all fields.
645 // Only need to update those that are different after the split.
646 nr
->fStartChar
= where
;
647 this->fEndChar
= where
-1;
648 nr
->fNext
= this->fNext
;
653 //-------------------------------------------------------------------------------------
655 // RangeDescriptor57::setDictionaryFlag
657 // Character Category Numbers that include characters from
658 // the original Unicode Set named "dictionary" have bit 14
659 // set to 1. The RBBI runtime engine uses this to trigger
660 // use of the word dictionary.
662 // This function looks through the Unicode Sets that it
663 // (the range) includes, and sets the bit in fNum when
664 // "dictionary" is among them.
666 // TODO: a faster way would be to find the set node for
667 // "dictionary" just once, rather than looking it
668 // up by name every time.
670 //-------------------------------------------------------------------------------------
671 void RangeDescriptor57::setDictionaryFlag() {
674 for (i
=0; i
<this->fIncludesSets
->size(); i
++) {
675 RBBINode
*usetNode
= (RBBINode
*)fIncludesSets
->elementAt(i
);
676 UnicodeString setName
;
677 RBBINode
*setRef
= usetNode
->fParent
;
678 if (setRef
!= NULL
) {
679 RBBINode
*varRef
= setRef
->fParent
;
680 if (varRef
!= NULL
&& varRef
->fType
== RBBINode::varRef
) {
681 setName
= varRef
->fText
;
684 if (setName
.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
685 this->fNum
|= 0x4000;
695 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */