]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbisetb.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb.cpp
1 //
2 // rbbisetb.cpp
3 //
4 /*
5 ***************************************************************************
6 * Copyright (C) 2002-2004 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
9 */
10 //
11 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12 // (part of the rule building process.)
13 //
14 // Starting with the rules parse tree from the scanner,
15 //
16 // - Enumerate the set of UnicodeSets that are referenced
17 // by the RBBI rules.
18 // - compute a set of non-overlapping character ranges
19 // with all characters within a range belonging to the same
20 // set of input uniocde sets.
21 // - Derive a set of non-overlapping UnicodeSet (like things)
22 // that will correspond to columns in the state table for
23 // the RBBI execution engine. All characters within one
24 // of these sets belong to the same set of the original
25 // UnicodeSets from the user's rules.
26 // - construct the trie table that maps input characters
27 // to the index of the matching non-overlapping set of set from
28 // the previous step.
29 //
30
31 #include "unicode/utypes.h"
32
33 #if !UCONFIG_NO_BREAK_ITERATION
34
35 #include "unicode/uniset.h"
36 #include "utrie.h"
37 #include "uvector.h"
38 #include "uassert.h"
39 #include "cmemory.h"
40 #include "cstring.h"
41
42 #include "rbbisetb.h"
43 #include "rbbinode.h"
44
45
46 //------------------------------------------------------------------------
47 //
48 // getFoldedRBBIValue Call-back function used during building of Trie table.
49 // Folding value: just store the offset (16 bits)
50 // if there is any non-0 entry.
51 // (It'd really be nice if the Trie builder would provide a
52 // simple default, so this function could go away from here.)
53 //
54 //------------------------------------------------------------------------
55 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
56 U_CDECL_BEGIN
57 static uint32_t U_CALLCONV
58 getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
59 uint32_t value;
60 UChar32 limit;
61 UBool inBlockZero;
62
63 limit=start+0x400;
64 while(start<limit) {
65 value=utrie_get32(trie, start, &inBlockZero);
66 if(inBlockZero) {
67 start+=UTRIE_DATA_BLOCK_LENGTH;
68 } else if(value!=0) {
69 return (uint32_t)(offset|0x8000);
70 } else {
71 ++start;
72 }
73 }
74 return 0;
75 }
76
77
78 U_CDECL_END
79
80
81
82 U_NAMESPACE_BEGIN
83
84 //------------------------------------------------------------------------
85 //
86 // Constructor
87 //
88 //------------------------------------------------------------------------
89 RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
90 {
91 fRB = rb;
92 fStatus = rb->fStatus;
93 fRangeList = 0;
94 fTrie = 0;
95 fTrieSize = 0;
96 fGroupCount = 0;
97 }
98
99
100 //------------------------------------------------------------------------
101 //
102 // Destructor
103 //
104 //------------------------------------------------------------------------
105 RBBISetBuilder::~RBBISetBuilder()
106 {
107 RangeDescriptor *nextRangeDesc;
108
109 // Walk through & delete the linked list of RangeDescriptors
110 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
111 RangeDescriptor *r = nextRangeDesc;
112 nextRangeDesc = r->fNext;
113 delete r;
114 }
115
116 utrie_close(fTrie);
117 }
118
119
120
121
122 //------------------------------------------------------------------------
123 //
124 // build Build the list of non-overlapping character ranges
125 // from the Unicode Sets.
126 //
127 //------------------------------------------------------------------------
128 void RBBISetBuilder::build() {
129 RBBINode *usetNode;
130 RangeDescriptor *rlRange;
131
132 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
133
134 //
135 // Initialize the process by creating a single range encompassing all characters
136 // that is in no sets.
137 //
138 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
139 fRangeList->fStartChar = 0;
140 fRangeList->fEndChar = 0x10ffff;
141
142 if (U_FAILURE(*fStatus)) {
143 return;
144 }
145
146 //
147 // Find the set of non-overlapping ranges of characters
148 //
149 int ni;
150 for (ni=0; ; ni++) {
151 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
152 if (usetNode==NULL) {
153 break;
154 }
155
156 UnicodeSet *inputSet = usetNode->fInputSet;
157 int32_t inputSetRangeCount = inputSet->getRangeCount();
158 int inputSetRangeIndex = 0;
159 rlRange = fRangeList;
160
161 for (;;) {
162 if (inputSetRangeIndex >= inputSetRangeCount) {
163 break;
164 }
165 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
166 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
167
168 // skip over ranges from the range list that are completely
169 // below the current range from the input unicode set.
170 while (rlRange->fEndChar < inputSetRangeBegin) {
171 rlRange = rlRange->fNext;
172 }
173
174 // If the start of the range from the range list is before with
175 // the start of the range from the unicode set, split the range list range
176 // in two, with one part being before (wholly outside of) the unicode set
177 // and the other containing the rest.
178 // Then continue the loop; the post-split current range will then be skipped
179 // over
180 if (rlRange->fStartChar < inputSetRangeBegin) {
181 rlRange->split(inputSetRangeBegin, *fStatus);
182 if (U_FAILURE(*fStatus)) {
183 return;
184 }
185 continue;
186 }
187
188 // Same thing at the end of the ranges...
189 // If the end of the range from the range list doesn't coincide with
190 // the end of the range from the unicode set, split the range list
191 // range in two. The first part of the split range will be
192 // wholly inside the Unicode set.
193 if (rlRange->fEndChar > inputSetRangeEnd) {
194 rlRange->split(inputSetRangeEnd+1, *fStatus);
195 if (U_FAILURE(*fStatus)) {
196 return;
197 }
198 }
199
200 // The current rlRange is now entirely within the UnicodeSet range.
201 // Add this unicode set to the list of sets for this rlRange
202 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
203 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
204 if (U_FAILURE(*fStatus)) {
205 return;
206 }
207 }
208
209 // Advance over ranges that we are finished with.
210 if (inputSetRangeEnd == rlRange->fEndChar) {
211 inputSetRangeIndex++;
212 }
213 rlRange = rlRange->fNext;
214 }
215 }
216
217 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
218
219 //
220 // Group the above ranges, with each group consisting of one or more
221 // ranges that are in exactly the same set of original UnicodeSets.
222 // The groups are numbered, and these group numbers are the set of
223 // input symbols recognized by the run-time state machine.
224 //
225 RangeDescriptor *rlSearchRange;
226 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
227 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
228 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
229 rlRange->fNum = rlSearchRange->fNum;
230 break;
231 }
232 }
233 if (rlRange->fNum == 0) {
234 fGroupCount ++;
235 rlRange->fNum = fGroupCount;
236 rlRange->setDictionaryFlag();
237 addValToSets(rlRange->fIncludesSets, fGroupCount);
238 }
239 }
240
241 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
242 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
243
244 //
245 // Build the Trie table for mapping UChar32 values to the corresponding
246 // range group number
247 //
248 fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
249 NULL, // Data array (utrie will allocate one)
250 100000, // Max Data Length
251 0, // Initial value for all code points
252 0, // Lead surrogate unit value
253 TRUE); // Keep Latin 1 in separately
254
255
256 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
257 utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
258 }
259 }
260
261
262
263 //-----------------------------------------------------------------------------------
264 //
265 // getTrieSize() Return the size that will be required to serialize the Trie.
266 //
267 //-----------------------------------------------------------------------------------
268 int32_t RBBISetBuilder::getTrieSize() /*const*/ {
269 fTrieSize = utrie_serialize(fTrie,
270 NULL, // Buffer
271 0, // Capacity
272 getFoldedRBBIValue,
273 TRUE, // Reduce to 16 bits
274 fStatus);
275 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
276 return fTrieSize;
277 }
278
279
280 //-----------------------------------------------------------------------------------
281 //
282 // serializeTrie() Put the serialized trie at the specified address.
283 // Trust the caller to have given us enough memory.
284 // getTrieSize() MUST be called first.
285 //
286 //-----------------------------------------------------------------------------------
287 void RBBISetBuilder::serializeTrie(uint8_t *where) {
288 utrie_serialize(fTrie,
289 where, // Buffer
290 fTrieSize, // Capacity
291 getFoldedRBBIValue,
292 TRUE, // Reduce to 16 bits
293 fStatus);
294 }
295
296 //------------------------------------------------------------------------
297 //
298 // addValToSets Add a runtime-mapped input value to each uset from a
299 // list of uset nodes.
300 // For each of the original Unicode sets - which correspond
301 // directly to uset nodes - a logically equivalent expression
302 // is constructed in terms of the remapped runtime input
303 // symbol set. This function adds one runtime input symbol to
304 // a list of sets.
305 //
306 // The "logically equivalent expression" is the tree for an
307 // or-ing together of all of the symbols that go into the set.
308 //
309 //------------------------------------------------------------------------
310 void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
311 int32_t ix;
312
313 for (ix=0; ix<sets->size(); ix++) {
314 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
315 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
316 leafNode->fVal = (unsigned short)val;
317 if (usetNode->fLeftChild == NULL) {
318 usetNode->fLeftChild = leafNode;
319 leafNode->fParent = usetNode;
320 } else {
321 // There are already input symbols present for this set.
322 // Set up an OR node, with the previous stuff as the left child
323 // and the new value as the right child.
324 RBBINode *orNode = new RBBINode(RBBINode::opOr);
325 orNode->fLeftChild = usetNode->fLeftChild;
326 orNode->fRightChild = leafNode;
327 orNode->fLeftChild->fParent = orNode;
328 orNode->fRightChild->fParent = orNode;
329 usetNode->fLeftChild = orNode;
330 orNode->fParent = usetNode;
331 }
332 }
333 }
334
335
336
337 //------------------------------------------------------------------------
338 //
339 // getNumOutputSets
340 //
341 //------------------------------------------------------------------------
342 int32_t RBBISetBuilder::getNumCharCategories() const {
343 return fGroupCount + 1;
344 }
345
346
347
348 //------------------------------------------------------------------------
349 //
350 // getFirstChar Given a runtime RBBI character category, find
351 // the first UChar32 that is in the set of chars
352 // in the category.
353 //------------------------------------------------------------------------
354 UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
355 RangeDescriptor *rlRange;
356 UChar32 retVal = (UChar32)-1;
357 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
358 if (rlRange->fNum == category) {
359 retVal = rlRange->fStartChar;
360 break;
361 }
362 }
363 return retVal;
364 }
365
366
367
368 //------------------------------------------------------------------------
369 //
370 // printRanges A debugging function.
371 // dump out all of the range definitions.
372 //
373 //------------------------------------------------------------------------
374 #ifdef RBBI_DEBUG
375 void RBBISetBuilder::printRanges() {
376 RangeDescriptor *rlRange;
377 int i;
378
379 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
380 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
381 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
382
383 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
384 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
385 UnicodeString setName = UNICODE_STRING("anon", 4);
386 RBBINode *setRef = usetNode->fParent;
387 if (setRef != NULL) {
388 RBBINode *varRef = setRef->fParent;
389 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
390 setName = varRef->fText;
391 }
392 }
393 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
394 }
395 RBBIDebugPrintf("\n");
396 }
397 }
398 #endif
399
400
401 //------------------------------------------------------------------------
402 //
403 // printRangeGroups A debugging function.
404 // dump out all of the range groups.
405 //
406 //------------------------------------------------------------------------
407 #ifdef RBBI_DEBUG
408 void RBBISetBuilder::printRangeGroups() {
409 RangeDescriptor *rlRange;
410 RangeDescriptor *tRange;
411 int i;
412 int lastPrintedGroupNum = 0;
413
414 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
415 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
416 int groupNum = rlRange->fNum & 0xbfff;
417 if (groupNum > lastPrintedGroupNum) {
418 lastPrintedGroupNum = groupNum;
419 RBBIDebugPrintf("%2i ", groupNum);
420
421 if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
422
423 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
424 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
425 UnicodeString setName = UNICODE_STRING("anon", 4);
426 RBBINode *setRef = usetNode->fParent;
427 if (setRef != NULL) {
428 RBBINode *varRef = setRef->fParent;
429 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
430 setName = varRef->fText;
431 }
432 }
433 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
434 }
435
436 i = 0;
437 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
438 if (tRange->fNum == rlRange->fNum) {
439 if (i++ % 5 == 0) {
440 RBBIDebugPrintf("\n ");
441 }
442 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
443 }
444 }
445 RBBIDebugPrintf("\n");
446 }
447 }
448 RBBIDebugPrintf("\n");
449 }
450 #endif
451
452
453 //------------------------------------------------------------------------
454 //
455 // printSets A debugging function.
456 // dump out all of the set definitions.
457 //
458 //------------------------------------------------------------------------
459 #ifdef RBBI_DEBUG
460 void RBBISetBuilder::printSets() {
461 int i;
462
463 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
464 for (i=0; ; i++) {
465 RBBINode *usetNode;
466 RBBINode *setRef;
467 RBBINode *varRef;
468 UnicodeString setName;
469
470 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
471 if (usetNode == NULL) {
472 break;
473 }
474
475 RBBIDebugPrintf("%3d ", i);
476 setName = UNICODE_STRING("anonymous", 9);
477 setRef = usetNode->fParent;
478 if (setRef != NULL) {
479 varRef = setRef->fParent;
480 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
481 setName = varRef->fText;
482 }
483 }
484 RBBI_DEBUG_printUnicodeString(setName);
485 RBBIDebugPrintf(" ");
486 RBBI_DEBUG_printUnicodeString(usetNode->fText);
487 RBBIDebugPrintf("\n");
488 if (usetNode->fLeftChild != NULL) {
489 usetNode->fLeftChild->printTree(TRUE);
490 }
491 }
492 RBBIDebugPrintf("\n");
493 }
494 #endif
495
496
497
498 //-------------------------------------------------------------------------------------
499 //
500 // RangeDescriptor copy constructor
501 //
502 //-------------------------------------------------------------------------------------
503
504 RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
505 int i;
506
507 this->fStartChar = other.fStartChar;
508 this->fEndChar = other.fEndChar;
509 this->fNum = other.fNum;
510 this->fNext = NULL;
511 UErrorCode oldstatus = status;
512 this->fIncludesSets = new UVector(status);
513 if (U_FAILURE(oldstatus)) {
514 status = oldstatus;
515 }
516 if (U_FAILURE(status)) {
517 return;
518 }
519 /* test for NULL */
520 if (this->fIncludesSets == 0) {
521 status = U_MEMORY_ALLOCATION_ERROR;
522 return;
523 }
524
525 for (i=0; i<other.fIncludesSets->size(); i++) {
526 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
527 }
528 }
529
530
531 //-------------------------------------------------------------------------------------
532 //
533 // RangeDesriptor default constructor
534 //
535 //-------------------------------------------------------------------------------------
536 RangeDescriptor::RangeDescriptor(UErrorCode &status) {
537 this->fStartChar = 0;
538 this->fEndChar = 0;
539 this->fNum = 0;
540 this->fNext = NULL;
541 UErrorCode oldstatus = status;
542 this->fIncludesSets = new UVector(status);
543 if (U_FAILURE(oldstatus)) {
544 status = oldstatus;
545 }
546 if (U_FAILURE(status)) {
547 return;
548 }
549 /* test for NULL */
550 if(this->fIncludesSets == 0) {
551 status = U_MEMORY_ALLOCATION_ERROR;
552 return;
553 }
554
555 }
556
557
558 //-------------------------------------------------------------------------------------
559 //
560 // RangeDesriptor Destructor
561 //
562 //-------------------------------------------------------------------------------------
563 RangeDescriptor::~RangeDescriptor() {
564 delete fIncludesSets;
565 fIncludesSets = NULL;
566 }
567
568 //-------------------------------------------------------------------------------------
569 //
570 // RangeDesriptor::split()
571 //
572 //-------------------------------------------------------------------------------------
573 void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
574 U_ASSERT(where>fStartChar && where<=fEndChar);
575 RangeDescriptor *nr = new RangeDescriptor(*this, status);
576 if (U_FAILURE(status)) {
577 return;
578 }
579 /* test for NULL */
580 if(nr == 0) {
581 status = U_MEMORY_ALLOCATION_ERROR;
582 return;
583 }
584
585 // RangeDescriptor copy constructor copies all fields.
586 // Only need to update those that are different after the split.
587 nr->fStartChar = where;
588 this->fEndChar = where-1;
589 nr->fNext = this->fNext;
590 this->fNext = nr;
591 }
592
593
594 //-------------------------------------------------------------------------------------
595 //
596 // RangeDescriptor::setDictionaryFlag
597 //
598 // Character Category Numbers that include characters from
599 // the original Unicode Set named "dictionary" have bit 14
600 // set to 1. The RBBI runtime engine uses this to trigger
601 // use of the word dictionary.
602 //
603 // This function looks through the Unicode Sets that it
604 // (the range) includes, and sets the bit in fNum when
605 // "dictionary" is among them.
606 //
607 // TODO: a faster way would be to find the set node for
608 // "dictionary" just once, rather than looking it
609 // up by name every time.
610 //
611 //-------------------------------------------------------------------------------------
612 void RangeDescriptor::setDictionaryFlag() {
613 int i;
614
615 for (i=0; i<this->fIncludesSets->size(); i++) {
616 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
617 UnicodeString setName;
618 RBBINode *setRef = usetNode->fParent;
619 if (setRef != NULL) {
620 RBBINode *varRef = setRef->fParent;
621 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
622 setName = varRef->fText;
623 }
624 }
625 if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
626 this->fNum |= 0x4000;
627 break;
628 }
629 }
630 }
631
632
633
634 U_NAMESPACE_END
635
636 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */