]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbisetb.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / rbbisetb.cpp
CommitLineData
b75a7d8f
A
1//
2// rbbisetb.cpp
3//
4/*
5***************************************************************************
374ca955 6* Copyright (C) 2002-2004 International Business Machines Corporation *
b75a7d8f
A
7* and others. All rights reserved. *
8***************************************************************************
9*/
10//
11// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
12// (part of the rule building process.)
13//
14// Starting with the rules parse tree from the scanner,
15//
16// - Enumerate the set of UnicodeSets that are referenced
17// by the RBBI rules.
18// - compute a set of non-overlapping character ranges
19// with all characters within a range belonging to the same
20// set of input uniocde sets.
21// - Derive a set of non-overlapping UnicodeSet (like things)
22// that will correspond to columns in the state table for
23// the RBBI execution engine. All characters within one
24// of these sets belong to the same set of the original
25// UnicodeSets from the user's rules.
26// - construct the trie table that maps input characters
27// to the index of the matching non-overlapping set of set from
28// the previous step.
29//
30
31#include "unicode/utypes.h"
32
33#if !UCONFIG_NO_BREAK_ITERATION
34
35#include "unicode/uniset.h"
36#include "utrie.h"
37#include "uvector.h"
38#include "uassert.h"
39#include "cmemory.h"
40#include "cstring.h"
41
42#include "rbbisetb.h"
43#include "rbbinode.h"
44
45
46//------------------------------------------------------------------------
47//
48// getFoldedRBBIValue Call-back function used during building of Trie table.
49// Folding value: just store the offset (16 bits)
50// if there is any non-0 entry.
51// (It'd really be nice if the Trie builder would provide a
52// simple default, so this function could go away from here.)
53//
54//------------------------------------------------------------------------
55/* folding value: just store the offset (16 bits) if there is any non-0 entry */
56U_CDECL_BEGIN
57static uint32_t U_CALLCONV
58getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
59 uint32_t value;
60 UChar32 limit;
61 UBool inBlockZero;
62
63 limit=start+0x400;
64 while(start<limit) {
65 value=utrie_get32(trie, start, &inBlockZero);
66 if(inBlockZero) {
67 start+=UTRIE_DATA_BLOCK_LENGTH;
68 } else if(value!=0) {
69 return (uint32_t)(offset|0x8000);
70 } else {
71 ++start;
72 }
73 }
74 return 0;
75}
76
77
78U_CDECL_END
79
80
81
82U_NAMESPACE_BEGIN
83
84//------------------------------------------------------------------------
85//
86// Constructor
87//
88//------------------------------------------------------------------------
89RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
90{
91 fRB = rb;
92 fStatus = rb->fStatus;
93 fRangeList = 0;
94 fTrie = 0;
95 fTrieSize = 0;
96 fGroupCount = 0;
97}
98
99
100//------------------------------------------------------------------------
101//
102// Destructor
103//
104//------------------------------------------------------------------------
105RBBISetBuilder::~RBBISetBuilder()
106{
107 RangeDescriptor *nextRangeDesc;
108
109 // Walk through & delete the linked list of RangeDescriptors
110 for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
111 RangeDescriptor *r = nextRangeDesc;
112 nextRangeDesc = r->fNext;
113 delete r;
114 }
115
116 utrie_close(fTrie);
117}
118
119
120
121
122//------------------------------------------------------------------------
123//
124// build Build the list of non-overlapping character ranges
125// from the Unicode Sets.
126//
127//------------------------------------------------------------------------
128void RBBISetBuilder::build() {
129 RBBINode *usetNode;
130 RangeDescriptor *rlRange;
131
132 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
133
134 //
135 // Initialize the process by creating a single range encompassing all characters
136 // that is in no sets.
137 //
374ca955 138 fRangeList = new RangeDescriptor(*fStatus); // will check for status here
b75a7d8f
A
139 fRangeList->fStartChar = 0;
140 fRangeList->fEndChar = 0x10ffff;
141
374ca955
A
142 if (U_FAILURE(*fStatus)) {
143 return;
144 }
b75a7d8f
A
145
146 //
147 // Find the set of non-overlapping ranges of characters
148 //
149 int ni;
150 for (ni=0; ; ni++) {
151 usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
152 if (usetNode==NULL) {
153 break;
154 }
155
156 UnicodeSet *inputSet = usetNode->fInputSet;
157 int32_t inputSetRangeCount = inputSet->getRangeCount();
158 int inputSetRangeIndex = 0;
159 rlRange = fRangeList;
160
161 for (;;) {
162 if (inputSetRangeIndex >= inputSetRangeCount) {
163 break;
164 }
165 UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex);
166 UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex);
167
168 // skip over ranges from the range list that are completely
169 // below the current range from the input unicode set.
170 while (rlRange->fEndChar < inputSetRangeBegin) {
171 rlRange = rlRange->fNext;
172 }
173
174 // If the start of the range from the range list is before with
175 // the start of the range from the unicode set, split the range list range
176 // in two, with one part being before (wholly outside of) the unicode set
177 // and the other containing the rest.
178 // Then continue the loop; the post-split current range will then be skipped
179 // over
180 if (rlRange->fStartChar < inputSetRangeBegin) {
181 rlRange->split(inputSetRangeBegin, *fStatus);
374ca955
A
182 if (U_FAILURE(*fStatus)) {
183 return;
184 }
b75a7d8f
A
185 continue;
186 }
187
188 // Same thing at the end of the ranges...
189 // If the end of the range from the range list doesn't coincide with
190 // the end of the range from the unicode set, split the range list
191 // range in two. The first part of the split range will be
192 // wholly inside the Unicode set.
193 if (rlRange->fEndChar > inputSetRangeEnd) {
194 rlRange->split(inputSetRangeEnd+1, *fStatus);
374ca955
A
195 if (U_FAILURE(*fStatus)) {
196 return;
197 }
b75a7d8f
A
198 }
199
200 // The current rlRange is now entirely within the UnicodeSet range.
201 // Add this unicode set to the list of sets for this rlRange
202 if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
203 rlRange->fIncludesSets->addElement(usetNode, *fStatus);
374ca955
A
204 if (U_FAILURE(*fStatus)) {
205 return;
206 }
b75a7d8f
A
207 }
208
209 // Advance over ranges that we are finished with.
210 if (inputSetRangeEnd == rlRange->fEndChar) {
211 inputSetRangeIndex++;
212 }
213 rlRange = rlRange->fNext;
214 }
215 }
216
217 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
218
219 //
220 // Group the above ranges, with each group consisting of one or more
221 // ranges that are in exactly the same set of original UnicodeSets.
222 // The groups are numbered, and these group numbers are the set of
223 // input symbols recognized by the run-time state machine.
224 //
225 RangeDescriptor *rlSearchRange;
226 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
227 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
228 if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
229 rlRange->fNum = rlSearchRange->fNum;
230 break;
231 }
232 }
233 if (rlRange->fNum == 0) {
234 fGroupCount ++;
235 rlRange->fNum = fGroupCount;
236 rlRange->setDictionaryFlag();
237 addValToSets(rlRange->fIncludesSets, fGroupCount);
238 }
239 }
240
241 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
242 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
243
244 //
245 // Build the Trie table for mapping UChar32 values to the corresponding
246 // range group number
247 //
248 fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
249 NULL, // Data array (utrie will allocate one)
250 100000, // Max Data Length
251 0, // Initial value for all code points
374ca955 252 0, // Lead surrogate unit value
b75a7d8f
A
253 TRUE); // Keep Latin 1 in separately
254
255
256 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
257 utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
258 }
259}
260
261
262
263//-----------------------------------------------------------------------------------
264//
265// getTrieSize() Return the size that will be required to serialize the Trie.
266//
267//-----------------------------------------------------------------------------------
374ca955 268int32_t RBBISetBuilder::getTrieSize() /*const*/ {
b75a7d8f
A
269 fTrieSize = utrie_serialize(fTrie,
270 NULL, // Buffer
271 0, // Capacity
272 getFoldedRBBIValue,
273 TRUE, // Reduce to 16 bits
274 fStatus);
275 // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
276 return fTrieSize;
277}
278
279
280//-----------------------------------------------------------------------------------
281//
282// serializeTrie() Put the serialized trie at the specified address.
283// Trust the caller to have given us enough memory.
284// getTrieSize() MUST be called first.
285//
286//-----------------------------------------------------------------------------------
287void RBBISetBuilder::serializeTrie(uint8_t *where) {
288 utrie_serialize(fTrie,
289 where, // Buffer
290 fTrieSize, // Capacity
291 getFoldedRBBIValue,
292 TRUE, // Reduce to 16 bits
293 fStatus);
294}
295
296//------------------------------------------------------------------------
297//
298// addValToSets Add a runtime-mapped input value to each uset from a
299// list of uset nodes.
300// For each of the original Unicode sets - which correspond
301// directly to uset nodes - a logically equivalent expression
302// is constructed in terms of the remapped runtime input
303// symbol set. This function adds one runtime input symbol to
304// a list of sets.
305//
306// The "logically equivalent expression" is the tree for an
307// or-ing together of all of the symbols that go into the set.
308//
309//------------------------------------------------------------------------
310void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
311 int32_t ix;
312
313 for (ix=0; ix<sets->size(); ix++) {
314 RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
315 RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
316 leafNode->fVal = (unsigned short)val;
317 if (usetNode->fLeftChild == NULL) {
318 usetNode->fLeftChild = leafNode;
319 leafNode->fParent = usetNode;
320 } else {
321 // There are already input symbols present for this set.
322 // Set up an OR node, with the previous stuff as the left child
323 // and the new value as the right child.
324 RBBINode *orNode = new RBBINode(RBBINode::opOr);
325 orNode->fLeftChild = usetNode->fLeftChild;
326 orNode->fRightChild = leafNode;
327 orNode->fLeftChild->fParent = orNode;
328 orNode->fRightChild->fParent = orNode;
329 usetNode->fLeftChild = orNode;
330 orNode->fParent = usetNode;
331 }
332 }
333}
334
335
336
337//------------------------------------------------------------------------
338//
339// getNumOutputSets
340//
341//------------------------------------------------------------------------
374ca955 342int32_t RBBISetBuilder::getNumCharCategories() const {
b75a7d8f
A
343 return fGroupCount + 1;
344}
345
346
347
374ca955
A
348//------------------------------------------------------------------------
349//
350// getFirstChar Given a runtime RBBI character category, find
351// the first UChar32 that is in the set of chars
352// in the category.
353//------------------------------------------------------------------------
354UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
355 RangeDescriptor *rlRange;
356 UChar32 retVal = (UChar32)-1;
357 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
358 if (rlRange->fNum == category) {
359 retVal = rlRange->fStartChar;
360 break;
361 }
362 }
363 return retVal;
364}
365
366
367
b75a7d8f
A
368//------------------------------------------------------------------------
369//
370// printRanges A debugging function.
371// dump out all of the range definitions.
372//
373//------------------------------------------------------------------------
b75a7d8f 374#ifdef RBBI_DEBUG
374ca955 375void RBBISetBuilder::printRanges() {
b75a7d8f
A
376 RangeDescriptor *rlRange;
377 int i;
378
379 RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
380 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
381 RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
382
383 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
384 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
374ca955 385 UnicodeString setName = UNICODE_STRING("anon", 4);
b75a7d8f
A
386 RBBINode *setRef = usetNode->fParent;
387 if (setRef != NULL) {
388 RBBINode *varRef = setRef->fParent;
389 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
390 setName = varRef->fText;
391 }
392 }
374ca955 393 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
b75a7d8f
A
394 }
395 RBBIDebugPrintf("\n");
396 }
b75a7d8f 397}
374ca955 398#endif
b75a7d8f
A
399
400
401//------------------------------------------------------------------------
402//
403// printRangeGroups A debugging function.
404// dump out all of the range groups.
405//
406//------------------------------------------------------------------------
374ca955 407#ifdef RBBI_DEBUG
b75a7d8f
A
408void RBBISetBuilder::printRangeGroups() {
409 RangeDescriptor *rlRange;
410 RangeDescriptor *tRange;
411 int i;
412 int lastPrintedGroupNum = 0;
413
414 RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
415 for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
416 int groupNum = rlRange->fNum & 0xbfff;
417 if (groupNum > lastPrintedGroupNum) {
418 lastPrintedGroupNum = groupNum;
419 RBBIDebugPrintf("%2i ", groupNum);
420
421 if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
422
423 for (i=0; i<rlRange->fIncludesSets->size(); i++) {
424 RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
374ca955 425 UnicodeString setName = UNICODE_STRING("anon", 4);
b75a7d8f
A
426 RBBINode *setRef = usetNode->fParent;
427 if (setRef != NULL) {
428 RBBINode *varRef = setRef->fParent;
429 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
430 setName = varRef->fText;
431 }
432 }
374ca955 433 RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
b75a7d8f
A
434 }
435
436 i = 0;
437 for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
438 if (tRange->fNum == rlRange->fNum) {
439 if (i++ % 5 == 0) {
440 RBBIDebugPrintf("\n ");
441 }
442 RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar);
443 }
444 }
445 RBBIDebugPrintf("\n");
446 }
447 }
448 RBBIDebugPrintf("\n");
449}
374ca955 450#endif
b75a7d8f
A
451
452
453//------------------------------------------------------------------------
454//
455// printSets A debugging function.
456// dump out all of the set definitions.
457//
458//------------------------------------------------------------------------
b75a7d8f 459#ifdef RBBI_DEBUG
374ca955 460void RBBISetBuilder::printSets() {
b75a7d8f
A
461 int i;
462
463 RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
464 for (i=0; ; i++) {
465 RBBINode *usetNode;
466 RBBINode *setRef;
467 RBBINode *varRef;
468 UnicodeString setName;
469
470 usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
471 if (usetNode == NULL) {
472 break;
473 }
474
475 RBBIDebugPrintf("%3d ", i);
374ca955 476 setName = UNICODE_STRING("anonymous", 9);
b75a7d8f
A
477 setRef = usetNode->fParent;
478 if (setRef != NULL) {
479 varRef = setRef->fParent;
480 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
481 setName = varRef->fText;
482 }
483 }
374ca955 484 RBBI_DEBUG_printUnicodeString(setName);
b75a7d8f 485 RBBIDebugPrintf(" ");
374ca955 486 RBBI_DEBUG_printUnicodeString(usetNode->fText);
b75a7d8f
A
487 RBBIDebugPrintf("\n");
488 if (usetNode->fLeftChild != NULL) {
374ca955 489 usetNode->fLeftChild->printTree(TRUE);
b75a7d8f
A
490 }
491 }
492 RBBIDebugPrintf("\n");
b75a7d8f 493}
374ca955 494#endif
b75a7d8f
A
495
496
497
498//-------------------------------------------------------------------------------------
499//
500// RangeDescriptor copy constructor
501//
502//-------------------------------------------------------------------------------------
503
504RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
505 int i;
506
507 this->fStartChar = other.fStartChar;
508 this->fEndChar = other.fEndChar;
509 this->fNum = other.fNum;
510 this->fNext = NULL;
374ca955 511 UErrorCode oldstatus = status;
b75a7d8f 512 this->fIncludesSets = new UVector(status);
374ca955
A
513 if (U_FAILURE(oldstatus)) {
514 status = oldstatus;
515 }
516 if (U_FAILURE(status)) {
517 return;
518 }
b75a7d8f
A
519 /* test for NULL */
520 if (this->fIncludesSets == 0) {
521 status = U_MEMORY_ALLOCATION_ERROR;
522 return;
523 }
524
525 for (i=0; i<other.fIncludesSets->size(); i++) {
526 this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
527 }
528}
529
530
531//-------------------------------------------------------------------------------------
532//
533// RangeDesriptor default constructor
534//
535//-------------------------------------------------------------------------------------
536RangeDescriptor::RangeDescriptor(UErrorCode &status) {
537 this->fStartChar = 0;
538 this->fEndChar = 0;
539 this->fNum = 0;
540 this->fNext = NULL;
374ca955 541 UErrorCode oldstatus = status;
b75a7d8f 542 this->fIncludesSets = new UVector(status);
374ca955
A
543 if (U_FAILURE(oldstatus)) {
544 status = oldstatus;
545 }
546 if (U_FAILURE(status)) {
547 return;
548 }
b75a7d8f
A
549 /* test for NULL */
550 if(this->fIncludesSets == 0) {
551 status = U_MEMORY_ALLOCATION_ERROR;
552 return;
553 }
554
555}
556
557
558//-------------------------------------------------------------------------------------
559//
560// RangeDesriptor Destructor
561//
562//-------------------------------------------------------------------------------------
563RangeDescriptor::~RangeDescriptor() {
564 delete fIncludesSets;
565 fIncludesSets = NULL;
566}
567
568//-------------------------------------------------------------------------------------
569//
570// RangeDesriptor::split()
571//
572//-------------------------------------------------------------------------------------
573void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
574 U_ASSERT(where>fStartChar && where<=fEndChar);
575 RangeDescriptor *nr = new RangeDescriptor(*this, status);
374ca955
A
576 if (U_FAILURE(status)) {
577 return;
578 }
b75a7d8f
A
579 /* test for NULL */
580 if(nr == 0) {
581 status = U_MEMORY_ALLOCATION_ERROR;
582 return;
583 }
584
585 // RangeDescriptor copy constructor copies all fields.
586 // Only need to update those that are different after the split.
587 nr->fStartChar = where;
588 this->fEndChar = where-1;
589 nr->fNext = this->fNext;
590 this->fNext = nr;
591}
592
593
594//-------------------------------------------------------------------------------------
595//
596// RangeDescriptor::setDictionaryFlag
597//
598// Character Category Numbers that include characters from
599// the original Unicode Set named "dictionary" have bit 14
600// set to 1. The RBBI runtime engine uses this to trigger
601// use of the word dictionary.
602//
603// This function looks through the Unicode Sets that it
604// (the range) includes, and sets the bit in fNum when
605// "dictionary" is among them.
606//
607// TODO: a faster way would be to find the set node for
608// "dictionary" just once, rather than looking it
609// up by name every time.
610//
611//-------------------------------------------------------------------------------------
612void RangeDescriptor::setDictionaryFlag() {
613 int i;
614
615 for (i=0; i<this->fIncludesSets->size(); i++) {
616 RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
617 UnicodeString setName;
618 RBBINode *setRef = usetNode->fParent;
619 if (setRef != NULL) {
620 RBBINode *varRef = setRef->fParent;
621 if (varRef != NULL && varRef->fType == RBBINode::varRef) {
622 setName = varRef->fText;
623 }
624 }
374ca955 625 if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
b75a7d8f
A
626 this->fNum |= 0x4000;
627 break;
628 }
629 }
630}
631
632
633
634U_NAMESPACE_END
635
636#endif /* #if !UCONFIG_NO_BREAK_ITERATION */