]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | // |
2 | // rbbisetb.cpp | |
3 | // | |
4 | /* | |
5 | *************************************************************************** | |
46f4442e | 6 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
b75a7d8f A |
7 | * and others. All rights reserved. * |
8 | *************************************************************************** | |
9 | */ | |
10 | // | |
11 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules | |
12 | // (part of the rule building process.) | |
13 | // | |
14 | // Starting with the rules parse tree from the scanner, | |
15 | // | |
16 | // - Enumerate the set of UnicodeSets that are referenced | |
17 | // by the RBBI rules. | |
18 | // - compute a set of non-overlapping character ranges | |
19 | // with all characters within a range belonging to the same | |
20 | // set of input uniocde sets. | |
21 | // - Derive a set of non-overlapping UnicodeSet (like things) | |
22 | // that will correspond to columns in the state table for | |
23 | // the RBBI execution engine. All characters within one | |
24 | // of these sets belong to the same set of the original | |
25 | // UnicodeSets from the user's rules. | |
26 | // - construct the trie table that maps input characters | |
27 | // to the index of the matching non-overlapping set of set from | |
28 | // the previous step. | |
29 | // | |
30 | ||
31 | #include "unicode/utypes.h" | |
32 | ||
33 | #if !UCONFIG_NO_BREAK_ITERATION | |
34 | ||
35 | #include "unicode/uniset.h" | |
36 | #include "utrie.h" | |
37 | #include "uvector.h" | |
38 | #include "uassert.h" | |
39 | #include "cmemory.h" | |
40 | #include "cstring.h" | |
41 | ||
42 | #include "rbbisetb.h" | |
43 | #include "rbbinode.h" | |
44 | ||
45 | ||
46 | //------------------------------------------------------------------------ | |
47 | // | |
48 | // getFoldedRBBIValue Call-back function used during building of Trie table. | |
49 | // Folding value: just store the offset (16 bits) | |
50 | // if there is any non-0 entry. | |
51 | // (It'd really be nice if the Trie builder would provide a | |
52 | // simple default, so this function could go away from here.) | |
53 | // | |
54 | //------------------------------------------------------------------------ | |
55 | /* folding value: just store the offset (16 bits) if there is any non-0 entry */ | |
56 | U_CDECL_BEGIN | |
57 | static uint32_t U_CALLCONV | |
58 | getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) { | |
59 | uint32_t value; | |
60 | UChar32 limit; | |
61 | UBool inBlockZero; | |
62 | ||
63 | limit=start+0x400; | |
64 | while(start<limit) { | |
65 | value=utrie_get32(trie, start, &inBlockZero); | |
66 | if(inBlockZero) { | |
67 | start+=UTRIE_DATA_BLOCK_LENGTH; | |
68 | } else if(value!=0) { | |
69 | return (uint32_t)(offset|0x8000); | |
70 | } else { | |
71 | ++start; | |
72 | } | |
73 | } | |
74 | return 0; | |
75 | } | |
76 | ||
77 | ||
78 | U_CDECL_END | |
79 | ||
80 | ||
81 | ||
82 | U_NAMESPACE_BEGIN | |
83 | ||
84 | //------------------------------------------------------------------------ | |
85 | // | |
86 | // Constructor | |
87 | // | |
88 | //------------------------------------------------------------------------ | |
89 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) | |
90 | { | |
91 | fRB = rb; | |
92 | fStatus = rb->fStatus; | |
93 | fRangeList = 0; | |
94 | fTrie = 0; | |
95 | fTrieSize = 0; | |
96 | fGroupCount = 0; | |
73c04bcf | 97 | fSawBOF = FALSE; |
b75a7d8f A |
98 | } |
99 | ||
100 | ||
101 | //------------------------------------------------------------------------ | |
102 | // | |
103 | // Destructor | |
104 | // | |
105 | //------------------------------------------------------------------------ | |
106 | RBBISetBuilder::~RBBISetBuilder() | |
107 | { | |
108 | RangeDescriptor *nextRangeDesc; | |
109 | ||
110 | // Walk through & delete the linked list of RangeDescriptors | |
111 | for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { | |
112 | RangeDescriptor *r = nextRangeDesc; | |
113 | nextRangeDesc = r->fNext; | |
114 | delete r; | |
115 | } | |
116 | ||
117 | utrie_close(fTrie); | |
118 | } | |
119 | ||
120 | ||
121 | ||
122 | ||
123 | //------------------------------------------------------------------------ | |
124 | // | |
125 | // build Build the list of non-overlapping character ranges | |
126 | // from the Unicode Sets. | |
127 | // | |
128 | //------------------------------------------------------------------------ | |
129 | void RBBISetBuilder::build() { | |
130 | RBBINode *usetNode; | |
131 | RangeDescriptor *rlRange; | |
132 | ||
133 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} | |
134 | ||
135 | // | |
136 | // Initialize the process by creating a single range encompassing all characters | |
137 | // that is in no sets. | |
138 | // | |
374ca955 | 139 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
46f4442e A |
140 | if (fRangeList == NULL) { |
141 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
142 | return; | |
143 | } | |
b75a7d8f A |
144 | fRangeList->fStartChar = 0; |
145 | fRangeList->fEndChar = 0x10ffff; | |
146 | ||
374ca955 A |
147 | if (U_FAILURE(*fStatus)) { |
148 | return; | |
149 | } | |
b75a7d8f A |
150 | |
151 | // | |
152 | // Find the set of non-overlapping ranges of characters | |
153 | // | |
154 | int ni; | |
73c04bcf | 155 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
b75a7d8f A |
156 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
157 | if (usetNode==NULL) { | |
158 | break; | |
159 | } | |
160 | ||
161 | UnicodeSet *inputSet = usetNode->fInputSet; | |
162 | int32_t inputSetRangeCount = inputSet->getRangeCount(); | |
163 | int inputSetRangeIndex = 0; | |
164 | rlRange = fRangeList; | |
165 | ||
166 | for (;;) { | |
167 | if (inputSetRangeIndex >= inputSetRangeCount) { | |
168 | break; | |
169 | } | |
170 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); | |
171 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); | |
172 | ||
173 | // skip over ranges from the range list that are completely | |
174 | // below the current range from the input unicode set. | |
175 | while (rlRange->fEndChar < inputSetRangeBegin) { | |
176 | rlRange = rlRange->fNext; | |
177 | } | |
178 | ||
179 | // If the start of the range from the range list is before with | |
180 | // the start of the range from the unicode set, split the range list range | |
181 | // in two, with one part being before (wholly outside of) the unicode set | |
182 | // and the other containing the rest. | |
183 | // Then continue the loop; the post-split current range will then be skipped | |
184 | // over | |
185 | if (rlRange->fStartChar < inputSetRangeBegin) { | |
186 | rlRange->split(inputSetRangeBegin, *fStatus); | |
374ca955 A |
187 | if (U_FAILURE(*fStatus)) { |
188 | return; | |
189 | } | |
b75a7d8f A |
190 | continue; |
191 | } | |
192 | ||
193 | // Same thing at the end of the ranges... | |
194 | // If the end of the range from the range list doesn't coincide with | |
195 | // the end of the range from the unicode set, split the range list | |
196 | // range in two. The first part of the split range will be | |
197 | // wholly inside the Unicode set. | |
198 | if (rlRange->fEndChar > inputSetRangeEnd) { | |
199 | rlRange->split(inputSetRangeEnd+1, *fStatus); | |
374ca955 A |
200 | if (U_FAILURE(*fStatus)) { |
201 | return; | |
202 | } | |
b75a7d8f A |
203 | } |
204 | ||
205 | // The current rlRange is now entirely within the UnicodeSet range. | |
206 | // Add this unicode set to the list of sets for this rlRange | |
207 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { | |
208 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); | |
374ca955 A |
209 | if (U_FAILURE(*fStatus)) { |
210 | return; | |
211 | } | |
b75a7d8f A |
212 | } |
213 | ||
214 | // Advance over ranges that we are finished with. | |
215 | if (inputSetRangeEnd == rlRange->fEndChar) { | |
216 | inputSetRangeIndex++; | |
217 | } | |
218 | rlRange = rlRange->fNext; | |
219 | } | |
220 | } | |
221 | ||
222 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} | |
223 | ||
224 | // | |
225 | // Group the above ranges, with each group consisting of one or more | |
226 | // ranges that are in exactly the same set of original UnicodeSets. | |
227 | // The groups are numbered, and these group numbers are the set of | |
228 | // input symbols recognized by the run-time state machine. | |
229 | // | |
73c04bcf A |
230 | // Numbering: # 0 (state table column 0) is unused. |
231 | // # 1 is reserved - table column 1 is for end-of-input | |
232 | // # 2 is reserved - table column 2 is for beginning-in-input | |
233 | // # 3 is the first range list. | |
234 | // | |
b75a7d8f A |
235 | RangeDescriptor *rlSearchRange; |
236 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
237 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { | |
238 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { | |
239 | rlRange->fNum = rlSearchRange->fNum; | |
240 | break; | |
241 | } | |
242 | } | |
243 | if (rlRange->fNum == 0) { | |
244 | fGroupCount ++; | |
73c04bcf | 245 | rlRange->fNum = fGroupCount+2; |
b75a7d8f | 246 | rlRange->setDictionaryFlag(); |
73c04bcf A |
247 | addValToSets(rlRange->fIncludesSets, fGroupCount+2); |
248 | } | |
249 | } | |
250 | ||
251 | // Handle input sets that contain the special string {eof}. | |
252 | // Column 1 of the state table is reserved for EOF on input. | |
253 | // Column 2 is reserved for before-the-start-input. | |
254 | // (This column can be optimized away later if there are no rule | |
255 | // references to {bof}.) | |
256 | // Add this column value (1 or 2) to the equivalent expression | |
257 | // subtree for each UnicodeSet that contains the string {eof} | |
258 | // Because {bof} and {eof} are not a characters in the normal sense, | |
259 | // they doesn't affect the computation of ranges or TRIE. | |
260 | static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; | |
261 | static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; | |
262 | ||
263 | UnicodeString eofString(eofUString); | |
264 | UnicodeString bofString(bofUString); | |
265 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules | |
266 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); | |
267 | if (usetNode==NULL) { | |
268 | break; | |
269 | } | |
270 | UnicodeSet *inputSet = usetNode->fInputSet; | |
271 | if (inputSet->contains(eofString)) { | |
272 | addValToSet(usetNode, 1); | |
273 | } | |
274 | if (inputSet->contains(bofString)) { | |
275 | addValToSet(usetNode, 2); | |
276 | fSawBOF = TRUE; | |
b75a7d8f A |
277 | } |
278 | } | |
279 | ||
73c04bcf | 280 | |
b75a7d8f A |
281 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} |
282 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} | |
283 | ||
284 | // | |
285 | // Build the Trie table for mapping UChar32 values to the corresponding | |
286 | // range group number | |
287 | // | |
288 | fTrie = utrie_open(NULL, // Pre-existing trie to be filled in | |
289 | NULL, // Data array (utrie will allocate one) | |
290 | 100000, // Max Data Length | |
291 | 0, // Initial value for all code points | |
374ca955 | 292 | 0, // Lead surrogate unit value |
b75a7d8f A |
293 | TRUE); // Keep Latin 1 in separately |
294 | ||
295 | ||
296 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
297 | utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); | |
298 | } | |
299 | } | |
300 | ||
301 | ||
302 | ||
303 | //----------------------------------------------------------------------------------- | |
304 | // | |
305 | // getTrieSize() Return the size that will be required to serialize the Trie. | |
306 | // | |
307 | //----------------------------------------------------------------------------------- | |
374ca955 | 308 | int32_t RBBISetBuilder::getTrieSize() /*const*/ { |
b75a7d8f A |
309 | fTrieSize = utrie_serialize(fTrie, |
310 | NULL, // Buffer | |
311 | 0, // Capacity | |
312 | getFoldedRBBIValue, | |
313 | TRUE, // Reduce to 16 bits | |
314 | fStatus); | |
315 | // RBBIDebugPrintf("Trie table size is %d\n", trieSize); | |
316 | return fTrieSize; | |
317 | } | |
318 | ||
319 | ||
320 | //----------------------------------------------------------------------------------- | |
321 | // | |
322 | // serializeTrie() Put the serialized trie at the specified address. | |
323 | // Trust the caller to have given us enough memory. | |
324 | // getTrieSize() MUST be called first. | |
325 | // | |
326 | //----------------------------------------------------------------------------------- | |
327 | void RBBISetBuilder::serializeTrie(uint8_t *where) { | |
328 | utrie_serialize(fTrie, | |
329 | where, // Buffer | |
330 | fTrieSize, // Capacity | |
331 | getFoldedRBBIValue, | |
332 | TRUE, // Reduce to 16 bits | |
333 | fStatus); | |
334 | } | |
335 | ||
336 | //------------------------------------------------------------------------ | |
337 | // | |
338 | // addValToSets Add a runtime-mapped input value to each uset from a | |
73c04bcf | 339 | // list of uset nodes. (val corresponds to a state table column.) |
b75a7d8f A |
340 | // For each of the original Unicode sets - which correspond |
341 | // directly to uset nodes - a logically equivalent expression | |
342 | // is constructed in terms of the remapped runtime input | |
343 | // symbol set. This function adds one runtime input symbol to | |
344 | // a list of sets. | |
345 | // | |
346 | // The "logically equivalent expression" is the tree for an | |
347 | // or-ing together of all of the symbols that go into the set. | |
348 | // | |
349 | //------------------------------------------------------------------------ | |
350 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { | |
351 | int32_t ix; | |
352 | ||
353 | for (ix=0; ix<sets->size(); ix++) { | |
354 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); | |
73c04bcf | 355 | addValToSet(usetNode, val); |
b75a7d8f A |
356 | } |
357 | } | |
358 | ||
73c04bcf A |
359 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
360 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); | |
46f4442e A |
361 | if (leafNode == NULL) { |
362 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
363 | return; | |
364 | } | |
73c04bcf A |
365 | leafNode->fVal = (unsigned short)val; |
366 | if (usetNode->fLeftChild == NULL) { | |
367 | usetNode->fLeftChild = leafNode; | |
368 | leafNode->fParent = usetNode; | |
369 | } else { | |
370 | // There are already input symbols present for this set. | |
371 | // Set up an OR node, with the previous stuff as the left child | |
372 | // and the new value as the right child. | |
373 | RBBINode *orNode = new RBBINode(RBBINode::opOr); | |
46f4442e A |
374 | if (orNode == NULL) { |
375 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
376 | return; | |
377 | } | |
73c04bcf A |
378 | orNode->fLeftChild = usetNode->fLeftChild; |
379 | orNode->fRightChild = leafNode; | |
380 | orNode->fLeftChild->fParent = orNode; | |
381 | orNode->fRightChild->fParent = orNode; | |
382 | usetNode->fLeftChild = orNode; | |
383 | orNode->fParent = usetNode; | |
384 | } | |
385 | } | |
b75a7d8f A |
386 | |
387 | ||
388 | //------------------------------------------------------------------------ | |
389 | // | |
73c04bcf | 390 | // getNumCharCategories |
b75a7d8f A |
391 | // |
392 | //------------------------------------------------------------------------ | |
374ca955 | 393 | int32_t RBBISetBuilder::getNumCharCategories() const { |
73c04bcf | 394 | return fGroupCount + 3; |
b75a7d8f A |
395 | } |
396 | ||
397 | ||
73c04bcf A |
398 | //------------------------------------------------------------------------ |
399 | // | |
400 | // sawBOF | |
401 | // | |
402 | //------------------------------------------------------------------------ | |
403 | UBool RBBISetBuilder::sawBOF() const { | |
404 | return fSawBOF; | |
405 | } | |
406 | ||
b75a7d8f | 407 | |
374ca955 A |
408 | //------------------------------------------------------------------------ |
409 | // | |
410 | // getFirstChar Given a runtime RBBI character category, find | |
411 | // the first UChar32 that is in the set of chars | |
412 | // in the category. | |
413 | //------------------------------------------------------------------------ | |
414 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { | |
415 | RangeDescriptor *rlRange; | |
416 | UChar32 retVal = (UChar32)-1; | |
417 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
418 | if (rlRange->fNum == category) { | |
419 | retVal = rlRange->fStartChar; | |
420 | break; | |
421 | } | |
422 | } | |
423 | return retVal; | |
424 | } | |
425 | ||
426 | ||
427 | ||
b75a7d8f A |
428 | //------------------------------------------------------------------------ |
429 | // | |
430 | // printRanges A debugging function. | |
431 | // dump out all of the range definitions. | |
432 | // | |
433 | //------------------------------------------------------------------------ | |
b75a7d8f | 434 | #ifdef RBBI_DEBUG |
374ca955 | 435 | void RBBISetBuilder::printRanges() { |
b75a7d8f A |
436 | RangeDescriptor *rlRange; |
437 | int i; | |
438 | ||
439 | RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); | |
440 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
441 | RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); | |
442 | ||
443 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { | |
444 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); | |
374ca955 | 445 | UnicodeString setName = UNICODE_STRING("anon", 4); |
b75a7d8f A |
446 | RBBINode *setRef = usetNode->fParent; |
447 | if (setRef != NULL) { | |
448 | RBBINode *varRef = setRef->fParent; | |
449 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
450 | setName = varRef->fText; | |
451 | } | |
452 | } | |
374ca955 | 453 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
b75a7d8f A |
454 | } |
455 | RBBIDebugPrintf("\n"); | |
456 | } | |
b75a7d8f | 457 | } |
374ca955 | 458 | #endif |
b75a7d8f A |
459 | |
460 | ||
461 | //------------------------------------------------------------------------ | |
462 | // | |
463 | // printRangeGroups A debugging function. | |
464 | // dump out all of the range groups. | |
465 | // | |
466 | //------------------------------------------------------------------------ | |
374ca955 | 467 | #ifdef RBBI_DEBUG |
b75a7d8f A |
468 | void RBBISetBuilder::printRangeGroups() { |
469 | RangeDescriptor *rlRange; | |
470 | RangeDescriptor *tRange; | |
471 | int i; | |
472 | int lastPrintedGroupNum = 0; | |
473 | ||
474 | RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); | |
475 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
476 | int groupNum = rlRange->fNum & 0xbfff; | |
477 | if (groupNum > lastPrintedGroupNum) { | |
478 | lastPrintedGroupNum = groupNum; | |
479 | RBBIDebugPrintf("%2i ", groupNum); | |
480 | ||
481 | if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");} | |
482 | ||
483 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { | |
484 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); | |
374ca955 | 485 | UnicodeString setName = UNICODE_STRING("anon", 4); |
b75a7d8f A |
486 | RBBINode *setRef = usetNode->fParent; |
487 | if (setRef != NULL) { | |
488 | RBBINode *varRef = setRef->fParent; | |
489 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
490 | setName = varRef->fText; | |
491 | } | |
492 | } | |
374ca955 | 493 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
b75a7d8f A |
494 | } |
495 | ||
496 | i = 0; | |
497 | for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { | |
498 | if (tRange->fNum == rlRange->fNum) { | |
499 | if (i++ % 5 == 0) { | |
500 | RBBIDebugPrintf("\n "); | |
501 | } | |
502 | RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); | |
503 | } | |
504 | } | |
505 | RBBIDebugPrintf("\n"); | |
506 | } | |
507 | } | |
508 | RBBIDebugPrintf("\n"); | |
509 | } | |
374ca955 | 510 | #endif |
b75a7d8f A |
511 | |
512 | ||
513 | //------------------------------------------------------------------------ | |
514 | // | |
515 | // printSets A debugging function. | |
516 | // dump out all of the set definitions. | |
517 | // | |
518 | //------------------------------------------------------------------------ | |
b75a7d8f | 519 | #ifdef RBBI_DEBUG |
374ca955 | 520 | void RBBISetBuilder::printSets() { |
b75a7d8f A |
521 | int i; |
522 | ||
523 | RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); | |
524 | for (i=0; ; i++) { | |
46f4442e | 525 | RBBINode *usetNode; |
b75a7d8f A |
526 | RBBINode *setRef; |
527 | RBBINode *varRef; | |
528 | UnicodeString setName; | |
529 | ||
530 | usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); | |
531 | if (usetNode == NULL) { | |
532 | break; | |
533 | } | |
534 | ||
535 | RBBIDebugPrintf("%3d ", i); | |
374ca955 | 536 | setName = UNICODE_STRING("anonymous", 9); |
b75a7d8f A |
537 | setRef = usetNode->fParent; |
538 | if (setRef != NULL) { | |
539 | varRef = setRef->fParent; | |
540 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
541 | setName = varRef->fText; | |
542 | } | |
543 | } | |
374ca955 | 544 | RBBI_DEBUG_printUnicodeString(setName); |
b75a7d8f | 545 | RBBIDebugPrintf(" "); |
374ca955 | 546 | RBBI_DEBUG_printUnicodeString(usetNode->fText); |
b75a7d8f A |
547 | RBBIDebugPrintf("\n"); |
548 | if (usetNode->fLeftChild != NULL) { | |
374ca955 | 549 | usetNode->fLeftChild->printTree(TRUE); |
b75a7d8f A |
550 | } |
551 | } | |
552 | RBBIDebugPrintf("\n"); | |
b75a7d8f | 553 | } |
374ca955 | 554 | #endif |
b75a7d8f A |
555 | |
556 | ||
557 | ||
558 | //------------------------------------------------------------------------------------- | |
559 | // | |
560 | // RangeDescriptor copy constructor | |
561 | // | |
562 | //------------------------------------------------------------------------------------- | |
563 | ||
564 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { | |
565 | int i; | |
566 | ||
567 | this->fStartChar = other.fStartChar; | |
568 | this->fEndChar = other.fEndChar; | |
569 | this->fNum = other.fNum; | |
570 | this->fNext = NULL; | |
374ca955 | 571 | UErrorCode oldstatus = status; |
b75a7d8f | 572 | this->fIncludesSets = new UVector(status); |
374ca955 A |
573 | if (U_FAILURE(oldstatus)) { |
574 | status = oldstatus; | |
575 | } | |
576 | if (U_FAILURE(status)) { | |
577 | return; | |
578 | } | |
b75a7d8f A |
579 | /* test for NULL */ |
580 | if (this->fIncludesSets == 0) { | |
581 | status = U_MEMORY_ALLOCATION_ERROR; | |
582 | return; | |
583 | } | |
584 | ||
585 | for (i=0; i<other.fIncludesSets->size(); i++) { | |
586 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); | |
587 | } | |
588 | } | |
589 | ||
590 | ||
591 | //------------------------------------------------------------------------------------- | |
592 | // | |
593 | // RangeDesriptor default constructor | |
594 | // | |
595 | //------------------------------------------------------------------------------------- | |
596 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { | |
597 | this->fStartChar = 0; | |
598 | this->fEndChar = 0; | |
599 | this->fNum = 0; | |
600 | this->fNext = NULL; | |
374ca955 | 601 | UErrorCode oldstatus = status; |
b75a7d8f | 602 | this->fIncludesSets = new UVector(status); |
374ca955 A |
603 | if (U_FAILURE(oldstatus)) { |
604 | status = oldstatus; | |
605 | } | |
606 | if (U_FAILURE(status)) { | |
607 | return; | |
608 | } | |
b75a7d8f A |
609 | /* test for NULL */ |
610 | if(this->fIncludesSets == 0) { | |
611 | status = U_MEMORY_ALLOCATION_ERROR; | |
612 | return; | |
613 | } | |
614 | ||
615 | } | |
616 | ||
617 | ||
618 | //------------------------------------------------------------------------------------- | |
619 | // | |
620 | // RangeDesriptor Destructor | |
621 | // | |
622 | //------------------------------------------------------------------------------------- | |
623 | RangeDescriptor::~RangeDescriptor() { | |
624 | delete fIncludesSets; | |
625 | fIncludesSets = NULL; | |
626 | } | |
627 | ||
628 | //------------------------------------------------------------------------------------- | |
629 | // | |
630 | // RangeDesriptor::split() | |
631 | // | |
632 | //------------------------------------------------------------------------------------- | |
633 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { | |
634 | U_ASSERT(where>fStartChar && where<=fEndChar); | |
635 | RangeDescriptor *nr = new RangeDescriptor(*this, status); | |
b75a7d8f A |
636 | if(nr == 0) { |
637 | status = U_MEMORY_ALLOCATION_ERROR; | |
638 | return; | |
639 | } | |
46f4442e A |
640 | if (U_FAILURE(status)) { |
641 | delete nr; | |
642 | return; | |
643 | } | |
b75a7d8f A |
644 | // RangeDescriptor copy constructor copies all fields. |
645 | // Only need to update those that are different after the split. | |
646 | nr->fStartChar = where; | |
647 | this->fEndChar = where-1; | |
648 | nr->fNext = this->fNext; | |
649 | this->fNext = nr; | |
650 | } | |
651 | ||
652 | ||
653 | //------------------------------------------------------------------------------------- | |
654 | // | |
655 | // RangeDescriptor::setDictionaryFlag | |
656 | // | |
657 | // Character Category Numbers that include characters from | |
658 | // the original Unicode Set named "dictionary" have bit 14 | |
659 | // set to 1. The RBBI runtime engine uses this to trigger | |
660 | // use of the word dictionary. | |
661 | // | |
662 | // This function looks through the Unicode Sets that it | |
663 | // (the range) includes, and sets the bit in fNum when | |
664 | // "dictionary" is among them. | |
665 | // | |
666 | // TODO: a faster way would be to find the set node for | |
667 | // "dictionary" just once, rather than looking it | |
668 | // up by name every time. | |
669 | // | |
670 | //------------------------------------------------------------------------------------- | |
671 | void RangeDescriptor::setDictionaryFlag() { | |
672 | int i; | |
673 | ||
674 | for (i=0; i<this->fIncludesSets->size(); i++) { | |
675 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); | |
676 | UnicodeString setName; | |
677 | RBBINode *setRef = usetNode->fParent; | |
678 | if (setRef != NULL) { | |
679 | RBBINode *varRef = setRef->fParent; | |
680 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
681 | setName = varRef->fText; | |
682 | } | |
683 | } | |
374ca955 | 684 | if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. |
b75a7d8f A |
685 | this->fNum |= 0x4000; |
686 | break; | |
687 | } | |
688 | } | |
689 | } | |
690 | ||
691 | ||
692 | ||
693 | U_NAMESPACE_END | |
694 | ||
695 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |