]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | // |
4 | // rbbisetb.cpp | |
5 | // | |
6 | /* | |
7 | *************************************************************************** | |
46f4442e | 8 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
b75a7d8f A |
9 | * and others. All rights reserved. * |
10 | *************************************************************************** | |
11 | */ | |
12 | // | |
13 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules | |
14 | // (part of the rule building process.) | |
15 | // | |
16 | // Starting with the rules parse tree from the scanner, | |
17 | // | |
18 | // - Enumerate the set of UnicodeSets that are referenced | |
19 | // by the RBBI rules. | |
20 | // - compute a set of non-overlapping character ranges | |
21 | // with all characters within a range belonging to the same | |
22 | // set of input Unicode sets. | |
23 | // - Derive a set of non-overlapping UnicodeSet (like things) | |
24 | // that will correspond to columns in the state table for | |
25 | // the RBBI execution engine. All characters within one | |
26 | // of these sets belong to the same set of the original | |
27 | // UnicodeSets from the user's rules. | |
28 | // - construct the trie table that maps input characters | |
29 | // to the index of the matching non-overlapping set of sets from | |
30 | // the previous step. | |
31 | // | |
32 | ||
33 | #include "unicode/utypes.h" | |
34 | ||
35 | #if !UCONFIG_NO_BREAK_ITERATION | |
36 | ||
37 | #include "unicode/uniset.h" | |
38 | #include "utrie.h" | |
39 | #include "uvector.h" | |
40 | #include "uassert.h" | |
41 | #include "cmemory.h" | |
42 | #include "cstring.h" | |
43 | ||
44 | #include "rbbisetb.h" | |
45 | #include "rbbinode.h" | |
46 | ||
47 | ||
48 | //------------------------------------------------------------------------ | |
49 | // | |
50 | // getFoldedRBBIValue Call-back function used during building of Trie table. | |
51 | // Folding value: just store the offset (16 bits) | |
52 | // if there is any non-0 entry. | |
53 | // (It'd really be nice if the Trie builder would provide a | |
54 | // simple default, so this function could go away from here.) | |
55 | // | |
56 | //------------------------------------------------------------------------ | |
57 | /* folding value: just store the offset (16 bits) if there is any non-0 entry */ | |
58 | U_CDECL_BEGIN | |
59 | static uint32_t U_CALLCONV | |
60 | getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) { | |
61 | uint32_t value; | |
62 | UChar32 limit; | |
63 | UBool inBlockZero; | |
64 | ||
65 | limit=start+0x400; | |
66 | while(start<limit) { | |
67 | value=utrie_get32(trie, start, &inBlockZero); | |
68 | if(inBlockZero) { | |
69 | start+=UTRIE_DATA_BLOCK_LENGTH; | |
70 | } else if(value!=0) { | |
71 | return (uint32_t)(offset|0x8000); | |
72 | } else { | |
73 | ++start; | |
74 | } | |
75 | } | |
76 | return 0; | |
77 | } | |
78 | ||
79 | ||
80 | U_CDECL_END | |
81 | ||
82 | ||
83 | ||
84 | U_NAMESPACE_BEGIN | |
85 | ||
86 | //------------------------------------------------------------------------ | |
87 | // | |
88 | // Constructor | |
89 | // | |
90 | //------------------------------------------------------------------------ | |
91 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) | |
92 | { | |
93 | fRB = rb; | |
94 | fStatus = rb->fStatus; | |
95 | fRangeList = 0; | |
96 | fTrie = 0; | |
97 | fTrieSize = 0; | |
98 | fGroupCount = 0; | |
73c04bcf | 99 | fSawBOF = FALSE; |
b75a7d8f A |
100 | } |
101 | ||
102 | ||
103 | //------------------------------------------------------------------------ | |
104 | // | |
105 | // Destructor | |
106 | // | |
107 | //------------------------------------------------------------------------ | |
108 | RBBISetBuilder::~RBBISetBuilder() | |
109 | { | |
110 | RangeDescriptor *nextRangeDesc; | |
111 | ||
112 | // Walk through & delete the linked list of RangeDescriptors | |
113 | for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { | |
114 | RangeDescriptor *r = nextRangeDesc; | |
115 | nextRangeDesc = r->fNext; | |
116 | delete r; | |
117 | } | |
118 | ||
119 | utrie_close(fTrie); | |
120 | } | |
121 | ||
122 | ||
123 | ||
124 | ||
125 | //------------------------------------------------------------------------ | |
126 | // | |
127 | // build Build the list of non-overlapping character ranges | |
128 | // from the Unicode Sets. | |
129 | // | |
130 | //------------------------------------------------------------------------ | |
131 | void RBBISetBuilder::build() { | |
132 | RBBINode *usetNode; | |
133 | RangeDescriptor *rlRange; | |
134 | ||
135 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} | |
136 | ||
137 | // | |
138 | // Initialize the process by creating a single range encompassing all characters | |
139 | // that is in no sets. | |
140 | // | |
374ca955 | 141 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
46f4442e A |
142 | if (fRangeList == NULL) { |
143 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
144 | return; | |
145 | } | |
b75a7d8f A |
146 | fRangeList->fStartChar = 0; |
147 | fRangeList->fEndChar = 0x10ffff; | |
148 | ||
374ca955 A |
149 | if (U_FAILURE(*fStatus)) { |
150 | return; | |
151 | } | |
b75a7d8f A |
152 | |
153 | // | |
154 | // Find the set of non-overlapping ranges of characters | |
155 | // | |
156 | int ni; | |
73c04bcf | 157 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
b75a7d8f A |
158 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
159 | if (usetNode==NULL) { | |
160 | break; | |
161 | } | |
162 | ||
163 | UnicodeSet *inputSet = usetNode->fInputSet; | |
164 | int32_t inputSetRangeCount = inputSet->getRangeCount(); | |
165 | int inputSetRangeIndex = 0; | |
166 | rlRange = fRangeList; | |
167 | ||
168 | for (;;) { | |
169 | if (inputSetRangeIndex >= inputSetRangeCount) { | |
170 | break; | |
171 | } | |
172 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); | |
173 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); | |
174 | ||
175 | // skip over ranges from the range list that are completely | |
176 | // below the current range from the input unicode set. | |
177 | while (rlRange->fEndChar < inputSetRangeBegin) { | |
178 | rlRange = rlRange->fNext; | |
179 | } | |
180 | ||
181 | // If the start of the range from the range list is before with | |
182 | // the start of the range from the unicode set, split the range list range | |
183 | // in two, with one part being before (wholly outside of) the unicode set | |
184 | // and the other containing the rest. | |
185 | // Then continue the loop; the post-split current range will then be skipped | |
186 | // over | |
187 | if (rlRange->fStartChar < inputSetRangeBegin) { | |
188 | rlRange->split(inputSetRangeBegin, *fStatus); | |
374ca955 A |
189 | if (U_FAILURE(*fStatus)) { |
190 | return; | |
191 | } | |
b75a7d8f A |
192 | continue; |
193 | } | |
194 | ||
195 | // Same thing at the end of the ranges... | |
196 | // If the end of the range from the range list doesn't coincide with | |
197 | // the end of the range from the unicode set, split the range list | |
198 | // range in two. The first part of the split range will be | |
199 | // wholly inside the Unicode set. | |
200 | if (rlRange->fEndChar > inputSetRangeEnd) { | |
201 | rlRange->split(inputSetRangeEnd+1, *fStatus); | |
374ca955 A |
202 | if (U_FAILURE(*fStatus)) { |
203 | return; | |
204 | } | |
b75a7d8f A |
205 | } |
206 | ||
207 | // The current rlRange is now entirely within the UnicodeSet range. | |
208 | // Add this unicode set to the list of sets for this rlRange | |
209 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { | |
210 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); | |
374ca955 A |
211 | if (U_FAILURE(*fStatus)) { |
212 | return; | |
213 | } | |
b75a7d8f A |
214 | } |
215 | ||
216 | // Advance over ranges that we are finished with. | |
217 | if (inputSetRangeEnd == rlRange->fEndChar) { | |
218 | inputSetRangeIndex++; | |
219 | } | |
220 | rlRange = rlRange->fNext; | |
221 | } | |
222 | } | |
223 | ||
224 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} | |
225 | ||
226 | // | |
227 | // Group the above ranges, with each group consisting of one or more | |
228 | // ranges that are in exactly the same set of original UnicodeSets. | |
229 | // The groups are numbered, and these group numbers are the set of | |
230 | // input symbols recognized by the run-time state machine. | |
231 | // | |
73c04bcf A |
232 | // Numbering: # 0 (state table column 0) is unused. |
233 | // # 1 is reserved - table column 1 is for end-of-input | |
234 | // # 2 is reserved - table column 2 is for beginning-in-input | |
235 | // # 3 is the first range list. | |
236 | // | |
b75a7d8f A |
237 | RangeDescriptor *rlSearchRange; |
238 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
239 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { | |
240 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { | |
241 | rlRange->fNum = rlSearchRange->fNum; | |
242 | break; | |
243 | } | |
244 | } | |
245 | if (rlRange->fNum == 0) { | |
246 | fGroupCount ++; | |
73c04bcf | 247 | rlRange->fNum = fGroupCount+2; |
b75a7d8f | 248 | rlRange->setDictionaryFlag(); |
73c04bcf A |
249 | addValToSets(rlRange->fIncludesSets, fGroupCount+2); |
250 | } | |
251 | } | |
252 | ||
253 | // Handle input sets that contain the special string {eof}. | |
254 | // Column 1 of the state table is reserved for EOF on input. | |
255 | // Column 2 is reserved for before-the-start-input. | |
256 | // (This column can be optimized away later if there are no rule | |
257 | // references to {bof}.) | |
258 | // Add this column value (1 or 2) to the equivalent expression | |
259 | // subtree for each UnicodeSet that contains the string {eof} | |
260 | // Because {bof} and {eof} are not a characters in the normal sense, | |
261 | // they doesn't affect the computation of ranges or TRIE. | |
262 | static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; | |
263 | static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; | |
264 | ||
265 | UnicodeString eofString(eofUString); | |
266 | UnicodeString bofString(bofUString); | |
267 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules | |
268 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); | |
269 | if (usetNode==NULL) { | |
270 | break; | |
271 | } | |
272 | UnicodeSet *inputSet = usetNode->fInputSet; | |
273 | if (inputSet->contains(eofString)) { | |
274 | addValToSet(usetNode, 1); | |
275 | } | |
276 | if (inputSet->contains(bofString)) { | |
277 | addValToSet(usetNode, 2); | |
278 | fSawBOF = TRUE; | |
b75a7d8f A |
279 | } |
280 | } | |
281 | ||
73c04bcf | 282 | |
b75a7d8f A |
283 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} |
284 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} | |
285 | ||
286 | // | |
287 | // Build the Trie table for mapping UChar32 values to the corresponding | |
288 | // range group number | |
289 | // | |
290 | fTrie = utrie_open(NULL, // Pre-existing trie to be filled in | |
291 | NULL, // Data array (utrie will allocate one) | |
292 | 100000, // Max Data Length | |
293 | 0, // Initial value for all code points | |
374ca955 | 294 | 0, // Lead surrogate unit value |
b75a7d8f A |
295 | TRUE); // Keep Latin 1 in separately |
296 | ||
297 | ||
298 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
299 | utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE); | |
300 | } | |
301 | } | |
302 | ||
303 | ||
304 | ||
305 | //----------------------------------------------------------------------------------- | |
306 | // | |
307 | // getTrieSize() Return the size that will be required to serialize the Trie. | |
308 | // | |
309 | //----------------------------------------------------------------------------------- | |
374ca955 | 310 | int32_t RBBISetBuilder::getTrieSize() /*const*/ { |
b75a7d8f A |
311 | fTrieSize = utrie_serialize(fTrie, |
312 | NULL, // Buffer | |
313 | 0, // Capacity | |
314 | getFoldedRBBIValue, | |
315 | TRUE, // Reduce to 16 bits | |
316 | fStatus); | |
317 | // RBBIDebugPrintf("Trie table size is %d\n", trieSize); | |
318 | return fTrieSize; | |
319 | } | |
320 | ||
321 | ||
322 | //----------------------------------------------------------------------------------- | |
323 | // | |
324 | // serializeTrie() Put the serialized trie at the specified address. | |
325 | // Trust the caller to have given us enough memory. | |
326 | // getTrieSize() MUST be called first. | |
327 | // | |
328 | //----------------------------------------------------------------------------------- | |
329 | void RBBISetBuilder::serializeTrie(uint8_t *where) { | |
330 | utrie_serialize(fTrie, | |
331 | where, // Buffer | |
332 | fTrieSize, // Capacity | |
333 | getFoldedRBBIValue, | |
334 | TRUE, // Reduce to 16 bits | |
335 | fStatus); | |
336 | } | |
337 | ||
338 | //------------------------------------------------------------------------ | |
339 | // | |
340 | // addValToSets Add a runtime-mapped input value to each uset from a | |
73c04bcf | 341 | // list of uset nodes. (val corresponds to a state table column.) |
b75a7d8f A |
342 | // For each of the original Unicode sets - which correspond |
343 | // directly to uset nodes - a logically equivalent expression | |
344 | // is constructed in terms of the remapped runtime input | |
345 | // symbol set. This function adds one runtime input symbol to | |
346 | // a list of sets. | |
347 | // | |
348 | // The "logically equivalent expression" is the tree for an | |
349 | // or-ing together of all of the symbols that go into the set. | |
350 | // | |
351 | //------------------------------------------------------------------------ | |
352 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { | |
353 | int32_t ix; | |
354 | ||
355 | for (ix=0; ix<sets->size(); ix++) { | |
356 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); | |
73c04bcf | 357 | addValToSet(usetNode, val); |
b75a7d8f A |
358 | } |
359 | } | |
360 | ||
73c04bcf A |
361 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
362 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); | |
46f4442e A |
363 | if (leafNode == NULL) { |
364 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
365 | return; | |
366 | } | |
73c04bcf A |
367 | leafNode->fVal = (unsigned short)val; |
368 | if (usetNode->fLeftChild == NULL) { | |
369 | usetNode->fLeftChild = leafNode; | |
370 | leafNode->fParent = usetNode; | |
371 | } else { | |
372 | // There are already input symbols present for this set. | |
373 | // Set up an OR node, with the previous stuff as the left child | |
374 | // and the new value as the right child. | |
375 | RBBINode *orNode = new RBBINode(RBBINode::opOr); | |
46f4442e A |
376 | if (orNode == NULL) { |
377 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
378 | return; | |
379 | } | |
73c04bcf A |
380 | orNode->fLeftChild = usetNode->fLeftChild; |
381 | orNode->fRightChild = leafNode; | |
382 | orNode->fLeftChild->fParent = orNode; | |
383 | orNode->fRightChild->fParent = orNode; | |
384 | usetNode->fLeftChild = orNode; | |
385 | orNode->fParent = usetNode; | |
386 | } | |
387 | } | |
b75a7d8f A |
388 | |
389 | ||
390 | //------------------------------------------------------------------------ | |
391 | // | |
73c04bcf | 392 | // getNumCharCategories |
b75a7d8f A |
393 | // |
394 | //------------------------------------------------------------------------ | |
374ca955 | 395 | int32_t RBBISetBuilder::getNumCharCategories() const { |
73c04bcf | 396 | return fGroupCount + 3; |
b75a7d8f A |
397 | } |
398 | ||
399 | ||
73c04bcf A |
400 | //------------------------------------------------------------------------ |
401 | // | |
402 | // sawBOF | |
403 | // | |
404 | //------------------------------------------------------------------------ | |
405 | UBool RBBISetBuilder::sawBOF() const { | |
406 | return fSawBOF; | |
407 | } | |
408 | ||
b75a7d8f | 409 | |
374ca955 A |
410 | //------------------------------------------------------------------------ |
411 | // | |
412 | // getFirstChar Given a runtime RBBI character category, find | |
413 | // the first UChar32 that is in the set of chars | |
414 | // in the category. | |
415 | //------------------------------------------------------------------------ | |
416 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { | |
417 | RangeDescriptor *rlRange; | |
418 | UChar32 retVal = (UChar32)-1; | |
419 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
420 | if (rlRange->fNum == category) { | |
421 | retVal = rlRange->fStartChar; | |
422 | break; | |
423 | } | |
424 | } | |
425 | return retVal; | |
426 | } | |
427 | ||
428 | ||
429 | ||
b75a7d8f A |
430 | //------------------------------------------------------------------------ |
431 | // | |
432 | // printRanges A debugging function. | |
433 | // dump out all of the range definitions. | |
434 | // | |
435 | //------------------------------------------------------------------------ | |
#ifdef RBBI_DEBUG
// Debugging aid: print every range in the range list, one per line, with its
// number, its start and end characters, and the names of the sets that
// include it. A set is shown as "anon" unless the parent of its parent node
// is a variable reference, in which case that variable's text is shown.
void RBBISetBuilder::printRanges() {
    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");

    for (RangeDescriptor *range = fRangeList; range != NULL; range = range->fNext) {
        RBBIDebugPrintf("%2i %4x-%4x ", range->fNum, range->fStartChar, range->fEndChar);

        const int setCount = range->fIncludesSets->size();
        for (int s = 0; s < setCount; ++s) {
            RBBINode *node   = (RBBINode *)range->fIncludesSets->elementAt(s);
            RBBINode *setRef = node->fParent;
            RBBINode *varRef = (setRef != NULL) ? setRef->fParent : NULL;

            UnicodeString label = UNICODE_STRING("anon", 4);
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                label = varRef->fText;
            }
            RBBI_DEBUG_printUnicodeString(label);
            RBBIDebugPrintf(" ");
        }
        RBBIDebugPrintf("\n");
    }
}
#endif
b75a7d8f A |
461 | |
462 | ||
463 | //------------------------------------------------------------------------ | |
464 | // | |
465 | // printRangeGroups A debugging function. | |
466 | // dump out all of the range groups. | |
467 | // | |
468 | //------------------------------------------------------------------------ | |
#ifdef RBBI_DEBUG
// Debugging aid: print the ranges grouped by set membership. Each group
// number is printed once, the first time a range with a higher number than
// any printed so far is met. It is followed by a <DICT> marker when bit
// 0x4000 is set, the names of the sets, and every range from that point
// onward carrying the same fNum, five ranges to a line.
void RBBISetBuilder::printRangeGroups() {
    int highestPrinted = 0;

    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
    for (RangeDescriptor *range = fRangeList; range != NULL; range = range->fNext) {
        // The group number is fNum without bit 0x4000.
        const int group = range->fNum & 0xbfff;
        if (group <= highestPrinted) {
            continue;
        }
        highestPrinted = group;
        RBBIDebugPrintf("%2i ", group);

        if (range->fNum & 0x4000) {
            RBBIDebugPrintf(" <DICT> ");
        }

        for (int s = 0; s < range->fIncludesSets->size(); ++s) {
            RBBINode *node   = (RBBINode *)range->fIncludesSets->elementAt(s);
            RBBINode *setRef = node->fParent;
            RBBINode *varRef = (setRef != NULL) ? setRef->fParent : NULL;

            UnicodeString label = UNICODE_STRING("anon", 4);
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                label = varRef->fText;
            }
            RBBI_DEBUG_printUnicodeString(label);
            RBBIDebugPrintf(" ");
        }

        int printed = 0;
        for (RangeDescriptor *member = range; member != NULL; member = member->fNext) {
            if (member->fNum != range->fNum) {
                continue;
            }
            // Start a new output line before every fifth range.
            if (printed % 5 == 0) {
                RBBIDebugPrintf("\n ");
            }
            ++printed;
            RBBIDebugPrintf(" %05x-%05x", member->fStartChar, member->fEndChar);
        }
        RBBIDebugPrintf("\n");
    }
    RBBIDebugPrintf("\n");
}
#endif
b75a7d8f A |
513 | |
514 | ||
515 | //------------------------------------------------------------------------ | |
516 | // | |
517 | // printSets A debugging function. | |
518 | // dump out all of the set definitions. | |
519 | // | |
520 | //------------------------------------------------------------------------ | |
#ifdef RBBI_DEBUG
// Debugging aid: print every uset node known to the rule builder. For each
// one it prints the index, the name ("anonymous" unless the parent of its
// parent node is a variable reference), the node's own text, and the subtree
// below its left child when there is one.
void RBBISetBuilder::printSets() {
    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");

    int index = 0;
    for (;;) {
        RBBINode *node = (RBBINode *)fRB->fUSetNodes->elementAt(index);
        if (node == NULL) {
            // A NULL element ends the loop.
            break;
        }

        RBBIDebugPrintf("%3d ", index);

        UnicodeString label;
        label = UNICODE_STRING("anonymous", 9);
        RBBINode *setRef = node->fParent;
        if (setRef != NULL) {
            RBBINode *varRef = setRef->fParent;
            if (varRef != NULL && varRef->fType == RBBINode::varRef) {
                label = varRef->fText;
            }
        }

        RBBI_DEBUG_printUnicodeString(label);
        RBBIDebugPrintf(" ");
        RBBI_DEBUG_printUnicodeString(node->fText);
        RBBIDebugPrintf("\n");

        if (node->fLeftChild != NULL) {
            RBBINode::printTree(node->fLeftChild, TRUE);
        }
        ++index;
    }
    RBBIDebugPrintf("\n");
}
#endif
b75a7d8f A |
557 | |
558 | ||
559 | ||
560 | //------------------------------------------------------------------------------------- | |
561 | // | |
562 | // RangeDescriptor copy constructor | |
563 | // | |
564 | //------------------------------------------------------------------------------------- | |
565 | ||
566 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { | |
567 | int i; | |
568 | ||
569 | this->fStartChar = other.fStartChar; | |
570 | this->fEndChar = other.fEndChar; | |
571 | this->fNum = other.fNum; | |
572 | this->fNext = NULL; | |
374ca955 | 573 | UErrorCode oldstatus = status; |
b75a7d8f | 574 | this->fIncludesSets = new UVector(status); |
374ca955 A |
575 | if (U_FAILURE(oldstatus)) { |
576 | status = oldstatus; | |
577 | } | |
578 | if (U_FAILURE(status)) { | |
579 | return; | |
580 | } | |
b75a7d8f A |
581 | /* test for NULL */ |
582 | if (this->fIncludesSets == 0) { | |
583 | status = U_MEMORY_ALLOCATION_ERROR; | |
584 | return; | |
585 | } | |
586 | ||
587 | for (i=0; i<other.fIncludesSets->size(); i++) { | |
588 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); | |
589 | } | |
590 | } | |
591 | ||
592 | ||
593 | //------------------------------------------------------------------------------------- | |
594 | // | |
595 | // RangeDescriptor default constructor | |
596 | // | |
597 | //------------------------------------------------------------------------------------- | |
598 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { | |
599 | this->fStartChar = 0; | |
600 | this->fEndChar = 0; | |
601 | this->fNum = 0; | |
602 | this->fNext = NULL; | |
374ca955 | 603 | UErrorCode oldstatus = status; |
b75a7d8f | 604 | this->fIncludesSets = new UVector(status); |
374ca955 A |
605 | if (U_FAILURE(oldstatus)) { |
606 | status = oldstatus; | |
607 | } | |
608 | if (U_FAILURE(status)) { | |
609 | return; | |
610 | } | |
b75a7d8f A |
611 | /* test for NULL */ |
612 | if(this->fIncludesSets == 0) { | |
613 | status = U_MEMORY_ALLOCATION_ERROR; | |
614 | return; | |
615 | } | |
616 | ||
617 | } | |
618 | ||
619 | ||
620 | //------------------------------------------------------------------------------------- | |
621 | // | |
622 | // RangeDescriptor Destructor | |
623 | // | |
624 | //------------------------------------------------------------------------------------- | |
625 | RangeDescriptor::~RangeDescriptor() { | |
626 | delete fIncludesSets; | |
627 | fIncludesSets = NULL; | |
628 | } | |
629 | ||
630 | //------------------------------------------------------------------------------------- | |
631 | // | |
632 | // RangeDescriptor::split() | |
633 | // | |
634 | //------------------------------------------------------------------------------------- | |
635 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { | |
636 | U_ASSERT(where>fStartChar && where<=fEndChar); | |
637 | RangeDescriptor *nr = new RangeDescriptor(*this, status); | |
b75a7d8f A |
638 | if(nr == 0) { |
639 | status = U_MEMORY_ALLOCATION_ERROR; | |
640 | return; | |
641 | } | |
46f4442e A |
642 | if (U_FAILURE(status)) { |
643 | delete nr; | |
644 | return; | |
645 | } | |
b75a7d8f A |
646 | // RangeDescriptor copy constructor copies all fields. |
647 | // Only need to update those that are different after the split. | |
648 | nr->fStartChar = where; | |
649 | this->fEndChar = where-1; | |
650 | nr->fNext = this->fNext; | |
651 | this->fNext = nr; | |
652 | } | |
653 | ||
654 | ||
655 | //------------------------------------------------------------------------------------- | |
656 | // | |
657 | // RangeDescriptor::setDictionaryFlag | |
658 | // | |
659 | // Character Category Numbers that include characters from | |
660 | // the original Unicode Set named "dictionary" have bit 14 | |
661 | // set to 1. The RBBI runtime engine uses this to trigger | |
662 | // use of the word dictionary. | |
663 | // | |
664 | // This function looks through the Unicode Sets that it | |
665 | // (the range) includes, and sets the bit in fNum when | |
666 | // "dictionary" is among them. | |
667 | // | |
668 | // TODO: a faster way would be to find the set node for | |
669 | // "dictionary" just once, rather than looking it | |
670 | // up by name every time. | |
671 | // | |
672 | //------------------------------------------------------------------------------------- | |
673 | void RangeDescriptor::setDictionaryFlag() { | |
674 | int i; | |
675 | ||
676 | for (i=0; i<this->fIncludesSets->size(); i++) { | |
677 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); | |
678 | UnicodeString setName; | |
679 | RBBINode *setRef = usetNode->fParent; | |
680 | if (setRef != NULL) { | |
681 | RBBINode *varRef = setRef->fParent; | |
682 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
683 | setName = varRef->fText; | |
684 | } | |
685 | } | |
374ca955 | 686 | if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals. |
b75a7d8f A |
687 | this->fNum |= 0x4000; |
688 | break; | |
689 | } | |
690 | } | |
691 | } | |
692 | ||
693 | ||
694 | ||
695 | U_NAMESPACE_END | |
696 | ||
697 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |