]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | // |
4 | // rbbisetb.cpp | |
5 | // | |
6 | /* | |
7 | *************************************************************************** | |
46f4442e | 8 | * Copyright (C) 2002-2008 International Business Machines Corporation * |
b75a7d8f A |
9 | * and others. All rights reserved. * |
10 | *************************************************************************** | |
11 | */ | |
12 | // | |
13 | // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules | |
14 | // (part of the rule building process.) | |
15 | // | |
16 | // Starting with the rules parse tree from the scanner, | |
17 | // | |
18 | // - Enumerate the set of UnicodeSets that are referenced | |
19 | // by the RBBI rules. | |
20 | // - compute a set of non-overlapping character ranges | |
21 | // with all characters within a range belonging to the same | |
22 | // set of input uniocde sets. | |
23 | // - Derive a set of non-overlapping UnicodeSet (like things) | |
24 | // that will correspond to columns in the state table for | |
25 | // the RBBI execution engine. All characters within one | |
26 | // of these sets belong to the same set of the original | |
27 | // UnicodeSets from the user's rules. | |
28 | // - construct the trie table that maps input characters | |
29 | // to the index of the matching non-overlapping set of set from | |
30 | // the previous step. | |
31 | // | |
32 | ||
33 | #include "unicode/utypes.h" | |
34 | ||
35 | #if !UCONFIG_NO_BREAK_ITERATION | |
36 | ||
37 | #include "unicode/uniset.h" | |
0f5d89e8 | 38 | #include "utrie2.h" |
b75a7d8f A |
39 | #include "uvector.h" |
40 | #include "uassert.h" | |
41 | #include "cmemory.h" | |
42 | #include "cstring.h" | |
43 | ||
44 | #include "rbbisetb.h" | |
45 | #include "rbbinode.h" | |
46 | ||
b75a7d8f A |
47 | U_NAMESPACE_BEGIN |
48 | ||
49 | //------------------------------------------------------------------------ | |
50 | // | |
51 | // Constructor | |
52 | // | |
53 | //------------------------------------------------------------------------ | |
54 | RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) | |
55 | { | |
56 | fRB = rb; | |
57 | fStatus = rb->fStatus; | |
58 | fRangeList = 0; | |
59 | fTrie = 0; | |
60 | fTrieSize = 0; | |
61 | fGroupCount = 0; | |
73c04bcf | 62 | fSawBOF = FALSE; |
b75a7d8f A |
63 | } |
64 | ||
65 | ||
66 | //------------------------------------------------------------------------ | |
67 | // | |
68 | // Destructor | |
69 | // | |
70 | //------------------------------------------------------------------------ | |
71 | RBBISetBuilder::~RBBISetBuilder() | |
72 | { | |
73 | RangeDescriptor *nextRangeDesc; | |
74 | ||
75 | // Walk through & delete the linked list of RangeDescriptors | |
76 | for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { | |
77 | RangeDescriptor *r = nextRangeDesc; | |
78 | nextRangeDesc = r->fNext; | |
79 | delete r; | |
80 | } | |
81 | ||
0f5d89e8 | 82 | utrie2_close(fTrie); |
b75a7d8f A |
83 | } |
84 | ||
85 | ||
86 | ||
87 | ||
88 | //------------------------------------------------------------------------ | |
89 | // | |
90 | // build Build the list of non-overlapping character ranges | |
91 | // from the Unicode Sets. | |
92 | // | |
93 | //------------------------------------------------------------------------ | |
0f5d89e8 | 94 | void RBBISetBuilder::buildRanges() { |
b75a7d8f A |
95 | RBBINode *usetNode; |
96 | RangeDescriptor *rlRange; | |
97 | ||
98 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} | |
99 | ||
100 | // | |
101 | // Initialize the process by creating a single range encompassing all characters | |
102 | // that is in no sets. | |
103 | // | |
374ca955 | 104 | fRangeList = new RangeDescriptor(*fStatus); // will check for status here |
46f4442e A |
105 | if (fRangeList == NULL) { |
106 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
107 | return; | |
108 | } | |
b75a7d8f A |
109 | fRangeList->fStartChar = 0; |
110 | fRangeList->fEndChar = 0x10ffff; | |
111 | ||
374ca955 A |
112 | if (U_FAILURE(*fStatus)) { |
113 | return; | |
114 | } | |
b75a7d8f A |
115 | |
116 | // | |
117 | // Find the set of non-overlapping ranges of characters | |
118 | // | |
119 | int ni; | |
73c04bcf | 120 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules |
b75a7d8f A |
121 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); |
122 | if (usetNode==NULL) { | |
123 | break; | |
124 | } | |
125 | ||
126 | UnicodeSet *inputSet = usetNode->fInputSet; | |
127 | int32_t inputSetRangeCount = inputSet->getRangeCount(); | |
128 | int inputSetRangeIndex = 0; | |
129 | rlRange = fRangeList; | |
130 | ||
131 | for (;;) { | |
132 | if (inputSetRangeIndex >= inputSetRangeCount) { | |
133 | break; | |
134 | } | |
135 | UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); | |
136 | UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); | |
137 | ||
138 | // skip over ranges from the range list that are completely | |
139 | // below the current range from the input unicode set. | |
140 | while (rlRange->fEndChar < inputSetRangeBegin) { | |
141 | rlRange = rlRange->fNext; | |
142 | } | |
143 | ||
144 | // If the start of the range from the range list is before with | |
145 | // the start of the range from the unicode set, split the range list range | |
146 | // in two, with one part being before (wholly outside of) the unicode set | |
147 | // and the other containing the rest. | |
148 | // Then continue the loop; the post-split current range will then be skipped | |
149 | // over | |
150 | if (rlRange->fStartChar < inputSetRangeBegin) { | |
151 | rlRange->split(inputSetRangeBegin, *fStatus); | |
374ca955 A |
152 | if (U_FAILURE(*fStatus)) { |
153 | return; | |
154 | } | |
b75a7d8f A |
155 | continue; |
156 | } | |
157 | ||
158 | // Same thing at the end of the ranges... | |
159 | // If the end of the range from the range list doesn't coincide with | |
160 | // the end of the range from the unicode set, split the range list | |
161 | // range in two. The first part of the split range will be | |
162 | // wholly inside the Unicode set. | |
163 | if (rlRange->fEndChar > inputSetRangeEnd) { | |
164 | rlRange->split(inputSetRangeEnd+1, *fStatus); | |
374ca955 A |
165 | if (U_FAILURE(*fStatus)) { |
166 | return; | |
167 | } | |
b75a7d8f A |
168 | } |
169 | ||
170 | // The current rlRange is now entirely within the UnicodeSet range. | |
171 | // Add this unicode set to the list of sets for this rlRange | |
172 | if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { | |
173 | rlRange->fIncludesSets->addElement(usetNode, *fStatus); | |
374ca955 A |
174 | if (U_FAILURE(*fStatus)) { |
175 | return; | |
176 | } | |
b75a7d8f A |
177 | } |
178 | ||
179 | // Advance over ranges that we are finished with. | |
180 | if (inputSetRangeEnd == rlRange->fEndChar) { | |
181 | inputSetRangeIndex++; | |
182 | } | |
183 | rlRange = rlRange->fNext; | |
184 | } | |
185 | } | |
186 | ||
187 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} | |
188 | ||
189 | // | |
190 | // Group the above ranges, with each group consisting of one or more | |
191 | // ranges that are in exactly the same set of original UnicodeSets. | |
192 | // The groups are numbered, and these group numbers are the set of | |
193 | // input symbols recognized by the run-time state machine. | |
194 | // | |
73c04bcf A |
195 | // Numbering: # 0 (state table column 0) is unused. |
196 | // # 1 is reserved - table column 1 is for end-of-input | |
197 | // # 2 is reserved - table column 2 is for beginning-in-input | |
198 | // # 3 is the first range list. | |
199 | // | |
b75a7d8f A |
200 | RangeDescriptor *rlSearchRange; |
201 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
202 | for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { | |
203 | if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { | |
204 | rlRange->fNum = rlSearchRange->fNum; | |
205 | break; | |
206 | } | |
207 | } | |
208 | if (rlRange->fNum == 0) { | |
209 | fGroupCount ++; | |
73c04bcf | 210 | rlRange->fNum = fGroupCount+2; |
b75a7d8f | 211 | rlRange->setDictionaryFlag(); |
73c04bcf A |
212 | addValToSets(rlRange->fIncludesSets, fGroupCount+2); |
213 | } | |
214 | } | |
215 | ||
216 | // Handle input sets that contain the special string {eof}. | |
217 | // Column 1 of the state table is reserved for EOF on input. | |
218 | // Column 2 is reserved for before-the-start-input. | |
219 | // (This column can be optimized away later if there are no rule | |
220 | // references to {bof}.) | |
221 | // Add this column value (1 or 2) to the equivalent expression | |
222 | // subtree for each UnicodeSet that contains the string {eof} | |
223 | // Because {bof} and {eof} are not a characters in the normal sense, | |
224 | // they doesn't affect the computation of ranges or TRIE. | |
225 | static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; | |
226 | static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; | |
227 | ||
228 | UnicodeString eofString(eofUString); | |
229 | UnicodeString bofString(bofUString); | |
230 | for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules | |
231 | usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); | |
232 | if (usetNode==NULL) { | |
233 | break; | |
234 | } | |
235 | UnicodeSet *inputSet = usetNode->fInputSet; | |
236 | if (inputSet->contains(eofString)) { | |
237 | addValToSet(usetNode, 1); | |
238 | } | |
239 | if (inputSet->contains(bofString)) { | |
240 | addValToSet(usetNode, 2); | |
241 | fSawBOF = TRUE; | |
b75a7d8f A |
242 | } |
243 | } | |
244 | ||
73c04bcf | 245 | |
b75a7d8f A |
246 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} |
247 | if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} | |
0f5d89e8 | 248 | } |
b75a7d8f | 249 | |
b75a7d8f | 250 | |
0f5d89e8 A |
251 | // |
252 | // Build the Trie table for mapping UChar32 values to the corresponding | |
253 | // range group number. | |
254 | // | |
255 | void RBBISetBuilder::buildTrie() { | |
256 | RangeDescriptor *rlRange; | |
b75a7d8f | 257 | |
0f5d89e8 A |
258 | fTrie = utrie2_open(0, // Initial value for all code points. |
259 | 0, // Error value for out-of-range input. | |
260 | fStatus); | |
261 | ||
262 | for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { | |
263 | utrie2_setRange32(fTrie, | |
264 | rlRange->fStartChar, // Range start | |
265 | rlRange->fEndChar, // Range end (inclusive) | |
266 | rlRange->fNum, // value for range | |
267 | TRUE, // Overwrite previously written values | |
268 | fStatus); | |
b75a7d8f A |
269 | } |
270 | } | |
271 | ||
272 | ||
0f5d89e8 A |
273 | void RBBISetBuilder::mergeCategories(IntPair categories) { |
274 | U_ASSERT(categories.first >= 1); | |
275 | U_ASSERT(categories.second > categories.first); | |
276 | for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { | |
277 | int32_t rangeNum = rd->fNum & ~DICT_BIT; | |
278 | int32_t rangeDict = rd->fNum & DICT_BIT; | |
279 | if (rangeNum == categories.second) { | |
280 | rd->fNum = categories.first | rangeDict; | |
281 | } else if (rangeNum > categories.second) { | |
282 | rd->fNum--; | |
283 | } | |
284 | } | |
285 | --fGroupCount; | |
286 | } | |
287 | ||
b75a7d8f A |
288 | |
289 | //----------------------------------------------------------------------------------- | |
290 | // | |
291 | // getTrieSize() Return the size that will be required to serialize the Trie. | |
292 | // | |
293 | //----------------------------------------------------------------------------------- | |
0f5d89e8 A |
294 | int32_t RBBISetBuilder::getTrieSize() { |
295 | if (U_FAILURE(*fStatus)) { | |
296 | return 0; | |
297 | } | |
298 | utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); | |
299 | fTrieSize = utrie2_serialize(fTrie, | |
300 | NULL, // Buffer | |
301 | 0, // Capacity | |
302 | fStatus); | |
303 | if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { | |
304 | *fStatus = U_ZERO_ERROR; | |
305 | } | |
b75a7d8f A |
306 | // RBBIDebugPrintf("Trie table size is %d\n", trieSize); |
307 | return fTrieSize; | |
308 | } | |
309 | ||
310 | ||
311 | //----------------------------------------------------------------------------------- | |
312 | // | |
313 | // serializeTrie() Put the serialized trie at the specified address. | |
314 | // Trust the caller to have given us enough memory. | |
315 | // getTrieSize() MUST be called first. | |
316 | // | |
317 | //----------------------------------------------------------------------------------- | |
318 | void RBBISetBuilder::serializeTrie(uint8_t *where) { | |
0f5d89e8 A |
319 | utrie2_serialize(fTrie, |
320 | where, // Buffer | |
321 | fTrieSize, // Capacity | |
322 | fStatus); | |
b75a7d8f A |
323 | } |
324 | ||
325 | //------------------------------------------------------------------------ | |
326 | // | |
327 | // addValToSets Add a runtime-mapped input value to each uset from a | |
73c04bcf | 328 | // list of uset nodes. (val corresponds to a state table column.) |
b75a7d8f A |
329 | // For each of the original Unicode sets - which correspond |
330 | // directly to uset nodes - a logically equivalent expression | |
331 | // is constructed in terms of the remapped runtime input | |
332 | // symbol set. This function adds one runtime input symbol to | |
333 | // a list of sets. | |
334 | // | |
335 | // The "logically equivalent expression" is the tree for an | |
336 | // or-ing together of all of the symbols that go into the set. | |
337 | // | |
338 | //------------------------------------------------------------------------ | |
339 | void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { | |
340 | int32_t ix; | |
341 | ||
342 | for (ix=0; ix<sets->size(); ix++) { | |
343 | RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); | |
73c04bcf | 344 | addValToSet(usetNode, val); |
b75a7d8f A |
345 | } |
346 | } | |
347 | ||
73c04bcf A |
348 | void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { |
349 | RBBINode *leafNode = new RBBINode(RBBINode::leafChar); | |
46f4442e A |
350 | if (leafNode == NULL) { |
351 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
352 | return; | |
353 | } | |
73c04bcf A |
354 | leafNode->fVal = (unsigned short)val; |
355 | if (usetNode->fLeftChild == NULL) { | |
356 | usetNode->fLeftChild = leafNode; | |
357 | leafNode->fParent = usetNode; | |
358 | } else { | |
359 | // There are already input symbols present for this set. | |
360 | // Set up an OR node, with the previous stuff as the left child | |
361 | // and the new value as the right child. | |
362 | RBBINode *orNode = new RBBINode(RBBINode::opOr); | |
46f4442e A |
363 | if (orNode == NULL) { |
364 | *fStatus = U_MEMORY_ALLOCATION_ERROR; | |
365 | return; | |
366 | } | |
73c04bcf A |
367 | orNode->fLeftChild = usetNode->fLeftChild; |
368 | orNode->fRightChild = leafNode; | |
369 | orNode->fLeftChild->fParent = orNode; | |
370 | orNode->fRightChild->fParent = orNode; | |
371 | usetNode->fLeftChild = orNode; | |
372 | orNode->fParent = usetNode; | |
373 | } | |
374 | } | |
b75a7d8f A |
375 | |
376 | ||
377 | //------------------------------------------------------------------------ | |
378 | // | |
73c04bcf | 379 | // getNumCharCategories |
b75a7d8f A |
380 | // |
381 | //------------------------------------------------------------------------ | |
374ca955 | 382 | int32_t RBBISetBuilder::getNumCharCategories() const { |
73c04bcf | 383 | return fGroupCount + 3; |
b75a7d8f A |
384 | } |
385 | ||
386 | ||
73c04bcf A |
387 | //------------------------------------------------------------------------ |
388 | // | |
389 | // sawBOF | |
390 | // | |
391 | //------------------------------------------------------------------------ | |
392 | UBool RBBISetBuilder::sawBOF() const { | |
393 | return fSawBOF; | |
394 | } | |
395 | ||
b75a7d8f | 396 | |
374ca955 A |
397 | //------------------------------------------------------------------------ |
398 | // | |
399 | // getFirstChar Given a runtime RBBI character category, find | |
400 | // the first UChar32 that is in the set of chars | |
401 | // in the category. | |
402 | //------------------------------------------------------------------------ | |
403 | UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { | |
404 | RangeDescriptor *rlRange; | |
405 | UChar32 retVal = (UChar32)-1; | |
406 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
407 | if (rlRange->fNum == category) { | |
408 | retVal = rlRange->fStartChar; | |
409 | break; | |
410 | } | |
411 | } | |
412 | return retVal; | |
413 | } | |
414 | ||
415 | ||
416 | ||
b75a7d8f A |
417 | //------------------------------------------------------------------------ |
418 | // | |
419 | // printRanges A debugging function. | |
420 | // dump out all of the range definitions. | |
421 | // | |
422 | //------------------------------------------------------------------------ | |
b75a7d8f | 423 | #ifdef RBBI_DEBUG |
374ca955 | 424 | void RBBISetBuilder::printRanges() { |
b75a7d8f A |
425 | RangeDescriptor *rlRange; |
426 | int i; | |
427 | ||
428 | RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); | |
429 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
430 | RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar); | |
431 | ||
432 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { | |
433 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); | |
374ca955 | 434 | UnicodeString setName = UNICODE_STRING("anon", 4); |
b75a7d8f A |
435 | RBBINode *setRef = usetNode->fParent; |
436 | if (setRef != NULL) { | |
437 | RBBINode *varRef = setRef->fParent; | |
438 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
439 | setName = varRef->fText; | |
440 | } | |
441 | } | |
374ca955 | 442 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
b75a7d8f A |
443 | } |
444 | RBBIDebugPrintf("\n"); | |
445 | } | |
b75a7d8f | 446 | } |
374ca955 | 447 | #endif |
b75a7d8f A |
448 | |
449 | ||
450 | //------------------------------------------------------------------------ | |
451 | // | |
452 | // printRangeGroups A debugging function. | |
453 | // dump out all of the range groups. | |
454 | // | |
455 | //------------------------------------------------------------------------ | |
374ca955 | 456 | #ifdef RBBI_DEBUG |
b75a7d8f A |
457 | void RBBISetBuilder::printRangeGroups() { |
458 | RangeDescriptor *rlRange; | |
459 | RangeDescriptor *tRange; | |
460 | int i; | |
461 | int lastPrintedGroupNum = 0; | |
462 | ||
463 | RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); | |
464 | for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { | |
465 | int groupNum = rlRange->fNum & 0xbfff; | |
466 | if (groupNum > lastPrintedGroupNum) { | |
467 | lastPrintedGroupNum = groupNum; | |
468 | RBBIDebugPrintf("%2i ", groupNum); | |
469 | ||
0f5d89e8 | 470 | if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");} |
b75a7d8f A |
471 | |
472 | for (i=0; i<rlRange->fIncludesSets->size(); i++) { | |
473 | RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); | |
374ca955 | 474 | UnicodeString setName = UNICODE_STRING("anon", 4); |
b75a7d8f A |
475 | RBBINode *setRef = usetNode->fParent; |
476 | if (setRef != NULL) { | |
477 | RBBINode *varRef = setRef->fParent; | |
478 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
479 | setName = varRef->fText; | |
480 | } | |
481 | } | |
374ca955 | 482 | RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); |
b75a7d8f A |
483 | } |
484 | ||
485 | i = 0; | |
486 | for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) { | |
487 | if (tRange->fNum == rlRange->fNum) { | |
488 | if (i++ % 5 == 0) { | |
489 | RBBIDebugPrintf("\n "); | |
490 | } | |
491 | RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); | |
492 | } | |
493 | } | |
494 | RBBIDebugPrintf("\n"); | |
495 | } | |
496 | } | |
497 | RBBIDebugPrintf("\n"); | |
498 | } | |
374ca955 | 499 | #endif |
b75a7d8f A |
500 | |
501 | ||
502 | //------------------------------------------------------------------------ | |
503 | // | |
504 | // printSets A debugging function. | |
505 | // dump out all of the set definitions. | |
506 | // | |
507 | //------------------------------------------------------------------------ | |
b75a7d8f | 508 | #ifdef RBBI_DEBUG |
374ca955 | 509 | void RBBISetBuilder::printSets() { |
b75a7d8f A |
510 | int i; |
511 | ||
512 | RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); | |
513 | for (i=0; ; i++) { | |
46f4442e | 514 | RBBINode *usetNode; |
b75a7d8f A |
515 | RBBINode *setRef; |
516 | RBBINode *varRef; | |
517 | UnicodeString setName; | |
518 | ||
519 | usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); | |
520 | if (usetNode == NULL) { | |
521 | break; | |
522 | } | |
523 | ||
524 | RBBIDebugPrintf("%3d ", i); | |
374ca955 | 525 | setName = UNICODE_STRING("anonymous", 9); |
b75a7d8f A |
526 | setRef = usetNode->fParent; |
527 | if (setRef != NULL) { | |
528 | varRef = setRef->fParent; | |
529 | if (varRef != NULL && varRef->fType == RBBINode::varRef) { | |
530 | setName = varRef->fText; | |
531 | } | |
532 | } | |
374ca955 | 533 | RBBI_DEBUG_printUnicodeString(setName); |
b75a7d8f | 534 | RBBIDebugPrintf(" "); |
374ca955 | 535 | RBBI_DEBUG_printUnicodeString(usetNode->fText); |
b75a7d8f A |
536 | RBBIDebugPrintf("\n"); |
537 | if (usetNode->fLeftChild != NULL) { | |
f3c0d7a5 | 538 | RBBINode::printTree(usetNode->fLeftChild, TRUE); |
b75a7d8f A |
539 | } |
540 | } | |
541 | RBBIDebugPrintf("\n"); | |
b75a7d8f | 542 | } |
374ca955 | 543 | #endif |
b75a7d8f A |
544 | |
545 | ||
546 | ||
547 | //------------------------------------------------------------------------------------- | |
548 | // | |
549 | // RangeDescriptor copy constructor | |
550 | // | |
551 | //------------------------------------------------------------------------------------- | |
552 | ||
553 | RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) { | |
554 | int i; | |
555 | ||
556 | this->fStartChar = other.fStartChar; | |
557 | this->fEndChar = other.fEndChar; | |
558 | this->fNum = other.fNum; | |
559 | this->fNext = NULL; | |
374ca955 | 560 | UErrorCode oldstatus = status; |
b75a7d8f | 561 | this->fIncludesSets = new UVector(status); |
374ca955 A |
562 | if (U_FAILURE(oldstatus)) { |
563 | status = oldstatus; | |
564 | } | |
565 | if (U_FAILURE(status)) { | |
566 | return; | |
567 | } | |
b75a7d8f A |
568 | /* test for NULL */ |
569 | if (this->fIncludesSets == 0) { | |
570 | status = U_MEMORY_ALLOCATION_ERROR; | |
571 | return; | |
572 | } | |
573 | ||
574 | for (i=0; i<other.fIncludesSets->size(); i++) { | |
575 | this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); | |
576 | } | |
577 | } | |
578 | ||
579 | ||
580 | //------------------------------------------------------------------------------------- | |
581 | // | |
582 | // RangeDesriptor default constructor | |
583 | // | |
584 | //------------------------------------------------------------------------------------- | |
585 | RangeDescriptor::RangeDescriptor(UErrorCode &status) { | |
586 | this->fStartChar = 0; | |
587 | this->fEndChar = 0; | |
588 | this->fNum = 0; | |
589 | this->fNext = NULL; | |
374ca955 | 590 | UErrorCode oldstatus = status; |
b75a7d8f | 591 | this->fIncludesSets = new UVector(status); |
374ca955 A |
592 | if (U_FAILURE(oldstatus)) { |
593 | status = oldstatus; | |
594 | } | |
595 | if (U_FAILURE(status)) { | |
596 | return; | |
597 | } | |
b75a7d8f A |
598 | /* test for NULL */ |
599 | if(this->fIncludesSets == 0) { | |
600 | status = U_MEMORY_ALLOCATION_ERROR; | |
601 | return; | |
602 | } | |
603 | ||
604 | } | |
605 | ||
606 | ||
607 | //------------------------------------------------------------------------------------- | |
608 | // | |
609 | // RangeDesriptor Destructor | |
610 | // | |
611 | //------------------------------------------------------------------------------------- | |
612 | RangeDescriptor::~RangeDescriptor() { | |
613 | delete fIncludesSets; | |
614 | fIncludesSets = NULL; | |
615 | } | |
616 | ||
617 | //------------------------------------------------------------------------------------- | |
618 | // | |
619 | // RangeDesriptor::split() | |
620 | // | |
621 | //------------------------------------------------------------------------------------- | |
622 | void RangeDescriptor::split(UChar32 where, UErrorCode &status) { | |
623 | U_ASSERT(where>fStartChar && where<=fEndChar); | |
624 | RangeDescriptor *nr = new RangeDescriptor(*this, status); | |
b75a7d8f A |
625 | if(nr == 0) { |
626 | status = U_MEMORY_ALLOCATION_ERROR; | |
627 | return; | |
628 | } | |
46f4442e A |
629 | if (U_FAILURE(status)) { |
630 | delete nr; | |
631 | return; | |
632 | } | |
b75a7d8f A |
633 | // RangeDescriptor copy constructor copies all fields. |
634 | // Only need to update those that are different after the split. | |
635 | nr->fStartChar = where; | |
636 | this->fEndChar = where-1; | |
637 | nr->fNext = this->fNext; | |
638 | this->fNext = nr; | |
639 | } | |
640 | ||
641 | ||
642 | //------------------------------------------------------------------------------------- | |
643 | // | |
644 | // RangeDescriptor::setDictionaryFlag | |
645 | // | |
646 | // Character Category Numbers that include characters from | |
647 | // the original Unicode Set named "dictionary" have bit 14 | |
648 | // set to 1. The RBBI runtime engine uses this to trigger | |
649 | // use of the word dictionary. | |
650 | // | |
651 | // This function looks through the Unicode Sets that it | |
652 | // (the range) includes, and sets the bit in fNum when | |
653 | // "dictionary" is among them. | |
654 | // | |
655 | // TODO: a faster way would be to find the set node for | |
656 | // "dictionary" just once, rather than looking it | |
657 | // up by name every time. | |
658 | // | |
659 | //------------------------------------------------------------------------------------- | |
660 | void RangeDescriptor::setDictionaryFlag() { | |
661 | int i; | |
662 | ||
0f5d89e8 A |
663 | static const char16_t *dictionary = u"dictionary"; |
664 | for (i=0; i<fIncludesSets->size(); i++) { | |
665 | RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); | |
666 | RBBINode *setRef = usetNode->fParent; | |
667 | if (setRef != nullptr) { | |
b75a7d8f | 668 | RBBINode *varRef = setRef->fParent; |
0f5d89e8 A |
669 | if (varRef && varRef->fType == RBBINode::varRef) { |
670 | const UnicodeString *setName = &varRef->fText; | |
671 | if (setName->compare(dictionary, -1) == 0) { | |
672 | fNum |= RBBISetBuilder::DICT_BIT; | |
673 | break; | |
674 | } | |
b75a7d8f A |
675 | } |
676 | } | |
b75a7d8f A |
677 | } |
678 | } | |
679 | ||
680 | ||
681 | ||
682 | U_NAMESPACE_END | |
683 | ||
684 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |