]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ****************************************************************************** | |
2ca993e8 | 3 | * Copyright (C) 2001-2016, International Business Machines |
b75a7d8f A |
4 | * Corporation and others. All Rights Reserved. |
5 | ****************************************************************************** | |
6 | * | |
7 | * File ucoleitr.cpp | |
8 | * | |
9 | * Modification History: | |
10 | * | |
11 | * Date Name Description | |
12 | * 02/15/2001 synwee Modified all methods to process its own function | |
13 | * instead of calling the equivalent c++ api (coleitr.h) | |
57a6839d | 14 | * 2012-2014 markus Rewritten in C++ again. |
b75a7d8f A |
15 | ******************************************************************************/ |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_COLLATION | |
20 | ||
57a6839d A |
21 | #include "unicode/coleitr.h" |
22 | #include "unicode/tblcoll.h" | |
b75a7d8f A |
23 | #include "unicode/ucoleitr.h" |
24 | #include "unicode/ustring.h" | |
25 | #include "unicode/sortkey.h" | |
46f4442e | 26 | #include "unicode/uobject.h" |
b75a7d8f | 27 | #include "cmemory.h" |
57a6839d A |
28 | #include "usrchimp.h" |
29 | ||
b75a7d8f A |
30 | U_NAMESPACE_USE |
31 | ||
// General-purpose buffer length constant.
#define BUFFER_LENGTH 100

// Entry count of the inline (non-heap) array embedded in RCEBuffer.
#define DEFAULT_BUFFER_SIZE 16

// Number of entries added whenever RCEBuffer::put / PCEBuffer::put must grow.
#define BUFFER_GROW 8

// Copies `count` elements from `src` to `dst`; the element size is taken from src[0].
#define ARRAY_COPY(dst, src, count) \
    uprv_memcpy((void *)(dst), (void *)(src), (size_t)(count) * sizeof((src)[0]))

// Allocates uninitialized storage for `count` objects of `type` via uprv_malloc.
#define NEW_ARRAY(type, count) \
    (type *)uprv_malloc((size_t)(count) * sizeof(type))

// Releases storage previously obtained with NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *)(array))
42 | ||
46f4442e A |
43 | struct RCEI |
44 | { | |
45 | uint32_t ce; | |
46 | int32_t low; | |
47 | int32_t high; | |
48 | }; | |
49 | ||
50 | U_NAMESPACE_BEGIN | |
51 | ||
52 | struct RCEBuffer | |
53 | { | |
54 | RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; | |
55 | RCEI *buffer; | |
56 | int32_t bufferIndex; | |
57 | int32_t bufferSize; | |
58 | ||
59 | RCEBuffer(); | |
60 | ~RCEBuffer(); | |
61 | ||
2ca993e8 A |
62 | UBool isEmpty() const; |
63 | void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); | |
46f4442e A |
64 | const RCEI *get(); |
65 | }; | |
66 | ||
67 | RCEBuffer::RCEBuffer() | |
68 | { | |
69 | buffer = defaultBuffer; | |
70 | bufferIndex = 0; | |
b331163b | 71 | bufferSize = UPRV_LENGTHOF(defaultBuffer); |
46f4442e A |
72 | } |
73 | ||
74 | RCEBuffer::~RCEBuffer() | |
75 | { | |
76 | if (buffer != defaultBuffer) { | |
77 | DELETE_ARRAY(buffer); | |
78 | } | |
79 | } | |
80 | ||
2ca993e8 | 81 | UBool RCEBuffer::isEmpty() const |
46f4442e A |
82 | { |
83 | return bufferIndex <= 0; | |
84 | } | |
85 | ||
2ca993e8 | 86 | void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) |
46f4442e | 87 | { |
2ca993e8 A |
88 | if (U_FAILURE(errorCode)) { |
89 | return; | |
90 | } | |
46f4442e A |
91 | if (bufferIndex >= bufferSize) { |
92 | RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); | |
2ca993e8 A |
93 | if (newBuffer == NULL) { |
94 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
95 | return; | |
96 | } | |
46f4442e A |
97 | |
98 | ARRAY_COPY(newBuffer, buffer, bufferSize); | |
99 | ||
100 | if (buffer != defaultBuffer) { | |
101 | DELETE_ARRAY(buffer); | |
102 | } | |
103 | ||
104 | buffer = newBuffer; | |
105 | bufferSize += BUFFER_GROW; | |
106 | } | |
107 | ||
108 | buffer[bufferIndex].ce = ce; | |
109 | buffer[bufferIndex].low = ixLow; | |
110 | buffer[bufferIndex].high = ixHigh; | |
111 | ||
112 | bufferIndex += 1; | |
113 | } | |
114 | ||
115 | const RCEI *RCEBuffer::get() | |
116 | { | |
117 | if (bufferIndex > 0) { | |
118 | return &buffer[--bufferIndex]; | |
119 | } | |
120 | ||
121 | return NULL; | |
122 | } | |
123 | ||
46f4442e A |
124 | PCEBuffer::PCEBuffer() |
125 | { | |
126 | buffer = defaultBuffer; | |
127 | bufferIndex = 0; | |
b331163b | 128 | bufferSize = UPRV_LENGTHOF(defaultBuffer); |
46f4442e A |
129 | } |
130 | ||
131 | PCEBuffer::~PCEBuffer() | |
132 | { | |
133 | if (buffer != defaultBuffer) { | |
134 | DELETE_ARRAY(buffer); | |
135 | } | |
136 | } | |
137 | ||
138 | void PCEBuffer::reset() | |
139 | { | |
140 | bufferIndex = 0; | |
141 | } | |
142 | ||
2ca993e8 | 143 | UBool PCEBuffer::isEmpty() const |
46f4442e A |
144 | { |
145 | return bufferIndex <= 0; | |
146 | } | |
147 | ||
2ca993e8 | 148 | void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) |
46f4442e | 149 | { |
2ca993e8 A |
150 | if (U_FAILURE(errorCode)) { |
151 | return; | |
152 | } | |
46f4442e A |
153 | if (bufferIndex >= bufferSize) { |
154 | PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); | |
2ca993e8 A |
155 | if (newBuffer == NULL) { |
156 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
157 | return; | |
158 | } | |
46f4442e A |
159 | |
160 | ARRAY_COPY(newBuffer, buffer, bufferSize); | |
161 | ||
162 | if (buffer != defaultBuffer) { | |
163 | DELETE_ARRAY(buffer); | |
164 | } | |
165 | ||
166 | buffer = newBuffer; | |
167 | bufferSize += BUFFER_GROW; | |
168 | } | |
169 | ||
170 | buffer[bufferIndex].ce = ce; | |
171 | buffer[bufferIndex].low = ixLow; | |
172 | buffer[bufferIndex].high = ixHigh; | |
173 | ||
174 | bufferIndex += 1; | |
175 | } | |
176 | ||
177 | const PCEI *PCEBuffer::get() | |
178 | { | |
179 | if (bufferIndex > 0) { | |
180 | return &buffer[--bufferIndex]; | |
181 | } | |
182 | ||
183 | return NULL; | |
184 | } | |
185 | ||
57a6839d | 186 | UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } |
46f4442e | 187 | |
57a6839d | 188 | UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } |
46f4442e | 189 | |
57a6839d A |
190 | void UCollationPCE::init(UCollationElements *elems) { |
191 | init(CollationElementIterator::fromUCollationElements(elems)); | |
192 | } | |
46f4442e | 193 | |
57a6839d | 194 | void UCollationPCE::init(CollationElementIterator *iter) |
46f4442e | 195 | { |
57a6839d A |
196 | cei = iter; |
197 | init(*iter->rbc_); | |
46f4442e A |
198 | } |
199 | ||
57a6839d | 200 | void UCollationPCE::init(const Collator &coll) |
46f4442e A |
201 | { |
202 | UErrorCode status = U_ZERO_ERROR; | |
203 | ||
57a6839d A |
204 | strength = coll.getAttribute(UCOL_STRENGTH, status); |
205 | toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; | |
46f4442e | 206 | isShifted = FALSE; |
57a6839d | 207 | variableTop = coll.getVariableTop(status); |
46f4442e A |
208 | } |
209 | ||
210 | UCollationPCE::~UCollationPCE() | |
211 | { | |
212 | // nothing to do | |
213 | } | |
214 | ||
57a6839d | 215 | uint64_t UCollationPCE::processCE(uint32_t ce) |
46f4442e A |
216 | { |
217 | uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; | |
218 | ||
219 | // This is clean, but somewhat slow... | |
220 | // We could apply the mask to ce and then | |
221 | // just get all three orders... | |
57a6839d | 222 | switch(strength) { |
46f4442e A |
223 | default: |
224 | tertiary = ucol_tertiaryOrder(ce); | |
2ca993e8 | 225 | U_FALLTHROUGH; |
46f4442e A |
226 | |
227 | case UCOL_SECONDARY: | |
228 | secondary = ucol_secondaryOrder(ce); | |
2ca993e8 | 229 | U_FALLTHROUGH; |
46f4442e A |
230 | |
231 | case UCOL_PRIMARY: | |
232 | primary = ucol_primaryOrder(ce); | |
233 | } | |
234 | ||
729e4ab9 A |
235 | // **** This should probably handle continuations too. **** |
236 | // **** That means that we need 24 bits for the primary **** | |
237 | // **** instead of the 16 that we're currently using. **** | |
238 | // **** So we can lay out the 64 bits as: 24.12.12.16. **** | |
239 | // **** Another complication with continuations is that **** | |
240 | // **** the *second* CE is marked as a continuation, so **** | |
241 | // **** we always have to peek ahead to know how long **** | |
242 | // **** the primary is... **** | |
57a6839d A |
243 | if ((toShift && variableTop > ce && primary != 0) |
244 | || (isShifted && primary == 0)) { | |
46f4442e A |
245 | |
246 | if (primary == 0) { | |
247 | return UCOL_IGNORABLE; | |
248 | } | |
249 | ||
57a6839d | 250 | if (strength >= UCOL_QUATERNARY) { |
46f4442e A |
251 | quaternary = primary; |
252 | } | |
253 | ||
254 | primary = secondary = tertiary = 0; | |
57a6839d | 255 | isShifted = TRUE; |
46f4442e | 256 | } else { |
57a6839d | 257 | if (strength >= UCOL_QUATERNARY) { |
46f4442e A |
258 | quaternary = 0xFFFF; |
259 | } | |
260 | ||
57a6839d | 261 | isShifted = FALSE; |
46f4442e A |
262 | } |
263 | ||
46f4442e A |
264 | return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; |
265 | } | |
266 | ||
57a6839d | 267 | U_NAMESPACE_END |
46f4442e | 268 | |
b75a7d8f A |
269 | /* public methods ---------------------------------------------------- */ |
270 | ||
b75a7d8f A |
271 | U_CAPI UCollationElements* U_EXPORT2 |
272 | ucol_openElements(const UCollator *coll, | |
273 | const UChar *text, | |
274 | int32_t textLength, | |
275 | UErrorCode *status) | |
276 | { | |
46f4442e A |
277 | if (U_FAILURE(*status)) { |
278 | return NULL; | |
279 | } | |
57a6839d A |
280 | if (coll == NULL || (text == NULL && textLength != 0)) { |
281 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
282 | return NULL; | |
283 | } | |
284 | const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); | |
285 | if (rbc == NULL) { | |
286 | *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator | |
46f4442e A |
287 | return NULL; |
288 | } | |
729e4ab9 | 289 | |
57a6839d A |
290 | UnicodeString s((UBool)(textLength < 0), text, textLength); |
291 | CollationElementIterator *cei = rbc->createCollationElementIterator(s); | |
292 | if (cei == NULL) { | |
293 | *status = U_MEMORY_ALLOCATION_ERROR; | |
294 | return NULL; | |
46f4442e | 295 | } |
b75a7d8f | 296 | |
57a6839d | 297 | return cei->toUCollationElements(); |
b75a7d8f A |
298 | } |
299 | ||
729e4ab9 | 300 | |
b75a7d8f A |
301 | U_CAPI void U_EXPORT2 |
302 | ucol_closeElements(UCollationElements *elems) | |
303 | { | |
57a6839d | 304 | delete CollationElementIterator::fromUCollationElements(elems); |
b75a7d8f A |
305 | } |
306 | ||
307 | U_CAPI void U_EXPORT2 | |
308 | ucol_reset(UCollationElements *elems) | |
309 | { | |
57a6839d | 310 | CollationElementIterator::fromUCollationElements(elems)->reset(); |
729e4ab9 A |
311 | } |
312 | ||
b75a7d8f A |
313 | U_CAPI int32_t U_EXPORT2 |
314 | ucol_next(UCollationElements *elems, | |
315 | UErrorCode *status) | |
316 | { | |
46f4442e A |
317 | if (U_FAILURE(*status)) { |
318 | return UCOL_NULLORDER; | |
319 | } | |
b75a7d8f | 320 | |
57a6839d | 321 | return CollationElementIterator::fromUCollationElements(elems)->next(*status); |
46f4442e A |
322 | } |
323 | ||
57a6839d | 324 | // temporarily restore the following removed internal function which is used by Spotlight |
46f4442e A |
325 | U_CAPI int64_t U_EXPORT2 |
326 | ucol_nextProcessed(UCollationElements *elems, | |
327 | int32_t *ixLow, | |
328 | int32_t *ixHigh, | |
329 | UErrorCode *status) | |
330 | { | |
57a6839d A |
331 | return (UCollationPCE::UCollationPCE(elems)).nextProcessed(ixLow, ixHigh, status); |
332 | } | |
333 | ||
334 | ||
335 | U_NAMESPACE_BEGIN | |
336 | ||
337 | int64_t | |
338 | UCollationPCE::nextProcessed( | |
339 | int32_t *ixLow, | |
340 | int32_t *ixHigh, | |
341 | UErrorCode *status) | |
342 | { | |
46f4442e A |
343 | int64_t result = UCOL_IGNORABLE; |
344 | uint32_t low = 0, high = 0; | |
345 | ||
346 | if (U_FAILURE(*status)) { | |
347 | return UCOL_PROCESSED_NULLORDER; | |
348 | } | |
349 | ||
57a6839d | 350 | pceBuffer.reset(); |
46f4442e A |
351 | |
352 | do { | |
57a6839d A |
353 | low = cei->getOffset(); |
354 | int32_t ce = cei->next(*status); | |
355 | high = cei->getOffset(); | |
46f4442e | 356 | |
57a6839d | 357 | if (ce == UCOL_NULLORDER) { |
46f4442e A |
358 | result = UCOL_PROCESSED_NULLORDER; |
359 | break; | |
360 | } | |
361 | ||
57a6839d | 362 | result = processCE((uint32_t)ce); |
46f4442e A |
363 | } while (result == UCOL_IGNORABLE); |
364 | ||
365 | if (ixLow != NULL) { | |
366 | *ixLow = low; | |
367 | } | |
368 | ||
369 | if (ixHigh != NULL) { | |
370 | *ixHigh = high; | |
371 | } | |
372 | ||
373 | return result; | |
b75a7d8f A |
374 | } |
375 | ||
57a6839d A |
376 | U_NAMESPACE_END |
377 | ||
b75a7d8f A |
378 | U_CAPI int32_t U_EXPORT2 |
379 | ucol_previous(UCollationElements *elems, | |
380 | UErrorCode *status) | |
381 | { | |
46f4442e A |
382 | if(U_FAILURE(*status)) { |
383 | return UCOL_NULLORDER; | |
384 | } | |
57a6839d | 385 | return CollationElementIterator::fromUCollationElements(elems)->previous(*status); |
46f4442e A |
386 | } |
387 | ||
57a6839d | 388 | // temporarily restore the following removed internal function which is used by Spotlight |
46f4442e A |
389 | U_CAPI int64_t U_EXPORT2 |
390 | ucol_previousProcessed(UCollationElements *elems, | |
391 | int32_t *ixLow, | |
392 | int32_t *ixHigh, | |
393 | UErrorCode *status) | |
394 | { | |
57a6839d A |
395 | return (UCollationPCE::UCollationPCE(elems)).previousProcessed(ixLow, ixHigh, status); |
396 | } | |
397 | ||
398 | U_NAMESPACE_BEGIN | |
399 | ||
400 | int64_t | |
401 | UCollationPCE::previousProcessed( | |
402 | int32_t *ixLow, | |
403 | int32_t *ixHigh, | |
404 | UErrorCode *status) | |
405 | { | |
46f4442e | 406 | int64_t result = UCOL_IGNORABLE; |
46f4442e A |
407 | int32_t low = 0, high = 0; |
408 | ||
409 | if (U_FAILURE(*status)) { | |
410 | return UCOL_PROCESSED_NULLORDER; | |
411 | } | |
b75a7d8f | 412 | |
57a6839d | 413 | // pceBuffer.reset(); |
b75a7d8f | 414 | |
2ca993e8 | 415 | while (pceBuffer.isEmpty()) { |
46f4442e A |
416 | // buffer raw CEs up to non-ignorable primary |
417 | RCEBuffer rceb; | |
57a6839d | 418 | int32_t ce; |
46f4442e A |
419 | |
420 | // **** do we need to reset rceb, or will it always be empty at this point **** | |
421 | do { | |
57a6839d A |
422 | high = cei->getOffset(); |
423 | ce = cei->previous(*status); | |
424 | low = cei->getOffset(); | |
46f4442e | 425 | |
57a6839d | 426 | if (ce == UCOL_NULLORDER) { |
2ca993e8 | 427 | if (!rceb.isEmpty()) { |
46f4442e A |
428 | break; |
429 | } | |
430 | ||
431 | goto finish; | |
432 | } | |
433 | ||
2ca993e8 A |
434 | rceb.put((uint32_t)ce, low, high, *status); |
435 | } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce))); | |
46f4442e A |
436 | |
437 | // process the raw CEs | |
2ca993e8 | 438 | while (U_SUCCESS(*status) && !rceb.isEmpty()) { |
46f4442e A |
439 | const RCEI *rcei = rceb.get(); |
440 | ||
57a6839d | 441 | result = processCE(rcei->ce); |
46f4442e A |
442 | |
443 | if (result != UCOL_IGNORABLE) { | |
2ca993e8 | 444 | pceBuffer.put(result, rcei->low, rcei->high, *status); |
46f4442e A |
445 | } |
446 | } | |
2ca993e8 A |
447 | if (U_FAILURE(*status)) { |
448 | return UCOL_PROCESSED_NULLORDER; | |
449 | } | |
46f4442e | 450 | } |
b75a7d8f | 451 | |
46f4442e | 452 | finish: |
2ca993e8 | 453 | if (pceBuffer.isEmpty()) { |
46f4442e A |
454 | // **** Is -1 the right value for ixLow, ixHigh? **** |
455 | if (ixLow != NULL) { | |
456 | *ixLow = -1; | |
457 | } | |
458 | ||
459 | if (ixHigh != NULL) { | |
460 | *ixHigh = -1 | |
461 | ; | |
462 | } | |
463 | return UCOL_PROCESSED_NULLORDER; | |
b75a7d8f A |
464 | } |
465 | ||
57a6839d | 466 | const PCEI *pcei = pceBuffer.get(); |
46f4442e A |
467 | |
468 | if (ixLow != NULL) { | |
469 | *ixLow = pcei->low; | |
470 | } | |
471 | ||
472 | if (ixHigh != NULL) { | |
473 | *ixHigh = pcei->high; | |
474 | } | |
475 | ||
476 | return pcei->ce; | |
b75a7d8f A |
477 | } |
478 | ||
57a6839d A |
479 | U_NAMESPACE_END |
480 | ||
b75a7d8f A |
481 | U_CAPI int32_t U_EXPORT2 |
482 | ucol_getMaxExpansion(const UCollationElements *elems, | |
483 | int32_t order) | |
484 | { | |
57a6839d | 485 | return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); |
46f4442e | 486 | |
57a6839d A |
487 | // TODO: The old code masked the order according to strength and then did a binary search. |
488 | // However this was probably at least partially broken because of the following comment. | |
489 | // Still, it might have found a match when this version may not. | |
46f4442e A |
490 | |
491 | // FIXME: with a masked search, there might be more than one hit, | |
492 | // so we need to look forward and backward from the match to find all | |
493 | // of the hits... | |
b75a7d8f | 494 | } |
57a6839d | 495 | |
b75a7d8f A |
496 | U_CAPI void U_EXPORT2 |
497 | ucol_setText( UCollationElements *elems, | |
498 | const UChar *text, | |
499 | int32_t textLength, | |
500 | UErrorCode *status) | |
501 | { | |
46f4442e A |
502 | if (U_FAILURE(*status)) { |
503 | return; | |
504 | } | |
b75a7d8f | 505 | |
57a6839d A |
506 | if ((text == NULL && textLength != 0)) { |
507 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
508 | return; | |
729e4ab9 | 509 | } |
57a6839d A |
510 | UnicodeString s((UBool)(textLength < 0), text, textLength); |
511 | return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); | |
b75a7d8f A |
512 | } |
513 | ||
514 | U_CAPI int32_t U_EXPORT2 | |
515 | ucol_getOffset(const UCollationElements *elems) | |
516 | { | |
57a6839d | 517 | return CollationElementIterator::fromUCollationElements(elems)->getOffset(); |
b75a7d8f A |
518 | } |
519 | ||
520 | U_CAPI void U_EXPORT2 | |
521 | ucol_setOffset(UCollationElements *elems, | |
522 | int32_t offset, | |
523 | UErrorCode *status) | |
524 | { | |
46f4442e A |
525 | if (U_FAILURE(*status)) { |
526 | return; | |
527 | } | |
b75a7d8f | 528 | |
57a6839d | 529 | CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); |
b75a7d8f A |
530 | } |
531 | ||
532 | U_CAPI int32_t U_EXPORT2 | |
533 | ucol_primaryOrder (int32_t order) | |
534 | { | |
57a6839d | 535 | return (order >> 16) & 0xffff; |
b75a7d8f A |
536 | } |
537 | ||
538 | U_CAPI int32_t U_EXPORT2 | |
539 | ucol_secondaryOrder (int32_t order) | |
540 | { | |
57a6839d | 541 | return (order >> 8) & 0xff; |
b75a7d8f A |
542 | } |
543 | ||
544 | U_CAPI int32_t U_EXPORT2 | |
545 | ucol_tertiaryOrder (int32_t order) | |
546 | { | |
57a6839d | 547 | return order & 0xff; |
729e4ab9 A |
548 | } |
549 | ||
b75a7d8f | 550 | #endif /* #if !UCONFIG_NO_COLLATION */ |