]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ****************************************************************************** | |
2ca993e8 | 5 | * Copyright (C) 2001-2016, International Business Machines |
b75a7d8f A |
6 | * Corporation and others. All Rights Reserved. |
7 | ****************************************************************************** | |
8 | * | |
9 | * File ucoleitr.cpp | |
10 | * | |
11 | * Modification History: | |
12 | * | |
13 | * Date Name Description | |
14 | * 02/15/2001 synwee Modified all methods to process its own function | |
15 | * instead of calling the equivalent c++ api (coleitr.h) | |
57a6839d | 16 | * 2012-2014 markus Rewritten in C++ again. |
b75a7d8f A |
17 | ******************************************************************************/ |
18 | ||
19 | #include "unicode/utypes.h" | |
20 | ||
21 | #if !UCONFIG_NO_COLLATION | |
22 | ||
57a6839d A |
23 | #include "unicode/coleitr.h" |
24 | #include "unicode/tblcoll.h" | |
b75a7d8f A |
25 | #include "unicode/ucoleitr.h" |
26 | #include "unicode/ustring.h" | |
27 | #include "unicode/sortkey.h" | |
46f4442e | 28 | #include "unicode/uobject.h" |
b75a7d8f | 29 | #include "cmemory.h" |
57a6839d A |
30 | #include "usrchimp.h" |
31 | ||
b75a7d8f A |
32 | U_NAMESPACE_USE |
33 | ||
// Sizing constants for the CE buffers defined in this file.
#define BUFFER_LENGTH 100

// Number of entries held inline before a buffer moves to the heap.
#define DEFAULT_BUFFER_SIZE 16
// Number of entries added each time a full buffer is enlarged.
#define BUFFER_GROW 8

// Copy `count` elements from src to dst; the element size is taken from src.
#define ARRAY_COPY(dst, src, count) \
    uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof((src)[0]))

// Allocate uninitialized storage for `count` objects of `type` via uprv_malloc.
#define NEW_ARRAY(type, count) \
    ((type *) uprv_malloc((size_t)(count) * sizeof(type)))

// Release storage obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
44 | ||
46f4442e A |
/**
 * One raw collation element together with the two text offsets that were
 * recorded when it was read.
 */
struct RCEI
{
    uint32_t ce;    // raw collation element value
    int32_t  low;   // lower text offset stored with ce
    int32_t  high;  // upper text offset stored with ce
};
51 | ||
52 | U_NAMESPACE_BEGIN | |
53 | ||
54 | struct RCEBuffer | |
55 | { | |
56 | RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; | |
57 | RCEI *buffer; | |
58 | int32_t bufferIndex; | |
59 | int32_t bufferSize; | |
60 | ||
61 | RCEBuffer(); | |
62 | ~RCEBuffer(); | |
63 | ||
2ca993e8 A |
64 | UBool isEmpty() const; |
65 | void put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode); | |
46f4442e A |
66 | const RCEI *get(); |
67 | }; | |
68 | ||
69 | RCEBuffer::RCEBuffer() | |
70 | { | |
71 | buffer = defaultBuffer; | |
72 | bufferIndex = 0; | |
b331163b | 73 | bufferSize = UPRV_LENGTHOF(defaultBuffer); |
46f4442e A |
74 | } |
75 | ||
76 | RCEBuffer::~RCEBuffer() | |
77 | { | |
78 | if (buffer != defaultBuffer) { | |
79 | DELETE_ARRAY(buffer); | |
80 | } | |
81 | } | |
82 | ||
2ca993e8 | 83 | UBool RCEBuffer::isEmpty() const |
46f4442e A |
84 | { |
85 | return bufferIndex <= 0; | |
86 | } | |
87 | ||
2ca993e8 | 88 | void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) |
46f4442e | 89 | { |
2ca993e8 A |
90 | if (U_FAILURE(errorCode)) { |
91 | return; | |
92 | } | |
46f4442e A |
93 | if (bufferIndex >= bufferSize) { |
94 | RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); | |
2ca993e8 A |
95 | if (newBuffer == NULL) { |
96 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
97 | return; | |
98 | } | |
46f4442e A |
99 | |
100 | ARRAY_COPY(newBuffer, buffer, bufferSize); | |
101 | ||
102 | if (buffer != defaultBuffer) { | |
103 | DELETE_ARRAY(buffer); | |
104 | } | |
105 | ||
106 | buffer = newBuffer; | |
107 | bufferSize += BUFFER_GROW; | |
108 | } | |
109 | ||
110 | buffer[bufferIndex].ce = ce; | |
111 | buffer[bufferIndex].low = ixLow; | |
112 | buffer[bufferIndex].high = ixHigh; | |
113 | ||
114 | bufferIndex += 1; | |
115 | } | |
116 | ||
117 | const RCEI *RCEBuffer::get() | |
118 | { | |
119 | if (bufferIndex > 0) { | |
120 | return &buffer[--bufferIndex]; | |
121 | } | |
122 | ||
123 | return NULL; | |
124 | } | |
125 | ||
46f4442e A |
126 | PCEBuffer::PCEBuffer() |
127 | { | |
128 | buffer = defaultBuffer; | |
129 | bufferIndex = 0; | |
b331163b | 130 | bufferSize = UPRV_LENGTHOF(defaultBuffer); |
46f4442e A |
131 | } |
132 | ||
133 | PCEBuffer::~PCEBuffer() | |
134 | { | |
135 | if (buffer != defaultBuffer) { | |
136 | DELETE_ARRAY(buffer); | |
137 | } | |
138 | } | |
139 | ||
140 | void PCEBuffer::reset() | |
141 | { | |
142 | bufferIndex = 0; | |
143 | } | |
144 | ||
2ca993e8 | 145 | UBool PCEBuffer::isEmpty() const |
46f4442e A |
146 | { |
147 | return bufferIndex <= 0; | |
148 | } | |
149 | ||
2ca993e8 | 150 | void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode) |
46f4442e | 151 | { |
2ca993e8 A |
152 | if (U_FAILURE(errorCode)) { |
153 | return; | |
154 | } | |
46f4442e A |
155 | if (bufferIndex >= bufferSize) { |
156 | PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); | |
2ca993e8 A |
157 | if (newBuffer == NULL) { |
158 | errorCode = U_MEMORY_ALLOCATION_ERROR; | |
159 | return; | |
160 | } | |
46f4442e A |
161 | |
162 | ARRAY_COPY(newBuffer, buffer, bufferSize); | |
163 | ||
164 | if (buffer != defaultBuffer) { | |
165 | DELETE_ARRAY(buffer); | |
166 | } | |
167 | ||
168 | buffer = newBuffer; | |
169 | bufferSize += BUFFER_GROW; | |
170 | } | |
171 | ||
172 | buffer[bufferIndex].ce = ce; | |
173 | buffer[bufferIndex].low = ixLow; | |
174 | buffer[bufferIndex].high = ixHigh; | |
175 | ||
176 | bufferIndex += 1; | |
177 | } | |
178 | ||
179 | const PCEI *PCEBuffer::get() | |
180 | { | |
181 | if (bufferIndex > 0) { | |
182 | return &buffer[--bufferIndex]; | |
183 | } | |
184 | ||
185 | return NULL; | |
186 | } | |
187 | ||
57a6839d | 188 | UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); } |
46f4442e | 189 | |
57a6839d | 190 | UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); } |
46f4442e | 191 | |
57a6839d A |
192 | void UCollationPCE::init(UCollationElements *elems) { |
193 | init(CollationElementIterator::fromUCollationElements(elems)); | |
194 | } | |
46f4442e | 195 | |
57a6839d | 196 | void UCollationPCE::init(CollationElementIterator *iter) |
46f4442e | 197 | { |
57a6839d A |
198 | cei = iter; |
199 | init(*iter->rbc_); | |
46f4442e A |
200 | } |
201 | ||
57a6839d | 202 | void UCollationPCE::init(const Collator &coll) |
46f4442e A |
203 | { |
204 | UErrorCode status = U_ZERO_ERROR; | |
205 | ||
57a6839d A |
206 | strength = coll.getAttribute(UCOL_STRENGTH, status); |
207 | toShift = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; | |
46f4442e | 208 | isShifted = FALSE; |
57a6839d | 209 | variableTop = coll.getVariableTop(status); |
46f4442e A |
210 | } |
211 | ||
212 | UCollationPCE::~UCollationPCE() | |
213 | { | |
214 | // nothing to do | |
215 | } | |
216 | ||
57a6839d | 217 | uint64_t UCollationPCE::processCE(uint32_t ce) |
46f4442e A |
218 | { |
219 | uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; | |
220 | ||
221 | // This is clean, but somewhat slow... | |
222 | // We could apply the mask to ce and then | |
223 | // just get all three orders... | |
57a6839d | 224 | switch(strength) { |
46f4442e A |
225 | default: |
226 | tertiary = ucol_tertiaryOrder(ce); | |
2ca993e8 | 227 | U_FALLTHROUGH; |
46f4442e A |
228 | |
229 | case UCOL_SECONDARY: | |
230 | secondary = ucol_secondaryOrder(ce); | |
2ca993e8 | 231 | U_FALLTHROUGH; |
46f4442e A |
232 | |
233 | case UCOL_PRIMARY: | |
234 | primary = ucol_primaryOrder(ce); | |
235 | } | |
236 | ||
729e4ab9 A |
237 | // **** This should probably handle continuations too. **** |
238 | // **** That means that we need 24 bits for the primary **** | |
239 | // **** instead of the 16 that we're currently using. **** | |
240 | // **** So we can lay out the 64 bits as: 24.12.12.16. **** | |
241 | // **** Another complication with continuations is that **** | |
242 | // **** the *second* CE is marked as a continuation, so **** | |
243 | // **** we always have to peek ahead to know how long **** | |
244 | // **** the primary is... **** | |
57a6839d A |
245 | if ((toShift && variableTop > ce && primary != 0) |
246 | || (isShifted && primary == 0)) { | |
46f4442e A |
247 | |
248 | if (primary == 0) { | |
249 | return UCOL_IGNORABLE; | |
250 | } | |
251 | ||
57a6839d | 252 | if (strength >= UCOL_QUATERNARY) { |
46f4442e A |
253 | quaternary = primary; |
254 | } | |
255 | ||
256 | primary = secondary = tertiary = 0; | |
57a6839d | 257 | isShifted = TRUE; |
46f4442e | 258 | } else { |
57a6839d | 259 | if (strength >= UCOL_QUATERNARY) { |
46f4442e A |
260 | quaternary = 0xFFFF; |
261 | } | |
262 | ||
57a6839d | 263 | isShifted = FALSE; |
46f4442e A |
264 | } |
265 | ||
46f4442e A |
266 | return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; |
267 | } | |
268 | ||
57a6839d | 269 | U_NAMESPACE_END |
46f4442e | 270 | |
b75a7d8f A |
271 | /* public methods ---------------------------------------------------- */ |
272 | ||
b75a7d8f A |
273 | U_CAPI UCollationElements* U_EXPORT2 |
274 | ucol_openElements(const UCollator *coll, | |
275 | const UChar *text, | |
276 | int32_t textLength, | |
277 | UErrorCode *status) | |
278 | { | |
46f4442e A |
279 | if (U_FAILURE(*status)) { |
280 | return NULL; | |
281 | } | |
57a6839d A |
282 | if (coll == NULL || (text == NULL && textLength != 0)) { |
283 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
284 | return NULL; | |
285 | } | |
286 | const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); | |
287 | if (rbc == NULL) { | |
288 | *status = U_UNSUPPORTED_ERROR; // coll is a Collator but not a RuleBasedCollator | |
46f4442e A |
289 | return NULL; |
290 | } | |
729e4ab9 | 291 | |
57a6839d A |
292 | UnicodeString s((UBool)(textLength < 0), text, textLength); |
293 | CollationElementIterator *cei = rbc->createCollationElementIterator(s); | |
294 | if (cei == NULL) { | |
295 | *status = U_MEMORY_ALLOCATION_ERROR; | |
296 | return NULL; | |
46f4442e | 297 | } |
b75a7d8f | 298 | |
57a6839d | 299 | return cei->toUCollationElements(); |
b75a7d8f A |
300 | } |
301 | ||
729e4ab9 | 302 | |
b75a7d8f A |
303 | U_CAPI void U_EXPORT2 |
304 | ucol_closeElements(UCollationElements *elems) | |
305 | { | |
57a6839d | 306 | delete CollationElementIterator::fromUCollationElements(elems); |
b75a7d8f A |
307 | } |
308 | ||
309 | U_CAPI void U_EXPORT2 | |
310 | ucol_reset(UCollationElements *elems) | |
311 | { | |
57a6839d | 312 | CollationElementIterator::fromUCollationElements(elems)->reset(); |
729e4ab9 A |
313 | } |
314 | ||
b75a7d8f A |
315 | U_CAPI int32_t U_EXPORT2 |
316 | ucol_next(UCollationElements *elems, | |
317 | UErrorCode *status) | |
318 | { | |
46f4442e A |
319 | if (U_FAILURE(*status)) { |
320 | return UCOL_NULLORDER; | |
321 | } | |
b75a7d8f | 322 | |
57a6839d | 323 | return CollationElementIterator::fromUCollationElements(elems)->next(*status); |
46f4442e A |
324 | } |
325 | ||
57a6839d | 326 | // temporarily restore the following removed internal function which is used by Spotlight |
46f4442e A |
327 | U_CAPI int64_t U_EXPORT2 |
328 | ucol_nextProcessed(UCollationElements *elems, | |
329 | int32_t *ixLow, | |
330 | int32_t *ixHigh, | |
331 | UErrorCode *status) | |
332 | { | |
57a6839d A |
333 | return (UCollationPCE::UCollationPCE(elems)).nextProcessed(ixLow, ixHigh, status); |
334 | } | |
335 | ||
336 | ||
337 | U_NAMESPACE_BEGIN | |
338 | ||
339 | int64_t | |
340 | UCollationPCE::nextProcessed( | |
341 | int32_t *ixLow, | |
342 | int32_t *ixHigh, | |
343 | UErrorCode *status) | |
344 | { | |
46f4442e A |
345 | int64_t result = UCOL_IGNORABLE; |
346 | uint32_t low = 0, high = 0; | |
347 | ||
348 | if (U_FAILURE(*status)) { | |
349 | return UCOL_PROCESSED_NULLORDER; | |
350 | } | |
351 | ||
57a6839d | 352 | pceBuffer.reset(); |
46f4442e A |
353 | |
354 | do { | |
57a6839d A |
355 | low = cei->getOffset(); |
356 | int32_t ce = cei->next(*status); | |
357 | high = cei->getOffset(); | |
46f4442e | 358 | |
57a6839d | 359 | if (ce == UCOL_NULLORDER) { |
46f4442e A |
360 | result = UCOL_PROCESSED_NULLORDER; |
361 | break; | |
362 | } | |
363 | ||
57a6839d | 364 | result = processCE((uint32_t)ce); |
46f4442e A |
365 | } while (result == UCOL_IGNORABLE); |
366 | ||
367 | if (ixLow != NULL) { | |
368 | *ixLow = low; | |
369 | } | |
370 | ||
371 | if (ixHigh != NULL) { | |
372 | *ixHigh = high; | |
373 | } | |
374 | ||
375 | return result; | |
b75a7d8f A |
376 | } |
377 | ||
57a6839d A |
378 | U_NAMESPACE_END |
379 | ||
b75a7d8f A |
380 | U_CAPI int32_t U_EXPORT2 |
381 | ucol_previous(UCollationElements *elems, | |
382 | UErrorCode *status) | |
383 | { | |
46f4442e A |
384 | if(U_FAILURE(*status)) { |
385 | return UCOL_NULLORDER; | |
386 | } | |
57a6839d | 387 | return CollationElementIterator::fromUCollationElements(elems)->previous(*status); |
46f4442e A |
388 | } |
389 | ||
57a6839d | 390 | // temporarily restore the following removed internal function which is used by Spotlight |
46f4442e A |
391 | U_CAPI int64_t U_EXPORT2 |
392 | ucol_previousProcessed(UCollationElements *elems, | |
393 | int32_t *ixLow, | |
394 | int32_t *ixHigh, | |
395 | UErrorCode *status) | |
396 | { | |
57a6839d A |
397 | return (UCollationPCE::UCollationPCE(elems)).previousProcessed(ixLow, ixHigh, status); |
398 | } | |
399 | ||
400 | U_NAMESPACE_BEGIN | |
401 | ||
402 | int64_t | |
403 | UCollationPCE::previousProcessed( | |
404 | int32_t *ixLow, | |
405 | int32_t *ixHigh, | |
406 | UErrorCode *status) | |
407 | { | |
46f4442e | 408 | int64_t result = UCOL_IGNORABLE; |
46f4442e A |
409 | int32_t low = 0, high = 0; |
410 | ||
411 | if (U_FAILURE(*status)) { | |
412 | return UCOL_PROCESSED_NULLORDER; | |
413 | } | |
b75a7d8f | 414 | |
57a6839d | 415 | // pceBuffer.reset(); |
b75a7d8f | 416 | |
2ca993e8 | 417 | while (pceBuffer.isEmpty()) { |
46f4442e A |
418 | // buffer raw CEs up to non-ignorable primary |
419 | RCEBuffer rceb; | |
57a6839d | 420 | int32_t ce; |
46f4442e A |
421 | |
422 | // **** do we need to reset rceb, or will it always be empty at this point **** | |
423 | do { | |
57a6839d A |
424 | high = cei->getOffset(); |
425 | ce = cei->previous(*status); | |
426 | low = cei->getOffset(); | |
46f4442e | 427 | |
57a6839d | 428 | if (ce == UCOL_NULLORDER) { |
2ca993e8 | 429 | if (!rceb.isEmpty()) { |
46f4442e A |
430 | break; |
431 | } | |
432 | ||
433 | goto finish; | |
434 | } | |
435 | ||
2ca993e8 A |
436 | rceb.put((uint32_t)ce, low, high, *status); |
437 | } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce))); | |
46f4442e A |
438 | |
439 | // process the raw CEs | |
2ca993e8 | 440 | while (U_SUCCESS(*status) && !rceb.isEmpty()) { |
46f4442e A |
441 | const RCEI *rcei = rceb.get(); |
442 | ||
57a6839d | 443 | result = processCE(rcei->ce); |
46f4442e A |
444 | |
445 | if (result != UCOL_IGNORABLE) { | |
2ca993e8 | 446 | pceBuffer.put(result, rcei->low, rcei->high, *status); |
46f4442e A |
447 | } |
448 | } | |
2ca993e8 A |
449 | if (U_FAILURE(*status)) { |
450 | return UCOL_PROCESSED_NULLORDER; | |
451 | } | |
46f4442e | 452 | } |
b75a7d8f | 453 | |
46f4442e | 454 | finish: |
2ca993e8 | 455 | if (pceBuffer.isEmpty()) { |
46f4442e A |
456 | // **** Is -1 the right value for ixLow, ixHigh? **** |
457 | if (ixLow != NULL) { | |
458 | *ixLow = -1; | |
459 | } | |
460 | ||
461 | if (ixHigh != NULL) { | |
462 | *ixHigh = -1 | |
463 | ; | |
464 | } | |
465 | return UCOL_PROCESSED_NULLORDER; | |
b75a7d8f A |
466 | } |
467 | ||
57a6839d | 468 | const PCEI *pcei = pceBuffer.get(); |
46f4442e A |
469 | |
470 | if (ixLow != NULL) { | |
471 | *ixLow = pcei->low; | |
472 | } | |
473 | ||
474 | if (ixHigh != NULL) { | |
475 | *ixHigh = pcei->high; | |
476 | } | |
477 | ||
478 | return pcei->ce; | |
b75a7d8f A |
479 | } |
480 | ||
57a6839d A |
481 | U_NAMESPACE_END |
482 | ||
b75a7d8f A |
483 | U_CAPI int32_t U_EXPORT2 |
484 | ucol_getMaxExpansion(const UCollationElements *elems, | |
485 | int32_t order) | |
486 | { | |
57a6839d | 487 | return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order); |
46f4442e | 488 | |
57a6839d A |
489 | // TODO: The old code masked the order according to strength and then did a binary search. |
490 | // However this was probably at least partially broken because of the following comment. | |
491 | // Still, it might have found a match when this version may not. | |
46f4442e A |
492 | |
493 | // FIXME: with a masked search, there might be more than one hit, | |
494 | // so we need to look forward and backward from the match to find all | |
495 | // of the hits... | |
b75a7d8f | 496 | } |
57a6839d | 497 | |
b75a7d8f A |
498 | U_CAPI void U_EXPORT2 |
499 | ucol_setText( UCollationElements *elems, | |
500 | const UChar *text, | |
501 | int32_t textLength, | |
502 | UErrorCode *status) | |
503 | { | |
46f4442e A |
504 | if (U_FAILURE(*status)) { |
505 | return; | |
506 | } | |
b75a7d8f | 507 | |
57a6839d A |
508 | if ((text == NULL && textLength != 0)) { |
509 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
510 | return; | |
729e4ab9 | 511 | } |
57a6839d A |
512 | UnicodeString s((UBool)(textLength < 0), text, textLength); |
513 | return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status); | |
b75a7d8f A |
514 | } |
515 | ||
516 | U_CAPI int32_t U_EXPORT2 | |
517 | ucol_getOffset(const UCollationElements *elems) | |
518 | { | |
57a6839d | 519 | return CollationElementIterator::fromUCollationElements(elems)->getOffset(); |
b75a7d8f A |
520 | } |
521 | ||
522 | U_CAPI void U_EXPORT2 | |
523 | ucol_setOffset(UCollationElements *elems, | |
524 | int32_t offset, | |
525 | UErrorCode *status) | |
526 | { | |
46f4442e A |
527 | if (U_FAILURE(*status)) { |
528 | return; | |
529 | } | |
b75a7d8f | 530 | |
57a6839d | 531 | CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status); |
b75a7d8f A |
532 | } |
533 | ||
534 | U_CAPI int32_t U_EXPORT2 | |
535 | ucol_primaryOrder (int32_t order) | |
536 | { | |
57a6839d | 537 | return (order >> 16) & 0xffff; |
b75a7d8f A |
538 | } |
539 | ||
540 | U_CAPI int32_t U_EXPORT2 | |
541 | ucol_secondaryOrder (int32_t order) | |
542 | { | |
57a6839d | 543 | return (order >> 8) & 0xff; |
b75a7d8f A |
544 | } |
545 | ||
546 | U_CAPI int32_t U_EXPORT2 | |
547 | ucol_tertiaryOrder (int32_t order) | |
548 | { | |
57a6839d | 549 | return order & 0xff; |
729e4ab9 A |
550 | } |
551 | ||
b75a7d8f | 552 | #endif /* #if !UCONFIG_NO_COLLATION */ |