]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucoleitr.cpp
ICU-400.38.tar.gz
[apple/icu.git] / icuSources / i18n / ucoleitr.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 2001-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 ******************************************************************************/
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ustring.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/uobject.h"
24 #include "ucol_imp.h"
25 #include "cmemory.h"
26
27 U_NAMESPACE_USE
28
29 #define BUFFER_LENGTH 100
30
31 #define DEFAULT_BUFFER_SIZE 16
32 #define BUFFER_GROW 8
33
34 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
35
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39
40 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
41
42 #define DELETE_ARRAY(array) uprv_free((void *) (array))
43
44 typedef struct collIterate collIterator;
45
46 struct RCEI
47 {
48 uint32_t ce;
49 int32_t low;
50 int32_t high;
51 };
52
53 U_NAMESPACE_BEGIN
54
55 struct RCEBuffer
56 {
57 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
58 RCEI *buffer;
59 int32_t bufferIndex;
60 int32_t bufferSize;
61
62 RCEBuffer();
63 ~RCEBuffer();
64
65 UBool empty() const;
66 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
67 const RCEI *get();
68 };
69
70 RCEBuffer::RCEBuffer()
71 {
72 buffer = defaultBuffer;
73 bufferIndex = 0;
74 bufferSize = DEFAULT_BUFFER_SIZE;
75 }
76
77 RCEBuffer::~RCEBuffer()
78 {
79 if (buffer != defaultBuffer) {
80 DELETE_ARRAY(buffer);
81 }
82 }
83
84 UBool RCEBuffer::empty() const
85 {
86 return bufferIndex <= 0;
87 }
88
89 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
90 {
91 if (bufferIndex >= bufferSize) {
92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93
94 ARRAY_COPY(newBuffer, buffer, bufferSize);
95
96 if (buffer != defaultBuffer) {
97 DELETE_ARRAY(buffer);
98 }
99
100 buffer = newBuffer;
101 bufferSize += BUFFER_GROW;
102 }
103
104 buffer[bufferIndex].ce = ce;
105 buffer[bufferIndex].low = ixLow;
106 buffer[bufferIndex].high = ixHigh;
107
108 bufferIndex += 1;
109 }
110
111 const RCEI *RCEBuffer::get()
112 {
113 if (bufferIndex > 0) {
114 return &buffer[--bufferIndex];
115 }
116
117 return NULL;
118 }
119
120 struct PCEI
121 {
122 uint64_t ce;
123 int32_t low;
124 int32_t high;
125 };
126
127 struct PCEBuffer
128 {
129 PCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
130 PCEI *buffer;
131 int32_t bufferIndex;
132 int32_t bufferSize;
133
134 PCEBuffer();
135 ~PCEBuffer();
136
137 void reset();
138 UBool empty() const;
139 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
140 const PCEI *get();
141 };
142
143 PCEBuffer::PCEBuffer()
144 {
145 buffer = defaultBuffer;
146 bufferIndex = 0;
147 bufferSize = DEFAULT_BUFFER_SIZE;
148 }
149
150 PCEBuffer::~PCEBuffer()
151 {
152 if (buffer != defaultBuffer) {
153 DELETE_ARRAY(buffer);
154 }
155 }
156
157 void PCEBuffer::reset()
158 {
159 bufferIndex = 0;
160 }
161
162 UBool PCEBuffer::empty() const
163 {
164 return bufferIndex <= 0;
165 }
166
167 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
168 {
169 if (bufferIndex >= bufferSize) {
170 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
171
172 ARRAY_COPY(newBuffer, buffer, bufferSize);
173
174 if (buffer != defaultBuffer) {
175 DELETE_ARRAY(buffer);
176 }
177
178 buffer = newBuffer;
179 bufferSize += BUFFER_GROW;
180 }
181
182 buffer[bufferIndex].ce = ce;
183 buffer[bufferIndex].low = ixLow;
184 buffer[bufferIndex].high = ixHigh;
185
186 bufferIndex += 1;
187 }
188
189 const PCEI *PCEBuffer::get()
190 {
191 if (bufferIndex > 0) {
192 return &buffer[--bufferIndex];
193 }
194
195 return NULL;
196 }
197
198 /*
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
202 */
203 struct UCollationPCE : public UObject
204 {
205 PCEBuffer pceBuffer;
206 UCollationStrength strength;
207 UBool toShift;
208 UBool isShifted;
209 uint32_t variableTop;
210
211 UCollationPCE(UCollationElements *elems);
212 ~UCollationPCE();
213
214 void init(const UCollator *coll);
215
216 virtual UClassID getDynamicClassID() const;
217 static UClassID getStaticClassID();
218 };
219
220 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
221
222 UCollationPCE::UCollationPCE(UCollationElements *elems)
223 {
224 init(elems->iteratordata_.coll);
225 }
226
227 void UCollationPCE::init(const UCollator *coll)
228 {
229 UErrorCode status = U_ZERO_ERROR;
230
231 strength = ucol_getStrength(coll);
232 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
233 isShifted = FALSE;
234 variableTop = coll->variableTopValue << 16;
235 }
236
237 UCollationPCE::~UCollationPCE()
238 {
239 // nothing to do
240 }
241
242
243 U_NAMESPACE_END
244
245
246 inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
247 {
248 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
249
250 // This is clean, but somewhat slow...
251 // We could apply the mask to ce and then
252 // just get all three orders...
253 switch(elems->pce->strength) {
254 default:
255 tertiary = ucol_tertiaryOrder(ce);
256 /* note fall-through */
257
258 case UCOL_SECONDARY:
259 secondary = ucol_secondaryOrder(ce);
260 /* note fall-through */
261
262 case UCOL_PRIMARY:
263 primary = ucol_primaryOrder(ce);
264 }
265
266 // Continuation?
267 if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0)
268 || (elems->pce->isShifted && primary == 0)) {
269
270 if (primary == 0) {
271 return UCOL_IGNORABLE;
272 }
273
274 if (elems->pce->strength >= UCOL_QUATERNARY) {
275 quaternary = primary;
276 }
277
278 primary = secondary = tertiary = 0;
279 elems->pce->isShifted = TRUE;
280 } else {
281 if (elems->pce->strength >= UCOL_QUATERNARY) {
282 quaternary = 0xFFFF;
283 }
284
285 elems->pce->isShifted = FALSE;
286 }
287
288
289 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
290 }
291
292 U_CAPI void U_EXPORT2
293 uprv_init_pce(const UCollationElements *elems)
294 {
295 if (elems->pce != NULL) {
296 elems->pce->init(elems->iteratordata_.coll);
297 }
298 }
299
300
301
302 /* public methods ---------------------------------------------------- */
303
304 U_CAPI UCollationElements* U_EXPORT2
305 ucol_openElements(const UCollator *coll,
306 const UChar *text,
307 int32_t textLength,
308 UErrorCode *status)
309 {
310 UCollationElements *result;
311
312 if (U_FAILURE(*status)) {
313 return NULL;
314 }
315
316 result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
317 /* test for NULL */
318 if (result == NULL) {
319 *status = U_MEMORY_ALLOCATION_ERROR;
320 return NULL;
321 }
322
323 result->reset_ = TRUE;
324 result->isWritable = FALSE;
325 result->pce = NULL;
326
327 if (text == NULL) {
328 textLength = 0;
329 }
330 uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
331
332 return result;
333 }
334
335 U_CAPI void U_EXPORT2
336 ucol_closeElements(UCollationElements *elems)
337 {
338 if (elems != NULL) {
339 collIterate *ci = &elems->iteratordata_;
340
341 if (ci != NULL) {
342 if (ci->writableBuffer != ci->stackWritableBuffer) {
343 uprv_free(ci->writableBuffer);
344 }
345
346 if (ci->extendCEs) {
347 uprv_free(ci->extendCEs);
348 }
349
350 if (ci->offsetBuffer) {
351 uprv_free(ci->offsetBuffer);
352 }
353 }
354
355 if (elems->isWritable && elems->iteratordata_.string != NULL)
356 {
357 uprv_free(elems->iteratordata_.string);
358 }
359
360 if (elems->pce != NULL) {
361 delete elems->pce;
362 }
363
364 uprv_free(elems);
365 }
366 }
367
368 U_CAPI void U_EXPORT2
369 ucol_reset(UCollationElements *elems)
370 {
371 collIterate *ci = &(elems->iteratordata_);
372 elems->reset_ = TRUE;
373 ci->pos = ci->string;
374 if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
375 ci->endp = ci->string + u_strlen(ci->string);
376 }
377 ci->CEpos = ci->toReturn = ci->CEs;
378 ci->flags = UCOL_ITER_HASLEN;
379 if (ci->coll->normalizationMode == UCOL_ON) {
380 ci->flags |= UCOL_ITER_NORM;
381 }
382
383 if (ci->stackWritableBuffer != ci->writableBuffer) {
384 uprv_free(ci->writableBuffer);
385 ci->writableBuffer = ci->stackWritableBuffer;
386 ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
387 }
388 ci->fcdPosition = NULL;
389
390 //ci->offsetReturn = ci->offsetStore = NULL;
391 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
392 }
393
394 U_CAPI int32_t U_EXPORT2
395 ucol_next(UCollationElements *elems,
396 UErrorCode *status)
397 {
398 int32_t result;
399 if (U_FAILURE(*status)) {
400 return UCOL_NULLORDER;
401 }
402
403 elems->reset_ = FALSE;
404
405 result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll,
406 &elems->iteratordata_,
407 status);
408
409 if (result == UCOL_NO_MORE_CES) {
410 result = UCOL_NULLORDER;
411 }
412 return result;
413 }
414
415 U_CAPI int64_t U_EXPORT2
416 ucol_nextProcessed(UCollationElements *elems,
417 int32_t *ixLow,
418 int32_t *ixHigh,
419 UErrorCode *status)
420 {
421 const UCollator *coll = elems->iteratordata_.coll;
422 int64_t result = UCOL_IGNORABLE;
423 uint32_t low = 0, high = 0;
424
425 if (U_FAILURE(*status)) {
426 return UCOL_PROCESSED_NULLORDER;
427 }
428
429 if (elems->pce == NULL) {
430 elems->pce = new UCollationPCE(elems);
431 } else {
432 elems->pce->pceBuffer.reset();
433 }
434
435 elems->reset_ = FALSE;
436
437 do {
438 low = ucol_getOffset(elems);
439 uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
440 high = ucol_getOffset(elems);
441
442 if (ce == UCOL_NO_MORE_CES) {
443 result = UCOL_PROCESSED_NULLORDER;
444 break;
445 }
446
447 result = processCE(elems, ce);
448 } while (result == UCOL_IGNORABLE);
449
450 if (ixLow != NULL) {
451 *ixLow = low;
452 }
453
454 if (ixHigh != NULL) {
455 *ixHigh = high;
456 }
457
458 return result;
459 }
460
461 U_CAPI int32_t U_EXPORT2
462 ucol_previous(UCollationElements *elems,
463 UErrorCode *status)
464 {
465 if(U_FAILURE(*status)) {
466 return UCOL_NULLORDER;
467 }
468 else
469 {
470 int32_t result;
471
472 if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) {
473 if (elems->iteratordata_.endp == NULL) {
474 elems->iteratordata_.endp = elems->iteratordata_.string +
475 u_strlen(elems->iteratordata_.string);
476 elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
477 }
478 elems->iteratordata_.pos = elems->iteratordata_.endp;
479 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
480 }
481
482 elems->reset_ = FALSE;
483
484 result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll,
485 &(elems->iteratordata_),
486 status);
487
488 if (result == UCOL_NO_MORE_CES) {
489 result = UCOL_NULLORDER;
490 }
491
492 return result;
493 }
494 }
495
496 U_CAPI int64_t U_EXPORT2
497 ucol_previousProcessed(UCollationElements *elems,
498 int32_t *ixLow,
499 int32_t *ixHigh,
500 UErrorCode *status)
501 {
502 const UCollator *coll = elems->iteratordata_.coll;
503 int64_t result = UCOL_IGNORABLE;
504 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
505 // UCollationStrength strength = ucol_getStrength(coll);
506 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
507 // uint32_t variableTop = coll->variableTopValue;
508 int32_t low = 0, high = 0;
509
510 if (U_FAILURE(*status)) {
511 return UCOL_PROCESSED_NULLORDER;
512 }
513
514 if (elems->reset_ &&
515 (elems->iteratordata_.pos == elems->iteratordata_.string)) {
516 if (elems->iteratordata_.endp == NULL) {
517 elems->iteratordata_.endp = elems->iteratordata_.string +
518 u_strlen(elems->iteratordata_.string);
519 elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
520 }
521
522 elems->iteratordata_.pos = elems->iteratordata_.endp;
523 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
524 }
525
526 if (elems->pce == NULL) {
527 elems->pce = new UCollationPCE(elems);
528 } else {
529 //elems->pce->pceBuffer.reset();
530 }
531
532 elems->reset_ = FALSE;
533
534 while (elems->pce->pceBuffer.empty()) {
535 // buffer raw CEs up to non-ignorable primary
536 RCEBuffer rceb;
537 uint32_t ce;
538
539 // **** do we need to reset rceb, or will it always be empty at this point ****
540 do {
541 high = ucol_getOffset(elems);
542 ce = ucol_getPrevCE(coll, &elems->iteratordata_, status);
543 low = ucol_getOffset(elems);
544
545 if (ce == UCOL_NO_MORE_CES) {
546 if (! rceb.empty()) {
547 break;
548 }
549
550 goto finish;
551 }
552
553 rceb.put(ce, low, high);
554 } while ((ce & UCOL_PRIMARYMASK) == 0);
555
556 // process the raw CEs
557 while (! rceb.empty()) {
558 const RCEI *rcei = rceb.get();
559
560 result = processCE(elems, rcei->ce);
561
562 if (result != UCOL_IGNORABLE) {
563 elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
564 }
565 }
566 }
567
568 finish:
569 if (elems->pce->pceBuffer.empty()) {
570 // **** Is -1 the right value for ixLow, ixHigh? ****
571 if (ixLow != NULL) {
572 *ixLow = -1;
573 }
574
575 if (ixHigh != NULL) {
576 *ixHigh = -1
577 ;
578 }
579 return UCOL_PROCESSED_NULLORDER;
580 }
581
582 const PCEI *pcei = elems->pce->pceBuffer.get();
583
584 if (ixLow != NULL) {
585 *ixLow = pcei->low;
586 }
587
588 if (ixHigh != NULL) {
589 *ixHigh = pcei->high;
590 }
591
592 return pcei->ce;
593 }
594
595 U_CAPI int32_t U_EXPORT2
596 ucol_getMaxExpansion(const UCollationElements *elems,
597 int32_t order)
598 {
599 uint8_t result;
600
601 #if 0
602 UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
603 #else
604 const UCollator *coll = elems->iteratordata_.coll;
605 const uint32_t *start;
606 const uint32_t *limit;
607 const uint32_t *mid;
608 uint32_t strengthMask = 0;
609 uint32_t mOrder = (uint32_t) order;
610
611 switch (coll->strength)
612 {
613 default:
614 strengthMask |= UCOL_TERTIARYORDERMASK;
615 /* fall through */
616
617 case UCOL_SECONDARY:
618 strengthMask |= UCOL_SECONDARYORDERMASK;
619 /* fall through */
620
621 case UCOL_PRIMARY:
622 strengthMask |= UCOL_PRIMARYORDERMASK;
623 }
624
625 mOrder &= strengthMask;
626 start = (coll)->endExpansionCE;
627 limit = (coll)->lastEndExpansionCE;
628
629 while (start < limit - 1) {
630 mid = start + ((limit - start) >> 1);
631 if (mOrder <= (*mid & strengthMask)) {
632 limit = mid;
633 } else {
634 start = mid;
635 }
636 }
637
638 // FIXME: with a masked search, there might be more than one hit,
639 // so we need to look forward and backward from the match to find all
640 // of the hits...
641 if ((*start & strengthMask) == mOrder) {
642 result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));
643 } else if ((*limit & strengthMask) == mOrder) {
644 result = *(coll->expansionCESize + (limit - coll->endExpansionCE));
645 } else if ((mOrder & 0xFFFF) == 0x00C0) {
646 result = 2;
647 } else {
648 result = 1;
649 }
650 #endif
651
652 return result;
653 }
654
655 U_CAPI void U_EXPORT2
656 ucol_setText( UCollationElements *elems,
657 const UChar *text,
658 int32_t textLength,
659 UErrorCode *status)
660 {
661 if (U_FAILURE(*status)) {
662 return;
663 }
664
665 if (elems->isWritable && elems->iteratordata_.string != NULL)
666 {
667 uprv_free(elems->iteratordata_.string);
668 }
669
670 if (text == NULL) {
671 textLength = 0;
672 }
673
674 elems->isWritable = FALSE;
675
676 /* free offset buffer to avoid memory leak before initializing. */
677 freeOffsetBuffer(&(elems->iteratordata_));
678 uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
679 &elems->iteratordata_);
680
681 elems->reset_ = TRUE;
682 }
683
684 U_CAPI int32_t U_EXPORT2
685 ucol_getOffset(const UCollationElements *elems)
686 {
687 const collIterate *ci = &(elems->iteratordata_);
688
689 if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
690 return ci->offsetRepeatValue;
691 }
692
693 if (ci->offsetReturn != NULL) {
694 return *ci->offsetReturn;
695 }
696
697 // while processing characters in normalization buffer getOffset will
698 // return the next non-normalized character.
699 // should be inline with the old implementation since the old codes uses
700 // nextDecomp in normalizer which also decomposes the string till the
701 // first base character is found.
702 if (ci->flags & UCOL_ITER_INNORMBUF) {
703 if (ci->fcdPosition == NULL) {
704 return 0;
705 }
706 return (int32_t)(ci->fcdPosition - ci->string);
707 }
708 else {
709 return (int32_t)(ci->pos - ci->string);
710 }
711 }
712
713 U_CAPI void U_EXPORT2
714 ucol_setOffset(UCollationElements *elems,
715 int32_t offset,
716 UErrorCode *status)
717 {
718 if (U_FAILURE(*status)) {
719 return;
720 }
721
722 // this methods will clean up any use of the writable buffer and points to
723 // the original string
724 collIterate *ci = &(elems->iteratordata_);
725 ci->pos = ci->string + offset;
726 ci->CEpos = ci->toReturn = ci->CEs;
727 if (ci->flags & UCOL_ITER_INNORMBUF) {
728 ci->flags = ci->origFlags;
729 }
730 if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
731 ci->endp = ci->string + u_strlen(ci->string);
732 ci->flags |= UCOL_ITER_HASLEN;
733 }
734 ci->fcdPosition = NULL;
735 elems->reset_ = FALSE;
736
737 ci->offsetReturn = NULL;
738 ci->offsetStore = ci->offsetBuffer;
739 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
740 }
741
742 U_CAPI int32_t U_EXPORT2
743 ucol_primaryOrder (int32_t order)
744 {
745 order &= UCOL_PRIMARYMASK;
746 return (order >> UCOL_PRIMARYORDERSHIFT);
747 }
748
749 U_CAPI int32_t U_EXPORT2
750 ucol_secondaryOrder (int32_t order)
751 {
752 order &= UCOL_SECONDARYMASK;
753 return (order >> UCOL_SECONDARYORDERSHIFT);
754 }
755
756 U_CAPI int32_t U_EXPORT2
757 ucol_tertiaryOrder (int32_t order)
758 {
759 return (order & UCOL_TERTIARYMASK);
760 }
761
762 #endif /* #if !UCONFIG_NO_COLLATION */