]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucoleitr.cpp
ICU-511.27.tar.gz
[apple/icu.git] / icuSources / i18n / ucoleitr.cpp
1 /*
2 ******************************************************************************
3 * Copyright (C) 2001-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File ucoleitr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 02/15/2001 synwee Modified all methods to process its own function
13 * instead of calling the equivalent c++ api (coleitr.h)
14 ******************************************************************************/
15
16 #include "unicode/utypes.h"
17
18 #if !UCONFIG_NO_COLLATION
19
20 #include "unicode/ucoleitr.h"
21 #include "unicode/ustring.h"
22 #include "unicode/sortkey.h"
23 #include "unicode/uobject.h"
24 #include "ucol_imp.h"
25 #include "cmemory.h"
26
27 U_NAMESPACE_USE
28
29 #define BUFFER_LENGTH 100
30
31 #define DEFAULT_BUFFER_SIZE 16
32 #define BUFFER_GROW 8
33
34 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
35
36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
37
38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39
40 #define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])
41
42 #define DELETE_ARRAY(array) uprv_free((void *) (array))
43
44 typedef struct icu::collIterate collIterator;
45
46 struct RCEI
47 {
48 uint32_t ce;
49 int32_t low;
50 int32_t high;
51 };
52
53 U_NAMESPACE_BEGIN
54
55 struct RCEBuffer
56 {
57 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
58 RCEI *buffer;
59 int32_t bufferIndex;
60 int32_t bufferSize;
61
62 RCEBuffer();
63 ~RCEBuffer();
64
65 UBool empty() const;
66 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
67 const RCEI *get();
68 };
69
70 RCEBuffer::RCEBuffer()
71 {
72 buffer = defaultBuffer;
73 bufferIndex = 0;
74 bufferSize = DEFAULT_BUFFER_SIZE;
75 }
76
77 RCEBuffer::~RCEBuffer()
78 {
79 if (buffer != defaultBuffer) {
80 DELETE_ARRAY(buffer);
81 }
82 }
83
84 UBool RCEBuffer::empty() const
85 {
86 return bufferIndex <= 0;
87 }
88
89 void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
90 {
91 if (bufferIndex >= bufferSize) {
92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
93
94 ARRAY_COPY(newBuffer, buffer, bufferSize);
95
96 if (buffer != defaultBuffer) {
97 DELETE_ARRAY(buffer);
98 }
99
100 buffer = newBuffer;
101 bufferSize += BUFFER_GROW;
102 }
103
104 buffer[bufferIndex].ce = ce;
105 buffer[bufferIndex].low = ixLow;
106 buffer[bufferIndex].high = ixHigh;
107
108 bufferIndex += 1;
109 }
110
111 const RCEI *RCEBuffer::get()
112 {
113 if (bufferIndex > 0) {
114 return &buffer[--bufferIndex];
115 }
116
117 return NULL;
118 }
119
120 struct PCEI
121 {
122 uint64_t ce;
123 int32_t low;
124 int32_t high;
125 };
126
127 struct PCEBuffer
128 {
129 PCEI defaultBuffer[DEFAULT_BUFFER_SIZE];
130 PCEI *buffer;
131 int32_t bufferIndex;
132 int32_t bufferSize;
133
134 PCEBuffer();
135 ~PCEBuffer();
136
137 void reset();
138 UBool empty() const;
139 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
140 const PCEI *get();
141 };
142
143 PCEBuffer::PCEBuffer()
144 {
145 buffer = defaultBuffer;
146 bufferIndex = 0;
147 bufferSize = DEFAULT_BUFFER_SIZE;
148 }
149
150 PCEBuffer::~PCEBuffer()
151 {
152 if (buffer != defaultBuffer) {
153 DELETE_ARRAY(buffer);
154 }
155 }
156
157 void PCEBuffer::reset()
158 {
159 bufferIndex = 0;
160 }
161
162 UBool PCEBuffer::empty() const
163 {
164 return bufferIndex <= 0;
165 }
166
167 void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
168 {
169 if (bufferIndex >= bufferSize) {
170 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
171
172 ARRAY_COPY(newBuffer, buffer, bufferSize);
173
174 if (buffer != defaultBuffer) {
175 DELETE_ARRAY(buffer);
176 }
177
178 buffer = newBuffer;
179 bufferSize += BUFFER_GROW;
180 }
181
182 buffer[bufferIndex].ce = ce;
183 buffer[bufferIndex].low = ixLow;
184 buffer[bufferIndex].high = ixHigh;
185
186 bufferIndex += 1;
187 }
188
189 const PCEI *PCEBuffer::get()
190 {
191 if (bufferIndex > 0) {
192 return &buffer[--bufferIndex];
193 }
194
195 return NULL;
196 }
197
198 /*
199 * This inherits from UObject so that
200 * it can be allocated by new and the
201 * constructor for PCEBuffer is called.
202 */
203 struct UCollationPCE : public UObject
204 {
205 PCEBuffer pceBuffer;
206 UCollationStrength strength;
207 UBool toShift;
208 UBool isShifted;
209 uint32_t variableTop;
210
211 UCollationPCE(UCollationElements *elems);
212 ~UCollationPCE();
213
214 void init(const UCollator *coll);
215
216 virtual UClassID getDynamicClassID() const;
217 static UClassID getStaticClassID();
218 };
219
220 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)
221
222 UCollationPCE::UCollationPCE(UCollationElements *elems)
223 {
224 init(elems->iteratordata_.coll);
225 }
226
227 void UCollationPCE::init(const UCollator *coll)
228 {
229 UErrorCode status = U_ZERO_ERROR;
230
231 strength = ucol_getStrength(coll);
232 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
233 isShifted = FALSE;
234 variableTop = coll->variableTopValue << 16;
235 }
236
237 UCollationPCE::~UCollationPCE()
238 {
239 // nothing to do
240 }
241
242
243 U_NAMESPACE_END
244
245
246 inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
247 {
248 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
249
250 // This is clean, but somewhat slow...
251 // We could apply the mask to ce and then
252 // just get all three orders...
253 switch(elems->pce->strength) {
254 default:
255 tertiary = ucol_tertiaryOrder(ce);
256 /* note fall-through */
257
258 case UCOL_SECONDARY:
259 secondary = ucol_secondaryOrder(ce);
260 /* note fall-through */
261
262 case UCOL_PRIMARY:
263 primary = ucol_primaryOrder(ce);
264 }
265
266 // **** This should probably handle continuations too. ****
267 // **** That means that we need 24 bits for the primary ****
268 // **** instead of the 16 that we're currently using. ****
269 // **** So we can lay out the 64 bits as: 24.12.12.16. ****
270 // **** Another complication with continuations is that ****
271 // **** the *second* CE is marked as a continuation, so ****
272 // **** we always have to peek ahead to know how long ****
273 // **** the primary is... ****
274 if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0)
275 || (elems->pce->isShifted && primary == 0)) {
276
277 if (primary == 0) {
278 return UCOL_IGNORABLE;
279 }
280
281 if (elems->pce->strength >= UCOL_QUATERNARY) {
282 quaternary = primary;
283 }
284
285 primary = secondary = tertiary = 0;
286 elems->pce->isShifted = TRUE;
287 } else {
288 if (elems->pce->strength >= UCOL_QUATERNARY) {
289 quaternary = 0xFFFF;
290 }
291
292 elems->pce->isShifted = FALSE;
293 }
294
295 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
296 }
297
298 U_CAPI void U_EXPORT2
299 uprv_init_pce(const UCollationElements *elems)
300 {
301 if (elems->pce != NULL) {
302 elems->pce->init(elems->iteratordata_.coll);
303 }
304 }
305
306
307
308 /* public methods ---------------------------------------------------- */
309
310 U_CAPI UCollationElements* U_EXPORT2
311 ucol_openElements(const UCollator *coll,
312 const UChar *text,
313 int32_t textLength,
314 UErrorCode *status)
315 {
316 if (U_FAILURE(*status)) {
317 return NULL;
318 }
319
320 UCollationElements *result = new UCollationElements;
321 if (result == NULL) {
322 *status = U_MEMORY_ALLOCATION_ERROR;
323 return NULL;
324 }
325
326 result->reset_ = TRUE;
327 result->isWritable = FALSE;
328 result->pce = NULL;
329
330 if (text == NULL) {
331 textLength = 0;
332 }
333 uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status);
334
335 return result;
336 }
337
338
339 U_CAPI void U_EXPORT2
340 ucol_closeElements(UCollationElements *elems)
341 {
342 if (elems != NULL) {
343 collIterate *ci = &elems->iteratordata_;
344
345 if (ci->extendCEs) {
346 uprv_free(ci->extendCEs);
347 }
348
349 if (ci->offsetBuffer) {
350 uprv_free(ci->offsetBuffer);
351 }
352
353 if (elems->isWritable && elems->iteratordata_.string != NULL)
354 {
355 uprv_free((UChar *)elems->iteratordata_.string);
356 }
357
358 if (elems->pce != NULL) {
359 delete elems->pce;
360 }
361
362 delete elems;
363 }
364 }
365
366 U_CAPI void U_EXPORT2
367 ucol_reset(UCollationElements *elems)
368 {
369 collIterate *ci = &(elems->iteratordata_);
370 elems->reset_ = TRUE;
371 ci->pos = ci->string;
372 if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
373 ci->endp = ci->string + u_strlen(ci->string);
374 }
375 ci->CEpos = ci->toReturn = ci->CEs;
376 ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
377 if (ci->coll->normalizationMode == UCOL_ON) {
378 ci->flags |= UCOL_ITER_NORM;
379 }
380
381 ci->writableBuffer.remove();
382 ci->fcdPosition = NULL;
383
384 //ci->offsetReturn = ci->offsetStore = NULL;
385 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
386 }
387
388 U_CAPI void U_EXPORT2
389 ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
390 {
391 if (U_FAILURE(*status)) {
392 return;
393 }
394
395 if (elems == NULL) {
396 *status = U_ILLEGAL_ARGUMENT_ERROR;
397 return;
398 }
399
400 elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
401 }
402
403 U_CAPI int32_t U_EXPORT2
404 ucol_next(UCollationElements *elems,
405 UErrorCode *status)
406 {
407 int32_t result;
408 if (U_FAILURE(*status)) {
409 return UCOL_NULLORDER;
410 }
411
412 elems->reset_ = FALSE;
413
414 result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll,
415 &elems->iteratordata_,
416 status);
417
418 if (result == UCOL_NO_MORE_CES) {
419 result = UCOL_NULLORDER;
420 }
421 return result;
422 }
423
424 U_CAPI int64_t U_EXPORT2
425 ucol_nextProcessed(UCollationElements *elems,
426 int32_t *ixLow,
427 int32_t *ixHigh,
428 UErrorCode *status)
429 {
430 const UCollator *coll = elems->iteratordata_.coll;
431 int64_t result = UCOL_IGNORABLE;
432 uint32_t low = 0, high = 0;
433
434 if (U_FAILURE(*status)) {
435 return UCOL_PROCESSED_NULLORDER;
436 }
437
438 if (elems->pce == NULL) {
439 elems->pce = new UCollationPCE(elems);
440 } else {
441 elems->pce->pceBuffer.reset();
442 }
443
444 elems->reset_ = FALSE;
445
446 do {
447 low = ucol_getOffset(elems);
448 uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
449 high = ucol_getOffset(elems);
450
451 if (ce == UCOL_NO_MORE_CES) {
452 result = UCOL_PROCESSED_NULLORDER;
453 break;
454 }
455
456 result = processCE(elems, ce);
457 } while (result == UCOL_IGNORABLE);
458
459 if (ixLow != NULL) {
460 *ixLow = low;
461 }
462
463 if (ixHigh != NULL) {
464 *ixHigh = high;
465 }
466
467 return result;
468 }
469
470 U_CAPI int32_t U_EXPORT2
471 ucol_previous(UCollationElements *elems,
472 UErrorCode *status)
473 {
474 if(U_FAILURE(*status)) {
475 return UCOL_NULLORDER;
476 }
477 else
478 {
479 int32_t result;
480
481 if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) {
482 if (elems->iteratordata_.endp == NULL) {
483 elems->iteratordata_.endp = elems->iteratordata_.string +
484 u_strlen(elems->iteratordata_.string);
485 elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
486 }
487 elems->iteratordata_.pos = elems->iteratordata_.endp;
488 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
489 }
490
491 elems->reset_ = FALSE;
492
493 result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll,
494 &(elems->iteratordata_),
495 status);
496
497 if (result == UCOL_NO_MORE_CES) {
498 result = UCOL_NULLORDER;
499 }
500
501 return result;
502 }
503 }
504
505 U_CAPI int64_t U_EXPORT2
506 ucol_previousProcessed(UCollationElements *elems,
507 int32_t *ixLow,
508 int32_t *ixHigh,
509 UErrorCode *status)
510 {
511 const UCollator *coll = elems->iteratordata_.coll;
512 int64_t result = UCOL_IGNORABLE;
513 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
514 // UCollationStrength strength = ucol_getStrength(coll);
515 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
516 // uint32_t variableTop = coll->variableTopValue;
517 int32_t low = 0, high = 0;
518
519 if (U_FAILURE(*status)) {
520 return UCOL_PROCESSED_NULLORDER;
521 }
522
523 if (elems->reset_ &&
524 (elems->iteratordata_.pos == elems->iteratordata_.string)) {
525 if (elems->iteratordata_.endp == NULL) {
526 elems->iteratordata_.endp = elems->iteratordata_.string +
527 u_strlen(elems->iteratordata_.string);
528 elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
529 }
530
531 elems->iteratordata_.pos = elems->iteratordata_.endp;
532 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
533 }
534
535 if (elems->pce == NULL) {
536 elems->pce = new UCollationPCE(elems);
537 } else {
538 //elems->pce->pceBuffer.reset();
539 }
540
541 elems->reset_ = FALSE;
542
543 while (elems->pce->pceBuffer.empty()) {
544 // buffer raw CEs up to non-ignorable primary
545 RCEBuffer rceb;
546 uint32_t ce;
547
548 // **** do we need to reset rceb, or will it always be empty at this point ****
549 do {
550 high = ucol_getOffset(elems);
551 ce = ucol_getPrevCE(coll, &elems->iteratordata_, status);
552 low = ucol_getOffset(elems);
553
554 if (ce == UCOL_NO_MORE_CES) {
555 if (! rceb.empty()) {
556 break;
557 }
558
559 goto finish;
560 }
561
562 rceb.put(ce, low, high);
563 } while ((ce & UCOL_PRIMARYMASK) == 0);
564
565 // process the raw CEs
566 while (! rceb.empty()) {
567 const RCEI *rcei = rceb.get();
568
569 result = processCE(elems, rcei->ce);
570
571 if (result != UCOL_IGNORABLE) {
572 elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
573 }
574 }
575 }
576
577 finish:
578 if (elems->pce->pceBuffer.empty()) {
579 // **** Is -1 the right value for ixLow, ixHigh? ****
580 if (ixLow != NULL) {
581 *ixLow = -1;
582 }
583
584 if (ixHigh != NULL) {
585 *ixHigh = -1
586 ;
587 }
588 return UCOL_PROCESSED_NULLORDER;
589 }
590
591 const PCEI *pcei = elems->pce->pceBuffer.get();
592
593 if (ixLow != NULL) {
594 *ixLow = pcei->low;
595 }
596
597 if (ixHigh != NULL) {
598 *ixHigh = pcei->high;
599 }
600
601 return pcei->ce;
602 }
603
604 U_CAPI int32_t U_EXPORT2
605 ucol_getMaxExpansion(const UCollationElements *elems,
606 int32_t order)
607 {
608 uint8_t result;
609
610 #if 0
611 UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
612 #else
613 const UCollator *coll = elems->iteratordata_.coll;
614 const uint32_t *start;
615 const uint32_t *limit;
616 const uint32_t *mid;
617 uint32_t strengthMask = 0;
618 uint32_t mOrder = (uint32_t) order;
619
620 switch (coll->strength)
621 {
622 default:
623 strengthMask |= UCOL_TERTIARYORDERMASK;
624 /* fall through */
625
626 case UCOL_SECONDARY:
627 strengthMask |= UCOL_SECONDARYORDERMASK;
628 /* fall through */
629
630 case UCOL_PRIMARY:
631 strengthMask |= UCOL_PRIMARYORDERMASK;
632 }
633
634 mOrder &= strengthMask;
635 start = (coll)->endExpansionCE;
636 limit = (coll)->lastEndExpansionCE;
637
638 while (start < limit - 1) {
639 mid = start + ((limit - start) >> 1);
640 if (mOrder <= (*mid & strengthMask)) {
641 limit = mid;
642 } else {
643 start = mid;
644 }
645 }
646
647 // FIXME: with a masked search, there might be more than one hit,
648 // so we need to look forward and backward from the match to find all
649 // of the hits...
650 if ((*start & strengthMask) == mOrder) {
651 result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));
652 } else if ((*limit & strengthMask) == mOrder) {
653 result = *(coll->expansionCESize + (limit - coll->endExpansionCE));
654 } else if ((mOrder & 0xFFFF) == 0x00C0) {
655 result = 2;
656 } else {
657 result = 1;
658 }
659 #endif
660
661 return result;
662 }
663
664 U_CAPI void U_EXPORT2
665 ucol_setText( UCollationElements *elems,
666 const UChar *text,
667 int32_t textLength,
668 UErrorCode *status)
669 {
670 if (U_FAILURE(*status)) {
671 return;
672 }
673
674 if (elems->isWritable && elems->iteratordata_.string != NULL)
675 {
676 uprv_free((UChar *)elems->iteratordata_.string);
677 }
678
679 if (text == NULL) {
680 textLength = 0;
681 }
682
683 elems->isWritable = FALSE;
684
685 /* free offset buffer to avoid memory leak before initializing. */
686 ucol_freeOffsetBuffer(&(elems->iteratordata_));
687 /* Ensure that previously allocated extendCEs is freed before setting to NULL. */
688 if (elems->iteratordata_.extendCEs != NULL) {
689 uprv_free(elems->iteratordata_.extendCEs);
690 }
691 uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
692 &elems->iteratordata_, status);
693
694 elems->reset_ = TRUE;
695 }
696
697 U_CAPI int32_t U_EXPORT2
698 ucol_getOffset(const UCollationElements *elems)
699 {
700 const collIterate *ci = &(elems->iteratordata_);
701
702 if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
703 return ci->offsetRepeatValue;
704 }
705
706 if (ci->offsetReturn != NULL) {
707 return *ci->offsetReturn;
708 }
709
710 // while processing characters in normalization buffer getOffset will
711 // return the next non-normalized character.
712 // should be inline with the old implementation since the old codes uses
713 // nextDecomp in normalizer which also decomposes the string till the
714 // first base character is found.
715 if (ci->flags & UCOL_ITER_INNORMBUF) {
716 if (ci->fcdPosition == NULL) {
717 return 0;
718 }
719 return (int32_t)(ci->fcdPosition - ci->string);
720 }
721 else {
722 return (int32_t)(ci->pos - ci->string);
723 }
724 }
725
726 U_CAPI void U_EXPORT2
727 ucol_setOffset(UCollationElements *elems,
728 int32_t offset,
729 UErrorCode *status)
730 {
731 if (U_FAILURE(*status)) {
732 return;
733 }
734
735 // this methods will clean up any use of the writable buffer and points to
736 // the original string
737 collIterate *ci = &(elems->iteratordata_);
738 ci->pos = ci->string + offset;
739 ci->CEpos = ci->toReturn = ci->CEs;
740 if (ci->flags & UCOL_ITER_INNORMBUF) {
741 ci->flags = ci->origFlags;
742 }
743 if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
744 ci->endp = ci->string + u_strlen(ci->string);
745 ci->flags |= UCOL_ITER_HASLEN;
746 }
747 ci->fcdPosition = NULL;
748 elems->reset_ = FALSE;
749
750 ci->offsetReturn = NULL;
751 ci->offsetStore = ci->offsetBuffer;
752 ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
753 }
754
755 U_CAPI int32_t U_EXPORT2
756 ucol_primaryOrder (int32_t order)
757 {
758 order &= UCOL_PRIMARYMASK;
759 return (order >> UCOL_PRIMARYORDERSHIFT);
760 }
761
762 U_CAPI int32_t U_EXPORT2
763 ucol_secondaryOrder (int32_t order)
764 {
765 order &= UCOL_SECONDARYMASK;
766 return (order >> UCOL_SECONDARYORDERSHIFT);
767 }
768
769 U_CAPI int32_t U_EXPORT2
770 ucol_tertiaryOrder (int32_t order)
771 {
772 return (order & UCOL_TERTIARYMASK);
773 }
774
775
776 void ucol_freeOffsetBuffer(collIterate *s) {
777 if (s != NULL && s->offsetBuffer != NULL) {
778 uprv_free(s->offsetBuffer);
779 s->offsetBuffer = NULL;
780 s->offsetBufferSize = 0;
781 }
782 }
783
784 #endif /* #if !UCONFIG_NO_COLLATION */