]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / i18n / ucol.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_COLLATION
22
23 #include "unicode/coleitr.h"
24 #include "unicode/unorm.h"
25 #include "unicode/udata.h"
26 #include "unicode/ustring.h"
27
28 #include "ucol_imp.h"
29 #include "bocsu.h"
30
31 #include "unormimp.h"
32 #include "unorm_it.h"
33 #include "umutex.h"
34 #include "cmemory.h"
35 #include "ucln_in.h"
36 #include "cstring.h"
37 #include "utracimp.h"
38 #include "putilimp.h"
39 #include "uassert.h"
40
41 #ifdef UCOL_DEBUG
42 #include <stdio.h>
43 #endif
44
45 U_NAMESPACE_USE
46
47 /* added by synwee for trie manipulation*/
/* Shift and mask constants. None of these macros is referenced in the part of
 * the file visible to this review, so their exact use is not documented here. */
48 #define STAGE_1_SHIFT_ 10
49 #define STAGE_2_SHIFT_ 4
50 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
51 #define STAGE_3_MASK_ 0xF
52 #define LAST_BYTE_MASK_ 0xFF
53 #define SECOND_LAST_BYTE_SHIFT_ 8
54
55 #define ZERO_CC_LIMIT_ 0xC0
56
57 // This is a static pointer to the normalizer's FCD trie index.
58 // It is always the same between calls to u_cleanup
59 // and therefore writing to it is not synchronized.
60 // It is reset to NULL in ucol_cleanup.
// It is filled in lazily by ucol_initCollator(), which calls unorm_getFCDTrie()
// the first time it finds the pointer NULL.
61 static const uint16_t *fcdTrieIndex=NULL;
62
63 // These are values from UCA required for
64 // implicit generation and suppressing sort key compression.
65 // They should regularly be in the UCA, but if one
66 // is running without UCA, it could be a problem.
67 static const int32_t maxRegularPrimary = 0xA0;
68 static const int32_t minImplicitPrimary = 0xE0;
69 static const int32_t maxImplicitPrimary = 0xE4;
70
71 U_CDECL_BEGIN
72 static UBool U_CALLCONV
73 ucol_cleanup(void)
74 {
75 fcdTrieIndex = NULL;
76 return TRUE;
77 }
78
79 static int32_t U_CALLCONV
80 _getFoldingOffset(uint32_t data) {
81 return (int32_t)(data&0xFFFFFF);
82 }
83
84 U_CDECL_END
85
86 static
87 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
88 int32_t sourceLen, collIterate *s)
89 {
90 (s)->string = (s)->pos = (UChar *)(sourceString);
91 (s)->origFlags = 0;
92 (s)->flags = 0;
93 if (sourceLen >= 0) {
94 s->flags |= UCOL_ITER_HASLEN;
95 (s)->endp = (UChar *)sourceString+sourceLen;
96 }
97 else {
98 /* change to enable easier checking for end of string for fcdpositon */
99 (s)->endp = NULL;
100 }
101 (s)->extendCEs = NULL;
102 (s)->extendCEsSize = 0;
103 (s)->CEpos = (s)->toReturn = (s)->CEs;
104 (s)->offsetBuffer = NULL;
105 (s)->offsetBufferSize = 0;
106 (s)->offsetReturn = (s)->offsetStore = NULL;
107 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
108 (s)->writableBuffer = (s)->stackWritableBuffer;
109 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
110 (s)->coll = (collator);
111 (s)->fcdPosition = 0;
112 if(collator->normalizationMode == UCOL_ON) {
113 (s)->flags |= UCOL_ITER_NORM;
114 }
115 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
116 (s)->flags |= UCOL_HIRAGANA_Q;
117 }
118 (s)->iterator = NULL;
119 //(s)->iteratorIndex = 0;
120 }
121
122 U_CAPI void U_EXPORT2
123 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
124 int32_t sourceLen, collIterate *s){
125 /* Out-of-line version for use from other files. */
126 IInit_collIterate(collator, sourceString, sourceLen, s);
127 }
128
129
130 /**
131 * Backup the state of the collIterate struct data
132 * @param data collIterate to backup
133 * @param backup storage
134 */
135 static
136 inline void backupState(const collIterate *data, collIterateState *backup)
137 {
138 backup->fcdPosition = data->fcdPosition;
139 backup->flags = data->flags;
140 backup->origFlags = data->origFlags;
141 backup->pos = data->pos;
142 backup->bufferaddress = data->writableBuffer;
143 backup->buffersize = data->writableBufSize;
144 backup->iteratorMove = 0;
145 backup->iteratorIndex = 0;
146 if(data->iterator != NULL) {
147 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
148 backup->iteratorIndex = data->iterator->getState(data->iterator);
149 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
150 if(backup->iteratorIndex == UITER_NO_STATE) {
151 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
152 backup->iteratorMove++;
153 data->iterator->move(data->iterator, -1, UITER_CURRENT);
154 }
155 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
156 }
157 }
158 }
159
160 /**
161 * Loads the state into the collIterate struct data
162 * @param data collIterate to backup
163 * @param backup storage
164 * @param forwards boolean to indicate if forwards iteration is used,
165 * false indicates backwards iteration
166 */
167 static
168 inline void loadState(collIterate *data, const collIterateState *backup,
169 UBool forwards)
170 {
171 UErrorCode status = U_ZERO_ERROR;
172 data->flags = backup->flags;
173 data->origFlags = backup->origFlags;
174 if(data->iterator != NULL) {
175 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
176 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
177 if(backup->iteratorMove != 0) {
178 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
179 }
180 }
181 data->pos = backup->pos;
182
183 if ((data->flags & UCOL_ITER_INNORMBUF) &&
184 data->writableBuffer != backup->bufferaddress) {
185 /*
186 this is when a new buffer has been reallocated and we'll have to
187 calculate the new position.
188 note the new buffer has to contain the contents of the old buffer.
189 */
190 if (forwards) {
191 data->pos = data->writableBuffer +
192 (data->pos - backup->bufferaddress);
193 }
194 else {
195 /* backwards direction */
196 uint32_t temp = backup->buffersize -
197 (data->pos - backup->bufferaddress);
198 data->pos = data->writableBuffer + (data->writableBufSize - temp);
199 }
200 }
201 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
202 /*
203 this is alittle tricky.
204 if we are initially not in the normalization buffer, even if we
205 normalize in the later stage, the data in the buffer will be
206 ignored, since we skip back up to the data string.
207 however if we are already in the normalization buffer, any
208 further normalization will pull data into the normalization
209 buffer and modify the fcdPosition.
210 since we are keeping the data in the buffer for use, the
211 fcdPosition can not be reverted back.
212 arrgghh....
213 */
214 data->fcdPosition = backup->fcdPosition;
215 }
216 }
217
218
219 /*
220 * collIter_eos()
221 * Checks for a collIterate being positioned at the end of
222 * its source string.
223 *
224 */
225 static
226 inline UBool collIter_eos(collIterate *s) {
227 if(s->flags & UCOL_USE_ITERATOR) {
228 return !(s->iterator->hasNext(s->iterator));
229 }
230 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
231 // Null terminated string, but not at null, so not at end.
232 // Whether in main or normalization buffer doesn't matter.
233 return FALSE;
234 }
235
236 // String with length. Can't be in normalization buffer, which is always
237 // null termintated.
238 if (s->flags & UCOL_ITER_HASLEN) {
239 return (s->pos == s->endp);
240 }
241
242 // We are at a null termination, could be either normalization buffer or main string.
243 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
244 // At null at end of main string.
245 return TRUE;
246 }
247
248 // At null at end of normalization buffer. Need to check whether there there are
249 // any characters left in the main buffer.
250 if(s->origFlags & UCOL_USE_ITERATOR) {
251 return !(s->iterator->hasNext(s->iterator));
252 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
253 // Null terminated main string. fcdPosition is the 'return' position into main buf.
254 return (*s->fcdPosition == 0);
255 }
256 else {
257 // Main string with an end pointer.
258 return s->fcdPosition == s->endp;
259 }
260 }
261
262 /*
263 * collIter_bos()
264 * Checks for a collIterate being positioned at the start of
265 * its source string.
266 *
267 */
268 static
269 inline UBool collIter_bos(collIterate *source) {
270 // if we're going backwards, we need to know whether there is more in the
271 // iterator, even if we are in the side buffer
272 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
273 return !source->iterator->hasPrevious(source->iterator);
274 }
275 if (source->pos <= source->string ||
276 ((source->flags & UCOL_ITER_INNORMBUF) &&
277 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
278 return TRUE;
279 }
280 return FALSE;
281 }
282
283 /*static
284 inline UBool collIter_SimpleBos(collIterate *source) {
285 // if we're going backwards, we need to know whether there is more in the
286 // iterator, even if we are in the side buffer
287 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
288 return !source->iterator->hasPrevious(source->iterator);
289 }
290 if (source->pos == source->string) {
291 return TRUE;
292 }
293 return FALSE;
294 }*/
295 //return (data->pos == data->string) ||
296
297
298 /**
299 * Checks and free writable buffer if it is not the original stack buffer
300 * in collIterate. This function does not reassign the writable buffer.
301 * @param data collIterate struct to determine and free the writable buffer
302 */
303 static
304 inline void freeHeapWritableBuffer(collIterate *data)
305 {
306 if (data->writableBuffer != data->stackWritableBuffer) {
307 uprv_free(data->writableBuffer);
308 }
309 }
310
311
312 /****************************************************************************/
313 /* Following are the open/close functions */
314 /* */
315 /****************************************************************************/
316
317 static UCollator*
318 ucol_initFromBinary(const uint8_t *bin, int32_t length,
319 const UCollator *base,
320 UCollator *fillIn,
321 UErrorCode *status)
322 {
323 UCollator *result = fillIn;
324 if(U_FAILURE(*status)) {
325 return NULL;
326 }
327 /*
328 if(base == NULL) {
329 // we don't support null base yet
330 *status = U_ILLEGAL_ARGUMENT_ERROR;
331 return NULL;
332 }
333 */
334 // We need these and we could be running without UCA
335 uprv_uca_initImplicitConstants(status);
336 UCATableHeader *colData = (UCATableHeader *)bin;
337 // do we want version check here? We're trying to figure out whether collators are compatible
338 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
339 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
340 colData->version[0] != UCOL_BUILDER_VERSION)
341 {
342 *status = U_COLLATOR_VERSION_MISMATCH;
343 return NULL;
344 }
345 else {
346 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
347 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
348 if(U_FAILURE(*status)){
349 return NULL;
350 }
351 result->hasRealData = TRUE;
352 }
353 else {
354 if(base) {
355 result = ucol_initCollator(base->image, result, base, status);
356 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
357 if(U_FAILURE(*status)){
358 return NULL;
359 }
360 result->hasRealData = FALSE;
361 }
362 else {
363 *status = U_USELESS_COLLATOR_ERROR;
364 return NULL;
365 }
366 }
367 result->freeImageOnClose = FALSE;
368 }
369 result->actualLocale = NULL;
370 result->validLocale = NULL;
371 result->requestedLocale = NULL;
372 result->rules = NULL;
373 result->rulesLength = 0;
374 result->freeRulesOnClose = FALSE;
375 result->ucaRules = NULL;
376 return result;
377 }
378
379 U_CAPI UCollator* U_EXPORT2
380 ucol_openBinary(const uint8_t *bin, int32_t length,
381 const UCollator *base,
382 UErrorCode *status)
383 {
384 return ucol_initFromBinary(bin, length, base, NULL, status);
385 }
386
387 U_CAPI int32_t U_EXPORT2
388 ucol_cloneBinary(const UCollator *coll,
389 uint8_t *buffer, int32_t capacity,
390 UErrorCode *status)
391 {
392 int32_t length = 0;
393 if(U_FAILURE(*status)) {
394 return length;
395 }
396 if(capacity < 0) {
397 *status = U_ILLEGAL_ARGUMENT_ERROR;
398 return length;
399 }
400 if(coll->hasRealData == TRUE) {
401 length = coll->image->size;
402 if(length <= capacity) {
403 uprv_memcpy(buffer, coll->image, length);
404 } else {
405 *status = U_BUFFER_OVERFLOW_ERROR;
406 }
407 } else {
408 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
409 if(length <= capacity) {
410 /* build the UCATableHeader with minimal entries */
411 /* do not copy the header from the UCA file because its values are wrong! */
412 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
413
414 /* reset everything */
415 uprv_memset(buffer, 0, length);
416
417 /* set the tailoring-specific values */
418 UCATableHeader *myData = (UCATableHeader *)buffer;
419 myData->size = length;
420
421 /* offset for the options, the only part of the data that is present after the header */
422 myData->options = sizeof(UCATableHeader);
423
424 /* need to always set the expansion value for an upper bound of the options */
425 myData->expansion = myData->options + sizeof(UColOptionSet);
426
427 myData->magic = UCOL_HEADER_MAGIC;
428 myData->isBigEndian = U_IS_BIG_ENDIAN;
429 myData->charSetFamily = U_CHARSET_FAMILY;
430
431 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
432 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
433
434 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
435 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
436 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
437 myData->jamoSpecial = coll->image->jamoSpecial;
438
439 /* copy the collator options */
440 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
441 } else {
442 *status = U_BUFFER_OVERFLOW_ERROR;
443 }
444 }
445 return length;
446 }
447
448 U_CAPI UCollator* U_EXPORT2
449 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
450 {
451 UCollator * localCollator;
452 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
453 char *stackBufferChars = (char *)stackBuffer;
454 int32_t imageSize = 0;
455 int32_t rulesSize = 0;
456 int32_t rulesPadding = 0;
457 uint8_t *image;
458 UChar *rules;
459 UBool colAllocated = FALSE;
460 UBool imageAllocated = FALSE;
461
462 if (status == NULL || U_FAILURE(*status)){
463 return 0;
464 }
465 if ((stackBuffer && !pBufferSize) || !coll){
466 *status = U_ILLEGAL_ARGUMENT_ERROR;
467 return 0;
468 }
469 if (coll->rules && coll->freeRulesOnClose) {
470 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
471 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
472 bufferSizeNeeded += rulesSize + rulesPadding;
473 }
474
475 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
476 *pBufferSize = bufferSizeNeeded;
477 return 0;
478 }
479
480 /* Pointers on 64-bit platforms need to be aligned
481 * on a 64-bit boundry in memory.
482 */
483 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
484 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
485 if (*pBufferSize > offsetUp) {
486 *pBufferSize -= offsetUp;
487 stackBufferChars += offsetUp;
488 }
489 else {
490 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
491 *pBufferSize = 1;
492 }
493 }
494 stackBuffer = (void *)stackBufferChars;
495
496 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
497 /* allocate one here...*/
498 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
499 // Null pointer check.
500 if (stackBufferChars == NULL) {
501 *status = U_MEMORY_ALLOCATION_ERROR;
502 return NULL;
503 }
504 colAllocated = TRUE;
505 if (U_SUCCESS(*status)) {
506 *status = U_SAFECLONE_ALLOCATED_WARNING;
507 }
508 }
509 localCollator = (UCollator *)stackBufferChars;
510 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
511 {
512 UErrorCode tempStatus = U_ZERO_ERROR;
513 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
514 }
515 if (coll->freeImageOnClose) {
516 image = (uint8_t *)uprv_malloc(imageSize);
517 // Null pointer check
518 if (image == NULL) {
519 *status = U_MEMORY_ALLOCATION_ERROR;
520 return NULL;
521 }
522 ucol_cloneBinary(coll, image, imageSize, status);
523 imageAllocated = TRUE;
524 }
525 else {
526 image = (uint8_t *)coll->image;
527 }
528 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
529 if (U_FAILURE(*status)) {
530 return NULL;
531 }
532
533 if (coll->rules) {
534 if (coll->freeRulesOnClose) {
535 localCollator->rules = u_strcpy(rules, coll->rules);
536 //bufferEnd += rulesSize;
537 }
538 else {
539 localCollator->rules = coll->rules;
540 }
541 localCollator->freeRulesOnClose = FALSE;
542 localCollator->rulesLength = coll->rulesLength;
543 }
544
545 int32_t i;
546 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
547 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
548 }
549 // zero copies of pointers
550 localCollator->actualLocale = NULL;
551 localCollator->validLocale = NULL;
552 localCollator->requestedLocale = NULL;
553 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
554 localCollator->freeOnClose = colAllocated;
555 localCollator->freeImageOnClose = imageAllocated;
556 return localCollator;
557 }
558
559 U_CAPI void U_EXPORT2
560 ucol_close(UCollator *coll)
561 {
562 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
563 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
564 if(coll != NULL) {
565 // these are always owned by each UCollator struct,
566 // so we always free them
567 if(coll->validLocale != NULL) {
568 uprv_free(coll->validLocale);
569 }
570 if(coll->actualLocale != NULL) {
571 uprv_free(coll->actualLocale);
572 }
573 if(coll->requestedLocale != NULL) {
574 uprv_free(coll->requestedLocale);
575 }
576 if(coll->latinOneCEs != NULL) {
577 uprv_free(coll->latinOneCEs);
578 }
579 if(coll->options != NULL && coll->freeOptionsOnClose) {
580 uprv_free(coll->options);
581 }
582 if(coll->rules != NULL && coll->freeRulesOnClose) {
583 uprv_free((UChar *)coll->rules);
584 }
585 if(coll->image != NULL && coll->freeImageOnClose) {
586 uprv_free((UCATableHeader *)coll->image);
587 }
588
589 /* Here, it would be advisable to close: */
590 /* - UData for UCA (unless we stuff it in the root resb */
591 /* Again, do we need additional housekeeping... HMMM! */
592 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
593 if(coll->freeOnClose){
594 /* for safeClone, if freeOnClose is FALSE,
595 don't free the other instance data */
596 uprv_free(coll);
597 }
598 }
599 UTRACE_EXIT();
600 }
601
602 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
603 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
604 U_CFUNC uint8_t* U_EXPORT2
605 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
606 {
607 uint8_t *result = NULL;
608 if(U_FAILURE(*status)) {
609 return NULL;
610 }
611 if(coll->hasRealData == TRUE) {
612 *length = coll->image->size;
613 result = (uint8_t *)uprv_malloc(*length);
614 /* test for NULL */
615 if (result == NULL) {
616 *status = U_MEMORY_ALLOCATION_ERROR;
617 return NULL;
618 }
619 uprv_memcpy(result, coll->image, *length);
620 } else {
621 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
622 result = (uint8_t *)uprv_malloc(*length);
623 /* test for NULL */
624 if (result == NULL) {
625 *status = U_MEMORY_ALLOCATION_ERROR;
626 return NULL;
627 }
628
629 /* build the UCATableHeader with minimal entries */
630 /* do not copy the header from the UCA file because its values are wrong! */
631 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
632
633 /* reset everything */
634 uprv_memset(result, 0, *length);
635
636 /* set the tailoring-specific values */
637 UCATableHeader *myData = (UCATableHeader *)result;
638 myData->size = *length;
639
640 /* offset for the options, the only part of the data that is present after the header */
641 myData->options = sizeof(UCATableHeader);
642
643 /* need to always set the expansion value for an upper bound of the options */
644 myData->expansion = myData->options + sizeof(UColOptionSet);
645
646 myData->magic = UCOL_HEADER_MAGIC;
647 myData->isBigEndian = U_IS_BIG_ENDIAN;
648 myData->charSetFamily = U_CHARSET_FAMILY;
649
650 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
651 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
652
653 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
654 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
655 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
656 myData->jamoSpecial = coll->image->jamoSpecial;
657
658 /* copy the collator options */
659 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
660 }
661 return result;
662 }
663
664 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
665 if(U_FAILURE(*status)) {
666 return;
667 }
668 result->caseFirst = (UColAttributeValue)opts->caseFirst;
669 result->caseLevel = (UColAttributeValue)opts->caseLevel;
670 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
671 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
672 result->strength = (UColAttributeValue)opts->strength;
673 result->variableTopValue = opts->variableTopValue;
674 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
675 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
676 result->numericCollation = (UColAttributeValue)opts->numericCollation;
677
678 result->caseFirstisDefault = TRUE;
679 result->caseLevelisDefault = TRUE;
680 result->frenchCollationisDefault = TRUE;
681 result->normalizationModeisDefault = TRUE;
682 result->strengthisDefault = TRUE;
683 result->variableTopValueisDefault = TRUE;
684 result->hiraganaQisDefault = TRUE;
685 result->numericCollationisDefault = TRUE;
686
687 ucol_updateInternalState(result, status);
688
689 result->options = opts;
690 }
691
692
693 /**
694 * Approximate determination if a character is at a contraction end.
695 * Guaranteed to be TRUE if a character is at the end of a contraction,
696 * otherwise it is not deterministic.
697 * @param c character to be determined
698 * @param coll collator
699 */
700 static
701 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
702 if (c < coll->minContrEndCP) {
703 return FALSE;
704 }
705
706 int32_t hash = c;
707 uint8_t htbyte;
708 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
709 if (U16_IS_TRAIL(c)) {
710 return TRUE;
711 }
712 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
713 }
714 htbyte = coll->contrEndCP[hash>>3];
715 return (((htbyte >> (hash & 7)) & 1) == 1);
716 }
717
718
719
720 /*
721 * i_getCombiningClass()
722 * A fast, at least partly inline version of u_getCombiningClass()
723 * This is a candidate for further optimization. Used heavily
724 * in contraction processing.
725 */
726 static
727 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
728 uint8_t sCC = 0;
729 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
730 sCC = u_getCombiningClass(c);
731 }
732 return sCC;
733 }
734
735 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
736 UChar c;
737 UCollator *result = fillIn;
738 if(U_FAILURE(*status) || image == NULL) {
739 return NULL;
740 }
741
742 if(result == NULL) {
743 result = (UCollator *)uprv_malloc(sizeof(UCollator));
744 if(result == NULL) {
745 *status = U_MEMORY_ALLOCATION_ERROR;
746 return result;
747 }
748 result->freeOnClose = TRUE;
749 } else {
750 result->freeOnClose = FALSE;
751 }
752
753 // init FCD data
754 if (fcdTrieIndex == NULL) {
755 // The result is constant, until the library is reloaded.
756 fcdTrieIndex = unorm_getFCDTrie(status);
757 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
758 }
759
760 result->image = image;
761 result->mapping.getFoldingOffset = _getFoldingOffset;
762 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
763 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
764 if(U_FAILURE(*status)) {
765 if(result->freeOnClose == TRUE) {
766 uprv_free(result);
767 result = NULL;
768 }
769 return result;
770 }
771
772 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
773 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
774 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
775 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
776 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
777
778 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
779 result->freeOptionsOnClose = FALSE;
780
781 /* set attributes */
782 result->caseFirst = (UColAttributeValue)result->options->caseFirst;
783 result->caseLevel = (UColAttributeValue)result->options->caseLevel;
784 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
785 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
786 result->strength = (UColAttributeValue)result->options->strength;
787 result->variableTopValue = result->options->variableTopValue;
788 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
789 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
790 result->numericCollation = (UColAttributeValue)result->options->numericCollation;
791
792 result->caseFirstisDefault = TRUE;
793 result->caseLevelisDefault = TRUE;
794 result->frenchCollationisDefault = TRUE;
795 result->normalizationModeisDefault = TRUE;
796 result->strengthisDefault = TRUE;
797 result->variableTopValueisDefault = TRUE;
798 result->alternateHandlingisDefault = TRUE;
799 result->hiraganaQisDefault = TRUE;
800 result->numericCollationisDefault = TRUE;
801
802 /*result->scriptOrder = NULL;*/
803
804 result->rules = NULL;
805 result->rulesLength = 0;
806 result->freeRulesOnClose = FALSE;
807
808 /* get the version info from UCATableHeader and populate the Collator struct*/
809 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
810 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
811 result->dataVersion[2] = 0;
812 result->dataVersion[3] = 0;
813
814 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
815 result->minUnsafeCP = 0;
816 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
817 if (ucol_unsafeCP(c, result)) break;
818 }
819 result->minUnsafeCP = c;
820
821 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
822 result->minContrEndCP = 0;
823 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
824 if (ucol_contractionEndCP(c, result)) break;
825 }
826 result->minContrEndCP = c;
827
828 /* max expansion tables */
829 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
830 result->image->endExpansionCE);
831 result->lastEndExpansionCE = result->endExpansionCE +
832 result->image->endExpansionCECount - 1;
833 result->expansionCESize = (uint8_t*)result->image +
834 result->image->expansionCESize;
835
836
837 //result->errorCode = *status;
838
839 result->latinOneCEs = NULL;
840
841 result->latinOneRegenTable = FALSE;
842 result->latinOneFailed = FALSE;
843 result->UCA = UCA;
844
845 ucol_updateInternalState(result, status);
846
847 /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
848 result->ucaRules = NULL;
849 result->actualLocale = NULL;
850 result->validLocale = NULL;
851 result->requestedLocale = NULL;
852 result->hasRealData = FALSE; // real data lives in .dat file...
853 result->freeImageOnClose = FALSE;
854
855 return result;
856 }
857
858 /* new Mark's code */
859
860 /**
861 * For generation of Implicit CEs
862 * @author Davis
863 *
864 * Cleaned up so that changes can be made more easily.
865 * Old values:
866 # First Implicit: E26A792D
867 # Last Implicit: E3DC70C0
868 # First CJK: E0030300
869 # Last CJK: E0A9DD00
870 # First CJK_A: E0A9DF00
871 # Last CJK_A: E0DE3100
872 */
873 /* Following is a port of Mark's code for new treatment of implicits.
874 * It is positioned here, since ucol_initUCA need to initialize the
875 * variables below according to the data in the fractional UCA.
876 */
877
878 /**
879 * Function used to:
880 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
881 * b) bump any non-CJK characters by 10FFFF.
882 * The relevant blocks are:
883 * A: 4E00..9FFF; CJK Unified Ideographs
884 * F900..FAFF; CJK Compatibility Ideographs
885 * B: 3400..4DBF; CJK Unified Ideographs Extension A
886 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
887 * As long as
888 * no new B characters are allocated between 4E00 and FAFF, and
889 * no new A characters are outside of this range,
890 * (very high probability) this simple code will work.
891 * The reordered blocks are:
892 * Block1 is CJK
893 * Block2 is CJK_COMPAT_USED
894 * Block3 is CJK_A
895 * (all contiguous)
896 * Any other CJK gets its normal code point
897 * Any non-CJK gets +10FFFF
898 * When we reorder Block1, we make sure that it is at the very start,
899 * so that it will use a 3-byte form.
900 * Warning: we only pick up the compatibility characters that are
901 * NOT decomposed, so that block is smaller!
902 */
903
904 // CONSTANTS
905 static const UChar32
906 NON_CJK_OFFSET = 0x110000,
907 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
908
909 /**
910 * Precomputed by constructor
911 */
912 static int32_t
913 final3Multiplier = 0,
914 final4Multiplier = 0,
915 final3Count = 0,
916 final4Count = 0,
917 medialCount = 0,
918 min3Primary = 0,
919 min4Primary = 0,
920 max4Primary = 0,
921 minTrail = 0,
922 maxTrail = 0,
923 max3Trail = 0,
924 max4Trail = 0,
925 min4Boundary = 0;
926
927 static const UChar32
928 CJK_BASE = 0x4E00,
929 CJK_LIMIT = 0x9FFF+1,
930 CJK_COMPAT_USED_BASE = 0xFA0E,
931 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
932 CJK_A_BASE = 0x3400,
933 CJK_A_LIMIT = 0x4DBF+1,
934 CJK_B_BASE = 0x20000,
935 CJK_B_LIMIT = 0x2A6DF+1;
936
937 static UChar32 swapCJK(UChar32 i) {
938
939 if (i >= CJK_BASE) {
940 if (i < CJK_LIMIT) return i - CJK_BASE;
941
942 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
943
944 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
945 + (CJK_LIMIT - CJK_BASE);
946 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
947
948 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
949
950 return i + NON_CJK_OFFSET; // non-CJK
951 }
952 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
953
954 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
955 + (CJK_LIMIT - CJK_BASE)
956 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
957 return i + NON_CJK_OFFSET; // non-CJK
958 }
959
960 U_CAPI UChar32 U_EXPORT2
961 uprv_uca_getRawFromCodePoint(UChar32 i) {
962 return swapCJK(i)+1;
963 }
964
965 U_CAPI UChar32 U_EXPORT2
966 uprv_uca_getCodePointFromRaw(UChar32 i) {
967 i--;
968 UChar32 result = 0;
969 if(i >= NON_CJK_OFFSET) {
970 result = i - NON_CJK_OFFSET;
971 } else if(i >= CJK_B_BASE) {
972 result = i;
973 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
974 if(i < CJK_LIMIT - CJK_BASE) {
975 result = i + CJK_BASE;
976 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
977 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
978 } else {
979 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
980 }
981 } else {
982 result = -1;
983 }
984 return result;
985 }
986
987 // GET IMPLICIT PRIMARY WEIGHTS
988 // Return value is left justified primary key
989 U_CAPI uint32_t U_EXPORT2
990 uprv_uca_getImplicitFromRaw(UChar32 cp) {
991 /*
992 if (cp < 0 || cp > UCOL_MAX_INPUT) {
993 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
994 }
995 */
996 int32_t last0 = cp - min4Boundary;
997 if (last0 < 0) {
998 int32_t last1 = cp / final3Count;
999 last0 = cp % final3Count;
1000
1001 int32_t last2 = last1 / medialCount;
1002 last1 %= medialCount;
1003
1004 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1005 last1 = minTrail + last1; // offset
1006 last2 = min3Primary + last2; // offset
1007 /*
1008 if (last2 >= min4Primary) {
1009 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1010 }
1011 */
1012 return (last2 << 24) + (last1 << 16) + (last0 << 8);
1013 } else {
1014 int32_t last1 = last0 / final4Count;
1015 last0 %= final4Count;
1016
1017 int32_t last2 = last1 / medialCount;
1018 last1 %= medialCount;
1019
1020 int32_t last3 = last2 / medialCount;
1021 last2 %= medialCount;
1022
1023 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1024 last1 = minTrail + last1; // offset
1025 last2 = minTrail + last2; // offset
1026 last3 = min4Primary + last3; // offset
1027 /*
1028 if (last3 > max4Primary) {
1029 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1030 }
1031 */
1032 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1033 }
1034 }
1035
1036 static uint32_t U_EXPORT2
1037 uprv_uca_getImplicitPrimary(UChar32 cp) {
1038 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1039
1040 cp = swapCJK(cp);
1041 cp++;
1042 // we now have a range of numbers from 0 to 21FFFF.
1043
1044 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1045
1046 return uprv_uca_getImplicitFromRaw(cp);
1047 }
1048
1049 /**
1050 * Converts implicit CE into raw integer ("code point")
1051 * @param implicit
1052 * @return -1 if illegal format
1053 */
1054 U_CAPI UChar32 U_EXPORT2
1055 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1056 UChar32 result;
1057 UChar32 b3 = implicit & 0xFF;
1058 UChar32 b2 = (implicit >> 8) & 0xFF;
1059 UChar32 b1 = (implicit >> 16) & 0xFF;
1060 UChar32 b0 = (implicit >> 24) & 0xFF;
1061
1062 // simple parameter checks
1063 if (b0 < min3Primary || b0 > max4Primary
1064 || b1 < minTrail || b1 > maxTrail)
1065 return -1;
1066 // normal offsets
1067 b1 -= minTrail;
1068
1069 // take care of the final values, and compose
1070 if (b0 < min4Primary) {
1071 if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1072 return -1;
1073 b2 -= minTrail;
1074 UChar32 remainder = b2 % final3Multiplier;
1075 if (remainder != 0)
1076 return -1;
1077 b0 -= min3Primary;
1078 b2 /= final3Multiplier;
1079 result = ((b0 * medialCount) + b1) * final3Count + b2;
1080 } else {
1081 if (b2 < minTrail || b2 > maxTrail
1082 || b3 < minTrail || b3 > max4Trail)
1083 return -1;
1084 b2 -= minTrail;
1085 b3 -= minTrail;
1086 UChar32 remainder = b3 % final4Multiplier;
1087 if (remainder != 0)
1088 return -1;
1089 b3 /= final4Multiplier;
1090 b0 -= min4Primary;
1091 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1092 }
1093 // final check
1094 if (result < 0 || result > UCOL_MAX_INPUT)
1095 return -1;
1096 return result;
1097 }
1098
1099
/* Integer division a/b rounded up for positive a and b; yields 1 when a is 0. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1103
1104 /* this function is either called from initUCA or from genUCA before
1105 * doing canonical closure for the UCA.
1106 */
1107
1108 /**
1109 * Set up to generate implicits.
1110 * @param minPrimary
1111 * @param maxPrimary
1112 * @param minTrail final byte
1113 * @param maxTrail final byte
1114 * @param gap3 the gap we leave for tailoring for 3-byte forms
1115 * @param gap4 the gap we leave for tailoring for 4-byte forms
1116 */
1117 static void initImplicitConstants(int minPrimary, int maxPrimary,
1118 int minTrailIn, int maxTrailIn,
1119 int gap3, int primaries3count,
1120 UErrorCode *status) {
1121 // some simple parameter checks
1122 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1123 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1124 || (primaries3count < 1))
1125 {
1126 *status = U_ILLEGAL_ARGUMENT_ERROR;
1127 return;
1128 };
1129
1130 minTrail = minTrailIn;
1131 maxTrail = maxTrailIn;
1132
1133 min3Primary = minPrimary;
1134 max4Primary = maxPrimary;
1135 // compute constants for use later.
1136 // number of values we can use in trailing bytes
1137 // leave room for empty values between AND above, e.g. if gap = 2
1138 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1139 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1140 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1141 final3Multiplier = gap3 + 1;
1142 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1143 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1144
1145 // medials can use full range
1146 medialCount = (maxTrail - minTrail + 1);
1147 // find out how many values fit in each form
1148 int32_t threeByteCount = medialCount * final3Count;
1149 // now determine where the 3/4 boundary is.
1150 // we use 3 bytes below the boundary, and 4 above
1151 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1152 int32_t primaries4count = primariesAvailable - primaries3count;
1153
1154
1155 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1156 min4Primary = minPrimary + primaries3count;
1157 min4Boundary = min3ByteCoverage;
1158 // Now expand out the multiplier for the 4 bytes, and redo.
1159
1160 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1161 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1162 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1163 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1164 if (gap4 < 1) {
1165 *status = U_ILLEGAL_ARGUMENT_ERROR;
1166 return;
1167 }
1168 final4Multiplier = gap4 + 1;
1169 final4Count = neededPerFinalByte;
1170 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1171 }
1172
1173 /**
1174 * Supply parameters for generating implicit CEs
1175 */
1176 U_CAPI void U_EXPORT2
1177 uprv_uca_initImplicitConstants(UErrorCode *status) {
1178 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1179 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1180 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1181 }
1182
1183
1184 /* collIterNormalize Incremental Normalization happens here. */
1185 /* pick up the range of chars identifed by FCD, */
1186 /* normalize it into the collIterate's writable buffer, */
1187 /* switch the collIterate's state to use the writable buffer. */
1188 /* */
1189 static
1190 void collIterNormalize(collIterate *collationSource)
1191 {
1192 UErrorCode status = U_ZERO_ERROR;
1193
1194 int32_t normLen;
1195 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1196 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1197
1198 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1199 srcP, (int32_t)(endP - srcP),
1200 FALSE, 0,
1201 &status);
1202 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1203 // reallocate and terminate
1204 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1205 &collationSource->writableBuffer,
1206 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1207 0)
1208 ) {
1209 #ifdef UCOL_DEBUG
1210 fprintf(stderr, "collIterNormalize(), out of memory\n");
1211 #endif
1212 return;
1213 }
1214 status = U_ZERO_ERROR;
1215 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1216 srcP, (int32_t)(endP - srcP),
1217 FALSE, 0,
1218 &status);
1219 }
1220 if (U_FAILURE(status)) {
1221 #ifdef UCOL_DEBUG
1222 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1223 #endif
1224 return;
1225 }
1226
1227 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1228 collationSource->flags |= UCOL_ITER_ALLOCATED;
1229 }
1230 collationSource->pos = collationSource->writableBuffer;
1231 collationSource->origFlags = collationSource->flags;
1232 collationSource->flags |= UCOL_ITER_INNORMBUF;
1233 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1234 }
1235
1236
1237 // This function takes the iterator and extracts normalized stuff up to the next boundary
1238 // It is similar in the end results to the collIterNormalize, but for the cases when we
1239 // use an iterator
1240 /*static
1241 inline void normalizeIterator(collIterate *collationSource) {
1242 UErrorCode status = U_ZERO_ERROR;
1243 UBool wasNormalized = FALSE;
1244 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1245 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1246 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1247 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1248 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1249 // reallocate and terminate
1250 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1251 &collationSource->writableBuffer,
1252 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1253 0)
1254 ) {
1255 #ifdef UCOL_DEBUG
1256 fprintf(stderr, "normalizeIterator(), out of memory\n");
1257 #endif
1258 return;
1259 }
1260 status = U_ZERO_ERROR;
1261 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1262 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1263 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1264 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1265 }
1266 // Terminate the buffer - we already checked that it is big enough
1267 collationSource->writableBuffer[normLen] = 0;
1268 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1269 collationSource->flags |= UCOL_ITER_ALLOCATED;
1270 }
1271 collationSource->pos = collationSource->writableBuffer;
1272 collationSource->origFlags = collationSource->flags;
1273 collationSource->flags |= UCOL_ITER_INNORMBUF;
1274 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1275 }*/
1276
1277
1278 /* Incremental FCD check and normalize */
1279 /* Called from getNextCE when normalization state is suspect. */
1280 /* When entering, the state is known to be this: */
1281 /* o We are working in the main buffer of the collIterate, not the side */
1282 /* writable buffer. When in the side buffer, normalization mode is always off, */
1283 /* so we won't get here. */
1284 /* o The leading combining class from the current character is 0 or */
1285 /* the trailing combining class of the previous char was zero. */
1286 /* True because the previous call to this function will have always exited */
1287 /* that way, and we get called for every char where cc might be non-zero. */
1288 static
1289 inline UBool collIterFCD(collIterate *collationSource) {
1290 UChar c, c2;
1291 const UChar *srcP, *endP;
1292 uint8_t leadingCC;
1293 uint8_t prevTrailingCC = 0;
1294 uint16_t fcd;
1295 UBool needNormalize = FALSE;
1296
1297 srcP = collationSource->pos-1;
1298
1299 if (collationSource->flags & UCOL_ITER_HASLEN) {
1300 endP = collationSource->endp;
1301 } else {
1302 endP = NULL;
1303 }
1304
1305 // Get the trailing combining class of the current character. If it's zero,
1306 // we are OK.
1307 c = *srcP++;
1308 /* trie access */
1309 fcd = unorm_getFCD16(fcdTrieIndex, c);
1310 if (fcd != 0) {
1311 if (U16_IS_LEAD(c)) {
1312 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1313 ++srcP;
1314 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1315 } else {
1316 fcd = 0;
1317 }
1318 }
1319
1320 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1321
1322 if (prevTrailingCC != 0) {
1323 // The current char has a non-zero trailing CC. Scan forward until we find
1324 // a char with a leading cc of zero.
1325 while (endP == NULL || srcP != endP)
1326 {
1327 const UChar *savedSrcP = srcP;
1328
1329 c = *srcP++;
1330 /* trie access */
1331 fcd = unorm_getFCD16(fcdTrieIndex, c);
1332 if (fcd != 0 && U16_IS_LEAD(c)) {
1333 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1334 ++srcP;
1335 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1336 } else {
1337 fcd = 0;
1338 }
1339 }
1340 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1341 if (leadingCC == 0) {
1342 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1343 // back up over it. (Could be surrogate pair!)
1344 break;
1345 }
1346
1347 if (leadingCC < prevTrailingCC) {
1348 needNormalize = TRUE;
1349 }
1350
1351 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1352 }
1353 }
1354 }
1355
1356 collationSource->fcdPosition = (UChar *)srcP;
1357
1358 return needNormalize;
1359 }
1360
1361 /****************************************************************************/
1362 /* Following are the CE retrieval functions */
1363 /* */
1364 /****************************************************************************/
1365
1366 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1367 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1368
1369 /* there should be a macro version of this function in the header file */
1370 /* This is the first function that tries to fetch a collation element */
1371 /* If it's not succesfull or it encounters a more difficult situation */
1372 /* some more sofisticated and slower functions are invoked */
1373 static
1374 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1375 uint32_t order = 0;
1376 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1377 order = *(collationSource->toReturn++); /* if so, return them */
1378 if(collationSource->CEpos == collationSource->toReturn) {
1379 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1380 }
1381 return order;
1382 }
1383
1384 UChar ch = 0;
1385 collationSource->offsetReturn = NULL;
1386
1387 for (;;) /* Loop handles case when incremental normalize switches */
1388 { /* to or from the side buffer / original string, and we */
1389 /* need to start again to get the next character. */
1390
1391 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1392 {
1393 // The source string is null terminated and we're not working from the side buffer,
1394 // and we're not normalizing. This is the fast path.
1395 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1396 ch = *collationSource->pos++;
1397 if (ch != 0) {
1398 break;
1399 }
1400 else {
1401 return UCOL_NO_MORE_CES;
1402 }
1403 }
1404
1405 if (collationSource->flags & UCOL_ITER_HASLEN) {
1406 // Normal path for strings when length is specified.
1407 // (We can't be in side buffer because it is always null terminated.)
1408 if (collationSource->pos >= collationSource->endp) {
1409 // Ran off of the end of the main source string. We're done.
1410 return UCOL_NO_MORE_CES;
1411 }
1412 ch = *collationSource->pos++;
1413 }
1414 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1415 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1416 if(iterCh == U_SENTINEL) {
1417 return UCOL_NO_MORE_CES;
1418 }
1419 ch = (UChar)iterCh;
1420 }
1421 else
1422 {
1423 // Null terminated string.
1424 ch = *collationSource->pos++;
1425 if (ch == 0) {
1426 // Ran off end of buffer.
1427 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1428 // Ran off end of main string. backing up one character.
1429 collationSource->pos--;
1430 return UCOL_NO_MORE_CES;
1431 }
1432 else
1433 {
1434 // Hit null in the normalize side buffer.
1435 // Usually this means the end of the normalized data,
1436 // except for one odd case: a null followed by combining chars,
1437 // which is the case if we are at the start of the buffer.
1438 if (collationSource->pos == collationSource->writableBuffer+1) {
1439 break;
1440 }
1441
1442 // Null marked end of side buffer.
1443 // Revert to the main string and
1444 // loop back to top to try again to get a character.
1445 collationSource->pos = collationSource->fcdPosition;
1446 collationSource->flags = collationSource->origFlags;
1447 continue;
1448 }
1449 }
1450 }
1451
1452 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1453 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1454 * based on whether the previous codepoint was Hiragana or Katakana.
1455 */
1456 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1457 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1458 collationSource->flags |= UCOL_WAS_HIRAGANA;
1459 } else {
1460 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1461 }
1462 }
1463
1464 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1465 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1466 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1467 break;
1468 }
1469
1470 if (collationSource->fcdPosition >= collationSource->pos) {
1471 // An earlier FCD check has already covered the current character.
1472 // We can go ahead and process this char.
1473 break;
1474 }
1475
1476 if (ch < ZERO_CC_LIMIT_ ) {
1477 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1478 break;
1479 }
1480
1481 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1482 // We need to peek at the next character in order to tell if we are FCD
1483 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1484 // We are at the last char of source string.
1485 // It is always OK for FCD check.
1486 break;
1487 }
1488
1489 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1490 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1491 break;
1492 }
1493 }
1494
1495
1496 // Need a more complete FCD check and possible normalization.
1497 if (collIterFCD(collationSource)) {
1498 collIterNormalize(collationSource);
1499 }
1500 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1501 // No normalization was needed. Go ahead and process the char we already had.
1502 break;
1503 }
1504
1505 // Some normalization happened. Next loop iteration will pick up a char
1506 // from the normalization buffer.
1507
1508 } // end for (;;)
1509
1510
1511 if (ch <= 0xFF) {
1512 /* For latin-1 characters we never need to fall back to the UCA table */
1513 /* because all of the UCA data is replicated in the latinOneMapping array */
1514 order = coll->latinOneMapping[ch];
1515 if (order > UCOL_NOT_FOUND) {
1516 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1517 }
1518 }
1519 else
1520 {
1521 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1522 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1523 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1524 }
1525 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1526 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1527 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1528
1529 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1530 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1531 }
1532 }
1533 }
1534 if(order == UCOL_NOT_FOUND) {
1535 order = getImplicit(ch, collationSource);
1536 }
1537 return order; /* return the CE */
1538 }
1539
1540 /* ucol_getNextCE, out-of-line version for use from other files. */
1541 U_CAPI uint32_t U_EXPORT2
1542 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1543 return ucol_IGetNextCE(coll, collationSource, status);
1544 }
1545
1546
1547 /**
1548 * Incremental previous normalization happens here. Pick up the range of chars
1549 * identifed by FCD, normalize it into the collIterate's writable buffer,
1550 * switch the collIterate's state to use the writable buffer.
1551 * @param data collation iterator data
1552 */
1553 static
1554 void collPrevIterNormalize(collIterate *data)
1555 {
1556 UErrorCode status = U_ZERO_ERROR;
1557 UChar *pEnd = data->pos; /* End normalize + 1 */
1558 UChar *pStart;
1559 uint32_t normLen;
1560 UChar *pStartNorm;
1561
1562 /* Start normalize */
1563 if (data->fcdPosition == NULL) {
1564 pStart = data->string;
1565 }
1566 else {
1567 pStart = data->fcdPosition + 1;
1568 }
1569
1570 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1571 data->writableBuffer, 0, &status);
1572
1573 if (data->writableBufSize <= normLen) {
1574 freeHeapWritableBuffer(data);
1575 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1576 sizeof(UChar));
1577 if(data->writableBuffer == NULL) { // something is wrong here, return
1578 data->writableBufSize = 0; // Reset writableBufSize
1579 return;
1580 }
1581 data->flags |= UCOL_ITER_ALLOCATED;
1582 /* to handle the zero termination */
1583 data->writableBufSize = normLen + 1;
1584 }
1585 status = U_ZERO_ERROR;
1586 /*
1587 this puts the null termination infront of the normalized string instead
1588 of the end
1589 */
1590 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1591 *(pStartNorm - 1) = 0;
1592 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1593 normLen, &status);
1594
1595 if (data->offsetBuffer == NULL) {
1596 int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
1597
1598 data->offsetBufferSize = len;
1599 data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
1600 data->offsetStore = data->offsetBuffer;
1601 } else if(data->offsetBufferSize < (int32_t) normLen) {
1602 int32_t storeIX = data->offsetStore - data->offsetBuffer;
1603 int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
1604
1605 if (tob != NULL) {
1606 data->offsetBuffer = tob;
1607 data->offsetStore = &data->offsetBuffer[storeIX];
1608 data->offsetBufferSize = normLen + 1;
1609 }
1610 }
1611
1612 /*
1613 * The usual case at this point is that we've got a base
1614 * character followed by marks that were normalized. If
1615 * fcdPosition is NULL, that means that we backed up to
1616 * the beginning of the string and there's no base character.
1617 *
1618 * Forward processing will usually normalize when it sees
1619 * the first mark, so that mark will get it's natural offset
1620 * and the rest will get the offset of the character following
1621 * the marks. The base character will also get its natural offset.
1622 *
1623 * We write the offset of the base character, if there is one,
1624 * followed by the offset of the first mark and then the offsets
1625 * of the rest of the marks.
1626 */
1627 int32_t firstMarkOffset = 0;
1628 int32_t trailOffset = data->pos - data->string + 1;
1629 int32_t trailCount = normLen - 1;
1630
1631 if (data->fcdPosition != NULL) {
1632 int32_t baseOffset = data->fcdPosition - data->string;
1633 UChar baseChar = *data->fcdPosition;
1634
1635 firstMarkOffset = baseOffset + 1;
1636
1637 /*
1638 * If the base character is the start of a contraction, forward processing
1639 * will normalize the marks while checking for the contraction, which means
1640 * that the offset of the first mark will the same as the other marks.
1641 *
1642 * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1643 */
1644 if (baseChar >= 0x100) {
1645 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1646
1647 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1648 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1649 }
1650
1651 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1652 firstMarkOffset = trailOffset;
1653 }
1654 }
1655
1656 *(data->offsetStore++) = baseOffset;
1657 }
1658
1659 *(data->offsetStore++) = firstMarkOffset;
1660
1661 for (int32_t i = 0; i < trailCount; i += 1) {
1662 *(data->offsetStore++) = trailOffset;
1663 }
1664
1665 data->offsetRepeatValue = trailOffset;
1666
1667 data->offsetReturn = data->offsetStore - 1;
1668 if (data->offsetReturn == data->offsetBuffer) {
1669 data->offsetStore = data->offsetBuffer;
1670 }
1671
1672 data->pos = data->writableBuffer + data->writableBufSize;
1673 data->origFlags = data->flags;
1674 data->flags |= UCOL_ITER_INNORMBUF;
1675 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1676 }
1677
1678
1679 /**
1680 * Incremental FCD check for previous iteration and normalize. Called from
1681 * getPrevCE when normalization state is suspect.
1682 * When entering, the state is known to be this:
1683 * o We are working in the main buffer of the collIterate, not the side
1684 * writable buffer. When in the side buffer, normalization mode is always
1685 * off, so we won't get here.
1686 * o The leading combining class from the current character is 0 or the
1687 * trailing combining class of the previous char was zero.
1688 * True because the previous call to this function will have always exited
1689 * that way, and we get called for every char where cc might be non-zero.
1690 * @param data collation iterate struct
1691 * @return normalization status, TRUE for normalization to be done, FALSE
1692 * otherwise
1693 */
1694 static
1695 inline UBool collPrevIterFCD(collIterate *data)
1696 {
1697 const UChar *src, *start;
1698 UChar c, c2;
1699 uint8_t leadingCC;
1700 uint8_t trailingCC = 0;
1701 uint16_t fcd;
1702 UBool result = FALSE;
1703
1704 start = data->string;
1705 src = data->pos + 1;
1706
1707 /* Get the trailing combining class of the current character. */
1708 c = *--src;
1709 if (!U16_IS_SURROGATE(c)) {
1710 fcd = unorm_getFCD16(fcdTrieIndex, c);
1711 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1712 --src;
1713 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1714 if (fcd != 0) {
1715 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1716 }
1717 } else /* unpaired surrogate */ {
1718 fcd = 0;
1719 }
1720
1721 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1722
1723 if (leadingCC != 0) {
1724 /*
1725 The current char has a non-zero leading combining class.
1726 Scan backward until we find a char with a trailing cc of zero.
1727 */
1728 for (;;)
1729 {
1730 if (start == src) {
1731 data->fcdPosition = NULL;
1732 return result;
1733 }
1734
1735 c = *--src;
1736 if (!U16_IS_SURROGATE(c)) {
1737 fcd = unorm_getFCD16(fcdTrieIndex, c);
1738 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1739 --src;
1740 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1741 if (fcd != 0) {
1742 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1743 }
1744 } else /* unpaired surrogate */ {
1745 fcd = 0;
1746 }
1747
1748 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1749
1750 if (trailingCC == 0) {
1751 break;
1752 }
1753
1754 if (leadingCC < trailingCC) {
1755 result = TRUE;
1756 }
1757
1758 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1759 }
1760 }
1761
1762 data->fcdPosition = (UChar *)src;
1763
1764 return result;
1765 }
1766
1767 /** gets a character from the string at a given offset
1768 * Handles both normal and iterative cases.
1769 * No error checking - caller beware!
1770 */
1771 inline static
1772 UChar peekCharacter(collIterate *source, int32_t offset) {
1773 if(source->pos != NULL) {
1774 return *(source->pos + offset);
1775 } else if(source->iterator != NULL) {
1776 if(offset != 0) {
1777 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1778 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1779 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1780 return toReturn;
1781 } else {
1782 return (UChar)source->iterator->current(source->iterator);
1783 }
1784 } else {
1785 return (UChar)U_SENTINEL;
1786 }
1787 }
1788
1789 /**
1790 * Determines if we are at the start of the data string in the backwards
1791 * collation iterator
1792 * @param data collation iterator
1793 * @return TRUE if we are at the start
1794 */
1795 static
1796 inline UBool isAtStartPrevIterate(collIterate *data) {
1797 if(data->pos == NULL && data->iterator != NULL) {
1798 return !data->iterator->hasPrevious(data->iterator);
1799 }
1800 //return (collIter_bos(data)) ||
1801 return (data->pos == data->string) ||
1802 ((data->flags & UCOL_ITER_INNORMBUF) &&
1803 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1804 }
1805
1806 static
1807 inline void goBackOne(collIterate *data) {
1808 # if 0
1809 // somehow, it looks like we need to keep iterator synced up
1810 // at all times, as above.
1811 if(data->pos) {
1812 data->pos--;
1813 }
1814 if(data->iterator) {
1815 data->iterator->previous(data->iterator);
1816 }
1817 #endif
1818 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1819 data->iterator->previous(data->iterator);
1820 }
1821 if(data->pos) {
1822 data->pos --;
1823 }
1824 }
1825
1826 /**
1827 * Inline function that gets a simple CE.
1828 * So what it does is that it will first check the expansion buffer. If the
1829 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1830 * is different from the string pointer, we return the collation element at the
1831 * return pointer and decrement it.
1832 * For more complicated CEs it resorts to getComplicatedCE.
1833 * @param coll collator data
1834 * @param data collation iterator struct
1835 * @param status error status
1836 */
1837 static
1838 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1839 UErrorCode *status)
1840 {
1841 uint32_t result = (uint32_t)UCOL_NULLORDER;
1842
1843 if (data->offsetReturn != NULL) {
1844 if (data->offsetRepeatCount > 0) {
1845 data->offsetRepeatCount -= 1;
1846 } else {
1847 if (data->offsetReturn == data->offsetBuffer) {
1848 data->offsetReturn = NULL;
1849 data->offsetStore = data->offsetBuffer;
1850 } else {
1851 data->offsetReturn -= 1;
1852 }
1853 }
1854 }
1855
1856 if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1857 (!data->extendCEs && data->toReturn > data->CEs))
1858 {
1859 data->toReturn -= 1;
1860 result = *(data->toReturn);
1861 if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1862 data->CEpos = data->toReturn;
1863 }
1864 }
1865 else {
1866 UChar ch = 0;
1867
1868 /*
1869 Loop handles case when incremental normalize switches to or from the
1870 side buffer / original string, and we need to start again to get the
1871 next character.
1872 */
1873 for (;;) {
1874 if (data->flags & UCOL_ITER_HASLEN) {
1875 /*
1876 Normal path for strings when length is specified.
1877 Not in side buffer because it is always null terminated.
1878 */
1879 if (data->pos <= data->string) {
1880 /* End of the main source string */
1881 return UCOL_NO_MORE_CES;
1882 }
1883 data->pos --;
1884 ch = *data->pos;
1885 }
1886 // we are using an iterator to go back. Pray for us!
1887 else if (data->flags & UCOL_USE_ITERATOR) {
1888 UChar32 iterCh = data->iterator->previous(data->iterator);
1889 if(iterCh == U_SENTINEL) {
1890 return UCOL_NO_MORE_CES;
1891 } else {
1892 ch = (UChar)iterCh;
1893 }
1894 }
1895 else {
1896 data->pos --;
1897 ch = *data->pos;
1898 /* we are in the side buffer. */
1899 if (ch == 0) {
1900 /*
1901 At the start of the normalize side buffer.
1902 Go back to string.
1903 Because pointer points to the last accessed character,
1904 hence we have to increment it by one here.
1905 */
1906 data->flags = data->origFlags;
1907 data->offsetRepeatValue = 0;
1908
1909 if (data->fcdPosition == NULL) {
1910 data->pos = data->string;
1911 return UCOL_NO_MORE_CES;
1912 }
1913 else {
1914 data->pos = data->fcdPosition + 1;
1915 }
1916
1917 continue;
1918 }
1919 }
1920
1921 if(data->flags&UCOL_HIRAGANA_Q) {
1922 if(ch>=0x3040 && ch<=0x309f) {
1923 data->flags |= UCOL_WAS_HIRAGANA;
1924 } else {
1925 data->flags &= ~UCOL_WAS_HIRAGANA;
1926 }
1927 }
1928
1929 /*
1930 * got a character to determine if there's fcd and/or normalization
1931 * stuff to do.
1932 * if the current character is not fcd.
1933 * if current character is at the start of the string
1934 * Trailing combining class == 0.
1935 * Note if pos is in the writablebuffer, norm is always 0
1936 */
1937 if (ch < ZERO_CC_LIMIT_ ||
1938 // this should propel us out of the loop in the iterator case
1939 (data->flags & UCOL_ITER_NORM) == 0 ||
1940 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1941 || data->string == data->pos) {
1942 break;
1943 }
1944
1945 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1946 /* if next character is FCD */
1947 if (data->pos == data->string) {
1948 /* First char of string is always OK for FCD check */
1949 break;
1950 }
1951
1952 /* Not first char of string, do the FCD fast test */
1953 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1954 break;
1955 }
1956 }
1957
1958 /* Need a more complete FCD check and possible normalization. */
1959 if (collPrevIterFCD(data)) {
1960 collPrevIterNormalize(data);
1961 }
1962
1963 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1964 /* No normalization. Go ahead and process the char. */
1965 break;
1966 }
1967
1968 /*
1969 Some normalization happened.
1970 Next loop picks up a char from the normalization buffer.
1971 */
1972 }
1973
1974 /* attempt to handle contractions, after removal of the backwards
1975 contraction
1976 */
1977 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1978 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1979 } else {
1980 if (ch <= 0xFF) {
1981 result = coll->latinOneMapping[ch];
1982 }
1983 else {
1984 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1985 }
1986 if (result > UCOL_NOT_FOUND) {
1987 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1988 }
1989 if (result == UCOL_NOT_FOUND) { // Not found in master list
1990 if (!isAtStartPrevIterate(data) &&
1991 ucol_contractionEndCP(ch, data->coll))
1992 {
1993 result = UCOL_CONTRACTION;
1994 } else {
1995 if(coll->UCA) {
1996 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1997 }
1998 }
1999
2000 if (result > UCOL_NOT_FOUND) {
2001 if(coll->UCA) {
2002 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2003 }
2004 }
2005 }
2006 }
2007
2008 if(result == UCOL_NOT_FOUND) {
2009 result = getPrevImplicit(ch, data);
2010 }
2011 }
2012
2013 return result;
2014 }
2015
2016
2017 /* ucol_getPrevCE, out-of-line version for use from other files. */
2018 U_CFUNC uint32_t U_EXPORT2
2019 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2020 UErrorCode *status) {
2021 return ucol_IGetPrevCE(coll, data, status);
2022 }
2023
2024
2025 /* this should be connected to special Jamo handling */
2026 U_CFUNC uint32_t U_EXPORT2
2027 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2028 collIterate colIt;
2029 uint32_t order;
2030 IInit_collIterate(coll, &u, 1, &colIt);
2031 order = ucol_IGetNextCE(coll, &colIt, status);
2032 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2033 return order;
2034 }
2035
2036 /**
2037 * Inserts the argument character into the end of the buffer pushing back the
2038 * null terminator.
2039 * @param data collIterate struct data
2040 * @param pNull pointer to the null termination
2041 * @param ch character to be appended
2042 * @return the position of the new addition
2043 */
2044 static
2045 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
2046 {
2047 uint32_t size = data->writableBufSize;
2048 UChar *newbuffer;
2049 static const uint32_t INCSIZE = 5;
2050
2051 if ((data->writableBuffer + size) > (pNull + 1)) {
2052 *pNull = ch;
2053 *(pNull + 1) = 0;
2054 return pNull;
2055 }
2056
2057 /*
2058 buffer will always be null terminated at the end.
2059 giving extra space since it is likely that more characters will be added.
2060 */
2061 size += INCSIZE;
2062 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2063 if(newbuffer != NULL) { // something wrong, but no status
2064 uprv_memcpy(newbuffer, data->writableBuffer,
2065 data->writableBufSize * sizeof(UChar));
2066
2067 freeHeapWritableBuffer(data);
2068 data->writableBufSize = size;
2069 data->writableBuffer = newbuffer;
2070
2071 newbuffer = newbuffer + data->writableBufSize;
2072 *newbuffer = ch;
2073 *(newbuffer + 1) = 0;
2074 }
2075 return newbuffer;
2076 }
2077
2078 /**
2079 * Inserts the argument string into the end of the buffer pushing back the
2080 * null terminator.
2081 * @param data collIterate struct data
2082 * @param pNull pointer to the null termination
2083 * @param string to be appended
2084 * @param length of the string to be appended
2085 * @return the position of the new addition
2086 */
2087 static
2088 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2089 int32_t length)
2090 {
2091 uint32_t size = pNull - data->writableBuffer;
2092 UChar *newbuffer;
2093
2094 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2095 uprv_memcpy(pNull, str, length * sizeof(UChar));
2096 *(pNull + length) = 0;
2097 return pNull;
2098 }
2099
2100 /*
2101 buffer will always be null terminated at the end.
2102 giving extra space since it is likely that more characters will be added.
2103 */
2104 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2105 if(newbuffer != NULL) {
2106 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2107 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2108
2109 freeHeapWritableBuffer(data);
2110 data->writableBufSize = size + length + 1;
2111 data->writableBuffer = newbuffer;
2112 }
2113
2114 return newbuffer;
2115 }
2116
2117 /**
2118 * Special normalization function for contraction in the forwards iterator.
2119 * This normalization sequence will place the current character at source->pos
2120 * and its following normalized sequence into the buffer.
2121 * The fcd position, pos will be changed.
2122 * pos will now point to positions in the buffer.
2123 * Flags will be changed accordingly.
2124 * @param data collation iterator data
2125 */
2126 static
2127 inline void normalizeNextContraction(collIterate *data)
2128 {
2129 UChar *buffer = data->writableBuffer;
2130 uint32_t buffersize = data->writableBufSize;
2131 uint32_t strsize;
2132 UErrorCode status = U_ZERO_ERROR;
2133 /* because the pointer points to the next character */
2134 UChar *pStart = data->pos - 1;
2135 UChar *pEnd;
2136 uint32_t normLen;
2137 UChar *pStartNorm;
2138
2139 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2140 *data->writableBuffer = *(pStart - 1);
2141 strsize = 1;
2142 }
2143 else {
2144 strsize = u_strlen(data->writableBuffer);
2145 }
2146
2147 pEnd = data->fcdPosition;
2148
2149 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2150 &status);
2151
2152 if (buffersize <= normLen + strsize) {
2153 uint32_t size = strsize + normLen + 1;
2154 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2155 if(temp != NULL) {
2156 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2157 freeHeapWritableBuffer(data);
2158 data->writableBuffer = temp;
2159 data->writableBufSize = size;
2160 data->flags |= UCOL_ITER_ALLOCATED;
2161 } else {
2162 return; // Avoid writing past bound of buffer->writableBuffer.
2163 }
2164 }
2165
2166 status = U_ZERO_ERROR;
2167 pStartNorm = buffer + strsize;
2168 /* null-termination will be added here */
2169 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2170 normLen + 1, &status);
2171
2172 data->pos = data->writableBuffer + strsize;
2173 data->origFlags = data->flags;
2174 data->flags |= UCOL_ITER_INNORMBUF;
2175 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2176 }
2177
2178 /**
2179 * Contraction character management function that returns the next character
2180 * for the forwards iterator.
2181 * Does nothing if the next character is in buffer and not the first character
2182 * in it.
2183 * Else it checks next character in data string to see if it is normalizable.
2184 * If it is not, the character is simply copied into the buffer, else
2185 * the whole normalized substring is copied into the buffer, including the
2186 * current character.
2187 * @param data collation element iterator data
2188 * @return next character
2189 */
2190 static
2191 inline UChar getNextNormalizedChar(collIterate *data)
2192 {
2193 UChar nextch;
2194 UChar ch;
2195 // Here we need to add the iterator code. One problem is the way
2196 // end of string is handled. If we just return next char, it could
2197 // be the sentinel. Most of the cases already check for this, but we
2198 // need to be sure.
2199 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2200 /* if no normalization and not in buffer. */
2201 if(data->flags & UCOL_USE_ITERATOR) {
2202 return (UChar)data->iterator->next(data->iterator);
2203 } else {
2204 return *(data->pos ++);
2205 }
2206 }
2207
2208 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2209 //normalizeIterator(data);
2210 //}
2211
2212 UChar *pEndWritableBuffer = NULL;
2213 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2214 if ((innormbuf && *data->pos != 0) ||
2215 (data->fcdPosition != NULL && !innormbuf &&
2216 data->pos < data->fcdPosition)) {
2217 /*
2218 if next character is in normalized buffer, no further normalization
2219 is required
2220 */
2221 return *(data->pos ++);
2222 }
2223
2224 if (data->flags & UCOL_ITER_HASLEN) {
2225 /* in data string */
2226 if (data->pos + 1 == data->endp) {
2227 return *(data->pos ++);
2228 }
2229 }
2230 else {
2231 if (innormbuf) {
2232 // inside the normalization buffer, but at the end
2233 // (since we encountered zero). This means, in the
2234 // case we're using char iterator, that we need to
2235 // do another round of normalization.
2236 //if(data->origFlags & UCOL_USE_ITERATOR) {
2237 // we need to restore original flags,
2238 // otherwise, we'll lose them
2239 //data->flags = data->origFlags;
2240 //normalizeIterator(data);
2241 //return *(data->pos++);
2242 //} else {
2243 /*
2244 in writable buffer, at this point fcdPosition can not be
2245 pointing to the end of the data string. see contracting tag.
2246 */
2247 if(data->fcdPosition) {
2248 if (*(data->fcdPosition + 1) == 0 ||
2249 data->fcdPosition + 1 == data->endp) {
2250 /* at the end of the string, dump it into the normalizer */
2251 data->pos = insertBufferEnd(data, data->pos,
2252 *(data->fcdPosition)) + 1;
2253 // Check if data->pos received a null pointer
2254 if (data->pos == NULL) {
2255 return (UChar)-1; // Return to indicate error.
2256 }
2257 return *(data->fcdPosition ++);
2258 }
2259 pEndWritableBuffer = data->pos;
2260 data->pos = data->fcdPosition;
2261 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2262 // if we are here, we're using a normalizing iterator.
2263 // we should just continue further.
2264 data->flags = data->origFlags;
2265 data->pos = NULL;
2266 return (UChar)data->iterator->next(data->iterator);
2267 }
2268 //}
2269 }
2270 else {
2271 if (*(data->pos + 1) == 0) {
2272 return *(data->pos ++);
2273 }
2274 }
2275 }
2276
2277 ch = *data->pos ++;
2278 nextch = *data->pos;
2279
2280 /*
2281 * if the current character is not fcd.
2282 * Trailing combining class == 0.
2283 */
2284 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2285 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2286 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2287 /*
2288 Need a more complete FCD check and possible normalization.
2289 normalize substring will be appended to buffer
2290 */
2291 if (collIterFCD(data)) {
2292 normalizeNextContraction(data);
2293 return *(data->pos ++);
2294 }
2295 else if (innormbuf) {
2296 /* fcdposition shifted even when there's no normalization, if we
2297 don't input the rest into this, we'll get the wrong position when
2298 we reach the end of the writableBuffer */
2299 int32_t length = data->fcdPosition - data->pos + 1;
2300 data->pos = insertBufferEnd(data, pEndWritableBuffer,
2301 data->pos - 1, length);
2302 // Check if data->pos received a null pointer
2303 if (data->pos == NULL) {
2304 return (UChar)-1; // Return to indicate error.
2305 }
2306 return *(data->pos ++);
2307 }
2308 }
2309
2310 if (innormbuf) {
2311 /*
2312 no normalization is to be done hence only one character will be
2313 appended to the buffer.
2314 */
2315 data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2316 // Check if data->pos received a null pointer
2317 if (data->pos == NULL) {
2318 return (UChar)-1; // Return to indicate error.
2319 }
2320 }
2321
2322 /* points back to the pos in string */
2323 return ch;
2324 }
2325
2326
2327
2328 /**
2329 * Function to copy the buffer into writableBuffer and sets the fcd position to
2330 * the correct position
2331 * @param source data string source
2332 * @param buffer character buffer
2333 * @param tempdb current position in buffer that has been used up
2334 */
2335 static
2336 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2337 UChar *tempdb)
2338 {
2339 /* okay confusing part here. to ensure that the skipped characters are
2340 considered later, we need to place it in the appropriate position in the
2341 normalization buffer and reassign the pos pointer. simple case if pos
2342 reside in string, simply copy to normalization buffer and
2343 fcdposition = pos, pos = start of normalization buffer. if pos in
2344 normalization buffer, we'll insert the copy infront of pos and point pos
2345 to the start of the normalization buffer. why am i doing these copies?
2346 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2347 not require any changes, which be really painful. */
2348 uint32_t length = u_strlen(buffer);;
2349 if (source->flags & UCOL_ITER_INNORMBUF) {
2350 u_strcpy(tempdb, source->pos);
2351 }
2352 else {
2353 source->fcdPosition = source->pos;
2354 source->origFlags = source->flags;
2355 source->flags |= UCOL_ITER_INNORMBUF;
2356 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2357 }
2358
2359 if (length >= source->writableBufSize) {
2360 freeHeapWritableBuffer(source);
2361 source->writableBuffer =
2362 (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2363 if(source->writableBuffer == NULL) {
2364 source->writableBufSize = 0; // Reset size
2365 return;
2366 }
2367 source->writableBufSize = length;
2368 }
2369
2370 u_strcpy(source->writableBuffer, buffer);
2371 source->pos = source->writableBuffer;
2372 }
2373
2374 /**
2375 * Function to get the discontiguos collation element within the source.
2376 * Note this function will set the position to the appropriate places.
2377 * @param coll current collator used
2378 * @param source data string source
2379 * @param constart index to the start character in the contraction table
2380 * @return discontiguos collation element offset
2381 */
2382 static
2383 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2384 const UChar *constart)
2385 {
2386 /* source->pos currently points to the second combining character after
2387 the start character */
2388 UChar *temppos = source->pos;
2389 UChar buffer[4*UCOL_MAX_BUFFER];
2390 UChar *tempdb = buffer;
2391 const UChar *tempconstart = constart;
2392 uint8_t tempflags = source->flags;
2393 UBool multicontraction = FALSE;
2394 UChar *tempbufferpos = 0;
2395 collIterateState discState;
2396
2397 backupState(source, &discState);
2398
2399 //*tempdb = *(source->pos - 1);
2400 *tempdb = peekCharacter(source, -1);
2401 tempdb++;
2402 for (;;) {
2403 UChar *UCharOffset;
2404 UChar schar,
2405 tchar;
2406 uint32_t result;
2407
2408 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2409 || (peekCharacter(source, 0) == 0 &&
2410 //|| (*source->pos == 0 &&
2411 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2412 source->fcdPosition == NULL ||
2413 source->fcdPosition == source->endp ||
2414 *(source->fcdPosition) == 0 ||
2415 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2416 /* end of string in null terminated string or stopped by a
2417 null character, note fcd does not always point to a base
2418 character after the discontiguos change */
2419 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2420 //u_getCombiningClass(*(source->pos)) == 0) {
2421 //constart = (UChar *)coll->image + getContractOffset(CE);
2422 if (multicontraction) {
2423 *tempbufferpos = 0;
2424 source->pos = temppos - 1;
2425 setDiscontiguosAttribute(source, buffer, tempdb);
2426 return *(coll->contractionCEs +
2427 (tempconstart - coll->contractionIndex));
2428 }
2429 constart = tempconstart;
2430 break;
2431 }
2432
2433 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2434 schar = getNextNormalizedChar(source);
2435
2436 while (schar > (tchar = *UCharOffset)) {
2437 UCharOffset++;
2438 }
2439
2440 if (schar != tchar) {
2441 /* not the correct codepoint. we stuff the current codepoint into
2442 the discontiguos buffer and try the next character */
2443 *tempdb = schar;
2444 tempdb ++;
2445 continue;
2446 }
2447 else {
2448 if (u_getCombiningClass(schar) ==
2449 u_getCombiningClass(peekCharacter(source, -2))) {
2450 //u_getCombiningClass(*(source->pos - 2))) {
2451 *tempdb = schar;
2452 tempdb ++;
2453 continue;
2454 }
2455 result = *(coll->contractionCEs +
2456 (UCharOffset - coll->contractionIndex));
2457 }
2458 *tempdb = 0;
2459
2460 if (result == UCOL_NOT_FOUND) {
2461 break;
2462 } else if (isContraction(result)) {
2463 /* this is a multi-contraction*/
2464 tempconstart = (UChar *)coll->image + getContractOffset(result);
2465 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2466 != UCOL_NOT_FOUND) {
2467 multicontraction = TRUE;
2468 temppos = source->pos + 1;
2469 tempbufferpos = buffer + u_strlen(buffer);
2470 }
2471 } else {
2472 setDiscontiguosAttribute(source, buffer, tempdb);
2473 return result;
2474 }
2475 }
2476
2477 /* no problems simply reverting just like that,
2478 if we are in string before getting into this function, points back to
2479 string hence no problem.
2480 if we are in normalization buffer before getting into this function,
2481 since we'll never use another normalization within this function, we
2482 know that fcdposition points to a base character. the normalization buffer
2483 never change, hence this revert works. */
2484 loadState(source, &discState, TRUE);
2485 goBackOne(source);
2486
2487 //source->pos = temppos - 1;
2488 source->flags = tempflags;
2489 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2490 }
2491
2492 static
2493 inline UBool isNonChar(UChar32 cp) {
2494 return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
2495 }
2496
2497 /* now uses Mark's getImplicitPrimary code */
2498 static
2499 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2500 if(isNonChar(cp)) {
2501 return 0;
2502 }
2503 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2504 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2505 collationSource->offsetRepeatCount += 1;
2506 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2507 }
2508
2509 /**
2510 * Inserts the argument character into the front of the buffer replacing the
2511 * front null terminator.
2512 * @param data collation element iterator data
2513 * @param pNull pointer to the null terminator
2514 * @param ch character to be appended
2515 * @return positon of added character
2516 */
2517 static
2518 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2519 {
2520 uint32_t size = data->writableBufSize;
2521 UChar *end;
2522 UChar *newbuffer;
2523 static const uint32_t INCSIZE = 5;
2524
2525 if (pNull > data->writableBuffer + 1) {
2526 *pNull = ch;
2527 *(pNull - 1) = 0;
2528 return pNull;
2529 }
2530
2531 /*
2532 buffer will always be null terminated infront.
2533 giving extra space since it is likely that more characters will be added.
2534 */
2535 size += INCSIZE;
2536 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2537 if(newbuffer == NULL) {
2538 return NULL;
2539 }
2540 end = newbuffer + INCSIZE;
2541 uprv_memcpy(end, data->writableBuffer,
2542 data->writableBufSize * sizeof(UChar));
2543 *end = ch;
2544 *(end - 1) = 0;
2545
2546 freeHeapWritableBuffer(data);
2547
2548 data->writableBufSize = size;
2549 data->writableBuffer = newbuffer;
2550 return end;
2551 }
2552
2553 /**
2554 * Special normalization function for contraction in the previous iterator.
2555 * This normalization sequence will place the current character at source->pos
2556 * and its following normalized sequence into the buffer.
2557 * The fcd position, pos will be changed.
2558 * pos will now point to positions in the buffer.
2559 * Flags will be changed accordingly.
2560 * @param data collation iterator data
2561 */
2562 static
2563 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2564 {
2565 uint32_t nulltermsize;
2566 UErrorCode localstatus = U_ZERO_ERROR;
2567 UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2568 UChar *pStart;
2569 uint32_t normLen;
2570 UChar *pStartNorm;
2571
2572 if (data->flags & UCOL_ITER_HASLEN) {
2573 /*
2574 normalization buffer not used yet, we'll pull down the next
2575 character into the end of the buffer
2576 */
2577 *(data->writableBuffer + (data->writableBufSize - 1)) = *(data->pos + 1);
2578 nulltermsize = data->writableBufSize - 1;
2579 }
2580 else {
2581 nulltermsize = data->writableBufSize;
2582 UChar *temp = data->writableBuffer + (nulltermsize - 1);
2583 while (*(temp --) != 0) {
2584 nulltermsize --;
2585 }
2586 }
2587
2588 /* Start normalize */
2589 if (data->fcdPosition == NULL) {
2590 pStart = data->string;
2591 }
2592 else {
2593 pStart = data->fcdPosition + 1;
2594 }
2595
2596 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, data->writableBuffer, 0,
2597 &localstatus);
2598
2599 if (nulltermsize <= normLen) {
2600 uint32_t size = data->writableBufSize - nulltermsize + normLen + 1;
2601 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2602 if (temp == NULL) {
2603 *status = U_MEMORY_ALLOCATION_ERROR;
2604 return;
2605 }
2606 nulltermsize = normLen + 1;
2607 uprv_memcpy(temp + normLen, data->writableBuffer,
2608 sizeof(UChar) * (data->writableBufSize - nulltermsize));
2609 freeHeapWritableBuffer(data);
2610 data->writableBuffer = temp;
2611 data->writableBufSize = size;
2612 }
2613
2614 /*
2615 this puts the null termination infront of the normalized string instead
2616 of the end
2617 */
2618 pStartNorm = data->writableBuffer + (nulltermsize - normLen);
2619 *(pStartNorm - 1) = 0;
2620 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2621 status);
2622
2623 data->pos = data->writableBuffer + nulltermsize;
2624 data->origFlags = data->flags;
2625 data->flags |= UCOL_ITER_INNORMBUF;
2626 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2627 }
2628
2629 /**
2630 * Contraction character management function that returns the previous character
2631 * for the backwards iterator.
2632 * Does nothing if the previous character is in buffer and not the first
2633 * character in it.
2634 * Else it checks previous character in data string to see if it is
2635 * normalizable.
2636 * If it is not, the character is simply copied into the buffer, else
2637 * the whole normalized substring is copied into the buffer, including the
2638 * current character.
2639 * @param data collation element iterator data
2640 * @return previous character
2641 */
2642 static
2643 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2644 {
2645 UChar prevch;
2646 UChar ch;
2647 UChar *start;
2648 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2649 UChar *pNull = NULL;
2650 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2651 (innormbuf && *(data->pos - 1) != 0)) {
2652 /*
2653 if no normalization.
2654 if previous character is in normalized buffer, no further normalization
2655 is required
2656 */
2657 if(data->flags & UCOL_USE_ITERATOR) {
2658 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2659 return (UChar)data->iterator->next(data->iterator);
2660 } else {
2661 return *(data->pos - 1);
2662 }
2663 }
2664
2665 start = data->pos;
2666 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2667 /* in data string */
2668 if ((start - 1) == data->string) {
2669 return *(start - 1);
2670 }
2671 start --;
2672 ch = *start;
2673 prevch = *(start - 1);
2674 }
2675 else {
2676 /*
2677 in writable buffer, at this point fcdPosition can not be NULL.
2678 see contracting tag.
2679 */
2680 if (data->fcdPosition == data->string) {
2681 /* at the start of the string, just dump it into the normalizer */
2682 insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2683 data->fcdPosition = NULL;
2684 return *(data->pos - 1);
2685 }
2686 pNull = data->pos - 1;
2687 start = data->fcdPosition;
2688 ch = *start;
2689 prevch = *(start - 1);
2690 }
2691 /*
2692 * if the current character is not fcd.
2693 * Trailing combining class == 0.
2694 */
2695 if (data->fcdPosition > start &&
2696 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2697 {
2698 /*
2699 Need a more complete FCD check and possible normalization.
2700 normalize substring will be appended to buffer
2701 */
2702 UChar *backuppos = data->pos;
2703 data->pos = start;
2704 if (collPrevIterFCD(data)) {
2705 normalizePrevContraction(data, status);
2706 return *(data->pos - 1);
2707 }
2708 data->pos = backuppos;
2709 data->fcdPosition ++;
2710 }
2711
2712 if (innormbuf) {
2713 /*
2714 no normalization is to be done hence only one character will be
2715 appended to the buffer.
2716 */
2717 insertBufferFront(data, pNull, ch);
2718 data->fcdPosition --;
2719 }
2720
2721 return ch;
2722 }
2723
2724 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2725 /* It is called by getNextCE */
2726
2727 /* The following should be even */
2728 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2729
2730 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2731 collIterateState entryState;
2732 backupState(source, &entryState);
2733 UChar32 cp = ch;
2734
2735 for (;;) {
2736 // This loop will repeat only in the case of contractions, and only when a contraction
2737 // is found and the first CE resulting from that contraction is itself a special
2738 // (an expansion, for example.) All other special CE types are fully handled the
2739 // first time through, and the loop exits.
2740
2741 const uint32_t *CEOffset = NULL;
2742 switch(getCETag(CE)) {
2743 case NOT_FOUND_TAG:
2744 /* This one is not found, and we'll let somebody else bother about it... no more games */
2745 return CE;
2746 case SPEC_PROC_TAG:
2747 {
2748 // Special processing is getting a CE that is preceded by a certain prefix
2749 // Currently this is only needed for optimizing Japanese length and iteration marks.
2750 // When we encouter a special processing tag, we go backwards and try to see if
2751 // we have a match.
2752 // Contraction tables are used - so the whole process is not unlike contraction.
2753 // prefix data is stored backwards in the table.
2754 const UChar *UCharOffset;
2755 UChar schar, tchar;
2756 collIterateState prefixState;
2757 backupState(source, &prefixState);
2758 loadState(source, &entryState, TRUE);
2759 goBackOne(source); // We want to look at the point where we entered - actually one
2760 // before that...
2761
2762 for(;;) {
2763 // This loop will run once per source string character, for as long as we
2764 // are matching a potential contraction sequence
2765
2766 // First we position ourselves at the begining of contraction sequence
2767 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2768 if (collIter_bos(source)) {
2769 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2770 break;
2771 }
2772 schar = getPrevNormalizedChar(source, status);
2773 goBackOne(source);
2774
2775 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2776 UCharOffset++;
2777 }
2778
2779 if (schar == tchar) {
2780 // Found the source string char in the table.
2781 // Pick up the corresponding CE from the table.
2782 CE = *(coll->contractionCEs +
2783 (UCharOffset - coll->contractionIndex));
2784 }
2785 else
2786 {
2787 // Source string char was not in the table.
2788 // We have not found the prefix.
2789 CE = *(coll->contractionCEs +
2790 (ContractionStart - coll->contractionIndex));
2791 }
2792
2793 if(!isPrefix(CE)) {
2794 // The source string char was in the contraction table, and the corresponding
2795 // CE is not a prefix CE. We found the prefix, break
2796 // out of loop, this CE will end up being returned. This is the normal
2797 // way out of prefix handling when the source actually contained
2798 // the prefix.
2799 break;
2800 }
2801 }
2802 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2803 loadState(source, &prefixState, TRUE);
2804 if(source->origFlags & UCOL_USE_ITERATOR) {
2805 source->flags = source->origFlags;
2806 }
2807 } else { // prefix search was a failure, we have to backup all the way to the start
2808 loadState(source, &entryState, TRUE);
2809 }
2810 break;
2811 }
2812 case CONTRACTION_TAG:
2813 {
2814 /* This should handle contractions */
2815 collIterateState state;
2816 backupState(source, &state);
2817 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2818 const UChar *UCharOffset;
2819 UChar schar, tchar;
2820
2821 for (;;) {
2822 /* This loop will run once per source string character, for as long as we */
2823 /* are matching a potential contraction sequence */
2824
2825 /* First we position ourselves at the begining of contraction sequence */
2826 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2827
2828 if (collIter_eos(source)) {
2829 // Ran off the end of the source string.
2830 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2831 // So we'll pick whatever we have at the point...
2832 if (CE == UCOL_NOT_FOUND) {
2833 // back up the source over all the chars we scanned going into this contraction.
2834 CE = firstCE;
2835 loadState(source, &state, TRUE);
2836 if(source->origFlags & UCOL_USE_ITERATOR) {
2837 source->flags = source->origFlags;
2838 }
2839 }
2840 break;
2841 }
2842
2843 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2844 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2845
2846 schar = getNextNormalizedChar(source);
2847 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2848 UCharOffset++;
2849 }
2850
2851 if (schar == tchar) {
2852 // Found the source string char in the contraction table.
2853 // Pick up the corresponding CE from the table.
2854 CE = *(coll->contractionCEs +
2855 (UCharOffset - coll->contractionIndex));
2856 }
2857 else
2858 {
2859 // Source string char was not in contraction table.
2860 // Unless we have a discontiguous contraction, we have finished
2861 // with this contraction.
2862 // in order to do the proper detection, we
2863 // need to see if we're dealing with a supplementary
2864 /* We test whether the next two char are surrogate pairs.
2865 * This test is done if the iterator is not NULL.
2866 * If there is no surrogate pair, the iterator
2867 * goes back one if needed. */
2868 UChar32 miss = schar;
2869 if (source->iterator) {
2870 UChar32 surrNextChar; /* the next char in the iteration to test */
2871 int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2872 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2873 prevPos = source->iterator->index;
2874 surrNextChar = getNextNormalizedChar(source);
2875 if (U16_IS_TRAIL(surrNextChar)) {
2876 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2877 } else if (prevPos < source->iterator->index){
2878 goBackOne(source);
2879 }
2880 }
2881 } else if (U16_IS_LEAD(schar)) {
2882 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2883 }
2884
2885 uint8_t sCC;
2886 if (miss < 0x300 ||
2887 maxCC == 0 ||
2888 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2889 sCC>maxCC ||
2890 (allSame != 0 && sCC == maxCC) ||
2891 collIter_eos(source))
2892 {
2893 // Contraction can not be discontiguous.
2894 goBackOne(source); // back up the source string by one,
2895 // because the character we just looked at was
2896 // not part of the contraction. */
2897 if(U_IS_SUPPLEMENTARY(miss)) {
2898 goBackOne(source);
2899 }
2900 CE = *(coll->contractionCEs +
2901 (ContractionStart - coll->contractionIndex));
2902 } else {
2903 //
2904 // Contraction is possibly discontiguous.
2905 // Scan more of source string looking for a match
2906 //
2907 UChar tempchar;
2908 /* find the next character if schar is not a base character
2909 and we are not yet at the end of the string */
2910 tempchar = getNextNormalizedChar(source);
2911 // probably need another supplementary thingie here
2912 goBackOne(source);
2913 if (i_getCombiningClass(tempchar, coll) == 0) {
2914 goBackOne(source);
2915 if(U_IS_SUPPLEMENTARY(miss)) {
2916 goBackOne(source);
2917 }
2918 /* Spit out the last char of the string, wasn't tasty enough */
2919 CE = *(coll->contractionCEs +
2920 (ContractionStart - coll->contractionIndex));
2921 } else {
2922 CE = getDiscontiguous(coll, source, ContractionStart);
2923 }
2924 }
2925 } // else after if(schar == tchar)
2926
2927 if(CE == UCOL_NOT_FOUND) {
2928 /* The Source string did not match the contraction that we were checking. */
2929 /* Back up the source position to undo the effects of having partially */
2930 /* scanned through what ultimately proved to not be a contraction. */
2931 loadState(source, &state, TRUE);
2932 CE = firstCE;
2933 break;
2934 }
2935
2936 if(!isContraction(CE)) {
2937 // The source string char was in the contraction table, and the corresponding
2938 // CE is not a contraction CE. We completed the contraction, break
2939 // out of loop, this CE will end up being returned. This is the normal
2940 // way out of contraction handling when the source actually contained
2941 // the contraction.
2942 break;
2943 }
2944
2945
2946 // The source string char was in the contraction table, and the corresponding
2947 // CE is IS a contraction CE. We will continue looping to check the source
2948 // string for the remaining chars in the contraction.
2949 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2950 if(tempCE != UCOL_NOT_FOUND) {
2951 // We have scanned a a section of source string for which there is a
2952 // CE from the contraction table. Remember the CE and scan position, so
2953 // that we can return to this point if further scanning fails to
2954 // match a longer contraction sequence.
2955 firstCE = tempCE;
2956
2957 goBackOne(source);
2958 backupState(source, &state);
2959 getNextNormalizedChar(source);
2960
2961 // Another way to do this is:
2962 //collIterateState tempState;
2963 //backupState(source, &tempState);
2964 //goBackOne(source);
2965 //backupState(source, &state);
2966 //loadState(source, &tempState, TRUE);
2967
2968 // The problem is that for incomplete contractions we have to remember the previous
2969 // position. Before, the only thing I needed to do was state.pos--;
2970 // After iterator introduction and especially after introduction of normalizing
2971 // iterators, it became much more difficult to decrease the saved state.
2972 // I'm not yet sure which of the two methods above is faster.
2973 }
2974 } // for(;;)
2975 break;
2976 } // case CONTRACTION_TAG:
2977 case LONG_PRIMARY_TAG:
2978 {
2979 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2980 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2981 source->offsetRepeatCount += 1;
2982 return CE;
2983 }
2984 case EXPANSION_TAG:
2985 {
2986 /* This should handle expansion. */
2987 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2988 /* I have to decide where continuations are going to be dealt with */
2989 uint32_t size;
2990 uint32_t i; /* general counter */
2991
2992 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2993 size = getExpansionCount(CE);
2994 CE = *CEOffset++;
2995 //source->offsetRepeatCount = -1;
2996
2997 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2998 for(i = 1; i<size; i++) {
2999 *(source->CEpos++) = *CEOffset++;
3000 source->offsetRepeatCount += 1;
3001 }
3002 } else { /* else, we do */
3003 while(*CEOffset != 0) {
3004 *(source->CEpos++) = *CEOffset++;
3005 source->offsetRepeatCount += 1;
3006 }
3007 }
3008
3009 return CE;
3010 }
3011 case DIGIT_TAG:
3012 {
3013 /*
3014 We do a check to see if we want to collate digits as numbers; if so we generate
3015 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3016 */
3017 //uint32_t size;
3018 uint32_t i; /* general counter */
3019
3020 if (source->coll->numericCollation == UCOL_ON){
3021 collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3022 UChar32 char32 = 0;
3023 int32_t digVal = 0;
3024
3025 uint32_t digIndx = 0;
3026 uint32_t endIndex = 0;
3027 uint32_t trailingZeroIndex = 0;
3028
3029 uint8_t collateVal = 0;
3030
3031 UBool nonZeroValReached = FALSE;
3032
3033 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3034 /*
3035 We parse the source string until we hit a char that's NOT a digit.
3036 Use this u_charDigitValue. This might be slow because we have to
3037 handle surrogates...
3038 */
3039 /*
3040 if (U16_IS_LEAD(ch)){
3041 if (!collIter_eos(source)) {
3042 backupState(source, &digitState);
3043 UChar trail = getNextNormalizedChar(source);
3044 if(U16_IS_TRAIL(trail)) {
3045 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3046 } else {
3047 loadState(source, &digitState, TRUE);
3048 char32 = ch;
3049 }
3050 } else {
3051 char32 = ch;
3052 }
3053 } else {
3054 char32 = ch;
3055 }
3056 digVal = u_charDigitValue(char32);
3057 */
3058 digVal = u_charDigitValue(cp); // if we have arrived here, we have
3059 // already processed possible supplementaries that trigered the digit tag -
3060 // all supplementaries are marked in the UCA.
3061 /*
3062 We pad a zero in front of the first element anyways. This takes
3063 care of the (probably) most common case where people are sorting things followed
3064 by a single digit
3065 */
3066 digIndx++;
3067 for(;;){
3068 // Make sure we have enough space. No longer needed;
3069 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3070 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3071 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3072
3073 // Skipping over leading zeroes.
3074 if (digVal != 0) {
3075 nonZeroValReached = TRUE;
3076 }
3077 if (nonZeroValReached) {
3078 /*
3079 We parse the digit string into base 100 numbers (this fits into a byte).
3080 We only add to the buffer in twos, thus if we are parsing an odd character,
3081 that serves as the 'tens' digit while the if we are parsing an even one, that
3082 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3083 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3084 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3085 than all the other bytes.
3086 */
3087
3088 if (digIndx % 2 == 1){
3089 collateVal += (uint8_t)digVal;
3090
3091 // We don't enter the low-order-digit case unless we've already seen
3092 // the high order, or for the first digit, which is always non-zero.
3093 if (collateVal != 0)
3094 trailingZeroIndex = 0;
3095
3096 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3097 collateVal = 0;
3098 }
3099 else{
3100 // We drop the collation value into the buffer so if we need to do
3101 // a "front patch" we don't have to check to see if we're hitting the
3102 // last element.
3103 collateVal = (uint8_t)(digVal * 10);
3104
3105 // Check for trailing zeroes.
3106 if (collateVal == 0)
3107 {
3108 if (!trailingZeroIndex)
3109 trailingZeroIndex = (digIndx/2) + 2;
3110 }
3111 else
3112 trailingZeroIndex = 0;
3113
3114 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3115 }
3116 digIndx++;
3117 }
3118
3119 // Get next character.
3120 if (!collIter_eos(source)){
3121 ch = getNextNormalizedChar(source);
3122 if (U16_IS_LEAD(ch)){
3123 if (!collIter_eos(source)) {
3124 backupState(source, &digitState);
3125 UChar trail = getNextNormalizedChar(source);
3126 if(U16_IS_TRAIL(trail)) {
3127 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3128 } else {
3129 loadState(source, &digitState, TRUE);
3130 char32 = ch;
3131 }
3132 }
3133 } else {
3134 char32 = ch;
3135 }
3136
3137 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3138 // Resetting position to point to the next unprocessed char. We
3139 // overshot it when doing our test/set for numbers.
3140 if (char32 > 0xFFFF) { // For surrogates.
3141 loadState(source, &digitState, TRUE);
3142 //goBackOne(source);
3143 }
3144 goBackOne(source);
3145 break;
3146 }
3147 } else {
3148 break;
3149 }
3150 }
3151
3152 if (nonZeroValReached == FALSE){
3153 digIndx = 2;
3154 numTempBuf[2] = 6;
3155 }
3156
3157 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3158 if (digIndx % 2 != 0){
3159 /*
3160 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3161 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3162 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3163 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3164 */
3165
3166 for(i = 2; i < endIndex; i++){
3167 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3168 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3169 }
3170 --digIndx;
3171 }
3172
3173 // Subtract one off of the last byte.
3174 numTempBuf[endIndex-1] -= 1;
3175
3176 /*
3177 We want to skip over the first two slots in the buffer. The first slot
3178 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3179 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3180 */
3181 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3182 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3183
3184 // Now transfer the collation key to our collIterate struct.
3185 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3186 //size = ((endIndex+1) & ~1)/2;
3187 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3188 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3189 UCOL_BYTE_COMMON; // Tertiary weight.
3190 i = 2; // Reset the index into the buffer.
3191 while(i < endIndex)
3192 {
3193 uint32_t primWeight = numTempBuf[i++] << 8;
3194 if ( i < endIndex)
3195 primWeight |= numTempBuf[i++];
3196 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3197 }
3198
3199 } else {
3200 // no numeric mode, we'll just switch to whatever we stashed and continue
3201 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3202 CE = *CEOffset++;
3203 break;
3204 }
3205 return CE;
3206 }
3207 /* various implicits optimization */
3208 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3209 /* UCA is filled with these. Tailorings are NOT_FOUND */
3210 return getImplicit(cp, source);
3211 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3212 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3213 return getImplicit(cp, source);
3214 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3215 {
3216 static const uint32_t
3217 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3218 //const uint32_t LCount = 19;
3219 static const uint32_t VCount = 21;
3220 static const uint32_t TCount = 28;
3221 //const uint32_t NCount = VCount * TCount; // 588
3222 //const uint32_t SCount = LCount * NCount; // 11172
3223 uint32_t L = ch - SBase;
3224
3225 // divide into pieces
3226
3227 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3228 L /= TCount;
3229 uint32_t V = L % VCount;
3230 L /= VCount;
3231
3232 // offset them
3233
3234 L += LBase;
3235 V += VBase;
3236 T += TBase;
3237
3238 // return the first CE, but first put the rest into the expansion buffer
3239 if (!source->coll->image->jamoSpecial) { // FAST PATH
3240
3241 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3242 if (T != TBase) {
3243 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3244 }
3245
3246 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3247
3248 } else { // Jamo is Special
3249 // Since Hanguls pass the FCD check, it is
3250 // guaranteed that we won't be in
3251 // the normalization buffer if something like this happens
3252 // However, if we are using a uchar iterator and normalization
3253 // is ON, the Hangul that lead us here is going to be in that
3254 // normalization buffer. Here we want to restore the uchar
3255 // iterator state and pull out of the normalization buffer
3256 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3257 source->flags = source->origFlags; // restore the iterator
3258 source->pos = NULL;
3259 }
3260 // Move Jamos into normalization buffer
3261 source->writableBuffer[0] = (UChar)L;
3262 source->writableBuffer[1] = (UChar)V;
3263 if (T != TBase) {
3264 source->writableBuffer[2] = (UChar)T;
3265 source->writableBuffer[3] = 0;
3266 } else {
3267 source->writableBuffer[2] = 0;
3268 }
3269
3270 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3271 // after exhausting the writableBuffer
3272 source->pos = source->writableBuffer;
3273 source->origFlags = source->flags;
3274 source->flags |= UCOL_ITER_INNORMBUF;
3275 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3276
3277 return(UCOL_IGNORABLE);
3278 }
3279 }
3280 case SURROGATE_TAG:
3281 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3282 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3283 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3284 /* we return 0 (completely ignorable - per UCA specification */
3285 {
3286 UChar trail;
3287 collIterateState state;
3288 backupState(source, &state);
3289 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3290 // we chould have stepped one char forward and it might have turned that it
3291 // was not a trail surrogate. In that case, we have to backup.
3292 loadState(source, &state, TRUE);
3293 return 0;
3294 } else {
3295 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3296 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3297 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3298 // We need to backup
3299 loadState(source, &state, TRUE);
3300 return CE;
3301 }
3302 // calculate the supplementary code point value, if surrogate was not tailored
3303 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3304 }
3305 }
3306 break;
3307 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3308 UChar nextChar;
3309 if( source->flags & UCOL_USE_ITERATOR) {
3310 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3311 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3312 source->iterator->next(source->iterator);
3313 return getImplicit(cp, source);
3314 } else {
3315 return 0;
3316 }
3317 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3318 U_IS_TRAIL((nextChar=*source->pos))) {
3319 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3320 source->pos++;
3321 return getImplicit(cp, source);
3322 } else {
3323 return 0; /* completely ignorable */
3324 }
3325 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3326 return 0; /* broken surrogate sequence */
3327 case CHARSET_TAG:
3328 /* not yet implemented */
3329 /* probably after 1.8 */
3330 return UCOL_NOT_FOUND;
3331 default:
3332 *status = U_INTERNAL_PROGRAM_ERROR;
3333 CE=0;
3334 break;
3335 }
3336 if (CE <= UCOL_NOT_FOUND) break;
3337 }
3338 return CE;
3339 }
3340
3341
3342 /* now uses Mark's getImplicitPrimary code */
3343 static
3344 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3345 if(isNonChar(cp)) {
3346 return 0;
3347 }
3348
3349 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3350
3351 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3352 collationSource->toReturn = collationSource->CEpos;
3353
3354 if (collationSource->offsetBuffer == NULL) {
3355 collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3356 collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3357 collationSource->offsetStore = collationSource->offsetBuffer;
3358 }
3359
3360 // **** doesn't work if using iterator ****
3361 if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3362 collationSource->offsetRepeatCount = 1;
3363 } else {
3364 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3365
3366 *(collationSource->offsetStore++) = firstOffset;
3367 *(collationSource->offsetStore++) = firstOffset + 1;
3368
3369 collationSource->offsetReturn = collationSource->offsetStore - 1;
3370 *(collationSource->offsetBuffer) = firstOffset;
3371 if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3372 collationSource->offsetStore = collationSource->offsetBuffer;
3373 }
3374 }
3375
3376 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3377 }
3378
3379 /**
3380 * This function handles the special CEs like contractions, expansions,
3381 * surrogates, Thai.
3382 * It is called by both getPrevCE
3383 */
3384 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3385 collIterate *source,
3386 UErrorCode *status)
3387 {
3388 const uint32_t *CEOffset = NULL;
3389 UChar *UCharOffset = NULL;
3390 UChar schar;
3391 const UChar *constart = NULL;
3392 uint32_t size;
3393 UChar buffer[UCOL_MAX_BUFFER];
3394 uint32_t *endCEBuffer;
3395 UChar *strbuffer;
3396 int32_t noChars = 0;
3397 int32_t CECount = 0;
3398
3399 for(;;)
3400 {
3401 /* the only ces that loops are thai and contractions */
3402 switch (getCETag(CE))
3403 {
3404 case NOT_FOUND_TAG: /* this tag always returns */
3405 return CE;
3406
3407 case SPEC_PROC_TAG:
3408 {
3409 // Special processing is getting a CE that is preceded by a certain prefix
3410 // Currently this is only needed for optimizing Japanese length and iteration marks.
3411 // When we encouter a special processing tag, we go backwards and try to see if
3412 // we have a match.
3413 // Contraction tables are used - so the whole process is not unlike contraction.
3414 // prefix data is stored backwards in the table.
3415 const UChar *UCharOffset;
3416 UChar schar, tchar;
3417 collIterateState prefixState;
3418 backupState(source, &prefixState);
3419 for(;;) {
3420 // This loop will run once per source string character, for as long as we
3421 // are matching a potential contraction sequence
3422
3423 // First we position ourselves at the begining of contraction sequence
3424 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3425
3426 if (collIter_bos(source)) {
3427 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3428 break;
3429 }
3430 schar = getPrevNormalizedChar(source, status);
3431 goBackOne(source);
3432
3433 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3434 UCharOffset++;
3435 }
3436
3437 if (schar == tchar) {
3438 // Found the source string char in the table.
3439 // Pick up the corresponding CE from the table.
3440 CE = *(coll->contractionCEs +
3441 (UCharOffset - coll->contractionIndex));
3442 }
3443 else
3444 {
3445 // if there is a completely ignorable code point in the middle of
3446 // a prefix, we need to act as if it's not there
3447 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3448 // lone surrogates cannot be set to zero as it would break other processing
3449 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3450 // it's easy for BMP code points
3451 if(isZeroCE == 0) {
3452 continue;
3453 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3454 // for supplementary code points, we have to check the next one
3455 // situations where we are going to ignore
3456 // 1. beginning of the string: schar is a lone surrogate
3457 // 2. schar is a lone surrogate
3458 // 3. schar is a trail surrogate in a valid surrogate sequence
3459 // that is explicitly set to zero.
3460 if (!collIter_bos(source)) {
3461 UChar lead;
3462 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3463 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3464 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3465 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3466 if(finalCE == 0) {
3467 // this is a real, assigned completely ignorable code point
3468 goBackOne(source);
3469 continue;
3470 }
3471 }
3472 } else {
3473 // lone surrogate, completely ignorable
3474 continue;
3475 }
3476 } else {
3477 // lone surrogate at the beggining, completely ignorable
3478 continue;
3479 }
3480 }
3481 // Source string char was not in the table.
3482 // We have not found the prefix.
3483 CE = *(coll->contractionCEs +
3484 (ContractionStart - coll->contractionIndex));
3485 }
3486
3487 if(!isPrefix(CE)) {
3488 // The source string char was in the contraction table, and the corresponding
3489 // CE is not a prefix CE. We found the prefix, break
3490 // out of loop, this CE will end up being returned. This is the normal
3491 // way out of prefix handling when the source actually contained
3492 // the prefix.
3493 break;
3494 }
3495 }
3496 loadState(source, &prefixState, TRUE);
3497 break;
3498 }
3499
3500 case CONTRACTION_TAG:
3501 /* to ensure that the backwards and forwards iteration matches, we
3502 take the current region of most possible match and pass it through
3503 the forward iteration. this will ensure that the obstinate problem of
3504 overlapping contractions will not occur.
3505 */
3506 schar = peekCharacter(source, 0);
3507 constart = (UChar *)coll->image + getContractOffset(CE);
3508 if (isAtStartPrevIterate(source)
3509 /* commented away contraction end checks after adding the checks
3510 in getPrevCE */) {
3511 /* start of string or this is not the end of any contraction */
3512 CE = *(coll->contractionCEs +
3513 (constart - coll->contractionIndex));
3514 break;
3515 }
3516 strbuffer = buffer;
3517 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3518 *(UCharOffset --) = 0;
3519 noChars = 0;
3520 // have to swap thai characters
3521 while (ucol_unsafeCP(schar, coll)) {
3522 *(UCharOffset) = schar;
3523 noChars++;
3524 UCharOffset --;
3525 schar = getPrevNormalizedChar(source, status);
3526 goBackOne(source);
3527 // TODO: when we exhaust the contraction buffer,
3528 // it needs to get reallocated. The problem is
3529 // that the size depends on the string which is
3530 // not iterated over. However, since we're travelling
3531 // backwards, we already had to set the iterator at
3532 // the end - so we might as well know where we are?
3533 if (UCharOffset + 1 == buffer) {
3534 /* we have exhausted the buffer */
3535 int32_t newsize = 0;
3536 if(source->pos) { // actually dealing with a position
3537 newsize = source->pos - source->string + 1;
3538 } else { // iterator
3539 newsize = 4 * UCOL_MAX_BUFFER;
3540 }
3541 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3542 (newsize + UCOL_MAX_BUFFER));
3543 /* test for NULL */
3544 if (strbuffer == NULL) {
3545 *status = U_MEMORY_ALLOCATION_ERROR;
3546 return UCOL_NO_MORE_CES;
3547 }
3548 UCharOffset = strbuffer + newsize;
3549 uprv_memcpy(UCharOffset, buffer,
3550 UCOL_MAX_BUFFER * sizeof(UChar));
3551 UCharOffset --;
3552 }
3553 if ((source->pos && (source->pos == source->string ||
3554 ((source->flags & UCOL_ITER_INNORMBUF) &&
3555 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3556 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3557 break;
3558 }
3559 }
3560 /* adds the initial base character to the string */
3561 *(UCharOffset) = schar;
3562 noChars++;
3563
3564 int32_t offsetBias;
3565
3566 #if 0
3567 if (source->offsetReturn != NULL) {
3568 source->offsetStore = source->offsetReturn - noChars;
3569 }
3570
3571 // **** doesn't work if using iterator ****
3572 if (source->flags & UCOL_ITER_INNORMBUF) {
3573 if (source->fcdPosition == NULL) {
3574 offsetBias = 0;
3575 } else {
3576 offsetBias = (int32_t)(source->fcdPosition - source->string);
3577 }
3578 } else {
3579 offsetBias = (int32_t)(source->pos - source->string);
3580 }
3581
3582 #else
3583 // **** doesn't work if using iterator ****
3584 if (source->flags & UCOL_ITER_INNORMBUF) {
3585 #if 1
3586 offsetBias = -1;
3587 #else
3588 if (source->fcdPosition == NULL) {
3589 offsetBias = 0;
3590 } else {
3591 offsetBias = (int32_t)(source->fcdPosition - source->string);
3592 }
3593 #endif
3594 } else {
3595 offsetBias = (int32_t)(source->pos - source->string);
3596 }
3597 #endif
3598
3599 /* a new collIterate is used to simplify things, since using the current
3600 collIterate will mean that the forward and backwards iteration will
3601 share and change the same buffers. we don't want to get into that. */
3602 collIterate temp;
3603 int32_t rawOffset;
3604
3605 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3606 IInit_collIterate(coll, UCharOffset, noChars, &temp);
3607 temp.flags &= ~UCOL_ITER_NORM;
3608
3609 rawOffset = temp.pos - temp.string; // should always be zero?
3610 CE = ucol_IGetNextCE(coll, &temp, status);
3611
3612 if (source->extendCEs) {
3613 endCEBuffer = source->extendCEs + source->extendCEsSize;
3614 CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t);
3615 } else {
3616 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3617 CECount = (source->CEpos - source->CEs)/sizeof(uint32_t);
3618 }
3619
3620 if (source->offsetBuffer == NULL) {
3621 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3622 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3623 source->offsetStore = source->offsetBuffer;
3624 }
3625
3626 while (CE != UCOL_NO_MORE_CES) {
3627 *(source->CEpos ++) = CE;
3628
3629 if (offsetBias >= 0) {
3630 *(source->offsetStore ++) = rawOffset + offsetBias;
3631 }
3632
3633 CECount++;
3634 if (source->CEpos == endCEBuffer) {
3635 /* ran out of CE space, reallocate to new buffer.
3636 If reallocation fails, reset pointers and bail out,
3637 there's no guarantee of the right character position after
3638 this bail*/
3639 if (source->extendCEs == NULL) {
3640 source->extendCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t) *
3641 (source->extendCEsSize =UCOL_EXPAND_CE_BUFFER_SIZE + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3642 if (source->extendCEs == NULL) {
3643 // Handle error later.
3644 CECount = -1;
3645 } else {
3646 source->extendCEs = (uint32_t *)uprv_memcpy(source->extendCEs, source->CEs, UCOL_EXPAND_CE_BUFFER_SIZE * sizeof(uint32_t));
3647 }
3648 } else {
3649 uint32_t *tempBufCE = (uint32_t *)uprv_realloc(source->extendCEs,
3650 sizeof(uint32_t) * (source->extendCEsSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3651 if (tempBufCE == NULL) {
3652 // Handle error later.
3653 CECount = -1;
3654 }
3655 else {
3656 source->extendCEs = tempBufCE;
3657 }
3658 }
3659
3660 if (CECount == -1) {
3661 *status = U_MEMORY_ALLOCATION_ERROR;
3662 source->extendCEsSize = 0;
3663 source->CEpos = source->CEs;
3664 freeHeapWritableBuffer(&temp);
3665
3666 if (strbuffer != buffer) {
3667 uprv_free(strbuffer);
3668 }
3669
3670 return (uint32_t)UCOL_NULLORDER;
3671 }
3672
3673 source->CEpos = source->extendCEs + CECount;
3674 endCEBuffer = source->extendCEs + source->extendCEsSize;
3675 }
3676
3677 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
3678 int32_t storeIX = source->offsetStore - source->offsetBuffer;
3679 int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
3680 sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3681
3682 if (tob != NULL) {
3683 source->offsetBuffer = tob;
3684 source->offsetStore = &source->offsetBuffer[storeIX];
3685 source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
3686 } else {
3687 // memory error...
3688 *status = U_MEMORY_ALLOCATION_ERROR;
3689 source->CEpos = source->CEs;
3690 freeHeapWritableBuffer(&temp);
3691
3692 if (strbuffer != buffer) {
3693 uprv_free(strbuffer);
3694 }
3695
3696 return (uint32_t) UCOL_NULLORDER;
3697 }
3698 }
3699
3700 rawOffset = temp.pos - temp.string;
3701 CE = ucol_IGetNextCE(coll, &temp, status);
3702 }
3703
3704 if (source->offsetRepeatValue != 0) {
3705 if (CECount > noChars) {
3706 source->offsetRepeatCount += temp.offsetRepeatCount;
3707 } else {
3708 // **** does this really skip the right offsets? ****
3709 source->offsetReturn -= (noChars - CECount);
3710 }
3711 }
3712
3713 freeHeapWritableBuffer(&temp);
3714
3715 if (strbuffer != buffer) {
3716 uprv_free(strbuffer);
3717 }
3718
3719 if (offsetBias >= 0) {
3720 source->offsetReturn = source->offsetStore - 1;
3721 if (source->offsetReturn == source->offsetBuffer) {
3722 source->offsetStore = source->offsetBuffer;
3723 }
3724 }
3725
3726 source->toReturn = source->CEpos - 1;
3727 if (source->toReturn == source->CEs) {
3728 source->CEpos = source->CEs;
3729 }
3730
3731 return *(source->toReturn);
3732
3733 case LONG_PRIMARY_TAG:
3734 {
3735 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3736 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3737 source->toReturn = source->CEpos - 1;
3738
3739 if (source->offsetBuffer == NULL) {
3740 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3741 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3742 source->offsetStore = source->offsetBuffer;
3743 }
3744
3745 if (source->flags & UCOL_ITER_INNORMBUF) {
3746 source->offsetRepeatCount = 1;
3747 } else {
3748 int32_t firstOffset = (int32_t)(source->pos - source->string);
3749
3750 *(source->offsetStore++) = firstOffset;
3751 *(source->offsetStore++) = firstOffset + 1;
3752
3753 source->offsetReturn = source->offsetStore - 1;
3754 *(source->offsetBuffer) = firstOffset;
3755 if (source->offsetReturn == source->offsetBuffer) {
3756 source->offsetStore = source->offsetBuffer;
3757 }
3758 }
3759
3760
3761 return *(source->toReturn);
3762 }
3763
3764 case EXPANSION_TAG: /* this tag always returns */
3765 {
3766 /*
3767 This should handle expansion.
3768 NOTE: we can encounter both continuations and expansions in an expansion!
3769 I have to decide where continuations are going to be dealt with
3770 */
3771 int32_t firstOffset = (int32_t)(source->pos - source->string);
3772
3773 // **** doesn't work if using iterator ****
3774 if (source->offsetReturn != NULL) {
3775 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3776 source->offsetStore = source->offsetBuffer;
3777 }else {
3778 firstOffset = -1;
3779 }
3780 }
3781
3782 if (source->offsetBuffer == NULL) {
3783 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3784 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3785 source->offsetStore = source->offsetBuffer;
3786 }
3787
3788 /* find the offset to expansion table */
3789 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3790 size = getExpansionCount(CE);
3791 if (size != 0) {
3792 /*
3793 if there are less than 16 elements in expansion, we don't terminate
3794 */
3795 uint32_t count;
3796
3797 for (count = 0; count < size; count++) {
3798 *(source->CEpos ++) = *CEOffset++;
3799
3800 if (firstOffset >= 0) {
3801 *(source->offsetStore ++) = firstOffset + 1;
3802 }
3803 }
3804 } else {
3805 /* else, we do */
3806 while (*CEOffset != 0) {
3807 *(source->CEpos ++) = *CEOffset ++;
3808
3809 if (firstOffset >= 0) {
3810 *(source->offsetStore ++) = firstOffset + 1;
3811 }
3812 }
3813 }
3814
3815 if (firstOffset >= 0) {
3816 source->offsetReturn = source->offsetStore - 1;
3817 *(source->offsetBuffer) = firstOffset;
3818 if (source->offsetReturn == source->offsetBuffer) {
3819 source->offsetStore = source->offsetBuffer;
3820 }
3821 } else {
3822 source->offsetRepeatCount += size - 1;
3823 }
3824
3825 source->toReturn = source->CEpos - 1;
3826 // in case of one element expansion, we
3827 // want to immediately return CEpos
3828 if(source->toReturn == source->CEs) {
3829 source->CEpos = source->CEs;
3830 }
3831
3832 return *(source->toReturn);
3833 }
3834
3835 case DIGIT_TAG:
3836 {
3837 /*
3838 We do a check to see if we want to collate digits as numbers; if so we generate
3839 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3840 */
3841 //uint32_t size;
3842 uint32_t i; /* general counter */
3843
3844 if (source->coll->numericCollation == UCOL_ON){
3845 uint32_t digIndx = 0;
3846 uint32_t endIndex = 0;
3847 uint32_t leadingZeroIndex = 0;
3848 uint32_t trailingZeroCount = 0;
3849
3850 uint8_t collateVal = 0;
3851
3852 UBool nonZeroValReached = FALSE;
3853
3854 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3855 /*
3856 We parse the source string until we hit a char that's NOT a digit.
3857 Use this u_charDigitValue. This might be slow because we have to
3858 handle surrogates...
3859 */
3860 /*
3861 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3862 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3863 element we process when going backward. To determine how long that chunk might be, we may need to make
3864 two passes through the loop that collects digits - one to see how long the string is (and how much is
3865 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3866 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3867 element chunk after resetting the state to the initialState at the right side of the digit string.
3868 */
3869 uint32_t ceLimit = 0;
3870 UChar initial_ch = ch;
3871 collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3872 backupState(source, &initialState);
3873
3874 for(;;) {
3875 collIterateState state = {0,0,0,0,0,0,0,0,0};
3876 UChar32 char32 = 0;
3877 int32_t digVal = 0;
3878
3879 if (U16_IS_TRAIL (ch)) {
3880 if (!collIter_bos(source)){
3881 UChar lead = getPrevNormalizedChar(source, status);
3882 if(U16_IS_LEAD(lead)) {
3883 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3884 goBackOne(source);
3885 } else {
3886 char32 = ch;
3887 }
3888 } else {
3889 char32 = ch;
3890 }
3891 } else {
3892 char32 = ch;
3893 }
3894 digVal = u_charDigitValue(char32);
3895
3896 for(;;) {
3897 // Make sure we have enough space. No longer needed;
3898 // at this point the largest value of digIndx when we need to save data in numTempBuf
3899 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3900 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3901
3902 // Skip over trailing zeroes, and keep a count of them.
3903 if (digVal != 0)
3904 nonZeroValReached = TRUE;
3905
3906 if (nonZeroValReached) {
3907 /*
3908 We parse the digit string into base 100 numbers (this fits into a byte).
3909 We only add to the buffer in twos, thus if we are parsing an odd character,
3910 that serves as the 'tens' digit while the if we are parsing an even one, that
3911 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3912 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3913 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3914 than all the other bytes.
3915
3916 Since we're doing in this reverse we want to put the first digit encountered into the
3917 ones place and the second digit encountered into the tens place.
3918 */
3919
3920 if ((digIndx + trailingZeroCount) % 2 == 1) {
3921 // High-order digit case (tens place)
3922 collateVal += (uint8_t)(digVal * 10);
3923
3924 // We cannot set leadingZeroIndex unless it has been set for the
3925 // low-order digit. Therefore, all we can do for the high-order
3926 // digit is turn it off, never on.
3927 // The only time we will have a high digit without a low is for
3928 // the very first non-zero digit, so no zero check is necessary.
3929 if (collateVal != 0)
3930 leadingZeroIndex = 0;
3931
3932 // The first pass through, digIndx may exceed the limit, but in that case
3933 // we no longer care about numTempBuf contents since they will be discarded
3934 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3935 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3936 }
3937 collateVal = 0;
3938 } else {
3939 // Low-order digit case (ones place)
3940 collateVal = (uint8_t)digVal;
3941
3942 // Check for leading zeroes.
3943 if (collateVal == 0) {
3944 if (!leadingZeroIndex)
3945 leadingZeroIndex = (digIndx/2) + 2;
3946 } else
3947 leadingZeroIndex = 0;
3948
3949 // No need to write to buffer; the case of a last odd digit
3950 // is handled below.
3951 }
3952 ++digIndx;
3953 } else
3954 ++trailingZeroCount;
3955
3956 if (!collIter_bos(source)) {
3957 ch = getPrevNormalizedChar(source, status);
3958 //goBackOne(source);
3959 if (U16_IS_TRAIL(ch)) {
3960 backupState(source, &state);
3961 if (!collIter_bos(source)) {
3962 goBackOne(source);
3963 UChar lead = getPrevNormalizedChar(source, status);
3964
3965 if(U16_IS_LEAD(lead)) {
3966 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3967 } else {
3968 loadState(source, &state, FALSE);
3969 char32 = ch;
3970 }
3971 }
3972 } else
3973 char32 = ch;
3974
3975 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3976 if (char32 > 0xFFFF) {// For surrogates.
3977 loadState(source, &state, FALSE);
3978 }
3979 // Don't need to "reverse" the goBackOne call,
3980 // as this points to the next position to process..
3981 //if (char32 > 0xFFFF) // For surrogates.
3982 //getNextNormalizedChar(source);
3983 break;
3984 }
3985
3986 goBackOne(source);
3987 }else
3988 break;
3989 }
3990
3991 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3992 // our collation element is not too big, go ahead and finish with it
3993 break;
3994 }
3995 // our digit string is too long for a collation element;
3996 // set the limit for it, reset the state and begin again
3997 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3998 if ( ceLimit == 0 ) {
3999 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
4000 }
4001 ch = initial_ch;
4002 loadState(source, &initialState, FALSE);
4003 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
4004 collateVal = 0;
4005 nonZeroValReached = FALSE;
4006 }
4007
4008 if (! nonZeroValReached) {
4009 digIndx = 2;
4010 trailingZeroCount = 0;
4011 numTempBuf[2] = 6;
4012 }
4013
4014 if ((digIndx + trailingZeroCount) % 2 != 0) {
4015 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
4016 digIndx += 1; // The implicit leading zero
4017 }
4018 if (trailingZeroCount % 2 != 0) {
4019 // We had to consume one trailing zero for the low digit
4020 // of the least significant byte
4021 digIndx += 1; // The trailing zero not in the exponent
4022 trailingZeroCount -= 1;
4023 }
4024
4025 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
4026
4027 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4028 numTempBuf[2] -= 1;
4029
4030 /*
4031 We want to skip over the first two slots in the buffer. The first slot
4032 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4033 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4034 The exponent must be adjusted by the number of leading zeroes, and the number of
4035 trailing zeroes.
4036 */
4037 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
4038 uint32_t exponent = (digIndx+trailingZeroCount)/2;
4039 if (leadingZeroIndex)
4040 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
4041 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
4042
4043 // Now transfer the collation key to our collIterate struct.
4044 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4045 //size = ((endIndex+1) & ~1)/2;
4046 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
4047 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
4048 UCOL_BYTE_COMMON; // Tertiary weight.
4049 i = endIndex - 1; // Reset the index into the buffer.
4050 while(i >= 2) {
4051 uint32_t primWeight = numTempBuf[i--] << 8;
4052 if ( i >= 2)
4053 primWeight |= numTempBuf[i--];
4054 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
4055 }
4056
4057 source->toReturn = source->CEpos -1;
4058 return *(source->toReturn);
4059 } else {
4060 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4061 CE = *(CEOffset++);
4062 break;
4063 }
4064 }
4065
4066 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
4067 {
4068 static const uint32_t
4069 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4070 //const uint32_t LCount = 19;
4071 static const uint32_t VCount = 21;
4072 static const uint32_t TCount = 28;
4073 //const uint32_t NCount = VCount * TCount; /* 588 */
4074 //const uint32_t SCount = LCount * NCount; /* 11172 */
4075
4076 uint32_t L = ch - SBase;
4077 /*
4078 divide into pieces.
4079 we do it in this order since some compilers can do % and / in one
4080 operation
4081 */
4082 uint32_t T = L % TCount;
4083 L /= TCount;
4084 uint32_t V = L % VCount;
4085 L /= VCount;
4086
4087 /* offset them */
4088 L += LBase;
4089 V += VBase;
4090 T += TBase;
4091
4092 if (source->offsetBuffer == NULL) {
4093 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
4094 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
4095 source->offsetStore = source->offsetBuffer;
4096 }
4097
4098 int32_t firstOffset = (int32_t)(source->pos - source->string);
4099
4100 *(source->offsetStore++) = firstOffset;
4101
4102 /*
4103 * return the first CE, but first put the rest into the expansion buffer
4104 */
4105 if (!source->coll->image->jamoSpecial) {
4106 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4107 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4108 *(source->offsetStore++) = firstOffset + 1;
4109
4110 if (T != TBase) {
4111 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4112 *(source->offsetStore++) = firstOffset + 1;
4113 }
4114
4115 source->toReturn = source->CEpos - 1;
4116
4117 source->offsetReturn = source->offsetStore - 1;
4118 if (source->offsetReturn == source->offsetBuffer) {
4119 source->offsetStore = source->offsetBuffer;
4120 }
4121
4122 return *(source->toReturn);
4123 } else {
4124 // Since Hanguls pass the FCD check, it is
4125 // guaranteed that we won't be in
4126 // the normalization buffer if something like this happens
4127 // Move Jamos into normalization buffer
4128 /*
4129 Move the Jamos into the
4130 normalization buffer
4131 */
4132 UChar *tempbuffer = source->writableBuffer +
4133 (source->writableBufSize - 1);
4134 *(tempbuffer) = 0;
4135 if (T != TBase) {
4136 *(tempbuffer - 1) = (UChar)T;
4137 *(tempbuffer - 2) = (UChar)V;
4138 *(tempbuffer - 3) = (UChar)L;
4139 *(tempbuffer - 4) = 0;
4140 } else {
4141 *(tempbuffer - 1) = (UChar)V;
4142 *(tempbuffer - 2) = (UChar)L;
4143 *(tempbuffer - 3) = 0;
4144 }
4145
4146 /*
4147 Indicate where to continue in main input string after exhausting
4148 the writableBuffer
4149 */
4150 if (source->pos == source->string) {
4151 source->fcdPosition = NULL;
4152 } else {
4153 source->fcdPosition = source->pos-1;
4154 }
4155
4156 source->pos = tempbuffer;
4157 source->origFlags = source->flags;
4158 source->flags |= UCOL_ITER_INNORMBUF;
4159 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4160
4161 return(UCOL_IGNORABLE);
4162 }
4163 }
4164
4165 case IMPLICIT_TAG: /* everything that is not defined otherwise */
4166 #if 0
4167 if (source->offsetBuffer == NULL) {
4168 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
4169 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
4170 source->offsetStore = source->offsetBuffer;
4171 }
4172
4173 // **** doesn't work if using iterator ****
4174 if (source->flags & UCOL_ITER_INNORMBUF) {
4175 source->offsetRepeatCount = 1;
4176 } else {
4177 int32_t firstOffset = (int32_t)(source->pos - source->string);
4178
4179 *(source->offsetStore++) = firstOffset;
4180 *(source->offsetStore++) = firstOffset + 1;
4181
4182 source->offsetReturn = source->offsetStore - 1;
4183 if (source->offsetReturn == source->offsetBuffer) {
4184 source->offsetStore = source->offsetBuffer;
4185 }
4186 }
4187 #endif
4188
4189 return getPrevImplicit(ch, source);
4190
4191 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4192 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4193 return getPrevImplicit(ch, source);
4194
4195 case SURROGATE_TAG: /* This is a surrogate pair */
4196 /* essentially an engaged lead surrogate. */
4197 /* if you have encountered it here, it means that a */
4198 /* broken sequence was encountered and this is an error */
4199 return 0;
4200
4201 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
4202 return 0; /* broken surrogate sequence */
4203
4204 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4205 {
4206 UChar32 cp = 0;
4207 UChar prevChar;
4208 UChar *prev;
4209 if (isAtStartPrevIterate(source)) {
4210 /* we are at the start of the string, wrong place to be at */
4211 return 0;
4212 }
4213 if (source->pos != source->writableBuffer) {
4214 prev = source->pos - 1;
4215 } else {
4216 prev = source->fcdPosition;
4217 }
4218 prevChar = *prev;
4219
4220 /* Handles Han and Supplementary characters here.*/
4221 if (U16_IS_LEAD(prevChar)) {
4222 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4223 source->pos = prev;
4224 } else {
4225 return 0; /* completely ignorable */
4226 }
4227
4228 return getPrevImplicit(cp, source);
4229 }
4230
4231 /* UCA is filled with these. Tailorings are NOT_FOUND */
4232 /* not yet implemented */
4233 case CHARSET_TAG: /* this tag always returns */
4234 /* probably after 1.8 */
4235 return UCOL_NOT_FOUND;
4236
4237 default: /* this tag always returns */
4238 *status = U_INTERNAL_PROGRAM_ERROR;
4239 CE=0;
4240 break;
4241 }
4242
4243 if (CE <= UCOL_NOT_FOUND) {
4244 break;
4245 }
4246 }
4247
4248 return CE;
4249 }
4250
4251 /* This should really be a macro */
4252 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4253 /* anyway */
4254 static
4255 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4256 #ifdef UCOL_DEBUG
4257 fprintf(stderr, ".");
4258 #endif
4259 uint8_t *newStart = NULL;
4260 uint32_t offset = *secondaries-secStart;
4261
4262 if(secStart==second) {
4263 newStart=(uint8_t*)uprv_malloc(newSize);
4264 if(newStart==NULL) {
4265 *status = U_MEMORY_ALLOCATION_ERROR;
4266 return NULL;
4267 }
4268 uprv_memcpy(newStart, secStart, *secondaries-secStart);
4269 } else {
4270 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4271 if(newStart==NULL) {
4272 *status = U_MEMORY_ALLOCATION_ERROR;
4273 /* Since we're reallocating, return original reference so we don't loose it. */
4274 return secStart;
4275 }
4276 }
4277 *secondaries=newStart+offset;
4278 *secSize=newSize;
4279 return newStart;
4280 }
4281
4282
4283 /* This should really be a macro */
4284 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4285 /* secondaries in French */
4286 /*
4287 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4288 uint8_t temp;
4289 while(start<end) {
4290 temp = *start;
4291 *start++ = *end;
4292 *end-- = temp;
4293 }
4294 }
4295 */
4296
/*
 * Reverses in place the elements of type TYPE lying between the pointers
 * start and end, both inclusive.
 *
 * start and end must be modifiable pointer lvalues: the macro itself advances
 * start and moves end backwards until they meet or cross, so both have changed
 * when it finishes. Each argument is evaluated several times; do not pass
 * expressions with side effects.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon forms exactly one statement and can be used safely as the body of
 * an unbraced if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
4305
4306 /****************************************************************************/
4307 /* Following are the sortkey generation functions */
4308 /* */
4309 /****************************************************************************/
4310
4311 /**
4312 * Merge two sort keys.
4313 * This is useful, for example, to combine sort keys from first and last names
4314 * to sort such pairs.
4315 * Merged sort keys consider on each collation level the first part first entirely,
4316 * then the second one.
4317 * It is possible to merge multiple sort keys by consecutively merging
4318 * another one with the intermediate result.
4319 *
4320 * The length of the merge result is the sum of the lengths of the input sort keys
4321 * minus 1.
4322 *
4323 * @param src1 the first sort key
4324 * @param src1Length the length of the first sort key, including the zero byte at the end;
4325 * can be -1 if the function is to find the length
4326 * @param src2 the second sort key
4327 * @param src2Length the length of the second sort key, including the zero byte at the end;
4328 * can be -1 if the function is to find the length
4329 * @param dest the buffer where the merged sort key is written,
4330 * can be NULL if destCapacity==0
4331 * @param destCapacity the number of bytes in the dest buffer
4332 * @return the length of the merged sort key, src1Length+src2Length-1;
4333 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4334 * in which cases the contents of dest is undefined
4335 *
4336 * @draft
4337 */
4338 U_CAPI int32_t U_EXPORT2
4339 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4340 const uint8_t *src2, int32_t src2Length,
4341 uint8_t *dest, int32_t destCapacity) {
4342 int32_t destLength;
4343 uint8_t b;
4344
4345 /* check arguments */
4346 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4347 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4348 destCapacity<0 || (destCapacity>0 && dest==NULL)
4349 ) {
4350 /* error, attempt to write a zero byte and return 0 */
4351 if(dest!=NULL && destCapacity>0) {
4352 *dest=0;
4353 }
4354 return 0;
4355 }
4356
4357 /* check lengths and capacity */
4358 if(src1Length<0) {
4359 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4360 }
4361 if(src2Length<0) {
4362 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4363 }
4364
4365 destLength=src1Length+src2Length-1;
4366 if(destLength>destCapacity) {
4367 /* the merged sort key does not fit into the destination */
4368 return destLength;
4369 }
4370
4371 /* merge the sort keys with the same number of levels */
4372 while(*src1!=0 && *src2!=0) { /* while both have another level */
4373 /* copy level from src1 not including 00 or 01 */
4374 while((b=*src1)>=2) {
4375 ++src1;
4376 *dest++=b;
4377 }
4378
4379 /* add a 02 merge separator */
4380 *dest++=2;
4381
4382 /* copy level from src2 not including 00 or 01 */
4383 while((b=*src2)>=2) {
4384 ++src2;
4385 *dest++=b;
4386 }
4387
4388 /* if both sort keys have another level, then add a 01 level separator and continue */
4389 if(*src1==1 && *src2==1) {
4390 ++src1;
4391 ++src2;
4392 *dest++=1;
4393 }
4394 }
4395
4396 /*
4397 * here, at least one sort key is finished now, but the other one
4398 * might have some contents left from containing more levels;
4399 * that contents is just appended to the result
4400 */
4401 if(*src1!=0) {
4402 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4403 src2=src1;
4404 }
4405 /* append src2, "the other, unfinished sort key" */
4406 uprv_strcpy((char *)dest, (const char *)src2);
4407
4408 /* trust that neither sort key contained illegally embedded zero bytes */
4409 return destLength;
4410 }
4411
4412 /* sortkey API */
4413 U_CAPI int32_t U_EXPORT2
4414 ucol_getSortKey(const UCollator *coll,
4415 const UChar *source,
4416 int32_t sourceLength,
4417 uint8_t *result,
4418 int32_t resultLength)
4419 {
4420 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4421 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4422 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4423 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4424 }
4425
4426 UErrorCode status = U_ZERO_ERROR;
4427 int32_t keySize = 0;
4428
4429 if(source != NULL) {
4430 // source == NULL is actually an error situation, but we would need to
4431 // have an error code to return it. Until we introduce a new
4432 // API, it stays like this
4433
4434 /* this uses the function pointer that is set in updateinternalstate */
4435 /* currently, there are two funcs: */
4436 /*ucol_calcSortKey(...);*/
4437 /*ucol_calcSortKeySimpleTertiary(...);*/
4438
4439 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4440 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4441 // That's not good. Something unusual happened.
4442 // We don't know how much we initialized before we failed.
4443 // NULL terminate for safety.
4444 // We have no way say that we have generated a partial sort key.
4445 //result[0] = 0;
4446 //keySize = 0;
4447 //}
4448 }
4449 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4450 UTRACE_EXIT_STATUS(status);
4451 return keySize;
4452 }
4453
4454 /* this function is called by the C++ API for sortkey generation */
4455 U_CFUNC int32_t
4456 ucol_getSortKeyWithAllocation(const UCollator *coll,
4457 const UChar *source, int32_t sourceLength,
4458 uint8_t **pResult,
4459 UErrorCode *pErrorCode) {
4460 *pResult = 0;
4461 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4462 }
4463
/* Size in bytes of the on-stack buffer that ucol_getSortKeySize() uses to collect French secondaries before falling back to the heap. */
#define UCOL_FSEC_BUF_SIZE 256
4465
4466 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4467 /* or if we run out of space while making a sortkey and want to return ASAP */
4468 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4469 UErrorCode status = U_ZERO_ERROR;
4470 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4471 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4472 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4473 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4474 UBool compareIdent = (strength == UCOL_IDENTICAL);
4475 UBool doCase = (coll->caseLevel == UCOL_ON);
4476 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4477 //UBool qShifted = shifted && (compareQuad == 0);
4478 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4479 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4480 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4481 uint8_t *fSecs = fSecsBuff;
4482 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4483 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4484
4485 uint32_t variableTopValue = coll->variableTopValue;
4486 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4487 if(doHiragana) {
4488 UCOL_COMMON_BOT4++;
4489 /* allocate one more space for hiragana */
4490 }
4491 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4492
4493 uint32_t order = UCOL_NO_MORE_CES;
4494 uint8_t primary1 = 0;
4495 uint8_t primary2 = 0;
4496 uint8_t secondary = 0;
4497 uint8_t tertiary = 0;
4498 int32_t caseShift = 0;
4499 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4500
4501 uint8_t caseSwitch = coll->caseSwitch;
4502 uint8_t tertiaryMask = coll->tertiaryMask;
4503 uint8_t tertiaryCommon = coll->tertiaryCommon;
4504
4505 UBool wasShifted = FALSE;
4506 UBool notIsContinuation = FALSE;
4507 uint8_t leadPrimary = 0;
4508
4509
4510 for(;;) {
4511 order = ucol_IGetNextCE(coll, s, &status);
4512 if(order == UCOL_NO_MORE_CES) {
4513 break;
4514 }
4515
4516 if(order == 0) {
4517 continue;
4518 }
4519
4520 notIsContinuation = !isContinuation(order);
4521
4522
4523 if(notIsContinuation) {
4524 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4525 } else {
4526 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4527 }
4528 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4529 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4530 primary1 = (uint8_t)(order >> 8);
4531
4532
4533 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4534 || (!notIsContinuation && wasShifted))
4535 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4536 /* and other ignorables should be removed if following a shifted code point */
4537 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4538 /* we should just completely ignore it */
4539 continue;
4540 }
4541 if(compareQuad == 0) {
4542 if(c4 > 0) {
4543 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4544 c4 = 0;
4545 }
4546 currentSize++;
4547 if(primary2 != 0) {
4548 currentSize++;
4549 }
4550 }
4551 wasShifted = TRUE;
4552 } else {
4553 wasShifted = FALSE;
4554 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4555 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4556 /* calculate sortkey size */
4557 if(primary1 != UCOL_IGNORABLE) {
4558 if(notIsContinuation) {
4559 if(leadPrimary == primary1) {
4560 currentSize++;
4561 } else {
4562 if(leadPrimary != 0) {
4563 currentSize++;
4564 }
4565 if(primary2 == UCOL_IGNORABLE) {
4566 /* one byter, not compressed */
4567 currentSize++;
4568 leadPrimary = 0;
4569 }
4570 else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4571 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4572 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4573 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
4574 {
4575 /* not compressible */
4576 leadPrimary = 0;
4577 currentSize+=2;
4578 }
4579 else { /* compress */
4580 leadPrimary = primary1;
4581 currentSize+=2;
4582 }
4583 }
4584 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4585 currentSize++;
4586 if(primary2 != UCOL_IGNORABLE) {
4587 currentSize++;
4588 }
4589 }
4590 }
4591
4592 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4593 if(!isFrenchSec){
4594 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4595 c2++;
4596 } else {
4597 if(c2 > 0) {
4598 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4599 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4600 } else {
4601 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4602 }
4603 c2 = 0;
4604 }
4605 currentSize++;
4606 }
4607 } else {
4608 fSecs[fSecsLen++] = secondary;
4609 if(fSecsLen == fSecsMaxLen) {
4610 uint8_t *fSecsTemp;
4611 if(fSecs == fSecsBuff) {
4612 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4613 } else {
4614 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4615 }
4616 if(fSecsTemp == NULL) {
4617 status = U_MEMORY_ALLOCATION_ERROR;
4618 return 0;
4619 }
4620 fSecs = fSecsTemp;
4621 fSecsMaxLen *= 2;
4622 }
4623 if(notIsContinuation) {
4624 if (frenchStartPtr != NULL) {
4625 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4626 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4627 frenchStartPtr = NULL;
4628 }
4629 } else {
4630 if (frenchStartPtr == NULL) {
4631 frenchStartPtr = fSecs+fSecsLen-2;
4632 }
4633 frenchEndPtr = fSecs+fSecsLen-1;
4634 }
4635 }
4636 }
4637
4638 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4639 // do the case level if we need to do it. We don't want to calculate
4640 // case level for primary ignorables if we have only primary strength and case level
4641 // otherwise we would break well formedness of CEs
4642 if (caseShift == 0) {
4643 currentSize++;
4644 caseShift = UCOL_CASE_SHIFT_START;
4645 }
4646 if((tertiary&0x3F) > 0 && notIsContinuation) {
4647 caseShift--;
4648 if((tertiary &0xC0) != 0) {
4649 if (caseShift == 0) {
4650 currentSize++;
4651 caseShift = UCOL_CASE_SHIFT_START;
4652 }
4653 caseShift--;
4654 }
4655 }
4656 } else {
4657 if(notIsContinuation) {
4658 tertiary ^= caseSwitch;
4659 }
4660 }
4661
4662 tertiary &= tertiaryMask;
4663 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4664 if (tertiary == tertiaryCommon && notIsContinuation) {
4665 c3++;
4666 } else {
4667 if(c3 > 0) {
4668 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4669 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4670 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4671 } else {
4672 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4673 }
4674 c3 = 0;
4675 }
4676 currentSize++;
4677 }
4678 }
4679
4680 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4681 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4682 if(c4>0) { // Close this part
4683 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4684 c4 = 0;
4685 }
4686 currentSize++; // Add the Hiragana
4687 } else { // This wasn't Hiragana, so we can continue adding stuff
4688 c4++;
4689 }
4690 }
4691 }
4692 }
4693
4694 if(!isFrenchSec){
4695 if(c2 > 0) {
4696 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4697 }
4698 } else {
4699 uint32_t i = 0;
4700 if(frenchStartPtr != NULL) {
4701 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4702 }
4703 for(i = 0; i<fSecsLen; i++) {
4704 secondary = *(fSecs+fSecsLen-i-1);
4705 /* This is compression code. */
4706 if (secondary == UCOL_COMMON2) {
4707 ++c2;
4708 } else {
4709 if(c2 > 0) {
4710 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4711 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4712 } else {
4713 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4714 }
4715 c2 = 0;
4716 }
4717 currentSize++;
4718 }
4719 }
4720 if(c2 > 0) {
4721 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4722 }
4723 if(fSecs != fSecsBuff) {
4724 uprv_free(fSecs);
4725 }
4726 }
4727
4728 if(c3 > 0) {
4729 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4730 }
4731
4732 if(c4 > 0 && compareQuad == 0) {
4733 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4734 }
4735
4736 if(compareIdent) {
4737 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4738 }
4739 return currentSize;
4740 }
4741
4742 static
4743 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4744 if (caseShift == 0) {
4745 *(*cases)++ = UCOL_CASE_BYTE_START;
4746 caseShift = UCOL_CASE_SHIFT_START;
4747 }
4748 }
4749
// Bounded append. The running count 'size' is bumped unconditionally, so after a sequence of
// calls it tells how many bytes were requested, even when some of them did not fit.
// The byte itself is stored (and the cursor advanced) only while the cursor is below 'limit'.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        return;     // out of room: count it, drop it
    }
    *primaries = value;
    ++primaries;
}
4759
4760 // Packs the secondary buffer when processing French locale. Adds the terminator.
4761 static
4762 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4763 uint8_t secondary;
4764 int32_t count2 = 0;
4765 uint32_t i = 0, size = 0;
4766 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4767 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4768 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4769 if(frenchStartPtr != NULL) {
4770 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4771 }
4772 for(i = 0; i<*secsize; i++) {
4773 secondary = *(secondaries-i-1);
4774 /* This is compression code. */
4775 if (secondary == UCOL_COMMON2) {
4776 ++count2;
4777 } else {
4778 if (count2 > 0) {
4779 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4780 while (count2 > UCOL_TOP_COUNT2) {
4781 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4782 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4783 }
4784 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4785 } else {
4786 while (count2 > UCOL_BOT_COUNT2) {
4787 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4788 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4789 }
4790 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4791 }
4792 count2 = 0;
4793 }
4794 addWithIncrement(primaries, primEnd, size, secondary);
4795 }
4796 }
4797 if (count2 > 0) {
4798 while (count2 > UCOL_BOT_COUNT2) {
4799 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4800 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4801 }
4802 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4803 }
4804 *secsize = size;
4805 return primaries;
4806 }
4807
4808 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4809
4810 /* This is the sortkey work horse function */
4811 U_CFUNC int32_t U_CALLCONV
4812 ucol_calcSortKey(const UCollator *coll,
4813 const UChar *source,
4814 int32_t sourceLength,
4815 uint8_t **result,
4816 uint32_t resultLength,
4817 UBool allocateSKBuffer,
4818 UErrorCode *status)
4819 {
4820 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4821
4822 uint32_t i = 0; /* general purpose counter */
4823
4824 /* Stack allocated buffers for buffers we use */
4825 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4826
4827 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4828
4829 if(U_FAILURE(*status)) {
4830 return 0;
4831 }
4832
4833 if(primaries == NULL && allocateSKBuffer == TRUE) {
4834 primaries = *result = prim;
4835 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4836 }
4837
4838 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4839 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4840
4841 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4842
4843 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4844 UChar *normSource = normBuffer;
4845 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4846
4847 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4848
4849 UColAttributeValue strength = coll->strength;
4850
4851 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4852 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4853 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4854 UBool compareIdent = (strength == UCOL_IDENTICAL);
4855 UBool doCase = (coll->caseLevel == UCOL_ON);
4856 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4857 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4858 //UBool qShifted = shifted && (compareQuad == 0);
4859 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4860 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4861
4862 uint32_t variableTopValue = coll->variableTopValue;
4863 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4864 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4865 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4866 uint8_t UCOL_HIRAGANA_QUAD = 0;
4867 if(doHiragana) {
4868 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4869 /* allocate one more space for hiragana, value for hiragana */
4870 }
4871 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4872
4873 /* support for special features like caselevel and funky secondaries */
4874 uint8_t *frenchStartPtr = NULL;
4875 uint8_t *frenchEndPtr = NULL;
4876 uint32_t caseShift = 0;
4877
4878 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4879
4880 /* If we need to normalize, we'll do it all at once at the beginning! */
4881 UNormalizationMode normMode;
4882 if(compareIdent) {
4883 normMode = UNORM_NFD;
4884 } else if(coll->normalizationMode != UCOL_OFF) {
4885 normMode = UNORM_FCD;
4886 } else {
4887 normMode = UNORM_NONE;
4888 }
4889
4890 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4891 len = unorm_internalNormalize(normSource, normSourceLen,
4892 source, len,
4893 normMode, FALSE,
4894 status);
4895 if(*status == U_BUFFER_OVERFLOW_ERROR) {
4896 normSourceLen = len;
4897 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4898 if(normSource == NULL) {
4899 *status = U_MEMORY_ALLOCATION_ERROR;
4900 return 0;
4901 }
4902 *status = U_ZERO_ERROR;
4903 len = unorm_internalNormalize(normSource, normSourceLen,
4904 source, len,
4905 normMode, FALSE,
4906 status);
4907 }
4908
4909 if(U_FAILURE(*status)) {
4910 return 0;
4911 }
4912 source = normSource;
4913 }
4914
4915 collIterate s;
4916 IInit_collIterate(coll, (UChar *)source, len, &s);
4917 if(source == normSource) {
4918 s.flags &= ~UCOL_ITER_NORM;
4919 }
4920
4921 if(resultLength == 0 || primaries == NULL) {
4922 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4923 if(normSource != normBuffer) {
4924 uprv_free(normSource);
4925 }
4926 return keyLen;
4927 }
4928 uint8_t *primarySafeEnd = primaries + resultLength - 1;
4929 if(strength > UCOL_PRIMARY) {
4930 primarySafeEnd--;
4931 }
4932
4933 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4934
4935 uint8_t *primStart = primaries;
4936 uint8_t *secStart = secondaries;
4937 uint8_t *terStart = tertiaries;
4938 uint8_t *caseStart = cases;
4939 uint8_t *quadStart = quads;
4940
4941 uint32_t order = 0;
4942
4943 uint8_t primary1 = 0;
4944 uint8_t primary2 = 0;
4945 uint8_t secondary = 0;
4946 uint8_t tertiary = 0;
4947 uint8_t caseSwitch = coll->caseSwitch;
4948 uint8_t tertiaryMask = coll->tertiaryMask;
4949 int8_t tertiaryAddition = coll->tertiaryAddition;
4950 uint8_t tertiaryTop = coll->tertiaryTop;
4951 uint8_t tertiaryBottom = coll->tertiaryBottom;
4952 uint8_t tertiaryCommon = coll->tertiaryCommon;
4953 uint8_t caseBits = 0;
4954
4955 UBool finished = FALSE;
4956 UBool wasShifted = FALSE;
4957 UBool notIsContinuation = FALSE;
4958
4959 uint32_t prevBuffSize = 0;
4960
4961 uint32_t count2 = 0, count3 = 0, count4 = 0;
4962 uint8_t leadPrimary = 0;
4963
4964 for(;;) {
4965 for(i=prevBuffSize; i<minBufferSize; ++i) {
4966
4967 order = ucol_IGetNextCE(coll, &s, status);
4968 if(order == UCOL_NO_MORE_CES) {
4969 finished = TRUE;
4970 break;
4971 }
4972
4973 if(order == 0) {
4974 continue;
4975 }
4976
4977 notIsContinuation = !isContinuation(order);
4978
4979 if(notIsContinuation) {
4980 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4981 } else {
4982 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4983 }
4984
4985 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4986 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4987 primary1 = (uint8_t)(order >> 8);
4988
4989 /*if(notIsContinuation && scriptOrder != NULL) {
4990 primary1 = scriptOrder[primary1];
4991 }*/
4992
4993 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4994 || (!notIsContinuation && wasShifted))
4995 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4996 {
4997 /* and other ignorables should be removed if following a shifted code point */
4998 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4999 /* we should just completely ignore it */
5000 continue;
5001 }
5002 if(compareQuad == 0) {
5003 if(count4 > 0) {
5004 while (count4 > UCOL_BOT_COUNT4) {
5005 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5006 count4 -= UCOL_BOT_COUNT4;
5007 }
5008 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5009 count4 = 0;
5010 }
5011 /* We are dealing with a variable and we're treating them as shifted */
5012 /* This is a shifted ignorable */
5013 if(primary1 != 0) { /* we need to check this since we could be in continuation */
5014 *quads++ = primary1;
5015 }
5016 if(primary2 != 0) {
5017 *quads++ = primary2;
5018 }
5019 }
5020 wasShifted = TRUE;
5021 } else {
5022 wasShifted = FALSE;
5023 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5024 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5025 /* regular and simple sortkey calc */
5026 if(primary1 != UCOL_IGNORABLE) {
5027 if(notIsContinuation) {
5028 if(leadPrimary == primary1) {
5029 *primaries++ = primary2;
5030 } else {
5031 if(leadPrimary != 0) {
5032 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5033 }
5034 if(primary2 == UCOL_IGNORABLE) {
5035 /* one byter, not compressed */
5036 *primaries++ = primary1;
5037 leadPrimary = 0;
5038 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5039 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5040 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5041 /* not compressible */
5042 leadPrimary = 0;
5043 *primaries++ = primary1;
5044 if(primaries <= primarySafeEnd) {
5045 *primaries++ = primary2;
5046 }
5047 } else { /* compress */
5048 *primaries++ = leadPrimary = primary1;
5049 if(primaries <= primarySafeEnd) {
5050 *primaries++ = primary2;
5051 }
5052 }
5053 }
5054 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5055 *primaries++ = primary1;
5056 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
5057 *primaries++ = primary2; /* second part */
5058 }
5059 }
5060 }
5061
5062 if(secondary > compareSec) {
5063 if(!isFrenchSec) {
5064 /* This is compression code. */
5065 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5066 ++count2;
5067 } else {
5068 if (count2 > 0) {
5069 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5070 while (count2 > UCOL_TOP_COUNT2) {
5071 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5072 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5073 }
5074 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5075 } else {
5076 while (count2 > UCOL_BOT_COUNT2) {
5077 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5078 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5079 }
5080 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5081 }
5082 count2 = 0;
5083 }
5084 *secondaries++ = secondary;
5085 }
5086 } else {
5087 *secondaries++ = secondary;
5088 /* Do the special handling for French secondaries */
5089 /* We need to get continuation elements and do intermediate restore */
5090 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5091 if(notIsContinuation) {
5092 if (frenchStartPtr != NULL) {
5093 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5094 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
5095 frenchStartPtr = NULL;
5096 }
5097 } else {
5098 if (frenchStartPtr == NULL) {
5099 frenchStartPtr = secondaries - 2;
5100 }
5101 frenchEndPtr = secondaries-1;
5102 }
5103 }
5104 }
5105
5106 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
5107 // do the case level if we need to do it. We don't want to calculate
5108 // case level for primary ignorables if we have only primary strength and case level
5109 // otherwise we would break well formedness of CEs
5110 doCaseShift(&cases, caseShift);
5111 if(notIsContinuation) {
5112 caseBits = (uint8_t)(tertiary & 0xC0);
5113
5114 if(tertiary != 0) {
5115 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5116 if((caseBits & 0xC0) == 0) {
5117 *(cases-1) |= 1 << (--caseShift);
5118 } else {
5119 *(cases-1) |= 0 << (--caseShift);
5120 /* second bit */
5121 doCaseShift(&cases, caseShift);
5122 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
5123 }
5124 } else {
5125 if((caseBits & 0xC0) == 0) {
5126 *(cases-1) |= 0 << (--caseShift);
5127 } else {
5128 *(cases-1) |= 1 << (--caseShift);
5129 /* second bit */
5130 doCaseShift(&cases, caseShift);
5131 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
5132 }
5133 }
5134 }
5135
5136 }
5137 } else {
5138 if(notIsContinuation) {
5139 tertiary ^= caseSwitch;
5140 }
5141 }
5142
5143 tertiary &= tertiaryMask;
5144 if(tertiary > compareTer) {
5145 /* This is compression code. */
5146 /* sequence size check is included in the if clause */
5147 if (tertiary == tertiaryCommon && notIsContinuation) {
5148 ++count3;
5149 } else {
5150 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5151 tertiary += tertiaryAddition;
5152 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5153 tertiary -= tertiaryAddition;
5154 }
5155 if (count3 > 0) {
5156 if ((tertiary > tertiaryCommon)) {
5157 while (count3 > coll->tertiaryTopCount) {
5158 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5159 count3 -= (uint32_t)coll->tertiaryTopCount;
5160 }
5161 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5162 } else {
5163 while (count3 > coll->tertiaryBottomCount) {
5164 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5165 count3 -= (uint32_t)coll->tertiaryBottomCount;
5166 }
5167 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5168 }
5169 count3 = 0;
5170 }
5171 *tertiaries++ = tertiary;
5172 }
5173 }
5174
5175 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
5176 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5177 if(count4>0) { // Close this part
5178 while (count4 > UCOL_BOT_COUNT4) {
5179 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5180 count4 -= UCOL_BOT_COUNT4;
5181 }
5182 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5183 count4 = 0;
5184 }
5185 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5186 } else { // This wasn't Hiragana, so we can continue adding stuff
5187 count4++;
5188 }
5189 }
5190 }
5191
5192 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5193 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5194 IInit_collIterate(coll, (UChar *)source, len, &s);
5195 if(source == normSource) {
5196 s.flags &= ~UCOL_ITER_NORM;
5197 }
5198 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5199 *status = U_BUFFER_OVERFLOW_ERROR;
5200 finished = TRUE;
5201 break;
5202 } else { /* It's much nicer if we can actually reallocate */
5203 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
5204 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5205 if(U_SUCCESS(*status)) {
5206 *result = primStart;
5207 primarySafeEnd = primStart + resultLength - 1;
5208 if(strength > UCOL_PRIMARY) {
5209 primarySafeEnd--;
5210 }
5211 } else {
5212 /* We ran out of memory!? We can't recover. */
5213 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5214 finished = TRUE;
5215 break;
5216 }
5217 }
5218 }
5219 }
5220 if(finished) {
5221 break;
5222 } else {
5223 prevBuffSize = minBufferSize;
5224
5225 uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5226 if (frenchStartPtr != NULL) {
5227 frenchStartOffset = frenchStartPtr - secStart;
5228 frenchEndOffset = frenchEndPtr - secStart;
5229 }
5230 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5231 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5232 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5233 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5234 if(U_FAILURE(*status)) {
5235 /* We ran out of memory!? We can't recover. */
5236 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5237 break;
5238 }
5239 if (frenchStartPtr != NULL) {
5240 frenchStartPtr = secStart + frenchStartOffset;
5241 frenchEndPtr = secStart + frenchEndOffset;
5242 }
5243 minBufferSize *= 2;
5244 }
5245 }
5246
5247 /* Here, we are generally done with processing */
5248 /* bailing out would not be too productive */
5249
5250 if(U_SUCCESS(*status)) {
5251 sortKeySize += (primaries - primStart);
5252 /* we have done all the CE's, now let's put them together to form a key */
5253 if(compareSec == 0) {
5254 if (count2 > 0) {
5255 while (count2 > UCOL_BOT_COUNT2) {
5256 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5257 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5258 }
5259 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5260 }
5261 uint32_t secsize = secondaries-secStart;
5262 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5263 sortKeySize += secsize;
5264 if(sortKeySize <= resultLength) {
5265 *(primaries++) = UCOL_LEVELTERMINATOR;
5266 uprv_memcpy(primaries, secStart, secsize);
5267 primaries += secsize;
5268 } else {
5269 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5270 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5271 if(U_SUCCESS(*status)) {
5272 *result = primStart;
5273 *(primaries++) = UCOL_LEVELTERMINATOR;
5274 uprv_memcpy(primaries, secStart, secsize);
5275 primaries += secsize;
5276 }
5277 else {
5278 /* We ran out of memory!? We can't recover. */
5279 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5280 goto cleanup;
5281 }
5282 } else {
5283 *status = U_BUFFER_OVERFLOW_ERROR;
5284 }
5285 }
5286 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5287 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5288 sortKeySize += secsize;
5289 if(sortKeySize <= resultLength) { // if we managed to pack fine
5290 primaries = newPrim; // update the primary pointer
5291 } else { // overflow, need to reallocate and redo
5292 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5293 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5294 if(U_SUCCESS(*status)) {
5295 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5296 }
5297 else {
5298 /* We ran out of memory!? We can't recover. */
5299 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5300 goto cleanup;
5301 }
5302 } else {
5303 *status = U_BUFFER_OVERFLOW_ERROR;
5304 }
5305 }
5306 }
5307 }
5308
5309 if(doCase) {
5310 uint32_t casesize = cases - caseStart;
5311 sortKeySize += casesize;
5312 if(sortKeySize <= resultLength) {
5313 *(primaries++) = UCOL_LEVELTERMINATOR;
5314 uprv_memcpy(primaries, caseStart, casesize);
5315 primaries += casesize;
5316 } else {
5317 if(allocateSKBuffer == TRUE) {
5318 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5319 if(U_SUCCESS(*status)) {
5320 *result = primStart;
5321 *(primaries++) = UCOL_LEVELTERMINATOR;
5322 uprv_memcpy(primaries, caseStart, casesize);
5323 }
5324 else {
5325 /* We ran out of memory!? We can't recover. */
5326 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5327 goto cleanup;
5328 }
5329 } else {
5330 *status = U_BUFFER_OVERFLOW_ERROR;
5331 }
5332 }
5333 }
5334
5335 if(compareTer == 0) {
5336 if (count3 > 0) {
5337 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5338 while (count3 >= coll->tertiaryTopCount) {
5339 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5340 count3 -= (uint32_t)coll->tertiaryTopCount;
5341 }
5342 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5343 } else {
5344 while (count3 > coll->tertiaryBottomCount) {
5345 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5346 count3 -= (uint32_t)coll->tertiaryBottomCount;
5347 }
5348 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5349 }
5350 }
5351 uint32_t tersize = tertiaries - terStart;
5352 sortKeySize += tersize;
5353 if(sortKeySize <= resultLength) {
5354 *(primaries++) = UCOL_LEVELTERMINATOR;
5355 uprv_memcpy(primaries, terStart, tersize);
5356 primaries += tersize;
5357 } else {
5358 if(allocateSKBuffer == TRUE) {
5359 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5360 if(U_SUCCESS(*status)) {
5361 *result = primStart;
5362 *(primaries++) = UCOL_LEVELTERMINATOR;
5363 uprv_memcpy(primaries, terStart, tersize);
5364 }
5365 else {
5366 /* We ran out of memory!? We can't recover. */
5367 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5368 goto cleanup;
5369 }
5370 } else {
5371 *status = U_BUFFER_OVERFLOW_ERROR;
5372 }
5373 }
5374
5375 if(compareQuad == 0/*qShifted == TRUE*/) {
5376 if(count4 > 0) {
5377 while (count4 > UCOL_BOT_COUNT4) {
5378 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5379 count4 -= UCOL_BOT_COUNT4;
5380 }
5381 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5382 }
5383 uint32_t quadsize = quads - quadStart;
5384 sortKeySize += quadsize;
5385 if(sortKeySize <= resultLength) {
5386 *(primaries++) = UCOL_LEVELTERMINATOR;
5387 uprv_memcpy(primaries, quadStart, quadsize);
5388 primaries += quadsize;
5389 } else {
5390 if(allocateSKBuffer == TRUE) {
5391 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5392 if(U_SUCCESS(*status)) {
5393 *result = primStart;
5394 *(primaries++) = UCOL_LEVELTERMINATOR;
5395 uprv_memcpy(primaries, quadStart, quadsize);
5396 }
5397 else {
5398 /* We ran out of memory!? We can't recover. */
5399 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5400 goto cleanup;
5401 }
5402 } else {
5403 *status = U_BUFFER_OVERFLOW_ERROR;
5404 }
5405 }
5406 }
5407
5408 if(compareIdent) {
5409 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5410 if(sortKeySize <= resultLength) {
5411 *(primaries++) = UCOL_LEVELTERMINATOR;
5412 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5413 } else {
5414 if(allocateSKBuffer == TRUE) {
5415 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5416 if(U_SUCCESS(*status)) {
5417 *result = primStart;
5418 *(primaries++) = UCOL_LEVELTERMINATOR;
5419 u_writeIdenticalLevelRun(s.string, len, primaries);
5420 }
5421 else {
5422 /* We ran out of memory!? We can't recover. */
5423 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5424 goto cleanup;
5425 }
5426 } else {
5427 *status = U_BUFFER_OVERFLOW_ERROR;
5428 }
5429 }
5430 }
5431 }
5432 *(primaries++) = '\0';
5433 }
5434
5435 if(allocateSKBuffer == TRUE) {
5436 *result = (uint8_t*)uprv_malloc(sortKeySize);
5437 /* test for NULL */
5438 if (*result == NULL) {
5439 *status = U_MEMORY_ALLOCATION_ERROR;
5440 goto cleanup;
5441 }
5442 uprv_memcpy(*result, primStart, sortKeySize);
5443 if(primStart != prim) {
5444 uprv_free(primStart);
5445 }
5446 }
5447
5448 cleanup:
5449 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5450 /* NULL terminate for safety */
5451 **result = 0;
5452 }
5453 if(terStart != tert) {
5454 uprv_free(terStart);
5455 uprv_free(secStart);
5456 uprv_free(caseStart);
5457 uprv_free(quadStart);
5458 }
5459
5460 /* To avoid memory leak, free the offset buffer if necessary. */
5461 freeOffsetBuffer(&s);
5462
5463 if(normSource != normBuffer) {
5464 uprv_free(normSource);
5465 }
5466
5467 return sortKeySize;
5468 }
5469
5470
5471 U_CFUNC int32_t U_CALLCONV
5472 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5473 const UChar *source,
5474 int32_t sourceLength,
5475 uint8_t **result,
5476 uint32_t resultLength,
5477 UBool allocateSKBuffer,
5478 UErrorCode *status)
5479 {
5480 U_ALIGN_CODE(16);
5481
5482 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5483 uint32_t i = 0; /* general purpose counter */
5484
5485 /* Stack allocated buffers for buffers we use */
5486 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5487
5488 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5489
5490 if(U_FAILURE(*status)) {
5491 return 0;
5492 }
5493
5494 if(primaries == NULL && allocateSKBuffer == TRUE) {
5495 primaries = *result = prim;
5496 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5497 }
5498
5499 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5500
5501 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5502
5503 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5504 UChar *normSource = normBuffer;
5505 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5506
5507 int32_t len = sourceLength;
5508
5509 /* If we need to normalize, we'll do it all at once at the beginning! */
5510 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5511 len = unorm_internalNormalize(normSource, normSourceLen,
5512 source, len,
5513 UNORM_FCD, FALSE,
5514 status);
5515 if(*status == U_BUFFER_OVERFLOW_ERROR) {
5516 normSourceLen = len;
5517 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5518 if(normSource == NULL) {
5519 *status = U_MEMORY_ALLOCATION_ERROR;
5520 return 0;
5521 }
5522 *status = U_ZERO_ERROR;
5523 len = unorm_internalNormalize(normSource, normSourceLen,
5524 source, len,
5525 UNORM_FCD, FALSE,
5526 status);
5527 if(U_FAILURE(*status)) {
5528 /* Should never happen. */
5529 uprv_free(normSource);
5530 normSource = normBuffer;
5531 }
5532 }
5533
5534 if(U_FAILURE(*status)) {
5535 return 0;
5536 }
5537 source = normSource;
5538 }
5539
5540 collIterate s;
5541 IInit_collIterate(coll, (UChar *)source, len, &s);
5542 if(source == normSource) {
5543 s.flags &= ~UCOL_ITER_NORM;
5544 }
5545
5546 if(resultLength == 0 || primaries == NULL) {
5547 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5548 if(normSource != normBuffer) {
5549 uprv_free(normSource);
5550 }
5551 return t;
5552 }
5553
5554 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5555
5556 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5557
5558 uint8_t *primStart = primaries;
5559 uint8_t *secStart = secondaries;
5560 uint8_t *terStart = tertiaries;
5561
5562 uint32_t order = 0;
5563
5564 uint8_t primary1 = 0;
5565 uint8_t primary2 = 0;
5566 uint8_t secondary = 0;
5567 uint8_t tertiary = 0;
5568 uint8_t caseSwitch = coll->caseSwitch;
5569 uint8_t tertiaryMask = coll->tertiaryMask;
5570 int8_t tertiaryAddition = coll->tertiaryAddition;
5571 uint8_t tertiaryTop = coll->tertiaryTop;
5572 uint8_t tertiaryBottom = coll->tertiaryBottom;
5573 uint8_t tertiaryCommon = coll->tertiaryCommon;
5574
5575 uint32_t prevBuffSize = 0;
5576
5577 UBool finished = FALSE;
5578 UBool notIsContinuation = FALSE;
5579
5580 uint32_t count2 = 0, count3 = 0;
5581 uint8_t leadPrimary = 0;
5582
5583 for(;;) {
5584 for(i=prevBuffSize; i<minBufferSize; ++i) {
5585
5586 order = ucol_IGetNextCE(coll, &s, status);
5587
5588 if(order == 0) {
5589 continue;
5590 }
5591
5592 if(order == UCOL_NO_MORE_CES) {
5593 finished = TRUE;
5594 break;
5595 }
5596
5597 notIsContinuation = !isContinuation(order);
5598
5599 if(notIsContinuation) {
5600 tertiary = (uint8_t)((order & tertiaryMask));
5601 } else {
5602 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5603 }
5604 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5605 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5606 primary1 = (uint8_t)(order >> 8);
5607
5608 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5609 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5610 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5611 /* regular and simple sortkey calc */
5612 if(primary1 != UCOL_IGNORABLE) {
5613 if(notIsContinuation) {
5614 if(leadPrimary == primary1) {
5615 *primaries++ = primary2;
5616 } else {
5617 if(leadPrimary != 0) {
5618 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5619 }
5620 if(primary2 == UCOL_IGNORABLE) {
5621 /* one byter, not compressed */
5622 *primaries++ = primary1;
5623 leadPrimary = 0;
5624 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5625 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5626 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5627 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5628 /* not compressible */
5629 leadPrimary = 0;
5630 *primaries++ = primary1;
5631 *primaries++ = primary2;
5632 } else { /* compress */
5633 *primaries++ = leadPrimary = primary1;
5634 *primaries++ = primary2;
5635 }
5636 }
5637 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5638 *primaries++ = primary1;
5639 if(primary2 != UCOL_IGNORABLE) {
5640 *primaries++ = primary2; /* second part */
5641 }
5642 }
5643 }
5644
5645 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5646 /* This is compression code. */
5647 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5648 ++count2;
5649 } else {
5650 if (count2 > 0) {
5651 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5652 while (count2 > UCOL_TOP_COUNT2) {
5653 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5654 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5655 }
5656 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5657 } else {
5658 while (count2 > UCOL_BOT_COUNT2) {
5659 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5660 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5661 }
5662 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5663 }
5664 count2 = 0;
5665 }
5666 *secondaries++ = secondary;
5667 }
5668 }
5669
5670 if(notIsContinuation) {
5671 tertiary ^= caseSwitch;
5672 }
5673
5674 if(tertiary > 0) {
5675 /* This is compression code. */
5676 /* sequence size check is included in the if clause */
5677 if (tertiary == tertiaryCommon && notIsContinuation) {
5678 ++count3;
5679 } else {
5680 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5681 tertiary += tertiaryAddition;
5682 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5683 tertiary -= tertiaryAddition;
5684 }
5685 if (count3 > 0) {
5686 if ((tertiary > tertiaryCommon)) {
5687 while (count3 > coll->tertiaryTopCount) {
5688 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5689 count3 -= (uint32_t)coll->tertiaryTopCount;
5690 }
5691 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5692 } else {
5693 while (count3 > coll->tertiaryBottomCount) {
5694 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5695 count3 -= (uint32_t)coll->tertiaryBottomCount;
5696 }
5697 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5698 }
5699 count3 = 0;
5700 }
5701 *tertiaries++ = tertiary;
5702 }
5703 }
5704
5705 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5706 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5707 IInit_collIterate(coll, (UChar *)source, len, &s);
5708 if(source == normSource) {
5709 s.flags &= ~UCOL_ITER_NORM;
5710 }
5711 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5712 *status = U_BUFFER_OVERFLOW_ERROR;
5713 finished = TRUE;
5714 break;
5715 } else { /* It's much nicer if we can actually reallocate */
5716 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5717 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5718 if(U_SUCCESS(*status)) {
5719 *result = primStart;
5720 primarySafeEnd = primStart + resultLength - 2;
5721 } else {
5722 /* We ran out of memory!? We can't recover. */
5723 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5724 finished = TRUE;
5725 break;
5726 }
5727 }
5728 }
5729 }
5730 if(finished) {
5731 break;
5732 } else {
5733 prevBuffSize = minBufferSize;
5734 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5735 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5736 minBufferSize *= 2;
5737 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5738 /* We ran out of memory!? We can't recover. */
5739 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5740 break;
5741 }
5742 }
5743 }
5744
5745 if(U_SUCCESS(*status)) {
5746 sortKeySize += (primaries - primStart);
5747 /* we have done all the CE's, now let's put them together to form a key */
5748 if (count2 > 0) {
5749 while (count2 > UCOL_BOT_COUNT2) {
5750 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5751 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5752 }
5753 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5754 }
5755 uint32_t secsize = secondaries-secStart;
5756 sortKeySize += secsize;
5757 if(sortKeySize <= resultLength) {
5758 *(primaries++) = UCOL_LEVELTERMINATOR;
5759 uprv_memcpy(primaries, secStart, secsize);
5760 primaries += secsize;
5761 } else {
5762 if(allocateSKBuffer == TRUE) {
5763 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5764 if(U_SUCCESS(*status)) {
5765 *(primaries++) = UCOL_LEVELTERMINATOR;
5766 *result = primStart;
5767 uprv_memcpy(primaries, secStart, secsize);
5768 }
5769 else {
5770 /* We ran out of memory!? We can't recover. */
5771 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5772 goto cleanup;
5773 }
5774 } else {
5775 *status = U_BUFFER_OVERFLOW_ERROR;
5776 }
5777 }
5778
5779 if (count3 > 0) {
5780 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5781 while (count3 >= coll->tertiaryTopCount) {
5782 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5783 count3 -= (uint32_t)coll->tertiaryTopCount;
5784 }
5785 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5786 } else {
5787 while (count3 > coll->tertiaryBottomCount) {
5788 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5789 count3 -= (uint32_t)coll->tertiaryBottomCount;
5790 }
5791 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5792 }
5793 }
5794 uint32_t tersize = tertiaries - terStart;
5795 sortKeySize += tersize;
5796 if(sortKeySize <= resultLength) {
5797 *(primaries++) = UCOL_LEVELTERMINATOR;
5798 uprv_memcpy(primaries, terStart, tersize);
5799 primaries += tersize;
5800 } else {
5801 if(allocateSKBuffer == TRUE) {
5802 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5803 if(U_SUCCESS(*status)) {
5804 *result = primStart;
5805 *(primaries++) = UCOL_LEVELTERMINATOR;
5806 uprv_memcpy(primaries, terStart, tersize);
5807 }
5808 else {
5809 /* We ran out of memory!? We can't recover. */
5810 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5811 goto cleanup;
5812 }
5813 } else {
5814 *status = U_MEMORY_ALLOCATION_ERROR;
5815 }
5816 }
5817
5818 *(primaries++) = '\0';
5819 }
5820
5821 if(allocateSKBuffer == TRUE) {
5822 *result = (uint8_t*)uprv_malloc(sortKeySize);
5823 /* test for NULL */
5824 if (*result == NULL) {
5825 *status = U_MEMORY_ALLOCATION_ERROR;
5826 goto cleanup;
5827 }
5828 uprv_memcpy(*result, primStart, sortKeySize);
5829 if(primStart != prim) {
5830 uprv_free(primStart);
5831 }
5832 }
5833
5834 cleanup:
5835 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5836 /* NULL terminate for safety */
5837 **result = 0;
5838 }
5839 if(terStart != tert) {
5840 uprv_free(terStart);
5841 uprv_free(secStart);
5842 }
5843
5844 /* To avoid memory leak, free the offset buffer if necessary. */
5845 freeOffsetBuffer(&s);
5846
5847 if(normSource != normBuffer) {
5848 uprv_free(normSource);
5849 }
5850
5851 return sortKeySize;
5852 }
5853
5854 static inline
5855 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5856 UBool notIsContinuation = !isContinuation(CE);
5857 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5858 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5859 || (!notIsContinuation && *wasShifted))
5860 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5861 {
5862 // The stuff below should probably be in the sortkey code... maybe not...
5863 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5864 /* we should just completely ignore it */
5865 *wasShifted = TRUE;
5866 //continue;
5867 }
5868 //*wasShifted = TRUE;
5869 return TRUE;
5870 } else {
5871 *wasShifted = FALSE;
5872 return FALSE;
5873 }
5874 }
5875 static inline
5876 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5877 if(level < maxLevel) {
5878 dest[i++] = UCOL_LEVELTERMINATOR;
5879 } else {
5880 dest[i++] = 0;
5881 }
5882 }
5883
/**
 * Level identifiers used while generating a partial sort key.
 * The values are consecutive, starting at zero.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN       = 5,  /**< extra level, not used - but there are three bits to spend */
    UCOL_PSK_IDENTICAL  = 6,
    UCOL_PSK_NULL       = 7,  /**< level for the end of the sort key; will just produce zeros */
    UCOL_PSK_LIMIT      = 8   /**< one past the last level identifier */
};
5896
/**
 * Bit layout of the collation state word (stored in the state[1] field).
 * For every piece of state there is a pair of constants: shift the word
 * right by the *_SHIFT value, then AND the result with the *_MASK value
 * to obtain that piece.
 */
enum {
    /* level identifier - holds one of the UCOL_PSK_* level values; three bits */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /* Number of bytes of a primary or quaternary already written. It can
     * only be 0 or 1, since at most two bytes come from a primary or
     * quaternary. The same bit also denotes that the French secondary
     * level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /* was the last value shifted - 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /* How many French bytes have already been written (two bits).
     * For French, secondary values need to be reversed; continuations,
     * however, need to stay in order. So abc1c2c3de has to come out as
     * edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /* number of bytes of a bocsu sequence already written (two bits) */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /* Number of consumed CEs. The field starts at bit 9; note that the
     * mask is 19 bits wide, so values above 0x7FFFF cannot be stored.
     */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5922
// Macro calculating the number of expansion CEs available in (s), i.e. the
// distance between its CEpos and toReturn members.
// The whole expansion is parenthesized so that the macro stays correct when
// it is combined with other operators, e.g. !uprv_numAvailableExpCEs(s) or
// 2 * uprv_numAvailableExpCEs(s); without the outer parentheses those would
// bind to (s).CEpos only.
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5925
5926
/** Main sortkey part procedure. On the first call,
 * you should pass in a collator, an iterator, an empty state
 * (state[0] == state[1] == 0), a buffer to hold results,
 * the number of bytes you need and an error code pointer.
 * Make sure your buffer is big enough to hold the wanted
 * number of sortkey bytes. I don't check.
 * The only meaningful status you can get back is
 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
 * have been dealt a raw deal and that you probably won't
 * be able to use partial sortkey generation for this
 * particular combination of string and collator. This
 * is highly unlikely, but you should still check the error code.
 * Any other status means that you're not in a sane situation
 * anymore. After the first call, preserve the state values and
 * use them on subsequent calls to obtain more bytes of a sortkey.
 * Use until the number of bytes written is smaller than the requested
 * number of bytes. The generated sortkey is not compatible with the
 * one generated by ucol_getSortKey, as we don't do any compression.
 * However, levels are still terminated by a 1 (one) and the sortkey
 * is terminated by a 0 (zero). The identical level is the same as in the
 * regular sortkey - the internal bocu-1 implementation is used.
 * For the curious, although you cannot do much about this, here is
 * the structure of the state words.
 * state[0] - iterator state. Depends on the iterator implementation,
 *            but allows the iterator to continue where it stopped in
 *            the last iteration.
 * state[1] - collation processing state. Here is the distribution
 *            of the bits:
 *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary,
 *             quaternary, quin (we don't use this one), identical and
 *             null (producing only zeroes - the first one to terminate the
 *             sortkey and subsequent ones to fill the buffer).
 *   3       - byte count. Number of bytes written on the primary level.
 *   4       - was shifted. Whether the previous iteration finished in the
 *             shifted state.
 *   5, 6    - French continuation bytes written. See the comment in the enum.
 *   7, 8    - Bocsu bytes used. Number of bytes from a bocu sequence on
 *             the identical level.
 *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
 */
5968 U_CAPI int32_t U_EXPORT2
5969 ucol_nextSortKeyPart(const UCollator *coll,
5970 UCharIterator *iter,
5971 uint32_t state[2],
5972 uint8_t *dest, int32_t count,
5973 UErrorCode *status)
5974 {
5975 /* error checking */
5976 if(status==NULL || U_FAILURE(*status)) {
5977 return 0;
5978 }
5979 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5980 if( coll==NULL || iter==NULL ||
5981 state==NULL ||
5982 count<0 || (count>0 && dest==NULL)
5983 ) {
5984 *status=U_ILLEGAL_ARGUMENT_ERROR;
5985 UTRACE_EXIT_STATUS(status);
5986 return 0;
5987 }
5988
5989 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5990 coll, iter, state[0], state[1], dest, count);
5991
5992 if(count==0) {
5993 /* nothing to do */
5994 UTRACE_EXIT_VALUE(0);
5995 return 0;
5996 }
5997 /** Setting up situation according to the state we got from the previous iteration */
5998 // The state of the iterator from the previous invocation
5999 uint32_t iterState = state[0];
6000 // Has the last iteration ended in the shifted state
6001 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
6002 // What is the current level of the sortkey?
6003 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
6004 // Have we written only one byte from a two byte primary in the previous iteration?
6005 // Also on secondary level - have we finished with the French secondary?
6006 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
6007 // number of bytes in the continuation buffer for French
6008 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
6009 // Number of bytes already written from a bocsu sequence. Since
6010 // the longes bocsu sequence is 4 long, this can be up to 3.
6011 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
6012 // Number of elements that need to be consumed in this iteration because
6013 // the iterator returned UITER_NO_STATE at the end of the last iteration,
6014 // so we had to save the last valid state.
6015 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
6016
6017 /** values that depend on the collator attributes */
6018 // strength of the collator.
6019 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
6020 // maximal level of the partial sortkey. Need to take whether case level is done
6021 int32_t maxLevel = 0;
6022 if(strength < UCOL_TERTIARY) {
6023 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6024 maxLevel = UCOL_PSK_CASE;
6025 } else {
6026 maxLevel = strength;
6027 }
6028 } else {
6029 if(strength == UCOL_TERTIARY) {
6030 maxLevel = UCOL_PSK_TERTIARY;
6031 } else if(strength == UCOL_QUATERNARY) {
6032 maxLevel = UCOL_PSK_QUATERNARY;
6033 } else { // identical
6034 maxLevel = UCOL_IDENTICAL;
6035 }
6036 }
6037 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
6038 uint8_t UCOL_HIRAGANA_QUAD =
6039 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
6040 // Boundary value that decides whether a CE is shifted or not
6041 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
6042 // Are we doing French collation?
6043 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
6044
6045 /** initializing the collation state */
6046 UBool notIsContinuation = FALSE;
6047 uint32_t CE = UCOL_NO_MORE_CES;
6048
6049 collIterate s;
6050 IInit_collIterate(coll, NULL, -1, &s);
6051 s.iterator = iter;
6052 s.flags |= UCOL_USE_ITERATOR;
6053 // This variable tells us whether we have produced some other levels in this iteration
6054 // before we moved to the identical level. In that case, we need to switch the
6055 // type of the iterator.
6056 UBool doingIdenticalFromStart = FALSE;
6057 // Normalizing iterator
6058 // The division for the array length may truncate the array size to
6059 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6060 // for all platforms anyway.
6061 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6062 UNormIterator *normIter = NULL;
6063 // If the normalization is turned on for the collator and we are below identical level
6064 // we will use a FCD normalizing iterator
6065 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
6066 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6067 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
6068 s.flags &= ~UCOL_ITER_NORM;
6069 if(U_FAILURE(*status)) {
6070 UTRACE_EXIT_STATUS(*status);
6071 return 0;
6072 }
6073 } else if(level == UCOL_PSK_IDENTICAL) {
6074 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6075 // will be updating the state - and this cannot be done on an ordinary iterator.
6076 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6077 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6078 s.flags &= ~UCOL_ITER_NORM;
6079 if(U_FAILURE(*status)) {
6080 UTRACE_EXIT_STATUS(*status);
6081 return 0;
6082 }
6083 doingIdenticalFromStart = TRUE;
6084 }
6085
6086 // This is the tentative new state of the iterator. The problem
6087 // is that the iterator might return an undefined state, in
6088 // which case we should save the last valid state and increase
6089 // the iterator skip value.
6090 uint32_t newState = 0;
6091
6092 // First, we set the iterator to the last valid position
6093 // from the last iteration. This was saved in state[0].
6094 if(iterState == 0) {
6095 /* initial state */
6096 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
6097 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6098 } else {
6099 s.iterator->move(s.iterator, 0, UITER_START);
6100 }
6101 } else {
6102 /* reset to previous state */
6103 s.iterator->setState(s.iterator, iterState, status);
6104 if(U_FAILURE(*status)) {
6105 UTRACE_EXIT_STATUS(*status);
6106 return 0;
6107 }
6108 }
6109
6110
6111
6112 // This variable tells us whether we can attempt to update the state
6113 // of iterator. Situations where we don't want to update iterator state
6114 // are the existence of expansion CEs that are not yet processed, and
6115 // finishing the case level without enough space in the buffer to insert
6116 // a level terminator.
6117 UBool canUpdateState = TRUE;
6118
6119 // Consume all the CEs that were consumed at the end of the previous
6120 // iteration without updating the iterator state. On identical level,
6121 // consume the code points.
6122 int32_t counter = cces;
6123 if(level < UCOL_PSK_IDENTICAL) {
6124 while(counter-->0) {
6125 // If we're doing French and we are on the secondary level,
6126 // we go backwards.
6127 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6128 CE = ucol_IGetPrevCE(coll, &s, status);
6129 } else {
6130 CE = ucol_IGetNextCE(coll, &s, status);
6131 }
6132 if(CE==UCOL_NO_MORE_CES) {
6133 /* should not happen */
6134 *status=U_INTERNAL_PROGRAM_ERROR;
6135 UTRACE_EXIT_STATUS(*status);
6136 return 0;
6137 }
6138 if(uprv_numAvailableExpCEs(s)) {
6139 canUpdateState = FALSE;
6140 }
6141 }
6142 } else {
6143 while(counter-->0) {
6144 uiter_next32(s.iterator);
6145 }
6146 }
6147
6148 // French secondary needs to know whether the iterator state of zero came from previous level OR
6149 // from a new invocation...
6150 UBool wasDoingPrimary = FALSE;
6151 // destination buffer byte counter. When this guy
6152 // gets to count, we're done with the iteration
6153 int32_t i = 0;
6154 // used to count the zero bytes written after we
6155 // have finished with the sort key
6156 int32_t j = 0;
6157
6158
6159 // Hm.... I think we're ready to plunge in. Basic story is as following:
6160 // we have a fall through case based on level. This is used for initial
6161 // positioning on iteration start. Every level processor contains a
6162 // for(;;) which will be broken when we exhaust all the CEs. Other
6163 // way to exit is a goto saveState, which happens when we have filled
6164 // out our buffer.
6165 switch(level) {
6166 case UCOL_PSK_PRIMARY:
6167 wasDoingPrimary = TRUE;
6168 for(;;) {
6169 if(i==count) {
6170 goto saveState;
6171 }
6172 // We should save the state only if we
6173 // are sure that we are done with the
6174 // previous iterator state
6175 if(canUpdateState && byteCountOrFrenchDone == 0) {
6176 newState = s.iterator->getState(s.iterator);
6177 if(newState != UITER_NO_STATE) {
6178 iterState = newState;
6179 cces = 0;
6180 }
6181 }
6182 CE = ucol_IGetNextCE(coll, &s, status);
6183 cces++;
6184 if(CE==UCOL_NO_MORE_CES) {
6185 // Add the level separator
6186 terminatePSKLevel(level, maxLevel, i, dest);
6187 byteCountOrFrenchDone=0;
6188 // Restart the iteration an move to the
6189 // second level
6190 s.iterator->move(s.iterator, 0, UITER_START);
6191 cces = 0;
6192 level = UCOL_PSK_SECONDARY;
6193 break;
6194 }
6195 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6196 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6197 if(CE != 0) {
6198 if(byteCountOrFrenchDone == 0) {
6199 // get the second byte of primary
6200 dest[i++]=(uint8_t)(CE >> 8);
6201 } else {
6202 byteCountOrFrenchDone = 0;
6203 }
6204 if((CE &=0xff)!=0) {
6205 if(i==count) {
6206 /* overflow */
6207 byteCountOrFrenchDone = 1;
6208 cces--;
6209 goto saveState;
6210 }
6211 dest[i++]=(uint8_t)CE;
6212 }
6213 }
6214 }
6215 if(uprv_numAvailableExpCEs(s)) {
6216 canUpdateState = FALSE;
6217 } else {
6218 canUpdateState = TRUE;
6219 }
6220 }
6221 /* fall through to next level */
6222 case UCOL_PSK_SECONDARY:
6223 if(strength >= UCOL_SECONDARY) {
6224 if(!doingFrench) {
6225 for(;;) {
6226 if(i == count) {
6227 goto saveState;
6228 }
6229 // We should save the state only if we
6230 // are sure that we are done with the
6231 // previous iterator state
6232 if(canUpdateState) {
6233 newState = s.iterator->getState(s.iterator);
6234 if(newState != UITER_NO_STATE) {
6235 iterState = newState;
6236 cces = 0;
6237 }
6238 }
6239 CE = ucol_IGetNextCE(coll, &s, status);
6240 cces++;
6241 if(CE==UCOL_NO_MORE_CES) {
6242 // Add the level separator
6243 terminatePSKLevel(level, maxLevel, i, dest);
6244 byteCountOrFrenchDone = 0;
6245 // Restart the iteration an move to the
6246 // second level
6247 s.iterator->move(s.iterator, 0, UITER_START);
6248 cces = 0;
6249 level = UCOL_PSK_CASE;
6250 break;
6251 }
6252 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6253 CE >>= 8; /* get secondary */
6254 if(CE != 0) {
6255 dest[i++]=(uint8_t)CE;
6256 }
6257 }
6258 if(uprv_numAvailableExpCEs(s)) {
6259 canUpdateState = FALSE;
6260 } else {
6261 canUpdateState = TRUE;
6262 }
6263 }
6264 } else { // French secondary processing
6265 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6266 int32_t frenchIndex = 0;
6267 // Here we are going backwards.
6268 // If the iterator is at the beggining, it should be
6269 // moved to end.
6270 if(wasDoingPrimary) {
6271 s.iterator->move(s.iterator, 0, UITER_LIMIT);
6272 cces = 0;
6273 }
6274 for(;;) {
6275 if(i == count) {
6276 goto saveState;
6277 }
6278 if(canUpdateState) {
6279 newState = s.iterator->getState(s.iterator);
6280 if(newState != UITER_NO_STATE) {
6281 iterState = newState;
6282 cces = 0;
6283 }
6284 }
6285 CE = ucol_IGetPrevCE(coll, &s, status);
6286 cces++;
6287 if(CE==UCOL_NO_MORE_CES) {
6288 // Add the level separator
6289 terminatePSKLevel(level, maxLevel, i, dest);
6290 byteCountOrFrenchDone = 0;
6291 // Restart the iteration an move to the next level
6292 s.iterator->move(s.iterator, 0, UITER_START);
6293 level = UCOL_PSK_CASE;
6294 break;
6295 }
6296 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6297 // reverse when we get a first non-continuation CE.
6298 CE >>= 8;
6299 frenchBuff[frenchIndex++] = (uint8_t)CE;
6300 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6301 CE >>= 8; /* get secondary */
6302 if(!frenchIndex) {
6303 if(CE != 0) {
6304 dest[i++]=(uint8_t)CE;
6305 }
6306 } else {
6307 frenchBuff[frenchIndex++] = (uint8_t)CE;
6308 frenchIndex -= usedFrench;
6309 usedFrench = 0;
6310 while(i < count && frenchIndex) {
6311 dest[i++] = frenchBuff[--frenchIndex];
6312 usedFrench++;
6313 }
6314 }
6315 }
6316 if(uprv_numAvailableExpCEs(s)) {
6317 canUpdateState = FALSE;
6318 } else {
6319 canUpdateState = TRUE;
6320 }
6321 }
6322 }
6323 } else {
6324 level = UCOL_PSK_CASE;
6325 }
6326 /* fall through to next level */
6327 case UCOL_PSK_CASE:
6328 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6329 uint32_t caseShift = UCOL_CASE_SHIFT_START;
6330 uint8_t caseByte = UCOL_CASE_BYTE_START;
6331 uint8_t caseBits = 0;
6332
6333 for(;;) {
6334 if(i == count) {
6335 goto saveState;
6336 }
6337 // We should save the state only if we
6338 // are sure that we are done with the
6339 // previous iterator state
6340 if(canUpdateState) {
6341 newState = s.iterator->getState(s.iterator);
6342 if(newState != UITER_NO_STATE) {
6343 iterState = newState;
6344 cces = 0;
6345 }
6346 }
6347 CE = ucol_IGetNextCE(coll, &s, status);
6348 cces++;
6349 if(CE==UCOL_NO_MORE_CES) {
6350 // On the case level we might have an unfinished
6351 // case byte. Add one if it's started.
6352 if(caseShift != UCOL_CASE_SHIFT_START) {
6353 dest[i++] = caseByte;
6354 }
6355 cces = 0;
6356 // We have finished processing CEs on this level.
6357 // However, we don't know if we have enough space
6358 // to add a case level terminator.
6359 if(i < count) {
6360 // Add the level separator
6361 terminatePSKLevel(level, maxLevel, i, dest);
6362 // Restart the iteration and move to the
6363 // next level
6364 s.iterator->move(s.iterator, 0, UITER_START);
6365 level = UCOL_PSK_TERTIARY;
6366 } else {
6367 canUpdateState = FALSE;
6368 }
6369 break;
6370 }
6371
6372 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6373 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
6374 // do the case level if we need to do it. We don't want to calculate
6375 // case level for primary ignorables if we have only primary strength and case level
6376 // otherwise we would break well formedness of CEs
6377 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6378 caseBits = (uint8_t)(CE & 0xC0);
6379 // this copies the case level logic from the
6380 // sort key generation code
6381 if(CE != 0) {
6382 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6383 if((caseBits & 0xC0) == 0) {
6384 caseByte |= 1 << (--caseShift);
6385 } else {
6386 caseByte |= 0 << (--caseShift);
6387 /* second bit */
6388 if(caseShift == 0) {
6389 dest[i++] = caseByte;
6390 caseShift = UCOL_CASE_SHIFT_START;
6391 caseByte = UCOL_CASE_BYTE_START;
6392 }
6393 caseByte |= ((caseBits>>6)&1) << (--caseShift);
6394 }
6395 } else {
6396 if((caseBits & 0xC0) == 0) {
6397 caseByte |= 0 << (--caseShift);
6398 } else {
6399 caseByte |= 1 << (--caseShift);
6400 /* second bit */
6401 if(caseShift == 0) {
6402 dest[i++] = caseByte;
6403 caseShift = UCOL_CASE_SHIFT_START;
6404 caseByte = UCOL_CASE_BYTE_START;
6405 }
6406 caseByte |= ((caseBits>>7)&1) << (--caseShift);
6407 }
6408 }
6409 }
6410
6411 }
6412 }
6413 // Not sure this is correct for the case level - revisit
6414 if(uprv_numAvailableExpCEs(s)) {
6415 canUpdateState = FALSE;
6416 } else {
6417 canUpdateState = TRUE;
6418 }
6419 }
6420 } else {
6421 level = UCOL_PSK_TERTIARY;
6422 }
6423 /* fall through to next level */
6424 case UCOL_PSK_TERTIARY:
6425 if(strength >= UCOL_TERTIARY) {
6426 for(;;) {
6427 if(i == count) {
6428 goto saveState;
6429 }
6430 // We should save the state only if we
6431 // are sure that we are done with the
6432 // previous iterator state
6433 if(canUpdateState) {
6434 newState = s.iterator->getState(s.iterator);
6435 if(newState != UITER_NO_STATE) {
6436 iterState = newState;
6437 cces = 0;
6438 }
6439 }
6440 CE = ucol_IGetNextCE(coll, &s, status);
6441 cces++;
6442 if(CE==UCOL_NO_MORE_CES) {
6443 // Add the level separator
6444 terminatePSKLevel(level, maxLevel, i, dest);
6445 byteCountOrFrenchDone = 0;
6446 // Restart the iteration an move to the
6447 // second level
6448 s.iterator->move(s.iterator, 0, UITER_START);
6449 cces = 0;
6450 level = UCOL_PSK_QUATERNARY;
6451 break;
6452 }
6453 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6454 notIsContinuation = !isContinuation(CE);
6455
6456 if(notIsContinuation) {
6457 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6458 CE ^= coll->caseSwitch;
6459 CE &= coll->tertiaryMask;
6460 } else {
6461 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6462 }
6463
6464 if(CE != 0) {
6465 dest[i++]=(uint8_t)CE;
6466 }
6467 }
6468 if(uprv_numAvailableExpCEs(s)) {
6469 canUpdateState = FALSE;
6470 } else {
6471 canUpdateState = TRUE;
6472 }
6473 }
6474 } else {
6475 // if we're not doing tertiary
6476 // skip to the end
6477 level = UCOL_PSK_NULL;
6478 }
6479 /* fall through to next level */
6480 case UCOL_PSK_QUATERNARY:
6481 if(strength >= UCOL_QUATERNARY) {
6482 for(;;) {
6483 if(i == count) {
6484 goto saveState;
6485 }
6486 // We should save the state only if we
6487 // are sure that we are done with the
6488 // previous iterator state
6489 if(canUpdateState) {
6490 newState = s.iterator->getState(s.iterator);
6491 if(newState != UITER_NO_STATE) {
6492 iterState = newState;
6493 cces = 0;
6494 }
6495 }
6496 CE = ucol_IGetNextCE(coll, &s, status);
6497 cces++;
6498 if(CE==UCOL_NO_MORE_CES) {
6499 // Add the level separator
6500 terminatePSKLevel(level, maxLevel, i, dest);
6501 //dest[i++] = UCOL_LEVELTERMINATOR;
6502 byteCountOrFrenchDone = 0;
6503 // Restart the iteration an move to the
6504 // second level
6505 s.iterator->move(s.iterator, 0, UITER_START);
6506 cces = 0;
6507 level = UCOL_PSK_QUIN;
6508 break;
6509 }
6510 if(CE==0)
6511 continue;
6512 if(isShiftedCE(CE, LVT, &wasShifted)) {
6513 CE >>= 16; /* get primary */
6514 if(CE != 0) {
6515 if(byteCountOrFrenchDone == 0) {
6516 dest[i++]=(uint8_t)(CE >> 8);
6517 } else {
6518 byteCountOrFrenchDone = 0;
6519 }
6520 if((CE &=0xff)!=0) {
6521 if(i==count) {
6522 /* overflow */
6523 byteCountOrFrenchDone = 1;
6524 goto saveState;
6525 }
6526 dest[i++]=(uint8_t)CE;
6527 }
6528 }
6529 } else {
6530 notIsContinuation = !isContinuation(CE);
6531 if(notIsContinuation) {
6532 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6533 dest[i++] = UCOL_HIRAGANA_QUAD;
6534 } else {
6535 dest[i++] = 0xFF;
6536 }
6537 }
6538 }
6539 if(uprv_numAvailableExpCEs(s)) {
6540 canUpdateState = FALSE;
6541 } else {
6542 canUpdateState = TRUE;
6543 }
6544 }
6545 } else {
6546 // if we're not doing quaternary
6547 // skip to the end
6548 level = UCOL_PSK_NULL;
6549 }
6550 /* fall through to next level */
6551 case UCOL_PSK_QUIN:
6552 level = UCOL_PSK_IDENTICAL;
6553 /* fall through to next level */
6554 case UCOL_PSK_IDENTICAL:
6555 if(strength >= UCOL_IDENTICAL) {
6556 UChar32 first, second;
6557 int32_t bocsuBytesWritten = 0;
6558 // We always need to do identical on
6559 // the NFD form of the string.
6560 if(normIter == NULL) {
6561 // we arrived from the level below and
6562 // normalization was not turned on.
6563 // therefore, we need to make a fresh NFD iterator
6564 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6565 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6566 } else if(!doingIdenticalFromStart) {
6567 // there is an iterator, but we did some other levels.
6568 // therefore, we have a FCD iterator - need to make
6569 // a NFD one.
6570 // normIter being at the beginning does not guarantee
6571 // that the underlying iterator is at the beginning
6572 iter->move(iter, 0, UITER_START);
6573 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6574 }
6575 // At this point we have a NFD iterator that is positioned
6576 // in the right place
6577 if(U_FAILURE(*status)) {
6578 UTRACE_EXIT_STATUS(*status);
6579 return 0;
6580 }
6581 first = uiter_previous32(s.iterator);
6582 // maybe we're at the start of the string
6583 if(first == U_SENTINEL) {
6584 first = 0;
6585 } else {
6586 uiter_next32(s.iterator);
6587 }
6588
6589 j = 0;
6590 for(;;) {
6591 if(i == count) {
6592 if(j+1 < bocsuBytesWritten) {
6593 bocsuBytesUsed = j+1;
6594 }
6595 goto saveState;
6596 }
6597
6598 // On identical level, we will always save
6599 // the state if we reach this point, since
6600 // we don't depend on getNextCE for content
6601 // all the content is in our buffer and we
6602 // already either stored the full buffer OR
6603 // otherwise we won't arrive here.
6604 newState = s.iterator->getState(s.iterator);
6605 if(newState != UITER_NO_STATE) {
6606 iterState = newState;
6607 cces = 0;
6608 }
6609
6610 uint8_t buff[4];
6611 second = uiter_next32(s.iterator);
6612 cces++;
6613
6614 // end condition for identical level
6615 if(second == U_SENTINEL) {
6616 terminatePSKLevel(level, maxLevel, i, dest);
6617 level = UCOL_PSK_NULL;
6618 break;
6619 }
6620 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6621 first = second;
6622
6623 j = 0;
6624 if(bocsuBytesUsed != 0) {
6625 while(bocsuBytesUsed-->0) {
6626 j++;
6627 }
6628 }
6629
6630 while(i < count && j < bocsuBytesWritten) {
6631 dest[i++] = buff[j++];
6632 }
6633 }
6634
6635 } else {
6636 level = UCOL_PSK_NULL;
6637 }
6638 /* fall through to next level */
6639 case UCOL_PSK_NULL:
6640 j = i;
6641 while(j<count) {
6642 dest[j++]=0;
6643 }
6644 break;
6645 default:
6646 *status = U_INTERNAL_PROGRAM_ERROR;
6647 UTRACE_EXIT_STATUS(*status);
6648 return 0;
6649 }
6650
6651 saveState:
6652 // Now we need to return stuff. First we want to see whether we have
6653 // done everything for the current state of iterator.
6654 if(byteCountOrFrenchDone
6655 || canUpdateState == FALSE
6656 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6657 {
6658 // Any of above mean that the previous transaction
6659 // wasn't finished and that we should store the
6660 // previous iterator state.
6661 state[0] = iterState;
6662 } else {
6663 // The transaction is complete. We will continue in the next iteration.
6664 state[0] = s.iterator->getState(s.iterator);
6665 cces = 0;
6666 }
6667 // Store the number of bocsu bytes written.
6668 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6669 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6670 }
6671 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6672
6673 // Next we put in the level of comparison
6674 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6675
6676 // If we are doing French, we need to store whether we have just finished the French level
6677 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6678 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6679 } else {
6680 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6681 }
6682
6683 // Was the latest CE shifted
6684 if(wasShifted) {
6685 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6686 }
6687 // Check for cces overflow
6688 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6689 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6690 }
6691 // Store cces
6692 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6693
6694 // Check for French overflow
6695 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6696 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6697 }
6698 // Store number of bytes written in the French secondary continuation sequence
6699 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6700
6701
6702 // If we have used normalizing iterator, get rid of it
6703 if(normIter != NULL) {
6704 unorm_closeIter(normIter);
6705 }
6706
6707 /* To avoid memory leak, free the offset buffer if necessary. */
6708 freeOffsetBuffer(&s);
6709
6710 // Return number of meaningful sortkey bytes.
6711 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6712 dest,i, state[0], state[1]);
6713 UTRACE_EXIT_VALUE(i);
6714 return i;
6715 }
6716
6717 /**
6718 * Produce a bound for a given sortkey and a number of levels.
6719 */
6720 U_CAPI int32_t U_EXPORT2
6721 ucol_getBound(const uint8_t *source,
6722 int32_t sourceLength,
6723 UColBoundMode boundType,
6724 uint32_t noOfLevels,
6725 uint8_t *result,
6726 int32_t resultLength,
6727 UErrorCode *status)
6728 {
6729 // consistency checks
6730 if(status == NULL || U_FAILURE(*status)) {
6731 return 0;
6732 }
6733 if(source == NULL) {
6734 *status = U_ILLEGAL_ARGUMENT_ERROR;
6735 return 0;
6736 }
6737
6738 int32_t sourceIndex = 0;
6739 // Scan the string until we skip enough of the key OR reach the end of the key
6740 do {
6741 sourceIndex++;
6742 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6743 noOfLevels--;
6744 }
6745 } while (noOfLevels > 0
6746 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6747
6748 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6749 && noOfLevels > 0) {
6750 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6751 }
6752
6753
6754 // READ ME: this code assumes that the values for boundType
6755 // enum will not changes. They are set so that the enum value
6756 // corresponds to the number of extra bytes each bound type
6757 // needs.
6758 if(result != NULL && resultLength >= sourceIndex+boundType) {
6759 uprv_memcpy(result, source, sourceIndex);
6760 switch(boundType) {
6761 // Lower bound just gets terminated. No extra bytes
6762 case UCOL_BOUND_LOWER: // = 0
6763 break;
6764 // Upper bound needs one extra byte
6765 case UCOL_BOUND_UPPER: // = 1
6766 result[sourceIndex++] = 2;
6767 break;
6768 // Upper long bound needs two extra bytes
6769 case UCOL_BOUND_UPPER_LONG: // = 2
6770 result[sourceIndex++] = 0xFF;
6771 result[sourceIndex++] = 0xFF;
6772 break;
6773 default:
6774 *status = U_ILLEGAL_ARGUMENT_ERROR;
6775 return 0;
6776 }
6777 result[sourceIndex++] = 0;
6778
6779 return sourceIndex;
6780 } else {
6781 return sourceIndex+boundType+1;
6782 }
6783 }
6784
6785 /****************************************************************************/
6786 /* Following are the functions that deal with the properties of a collator */
6787 /* there are new APIs and some compatibility APIs */
6788 /****************************************************************************/
6789
6790 static inline void
6791 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6792 int32_t *primShift, int32_t *secShift, int32_t *terShift)
6793 {
6794 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6795 UBool reverseSecondary = FALSE;
6796 if(!isContinuation(CE)) {
6797 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6798 tertiary ^= coll->caseSwitch;
6799 reverseSecondary = TRUE;
6800 } else {
6801 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6802 tertiary &= UCOL_REMOVE_CASE;
6803 reverseSecondary = FALSE;
6804 }
6805
6806 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6807 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6808 primary1 = (uint8_t)(CE >> 8);
6809
6810 if(primary1 != 0) {
6811 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6812 *primShift -= 8;
6813 }
6814 if(primary2 != 0) {
6815 if(*primShift < 0) {
6816 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6817 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6818 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6819 return;
6820 }
6821 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6822 *primShift -= 8;
6823 }
6824 if(secondary != 0) {
6825 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6826 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6827 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6828 } else { // normal case
6829 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6830 }
6831 *secShift -= 8;
6832 }
6833 if(tertiary != 0) {
6834 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6835 *terShift -= 8;
6836 }
6837 }
6838
6839 static inline UBool
6840 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6841 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6842 if(newTable == NULL) {
6843 *status = U_MEMORY_ALLOCATION_ERROR;
6844 coll->latinOneFailed = TRUE;
6845 return FALSE;
6846 }
6847 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6848 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6849 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6850 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6851 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6852 coll->latinOneTableLen = size;
6853 uprv_free(coll->latinOneCEs);
6854 coll->latinOneCEs = newTable;
6855 return TRUE;
6856 }
6857
6858 static UBool
6859 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6860 UBool result = TRUE;
6861 if(coll->latinOneCEs == NULL) {
6862 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6863 if(coll->latinOneCEs == NULL) {
6864 *status = U_MEMORY_ALLOCATION_ERROR;
6865 return FALSE;
6866 }
6867 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6868 }
6869 UChar ch = 0;
6870 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6871 // Check for null pointer
6872 if (U_FAILURE(*status)) {
6873 return FALSE;
6874 }
6875 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6876
6877 int32_t primShift = 24, secShift = 24, terShift = 24;
6878 uint32_t CE = 0;
6879 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6880
6881 // TODO: make safe if you get more than you wanted...
6882 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6883 primShift = 24; secShift = 24; terShift = 24;
6884 if(ch < 0x100) {
6885 CE = coll->latinOneMapping[ch];
6886 } else {
6887 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6888 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6889 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6890 }
6891 }
6892 if(CE < UCOL_NOT_FOUND) {
6893 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6894 } else {
6895 switch (getCETag(CE)) {
6896 case EXPANSION_TAG:
6897 case DIGIT_TAG:
6898 ucol_setText(it, &ch, 1, status);
6899 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6900 if(primShift < 0 || secShift < 0 || terShift < 0) {
6901 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6902 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6903 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6904 break;
6905 }
6906 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6907 }
6908 break;
6909 case CONTRACTION_TAG:
6910 // here is the trick
6911 // F2 is contraction. We do something very similar to contractions
6912 // but have two indices, one in the real contraction table and the
6913 // other to where we stuffed things. This hopes that we don't have
6914 // many contractions (this should work for latin-1 tables).
6915 {
6916 if((CE & 0x00FFF000) != 0) {
6917 *status = U_UNSUPPORTED_ERROR;
6918 goto cleanup_after_failure;
6919 }
6920
6921 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6922
6923 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6924
6925 coll->latinOneCEs[ch] = CE;
6926 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6927 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6928
6929 // We're going to jump into contraction table, pick the elements
6930 // and use them
6931 do {
6932 CE = *(coll->contractionCEs +
6933 (UCharOffset - coll->contractionIndex));
6934 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6935 uint32_t size;
6936 uint32_t i; /* general counter */
6937 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6938 size = getExpansionCount(CE);
6939 //CE = *CEOffset++;
6940 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6941 for(i = 0; i<size; i++) {
6942 if(primShift < 0 || secShift < 0 || terShift < 0) {
6943 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6944 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6945 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6946 break;
6947 }
6948 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6949 }
6950 } else { /* else, we do */
6951 while(*CEOffset != 0) {
6952 if(primShift < 0 || secShift < 0 || terShift < 0) {
6953 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6954 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6955 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6956 break;
6957 }
6958 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6959 }
6960 }
6961 contractionOffset++;
6962 } else if(CE < UCOL_NOT_FOUND) {
6963 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6964 } else {
6965 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6966 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6967 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6968 contractionOffset++;
6969 }
6970 UCharOffset++;
6971 primShift = 24; secShift = 24; terShift = 24;
6972 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6973 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6974 goto cleanup_after_failure;
6975 }
6976 }
6977 } while(*UCharOffset != 0xFFFF);
6978 }
6979 break;;
6980 case SPEC_PROC_TAG:
6981 {
6982 // 0xB7 is a precontext character defined in UCA5.1, a special
6983 // handle is implemeted in order to save LatinOne table for
6984 // most locales.
6985 if (ch==0xb7) {
6986 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6987 }
6988 else {
6989 goto cleanup_after_failure;
6990 }
6991 }
6992 break;
6993 default:
6994 goto cleanup_after_failure;
6995 }
6996 }
6997 }
6998 // compact table
6999 if(contractionOffset < coll->latinOneTableLen) {
7000 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
7001 goto cleanup_after_failure;
7002 }
7003 }
7004 ucol_closeElements(it);
7005 return result;
7006
7007 cleanup_after_failure:
7008 // status should already be set before arriving here.
7009 coll->latinOneFailed = TRUE;
7010 ucol_closeElements(it);
7011 return FALSE;
7012 }
7013
7014 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
7015 if(U_SUCCESS(*status)) {
7016 if(coll->caseFirst == UCOL_UPPER_FIRST) {
7017 coll->caseSwitch = UCOL_CASE_SWITCH;
7018 } else {
7019 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
7020 }
7021
7022 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
7023 coll->tertiaryMask = UCOL_REMOVE_CASE;
7024 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7025 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
7026 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
7027 coll->tertiaryBottom = UCOL_COMMON_BOT3;
7028 } else {
7029 coll->tertiaryMask = UCOL_KEEP_CASE;
7030 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
7031 if(coll->caseFirst == UCOL_UPPER_FIRST) {
7032 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
7033 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
7034 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
7035 } else {
7036 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7037 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
7038 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
7039 }
7040 }
7041
7042 /* Set the compression values */
7043 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
7044 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
7045 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
7046
7047 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
7048 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
7049 {
7050 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
7051 } else {
7052 coll->sortKeyGen = ucol_calcSortKey;
7053 }
7054 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
7055 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
7056 {
7057 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
7058 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
7059 //fprintf(stderr, "F");
7060 coll->latinOneUse = TRUE;
7061 } else {
7062 coll->latinOneUse = FALSE;
7063 }
7064 if(*status == U_UNSUPPORTED_ERROR) {
7065 *status = U_ZERO_ERROR;
7066 }
7067 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7068 coll->latinOneUse = TRUE;
7069 }
7070 } else {
7071 coll->latinOneUse = FALSE;
7072 }
7073 }
7074 }
7075
7076 U_CAPI uint32_t U_EXPORT2
7077 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
7078 if(U_FAILURE(*status) || coll == NULL) {
7079 return 0;
7080 }
7081 if(len == -1) {
7082 len = u_strlen(varTop);
7083 }
7084 if(len == 0) {
7085 *status = U_ILLEGAL_ARGUMENT_ERROR;
7086 return 0;
7087 }
7088
7089 collIterate s;
7090 IInit_collIterate(coll, varTop, len, &s);
7091
7092 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
7093
7094 /* here we check if we have consumed all characters */
7095 /* you can put in either one character or a contraction */
7096 /* you shouldn't put more... */
7097 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
7098 *status = U_CE_NOT_FOUND_ERROR;
7099 return 0;
7100 }
7101
7102 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
7103
7104 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
7105 *status = U_PRIMARY_TOO_LONG_ERROR;
7106 return 0;
7107 }
7108 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
7109 coll->variableTopValueisDefault = FALSE;
7110 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
7111 }
7112
7113 /* To avoid memory leak, free the offset buffer if necessary. */
7114 freeOffsetBuffer(&s);
7115
7116 return CE & UCOL_PRIMARYMASK;
7117 }
7118
7119 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
7120 if(U_FAILURE(*status) || coll == NULL) {
7121 return 0;
7122 }
7123 return coll->variableTopValue<<16;
7124 }
7125
7126 U_CAPI void U_EXPORT2
7127 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
7128 if(U_FAILURE(*status) || coll == NULL) {
7129 return;
7130 }
7131
7132 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
7133 coll->variableTopValueisDefault = FALSE;
7134 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
7135 }
7136 }
7137 /* Attribute setter API */
7138 U_CAPI void U_EXPORT2
7139 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
7140 if(U_FAILURE(*status) || coll == NULL) {
7141 return;
7142 }
7143 UColAttributeValue oldFrench = coll->frenchCollation;
7144 UColAttributeValue oldCaseFirst = coll->caseFirst;
7145 switch(attr) {
7146 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
7147 if(value == UCOL_ON) {
7148 coll->numericCollation = UCOL_ON;
7149 coll->numericCollationisDefault = FALSE;
7150 } else if (value == UCOL_OFF) {
7151 coll->numericCollation = UCOL_OFF;
7152 coll->numericCollationisDefault = FALSE;
7153 } else if (value == UCOL_DEFAULT) {
7154 coll->numericCollationisDefault = TRUE;
7155 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
7156 } else {
7157 *status = U_ILLEGAL_ARGUMENT_ERROR;
7158 }
7159 break;
7160 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
7161 if(value == UCOL_ON) {
7162 coll->hiraganaQ = UCOL_ON;
7163 coll->hiraganaQisDefault = FALSE;
7164 } else if (value == UCOL_OFF) {
7165 coll->hiraganaQ = UCOL_OFF;
7166 coll->hiraganaQisDefault = FALSE;
7167 } else if (value == UCOL_DEFAULT) {
7168 coll->hiraganaQisDefault = TRUE;
7169 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
7170 } else {
7171 *status = U_ILLEGAL_ARGUMENT_ERROR;
7172 }
7173 break;
7174 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7175 if(value == UCOL_ON) {
7176 coll->frenchCollation = UCOL_ON;
7177 coll->frenchCollationisDefault = FALSE;
7178 } else if (value == UCOL_OFF) {
7179 coll->frenchCollation = UCOL_OFF;
7180 coll->frenchCollationisDefault = FALSE;
7181 } else if (value == UCOL_DEFAULT) {
7182 coll->frenchCollationisDefault = TRUE;
7183 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7184 } else {
7185 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7186 }
7187 break;
7188 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7189 if(value == UCOL_SHIFTED) {
7190 coll->alternateHandling = UCOL_SHIFTED;
7191 coll->alternateHandlingisDefault = FALSE;
7192 } else if (value == UCOL_NON_IGNORABLE) {
7193 coll->alternateHandling = UCOL_NON_IGNORABLE;
7194 coll->alternateHandlingisDefault = FALSE;
7195 } else if (value == UCOL_DEFAULT) {
7196 coll->alternateHandlingisDefault = TRUE;
7197 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7198 } else {
7199 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7200 }
7201 break;
7202 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7203 if(value == UCOL_LOWER_FIRST) {
7204 coll->caseFirst = UCOL_LOWER_FIRST;
7205 coll->caseFirstisDefault = FALSE;
7206 } else if (value == UCOL_UPPER_FIRST) {
7207 coll->caseFirst = UCOL_UPPER_FIRST;
7208 coll->caseFirstisDefault = FALSE;
7209 } else if (value == UCOL_OFF) {
7210 coll->caseFirst = UCOL_OFF;
7211 coll->caseFirstisDefault = FALSE;
7212 } else if (value == UCOL_DEFAULT) {
7213 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7214 coll->caseFirstisDefault = TRUE;
7215 } else {
7216 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7217 }
7218 break;
7219 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7220 if(value == UCOL_ON) {
7221 coll->caseLevel = UCOL_ON;
7222 coll->caseLevelisDefault = FALSE;
7223 } else if (value == UCOL_OFF) {
7224 coll->caseLevel = UCOL_OFF;
7225 coll->caseLevelisDefault = FALSE;
7226 } else if (value == UCOL_DEFAULT) {
7227 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7228 coll->caseLevelisDefault = TRUE;
7229 } else {
7230 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7231 }
7232 break;
7233 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7234 if(value == UCOL_ON) {
7235 coll->normalizationMode = UCOL_ON;
7236 coll->normalizationModeisDefault = FALSE;
7237 } else if (value == UCOL_OFF) {
7238 coll->normalizationMode = UCOL_OFF;
7239 coll->normalizationModeisDefault = FALSE;
7240 } else if (value == UCOL_DEFAULT) {
7241 coll->normalizationModeisDefault = TRUE;
7242 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7243 } else {
7244 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7245 }
7246 break;
7247 case UCOL_STRENGTH: /* attribute for strength */
7248 if (value == UCOL_DEFAULT) {
7249 coll->strengthisDefault = TRUE;
7250 coll->strength = (UColAttributeValue)coll->options->strength;
7251 } else if (value <= UCOL_IDENTICAL) {
7252 coll->strengthisDefault = FALSE;
7253 coll->strength = value;
7254 } else {
7255 *status = U_ILLEGAL_ARGUMENT_ERROR ;
7256 }
7257 break;
7258 case UCOL_ATTRIBUTE_COUNT:
7259 default:
7260 *status = U_ILLEGAL_ARGUMENT_ERROR;
7261 break;
7262 }
7263 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7264 coll->latinOneRegenTable = TRUE;
7265 } else {
7266 coll->latinOneRegenTable = FALSE;
7267 }
7268 ucol_updateInternalState(coll, status);
7269 }
7270
7271 U_CAPI UColAttributeValue U_EXPORT2
7272 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7273 if(U_FAILURE(*status) || coll == NULL) {
7274 return UCOL_DEFAULT;
7275 }
7276 switch(attr) {
7277 case UCOL_NUMERIC_COLLATION:
7278 return coll->numericCollation;
7279 case UCOL_HIRAGANA_QUATERNARY_MODE:
7280 return coll->hiraganaQ;
7281 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7282 return coll->frenchCollation;
7283 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7284 return coll->alternateHandling;
7285 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7286 return coll->caseFirst;
7287 case UCOL_CASE_LEVEL: /* do we have an extra case level */
7288 return coll->caseLevel;
7289 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7290 return coll->normalizationMode;
7291 case UCOL_STRENGTH: /* attribute for strength */
7292 return coll->strength;
7293 case UCOL_ATTRIBUTE_COUNT:
7294 default:
7295 *status = U_ILLEGAL_ARGUMENT_ERROR;
7296 break;
7297 }
7298 return UCOL_DEFAULT;
7299 }
7300
7301 U_CAPI void U_EXPORT2
7302 ucol_setStrength( UCollator *coll,
7303 UCollationStrength strength)
7304 {
7305 UErrorCode status = U_ZERO_ERROR;
7306 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7307 }
7308
7309 U_CAPI UCollationStrength U_EXPORT2
7310 ucol_getStrength(const UCollator *coll)
7311 {
7312 UErrorCode status = U_ZERO_ERROR;
7313 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7314 }
7315
7316 /****************************************************************************/
7317 /* Following are misc functions */
7318 /* there are new APIs and some compatibility APIs */
7319 /****************************************************************************/
7320
7321 U_CAPI void U_EXPORT2
7322 ucol_getVersion(const UCollator* coll,
7323 UVersionInfo versionInfo)
7324 {
7325 /* RunTime version */
7326 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7327 /* Builder version*/
7328 uint8_t bdVersion = coll->image->version[0];
7329
7330 /* Charset Version. Need to get the version from cnv files
7331 * makeconv should populate cnv files with version and
7332 * an api has to be provided in ucnv.h to obtain this version
7333 */
7334 uint8_t csVersion = 0;
7335
7336 /* combine the version info */
7337 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7338
7339 /* Tailoring rules */
7340 versionInfo[0] = (uint8_t)(cmbVersion>>8);
7341 versionInfo[1] = (uint8_t)cmbVersion;
7342 versionInfo[2] = coll->image->version[1];
7343 if(coll->UCA) {
7344 versionInfo[3] = coll->UCA->image->UCAVersion[0];
7345 } else {
7346 versionInfo[3] = 0;
7347 }
7348 }
7349
7350
7351 /* This internal API checks whether a character is tailored or not */
7352 U_CAPI UBool U_EXPORT2
7353 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7354 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7355 return FALSE;
7356 }
7357
7358 uint32_t CE = UCOL_NOT_FOUND;
7359 const UChar *ContractionStart = NULL;
7360 if(u < 0x100) { /* latin-1 */
7361 CE = coll->latinOneMapping[u];
7362 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7363 return FALSE;
7364 }
7365 } else { /* regular */
7366 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7367 }
7368
7369 if(isContraction(CE)) {
7370 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7371 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7372 }
7373
7374 return (UBool)(CE != UCOL_NOT_FOUND);
7375 }
7376
7377
7378 /****************************************************************************/
7379 /* Following are the string compare functions */
7380 /* */
7381 /****************************************************************************/
7382
7383
7384 /* ucol_checkIdent internal function. Does byte level string compare. */
7385 /* Used by strcoll if strength == identical and strings */
7386 /* are otherwise equal. Moved out-of-line because this */
7387 /* is a rare case. */
7388 /* */
7389 /* Comparison must be done on NFD normalized strings. */
7390 /* FCD is not good enough. */
7391 /* */
7392 /* TODO: make an incremental NFD Comparison function, which could */
7393 /* be of general use */
7394
7395 static
7396 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7397 {
7398
7399 // TODO: When we have an UChar iterator, we need to access the whole string. One
7400 // useful modification would be a UChar iterator extract API, since reset next next...
7401 // is not optimal.
7402 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7403
7404 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7405 // of same type, but that doesn't really mean that it will stay that way.
7406
7407 // The division for the array length may truncate the array size to
7408 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7409 // for all platforms anyway.
7410 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7411 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7412 //UChar sStackBuf[256], tStackBuf[256];
7413 //int32_t sBufSize = 256, tBufSize = 256;
7414 int32_t comparison;
7415 int32_t sLen = 0;
7416 UChar *sBuf = NULL;
7417 int32_t tLen = 0;
7418 UChar *tBuf = NULL;
7419 UBool freeSBuf = FALSE, freeTBuf = FALSE;
7420
7421 if (sColl->flags & UCOL_USE_ITERATOR) {
7422 UNormIterator *sNIt = NULL, *tNIt = NULL;
7423 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7424 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7425 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7426 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7427 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7428 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7429 comparison = u_strCompareIter(sIt, tIt, TRUE);
7430 unorm_closeIter(sNIt);
7431 unorm_closeIter(tNIt);
7432 } else {
7433 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7434 sBuf = sColl->string;
7435 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7436 tBuf = tColl->string;
7437
7438 if (normalize) {
7439 *status = U_ZERO_ERROR;
7440 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7441 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7442 sBuf, sLen,
7443 FALSE, 0,
7444 status);
7445 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7446 if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7447 &sColl->writableBuffer,
7448 (int32_t *)&sColl->writableBufSize, sLen,
7449 0)
7450 )
7451 {
7452 *status = U_MEMORY_ALLOCATION_ERROR;
7453 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7454 }
7455 *status = U_ZERO_ERROR;
7456 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7457 sBuf, sLen,
7458 FALSE, 0,
7459 status);
7460 }
7461 if(freeSBuf) {
7462 uprv_free(sBuf);
7463 freeSBuf = FALSE;
7464 }
7465 sBuf = sColl->writableBuffer;
7466 if (sBuf != sColl->stackWritableBuffer) {
7467 sColl->flags |= UCOL_ITER_ALLOCATED;
7468 }
7469 }
7470
7471 *status = U_ZERO_ERROR;
7472 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7473 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7474 tBuf, tLen,
7475 FALSE, 0,
7476 status);
7477 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7478 if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7479 &tColl->writableBuffer,
7480 (int32_t *)&tColl->writableBufSize, tLen,
7481 0)
7482 )
7483 {
7484 *status = U_MEMORY_ALLOCATION_ERROR;
7485 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7486 }
7487 *status = U_ZERO_ERROR;
7488 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7489 tBuf, tLen,
7490 FALSE, 0,
7491 status);
7492 }
7493 if(freeTBuf) {
7494 uprv_free(tBuf);
7495 freeTBuf = FALSE;
7496 }
7497 tBuf = tColl->writableBuffer;
7498 if (tBuf != tColl->stackWritableBuffer) {
7499 tColl->flags |= UCOL_ITER_ALLOCATED;
7500 }
7501 }
7502 }
7503
7504 if (sLen == -1 && tLen == -1) {
7505 comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7506 } else {
7507 if (sLen == -1) {
7508 sLen = u_strlen(sBuf);
7509 }
7510 if (tLen == -1) {
7511 tLen = u_strlen(tBuf);
7512 }
7513 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7514 if (comparison == 0) {
7515 comparison = sLen - tLen;
7516 }
7517 }
7518 }
7519
7520 if (comparison < 0) {
7521 return UCOL_LESS;
7522 } else if (comparison == 0) {
7523 return UCOL_EQUAL;
7524 } else /* comparison > 0 */ {
7525 return UCOL_GREATER;
7526 }
7527 }
7528
/*
 * CEBuf - a growable buffer of collation elements (CEs) plus a few inline
 * helpers, used inside ucol_strcoll to remember the CEs fetched so far.
 */

/* Number of CEs held by the in-struct array before any heap buffer is used */
#define UCOL_CEBUF_SIZE 512

struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage in use (localArray or a heap block) */
    uint32_t    *endp;                          /* one past the last usable slot of buf */
    uint32_t    *pos;                           /* next slot to be written */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* initial storage */
};
typedef struct ucol_CEBuf ucol_CEBuf;
7539
7540
7541 static
7542 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7543 (b)->buf = (b)->pos = (b)->localArray;
7544 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7545 }
7546
7547 static
7548 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7549 uint32_t oldSize;
7550 uint32_t newSize;
7551 uint32_t *newBuf;
7552
7553 ci->flags |= UCOL_ITER_ALLOCATED;
7554 oldSize = b->pos - b->buf;
7555 newSize = oldSize * 2;
7556 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7557 if(newBuf == NULL) {
7558 *status = U_MEMORY_ALLOCATION_ERROR;
7559 }
7560 else {
7561 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7562 if (b->buf != b->localArray) {
7563 uprv_free(b->buf);
7564 }
7565 b->buf = newBuf;
7566 b->endp = b->buf + newSize;
7567 b->pos = b->buf + oldSize;
7568 }
7569 }
7570
7571 static
7572 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7573 if (b->pos == b->endp) {
7574 ucol_CEBuf_Expand(b, ci, status);
7575 }
7576 if (U_SUCCESS(*status)) {
7577 *(b)->pos++ = ce;
7578 }
7579 }
7580
7581 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7582 /* It is used when compare gets in trouble and needs to bail out */
7583 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7584 collIterate *tColl,
7585 UErrorCode *status)
7586 {
7587 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7588 uint8_t *sourceKeyP = sourceKey;
7589 uint8_t *targetKeyP = targetKey;
7590 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7591 const UCollator *coll = sColl->coll;
7592 UChar *source = NULL;
7593 UChar *target = NULL;
7594 int32_t result = UCOL_EQUAL;
7595 UChar sStackBuf[256], tStackBuf[256];
7596 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7597 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7598
7599 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7600 if(sColl->flags & UCOL_USE_ITERATOR) {
7601 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7602 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7603 source = sStackBuf;
7604 UChar *sBufp = source;
7605 target = tStackBuf;
7606 UChar *tBufp = target;
7607 while(sColl->iterator->hasNext(sColl->iterator)) {
7608 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7609 }
7610 while(tColl->iterator->hasNext(tColl->iterator)) {
7611 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7612 }
7613 sourceLength = sBufp - source;
7614 targetLength = tBufp - target;
7615 } else { // no iterators
7616 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7617 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7618 source = sColl->string;
7619 target = tColl->string;
7620 }
7621
7622
7623
7624 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7625 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7626 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7627 if(sourceKeyP == NULL) {
7628 *status = U_MEMORY_ALLOCATION_ERROR;
7629 goto cleanup_and_do_compare;
7630 }
7631 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7632 }
7633
7634 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7635 if(targetKeyLen > UCOL_MAX_BUFFER) {
7636 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7637 if(targetKeyP == NULL) {
7638 *status = U_MEMORY_ALLOCATION_ERROR;
7639 goto cleanup_and_do_compare;
7640 }
7641 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7642 }
7643
7644 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7645
7646 cleanup_and_do_compare:
7647 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7648 uprv_free(sourceKeyP);
7649 }
7650
7651 if(targetKeyP != NULL && targetKeyP != targetKey) {
7652 uprv_free(targetKeyP);
7653 }
7654
7655 if(result<0) {
7656 return UCOL_LESS;
7657 } else if(result>0) {
7658 return UCOL_GREATER;
7659 } else {
7660 return UCOL_EQUAL;
7661 }
7662 }
7663
7664
// ucol_strcollRegular - general string comparison working on two collIterate objects.
//
// The comparison proceeds level by level:
//   1. primary: CEs are fetched from both iterates with ucol_IGetNextCE() and
//      stored in the sCEs/tCEs buffers; the first primary difference decides.
//   2. secondary (forwards, or backwards when French secondary is on),
//      case level, tertiary and shifted quaternary: these re-read the CEs
//      saved in the buffers during the primary phase.
//   3. identical: ucol_checkIdent() is used as the final tie-breaker.
// Which levels run depends on the collator's strength and attributes, read below.
//
// If both hiraganaQ and shifted alternate handling are in effect, the whole
// comparison is delegated to ucol_compareUsingSortKeys().
//
// Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER. Errors are reported through
// *status by the functions called from here.
7665 static inline UCollationResult
7666 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7667 // const UCollator *coll,
7668 // const UChar *source,
7669 // int32_t sourceLength,
7670 // const UChar *target,
7671 // int32_t targetLength,
7672 UErrorCode *status)
7673 {
7674 U_ALIGN_CODE(16);
7675
7676 const UCollator *coll = sColl->coll;
7677
7678
7679 // setting up the collator parameters
7680 UColAttributeValue strength = coll->strength;
7681 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7682
7683 UBool checkSecTer = initialCheckSecTer;
7684 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7685 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7686 UBool checkIdent = (strength == UCOL_IDENTICAL);
7687 UBool checkCase = (coll->caseLevel == UCOL_ON);
7688 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7689 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7690 UBool qShifted = shifted && checkQuad;
7691 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7692
// Hiragana quaternary together with shifted is not handled by the loops below;
// compare through sort keys instead. Nothing has been allocated yet at this point.
7693 if(doHiragana && shifted) {
7694 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7695 }
7696 uint8_t caseSwitch = coll->caseSwitch;
7697 uint8_t tertiaryMask = coll->tertiaryMask;
7698
7699 // This is the lowest primary value that will not be ignored if shifted
7700 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7701
7702 UCollationResult result = UCOL_EQUAL;
7703 UCollationResult hirResult = UCOL_EQUAL;
7704
7705 // Preparing the CE buffers. They will be filled during the primary phase
7706 ucol_CEBuf sCEs;
7707 ucol_CEBuf tCEs;
7708 UCOL_INIT_CEBUF(&sCEs);
7709 UCOL_INIT_CEBUF(&tCEs);
7710
7711 uint32_t secS = 0, secT = 0;
7712 uint32_t sOrder=0, tOrder=0;
7713
7714 // Non shifted primary processing is quite simple
7715 if(!shifted) {
7716 for(;;) {
7717
7718 // We fetch CEs until we hit a non ignorable primary or end.
7719 do {
7720 // We get the next CE
7721 sOrder = ucol_IGetNextCE(coll, sColl, status);
7722 // Stuff it in the buffer
7723 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7724 // And keep just the primary part.
7725 sOrder &= UCOL_PRIMARYMASK;
7726 } while(sOrder == 0);
7727
7728 // see the comments on the above block
7729 do {
7730 tOrder = ucol_IGetNextCE(coll, tColl, status);
7731 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7732 tOrder &= UCOL_PRIMARYMASK;
7733 } while(tOrder == 0);
7734
7735 // if both primaries are the same
7736 if(sOrder == tOrder) {
7737 // and there are no more CEs, we advance to the next level
7738 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7739 break;
7740 }
// Remember the first position at which exactly one side was Hiragana;
// hirResult is only consulted after the tertiary level.
7741 if(doHiragana && hirResult == UCOL_EQUAL) {
7742 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7743 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7744 ? UCOL_LESS:UCOL_GREATER;
7745 }
7746 }
7747 } else {
7748 // if two primaries are different, we are done
7749 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7750 goto commonReturn;
7751 }
7752 } // no primary difference... do the rest from the buffers
7753 } else { // shifted - do a slightly more complicated processing :)
7754 for(;;) {
7755 UBool sInShifted = FALSE;
7756 UBool tInShifted = FALSE;
7757 // This version of code can be refactored. However, it seems easier to understand this way.
7758 // Source loop. Same as the target loop.
// The inner loop ends (break) on the end marker or on a CE whose primary is
// above LVT; every other CE is either skipped or stored and the loop continues.
7759 for(;;) {
7760 sOrder = ucol_IGetNextCE(coll, sColl, status);
7761 if(sOrder == UCOL_NO_MORE_CES) {
7762 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7763 break;
7764 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7765 /* UCA amendment - ignore ignorables that follow shifted code points */
7766 continue;
7767 } else if(isContinuation(sOrder)) {
7768 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7769 if(sInShifted) {
7770 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7771 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7772 continue;
7773 } else {
7774 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7775 break;
7776 }
7777 } else { /* Just lower level values */
7778 if(sInShifted) {
7779 continue;
7780 } else {
7781 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7782 continue;
7783 }
7784 }
7785 } else { /* regular */
7786 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7787 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7788 break;
7789 } else {
7790 if((sOrder & UCOL_PRIMARYMASK) > 0) {
// primary at or below LVT: the CE is shifted, only its primary part is stored
7791 sInShifted = TRUE;
7792 sOrder &= UCOL_PRIMARYMASK;
7793 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7794 continue;
7795 } else {
7796 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7797 sInShifted = FALSE;
7798 continue;
7799 }
7800 }
7801 }
7802 }
7803 sOrder &= UCOL_PRIMARYMASK;
7804 sInShifted = FALSE;
7805
// Target loop - mirrors the source loop above.
7806 for(;;) {
7807 tOrder = ucol_IGetNextCE(coll, tColl, status);
7808 if(tOrder == UCOL_NO_MORE_CES) {
7809 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7810 break;
7811 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7812 /* UCA amendment - ignore ignorables that follow shifted code points */
7813 continue;
7814 } else if(isContinuation(tOrder)) {
7815 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7816 if(tInShifted) {
7817 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7818 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7819 continue;
7820 } else {
7821 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7822 break;
7823 }
7824 } else { /* Just lower level values */
7825 if(tInShifted) {
7826 continue;
7827 } else {
7828 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7829 continue;
7830 }
7831 }
7832 } else { /* regular */
7833 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7834 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7835 break;
7836 } else {
7837 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7838 tInShifted = TRUE;
7839 tOrder &= UCOL_PRIMARYMASK;
7840 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7841 continue;
7842 } else {
7843 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7844 tInShifted = FALSE;
7845 continue;
7846 }
7847 }
7848 }
7849 }
7850 tOrder &= UCOL_PRIMARYMASK;
7851 tInShifted = FALSE;
7852
7853 if(sOrder == tOrder) {
7854 /*
7855 if(doHiragana && hirResult == UCOL_EQUAL) {
7856 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7857 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7858 ? UCOL_LESS:UCOL_GREATER;
7859 }
7860 }
7861 */
7862 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7863 break;
7864 } else {
7865 sOrder = 0;
7866 tOrder = 0;
7867 continue;
7868 }
7869 } else {
7870 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7871 goto commonReturn;
7872 }
7873 } /* no primary difference... do the rest from the buffers */
7874 }
7875
7876 /* now, we're gonna reexamine collected CEs */
7877 uint32_t *sCE;
7878 uint32_t *tCE;
7879
7880 /* This is the secondary level of comparison */
7881 if(checkSecTer) {
7882 if(!isFrenchSec) { /* normal */
7883 sCE = sCEs.buf;
7884 tCE = tCEs.buf;
7885 for(;;) {
// skip CEs whose secondary weight is zero
7886 while (secS == 0) {
7887 secS = *(sCE++) & UCOL_SECONDARYMASK;
7888 }
7889
7890 while(secT == 0) {
7891 secT = *(tCE++) & UCOL_SECONDARYMASK;
7892 }
7893
7894 if(secS == secT) {
7895 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7896 break;
7897 } else {
7898 secS = 0; secT = 0;
7899 continue;
7900 }
7901 } else {
7902 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7903 goto commonReturn;
7904 }
7905 }
7906 } else { /* do the French */
// French secondary: the buffers are walked from the end towards the start.
// sCESave/tCESave are non-NULL while a run of continuation CEs is being
// replayed in forward direction.
// NOTE(review): pos-2 presumably skips the end marker stored last in the buffer - confirm.
7907 uint32_t *sCESave = NULL;
7908 uint32_t *tCESave = NULL;
7909 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7910 tCE = tCEs.pos-2;
7911 for(;;) {
7912 while (secS == 0 && sCE >= sCEs.buf) {
7913 if(sCESave == 0) {
7914 secS = *(sCE--);
7915 if(isContinuation(secS)) {
7916 while(isContinuation(secS = *(sCE--)))
7917 ;
7918 /* after this, secS has the start of continuation, and sCE points before that */
7919 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7920 sCE+=2; /* need to point to the first continuation CP */
7921 /* However, now you can just continue doing stuff */
7922 }
7923 } else {
7924 secS = *(sCE++);
7925 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7926 sCE = sCESave; /* reset the pointer to before continuation */
7927 sCESave = 0;
7928 continue;
7929 }
7930 }
7931 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7932 }
7933
7934 while(secT == 0 && tCE >= tCEs.buf) {
7935 if(tCESave == 0) {
7936 secT = *(tCE--);
7937 if(isContinuation(secT)) {
7938 while(isContinuation(secT = *(tCE--)))
7939 ;
7940 /* after this, secT has the start of continuation, and tCE points before that */
7941 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7942 tCE+=2; /* need to point to the first continuation CP */
7943 /* However, now you can just continue doing stuff */
7944 }
7945 } else {
7946 secT = *(tCE++);
7947 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7948 tCE = tCESave; /* reset the pointer to before continuation */
7949 tCESave = 0;
7950 continue;
7951 }
7952 }
7953 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7954 }
7955
7956 if(secS == secT) {
7957 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7958 break;
7959 } else {
7960 secS = 0; secT = 0;
7961 continue;
7962 }
7963 } else {
7964 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7965 goto commonReturn;
7966 }
7967 }
7968 }
7969 }
7970
7971 /* doing the case bit */
7972 if(checkCase) {
7973 sCE = sCEs.buf;
7974 tCE = tCEs.buf;
7975 for(;;) {
// Continuation CEs are skipped on this level: only non-continuation CEs
// contribute a case value.
7976 while((secS & UCOL_REMOVE_CASE) == 0) {
7977 if(!isContinuation(*sCE++)) {
7978 secS =*(sCE-1);
7979 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7980 // primary ignorables should not be considered on the case level when the strength is primary
7981 // otherwise, the CEs stop being well-formed
7982 secS &= UCOL_TERT_CASE_MASK;
7983 secS ^= caseSwitch;
7984 } else {
7985 secS = 0;
7986 }
7987 } else {
7988 secS = 0;
7989 }
7990 }
7991
7992 while((secT & UCOL_REMOVE_CASE) == 0) {
7993 if(!isContinuation(*tCE++)) {
7994 secT = *(tCE-1);
7995 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7996 // primary ignorables should not be considered on the case level when the strength is primary
7997 // otherwise, the CEs stop being well-formed
7998 secT &= UCOL_TERT_CASE_MASK;
7999 secT ^= caseSwitch;
8000 } else {
8001 secT = 0;
8002 }
8003 } else {
8004 secT = 0;
8005 }
8006 }
8007
8008 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
8009 result = UCOL_LESS;
8010 goto commonReturn;
8011 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
8012 result = UCOL_GREATER;
8013 goto commonReturn;
8014 }
8015
8016 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
8017 break;
8018 } else {
8019 secS = 0;
8020 secT = 0;
8021 }
8022 }
8023 }
8024
8025 /* Tertiary level */
8026 if(checkTertiary) {
8027 secS = 0;
8028 secT = 0;
8029 sCE = sCEs.buf;
8030 tCE = tCEs.buf;
8031 for(;;) {
// caseSwitch is applied to non-continuation CEs only; continuation CEs
// have their case bits removed instead.
8032 while((secS & UCOL_REMOVE_CASE) == 0) {
8033 secS = *(sCE++) & tertiaryMask;
8034 if(!isContinuation(secS)) {
8035 secS ^= caseSwitch;
8036 } else {
8037 secS &= UCOL_REMOVE_CASE;
8038 }
8039 }
8040
8041 while((secT & UCOL_REMOVE_CASE) == 0) {
8042 secT = *(tCE++) & tertiaryMask;
8043 if(!isContinuation(secT)) {
8044 secT ^= caseSwitch;
8045 } else {
8046 secT &= UCOL_REMOVE_CASE;
8047 }
8048 }
8049
8050 if(secS == secT) {
// NOTE(review): the literal 1 presumably is the tertiary value of the end marker - confirm.
8051 if((secS & UCOL_REMOVE_CASE) == 1) {
8052 break;
8053 } else {
8054 secS = 0; secT = 0;
8055 continue;
8056 }
8057 } else {
8058 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8059 goto commonReturn;
8060 }
8061 }
8062 }
8063
8064
// Quaternary level, only when shifted: CEs above LVT or without a primary are
// replaced by UCOL_PRIMARYMASK, shifted CEs are compared by their primary part.
8065 if(qShifted /*checkQuad*/) {
8066 UBool sInShifted = TRUE;
8067 UBool tInShifted = TRUE;
8068 secS = 0;
8069 secT = 0;
8070 sCE = sCEs.buf;
8071 tCE = tCEs.buf;
8072 for(;;) {
8073 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
8074 secS = *(sCE++);
8075 if(isContinuation(secS)) {
8076 if(!sInShifted) {
8077 continue;
8078 }
8079 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
8080 secS = UCOL_PRIMARYMASK;
8081 sInShifted = FALSE;
8082 } else {
8083 sInShifted = TRUE;
8084 }
8085 }
8086 secS &= UCOL_PRIMARYMASK;
8087
8088
8089 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
8090 secT = *(tCE++);
8091 if(isContinuation(secT)) {
8092 if(!tInShifted) {
8093 continue;
8094 }
8095 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
8096 secT = UCOL_PRIMARYMASK;
8097 tInShifted = FALSE;
8098 } else {
8099 tInShifted = TRUE;
8100 }
8101 }
8102 secT &= UCOL_PRIMARYMASK;
8103
8104 if(secS == secT) {
8105 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
8106 break;
8107 } else {
8108 secS = 0; secT = 0;
8109 continue;
8110 }
8111 } else {
8112 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8113 goto commonReturn;
8114 }
8115 }
8116 } else if(doHiragana && hirResult != UCOL_EQUAL) {
8117 // If we're fine on quaternaries, we might be different
8118 // on Hiragana. This, however, might fail us in shifted.
8119 result = hirResult;
8120 goto commonReturn;
8121 }
8122
8123 /* For IDENTICAL comparisons, we use a bitwise character comparison */
8124 /* as a tiebreaker if all else is equal. */
8125 /* Getting here should be quite rare - strings are not identical - */
8126 /* that is checked first, but compared == through all other checks. */
8127 if(checkIdent)
8128 {
8129 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8130 result = ucol_checkIdent(sColl, tColl, TRUE, status);
8131 }
8132
// Common exit: heap memory is released only if one of the iterates was flagged
// UCOL_ITER_ALLOCATED (ucol_CEBuf_Expand sets that flag when it is called).
8133 commonReturn:
8134 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
8135 freeHeapWritableBuffer(sColl);
8136 freeHeapWritableBuffer(tColl);
8137
8138 if (sCEs.buf != sCEs.localArray ) {
8139 uprv_free(sCEs.buf);
8140 }
8141 if (tCEs.buf != tCEs.localArray ) {
8142 uprv_free(tCEs.buf);
8143 }
8144 }
8145
8146 return result;
8147 }
8148
8149
8150 static inline uint32_t
8151 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
8152 uint32_t CE, const UChar *s, int32_t *index, int32_t len)
8153 {
8154 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8155 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8156 int32_t offset = 1;
8157 UChar schar = 0, tchar = 0;
8158
8159 for(;;) {
8160 if(len == -1) {
8161 if(s[*index] == 0) { // end of string
8162 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8163 } else {
8164 schar = s[*index];
8165 }
8166 } else {
8167 if(*index == len) {
8168 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8169 } else {
8170 schar = s[*index];
8171 }
8172 }
8173
8174 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8175 offset++;
8176 }
8177
8178 if (schar == tchar) {
8179 (*index)++;
8180 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8181 }
8182 else
8183 {
8184 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8185 return UCOL_BAIL_OUT_CE;
8186 }
8187 // skip completely ignorables
8188 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8189 if(isZeroCE == 0) { // we have to ignore completely ignorables
8190 (*index)++;
8191 continue;
8192 }
8193
8194 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8195 }
8196 }
8197 }
8198
8199
8200 /**
8201 * This is a fast strcoll, geared towards text in Latin-1.
8202 * It supports contractions of size two, French secondaries
8203 * and case switching. You can use it with strengths primary
8204 * to tertiary. It does not support shifted and case level.
8205 * It relies on the table built by setupLatin1Table. If it
8206 * doesn't understand something, it will go to the regular
8207 * strcoll.
8208 */
// sLen / tLen are the lengths of source / target, or -1 for NUL-terminated text.
// coll->latinOneCEs is read as consecutive tables of coll->latinOneTableLen
// entries each: the primary table first, then the secondary, then the tertiary
// one ('elements' is advanced by one table length per level).
// Within a level, a CE pair that differs is compared one top byte at a time:
// if the top bytes are equal both CEs are shifted left by 8 bits and the
// comparison continues with the next byte.
// Returns UCOL_LESS, UCOL_EQUAL or UCOL_GREATER; anything the fast path cannot
// handle jumps to returnRegular, which calls ucol_strcollRegular().
8209 static inline UCollationResult
8210 ucol_strcollUseLatin1( const UCollator *coll,
8211 const UChar *source,
8212 int32_t sLen,
8213 const UChar *target,
8214 int32_t tLen,
8215 UErrorCode *status)
8216 {
8217 U_ALIGN_CODE(16);
8218 int32_t strength = coll->strength;
8219
8220 int32_t sIndex = 0, tIndex = 0;
8221 UChar sChar = 0, tChar = 0;
8222 uint32_t sOrder=0, tOrder=0;
8223
8224 UBool endOfSource = FALSE;
8225
8226 uint32_t *elements = coll->latinOneCEs;
8227
8228 UBool haveContractions = FALSE; // if we have contractions in our string
8229 // we cannot do French secondary
8230
8231 // Do the primary level
8232 for(;;) {
8233 while(sOrder==0) { // this loop skips primary ignorables
8234 // sOrder=getNextlatinOneCE(source);
8235 if(sLen==-1) { // handling zero terminated strings
8236 sChar=source[sIndex++];
8237 if(sChar==0) {
8238 endOfSource = TRUE;
8239 break;
8240 }
8241 } else { // handling strings with known length
8242 if(sIndex==sLen) {
8243 endOfSource = TRUE;
8244 break;
8245 }
8246 sChar=source[sIndex++];
8247 }
8248 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8249 //fprintf(stderr, "R");
8250 goto returnRegular;
8251 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8252 }
8253 sOrder = elements[sChar];
8254 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8255 // specials can basically be either contractions or bail-out signs. If we get anything
8256 // else, we'll bail out anyway
8257 if(getCETag(sOrder) == CONTRACTION_TAG) {
8258 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8259 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8260 // However, if there are contractions in the table, but we always use just one char,
8261 // we might be able to do French. This should be checked out.
8262 }
8263 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8264 //fprintf(stderr, "S");
8265 goto returnRegular;
8266 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8267 }
8268 }
8269 }
8270
8271 while(tOrder==0) { // this loop skips primary ignorables
8272 // tOrder=getNextlatinOneCE(target);
8273 if(tLen==-1) { // handling zero terminated strings
8274 tChar=target[tIndex++];
8275 if(tChar==0) {
8276 if(endOfSource) { // this is different than source loop,
8277 // as we already know that source loop is done here,
8278 // so we can either finish the primary loop if both
8279 // strings are done or announce the result if only
8280 // target is done. Same below.
8281 goto endOfPrimLoop;
8282 } else {
8283 return UCOL_GREATER;
8284 }
8285 }
8286 } else { // handling strings with known length
8287 if(tIndex==tLen) {
8288 if(endOfSource) {
8289 goto endOfPrimLoop;
8290 } else {
8291 return UCOL_GREATER;
8292 }
8293 }
8294 tChar=target[tIndex++];
8295 }
8296 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8297 //fprintf(stderr, "R");
8298 goto returnRegular;
8299 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8300 }
8301 tOrder = elements[tChar];
8302 if(tOrder >= UCOL_NOT_FOUND) {
8303 // Handling specials, see the comments for source
8304 if(getCETag(tOrder) == CONTRACTION_TAG) {
8305 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8306 haveContractions = TRUE;
8307 }
8308 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8309 //fprintf(stderr, "S");
8310 goto returnRegular;
8311 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8312 }
8313 }
8314 }
8315 if(endOfSource) { // source is finished, but target is not, say the result.
8316 return UCOL_LESS;
8317 }
8318
8319 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8320 sOrder = 0; tOrder = 0;
8321 continue;
8322 } else {
8323 // compare current top bytes
8324 if(((sOrder^tOrder)&0xFF000000)!=0) {
8325 // top bytes differ, return difference
8326 if(sOrder < tOrder) {
8327 return UCOL_LESS;
8328 } else if(sOrder > tOrder) {
8329 return UCOL_GREATER;
8330 }
8331 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8332 // since we must return enum value
8333 }
8334
8335 // top bytes match, continue with following bytes
8336 sOrder<<=8;
8337 tOrder<<=8;
8338 }
8339 }
8340
8341 endOfPrimLoop:
8342 // after primary loop, we definitely know the sizes of strings,
8343 // so we set it and use simpler loop for secondaries and tertiaries
// NOTE(review): for NUL-terminated input the index has already been advanced
// past the terminator when the loops above stop, so these lengths appear to
// include the terminating NUL - confirm that this is intended/harmless.
8344 sLen = sIndex; tLen = tIndex;
8345 if(strength >= UCOL_SECONDARY) {
8346 // adjust the table beginning
8347 elements += coll->latinOneTableLen;
8348 endOfSource = FALSE;
8349
8350 if(coll->frenchCollation == UCOL_OFF) { // non French
8351 // This loop is a simplified copy of primary loop
8352 // at this point we know that whole strings are latin-1, so we don't
8353 // check for that. We also know that we only have contractions as
8354 // specials.
8355 sIndex = 0; tIndex = 0;
8356 for(;;) {
8357 while(sOrder==0) {
8358 if(sIndex==sLen) {
8359 endOfSource = TRUE;
8360 break;
8361 }
8362 sChar=source[sIndex++];
8363 sOrder = elements[sChar];
8364 if(sOrder > UCOL_NOT_FOUND) {
8365 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8366 }
8367 }
8368
8369 while(tOrder==0) {
8370 if(tIndex==tLen) {
8371 if(endOfSource) {
8372 goto endOfSecLoop;
8373 } else {
8374 return UCOL_GREATER;
8375 }
8376 }
8377 tChar=target[tIndex++];
8378 tOrder = elements[tChar];
8379 if(tOrder > UCOL_NOT_FOUND) {
8380 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8381 }
8382 }
8383 if(endOfSource) {
8384 return UCOL_LESS;
8385 }
8386
8387 if(sOrder == tOrder) {
8388 sOrder = 0; tOrder = 0;
8389 continue;
8390 } else {
8391 // see primary loop for comments on this
8392 if(((sOrder^tOrder)&0xFF000000)!=0) {
8393 if(sOrder < tOrder) {
8394 return UCOL_LESS;
8395 } else if(sOrder > tOrder) {
8396 return UCOL_GREATER;
8397 }
8398 }
8399 sOrder<<=8;
8400 tOrder<<=8;
8401 }
8402 }
8403 } else { // French
8404 if(haveContractions) { // if we have contractions, we have to bail out
8405 // since we don't really know how to handle them here
8406 goto returnRegular;
8407 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8408 }
8409 // For French, we go backwards
8410 sIndex = sLen; tIndex = tLen;
8411 for(;;) {
8412 while(sOrder==0) {
8413 if(sIndex==0) {
8414 endOfSource = TRUE;
8415 break;
8416 }
8417 sChar=source[--sIndex];
8418 sOrder = elements[sChar];
8419 // don't even look for contractions
8420 }
8421
8422 while(tOrder==0) {
8423 if(tIndex==0) {
8424 if(endOfSource) {
8425 goto endOfSecLoop;
8426 } else {
8427 return UCOL_GREATER;
8428 }
8429 }
8430 tChar=target[--tIndex];
8431 tOrder = elements[tChar];
8432 // don't even look for contractions
8433 }
8434 if(endOfSource) {
8435 return UCOL_LESS;
8436 }
8437
8438 if(sOrder == tOrder) {
8439 sOrder = 0; tOrder = 0;
8440 continue;
8441 } else {
8442 // see the primary loop for comments
8443 if(((sOrder^tOrder)&0xFF000000)!=0) {
8444 if(sOrder < tOrder) {
8445 return UCOL_LESS;
8446 } else if(sOrder > tOrder) {
8447 return UCOL_GREATER;
8448 }
8449 }
8450 sOrder<<=8;
8451 tOrder<<=8;
8452 }
8453 }
8454 }
8455 }
8456
8457 endOfSecLoop:
8458 if(strength >= UCOL_TERTIARY) {
8459 // tertiary loop is the same as secondary (except no French)
8460 elements += coll->latinOneTableLen;
8461 sIndex = 0; tIndex = 0;
8462 endOfSource = FALSE;
8463 for(;;) {
8464 while(sOrder==0) {
8465 if(sIndex==sLen) {
8466 endOfSource = TRUE;
8467 break;
8468 }
8469 sChar=source[sIndex++];
8470 sOrder = elements[sChar];
8471 if(sOrder > UCOL_NOT_FOUND) {
8472 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8473 }
8474 }
8475 while(tOrder==0) {
8476 if(tIndex==tLen) {
8477 if(endOfSource) {
8478 return UCOL_EQUAL; // if both strings are at the end, they are equal
8479 } else {
8480 return UCOL_GREATER;
8481 }
8482 }
8483 tChar=target[tIndex++];
8484 tOrder = elements[tChar];
8485 if(tOrder > UCOL_NOT_FOUND) {
8486 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8487 }
8488 }
8489 if(endOfSource) {
8490 return UCOL_LESS;
8491 }
8492 if(sOrder == tOrder) {
8493 sOrder = 0; tOrder = 0;
8494 continue;
8495 } else {
8496 if(((sOrder^tOrder)&0xff000000)!=0) {
8497 if(sOrder < tOrder) {
8498 return UCOL_LESS;
8499 } else if(sOrder > tOrder) {
8500 return UCOL_GREATER;
8501 }
8502 }
8503 sOrder<<=8;
8504 tOrder<<=8;
8505 }
8506 }
8507 }
8508 return UCOL_EQUAL;
8509
// Fallback: restart the whole comparison with the regular algorithm, using the
// current values of sLen / tLen.
8510 returnRegular:
8511 // Preparing the context objects for iterating over strings
8512 collIterate sColl, tColl;
8513
8514 IInit_collIterate(coll, source, sLen, &sColl);
8515 IInit_collIterate(coll, target, tLen, &tColl);
8516 return ucol_strcollRegular(&sColl, &tColl, status);
8517 }
8518
8519
8520 U_CAPI UCollationResult U_EXPORT2
8521 ucol_strcollIter( const UCollator *coll,
8522 UCharIterator *sIter,
8523 UCharIterator *tIter,
8524 UErrorCode *status)
8525 {
8526 if(!status || U_FAILURE(*status)) {
8527 return UCOL_EQUAL;
8528 }
8529
8530 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8531 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8532
8533 if (sIter == tIter) {
8534 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8535 return UCOL_EQUAL;
8536 }
8537 if(sIter == NULL || tIter == NULL || coll == NULL) {
8538 *status = U_ILLEGAL_ARGUMENT_ERROR;
8539 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8540 return UCOL_EQUAL;
8541 }
8542
8543 UCollationResult result = UCOL_EQUAL;
8544
8545 // Preparing the context objects for iterating over strings
8546 collIterate sColl, tColl;
8547 // The division for the array length may truncate the array size to
8548 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8549 // for all platforms anyway.
8550 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8551 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8552 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8553
8554 IInit_collIterate(coll, NULL, -1, &sColl);
8555 sColl.iterator = sIter;
8556 sColl.flags |= UCOL_USE_ITERATOR;
8557 IInit_collIterate(coll, NULL, -1, &tColl);
8558 tColl.flags |= UCOL_USE_ITERATOR;
8559 tColl.iterator = tIter;
8560
8561 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8562 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8563 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8564 sColl.flags &= ~UCOL_ITER_NORM;
8565
8566 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8567 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8568 tColl.flags &= ~UCOL_ITER_NORM;
8569 }
8570
8571 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8572
8573 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8574 (tChar = tColl.iterator->next(tColl.iterator))) {
8575 if(sChar == U_SENTINEL) {
8576 result = UCOL_EQUAL;
8577 goto end_compare;
8578 }
8579 }
8580
8581 if(sChar == U_SENTINEL) {
8582 tChar = tColl.iterator->previous(tColl.iterator);
8583 }
8584
8585 if(tChar == U_SENTINEL) {
8586 sChar = sColl.iterator->previous(sColl.iterator);
8587 }
8588
8589 sChar = sColl.iterator->previous(sColl.iterator);
8590 tChar = tColl.iterator->previous(tColl.iterator);
8591
8592 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8593 {
8594 // We are stopped in the middle of a contraction.
8595 // Scan backwards through the == part of the string looking for the start of the contraction.
8596 // It doesn't matter which string we scan, since they are the same in this region.
8597 do
8598 {
8599 sChar = sColl.iterator->previous(sColl.iterator);
8600 tChar = tColl.iterator->previous(tColl.iterator);
8601 }
8602 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8603 }
8604
8605
8606 if(U_SUCCESS(*status)) {
8607 result = ucol_strcollRegular(&sColl, &tColl, status);
8608 }
8609
8610 end_compare:
8611 if(sNormIter || tNormIter) {
8612 unorm_closeIter(sNormIter);
8613 unorm_closeIter(tNormIter);
8614 }
8615
8616 UTRACE_EXIT_VALUE_STATUS(result, *status)
8617 return result;
8618 }
8619
8620
8621 /* */
8622 /* ucol_strcoll Main public API string comparison function */
8623 /* */
8624 U_CAPI UCollationResult U_EXPORT2
8625 ucol_strcoll( const UCollator *coll,
8626 const UChar *source,
8627 int32_t sourceLength,
8628 const UChar *target,
8629 int32_t targetLength)
8630 {
8631 U_ALIGN_CODE(16);
8632
8633 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8634 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8635 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8636 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8637 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8638 }
8639
8640 if(source == NULL || target == NULL) {
8641 // do not crash, but return. Should have
8642 // status argument to return error.
8643 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8644 return UCOL_EQUAL;
8645 }
8646
8647 /* Quick check if source and target are same strings. */
8648 /* They should either both be NULL terminated or the explicit length should be set on both. */
8649 if (source==target && sourceLength==targetLength) {
8650 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8651 return UCOL_EQUAL;
8652 }
8653
8654 /* Scan the strings. Find: */
8655 /* The length of any leading portion that is equal */
8656 /* Whether they are exactly equal. (in which case we just return) */
8657 const UChar *pSrc = source;
8658 const UChar *pTarg = target;
8659 int32_t equalLength;
8660
8661 if (sourceLength == -1 && targetLength == -1) {
8662 // Both strings are null terminated.
8663 // Scan through any leading equal portion.
8664 while (*pSrc == *pTarg && *pSrc != 0) {
8665 pSrc++;
8666 pTarg++;
8667 }
8668 if (*pSrc == 0 && *pTarg == 0) {
8669 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8670 return UCOL_EQUAL;
8671 }
8672 equalLength = pSrc - source;
8673 }
8674 else
8675 {
8676 // One or both strings has an explicit length.
8677 const UChar *pSrcEnd = source + sourceLength;
8678 const UChar *pTargEnd = target + targetLength;
8679
8680 // Scan while the strings are bitwise ==, or until one is exhausted.
8681 for (;;) {
8682 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8683 break;
8684 }
8685 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8686 break;
8687 }
8688 if (*pSrc != *pTarg) {
8689 break;
8690 }
8691 pSrc++;
8692 pTarg++;
8693 }
8694 equalLength = pSrc - source;
8695
8696 // If we made it all the way through both strings, we are done. They are ==
8697 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8698 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */
8699 {
8700 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8701 return UCOL_EQUAL;
8702 }
8703 }
8704 if (equalLength > 0) {
8705 /* There is an identical portion at the beginning of the two strings. */
8706 /* If the identical portion ends within a contraction or a comibining */
8707 /* character sequence, back up to the start of that sequence. */
8708
8709 // These values should already be set by the code above.
8710 //pSrc = source + equalLength; /* point to the first differing chars */
8711 //pTarg = target + equalLength;
8712 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8713 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8714 {
8715 // We are stopped in the middle of a contraction.
8716 // Scan backwards through the == part of the string looking for the start of the contraction.
8717 // It doesn't matter which string we scan, since they are the same in this region.
8718 do
8719 {
8720 equalLength--;
8721 pSrc--;
8722 }
8723 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8724 }
8725
8726 source += equalLength;
8727 target += equalLength;
8728 if (sourceLength > 0) {
8729 sourceLength -= equalLength;
8730 }
8731 if (targetLength > 0) {
8732 targetLength -= equalLength;
8733 }
8734 }
8735
8736 UErrorCode status = U_ZERO_ERROR;
8737 UCollationResult returnVal;
8738 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8739 collIterate sColl, tColl;
8740 // Preparing the context objects for iterating over strings
8741 IInit_collIterate(coll, source, sourceLength, &sColl);
8742 IInit_collIterate(coll, target, targetLength, &tColl);
8743 returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8744 } else {
8745 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8746 }
8747 UTRACE_EXIT_VALUE(returnVal);
8748 return returnVal;
8749 }
8750
8751 /* convenience function for comparing strings */
8752 U_CAPI UBool U_EXPORT2
8753 ucol_greater( const UCollator *coll,
8754 const UChar *source,
8755 int32_t sourceLength,
8756 const UChar *target,
8757 int32_t targetLength)
8758 {
8759 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8760 == UCOL_GREATER);
8761 }
8762
8763 /* convenience function for comparing strings */
8764 U_CAPI UBool U_EXPORT2
8765 ucol_greaterOrEqual( const UCollator *coll,
8766 const UChar *source,
8767 int32_t sourceLength,
8768 const UChar *target,
8769 int32_t targetLength)
8770 {
8771 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8772 != UCOL_LESS);
8773 }
8774
8775 /* convenience function for comparing strings */
8776 U_CAPI UBool U_EXPORT2
8777 ucol_equal( const UCollator *coll,
8778 const UChar *source,
8779 int32_t sourceLength,
8780 const UChar *target,
8781 int32_t targetLength)
8782 {
8783 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8784 == UCOL_EQUAL);
8785 }
8786
8787 U_CAPI void U_EXPORT2
8788 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8789 if(coll && coll->UCA) {
8790 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8791 }
8792 }
8793
8794 #endif /* #if !UCONFIG_NO_COLLATION */