]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol.cpp
ICU-8.11.tar.gz
[apple/icu.git] / icuSources / i18n / ucol.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: ucol.cpp
7 * encoding: US-ASCII
8 * tab size: 8 (not used)
9 * indentation:4
10 *
11 * Modification history
12 * Date Name Comments
13 * 1996-1999 various members of ICU team maintained C API for collation framework
14 * 02/16/2001 synwee Added internal method getPrevSpecialCE
15 * 03/01/2001 synwee Added maxexpansion functionality.
16 * 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant
17 */
18
19 #include "unicode/utypes.h"
20 #include "uassert.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/coleitr.h"
25 #include "unicode/unorm.h"
26 #include "unicode/udata.h"
27 #include "unicode/ustring.h"
28
29 #include "ucol_imp.h"
30 #include "ucol_elm.h"
31 #include "bocsu.h"
32
33 #include "unormimp.h"
34 #include "unorm_it.h"
35 #include "umutex.h"
36 #include "cmemory.h"
37 #include "ucln_in.h"
38 #include "cstring.h"
39 #include "utracimp.h"
40 #include "putilimp.h"
41
42 #ifdef UCOL_DEBUG
43 #include <stdio.h>
44 #endif
45
46 U_NAMESPACE_USE
47
/* Constants for trie manipulation (added by synwee). */
#define STAGE_1_SHIFT_            10
#define STAGE_2_SHIFT_            4
#define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
#define STAGE_3_MASK_             0xF
#define LAST_BYTE_MASK_           0xFF
#define SECOND_LAST_BYTE_SHIFT_   8

#define ZERO_CC_LIMIT_            0xC0
57
58 // static UCA. There is only one. Collators don't use it.
59 // It is referenced only in ucol_initUCA and ucol_cleanup
60 static UCollator* _staticUCA = NULL;
61 // static pointer to udata memory. Inited in ucol_initUCA
62 // used for cleanup in ucol_cleanup
63 static UDataMemory* UCA_DATA_MEM = NULL;
64
65 // this is static pointer to the normalizer fcdTrieIndex
66 // it is always the same between calls to u_cleanup
67 // and therefore writing to it is not synchronized.
68 // It is cleaned in ucol_cleanup
69 static const uint16_t *fcdTrieIndex=NULL;
70
71 // These are values from UCA required for
72 // implicit generation and supressing sort key compression
73 // they should regularly be in the UCA, but if one
74 // is running without UCA, it could be a problem
75 static int32_t maxRegularPrimary = 0xA0;
76 static int32_t minImplicitPrimary = 0xE0;
77 static int32_t maxImplicitPrimary = 0xE4;
78
79 U_CDECL_BEGIN
80 static UBool U_CALLCONV
81 isAcceptableUCA(void * /*context*/,
82 const char * /*type*/, const char * /*name*/,
83 const UDataInfo *pInfo){
84 /* context, type & name are intentionally not used */
85 if( pInfo->size>=20 &&
86 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
87 pInfo->charsetFamily==U_CHARSET_FAMILY &&
88 pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 && /* dataFormat="UCol" */
89 pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
90 pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
91 pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
92 pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
93 pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
94 //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
95 //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
96 //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
97 ) {
98 UVersionInfo UCDVersion;
99 u_getUnicodeVersion(UCDVersion);
100 if(pInfo->dataVersion[0]==UCDVersion[0] &&
101 pInfo->dataVersion[1]==UCDVersion[1]) { // &&
102 //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
103 //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
104 return TRUE;
105 } else {
106 return FALSE;
107 }
108 } else {
109 return FALSE;
110 }
111 }
112
113
114 static int32_t U_CALLCONV
115 _getFoldingOffset(uint32_t data) {
116 return (int32_t)(data&0xFFFFFF);
117 }
118
119 U_CDECL_END
120
121 static
122 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
123 int32_t sourceLen, collIterate *s) {
124 (s)->string = (s)->pos = (UChar *)(sourceString);
125 (s)->origFlags = 0;
126 (s)->flags = 0;
127 if (sourceLen >= 0) {
128 s->flags |= UCOL_ITER_HASLEN;
129 (s)->endp = (UChar *)sourceString+sourceLen;
130 }
131 else {
132 /* change to enable easier checking for end of string for fcdpositon */
133 (s)->endp = NULL;
134 }
135 (s)->CEpos = (s)->toReturn = (s)->CEs;
136 (s)->writableBuffer = (s)->stackWritableBuffer;
137 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
138 (s)->coll = (collator);
139 (s)->fcdPosition = 0;
140 if(collator->normalizationMode == UCOL_ON) {
141 (s)->flags |= UCOL_ITER_NORM;
142 }
143 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
144 (s)->flags |= UCOL_HIRAGANA_Q;
145 }
146 (s)->iterator = NULL;
147 //(s)->iteratorIndex = 0;
148 }
149
150 U_CAPI void U_EXPORT2
151 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
152 int32_t sourceLen, collIterate *s){
153 /* Out-of-line version for use from other files. */
154 IInit_collIterate(collator, sourceString, sourceLen, s);
155 }
156
157
158 /**
159 * Backup the state of the collIterate struct data
160 * @param data collIterate to backup
161 * @param backup storage
162 */
163 static
164 inline void backupState(const collIterate *data, collIterateState *backup)
165 {
166 backup->fcdPosition = data->fcdPosition;
167 backup->flags = data->flags;
168 backup->origFlags = data->origFlags;
169 backup->pos = data->pos;
170 backup->bufferaddress = data->writableBuffer;
171 backup->buffersize = data->writableBufSize;
172 backup->iteratorMove = 0;
173 backup->iteratorIndex = 0;
174 if(data->iterator != NULL) {
175 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
176 backup->iteratorIndex = data->iterator->getState(data->iterator);
177 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
178 if(backup->iteratorIndex == UITER_NO_STATE) {
179 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
180 backup->iteratorMove++;
181 data->iterator->move(data->iterator, -1, UITER_CURRENT);
182 }
183 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
184 }
185 }
186 }
187
188 /**
189 * Loads the state into the collIterate struct data
190 * @param data collIterate to backup
191 * @param backup storage
192 * @param forwards boolean to indicate if forwards iteration is used,
193 * false indicates backwards iteration
194 */
195 static
196 inline void loadState(collIterate *data, const collIterateState *backup,
197 UBool forwards)
198 {
199 UErrorCode status = U_ZERO_ERROR;
200 data->flags = backup->flags;
201 data->origFlags = backup->origFlags;
202 if(data->iterator != NULL) {
203 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
204 data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
205 if(backup->iteratorMove != 0) {
206 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
207 }
208 }
209 data->pos = backup->pos;
210 if ((data->flags & UCOL_ITER_INNORMBUF) &&
211 data->writableBuffer != backup->bufferaddress) {
212 /*
213 this is when a new buffer has been reallocated and we'll have to
214 calculate the new position.
215 note the new buffer has to contain the contents of the old buffer.
216 */
217 if (forwards) {
218 data->pos = data->writableBuffer +
219 (data->pos - backup->bufferaddress);
220 }
221 else {
222 /* backwards direction */
223 uint32_t temp = backup->buffersize -
224 (data->pos - backup->bufferaddress);
225 data->pos = data->writableBuffer + (data->writableBufSize - temp);
226 }
227 }
228 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
229 /*
230 this is alittle tricky.
231 if we are initially not in the normalization buffer, even if we
232 normalize in the later stage, the data in the buffer will be
233 ignored, since we skip back up to the data string.
234 however if we are already in the normalization buffer, any
235 further normalization will pull data into the normalization
236 buffer and modify the fcdPosition.
237 since we are keeping the data in the buffer for use, the
238 fcdPosition can not be reverted back.
239 arrgghh....
240 */
241 data->fcdPosition = backup->fcdPosition;
242 }
243 }
244
245
246 /*
247 * collIter_eos()
248 * Checks for a collIterate being positioned at the end of
249 * its source string.
250 *
251 */
252 static
253 inline UBool collIter_eos(collIterate *s) {
254 if(s->flags & UCOL_USE_ITERATOR) {
255 return !(s->iterator->hasNext(s->iterator));
256 }
257 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
258 // Null terminated string, but not at null, so not at end.
259 // Whether in main or normalization buffer doesn't matter.
260 return FALSE;
261 }
262
263 // String with length. Can't be in normalization buffer, which is always
264 // null termintated.
265 if (s->flags & UCOL_ITER_HASLEN) {
266 return (s->pos == s->endp);
267 }
268
269 // We are at a null termination, could be either normalization buffer or main string.
270 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
271 // At null at end of main string.
272 return TRUE;
273 }
274
275 // At null at end of normalization buffer. Need to check whether there there are
276 // any characters left in the main buffer.
277 if(s->origFlags & UCOL_USE_ITERATOR) {
278 return !(s->iterator->hasNext(s->iterator));
279 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
280 // Null terminated main string. fcdPosition is the 'return' position into main buf.
281 return (*s->fcdPosition == 0);
282 }
283 else {
284 // Main string with an end pointer.
285 return s->fcdPosition == s->endp;
286 }
287 }
288
289 /*
290 * collIter_bos()
291 * Checks for a collIterate being positioned at the start of
292 * its source string.
293 *
294 */
295 static
296 inline UBool collIter_bos(collIterate *source) {
297 // if we're going backwards, we need to know whether there is more in the
298 // iterator, even if we are in the side buffer
299 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
300 return !source->iterator->hasPrevious(source->iterator);
301 }
302 if (source->pos <= source->string ||
303 ((source->flags & UCOL_ITER_INNORMBUF) &&
304 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
305 return TRUE;
306 }
307 return FALSE;
308 }
309
310 static
311 inline UBool collIter_SimpleBos(collIterate *source) {
312 // if we're going backwards, we need to know whether there is more in the
313 // iterator, even if we are in the side buffer
314 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
315 return !source->iterator->hasPrevious(source->iterator);
316 }
317 if (source->pos == source->string) {
318 return TRUE;
319 }
320 return FALSE;
321 }
322 //return (data->pos == data->string) ||
323
324
325 /**
326 * Checks and free writable buffer if it is not the original stack buffer
327 * in collIterate. This function does not reassign the writable buffer.
328 * @param data collIterate struct to determine and free the writable buffer
329 */
330 static
331 inline void freeHeapWritableBuffer(collIterate *data)
332 {
333 if (data->writableBuffer != data->stackWritableBuffer) {
334 uprv_free(data->writableBuffer);
335 }
336 }
337
338
339 /****************************************************************************/
340 /* Following are the open/close functions */
341 /* */
342 /****************************************************************************/
343
344 static UCollator*
345 ucol_initFromBinary(const uint8_t *bin, int32_t length,
346 const UCollator *base,
347 UCollator *fillIn,
348 UErrorCode *status)
349 {
350 UCollator *result = fillIn;
351 if(U_FAILURE(*status)) {
352 return NULL;
353 }
354 /*
355 if(base == NULL) {
356 // we don't support null base yet
357 *status = U_ILLEGAL_ARGUMENT_ERROR;
358 return NULL;
359 }
360 */
361 // We need these and we could be running without UCA
362 uprv_uca_initImplicitConstants(0, 0, status);
363 UCATableHeader *colData = (UCATableHeader *)bin;
364 // do we want version check here? We're trying to figure out whether collators are compatible
365 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
366 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
367 colData->version[0] != UCOL_BUILDER_VERSION)
368 {
369 *status = U_COLLATOR_VERSION_MISMATCH;
370 return NULL;
371 }
372 else {
373 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
374 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
375 if(U_FAILURE(*status)){
376 return NULL;
377 }
378 result->hasRealData = TRUE;
379 }
380 else {
381 if(base) {
382 result = ucol_initCollator(base->image, result, base, status);
383 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
384 if(U_FAILURE(*status)){
385 return NULL;
386 }
387 result->hasRealData = FALSE;
388 }
389 else {
390 *status = U_USELESS_COLLATOR_ERROR;
391 return NULL;
392 }
393 }
394 result->freeImageOnClose = FALSE;
395 }
396 result->validLocale = NULL;
397 result->requestedLocale = NULL;
398 result->rules = NULL;
399 result->rulesLength = 0;
400 result->freeRulesOnClose = FALSE;
401 result->rb = NULL;
402 result->elements = NULL;
403 return result;
404 }
405
406 U_CAPI UCollator* U_EXPORT2
407 ucol_openBinary(const uint8_t *bin, int32_t length,
408 const UCollator *base,
409 UErrorCode *status)
410 {
411 return ucol_initFromBinary(bin, length, base, NULL, status);
412 }
413
414 U_CAPI UCollator* U_EXPORT2
415 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
416 {
417 UCollator * localCollator;
418 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
419 char *stackBufferChars = (char *)stackBuffer;
420 int32_t imageSize = 0;
421 int32_t rulesSize = 0;
422 int32_t rulesPadding = 0;
423 uint8_t *image;
424 UChar *rules;
425 UBool colAllocated = FALSE;
426 UBool imageAllocated = FALSE;
427
428 if (status == NULL || U_FAILURE(*status)){
429 return 0;
430 }
431 if ((stackBuffer && !pBufferSize) || !coll){
432 *status = U_ILLEGAL_ARGUMENT_ERROR;
433 return 0;
434 }
435 if (coll->rules && coll->freeRulesOnClose) {
436 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
437 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
438 bufferSizeNeeded += rulesSize + rulesPadding;
439 }
440
441 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
442 *pBufferSize = bufferSizeNeeded;
443 return 0;
444 }
445
446 /* Pointers on 64-bit platforms need to be aligned
447 * on a 64-bit boundry in memory.
448 */
449 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
450 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
451 if (*pBufferSize > offsetUp) {
452 *pBufferSize -= offsetUp;
453 stackBufferChars += offsetUp;
454 }
455 else {
456 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
457 *pBufferSize = 1;
458 }
459 }
460 stackBuffer = (void *)stackBufferChars;
461
462 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
463 /* allocate one here...*/
464 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
465 colAllocated = TRUE;
466 if (U_SUCCESS(*status)) {
467 *status = U_SAFECLONE_ALLOCATED_WARNING;
468 }
469 }
470 localCollator = (UCollator *)stackBufferChars;
471 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
472 {
473 UErrorCode tempStatus = U_ZERO_ERROR;
474 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
475 }
476 if (coll->freeImageOnClose) {
477 image = (uint8_t *)uprv_malloc(imageSize);
478 ucol_cloneBinary(coll, image, imageSize, status);
479 imageAllocated = TRUE;
480 }
481 else {
482 image = (uint8_t *)coll->image;
483 }
484 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
485 if (U_FAILURE(*status)) {
486 return NULL;
487 }
488
489 if (coll->rules) {
490 if (coll->freeRulesOnClose) {
491 localCollator->rules = u_strcpy(rules, coll->rules);
492 //bufferEnd += rulesSize;
493 }
494 else {
495 localCollator->rules = coll->rules;
496 }
497 localCollator->freeRulesOnClose = FALSE;
498 localCollator->rulesLength = coll->rulesLength;
499 }
500
501 int32_t i;
502 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
503 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
504 }
505 localCollator->requestedLocale = NULL; // zero copies of pointers
506 localCollator->validLocale = NULL;
507 localCollator->rb = NULL;
508 localCollator->elements = NULL;
509 localCollator->freeOnClose = colAllocated;
510 localCollator->freeImageOnClose = imageAllocated;
511 return localCollator;
512 }
513
514 U_CAPI void U_EXPORT2
515 ucol_close(UCollator *coll)
516 {
517 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
518 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
519 if(coll != NULL) {
520 // these are always owned by each UCollator struct,
521 // so we always free them
522 if(coll->validLocale != NULL) {
523 uprv_free(coll->validLocale);
524 }
525 if(coll->requestedLocale != NULL) {
526 uprv_free(coll->requestedLocale);
527 }
528 if(coll->resCleaner != NULL) {
529 coll->resCleaner(coll);
530 }
531 if(coll->latinOneCEs != NULL) {
532 uprv_free(coll->latinOneCEs);
533 }
534 if(coll->options != NULL && coll->freeOptionsOnClose) {
535 uprv_free(coll->options);
536 }
537 if(coll->rules != NULL && coll->freeRulesOnClose) {
538 uprv_free((UChar *)coll->rules);
539 }
540 if(coll->image != NULL && coll->freeImageOnClose) {
541 uprv_free((UCATableHeader *)coll->image);
542 }
543
544 /* Here, it would be advisable to close: */
545 /* - UData for UCA (unless we stuff it in the root resb */
546 /* Again, do we need additional housekeeping... HMMM! */
547 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
548 if(coll->freeOnClose){
549 /* for safeClone, if freeOnClose is FALSE,
550 don't free the other instance data */
551 uprv_free(coll);
552 }
553 }
554 UTRACE_EXIT();
555 }
556
557 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
558 /* you should be able to get the binary chunk to write out... Doesn't look very full now */
559 U_CAPI uint8_t* U_EXPORT2
560 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
561 {
562 uint8_t *result = NULL;
563 if(U_FAILURE(*status)) {
564 return NULL;
565 }
566 if(coll->hasRealData == TRUE) {
567 *length = coll->image->size;
568 result = (uint8_t *)uprv_malloc(*length);
569 /* test for NULL */
570 if (result == NULL) {
571 *status = U_MEMORY_ALLOCATION_ERROR;
572 return NULL;
573 }
574 uprv_memcpy(result, coll->image, *length);
575 } else {
576 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
577 result = (uint8_t *)uprv_malloc(*length);
578 /* test for NULL */
579 if (result == NULL) {
580 *status = U_MEMORY_ALLOCATION_ERROR;
581 return NULL;
582 }
583
584 /* build the UCATableHeader with minimal entries */
585 /* do not copy the header from the UCA file because its values are wrong! */
586 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
587
588 /* reset everything */
589 uprv_memset(result, 0, *length);
590
591 /* set the tailoring-specific values */
592 UCATableHeader *myData = (UCATableHeader *)result;
593 myData->size = *length;
594
595 /* offset for the options, the only part of the data that is present after the header */
596 myData->options = sizeof(UCATableHeader);
597
598 /* need to always set the expansion value for an upper bound of the options */
599 myData->expansion = myData->options + sizeof(UColOptionSet);
600
601 myData->magic = UCOL_HEADER_MAGIC;
602 myData->isBigEndian = U_IS_BIG_ENDIAN;
603 myData->charSetFamily = U_CHARSET_FAMILY;
604
605 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
606 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
607
608 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
609 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
610 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
611 myData->jamoSpecial = coll->image->jamoSpecial;
612
613 /* copy the collator options */
614 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
615 }
616 return result;
617 }
618
619 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
620 if(U_FAILURE(*status)) {
621 return;
622 }
623 result->caseFirst = (UColAttributeValue)opts->caseFirst;
624 result->caseLevel = (UColAttributeValue)opts->caseLevel;
625 result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
626 result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
627 result->strength = (UColAttributeValue)opts->strength;
628 result->variableTopValue = opts->variableTopValue;
629 result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
630 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
631 result->numericCollation = (UColAttributeValue)opts->numericCollation;
632
633 result->caseFirstisDefault = TRUE;
634 result->caseLevelisDefault = TRUE;
635 result->frenchCollationisDefault = TRUE;
636 result->normalizationModeisDefault = TRUE;
637 result->strengthisDefault = TRUE;
638 result->variableTopValueisDefault = TRUE;
639 result->hiraganaQisDefault = TRUE;
640 result->numericCollationisDefault = TRUE;
641
642 ucol_updateInternalState(result, status);
643
644 result->options = opts;
645 }
646
647
648 /**
649 * Approximate determination if a character is at a contraction end.
650 * Guaranteed to be TRUE if a character is at the end of a contraction,
651 * otherwise it is not deterministic.
652 * @param c character to be determined
653 * @param coll collator
654 */
655 static
656 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
657 if (U16_IS_TRAIL(c)) {
658 return TRUE;
659 }
660
661 if (c < coll->minContrEndCP) {
662 return FALSE;
663 }
664
665 int32_t hash = c;
666 uint8_t htbyte;
667 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
668 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
669 }
670 htbyte = coll->contrEndCP[hash>>3];
671 return (((htbyte >> (hash & 7)) & 1) == 1);
672 }
673
674
675
676 /*
677 * i_getCombiningClass()
678 * A fast, at least partly inline version of u_getCombiningClass()
679 * This is a candidate for further optimization. Used heavily
680 * in contraction processing.
681 */
682 static
683 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
684 uint8_t sCC = 0;
685 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
686 sCC = u_getCombiningClass(c);
687 }
688 return sCC;
689 }
690
691 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
692 UChar c;
693 UCollator *result = fillIn;
694 if(U_FAILURE(*status) || image == NULL) {
695 return NULL;
696 }
697
698 if(result == NULL) {
699 result = (UCollator *)uprv_malloc(sizeof(UCollator));
700 if(result == NULL) {
701 *status = U_MEMORY_ALLOCATION_ERROR;
702 return result;
703 }
704 result->freeOnClose = TRUE;
705 } else {
706 result->freeOnClose = FALSE;
707 }
708
709 result->image = image;
710 result->mapping.getFoldingOffset = _getFoldingOffset;
711 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
712 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
713 if(U_FAILURE(*status)) {
714 if(result->freeOnClose == TRUE) {
715 uprv_free(result);
716 result = NULL;
717 }
718 return result;
719 }
720
721 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
722 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
723 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
724 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
725 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
726
727 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
728 result->freeOptionsOnClose = FALSE;
729
730 /* set attributes */
731 result->caseFirst = (UColAttributeValue)result->options->caseFirst;
732 result->caseLevel = (UColAttributeValue)result->options->caseLevel;
733 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
734 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
735 result->strength = (UColAttributeValue)result->options->strength;
736 result->variableTopValue = result->options->variableTopValue;
737 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
738 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
739 result->numericCollation = (UColAttributeValue)result->options->numericCollation;
740
741 result->caseFirstisDefault = TRUE;
742 result->caseLevelisDefault = TRUE;
743 result->frenchCollationisDefault = TRUE;
744 result->normalizationModeisDefault = TRUE;
745 result->strengthisDefault = TRUE;
746 result->variableTopValueisDefault = TRUE;
747 result->alternateHandlingisDefault = TRUE;
748 result->hiraganaQisDefault = TRUE;
749 result->numericCollationisDefault = TRUE;
750
751 /*result->scriptOrder = NULL;*/
752
753 result->rules = NULL;
754 result->rulesLength = 0;
755
756 /* get the version info from UCATableHeader and populate the Collator struct*/
757 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
758 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
759 result->dataVersion[2] = 0;
760 result->dataVersion[3] = 0;
761
762 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
763 result->minUnsafeCP = 0;
764 for (c=0; c<0x300; c++) { // Find the smallest unsafe char.
765 if (ucol_unsafeCP(c, result)) break;
766 }
767 result->minUnsafeCP = c;
768
769 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
770 result->minContrEndCP = 0;
771 for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
772 if (ucol_contractionEndCP(c, result)) break;
773 }
774 result->minContrEndCP = c;
775
776 /* max expansion tables */
777 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
778 result->image->endExpansionCE);
779 result->lastEndExpansionCE = result->endExpansionCE +
780 result->image->endExpansionCECount - 1;
781 result->expansionCESize = (uint8_t*)result->image +
782 result->image->expansionCESize;
783
784
785 //result->errorCode = *status;
786
787 result->latinOneCEs = NULL;
788
789 result->latinOneRegenTable = FALSE;
790 result->latinOneFailed = FALSE;
791 result->UCA = UCA;
792 result->resCleaner = NULL;
793
794 ucol_updateInternalState(result, status);
795
796
797 return result;
798 }
799
800 /* new Mark's code */
801
802 /**
803 * For generation of Implicit CEs
804 * @author Davis
805 *
806 * Cleaned up so that changes can be made more easily.
807 * Old values:
808 # First Implicit: E26A792D
809 # Last Implicit: E3DC70C0
810 # First CJK: E0030300
811 # Last CJK: E0A9DD00
812 # First CJK_A: E0A9DF00
813 # Last CJK_A: E0DE3100
814 */
815 /* Following is a port of Mark's code for new treatment of implicits.
816 * It is positioned here, since ucol_initUCA need to initialize the
817 * variables below according to the data in the fractional UCA.
818 */
819
820 /**
821 * Function used to:
822 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
824 * The relevant blocks are:
825 * A: 4E00..9FFF; CJK Unified Ideographs
826 * F900..FAFF; CJK Compatibility Ideographs
827 * B: 3400..4DBF; CJK Unified Ideographs Extension A
828 * 20000..XX; CJK Unified Ideographs Extension B (and others later on)
829 * As long as
830 * no new B characters are allocated between 4E00 and FAFF, and
831 * no new A characters are outside of this range,
832 * (very high probability) this simple code will work.
833 * The reordered blocks are:
834 * Block1 is CJK
835 * Block2 is CJK_COMPAT_USED
836 * Block3 is CJK_A
837 * (all contiguous)
838 * Any other CJK gets its normal code point
 * Any non-CJK gets +0x110000 (NON_CJK_OFFSET)
840 * When we reorder Block1, we make sure that it is at the very start,
841 * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
843 * NOT decomposed, so that block is smaller!
844 */
845
846 // CONSTANTS
847 static const UChar32
848 NON_CJK_OFFSET = 0x110000,
849 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
850
851 /**
852 * Precomputed by constructor
853 */
854 static int32_t
855 final3Multiplier = 0,
856 final4Multiplier = 0,
857 final3Count = 0,
858 final4Count = 0,
859 medialCount = 0,
860 min3Primary = 0,
861 min4Primary = 0,
862 max4Primary = 0,
863 minTrail = 0,
864 maxTrail = 0,
865 max3Trail = 0,
866 max4Trail = 0,
867 min4Boundary = 0;
868
869 static const UChar32
870 CJK_BASE = 0x4E00,
871 CJK_LIMIT = 0x9FFF+1,
872 CJK_COMPAT_USED_BASE = 0xFA0E,
873 CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
874 CJK_A_BASE = 0x3400,
875 CJK_A_LIMIT = 0x4DBF+1,
876 CJK_B_BASE = 0x20000,
877 CJK_B_LIMIT = 0x2A6DF+1;
878
879 static UChar32 swapCJK(UChar32 i) {
880
881 if (i >= CJK_BASE) {
882 if (i < CJK_LIMIT) return i - CJK_BASE;
883
884 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
885
886 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
887 + (CJK_LIMIT - CJK_BASE);
888 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
889
890 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
891
892 return i + NON_CJK_OFFSET; // non-CJK
893 }
894 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
895
896 if (i < CJK_A_LIMIT) return i - CJK_A_BASE
897 + (CJK_LIMIT - CJK_BASE)
898 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
899 return i + NON_CJK_OFFSET; // non-CJK
900 }
901
902 U_CAPI UChar32 U_EXPORT2
903 uprv_uca_getRawFromCodePoint(UChar32 i) {
904 return swapCJK(i)+1;
905 }
906
907 U_CAPI UChar32 U_EXPORT2
908 uprv_uca_getCodePointFromRaw(UChar32 i) {
909 i--;
910 UChar32 result = 0;
911 if(i >= NON_CJK_OFFSET) {
912 result = i - NON_CJK_OFFSET;
913 } else if(i >= CJK_B_BASE) {
914 result = i;
915 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
916 if(i < CJK_LIMIT - CJK_BASE) {
917 result = i + CJK_BASE;
918 } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
919 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
920 } else {
921 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
922 }
923 } else {
924 result = -1;
925 }
926 return result;
927 }
928
929 // GET IMPLICIT PRIMARY WEIGHTS
930 // Return value is left justified primary key
931 U_CAPI uint32_t U_EXPORT2
932 uprv_uca_getImplicitFromRaw(UChar32 cp) {
933 /*
934 if (cp < 0 || cp > UCOL_MAX_INPUT) {
935 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
936 }
937 */
938 int32_t last0 = cp - min4Boundary;
939 if (last0 < 0) {
940 int32_t last1 = cp / final3Count;
941 last0 = cp % final3Count;
942
943 int32_t last2 = last1 / medialCount;
944 last1 %= medialCount;
945
946 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
947 last1 = minTrail + last1; // offset
948 last2 = min3Primary + last2; // offset
949 /*
950 if (last2 >= min4Primary) {
951 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
952 }
953 */
954 return (last2 << 24) + (last1 << 16) + (last0 << 8);
955 } else {
956 int32_t last1 = last0 / final4Count;
957 last0 %= final4Count;
958
959 int32_t last2 = last1 / medialCount;
960 last1 %= medialCount;
961
962 int32_t last3 = last2 / medialCount;
963 last2 %= medialCount;
964
965 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
966 last1 = minTrail + last1; // offset
967 last2 = minTrail + last2; // offset
968 last3 = min4Primary + last3; // offset
969 /*
970 if (last3 > max4Primary) {
971 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
972 }
973 */
974 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
975 }
976 }
977
978 U_CAPI uint32_t U_EXPORT2
979 uprv_uca_getImplicitPrimary(UChar32 cp) {
980 //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
981
982 cp = swapCJK(cp);
983 cp++;
984 // we now have a range of numbers from 0 to 21FFFF.
985
986 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
987
988 return uprv_uca_getImplicitFromRaw(cp);
989 }
990
991 /**
992 * Converts implicit CE into raw integer ("code point")
993 * @param implicit
994 * @return -1 if illegal format
995 */
996 U_CAPI UChar32 U_EXPORT2
997 uprv_uca_getRawFromImplicit(uint32_t implicit) {
998 UChar32 result;
999 UChar32 b3 = implicit & 0xFF;
1000 implicit >>= 8;
1001 UChar32 b2 = implicit & 0xFF;
1002 implicit >>= 8;
1003 UChar32 b1 = implicit & 0xFF;
1004 implicit >>= 8;
1005 UChar32 b0 = implicit & 0xFF;
1006
1007 // simple parameter checks
1008 if (b0 < min3Primary || b0 > max4Primary
1009 || b1 < minTrail || b1 > maxTrail) return -1;
1010 // normal offsets
1011 b1 -= minTrail;
1012
1013 // take care of the final values, and compose
1014 if (b0 < min4Primary) {
1015 if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1016 b2 -= minTrail;
1017 UChar32 remainder = b2 % final3Multiplier;
1018 if (remainder != 0) return -1;
1019 b0 -= min3Primary;
1020 b2 /= final3Multiplier;
1021 result = ((b0 * medialCount) + b1) * final3Count + b2;
1022 } else {
1023 if (b2 < minTrail || b2 > maxTrail
1024 || b3 < minTrail || b3 > max4Trail) return -1;
1025 b2 -= minTrail;
1026 b3 -= minTrail;
1027 UChar32 remainder = b3 % final4Multiplier;
1028 if (remainder != 0) return -1;
1029 b3 /= final4Multiplier;
1030 b0 -= min4Primary;
1031 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1032 }
1033 // final check
1034 if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1035 return result;
1036 }
1037
1038
/* Computes 1 + (a-1)/b, which is a/b rounded up when a and b are positive. */
static inline int32_t divideAndRoundUp(int a, int b) {
    int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1042
1043 /* this function is either called from initUCA or from genUCA before
1044 * doing canonical closure for the UCA.
1045 */
1046
1047 /**
1048 * Set up to generate implicits.
1049 * @param minPrimary
1050 * @param maxPrimary
1051 * @param minTrail final byte
1052 * @param maxTrail final byte
1053 * @param gap3 the gap we leave for tailoring for 3-byte forms
1054 * @param gap4 the gap we leave for tailoring for 4-byte forms
1055 */
1056 static void initImplicitConstants(int minPrimary, int maxPrimary,
1057 int minTrailIn, int maxTrailIn,
1058 int gap3, int primaries3count,
1059 UErrorCode *status) {
1060 // some simple parameter checks
1061 if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1062 *status = U_ILLEGAL_ARGUMENT_ERROR;
1063 return;
1064 };
1065 if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1066 *status = U_ILLEGAL_ARGUMENT_ERROR;
1067 return;
1068 };
1069 if (primaries3count < 1) {
1070 *status = U_ILLEGAL_ARGUMENT_ERROR;
1071 return;
1072 };
1073
1074 minTrail = minTrailIn;
1075 maxTrail = maxTrailIn;
1076
1077 min3Primary = minPrimary;
1078 max4Primary = maxPrimary;
1079 // compute constants for use later.
1080 // number of values we can use in trailing bytes
1081 // leave room for empty values between AND above, e.g. if gap = 2
1082 // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1083 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1084 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1085 final3Multiplier = gap3 + 1;
1086 final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1087 max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1088
1089 // medials can use full range
1090 medialCount = (maxTrail - minTrail + 1);
1091 // find out how many values fit in each form
1092 int32_t threeByteCount = medialCount * final3Count;
1093 // now determine where the 3/4 boundary is.
1094 // we use 3 bytes below the boundary, and 4 above
1095 int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1096 int32_t primaries4count = primariesAvailable - primaries3count;
1097
1098
1099 int32_t min3ByteCoverage = primaries3count * threeByteCount;
1100 min4Primary = minPrimary + primaries3count;
1101 min4Boundary = min3ByteCoverage;
1102 // Now expand out the multiplier for the 4 bytes, and redo.
1103
1104 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1105 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1106 //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1107 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1108 //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1109 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1110 //if (DEBUG) System.out.println("expandedGap: " + gap4);
1111 if (gap4 < 1) {
1112 *status = U_ILLEGAL_ARGUMENT_ERROR;
1113 return;
1114 }
1115 final4Multiplier = gap4 + 1;
1116 final4Count = neededPerFinalByte;
1117 max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1118 /*
1119 if (DEBUG) {
1120 System.out.println("final4Count: " + final4Count);
1121 for (int counter = 0; counter <= final4Count; ++counter) {
1122 int value = minTrail + (1 + counter)*final4Multiplier;
1123 System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1124 }
1125 }
1126 */
1127 }
1128
1129 /**
1130 * Supply parameters for generating implicit CEs
1131 */
1132 U_CAPI void U_EXPORT2
1133 uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode *status) {
1134 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1135 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1136 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1137 }
1138
1139 U_CDECL_BEGIN
1140 static UBool U_CALLCONV
1141 ucol_cleanup(void)
1142 {
1143 if (UCA_DATA_MEM) {
1144 udata_close(UCA_DATA_MEM);
1145 UCA_DATA_MEM = NULL;
1146 }
1147 if (_staticUCA) {
1148 ucol_close(_staticUCA);
1149 _staticUCA = NULL;
1150 }
1151 fcdTrieIndex = NULL;
1152 return TRUE;
1153 }
1154 U_CDECL_END
1155
1156 /* do not close UCA returned by ucol_initUCA! */
1157 UCollator *
1158 ucol_initUCA(UErrorCode *status) {
1159 if(U_FAILURE(*status)) {
1160 return NULL;
1161 }
1162 umtx_lock(NULL);
1163 UBool f = (_staticUCA == NULL);
1164 umtx_unlock(NULL);
1165
1166 if(f) {
1167 UCollator *newUCA = NULL;
1168 UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1169
1170 if(U_FAILURE(*status)) {
1171 if (result) {
1172 udata_close(result);
1173 }
1174 uprv_free(newUCA);
1175 }
1176
1177 // init FCD data
1178 if (fcdTrieIndex == NULL) {
1179 fcdTrieIndex = unorm_getFCDTrie(status);
1180 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1181 }
1182
1183 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1184 newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
1185 if(U_SUCCESS(*status)){
1186 newUCA->rb = NULL;
1187 newUCA->elements = NULL;
1188 newUCA->validLocale = NULL;
1189 newUCA->requestedLocale = NULL;
1190 newUCA->hasRealData = FALSE; // real data lives in .dat file...
1191 newUCA->freeImageOnClose = FALSE;
1192 umtx_lock(NULL);
1193 if(_staticUCA == NULL) {
1194 _staticUCA = newUCA;
1195 UCA_DATA_MEM = result;
1196 result = NULL;
1197 newUCA = NULL;
1198 }
1199 umtx_unlock(NULL);
1200
1201 if(newUCA != NULL) {
1202 udata_close(result);
1203 uprv_free(newUCA);
1204 }
1205 else {
1206 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1207 }
1208 // Initalize variables for implicit generation
1209 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1210 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1211 //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset;
1212 }else{
1213 udata_close(result);
1214 uprv_free(newUCA);
1215 _staticUCA= NULL;
1216 }
1217 }
1218 }
1219 return _staticUCA;
1220 }
1221
1222
1223 /* collIterNormalize Incremental Normalization happens here. */
1224 /* pick up the range of chars identifed by FCD, */
1225 /* normalize it into the collIterate's writable buffer, */
1226 /* switch the collIterate's state to use the writable buffer. */
1227 /* */
1228 static
1229 void collIterNormalize(collIterate *collationSource)
1230 {
1231 UErrorCode status = U_ZERO_ERROR;
1232
1233 int32_t normLen;
1234 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */
1235 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */
1236
1237 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1238 srcP, (int32_t)(endP - srcP),
1239 FALSE, 0,
1240 &status);
1241 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1242 // reallocate and terminate
1243 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1244 &collationSource->writableBuffer,
1245 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1246 0)
1247 ) {
1248 #ifdef UCOL_DEBUG
1249 fprintf(stderr, "collIterNormalize(), out of memory\n");
1250 #endif
1251 return;
1252 }
1253 status = U_ZERO_ERROR;
1254 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1255 srcP, (int32_t)(endP - srcP),
1256 FALSE, 0,
1257 &status);
1258 }
1259 if (U_FAILURE(status)) {
1260 #ifdef UCOL_DEBUG
1261 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1262 #endif
1263 return;
1264 }
1265
1266 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1267 collationSource->flags |= UCOL_ITER_ALLOCATED;
1268 }
1269 collationSource->pos = collationSource->writableBuffer;
1270 collationSource->origFlags = collationSource->flags;
1271 collationSource->flags |= UCOL_ITER_INNORMBUF;
1272 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1273 }
1274
1275
1276 // This function takes the iterator and extracts normalized stuff up to the next boundary
1277 // It is similar in the end results to the collIterNormalize, but for the cases when we
1278 // use an iterator
1279 static
1280 inline void normalizeIterator(collIterate *collationSource) {
1281 UErrorCode status = U_ZERO_ERROR;
1282 UBool wasNormalized = FALSE;
1283 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1284 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1285 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1286 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1287 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1288 // reallocate and terminate
1289 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1290 &collationSource->writableBuffer,
1291 (int32_t *)&collationSource->writableBufSize, normLen + 1,
1292 0)
1293 ) {
1294 #ifdef UCOL_DEBUG
1295 fprintf(stderr, "normalizeIterator(), out of memory\n");
1296 #endif
1297 return;
1298 }
1299 status = U_ZERO_ERROR;
1300 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1301 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1302 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1303 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1304 }
1305 // Terminate the buffer - we already checked that it is big enough
1306 collationSource->writableBuffer[normLen] = 0;
1307 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1308 collationSource->flags |= UCOL_ITER_ALLOCATED;
1309 }
1310 collationSource->pos = collationSource->writableBuffer;
1311 collationSource->origFlags = collationSource->flags;
1312 collationSource->flags |= UCOL_ITER_INNORMBUF;
1313 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1314 }
1315
1316
1317 /* Incremental FCD check and normalize */
1318 /* Called from getNextCE when normalization state is suspect. */
1319 /* When entering, the state is known to be this: */
1320 /* o We are working in the main buffer of the collIterate, not the side */
1321 /* writable buffer. When in the side buffer, normalization mode is always off, */
1322 /* so we won't get here. */
1323 /* o The leading combining class from the current character is 0 or */
1324 /* the trailing combining class of the previous char was zero. */
1325 /* True because the previous call to this function will have always exited */
1326 /* that way, and we get called for every char where cc might be non-zero. */
1327 static
1328 inline UBool collIterFCD(collIterate *collationSource) {
1329 UChar c, c2;
1330 const UChar *srcP, *endP;
1331 uint8_t leadingCC;
1332 uint8_t prevTrailingCC = 0;
1333 uint16_t fcd;
1334 UBool needNormalize = FALSE;
1335
1336 srcP = collationSource->pos-1;
1337
1338 if (collationSource->flags & UCOL_ITER_HASLEN) {
1339 endP = collationSource->endp;
1340 } else {
1341 endP = NULL;
1342 }
1343
1344 // Get the trailing combining class of the current character. If it's zero,
1345 // we are OK.
1346 c = *srcP++;
1347 /* trie access */
1348 fcd = unorm_getFCD16(fcdTrieIndex, c);
1349 if (fcd != 0) {
1350 if (U16_IS_LEAD(c)) {
1351 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1352 ++srcP;
1353 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1354 } else {
1355 fcd = 0;
1356 }
1357 }
1358
1359 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1360
1361 if (prevTrailingCC != 0) {
1362 // The current char has a non-zero trailing CC. Scan forward until we find
1363 // a char with a leading cc of zero.
1364 while (endP == NULL || srcP != endP)
1365 {
1366 const UChar *savedSrcP = srcP;
1367
1368 c = *srcP++;
1369 /* trie access */
1370 fcd = unorm_getFCD16(fcdTrieIndex, c);
1371 if (fcd != 0 && U16_IS_LEAD(c)) {
1372 if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1373 ++srcP;
1374 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1375 } else {
1376 fcd = 0;
1377 }
1378 }
1379 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1380 if (leadingCC == 0) {
1381 srcP = savedSrcP; // Hit char that is not part of combining sequence.
1382 // back up over it. (Could be surrogate pair!)
1383 break;
1384 }
1385
1386 if (leadingCC < prevTrailingCC) {
1387 needNormalize = TRUE;
1388 }
1389
1390 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1391 }
1392 }
1393 }
1394
1395 collationSource->fcdPosition = (UChar *)srcP;
1396
1397 return needNormalize;
1398 }
1399
1400 /****************************************************************************/
1401 /* Following are the CE retrieval functions */
1402 /* */
1403 /****************************************************************************/
1404
1405 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1406 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1407
1408 /* there should be a macro version of this function in the header file */
1409 /* This is the first function that tries to fetch a collation element */
1410 /* If it's not succesfull or it encounters a more difficult situation */
1411 /* some more sofisticated and slower functions are invoked */
1412 static
1413 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1414 uint32_t order = 0;
1415 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
1416 order = *(collationSource->toReturn++); /* if so, return them */
1417 if(collationSource->CEpos == collationSource->toReturn) {
1418 collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1419 }
1420 return order;
1421 }
1422
1423 UChar ch = 0;
1424
1425 for (;;) /* Loop handles case when incremental normalize switches */
1426 { /* to or from the side buffer / original string, and we */
1427 /* need to start again to get the next character. */
1428
1429 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1430 {
1431 // The source string is null terminated and we're not working from the side buffer,
1432 // and we're not normalizing. This is the fast path.
1433 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1434 ch = *collationSource->pos++;
1435 if (ch != 0) {
1436 break;
1437 }
1438 else {
1439 return UCOL_NO_MORE_CES;
1440 }
1441 }
1442
1443 if (collationSource->flags & UCOL_ITER_HASLEN) {
1444 // Normal path for strings when length is specified.
1445 // (We can't be in side buffer because it is always null terminated.)
1446 if (collationSource->pos >= collationSource->endp) {
1447 // Ran off of the end of the main source string. We're done.
1448 return UCOL_NO_MORE_CES;
1449 }
1450 ch = *collationSource->pos++;
1451 }
1452 else if(collationSource->flags & UCOL_USE_ITERATOR) {
1453 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1454 if(iterCh == U_SENTINEL) {
1455 return UCOL_NO_MORE_CES;
1456 }
1457 ch = (UChar)iterCh;
1458 }
1459 else
1460 {
1461 // Null terminated string.
1462 ch = *collationSource->pos++;
1463 if (ch == 0) {
1464 // Ran off end of buffer.
1465 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1466 // Ran off end of main string. backing up one character.
1467 collationSource->pos--;
1468 return UCOL_NO_MORE_CES;
1469 }
1470 else
1471 {
1472 // Hit null in the normalize side buffer.
1473 // Usually this means the end of the normalized data,
1474 // except for one odd case: a null followed by combining chars,
1475 // which is the case if we are at the start of the buffer.
1476 if (collationSource->pos == collationSource->writableBuffer+1) {
1477 break;
1478 }
1479
1480 // Null marked end of side buffer.
1481 // Revert to the main string and
1482 // loop back to top to try again to get a character.
1483 collationSource->pos = collationSource->fcdPosition;
1484 collationSource->flags = collationSource->origFlags;
1485 continue;
1486 }
1487 }
1488 }
1489
1490 if(collationSource->flags&UCOL_HIRAGANA_Q) {
1491 if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1492 collationSource->flags |= UCOL_WAS_HIRAGANA;
1493 } else {
1494 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1495 }
1496 }
1497
1498 // We've got a character. See if there's any fcd and/or normalization stuff to do.
1499 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1500 if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1501 break;
1502 }
1503
1504 if (collationSource->fcdPosition >= collationSource->pos) {
1505 // An earlier FCD check has already covered the current character.
1506 // We can go ahead and process this char.
1507 break;
1508 }
1509
1510 if (ch < ZERO_CC_LIMIT_ ) {
1511 // Fast fcd safe path. Trailing combining class == 0. This char is OK.
1512 break;
1513 }
1514
1515 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1516 // We need to peek at the next character in order to tell if we are FCD
1517 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1518 // We are at the last char of source string.
1519 // It is always OK for FCD check.
1520 break;
1521 }
1522
1523 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test
1524 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1525 break;
1526 }
1527 }
1528
1529
1530 // Need a more complete FCD check and possible normalization.
1531 if (collIterFCD(collationSource)) {
1532 collIterNormalize(collationSource);
1533 }
1534 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1535 // No normalization was needed. Go ahead and process the char we already had.
1536 break;
1537 }
1538
1539 // Some normalization happened. Next loop iteration will pick up a char
1540 // from the normalization buffer.
1541
1542 } // end for (;;)
1543
1544
1545 if (ch <= 0xFF) {
1546 /* For latin-1 characters we never need to fall back to the UCA table */
1547 /* because all of the UCA data is replicated in the latinOneMapping array */
1548 order = coll->latinOneMapping[ch];
1549 if (order > UCOL_NOT_FOUND) {
1550 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1551 }
1552 }
1553 else
1554 {
1555 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1556 if(order > UCOL_NOT_FOUND) { /* if a CE is special */
1557 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */
1558 }
1559 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */
1560 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1561 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1562
1563 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1564 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1565 }
1566 }
1567 }
1568 if(order == UCOL_NOT_FOUND) {
1569 order = getImplicit(ch, collationSource);
1570 }
1571 return order; /* return the CE */
1572 }
1573
1574 /* ucol_getNextCE, out-of-line version for use from other files. */
1575 U_CAPI uint32_t U_EXPORT2
1576 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1577 return ucol_IGetNextCE(coll, collationSource, status);
1578 }
1579
1580
1581 /**
1582 * Incremental previous normalization happens here. Pick up the range of chars
1583 * identifed by FCD, normalize it into the collIterate's writable buffer,
1584 * switch the collIterate's state to use the writable buffer.
1585 * @param data collation iterator data
1586 */
1587 static
1588 void collPrevIterNormalize(collIterate *data)
1589 {
1590 UErrorCode status = U_ZERO_ERROR;
1591 UChar *pEnd = data->pos; /* End normalize + 1 */
1592 UChar *pStart;
1593 uint32_t normLen;
1594 UChar *pStartNorm;
1595
1596 /* Start normalize */
1597 if (data->fcdPosition == NULL) {
1598 pStart = data->string;
1599 }
1600 else {
1601 pStart = data->fcdPosition + 1;
1602 }
1603
1604 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1605 data->writableBuffer, 0, &status);
1606
1607 if (data->writableBufSize <= normLen) {
1608 freeHeapWritableBuffer(data);
1609 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1610 sizeof(UChar));
1611 if(data->writableBuffer == NULL) { // something is wrong here, return
1612 return;
1613 }
1614 data->flags |= UCOL_ITER_ALLOCATED;
1615 /* to handle the zero termination */
1616 data->writableBufSize = normLen + 1;
1617 }
1618 status = U_ZERO_ERROR;
1619 /*
1620 this puts the null termination infront of the normalized string instead
1621 of the end
1622 */
1623 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1624 *(pStartNorm - 1) = 0;
1625 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1626 normLen, &status);
1627
1628 data->pos = data->writableBuffer + data->writableBufSize;
1629 data->origFlags = data->flags;
1630 data->flags |= UCOL_ITER_INNORMBUF;
1631 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1632 }
1633
1634
1635 /**
1636 * Incremental FCD check for previous iteration and normalize. Called from
1637 * getPrevCE when normalization state is suspect.
1638 * When entering, the state is known to be this:
1639 * o We are working in the main buffer of the collIterate, not the side
1640 * writable buffer. When in the side buffer, normalization mode is always
1641 * off, so we won't get here.
1642 * o The leading combining class from the current character is 0 or the
1643 * trailing combining class of the previous char was zero.
1644 * True because the previous call to this function will have always exited
1645 * that way, and we get called for every char where cc might be non-zero.
1646 * @param data collation iterate struct
1647 * @return normalization status, TRUE for normalization to be done, FALSE
1648 * otherwise
1649 */
1650 static
1651 inline UBool collPrevIterFCD(collIterate *data)
1652 {
1653 const UChar *src, *start;
1654 UChar c, c2;
1655 uint8_t leadingCC;
1656 uint8_t trailingCC = 0;
1657 uint16_t fcd;
1658 UBool result = FALSE;
1659
1660 start = data->string;
1661 src = data->pos + 1;
1662
1663 /* Get the trailing combining class of the current character. */
1664 c = *--src;
1665 if (!U16_IS_SURROGATE(c)) {
1666 fcd = unorm_getFCD16(fcdTrieIndex, c);
1667 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1668 --src;
1669 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1670 if (fcd != 0) {
1671 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1672 }
1673 } else /* unpaired surrogate */ {
1674 fcd = 0;
1675 }
1676
1677 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1678
1679 if (leadingCC != 0) {
1680 /*
1681 The current char has a non-zero leading combining class.
1682 Scan backward until we find a char with a trailing cc of zero.
1683 */
1684 for (;;)
1685 {
1686 if (start == src) {
1687 data->fcdPosition = NULL;
1688 return result;
1689 }
1690
1691 c = *--src;
1692 if (!U16_IS_SURROGATE(c)) {
1693 fcd = unorm_getFCD16(fcdTrieIndex, c);
1694 } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1695 --src;
1696 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1697 if (fcd != 0) {
1698 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1699 }
1700 } else /* unpaired surrogate */ {
1701 fcd = 0;
1702 }
1703
1704 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1705
1706 if (trailingCC == 0) {
1707 break;
1708 }
1709
1710 if (leadingCC < trailingCC) {
1711 result = TRUE;
1712 }
1713
1714 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1715 }
1716 }
1717
1718 data->fcdPosition = (UChar *)src;
1719
1720 return result;
1721 }
1722
1723 /** gets a character from the string at a given offset
1724 * Handles both normal and iterative cases.
1725 * No error checking - caller beware!
1726 */
1727 inline static
1728 UChar peekCharacter(collIterate *source, int32_t offset) {
1729 if(source->pos != NULL) {
1730 return *(source->pos + offset);
1731 } else if(source->iterator != NULL) {
1732 if(offset != 0) {
1733 source->iterator->move(source->iterator, offset, UITER_CURRENT);
1734 UChar toReturn = (UChar)source->iterator->next(source->iterator);
1735 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1736 return toReturn;
1737 } else {
1738 return (UChar)source->iterator->current(source->iterator);
1739 }
1740 } else {
1741 return (UChar)U_SENTINEL;
1742 }
1743 }
1744
1745 /**
1746 * Determines if we are at the start of the data string in the backwards
1747 * collation iterator
1748 * @param data collation iterator
1749 * @return TRUE if we are at the start
1750 */
1751 static
1752 inline UBool isAtStartPrevIterate(collIterate *data) {
1753 if(data->pos == NULL && data->iterator != NULL) {
1754 return !data->iterator->hasPrevious(data->iterator);
1755 }
1756 //return (collIter_bos(data)) ||
1757 return (data->pos == data->string) ||
1758 ((data->flags & UCOL_ITER_INNORMBUF) &&
1759 *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1760 }
1761
1762 static
1763 inline void goBackOne(collIterate *data) {
1764 # if 0
1765 // somehow, it looks like we need to keep iterator synced up
1766 // at all times, as above.
1767 if(data->pos) {
1768 data->pos--;
1769 }
1770 if(data->iterator) {
1771 data->iterator->previous(data->iterator);
1772 }
1773 #endif
1774 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1775 data->iterator->previous(data->iterator);
1776 }
1777 if(data->pos) {
1778 data->pos --;
1779 }
1780 }
1781
1782 /**
1783 * Inline function that gets a simple CE.
1784 * So what it does is that it will first check the expansion buffer. If the
1785 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1786 * is different from the string pointer, we return the collation element at the
1787 * return pointer and decrement it.
1788 * For more complicated CEs it resorts to getComplicatedCE.
1789 * @param coll collator data
1790 * @param data collation iterator struct
1791 * @param status error status
1792 */
1793 static
1794 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1795 UErrorCode *status)
1796 {
1797 uint32_t result = (uint32_t)UCOL_NULLORDER;
1798 if (data->toReturn > data->CEs) {
1799 data->toReturn --;
1800 result = *(data->toReturn);
1801 if (data->CEs == data->toReturn) {
1802 data->CEpos = data->toReturn;
1803 }
1804 }
1805 else {
1806 UChar ch = 0;
1807 /*
1808 Loop handles case when incremental normalize switches to or from the
1809 side buffer / original string, and we need to start again to get the
1810 next character.
1811 */
1812 for (;;) {
1813 if (data->flags & UCOL_ITER_HASLEN) {
1814 /*
1815 Normal path for strings when length is specified.
1816 Not in side buffer because it is always null terminated.
1817 */
1818 if (data->pos <= data->string) {
1819 /* End of the main source string */
1820 return UCOL_NO_MORE_CES;
1821 }
1822 data->pos --;
1823 ch = *data->pos;
1824 }
1825 // we are using an iterator to go back. Pray for us!
1826 else if (data->flags & UCOL_USE_ITERATOR) {
1827 UChar32 iterCh = data->iterator->previous(data->iterator);
1828 if(iterCh == U_SENTINEL) {
1829 return UCOL_NO_MORE_CES;
1830 } else {
1831 ch = (UChar)iterCh;
1832 }
1833 }
1834 else {
1835 data->pos --;
1836 ch = *data->pos;
1837 /* we are in the side buffer. */
1838 if (ch == 0) {
1839 /*
1840 At the start of the normalize side buffer.
1841 Go back to string.
1842 Because pointer points to the last accessed character,
1843 hence we have to increment it by one here.
1844 */
1845 if (data->fcdPosition == NULL) {
1846 data->pos = data->string;
1847 return UCOL_NO_MORE_CES;
1848 }
1849 else {
1850 data->pos = data->fcdPosition + 1;
1851 }
1852 data->flags = data->origFlags;
1853 continue;
1854 }
1855 }
1856
1857 if(data->flags&UCOL_HIRAGANA_Q) {
1858 if(ch>=0x3040 && ch<=0x309f) {
1859 data->flags |= UCOL_WAS_HIRAGANA;
1860 } else {
1861 data->flags &= ~UCOL_WAS_HIRAGANA;
1862 }
1863 }
1864
1865 /*
1866 * got a character to determine if there's fcd and/or normalization
1867 * stuff to do.
1868 * if the current character is not fcd.
1869 * if current character is at the start of the string
1870 * Trailing combining class == 0.
1871 * Note if pos is in the writablebuffer, norm is always 0
1872 */
1873 if (ch < ZERO_CC_LIMIT_ ||
1874 // this should propel us out of the loop in the iterator case
1875 (data->flags & UCOL_ITER_NORM) == 0 ||
1876 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1877 || data->string == data->pos) {
1878 break;
1879 }
1880
1881 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1882 /* if next character is FCD */
1883 if (data->pos == data->string) {
1884 /* First char of string is always OK for FCD check */
1885 break;
1886 }
1887
1888 /* Not first char of string, do the FCD fast test */
1889 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1890 break;
1891 }
1892 }
1893
1894 /* Need a more complete FCD check and possible normalization. */
1895 if (collPrevIterFCD(data)) {
1896 collPrevIterNormalize(data);
1897 }
1898
1899 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1900 /* No normalization. Go ahead and process the char. */
1901 break;
1902 }
1903
1904 /*
1905 Some normalization happened.
1906 Next loop picks up a char from the normalization buffer.
1907 */
1908 }
1909
1910 /* attempt to handle contractions, after removal of the backwards
1911 contraction
1912 */
1913 if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1914 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1915 } else {
1916 if (ch <= 0xFF) {
1917 result = coll->latinOneMapping[ch];
1918 }
1919 else {
1920 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1921 }
1922 if (result > UCOL_NOT_FOUND) {
1923 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1924 }
1925 if (result == UCOL_NOT_FOUND) { // Not found in master list
1926 if (!isAtStartPrevIterate(data) &&
1927 ucol_contractionEndCP(ch, data->coll)) {
1928 result = UCOL_CONTRACTION;
1929 } else {
1930 if(coll->UCA) {
1931 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1932 }
1933 }
1934
1935 if (result > UCOL_NOT_FOUND) {
1936 if(coll->UCA) {
1937 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
1938 }
1939 }
1940 }
1941 }
1942 if(result == UCOL_NOT_FOUND) {
1943 result = getPrevImplicit(ch, data);
1944 }
1945 }
1946 return result;
1947 }
1948
1949
1950 /* ucol_getPrevCE, out-of-line version for use from other files. */
1951 U_CAPI uint32_t U_EXPORT2
1952 ucol_getPrevCE(const UCollator *coll, collIterate *data,
1953 UErrorCode *status) {
1954 return ucol_IGetPrevCE(coll, data, status);
1955 }
1956
1957
1958 /* this should be connected to special Jamo handling */
1959 U_CAPI uint32_t U_EXPORT2
1960 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
1961 collIterate colIt;
1962 uint32_t order;
1963 IInit_collIterate(coll, &u, 1, &colIt);
1964 order = ucol_IGetNextCE(coll, &colIt, status);
1965 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1966 return order;
1967 }
1968
1969 /**
1970 * Inserts the argument character into the end of the buffer pushing back the
1971 * null terminator.
1972 * @param data collIterate struct data
1973 * @param pNull pointer to the null termination
1974 * @param ch character to be appended
1975 * @return the position of the new addition
1976 */
1977 static
1978 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1979 {
1980 uint32_t size = data->writableBufSize;
1981 UChar *newbuffer;
1982 const uint32_t incsize = 5;
1983
1984 if ((data->writableBuffer + size) > (pNull + 1)) {
1985 *pNull = ch;
1986 *(pNull + 1) = 0;
1987 return pNull;
1988 }
1989
1990 /*
1991 buffer will always be null terminated at the end.
1992 giving extra space since it is likely that more characters will be added.
1993 */
1994 size += incsize;
1995 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
1996 if(newbuffer != NULL) { // something wrong, but no status
1997 uprv_memcpy(newbuffer, data->writableBuffer,
1998 data->writableBufSize * sizeof(UChar));
1999
2000 freeHeapWritableBuffer(data);
2001 data->writableBufSize = size;
2002 data->writableBuffer = newbuffer;
2003
2004 newbuffer = newbuffer + data->writableBufSize;
2005 *newbuffer = ch;
2006 *(newbuffer + 1) = 0;
2007 }
2008 return newbuffer;
2009 }
2010
2011 /**
2012 * Inserts the argument string into the end of the buffer pushing back the
2013 * null terminator.
2014 * @param data collIterate struct data
2015 * @param pNull pointer to the null termination
2016 * @param string to be appended
2017 * @param length of the string to be appended
2018 * @return the position of the new addition
2019 */
2020 static
2021 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2022 int32_t length)
2023 {
2024 uint32_t size = pNull - data->writableBuffer;
2025 UChar *newbuffer;
2026
2027 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2028 uprv_memcpy(pNull, str, length * sizeof(UChar));
2029 *(pNull + length) = 0;
2030 return pNull;
2031 }
2032
2033 /*
2034 buffer will always be null terminated at the end.
2035 giving extra space since it is likely that more characters will be added.
2036 */
2037 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2038 if(newbuffer != NULL) {
2039 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2040 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2041
2042 freeHeapWritableBuffer(data);
2043 data->writableBufSize = size + length + 1;
2044 data->writableBuffer = newbuffer;
2045 }
2046
2047 return newbuffer;
2048 }
2049
2050 /**
2051 * Special normalization function for contraction in the forwards iterator.
2052 * This normalization sequence will place the current character at source->pos
2053 * and its following normalized sequence into the buffer.
2054 * The fcd position, pos will be changed.
2055 * pos will now point to positions in the buffer.
2056 * Flags will be changed accordingly.
2057 * @param data collation iterator data
2058 */
2059 static
2060 inline void normalizeNextContraction(collIterate *data)
2061 {
2062 UChar *buffer = data->writableBuffer;
2063 uint32_t buffersize = data->writableBufSize;
2064 uint32_t strsize;
2065 UErrorCode status = U_ZERO_ERROR;
2066 /* because the pointer points to the next character */
2067 UChar *pStart = data->pos - 1;
2068 UChar *pEnd;
2069 uint32_t normLen;
2070 UChar *pStartNorm;
2071
2072 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2073 *data->writableBuffer = *(pStart - 1);
2074 strsize = 1;
2075 }
2076 else {
2077 strsize = u_strlen(data->writableBuffer);
2078 }
2079
2080 pEnd = data->fcdPosition;
2081
2082 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2083 &status);
2084
2085 if (buffersize <= normLen + strsize) {
2086 uint32_t size = strsize + normLen + 1;
2087 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2088 if(temp != NULL) {
2089 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2090 freeHeapWritableBuffer(data);
2091 data->writableBuffer = temp;
2092 data->writableBufSize = size;
2093 data->flags |= UCOL_ITER_ALLOCATED;
2094 }
2095 }
2096
2097 status = U_ZERO_ERROR;
2098 pStartNorm = buffer + strsize;
2099 /* null-termination will be added here */
2100 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2101 normLen + 1, &status);
2102
2103 data->pos = data->writableBuffer + strsize;
2104 data->origFlags = data->flags;
2105 data->flags |= UCOL_ITER_INNORMBUF;
2106 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2107 }
2108
2109 /**
2110 * Contraction character management function that returns the next character
2111 * for the forwards iterator.
2112 * Does nothing if the next character is in buffer and not the first character
2113 * in it.
2114 * Else it checks next character in data string to see if it is normalizable.
2115 * If it is not, the character is simply copied into the buffer, else
2116 * the whole normalized substring is copied into the buffer, including the
2117 * current character.
2118 * @param data collation element iterator data
2119 * @return next character
2120 */
// Summary: hands back the next code unit for contraction matching. Depending
// on the iterator flags the unit comes straight from the source string or the
// character iterator, from the normalization (writable) buffer, or from the
// source string after it has been appended to / normalized into that buffer.
2121 static
2122 inline UChar getNextNormalizedChar(collIterate *data)
2123 {
2124 UChar nextch;
2125 UChar ch;
2126 // Here we need to add the iterator code. One problem is the way
2127 // end of string is handled. If we just return next char, it could
2128 // be the sentinel. Most of the cases already check for this, but we
2129 // need to be sure.
2130 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2131 /* if no normalization and not in buffer. */
2132 if(data->flags & UCOL_USE_ITERATOR) {
// NOTE(review): the iterator result is narrowed to UChar here, so an
// end-of-iteration sentinel would no longer be recognizable by the caller.
2133 return (UChar)data->iterator->next(data->iterator);
2134 } else {
2135 return *(data->pos ++);
2136 }
2137 }
2138
2139 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2140 //normalizeIterator(data);
2141 //}
2142
// pEndWritableBuffer remembers where the normalization buffer's terminator
// was when pos is redirected to the source string below; it is the insertion
// point handed to insertBufferEnd further down.
2143 UChar *pEndWritableBuffer = NULL;
2144 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2145 if ((innormbuf && *data->pos != 0) ||
2146 (data->fcdPosition != NULL && !innormbuf &&
2147 data->pos < data->fcdPosition)) {
2148 /*
2149 if next character is in normalized buffer, no further normalization
2150 is required
2151 */
2152 return *(data->pos ++);
2153 }
2154
2155 if (data->flags & UCOL_ITER_HASLEN) {
2156 /* in data string */
// Last code unit of a length-delimited string: nothing follows it, so it is
// returned without any look-ahead.
2157 if (data->pos + 1 == data->endp) {
2158 return *(data->pos ++);
2159 }
2160 }
2161 else {
2162 if (innormbuf) {
2163 // inside the normalization buffer, but at the end
2164 // (since we encountered zero). This means, in the
2165 // case we're using char iterator, that we need to
2166 // do another round of normalization.
2167 //if(data->origFlags & UCOL_USE_ITERATOR) {
2168 // we need to restore original flags,
2169 // otherwise, we'll lose them
2170 //data->flags = data->origFlags;
2171 //normalizeIterator(data);
2172 //return *(data->pos++);
2173 //} else {
2174 /*
2175 in writable buffer, at this point fcdPosition can not be
2176 pointing to the end of the data string. see contracting tag.
2177 */
2178 if(data->fcdPosition) {
2179 if (*(data->fcdPosition + 1) == 0 ||
2180 data->fcdPosition + 1 == data->endp) {
2181 /* at the end of the string, dump it into the normalizer */
// NOTE(review): insertBufferEnd returns NULL when it has to grow the buffer
// and allocation fails; the "+ 1" below is applied without checking for that.
2182 data->pos = insertBufferEnd(data, data->pos,
2183 *(data->fcdPosition)) + 1;
2184 return *(data->fcdPosition ++);
2185 }
// Continue reading from the source string at fcdPosition, keeping the buffer
// terminator position for the later append.
2186 pEndWritableBuffer = data->pos;
2187 data->pos = data->fcdPosition;
2188 } else if(data->origFlags & UCOL_USE_ITERATOR) {
2189 // if we are here, we're using a normalizing iterator.
2190 // we should just continue further.
2191 data->flags = data->origFlags;
2192 data->pos = NULL;
2193 return (UChar)data->iterator->next(data->iterator);
2194 }
// NOTE(review): if fcdPosition is NULL and origFlags lacks UCOL_USE_ITERATOR,
// control falls through with pEndWritableBuffer still NULL and pos still on
// the buffer terminator; confirm that combination cannot occur.
2195 //}
2196 }
2197 else {
// Null-terminated source string: the unit in front of the terminator is
// returned without look-ahead.
2198 if (*(data->pos + 1) == 0) {
2199 return *(data->pos ++);
2200 }
2201 }
2202 }
2203
// From here on pos points into the source string: ch is the unit to return,
// nextch the one after it (only inspected, not consumed).
2204 ch = *data->pos ++;
2205 nextch = *data->pos;
2206
2207 /*
2208 * if the current character is not fcd.
2209 * Trailing combining class == 0.
2210 */
2211 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2212 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2213 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2214 /*
2215 Need a more complete FCD check and possible normalization.
2216 normalize substring will be appended to buffer
2217 */
2218 if (collIterFCD(data)) {
// normalizeNextContraction leaves pos on the first unit it wrote into the
// normalization buffer.
2219 normalizeNextContraction(data);
2220 return *(data->pos ++);
2221 }
2222 else if (innormbuf) {
2223 /* fcdposition shifted even when there's no normalization, if we
2224 don't input the rest into this, we'll get the wrong position when
2225 we reach the end of the writableBuffer */
// length spans from ch (at pos - 1) up to fcdPosition.
2226 int32_t length = data->fcdPosition - data->pos + 1;
2227 data->pos = insertBufferEnd(data, pEndWritableBuffer,
2228 data->pos - 1, length);
2229 return *(data->pos ++);
2230 }
2231 }
2232
2233 if (innormbuf) {
2234 /*
2235 no normalization is to be done hence only one character will be
2236 appended to the buffer.
2237 */
2238 data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2239 }
2240
2241 /* points back to the pos in string */
2242 return ch;
2243 }
2244
2245
2246
2247 /**
2248 * Function to copy the buffer into writableBuffer and sets the fcd position to
2249 * the correct position
2250 * @param source data string source
2251 * @param buffer character buffer
2252 * @param tempdb current position in buffer that has been used up
2253 */
2254 static
2255 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2256 UChar *tempdb)
2257 {
2258 /* okay confusing part here. to ensure that the skipped characters are
2259 considered later, we need to place it in the appropriate position in the
2260 normalization buffer and reassign the pos pointer. simple case if pos
2261 reside in string, simply copy to normalization buffer and
2262 fcdposition = pos, pos = start of normalization buffer. if pos in
2263 normalization buffer, we'll insert the copy infront of pos and point pos
2264 to the start of the normalization buffer. why am i doing these copies?
2265 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2266 not require any changes, which be really painful. */
2267 uint32_t length = u_strlen(buffer);;
2268 if (source->flags & UCOL_ITER_INNORMBUF) {
2269 u_strcpy(tempdb, source->pos);
2270 }
2271 else {
2272 source->fcdPosition = source->pos;
2273 source->origFlags = source->flags;
2274 source->flags |= UCOL_ITER_INNORMBUF;
2275 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2276 }
2277
2278 if (length >= source->writableBufSize) {
2279 freeHeapWritableBuffer(source);
2280 source->writableBuffer =
2281 (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2282 if(source->writableBuffer == NULL) {
2283 return;
2284 }
2285 source->writableBufSize = length;
2286 }
2287
2288 u_strcpy(source->writableBuffer, buffer);
2289 source->pos = source->writableBuffer;
2290 }
2291
2292 /**
2293 * Function to get the discontiguos collation element within the source.
2294 * Note this function will set the position to the appropriate places.
2295 * @param coll current collator used
2296 * @param source data string source
2297 * @param constart index to the start character in the contraction table
2298 * @return discontiguos collation element offset
2299 */
// Summary: tries to complete a contraction whose remaining code units are
// separated from its start by other combining marks. Code units that do not
// take part are collected in a local buffer and, on success, handed to
// setDiscontiguosAttribute so they are processed afterwards. On failure the
// iterator is rewound to where it was on entry (minus one position) and the
// CE stored at constart is returned.
2300 static
2301 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2302 const UChar *constart)
2303 {
2304 /* source->pos currently points to the second combining character after
2305 the start character */
2306 UChar *temppos = source->pos;
// NOTE(review): buffer has a fixed capacity and tempdb is advanced below
// without any bounds check in this function; confirm callers bound the number
// of skipped code units.
2307 UChar buffer[4*UCOL_MAX_BUFFER];
2308 UChar *tempdb = buffer;
2309 const UChar *tempconstart = constart;
// NOTE(review): assumes source->flags fits in 8 bits; verify against the
// collIterate declaration.
2310 uint8_t tempflags = source->flags;
2311 UBool multicontraction = FALSE;
2312 UChar *tempbufferpos = 0;
2313 collIterateState discState;
2314
// Snapshot of the iterator; restored after the loop if nothing matched.
2315 backupState(source, &discState);
2316
2317 //*tempdb = *(source->pos - 1);
// The buffer starts with the code unit just before the current position.
2318 *tempdb = peekCharacter(source, -1);
2319 tempdb ++;
2320 while (TRUE) {
2321 UChar *UCharOffset;
2322 UChar schar,
2323 tchar;
2324 uint32_t result;
2325
// Stop scanning at the end of the text or when the next code unit has
// combining class 0.
2326 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2327 || (peekCharacter(source, 0) == 0 &&
2328 //|| (*source->pos == 0 &&
2329 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2330 source->fcdPosition == NULL ||
2331 source->fcdPosition == source->endp ||
2332 *(source->fcdPosition) == 0 ||
2333 u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2334 /* end of string in null terminated string or stopped by a
2335 null character, note fcd does not always point to a base
2336 character after the discontiguos change */
2337 u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2338 //u_getCombiningClass(*(source->pos)) == 0) {
2339 //constart = (UChar *)coll->image + getContractOffset(CE);
2340 if (multicontraction) {
// A shorter contraction was recorded earlier: cut the buffer at the point
// remembered then, reposition the source and return the CE stored for the
// current contraction table entry.
2341 *tempbufferpos = 0;
2342 source->pos = temppos - 1;
2343 setDiscontiguosAttribute(source, buffer, tempdb);
2344 return *(coll->contractionCEs +
2345 (tempconstart - coll->contractionIndex));
2346 }
2347 constart = tempconstart;
2348 break;
2349 }
2350
2351 UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2352 schar = getNextNormalizedChar(source);
2353
// Linear search; this relies on the table entries being sorted so that the
// scan stops at the first entry that is not smaller than schar.
2354 while (schar > (tchar = *UCharOffset)) {
2355 UCharOffset++;
2356 }
2357
2358 if (schar != tchar) {
2359 /* not the correct codepoint. we stuff the current codepoint into
2360 the discontiguos buffer and try the next character */
2361 *tempdb = schar;
2362 tempdb ++;
2363 continue;
2364 }
2365 else {
// A table hit is not used when schar has the same combining class as the
// code unit two positions back; it is stashed like any other skipped unit.
2366 if (u_getCombiningClass(schar) ==
2367 u_getCombiningClass(peekCharacter(source, -2))) {
2368 //u_getCombiningClass(*(source->pos - 2))) {
2369 *tempdb = schar;
2370 tempdb ++;
2371 continue;
2372 }
2373 result = *(coll->contractionCEs +
2374 (UCharOffset - coll->contractionIndex));
2375 }
// Terminate the collected text so it can be measured and copied as a string.
2376 *tempdb = 0;
2377
2378 if (result == UCOL_NOT_FOUND) {
2379 break;
2380 } else if (isContraction(result)) {
2381 /* this is a multi-contraction*/
2382 tempconstart = (UChar *)coll->image + getContractOffset(result);
2383 if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2384 != UCOL_NOT_FOUND) {
// Remember position and buffer length so this point can be returned to if
// the longer contraction does not complete.
2385 multicontraction = TRUE;
2386 temppos = source->pos + 1;
2387 tempbufferpos = buffer + u_strlen(buffer);
2388 }
2389 } else {
// Complete match: queue the skipped code units and return its CE.
2390 setDiscontiguosAttribute(source, buffer, tempdb);
2391 return result;
2392 }
2393 }
2394
2395 /* no problems simply reverting just like that,
2396 if we are in string before getting into this function, points back to
2397 string hence no problem.
2398 if we are in normalization buffer before getting into this function,
2399 since we'll never use another normalization within this function, we
2400 know that fcdposition points to a base character. the normalization buffer
2401 never change, hence this revert works. */
2402 loadState(source, &discState, TRUE);
2403 goBackOne(source);
2404
2405 //source->pos = temppos - 1;
2406 source->flags = tempflags;
2407 return *(coll->contractionCEs + (constart - coll->contractionIndex));
2408 }
2409
2410 static
2411 inline UBool isNonChar(UChar32 cp) {
2412 if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2413 return TRUE;
2414 }
2415 return FALSE;
2416 }
2417
2418 /* now uses Mark's getImplicitPrimary code */
2419 static
2420 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2421 if(isNonChar(cp)) {
2422 return 0;
2423 }
2424 uint32_t r = uprv_uca_getImplicitPrimary(cp);
2425 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2426 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2427 }
2428
2429 /**
2430 * Inserts the argument character into the front of the buffer replacing the
2431 * front null terminator.
2432 * @param data collation element iterator data
2433 * @param pNull pointer to the null terminator
2434 * @param ch character to be appended
2435 * @return positon of added character
2436 */
2437 static
2438 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2439 {
2440 uint32_t size = data->writableBufSize;
2441 UChar *end;
2442 UChar *newbuffer;
2443 const uint32_t incsize = 5;
2444
2445 if (pNull > data->writableBuffer + 1) {
2446 *pNull = ch;
2447 *(pNull - 1) = 0;
2448 return pNull;
2449 }
2450
2451 /*
2452 buffer will always be null terminated infront.
2453 giving extra space since it is likely that more characters will be added.
2454 */
2455 size += incsize;
2456 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2457 if(newbuffer == NULL) {
2458 return NULL;
2459 }
2460 end = newbuffer + incsize;
2461 uprv_memcpy(end, data->writableBuffer,
2462 data->writableBufSize * sizeof(UChar));
2463 *end = ch;
2464 *(end - 1) = 0;
2465
2466 freeHeapWritableBuffer(data);
2467
2468 data->writableBufSize = size;
2469 data->writableBuffer = newbuffer;
2470 return end;
2471 }
2472
2473 /**
2474 * Special normalization function for contraction in the previous iterator.
2475 * This normalization sequence will place the current character at source->pos
2476 * and its following normalized sequence into the buffer.
2477 * The fcd position, pos will be changed.
2478 * pos will now point to positions in the buffer.
2479 * Flags will be changed accordingly.
2480 * @param data collation iterator data
2481 */
2482 static
2483 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2484 {
2485 UChar *buffer = data->writableBuffer;
2486 uint32_t buffersize = data->writableBufSize;
2487 uint32_t nulltermsize;
2488 UErrorCode localstatus = U_ZERO_ERROR;
2489 UChar *pEnd = data->pos + 1; /* End normalize + 1 */
2490 UChar *pStart;
2491 uint32_t normLen;
2492 UChar *pStartNorm;
2493
2494 if (data->flags & UCOL_ITER_HASLEN) {
2495 /*
2496 normalization buffer not used yet, we'll pull down the next
2497 character into the end of the buffer
2498 */
2499 *(buffer + (buffersize - 1)) = *(data->pos + 1);
2500 nulltermsize = buffersize - 1;
2501 }
2502 else {
2503 nulltermsize = buffersize;
2504 UChar *temp = buffer + (nulltermsize - 1);
2505 while (*(temp --) != 0) {
2506 nulltermsize --;
2507 }
2508 }
2509
2510 /* Start normalize */
2511 if (data->fcdPosition == NULL) {
2512 pStart = data->string;
2513 }
2514 else {
2515 pStart = data->fcdPosition + 1;
2516 }
2517
2518 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2519 &localstatus);
2520
2521 if (nulltermsize <= normLen) {
2522 uint32_t size = buffersize - nulltermsize + normLen + 1;
2523 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2524 if (temp == NULL) {
2525 *status = U_MEMORY_ALLOCATION_ERROR;
2526 return;
2527 }
2528 nulltermsize = normLen + 1;
2529 uprv_memcpy(temp + normLen, buffer,
2530 sizeof(UChar) * (buffersize - nulltermsize));
2531 freeHeapWritableBuffer(data);
2532 data->writableBuffer = temp;
2533 data->writableBufSize = size;
2534 }
2535
2536 /*
2537 this puts the null termination infront of the normalized string instead
2538 of the end
2539 */
2540 pStartNorm = buffer + (nulltermsize - normLen);
2541 *(pStartNorm - 1) = 0;
2542 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2543 status);
2544
2545 data->pos = data->writableBuffer + nulltermsize;
2546 data->origFlags = data->flags;
2547 data->flags |= UCOL_ITER_INNORMBUF;
2548 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2549 }
2550
2551 /**
2552 * Contraction character management function that returns the previous character
2553 * for the backwards iterator.
2554 * Does nothing if the previous character is in buffer and not the first
2555 * character in it.
2556 * Else it checks previous character in data string to see if it is
2557 * normalizable.
2558 * If it is not, the character is simply copied into the buffer, else
2559 * the whole normalized substring is copied into the buffer, including the
2560 * current character.
2561 * @param data collation element iterator data
2562 * @return previous character
2563 */
2564 static
2565 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2566 {
2567 UChar prevch;
2568 UChar ch;
2569 UChar *start;
2570 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2571 UChar *pNull = NULL;
2572 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2573 (innormbuf && *(data->pos - 1) != 0)) {
2574 /*
2575 if no normalization.
2576 if previous character is in normalized buffer, no further normalization
2577 is required
2578 */
2579 if(data->flags & UCOL_USE_ITERATOR) {
2580 data->iterator->move(data->iterator, -1, UITER_CURRENT);
2581 return (UChar)data->iterator->next(data->iterator);
2582 } else {
2583 return *(data->pos - 1);
2584 }
2585 }
2586
2587 start = data->pos;
2588 if (data->flags & UCOL_ITER_HASLEN) {
2589 /* in data string */
2590 if ((start - 1) == data->string) {
2591 return *(start - 1);
2592 }
2593 start --;
2594 ch = *start;
2595 prevch = *(start - 1);
2596 }
2597 else {
2598 /*
2599 in writable buffer, at this point fcdPosition can not be NULL.
2600 see contracting tag.
2601 */
2602 if (data->fcdPosition == data->string) {
2603 /* at the start of the string, just dump it into the normalizer */
2604 insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2605 data->fcdPosition = NULL;
2606 return *(data->pos - 1);
2607 }
2608 pNull = data->pos - 1;
2609 start = data->fcdPosition;
2610 ch = *start;
2611 prevch = *(start - 1);
2612 }
2613 /*
2614 * if the current character is not fcd.
2615 * Trailing combining class == 0.
2616 */
2617 if (data->fcdPosition > start &&
2618 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2619 {
2620 /*
2621 Need a more complete FCD check and possible normalization.
2622 normalize substring will be appended to buffer
2623 */
2624 UChar *backuppos = data->pos;
2625 data->pos = start;
2626 if (collPrevIterFCD(data)) {
2627 normalizePrevContraction(data, status);
2628 return *(data->pos - 1);
2629 }
2630 data->pos = backuppos;
2631 data->fcdPosition ++;
2632 }
2633
2634 if (innormbuf) {
2635 /*
2636 no normalization is to be done hence only one character will be
2637 appended to the buffer.
2638 */
2639 insertBufferFront(data, pNull, ch);
2640 data->fcdPosition --;
2641 }
2642
2643 return ch;
2644 }
2645
2646 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2647 /* It is called by getNextCE */
2648
2649 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2650 collIterateState entryState;
2651 backupState(source, &entryState);
2652 UChar32 cp = ch;
2653
2654 for (;;) {
2655 // This loop will repeat only in the case of contractions, and only when a contraction
2656 // is found and the first CE resulting from that contraction is itself a special
2657 // (an expansion, for example.) All other special CE types are fully handled the
2658 // first time through, and the loop exits.
2659
2660 const uint32_t *CEOffset = NULL;
2661 switch(getCETag(CE)) {
2662 case NOT_FOUND_TAG:
2663 /* This one is not found, and we'll let somebody else bother about it... no more games */
2664 return CE;
2665 case SURROGATE_TAG:
2666 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2667 /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2668 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2669 /* we return 0 (completely ignorable - per UCA specification */
2670 {
2671 UChar trail;
2672 collIterateState state;
2673 backupState(source, &state);
2674 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2675 // we chould have stepped one char forward and it might have turned that it
2676 // was not a trail surrogate. In that case, we have to backup.
2677 loadState(source, &state, TRUE);
2678 return 0;
2679 } else {
2680 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2681 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
2682 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2683 // We need to backup
2684 loadState(source, &state, TRUE);
2685 return CE;
2686 }
2687 // calculate the supplementary code point value, if surrogate was not tailored
2688 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2689 }
2690 }
2691 break;
2692 case SPEC_PROC_TAG:
2693 {
2694 // Special processing is getting a CE that is preceded by a certain prefix
2695 // Currently this is only needed for optimizing Japanese length and iteration marks.
2696 // When we encouter a special processing tag, we go backwards and try to see if
2697 // we have a match.
2698 // Contraction tables are used - so the whole process is not unlike contraction.
2699 // prefix data is stored backwards in the table.
2700 const UChar *UCharOffset;
2701 UChar schar, tchar;
2702 collIterateState prefixState;
2703 backupState(source, &prefixState);
2704 loadState(source, &entryState, TRUE);
2705 goBackOne(source); // We want to look at the point where we entered - actually one
2706 // before that...
2707
2708 for(;;) {
2709 // This loop will run once per source string character, for as long as we
2710 // are matching a potential contraction sequence
2711
2712 // First we position ourselves at the begining of contraction sequence
2713 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2714 if (collIter_bos(source)) {
2715 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2716 break;
2717 }
2718 schar = getPrevNormalizedChar(source, status);
2719 goBackOne(source);
2720
2721 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2722 UCharOffset++;
2723 }
2724
2725 if (schar == tchar) {
2726 // Found the source string char in the table.
2727 // Pick up the corresponding CE from the table.
2728 CE = *(coll->contractionCEs +
2729 (UCharOffset - coll->contractionIndex));
2730 }
2731 else
2732 {
2733 // Source string char was not in the table.
2734 // We have not found the prefix.
2735 CE = *(coll->contractionCEs +
2736 (ContractionStart - coll->contractionIndex));
2737 }
2738
2739 if(!isPrefix(CE)) {
2740 // The source string char was in the contraction table, and the corresponding
2741 // CE is not a prefix CE. We found the prefix, break
2742 // out of loop, this CE will end up being returned. This is the normal
2743 // way out of prefix handling when the source actually contained
2744 // the prefix.
2745 break;
2746 }
2747 }
2748 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2749 loadState(source, &prefixState, TRUE);
2750 if(source->origFlags & UCOL_USE_ITERATOR) {
2751 source->flags = source->origFlags;
2752 }
2753 } else { // prefix search was a failure, we have to backup all the way to the start
2754 loadState(source, &entryState, TRUE);
2755 }
2756 break;
2757 }
2758 case CONTRACTION_TAG:
2759 {
2760 /* This should handle contractions */
2761 collIterateState state;
2762 backupState(source, &state);
2763 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2764 const UChar *UCharOffset;
2765 UChar schar, tchar;
2766
2767 for (;;) {
2768 /* This loop will run once per source string character, for as long as we */
2769 /* are matching a potential contraction sequence */
2770
2771 /* First we position ourselves at the begining of contraction sequence */
2772 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2773
2774 if (collIter_eos(source)) {
2775 // Ran off the end of the source string.
2776 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2777 // So we'll pick whatever we have at the point...
2778 if (CE == UCOL_NOT_FOUND) {
2779 // back up the source over all the chars we scanned going into this contraction.
2780 CE = firstCE;
2781 loadState(source, &state, TRUE);
2782 if(source->origFlags & UCOL_USE_ITERATOR) {
2783 source->flags = source->origFlags;
2784 }
2785 }
2786 break;
2787 }
2788
2789 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2790 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2791
2792 schar = getNextNormalizedChar(source);
2793 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2794 UCharOffset++;
2795 }
2796
2797 if (schar == tchar) {
2798 // Found the source string char in the contraction table.
2799 // Pick up the corresponding CE from the table.
2800 CE = *(coll->contractionCEs +
2801 (UCharOffset - coll->contractionIndex));
2802 }
2803 else
2804 {
2805 // Source string char was not in contraction table.
2806 // Unless we have a discontiguous contraction, we have finished
2807 // with this contraction.
2808 UChar32 miss = schar;
2809 if(U16_IS_LEAD(schar)) { // in order to do the proper detection, we
2810 // need to see if we're dealing with a supplementary
2811 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2812 }
2813
2814 uint8_t sCC;
2815 if (miss < 0x300 ||
2816 maxCC == 0 ||
2817 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2818 sCC>maxCC ||
2819 (allSame != 0 && sCC == maxCC) ||
2820 collIter_eos(source)) {
2821 // Contraction can not be discontiguous.
2822 goBackOne(source); // back up the source string by one,
2823 // because the character we just looked at was
2824 // not part of the contraction. */
2825 if(U_IS_SUPPLEMENTARY(miss)) {
2826 goBackOne(source);
2827 }
2828 CE = *(coll->contractionCEs +
2829 (ContractionStart - coll->contractionIndex));
2830 } else {
2831 //
2832 // Contraction is possibly discontiguous.
2833 // Scan more of source string looking for a match
2834 //
2835 UChar tempchar;
2836 /* find the next character if schar is not a base character
2837 and we are not yet at the end of the string */
2838 tempchar = getNextNormalizedChar(source);
2839 // probably need another supplementary thingie here
2840 goBackOne(source);
2841 if (i_getCombiningClass(tempchar, coll) == 0) {
2842 goBackOne(source);
2843 if(U_IS_SUPPLEMENTARY(miss)) {
2844 goBackOne(source);
2845 }
2846 /* Spit out the last char of the string, wasn't tasty enough */
2847 CE = *(coll->contractionCEs +
2848 (ContractionStart - coll->contractionIndex));
2849 } else {
2850 CE = getDiscontiguous(coll, source, ContractionStart);
2851 }
2852 }
2853 } // else after if(schar == tchar)
2854
2855 if(CE == UCOL_NOT_FOUND) {
2856 /* The Source string did not match the contraction that we were checking. */
2857 /* Back up the source position to undo the effects of having partially */
2858 /* scanned through what ultimately proved to not be a contraction. */
2859 loadState(source, &state, TRUE);
2860 CE = firstCE;
2861 break;
2862 }
2863
2864 if(!isContraction(CE)) {
2865 // The source string char was in the contraction table, and the corresponding
2866 // CE is not a contraction CE. We completed the contraction, break
2867 // out of loop, this CE will end up being returned. This is the normal
2868 // way out of contraction handling when the source actually contained
2869 // the contraction.
2870 break;
2871 }
2872
2873
2874 // The source string char was in the contraction table, and the corresponding
2875 // CE IS a contraction CE. We will continue looping to check the source
2876 // string for the remaining chars in the contraction.
2877 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2878 if(tempCE != UCOL_NOT_FOUND) {
2879 // We have scanned a section of source string for which there is a
2880 // CE from the contraction table. Remember the CE and scan position, so
2881 // that we can return to this point if further scanning fails to
2882 // match a longer contraction sequence.
2883 firstCE = tempCE;
2884
2885 goBackOne(source);
2886 backupState(source, &state);
2887 getNextNormalizedChar(source);
2888
2889 // Another way to do this is:
2890 //collIterateState tempState;
2891 //backupState(source, &tempState);
2892 //goBackOne(source);
2893 //backupState(source, &state);
2894 //loadState(source, &tempState, TRUE);
2895
2896 // The problem is that for incomplete contractions we have to remember the previous
2897 // position. Before, the only thing I needed to do was state.pos--;
2898 // After iterator introduction and especially after introduction of normalizing
2899 // iterators, it became much more difficult to decrease the saved state.
2900 // I'm not yet sure which of the two methods above is faster.
2901 }
2902 } // for(;;)
2903 break;
2904 } // case CONTRACTION_TAG:
2905 case LONG_PRIMARY_TAG:
2906 {
2907 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2908 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2909 return CE;
2910 }
2911 case EXPANSION_TAG:
2912 {
2913 /* This should handle expansion. */
2914 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2915 /* I have to decide where continuations are going to be dealt with */
2916 uint32_t size;
2917 uint32_t i; /* general counter */
2918 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2919 size = getExpansionCount(CE);
2920 CE = *CEOffset++;
2921 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2922 for(i = 1; i<size; i++) {
2923 *(source->CEpos++) = *CEOffset++;
2924 }
2925 } else { /* else, we do */
2926 while(*CEOffset != 0) {
2927 *(source->CEpos++) = *CEOffset++;
2928 }
2929 }
2930 return CE;
2931 }
2932 case DIGIT_TAG:
2933 {
2934 /*
2935 We do a check to see if we want to collate digits as numbers; if so we generate
2936 a custom collation key. Otherwise we pull out the value stored in the expansion table.
2937 */
2938 uint32_t size;
2939 uint32_t i; /* general counter */
2940
2941 if (source->coll->numericCollation == UCOL_ON){
2942 collIterateState digitState = {0,0,0,0,0,0,0,0};
2943 UChar32 char32 = 0;
2944
2945 uint32_t digIndx = 0;
2946 uint32_t endIndex = 0;
2947 uint32_t trailingZeroIndex = 0;
2948
2949 uint32_t primWeight = 0;
2950
2951 int32_t digVal = 0;
2952 uint8_t collateVal = 0;
2953
2954 UBool nonZeroValReached = FALSE;
2955
2956 uint8_t *numTempBuf;
2957 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2958 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2959
2960 numTempBuf = stackNumTempBuf;
2961 /*
2962 We parse the source string until we hit a char that's NOT a digit.
2963 Use this u_charDigitValue. This might be slow because we have to
2964 handle surrogates...
2965 */
2966 /*
2967 if (U16_IS_LEAD(ch)){
2968 if (!collIter_eos(source)) {
2969 backupState(source, &digitState);
2970 UChar trail = getNextNormalizedChar(source);
2971 if(U16_IS_TRAIL(trail)) {
2972 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2973 } else {
2974 loadState(source, &digitState, TRUE);
2975 char32 = ch;
2976 }
2977 } else {
2978 char32 = ch;
2979 }
2980 } else {
2981 char32 = ch;
2982 }
2983 digVal = u_charDigitValue(char32);
2984 */
2985 digVal = u_charDigitValue(cp); // if we have arrived here, we have
2986 // already processed possible supplementaries that triggered the digit tag -
2987 // all supplementaries are marked in the UCA.
2988 /*
2989 We pad a zero in front of the first element anyways. This takes
2990 care of the (probably) most common case where people are sorting things followed
2991 by a single digit
2992 */
2993 digIndx++;
2994 for(;;){
2995 // Make sure we have enough space.
2996 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2997 {
2998 numTempBufSize *= 2;
2999 if (numTempBuf == stackNumTempBuf){
3000 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3001 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3002 } else {
3003 uprv_realloc(numTempBuf, numTempBufSize);
3004 }
3005 }
3006
3007 // Skipping over leading zeroes.
3008 if (digVal != 0) {
3009 nonZeroValReached = TRUE;
3010 }
3011 if (nonZeroValReached) {
3012 /*
3013 We parse the digit string into base 100 numbers (this fits into a byte).
3014 We only add to the buffer in twos, thus if we are parsing an odd character,
3015 that serves as the 'tens' digit, while if we are parsing an even one, that
3016 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3017 a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
3018 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3019 than all the other bytes.
3020 */
3021
3022 if (digIndx % 2 == 1){
3023 collateVal += (uint8_t)digVal;
3024
3025 // We don't enter the low-order-digit case unless we've already seen
3026 // the high order, or for the first digit, which is always non-zero.
3027 if (collateVal != 0)
3028 trailingZeroIndex = 0;
3029
3030 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3031 collateVal = 0;
3032 }
3033 else{
3034 // We drop the collation value into the buffer so if we need to do
3035 // a "front patch" we don't have to check to see if we're hitting the
3036 // last element.
3037 collateVal = (uint8_t)(digVal * 10);
3038
3039 // Check for trailing zeroes.
3040 if (collateVal == 0)
3041 {
3042 if (!trailingZeroIndex)
3043 trailingZeroIndex = (digIndx/2) + 2;
3044 }
3045 else
3046 trailingZeroIndex = 0;
3047
3048 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3049 }
3050 digIndx++;
3051 }
3052
3053 // Get next character.
3054 if (!collIter_eos(source)){
3055 ch = getNextNormalizedChar(source);
3056 if (U16_IS_LEAD(ch)){
3057 if (!collIter_eos(source)) {
3058 backupState(source, &digitState);
3059 UChar trail = getNextNormalizedChar(source);
3060 if(U16_IS_TRAIL(trail)) {
3061 char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3062 } else {
3063 loadState(source, &digitState, TRUE);
3064 char32 = ch;
3065 }
3066 }
3067 } else {
3068 char32 = ch;
3069 }
3070
3071 if ((digVal = u_charDigitValue(char32)) == -1){
3072 // Resetting position to point to the next unprocessed char. We
3073 // overshot it when doing our test/set for numbers.
3074 if (char32 > 0xFFFF) { // For surrogates.
3075 loadState(source, &digitState, TRUE);
3076 //goBackOne(source);
3077 }
3078 goBackOne(source);
3079 break;
3080 }
3081 } else {
3082 break;
3083 }
3084 }
3085
3086 if (nonZeroValReached == FALSE){
3087 digIndx = 2;
3088 numTempBuf[2] = 6;
3089 }
3090
3091 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3092 if (digIndx % 2 != 0){
3093 /*
3094 We missed a value. Since digIndx isn't even, we stuck too many values into the buffer (this is what
3095 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3096 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3097 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3098 */
3099
3100 for(i = 2; i < endIndex; i++){
3101 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3102 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3103 }
3104 --digIndx;
3105 }
3106
3107 // Subtract one off of the last byte.
3108 numTempBuf[endIndex-1] -= 1;
3109
3110 /*
3111 We want to skip over the first two slots in the buffer. The first slot
3112 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3113 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3114 */
3115 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3116 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3117
3118 // Now transfer the collation key to our collIterate struct.
3119 // The total size for our collation key is endIndex bumped up to the next largest even value divided by two.
3120 size = ((endIndex+1) & ~1)/2;
3121 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3122 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3123 UCOL_BYTE_COMMON; // Tertiary weight.
3124 i = 2; // Reset the index into the buffer.
3125 while(i < endIndex)
3126 {
3127 primWeight = numTempBuf[i++] << 8;
3128 if ( i < endIndex)
3129 primWeight |= numTempBuf[i++];
3130 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3131 }
3132
3133 if (numTempBuf != stackNumTempBuf)
3134 uprv_free(numTempBuf);
3135 } else {
3136 // no numeric mode, we'll just switch to whatever we stashed and continue
3137 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3138 CE = *CEOffset++;
3139 break;
3140 }
3141 return CE;
3142 }
3143 /* various implicits optimization */
3144 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3145 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3146 //return getImplicit(cp, source, 0x04000000);
3147 return getImplicit(cp, source);
3148 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3149 /* UCA is filled with these. Tailorings are NOT_FOUND */
3150 //return getImplicit(cp, source, 0);
3151 return getImplicit(cp, source);
3152 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3153 return 0; /* broken surrogate sequence */
3154 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3155 UChar nextChar;
3156 if( source->flags & UCOL_USE_ITERATOR) {
3157 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3158 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3159 source->iterator->next(source->iterator);
3160 return getImplicit(cp, source);
3161 } else {
3162 return 0;
3163 }
3164 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3165 U_IS_TRAIL((nextChar=*source->pos))) {
3166 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3167 source->pos++;
3168 return getImplicit(cp, source);
3169 } else {
3170 return 0; /* completely ignorable */
3171 }
3172 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3173 {
3174 const uint32_t
3175 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3176 //const uint32_t LCount = 19;
3177 const uint32_t VCount = 21;
3178 const uint32_t TCount = 28;
3179 //const uint32_t NCount = VCount * TCount; // 588
3180 //const uint32_t SCount = LCount * NCount; // 11172
3181 uint32_t L = ch - SBase;
3182
3183 // divide into pieces
3184
3185 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3186 L /= TCount;
3187 uint32_t V = L % VCount;
3188 L /= VCount;
3189
3190 // offset them
3191
3192 L += LBase;
3193 V += VBase;
3194 T += TBase;
3195
3196 // return the first CE, but first put the rest into the expansion buffer
3197 if (!source->coll->image->jamoSpecial) { // FAST PATH
3198
3199 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3200 if (T != TBase) {
3201 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3202 }
3203
3204 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3205
3206 } else { // Jamo is Special
3207 // Since Hanguls pass the FCD check, it is
3208 // guaranteed that we won't be in
3209 // the normalization buffer if something like this happens
3210 // However, if we are using a uchar iterator and normalization
3211 // is ON, the Hangul that lead us here is going to be in that
3212 // normalization buffer. Here we want to restore the uchar
3213 // iterator state and pull out of the normalization buffer
3214 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3215 source->flags = source->origFlags; // restore the iterator
3216 source->pos = NULL;
3217 }
3218 // Move Jamos into normalization buffer
3219 source->writableBuffer[0] = (UChar)L;
3220 source->writableBuffer[1] = (UChar)V;
3221 if (T != TBase) {
3222 source->writableBuffer[2] = (UChar)T;
3223 source->writableBuffer[3] = 0;
3224 } else {
3225 source->writableBuffer[2] = 0;
3226 }
3227
3228 source->fcdPosition = source->pos; // Indicate where to continue in main input string
3229 // after exhausting the writableBuffer
3230 source->pos = source->writableBuffer;
3231 source->origFlags = source->flags;
3232 source->flags |= UCOL_ITER_INNORMBUF;
3233 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3234
3235 return(UCOL_IGNORABLE);
3236 }
3237 }
3238 case CHARSET_TAG:
3239 /* not yet implemented */
3240 /* probably after 1.8 */
3241 return UCOL_NOT_FOUND;
3242 default:
3243 *status = U_INTERNAL_PROGRAM_ERROR;
3244 CE=0;
3245 break;
3246 }
3247 if (CE <= UCOL_NOT_FOUND) break;
3248 }
3249 return CE;
3250 }
3251
3252
3253 /* now uses Mark's getImplicitPrimary code */
3254 static
3255 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3256 if(isNonChar(cp)) {
3257 return 0;
3258 }
3259
3260 uint32_t r = uprv_uca_getImplicitPrimary(cp);
3261
3262 *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3263 collationSource->toReturn = collationSource->CEpos;
3264 return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3265 }
3266
3267 /**
3268 * This function handles the special CEs like contractions, expansions,
3269 * surrogates, Thai.
3270 * It is called by both getPrevCE
3271 */
3272 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3273 collIterate *source,
3274 UErrorCode *status)
3275 {
3276 const uint32_t *CEOffset = NULL;
3277 UChar *UCharOffset = NULL;
3278 UChar schar;
3279 const UChar *constart = NULL;
3280 uint32_t size;
3281 UChar buffer[UCOL_MAX_BUFFER];
3282 uint32_t *endCEBuffer;
3283 UChar *strbuffer;
3284 int32_t noChars = 0;
3285
3286 for(;;)
3287 {
3288 /* the only ces that loops are thai and contractions */
3289 switch (getCETag(CE))
3290 {
3291 case NOT_FOUND_TAG: /* this tag always returns */
3292 return CE;
3293 case SURROGATE_TAG: /* This is a surrogate pair */
3294 /* essentially an engaged lead surrogate. */
3295 /* if you have encountered it here, it means that a */
3296 /* broken sequence was encountered and this is an error */
3297 return 0;
3298 case SPEC_PROC_TAG:
3299 {
3300 // Special processing is getting a CE that is preceded by a certain prefix
3301 // Currently this is only needed for optimizing Japanese length and iteration marks.
3302 // When we encounter a special processing tag, we go backwards and try to see if
3303 // we have a match.
3304 // Contraction tables are used - so the whole process is not unlike contraction.
3305 // prefix data is stored backwards in the table.
3306 const UChar *UCharOffset;
3307 UChar schar, tchar;
3308 collIterateState prefixState;
3309 backupState(source, &prefixState);
3310 for(;;) {
3311 // This loop will run once per source string character, for as long as we
3312 // are matching a potential contraction sequence
3313
3314 // First we position ourselves at the beginning of contraction sequence
3315 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3316
3317 if (collIter_bos(source)) {
3318 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3319 break;
3320 }
3321 schar = getPrevNormalizedChar(source, status);
3322 goBackOne(source);
3323
3324 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3325 UCharOffset++;
3326 }
3327
3328 if (schar == tchar) {
3329 // Found the source string char in the table.
3330 // Pick up the corresponding CE from the table.
3331 CE = *(coll->contractionCEs +
3332 (UCharOffset - coll->contractionIndex));
3333 }
3334 else
3335 {
3336 // if there is a completely ignorable code point in the middle of
3337 // a prefix, we need to act as if it's not there
3338 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3339 // lone surrogates cannot be set to zero as it would break other processing
3340 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3341 // it's easy for BMP code points
3342 if(isZeroCE == 0) {
3343 continue;
3344 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3345 // for supplementary code points, we have to check the next one
3346 // situations where we are going to ignore
3347 // 1. beginning of the string: schar is a lone surrogate
3348 // 2. schar is a lone surrogate
3349 // 3. schar is a trail surrogate in a valid surrogate sequence
3350 // that is explicitly set to zero.
3351 if (!collIter_bos(source)) {
3352 UChar lead;
3353 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3354 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3355 if(getCETag(isZeroCE) == SURROGATE_TAG) {
3356 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3357 if(finalCE == 0) {
3358 // this is a real, assigned completely ignorable code point
3359 goBackOne(source);
3360 continue;
3361 }
3362 }
3363 } else {
3364 // lone surrogate, completely ignorable
3365 continue;
3366 }
3367 } else {
3368 // lone surrogate at the beginning, completely ignorable
3369 continue;
3370 }
3371 }
3372 // Source string char was not in the table.
3373 // We have not found the prefix.
3374 CE = *(coll->contractionCEs +
3375 (ContractionStart - coll->contractionIndex));
3376 }
3377
3378 if(!isPrefix(CE)) {
3379 // The source string char was in the contraction table, and the corresponding
3380 // CE is not a prefix CE. We found the prefix, break
3381 // out of loop, this CE will end up being returned. This is the normal
3382 // way out of prefix handling when the source actually contained
3383 // the prefix.
3384 break;
3385 }
3386 }
3387 loadState(source, &prefixState, TRUE);
3388 break;
3389 }
3390
3391 case CONTRACTION_TAG:
3392 /* to ensure that the backwards and forwards iteration matches, we
3393 take the current region of most possible match and pass it through
3394 the forward iteration. this will ensure that the obstinate problem of
3395 overlapping contractions will not occur.
3396 */
3397 schar = peekCharacter(source, 0);
3398 constart = (UChar *)coll->image + getContractOffset(CE);
3399 if (isAtStartPrevIterate(source)
3400 /* commented away contraction end checks after adding the checks
3401 in getPrevCE */) {
3402 /* start of string or this is not the end of any contraction */
3403 CE = *(coll->contractionCEs +
3404 (constart - coll->contractionIndex));
3405 break;
3406 }
3407 strbuffer = buffer;
3408 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3409 *(UCharOffset --) = 0;
3410 noChars = 0;
3411 // have to swap thai characters
3412 while (ucol_unsafeCP(schar, coll)) {
3413 *(UCharOffset) = schar;
3414 noChars++;
3415 UCharOffset --;
3416 schar = getPrevNormalizedChar(source, status);
3417 goBackOne(source);
3418 // TODO: when we exhaust the contraction buffer,
3419 // it needs to get reallocated. The problem is
3420 // that the size depends on the string which is
3421 // not iterated over. However, since we're travelling
3422 // backwards, we already had to set the iterator at
3423 // the end - so we might as well know where we are?
3424 if (UCharOffset + 1 == buffer) {
3425 /* we have exhausted the buffer */
3426 int32_t newsize = 0;
3427 if(source->pos) { // actually dealing with a position
3428 newsize = source->pos - source->string + 1;
3429 } else { // iterator
3430 newsize = 4 * UCOL_MAX_BUFFER;
3431 }
3432 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3433 (newsize + UCOL_MAX_BUFFER));
3434 /* test for NULL */
3435 if (strbuffer == NULL) {
3436 *status = U_MEMORY_ALLOCATION_ERROR;
3437 return UCOL_NO_MORE_CES;
3438 }
3439 UCharOffset = strbuffer + newsize;
3440 uprv_memcpy(UCharOffset, buffer,
3441 UCOL_MAX_BUFFER * sizeof(UChar));
3442 UCharOffset --;
3443 }
3444 if ((source->pos && (source->pos == source->string ||
3445 ((source->flags & UCOL_ITER_INNORMBUF) &&
3446 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3447 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3448 break;
3449 }
3450 }
3451 /* adds the initial base character to the string */
3452 *(UCharOffset) = schar;
3453 noChars++;
3454
3455 /* a new collIterate is used to simplify things, since using the current
3456 collIterate will mean that the forward and backwards iteration will
3457 share and change the same buffers. we don't want to get into that. */
3458 collIterate temp;
3459 //IInit_collIterate(coll, UCharOffset, -1, &temp);
3460 IInit_collIterate(coll, UCharOffset, noChars, &temp);
3461 temp.flags &= ~UCOL_ITER_NORM;
3462
3463 CE = ucol_IGetNextCE(coll, &temp, status);
3464 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3465 while (CE != UCOL_NO_MORE_CES) {
3466 *(source->CEpos ++) = CE;
3467 if (source->CEpos == endCEBuffer) {
3468 /* ran out of CE space, bail.
3469 there's no guarantee of the right character position after
3470 this bail*/
3471 *status = U_BUFFER_OVERFLOW_ERROR;
3472 source->CEpos = source->CEs;
3473 freeHeapWritableBuffer(&temp);
3474 if (strbuffer != buffer) {
3475 uprv_free(strbuffer);
3476 }
3477 return (uint32_t)UCOL_NULLORDER;
3478 }
3479 CE = ucol_IGetNextCE(coll, &temp, status);
3480 }
3481 freeHeapWritableBuffer(&temp);
3482 if (strbuffer != buffer) {
3483 uprv_free(strbuffer);
3484 }
3485 source->toReturn = source->CEpos - 1;
3486 if (source->toReturn == source->CEs) {
3487 source->CEpos = source->CEs;
3488 }
3489 return *(source->toReturn);
3490 case LONG_PRIMARY_TAG:
3491 {
3492 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3493 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3494 source->toReturn = source->CEpos - 1;
3495 return *(source->toReturn);
3496 }
3497 case EXPANSION_TAG: /* this tag always returns */
3498 /*
3499 This should handle expansion.
3500 NOTE: we can encounter both continuations and expansions in an expansion!
3501 I have to decide where continuations are going to be dealt with
3502 */
3503 /* find the offset to expansion table */
3504 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3505 size = getExpansionCount(CE);
3506 if (size != 0) {
3507 /*
3508 if there are less than 16 elements in expansion, we don't terminate
3509 */
3510 uint32_t count;
3511 for (count = 0; count < size; count++) {
3512 *(source->CEpos ++) = *CEOffset++;
3513 }
3514 }
3515 else {
3516 /* else, we do */
3517 while (*CEOffset != 0) {
3518 *(source->CEpos ++) = *CEOffset ++;
3519 }
3520 }
3521 source->toReturn = source->CEpos - 1;
3522 // in case of one element expansion, we
3523 // want to immediately return CEpos
3524 if(source->toReturn == source->CEs) {
3525 source->CEpos = source->CEs;
3526 }
3527 return *(source->toReturn);
3528 case DIGIT_TAG:
3529 {
3530 /*
3531 We do a check to see if we want to collate digits as numbers; if so we generate
3532 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3533 */
3534 //uint32_t size;
3535 uint32_t i; /* general counter */
3536
3537 if (source->coll->numericCollation == UCOL_ON){
3538 collIterateState state = {0,0,0,0,0,0,0,0};
3539 UChar32 char32 = 0;
3540
3541 uint32_t digIndx = 0;
3542 uint32_t endIndex = 0;
3543 uint32_t leadingZeroIndex = 0;
3544 uint32_t trailingZeroCount = 0;
3545
3546 uint32_t primWeight = 0;
3547
3548 int32_t digVal = 0;
3549 uint8_t collateVal = 0;
3550
3551 UBool nonZeroValReached = FALSE;
3552
3553 uint8_t *numTempBuf;
3554 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3555 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3556
3557 numTempBuf = stackNumTempBuf;
3558 /*
3559 We parse the source string until we hit a char that's NOT a digit.
3560 Use this u_charDigitValue. This might be slow because we have to
3561 handle surrogates...
3562 */
3563
3564 if (U16_IS_TRAIL (ch)){
3565 if (!collIter_bos(source)){
3566 UChar lead = getPrevNormalizedChar(source, status);
3567 if(U16_IS_LEAD(lead)) {
3568 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3569 goBackOne(source);
3570 } else {
3571 char32 = ch;
3572 }
3573 } else {
3574 char32 = ch;
3575 }
3576 } else {
3577 char32 = ch;
3578 }
3579 digVal = u_charDigitValue(char32);
3580
3581 for(;;){
3582 // Make sure we have enough space.
3583 if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3584 {
3585 numTempBufSize *= 2;
3586 if (numTempBuf == stackNumTempBuf){
3587 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3588 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3589 }else
3590 uprv_realloc(numTempBuf, numTempBufSize);
3591 }
3592
3593 // Skip over trailing zeroes, and keep a count of them.
3594 if (digVal != 0)
3595 nonZeroValReached = TRUE;
3596 if (nonZeroValReached){
3597 /*
3598 We parse the digit string into base 100 numbers (this fits into a byte).
3599 We only add to the buffer in twos, thus if we are parsing an odd character,
3600 that serves as the 'tens' digit, while if we are parsing an even one, that
3601 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3602 a buffer. We multiply each collateVal by 2 (to give us room) and add 6 (to avoid
3603 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3604 than all the other bytes.
3605
3606 Since we're doing this in reverse, we want to put the first digit encountered into the
3607 ones place and the second digit encountered into the tens place.
3608 */
3609
3610 if ((digIndx + trailingZeroCount) % 2 == 1){
3611 // High-order digit case (tens place)
3612 collateVal += (uint8_t)(digVal * 10);
3613
3614 // We cannot set leadingZeroIndex unless it has been set for the
3615 // low-order digit. Therefore, all we can do for the high-order
3616 // digit is turn it off, never on.
3617 // The only time we will have a high digit without a low is for
3618 // the very first non-zero digit, so no zero check is necessary.
3619 if (collateVal != 0)
3620 leadingZeroIndex = 0;
3621
3622 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3623 collateVal = 0;
3624 }
3625 else{
3626 // Low-order digit case (ones place)
3627 collateVal = (uint8_t)digVal;
3628
3629 // Check for leading zeroes.
3630 if (collateVal == 0)
3631 {
3632 if (!leadingZeroIndex)
3633 leadingZeroIndex = (digIndx/2) + 2;
3634 }
3635 else
3636 leadingZeroIndex = 0;
3637
3638 // No need to write to buffer; the case of a last odd digit
3639 // is handled below.
3640 }
3641 ++digIndx;
3642 }
3643 else
3644 ++trailingZeroCount;
3645
3646 if (!collIter_bos(source)){
3647 ch = getPrevNormalizedChar(source, status);
3648 //goBackOne(source);
3649 if (U16_IS_TRAIL(ch)){
3650 backupState(source, &state);
3651 if (!collIter_bos(source))
3652 {
3653 goBackOne(source);
3654 UChar lead = getPrevNormalizedChar(source, status);
3655 if(U16_IS_LEAD(lead)) {
3656 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3657 } else {
3658 loadState(source, &state, FALSE);
3659 char32 = ch;
3660 }
3661 }
3662 }
3663 else
3664 char32 = ch;
3665
3666 if ((digVal = u_charDigitValue(char32)) == -1){
3667 if (char32 > 0xFFFF) {// For surrogates.
3668 loadState(source, &state, FALSE);
3669 }
3670 // Don't need to "reverse" the goBackOne call,
3671 // as this points to the next position to process..
3672 //if (char32 > 0xFFFF) // For surrogates.
3673 //getNextNormalizedChar(source);
3674 break;
3675 }
3676 goBackOne(source);
3677 }else
3678 break;
3679 }
3680
3681 if (nonZeroValReached == FALSE){
3682 digIndx = 2;
3683 trailingZeroCount = 0;
3684 numTempBuf[2] = 6;
3685 }
3686
3687 if ((digIndx + trailingZeroCount) % 2 != 0){
3688 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3689 digIndx += 1; // The implicit leading zero
3690 }
3691 if (trailingZeroCount % 2 != 0){
3692 // We had to consume one trailing zero for the low digit
3693 // of the least significant byte
3694 digIndx += 1; // The trailing zero not in the exponent
3695 trailingZeroCount -= 1;
3696 }
3697
3698 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3699
3700 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3701 numTempBuf[2] -= 1;
3702
3703 /*
3704 We want to skip over the first two slots in the buffer. The first slot
3705 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3706 sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3707 The exponent must be adjusted by the number of leading zeroes, and the number of
3708 trailing zeroes.
3709 */
3710 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3711 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3712 if (leadingZeroIndex)
3713 exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3714 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3715
3716 // Now transfer the collation key to our collIterate struct.
3717 // The total size for our collation key is endIndex bumped up to the next largest even value divided by two.
3718 //size = ((endIndex+1) & ~1)/2;
3719 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3720 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3721 UCOL_BYTE_COMMON; // Tertiary weight.
3722 i = endIndex - 1; // Reset the index into the buffer.
3723 while(i >= 2)
3724 {
3725 primWeight = numTempBuf[i--] << 8;
3726 if ( i >= 2)
3727 primWeight |= numTempBuf[i--];
3728 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3729 }
3730 if (numTempBuf != stackNumTempBuf)
3731 uprv_free(numTempBuf);
3732
3733 source->toReturn = source->CEpos -1;
3734 return *(source->toReturn);
3735 }
3736 else {
3737 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3738 CE = *(CEOffset++);
3739 break;
3740 }
3741 }
3742 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3743 {
3744 const uint32_t
3745 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3746 //const uint32_t LCount = 19;
3747 const uint32_t VCount = 21;
3748 const uint32_t TCount = 28;
3749 //const uint32_t NCount = VCount * TCount; /* 588 */
3750 //const uint32_t SCount = LCount * NCount; /* 11172 */
3751
3752 uint32_t L = ch - SBase;
3753 /*
3754 divide into pieces.
3755 we do it in this order since some compilers can do % and / in one
3756 operation
3757 */
3758 uint32_t T = L % TCount;
3759 L /= TCount;
3760 uint32_t V = L % VCount;
3761 L /= VCount;
3762
3763 /* offset them */
3764 L += LBase;
3765 V += VBase;
3766 T += TBase;
3767
3768 /*
3769 return the first CE, but first put the rest into the expansion buffer
3770 */
3771 if (!source->coll->image->jamoSpecial)
3772 {
3773 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3774 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3775 if (T != TBase)
3776 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3777
3778 source->toReturn = source->CEpos - 1;
3779 return *(source->toReturn);
3780 } else {
3781 // Since Hanguls pass the FCD check, it is
3782 // guaranteed that we won't be in
3783 // the normalization buffer if something like this happens
3784 // Move Jamos into normalization buffer
3785 /*
3786 Move the Jamos into the
3787 normalization buffer
3788 */
3789 UChar *tempbuffer = source->writableBuffer +
3790 (source->writableBufSize - 1);
3791 *(tempbuffer) = 0;
3792 if (T != TBase) {
3793 *(tempbuffer - 1) = (UChar)T;
3794 *(tempbuffer - 2) = (UChar)V;
3795 *(tempbuffer - 3) = (UChar)L;
3796 *(tempbuffer - 4) = 0;
3797 } else {
3798 *(tempbuffer - 1) = (UChar)V;
3799 *(tempbuffer - 2) = (UChar)L;
3800 *(tempbuffer - 3) = 0;
3801 }
3802
3803 /*
3804 Indicate where to continue in main input string after exhausting
3805 the writableBuffer
3806 */
3807 if (source->pos == source->string) {
3808 source->fcdPosition = NULL;
3809 } else {
3810 source->fcdPosition = source->pos-1;
3811 }
3812
3813 source->pos = tempbuffer;
3814 source->origFlags = source->flags;
3815 source->flags |= UCOL_ITER_INNORMBUF;
3816 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3817
3818 return(UCOL_IGNORABLE);
3819 }
3820 }
3821 case LEAD_SURROGATE_TAG: /* D800-DBFF*/
3822 return 0; /* broken surrogate sequence */
3823 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3824 {
3825 UChar32 cp = 0;
3826 UChar prevChar;
3827 UChar *prev;
3828 if (isAtStartPrevIterate(source)) {
3829 /* we are at the start of the string, wrong place to be at */
3830 return 0;
3831 }
3832 if (source->pos != source->writableBuffer) {
3833 prev = source->pos - 1;
3834 } else {
3835 prev = source->fcdPosition;
3836 }
3837 prevChar = *prev;
3838
3839 /* Handles Han and Supplementary characters here.*/
3840 if (U16_IS_LEAD(prevChar)) {
3841 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3842 source->pos = prev;
3843 } else {
3844 return 0; /* completely ignorable */
3845 }
3846 return getPrevImplicit(cp, source);
3847 }
3848 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3849 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3850 return getPrevImplicit(ch, source);
3851 case IMPLICIT_TAG: /* everything that is not defined otherwise */
3852 return getPrevImplicit(ch, source);
3853 /* UCA is filled with these. Tailorings are NOT_FOUND */
3854 /* not yet implemented */
3855 case CHARSET_TAG: /* this tag always returns */
3856 /* probably after 1.8 */
3857 return UCOL_NOT_FOUND;
3858 default: /* this tag always returns */
3859 *status = U_INTERNAL_PROGRAM_ERROR;
3860 CE=0;
3861 break;
3862 }
3863 if (CE <= UCOL_NOT_FOUND) {
3864 break;
3865 }
3866 }
3867 return CE;
3868 }
3869
3870 /* This should really be a macro */
3871 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3872 /* anyway */
3873 static
3874 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3875 #ifdef UCOL_DEBUG
3876 fprintf(stderr, ".");
3877 #endif
3878 uint8_t *newStart = NULL;
3879 uint32_t offset = *secondaries-secStart;
3880
3881 if(secStart==second) {
3882 newStart=(uint8_t*)uprv_malloc(newSize);
3883 if(newStart==NULL) {
3884 *status = U_MEMORY_ALLOCATION_ERROR;
3885 return NULL;
3886 }
3887 uprv_memcpy(newStart, secStart, *secondaries-secStart);
3888 } else {
3889 newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3890 if(newStart==NULL) {
3891 *status = U_MEMORY_ALLOCATION_ERROR;
3892 return NULL;
3893 }
3894 }
3895 *secondaries=newStart+offset;
3896 *secSize=newSize;
3897 return newStart;
3898 }
3899
3900
/*
 * Reverses, in place, the elements from start up to and including end.
 * This operation is needed when handling continuation secondaries in French.
 *
 * It used to be a plain function,
 *     void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end);
 * and is now a macro that takes the element type as its first argument.
 *
 * Note: start and end must be modifiable pointer lvalues. The macro walks them
 * toward each other, so after the expansion they no longer delimit the range.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    while((start)<(end)) { \
        TYPE revSwap_ = *(end); \
        *(end)-- = *(start); \
        *(start)++ = revSwap_; \
    } \
}
3923
3924 /****************************************************************************/
3925 /* Following are the sortkey generation functions */
3926 /* */
3927 /****************************************************************************/
3928
3929 /**
3930 * Merge two sort keys.
3931 * This is useful, for example, to combine sort keys from first and last names
3932 * to sort such pairs.
3933 * Merged sort keys consider on each collation level the first part first entirely,
3934 * then the second one.
3935 * It is possible to merge multiple sort keys by consecutively merging
3936 * another one with the intermediate result.
3937 *
3938 * The length of the merge result is the sum of the lengths of the input sort keys
3939 * minus 1.
3940 *
3941 * @param src1 the first sort key
3942 * @param src1Length the length of the first sort key, including the zero byte at the end;
3943 * can be -1 if the function is to find the length
3944 * @param src2 the second sort key
3945 * @param src2Length the length of the second sort key, including the zero byte at the end;
3946 * can be -1 if the function is to find the length
3947 * @param dest the buffer where the merged sort key is written,
3948 * can be NULL if destCapacity==0
3949 * @param destCapacity the number of bytes in the dest buffer
3950 * @return the length of the merged sort key, src1Length+src2Length-1;
3951 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3952 * in which cases the contents of dest is undefined
3953 *
3954 * @draft
3955 */
3956 U_CAPI int32_t U_EXPORT2
3957 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3958 const uint8_t *src2, int32_t src2Length,
3959 uint8_t *dest, int32_t destCapacity) {
3960 int32_t destLength;
3961 uint8_t b;
3962
3963 /* check arguments */
3964 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
3965 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
3966 destCapacity<0 || (destCapacity>0 && dest==NULL)
3967 ) {
3968 /* error, attempt to write a zero byte and return 0 */
3969 if(dest!=NULL && destCapacity>0) {
3970 *dest=0;
3971 }
3972 return 0;
3973 }
3974
3975 /* check lengths and capacity */
3976 if(src1Length<0) {
3977 src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
3978 }
3979 if(src2Length<0) {
3980 src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
3981 }
3982
3983 destLength=src1Length+src2Length-1;
3984 if(destLength>destCapacity) {
3985 /* the merged sort key does not fit into the destination */
3986 return destLength;
3987 }
3988
3989 /* merge the sort keys with the same number of levels */
3990 while(*src1!=0 && *src2!=0) { /* while both have another level */
3991 /* copy level from src1 not including 00 or 01 */
3992 while((b=*src1)>=2) {
3993 ++src1;
3994 *dest++=b;
3995 }
3996
3997 /* add a 02 merge separator */
3998 *dest++=2;
3999
4000 /* copy level from src2 not including 00 or 01 */
4001 while((b=*src2)>=2) {
4002 ++src2;
4003 *dest++=b;
4004 }
4005
4006 /* if both sort keys have another level, then add a 01 level separator and continue */
4007 if(*src1==1 && *src2==1) {
4008 ++src1;
4009 ++src2;
4010 *dest++=1;
4011 }
4012 }
4013
4014 /*
4015 * here, at least one sort key is finished now, but the other one
4016 * might have some contents left from containing more levels;
4017 * that contents is just appended to the result
4018 */
4019 if(*src1!=0) {
4020 /* src1 is not finished, therefore *src2==0, and src1 is appended */
4021 src2=src1;
4022 }
4023 /* append src2, "the other, unfinished sort key" */
4024 uprv_strcpy((char *)dest, (const char *)src2);
4025
4026 /* trust that neither sort key contained illegally embedded zero bytes */
4027 return destLength;
4028 }
4029
4030 /* sortkey API */
4031 U_CAPI int32_t U_EXPORT2
4032 ucol_getSortKey(const UCollator *coll,
4033 const UChar *source,
4034 int32_t sourceLength,
4035 uint8_t *result,
4036 int32_t resultLength)
4037 {
4038 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4039 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4040 int32_t actualSrcLen = sourceLength;
4041 if (actualSrcLen==-1 && source!=NULL) {
4042 actualSrcLen = u_strlen(source);
4043 }
4044 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen);
4045 }
4046
4047 UErrorCode status = U_ZERO_ERROR;
4048 int32_t keySize = 0;
4049
4050 if(source != NULL) {
4051 // source == NULL is actually an error situation, but we would need to
4052 // have an error code to return it. Until we introduce a new
4053 // API, it stays like this
4054
4055 /* this uses the function pointer that is set in updateinternalstate */
4056 /* currently, there are two funcs: */
4057 /*ucol_calcSortKey(...);*/
4058 /*ucol_calcSortKeySimpleTertiary(...);*/
4059
4060 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4061 //((UCollator *)coll)->errorCode = status; /*semantically const */
4062 }
4063 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4064 UTRACE_EXIT_STATUS(status);
4065 return keySize;
4066 }
4067
4068 /* this function is called by the C++ API for sortkey generation */
4069 U_CFUNC int32_t
4070 ucol_getSortKeyWithAllocation(const UCollator *coll,
4071 const UChar *source, int32_t sourceLength,
4072 uint8_t **pResult,
4073 UErrorCode *pErrorCode) {
4074 *pResult = 0;
4075 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4076 }
4077
4078 #define UCOL_FSEC_BUF_SIZE 256
4079
4080 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
4081 /* or if we run out of space while making a sortkey and want to return ASAP */
4082 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4083 UErrorCode status = U_ZERO_ERROR;
4084 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4085 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4086 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4087 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4088 UBool compareIdent = (strength == UCOL_IDENTICAL);
4089 UBool doCase = (coll->caseLevel == UCOL_ON);
4090 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4091 //UBool qShifted = shifted && (compareQuad == 0);
4092 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4093 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4094 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4095 uint8_t *fSecs = fSecsBuff;
4096 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4097 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4098
4099 uint32_t variableTopValue = coll->variableTopValue;
4100 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4101 if(doHiragana) {
4102 UCOL_COMMON_BOT4++;
4103 /* allocate one more space for hiragana */
4104 }
4105 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4106
4107 uint32_t order = UCOL_NO_MORE_CES;
4108 uint8_t primary1 = 0;
4109 uint8_t primary2 = 0;
4110 uint8_t secondary = 0;
4111 uint8_t tertiary = 0;
4112 int32_t caseShift = 0;
4113 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4114
4115 uint8_t caseSwitch = coll->caseSwitch;
4116 uint8_t tertiaryMask = coll->tertiaryMask;
4117 uint8_t tertiaryCommon = coll->tertiaryCommon;
4118
4119 UBool wasShifted = FALSE;
4120 UBool notIsContinuation = FALSE;
4121 uint8_t leadPrimary = 0;
4122
4123
4124 for(;;) {
4125 order = ucol_IGetNextCE(coll, s, &status);
4126 if(order == UCOL_NO_MORE_CES) {
4127 break;
4128 }
4129
4130 if(order == 0) {
4131 continue;
4132 }
4133
4134 notIsContinuation = !isContinuation(order);
4135
4136
4137 if(notIsContinuation) {
4138 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4139 } else {
4140 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4141 }
4142 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4143 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4144 primary1 = (uint8_t)(order >> 8);
4145
4146
4147 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4148 || (!notIsContinuation && wasShifted))
4149 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4150 /* and other ignorables should be removed if following a shifted code point */
4151 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4152 /* we should just completely ignore it */
4153 continue;
4154 }
4155 if(compareQuad == 0) {
4156 if(c4 > 0) {
4157 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4158 c4 = 0;
4159 }
4160 currentSize++;
4161 if(primary2 != 0) {
4162 currentSize++;
4163 }
4164 }
4165 wasShifted = TRUE;
4166 } else {
4167 wasShifted = FALSE;
4168 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4169 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4170 /* calculate sortkey size */
4171 if(primary1 != UCOL_IGNORABLE) {
4172 if(notIsContinuation) {
4173 if(leadPrimary == primary1) {
4174 currentSize++;
4175 } else {
4176 if(leadPrimary != 0) {
4177 currentSize++;
4178 }
4179 if(primary2 == UCOL_IGNORABLE) {
4180 /* one byter, not compressed */
4181 currentSize++;
4182 leadPrimary = 0;
4183 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4184 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4185 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4186 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4187 /* not compressible */
4188 leadPrimary = 0;
4189 currentSize+=2;
4190 } else { /* compress */
4191 leadPrimary = primary1;
4192 currentSize+=2;
4193 }
4194 }
4195 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4196 currentSize++;
4197 if(primary2 != UCOL_IGNORABLE) {
4198 currentSize++;
4199 }
4200 }
4201 }
4202
4203 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4204 if(!isFrenchSec){
4205 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4206 c2++;
4207 } else {
4208 if(c2 > 0) {
4209 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4210 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4211 } else {
4212 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4213 }
4214 c2 = 0;
4215 }
4216 currentSize++;
4217 }
4218 } else {
4219 fSecs[fSecsLen++] = secondary;
4220 if(fSecsLen == fSecsMaxLen) {
4221 if(fSecs == fSecsBuff) {
4222 fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4223 } else {
4224 fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4225 }
4226 if(fSecs == NULL) {
4227 status = U_MEMORY_ALLOCATION_ERROR;
4228 return -1;
4229 }
4230 fSecsMaxLen *= 2;
4231 }
4232 if(notIsContinuation) {
4233 if (frenchStartPtr != NULL) {
4234 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4235 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4236 frenchStartPtr = NULL;
4237 }
4238 } else {
4239 if (frenchStartPtr == NULL) {
4240 frenchStartPtr = fSecs+fSecsLen-2;
4241 }
4242 frenchEndPtr = fSecs+fSecsLen-1;
4243 }
4244 }
4245 }
4246
4247 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4248 // do the case level if we need to do it. We don't want to calculate
4249 // case level for primary ignorables if we have only primary strength and case level
4250 // otherwise we would break well formedness of CEs
4251 if (caseShift == 0) {
4252 currentSize++;
4253 caseShift = UCOL_CASE_SHIFT_START;
4254 }
4255 if((tertiary&0x3F) > 0 && notIsContinuation) {
4256 caseShift--;
4257 if((tertiary &0xC0) != 0) {
4258 if (caseShift == 0) {
4259 currentSize++;
4260 caseShift = UCOL_CASE_SHIFT_START;
4261 }
4262 caseShift--;
4263 }
4264 }
4265 } else {
4266 if(notIsContinuation) {
4267 tertiary ^= caseSwitch;
4268 }
4269 }
4270
4271 tertiary &= tertiaryMask;
4272 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4273 if (tertiary == tertiaryCommon && notIsContinuation) {
4274 c3++;
4275 } else {
4276 if(c3 > 0) {
4277 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4278 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4279 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4280 } else {
4281 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4282 }
4283 c3 = 0;
4284 }
4285 currentSize++;
4286 }
4287 }
4288
4289 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4290 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4291 if(c4>0) { // Close this part
4292 currentSize += (c4/UCOL_BOT_COUNT4)+1;
4293 c4 = 0;
4294 }
4295 currentSize++; // Add the Hiragana
4296 } else { // This wasn't Hiragana, so we can continue adding stuff
4297 c4++;
4298 }
4299 }
4300
4301 }
4302 }
4303
4304 if(!isFrenchSec){
4305 if(c2 > 0) {
4306 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4307 }
4308 } else {
4309 uint32_t i = 0;
4310 if(frenchStartPtr != NULL) {
4311 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4312 }
4313 for(i = 0; i<fSecsLen; i++) {
4314 secondary = *(fSecs+fSecsLen-i-1);
4315 /* This is compression code. */
4316 if (secondary == UCOL_COMMON2) {
4317 ++c2;
4318 } else {
4319 if(c2 > 0) {
4320 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4321 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4322 } else {
4323 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4324 }
4325 c2 = 0;
4326 }
4327 currentSize++;
4328 }
4329 }
4330 if(c2 > 0) {
4331 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4332 }
4333 if(fSecs != fSecsBuff) {
4334 uprv_free(fSecs);
4335 }
4336 }
4337
4338 if(c3 > 0) {
4339 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4340 }
4341
4342 if(c4 > 0 && compareQuad == 0) {
4343 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4344 }
4345
4346 if(compareIdent) {
4347 currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4348 }
4349 return currentSize;
4350
4351 }
4352
4353 static
4354 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4355 if (caseShift == 0) {
4356 *(*cases)++ = UCOL_CASE_BYTE_START;
4357 caseShift = UCOL_CASE_SHIFT_START;
4358 }
4359 }
4360
// Appends value at primaries and advances primaries, but only while primaries is
// still below limit. size is incremented on every call, so it records how many
// values were requested even when some of them could not be stored.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // no room left: count the value but drop it
        return;
    }
    *primaries = value;
    ++primaries;
}
4370
4371 // Packs the secondary buffer when processing French locale. Adds the terminator.
4372 static
4373 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4374 uint8_t secondary;
4375 int32_t count2 = 0;
4376 uint32_t i = 0, size = 0;
4377 // we use i here since the key size already accounts for terminators, so we'll discard the increment
4378 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4379 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4380 if(frenchStartPtr != NULL) {
4381 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4382 }
4383 for(i = 0; i<*secsize; i++) {
4384 secondary = *(secondaries-i-1);
4385 /* This is compression code. */
4386 if (secondary == UCOL_COMMON2) {
4387 ++count2;
4388 } else {
4389 if (count2 > 0) {
4390 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4391 while (count2 > UCOL_TOP_COUNT2) {
4392 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4393 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4394 }
4395 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4396 } else {
4397 while (count2 > UCOL_BOT_COUNT2) {
4398 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4399 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4400 }
4401 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4402 }
4403 count2 = 0;
4404 }
4405 addWithIncrement(primaries, primEnd, size, secondary);
4406 }
4407 }
4408 if (count2 > 0) {
4409 while (count2 > UCOL_BOT_COUNT2) {
4410 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4411 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4412 }
4413 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4414 }
4415 *secsize = size;
4416 return primaries;
4417 }
4418
4419 /* This is the sortkey work horse function */
4420 U_CFUNC int32_t U_CALLCONV
4421 ucol_calcSortKey(const UCollator *coll,
4422 const UChar *source,
4423 int32_t sourceLength,
4424 uint8_t **result,
4425 uint32_t resultLength,
4426 UBool allocateSKBuffer,
4427 UErrorCode *status)
4428 {
4429 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4430
4431 uint32_t i = 0; /* general purpose counter */
4432
4433 /* Stack allocated buffers for buffers we use */
4434 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4435
4436 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4437
4438 if(U_FAILURE(*status)) {
4439 return 0;
4440 }
4441
4442 if(primaries == NULL && allocateSKBuffer == TRUE) {
4443 primaries = *result = prim;
4444 resultLength = UCOL_PRIMARY_MAX_BUFFER;
4445 }
4446
4447 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4448 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4449
4450 uint32_t sortKeySize = 1; /* it is always \0 terminated */
4451
4452 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4453 UChar *normSource = normBuffer;
4454 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4455
4456 int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4457
4458 UColAttributeValue strength = coll->strength;
4459
4460 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4461 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4462 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4463 UBool compareIdent = (strength == UCOL_IDENTICAL);
4464 UBool doCase = (coll->caseLevel == UCOL_ON);
4465 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4466 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
4467 //UBool qShifted = shifted && (compareQuad == 0);
4468 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4469 /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4470
4471 uint32_t variableTopValue = coll->variableTopValue;
4472 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4473 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4474 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4475 uint8_t UCOL_HIRAGANA_QUAD = 0;
4476 if(doHiragana) {
4477 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4478 /* allocate one more space for hiragana, value for hiragana */
4479 }
4480 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4481
4482 /* support for special features like caselevel and funky secondaries */
4483 uint8_t *frenchStartPtr = NULL;
4484 uint8_t *frenchEndPtr = NULL;
4485 uint32_t caseShift = 0;
4486
4487 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4488
4489 /* If we need to normalize, we'll do it all at once at the beginning! */
4490 UNormalizationMode normMode;
4491 if(compareIdent) {
4492 normMode = UNORM_NFD;
4493 } else if(coll->normalizationMode != UCOL_OFF) {
4494 normMode = UNORM_FCD;
4495 } else {
4496 normMode = UNORM_NONE;
4497 }
4498
4499 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4500 len = unorm_internalNormalize(normSource, normSourceLen,
4501 source, len,
4502 normMode, FALSE,
4503 status);
4504 if(*status == U_BUFFER_OVERFLOW_ERROR) {
4505 normSourceLen = len;
4506 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4507 if(normSource == NULL) {
4508 *status = U_MEMORY_ALLOCATION_ERROR;
4509 return 0;
4510 }
4511 *status = U_ZERO_ERROR;
4512 len = unorm_internalNormalize(normSource, normSourceLen,
4513 source, len,
4514 normMode, FALSE,
4515 status);
4516 }
4517
4518 if(U_FAILURE(*status)) {
4519 return 0;
4520 }
4521 source = normSource;
4522 }
4523
4524 collIterate s;
4525 IInit_collIterate(coll, (UChar *)source, len, &s);
4526 if(source == normSource) {
4527 s.flags &= ~UCOL_ITER_NORM;
4528 }
4529
4530 if(resultLength == 0 || primaries == NULL) {
4531 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4532 if(normSource != normBuffer) {
4533 uprv_free(normSource);
4534 }
4535 return keyLen;
4536 }
4537 uint8_t *primarySafeEnd = primaries + resultLength - 1;
4538 if(strength > UCOL_PRIMARY) {
4539 primarySafeEnd--;
4540 }
4541
4542 uint32_t minBufferSize = UCOL_MAX_BUFFER;
4543
4544 uint8_t *primStart = primaries;
4545 uint8_t *secStart = secondaries;
4546 uint8_t *terStart = tertiaries;
4547 uint8_t *caseStart = cases;
4548 uint8_t *quadStart = quads;
4549
4550 uint32_t order = 0;
4551
4552 uint8_t primary1 = 0;
4553 uint8_t primary2 = 0;
4554 uint8_t secondary = 0;
4555 uint8_t tertiary = 0;
4556 uint8_t caseSwitch = coll->caseSwitch;
4557 uint8_t tertiaryMask = coll->tertiaryMask;
4558 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4559 uint8_t tertiaryTop = coll->tertiaryTop;
4560 uint8_t tertiaryBottom = coll->tertiaryBottom;
4561 uint8_t tertiaryCommon = coll->tertiaryCommon;
4562 uint8_t caseBits = 0;
4563
4564 UBool finished = FALSE;
4565 UBool wasShifted = FALSE;
4566 UBool notIsContinuation = FALSE;
4567
4568 uint32_t prevBuffSize = 0;
4569
4570 uint32_t count2 = 0, count3 = 0, count4 = 0;
4571 uint8_t leadPrimary = 0;
4572
4573 for(;;) {
4574 for(i=prevBuffSize; i<minBufferSize; ++i) {
4575
4576 order = ucol_IGetNextCE(coll, &s, status);
4577 if(order == UCOL_NO_MORE_CES) {
4578 finished = TRUE;
4579 break;
4580 }
4581
4582 if(order == 0) {
4583 continue;
4584 }
4585
4586 notIsContinuation = !isContinuation(order);
4587
4588 if(notIsContinuation) {
4589 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4590 } else {
4591 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4592 }
4593
4594 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4595 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4596 primary1 = (uint8_t)(order >> 8);
4597
4598 /*if(notIsContinuation && scriptOrder != NULL) {
4599 primary1 = scriptOrder[primary1];
4600 }*/
4601
4602 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4603 || (!notIsContinuation && wasShifted))
4604 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4605 /* and other ignorables should be removed if following a shifted code point */
4606 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4607 /* we should just completely ignore it */
4608 continue;
4609 }
4610 if(compareQuad == 0) {
4611 if(count4 > 0) {
4612 while (count4 > UCOL_BOT_COUNT4) {
4613 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4614 count4 -= UCOL_BOT_COUNT4;
4615 }
4616 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4617 count4 = 0;
4618 }
4619 /* We are dealing with a variable and we're treating them as shifted */
4620 /* This is a shifted ignorable */
4621 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4622 *quads++ = primary1;
4623 }
4624 if(primary2 != 0) {
4625 *quads++ = primary2;
4626 }
4627 }
4628 wasShifted = TRUE;
4629 } else {
4630 wasShifted = FALSE;
4631 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4632 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
4633 /* regular and simple sortkey calc */
4634 if(primary1 != UCOL_IGNORABLE) {
4635 if(notIsContinuation) {
4636 if(leadPrimary == primary1) {
4637 *primaries++ = primary2;
4638 } else {
4639 if(leadPrimary != 0) {
4640 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4641 }
4642 if(primary2 == UCOL_IGNORABLE) {
4643 /* one byter, not compressed */
4644 *primaries++ = primary1;
4645 leadPrimary = 0;
4646 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4647 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4648 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4649 /* not compressible */
4650 leadPrimary = 0;
4651 *primaries++ = primary1;
4652 *primaries++ = primary2;
4653 } else { /* compress */
4654 *primaries++ = leadPrimary = primary1;
4655 *primaries++ = primary2;
4656 }
4657 }
4658 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4659 *primaries++ = primary1;
4660 if(primary2 != UCOL_IGNORABLE) {
4661 *primaries++ = primary2; /* second part */
4662 }
4663 }
4664 }
4665
4666 if(secondary > compareSec) {
4667 if(!isFrenchSec) {
4668 /* This is compression code. */
4669 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4670 ++count2;
4671 } else {
4672 if (count2 > 0) {
4673 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4674 while (count2 > UCOL_TOP_COUNT2) {
4675 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4676 count2 -= (uint32_t)UCOL_TOP_COUNT2;
4677 }
4678 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4679 } else {
4680 while (count2 > UCOL_BOT_COUNT2) {
4681 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4682 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4683 }
4684 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4685 }
4686 count2 = 0;
4687 }
4688 *secondaries++ = secondary;
4689 }
4690 } else {
4691 *secondaries++ = secondary;
4692 /* Do the special handling for French secondaries */
4693 /* We need to get continuation elements and do intermediate restore */
4694 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4695 if(notIsContinuation) {
4696 if (frenchStartPtr != NULL) {
4697 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4698 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4699 frenchStartPtr = NULL;
4700 }
4701 } else {
4702 if (frenchStartPtr == NULL) {
4703 frenchStartPtr = secondaries - 2;
4704 }
4705 frenchEndPtr = secondaries-1;
4706 }
4707 }
4708 }
4709
4710 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4711 // do the case level if we need to do it. We don't want to calculate
4712 // case level for primary ignorables if we have only primary strength and case level
4713 // otherwise we would break well formedness of CEs
4714 doCaseShift(&cases, caseShift);
4715 if(notIsContinuation) {
4716 caseBits = (uint8_t)(tertiary & 0xC0);
4717
4718 if(tertiary != 0) {
4719 if(coll->caseFirst == UCOL_UPPER_FIRST) {
4720 if((caseBits & 0xC0) == 0) {
4721 *(cases-1) |= 1 << (--caseShift);
4722 } else {
4723 *(cases-1) |= 0 << (--caseShift);
4724 /* second bit */
4725 doCaseShift(&cases, caseShift);
4726 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4727 }
4728 } else {
4729 if((caseBits & 0xC0) == 0) {
4730 *(cases-1) |= 0 << (--caseShift);
4731 } else {
4732 *(cases-1) |= 1 << (--caseShift);
4733 /* second bit */
4734 doCaseShift(&cases, caseShift);
4735 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4736 }
4737 }
4738 }
4739
4740 }
4741 } else {
4742 if(notIsContinuation) {
4743 tertiary ^= caseSwitch;
4744 }
4745 }
4746
4747 tertiary &= tertiaryMask;
4748 if(tertiary > compareTer) {
4749 /* This is compression code. */
4750 /* sequence size check is included in the if clause */
4751 if (tertiary == tertiaryCommon && notIsContinuation) {
4752 ++count3;
4753 } else {
4754 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4755 tertiary += tertiaryAddition;
4756 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4757 tertiary -= tertiaryAddition;
4758 }
4759 if (count3 > 0) {
4760 if ((tertiary > tertiaryCommon)) {
4761 while (count3 > coll->tertiaryTopCount) {
4762 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4763 count3 -= (uint32_t)coll->tertiaryTopCount;
4764 }
4765 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4766 } else {
4767 while (count3 > coll->tertiaryBottomCount) {
4768 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4769 count3 -= (uint32_t)coll->tertiaryBottomCount;
4770 }
4771 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4772 }
4773 count3 = 0;
4774 }
4775 *tertiaries++ = tertiary;
4776 }
4777 }
4778
4779 if(/*qShifted*/(compareQuad==0) && notIsContinuation) {
4780 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4781 if(count4>0) { // Close this part
4782 while (count4 > UCOL_BOT_COUNT4) {
4783 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4784 count4 -= UCOL_BOT_COUNT4;
4785 }
4786 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4787 count4 = 0;
4788 }
4789 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4790 } else { // This wasn't Hiragana, so we can continue adding stuff
4791 count4++;
4792 }
4793 }
4794 }
4795
4796 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4797 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4798 IInit_collIterate(coll, (UChar *)source, len, &s);
4799 if(source == normSource) {
4800 s.flags &= ~UCOL_ITER_NORM;
4801 }
4802 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4803 *status = U_BUFFER_OVERFLOW_ERROR;
4804 finished = TRUE;
4805 break;
4806 } else { /* It's much nicer if we can actually reallocate */
4807 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4808 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4809 if(U_SUCCESS(*status)) {
4810 *result = primStart;
4811 primarySafeEnd = primStart + resultLength - 1;
4812 if(strength > UCOL_PRIMARY) {
4813 primarySafeEnd--;
4814 }
4815 } else {
4816 IInit_collIterate(coll, (UChar *)source, len, &s);
4817 if(source == normSource) {
4818 s.flags &= ~UCOL_ITER_NORM;
4819 }
4820 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4821 finished = TRUE;
4822 break;
4823 }
4824 }
4825 }
4826 }
4827 if(finished) {
4828 break;
4829 } else {
4830 prevBuffSize = minBufferSize;
4831 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4832 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4833 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4834 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4835 minBufferSize *= 2;
4836 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4837 IInit_collIterate(coll, (UChar *)source, len, &s);
4838 if(source == normSource) {
4839 s.flags &= ~UCOL_ITER_NORM;
4840 }
4841 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4842 break;
4843 }
4844 }
4845 }
4846
4847 /* Here, we are generally done with processing */
4848 /* bailing out would not be too productive */
4849
4850 if(U_SUCCESS(*status)) {
4851 sortKeySize += (primaries - primStart);
4852 /* we have done all the CE's, now let's put them together to form a key */
4853 if(compareSec == 0) {
4854 if (count2 > 0) {
4855 while (count2 > UCOL_BOT_COUNT2) {
4856 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4857 count2 -= (uint32_t)UCOL_BOT_COUNT2;
4858 }
4859 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4860 }
4861 uint32_t secsize = secondaries-secStart;
4862 if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4863 sortKeySize += secsize;
4864 if(sortKeySize <= resultLength) {
4865 *(primaries++) = UCOL_LEVELTERMINATOR;
4866 uprv_memcpy(primaries, secStart, secsize);
4867 primaries += secsize;
4868 } else {
4869 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4870 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4871 if(U_SUCCESS(*status)) {
4872 *result = primStart;
4873 *(primaries++) = UCOL_LEVELTERMINATOR;
4874 uprv_memcpy(primaries, secStart, secsize);
4875 primaries += secsize;
4876 }
4877 } else {
4878 *status = U_BUFFER_OVERFLOW_ERROR;
4879 }
4880 }
4881 } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4882 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4883 sortKeySize += secsize;
4884 if(sortKeySize <= resultLength) { // if we managed to pack fine
4885 primaries = newPrim; // update the primary pointer
4886 } else { // overflow, need to reallocate and redo
4887 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4888 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4889 if(U_SUCCESS(*status)) {
4890 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4891 }
4892 } else {
4893 *status = U_BUFFER_OVERFLOW_ERROR;
4894 }
4895 }
4896 }
4897 }
4898
4899 if(doCase) {
4900 uint32_t casesize = cases - caseStart;
4901 sortKeySize += casesize;
4902 if(sortKeySize <= resultLength) {
4903 *(primaries++) = UCOL_LEVELTERMINATOR;
4904 uprv_memcpy(primaries, caseStart, casesize);
4905 primaries += casesize;
4906 } else {
4907 if(allocateSKBuffer == TRUE) {
4908 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4909 if(U_SUCCESS(*status)) {
4910 *result = primStart;
4911 *(primaries++) = UCOL_LEVELTERMINATOR;
4912 uprv_memcpy(primaries, caseStart, casesize);
4913 }
4914 } else {
4915 *status = U_BUFFER_OVERFLOW_ERROR;
4916 }
4917 }
4918 }
4919
4920 if(compareTer == 0) {
4921 if (count3 > 0) {
4922 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4923 while (count3 >= coll->tertiaryTopCount) {
4924 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4925 count3 -= (uint32_t)coll->tertiaryTopCount;
4926 }
4927 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4928 } else {
4929 while (count3 > coll->tertiaryBottomCount) {
4930 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4931 count3 -= (uint32_t)coll->tertiaryBottomCount;
4932 }
4933 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4934 }
4935 }
4936 uint32_t tersize = tertiaries - terStart;
4937 sortKeySize += tersize;
4938 if(sortKeySize <= resultLength) {
4939 *(primaries++) = UCOL_LEVELTERMINATOR;
4940 uprv_memcpy(primaries, terStart, tersize);
4941 primaries += tersize;
4942 } else {
4943 if(allocateSKBuffer == TRUE) {
4944 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4945 if(U_SUCCESS(*status)) {
4946 *result = primStart;
4947 *(primaries++) = UCOL_LEVELTERMINATOR;
4948 uprv_memcpy(primaries, terStart, tersize);
4949 }
4950 } else {
4951 *status = U_BUFFER_OVERFLOW_ERROR;
4952 }
4953 }
4954
4955 if(compareQuad == 0/*qShifted == TRUE*/) {
4956 if(count4 > 0) {
4957 while (count4 > UCOL_BOT_COUNT4) {
4958 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4959 count4 -= UCOL_BOT_COUNT4;
4960 }
4961 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4962 }
4963 uint32_t quadsize = quads - quadStart;
4964 sortKeySize += quadsize;
4965 if(sortKeySize <= resultLength) {
4966 *(primaries++) = UCOL_LEVELTERMINATOR;
4967 uprv_memcpy(primaries, quadStart, quadsize);
4968 primaries += quadsize;
4969 } else {
4970 if(allocateSKBuffer == TRUE) {
4971 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4972 if(U_SUCCESS(*status)) {
4973 *result = primStart;
4974 *(primaries++) = UCOL_LEVELTERMINATOR;
4975 uprv_memcpy(primaries, quadStart, quadsize);
4976 }
4977 } else {
4978 *status = U_BUFFER_OVERFLOW_ERROR;
4979 }
4980 }
4981 }
4982
4983 if(compareIdent) {
4984 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4985 if(sortKeySize <= resultLength) {
4986 *(primaries++) = UCOL_LEVELTERMINATOR;
4987 primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
4988 } else {
4989 if(allocateSKBuffer == TRUE) {
4990 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
4991 if(U_SUCCESS(*status)) {
4992 *result = primStart;
4993 *(primaries++) = UCOL_LEVELTERMINATOR;
4994 u_writeIdenticalLevelRun(s.string, len, primaries);
4995 }
4996 } else {
4997 *status = U_BUFFER_OVERFLOW_ERROR;
4998 }
4999 }
5000 }
5001 }
5002 *(primaries++) = '\0';
5003 }
5004
5005 if(terStart != tert) {
5006 uprv_free(terStart);
5007 uprv_free(secStart);
5008 uprv_free(caseStart);
5009 uprv_free(quadStart);
5010 }
5011
5012 if(normSource != normBuffer) {
5013 uprv_free(normSource);
5014 }
5015
5016 if(allocateSKBuffer == TRUE) {
5017 *result = (uint8_t*)uprv_malloc(sortKeySize);
5018 /* test for NULL */
5019 if (*result == NULL) {
5020 *status = U_MEMORY_ALLOCATION_ERROR;
5021 return sortKeySize;
5022 }
5023 uprv_memcpy(*result, primStart, sortKeySize);
5024 if(primStart != prim) {
5025 uprv_free(primStart);
5026 }
5027 }
5028
5029 return sortKeySize;
5030 }
5031
5032
5033 U_CFUNC int32_t U_CALLCONV
5034 ucol_calcSortKeySimpleTertiary(const UCollator *coll,
5035 const UChar *source,
5036 int32_t sourceLength,
5037 uint8_t **result,
5038 uint32_t resultLength,
5039 UBool allocateSKBuffer,
5040 UErrorCode *status)
5041 {
5042 U_ALIGN_CODE(16);
5043
5044 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5045 uint32_t i = 0; /* general purpose counter */
5046
5047 /* Stack allocated buffers for buffers we use */
5048 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5049
5050 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5051
5052 if(U_FAILURE(*status)) {
5053 return 0;
5054 }
5055
5056 if(primaries == NULL && allocateSKBuffer == TRUE) {
5057 primaries = *result = prim;
5058 resultLength = UCOL_PRIMARY_MAX_BUFFER;
5059 }
5060
5061 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5062
5063 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5064
5065 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5066 UChar *normSource = normBuffer;
5067 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5068
5069 int32_t len = sourceLength;
5070
5071 /* If we need to normalize, we'll do it all at once at the beginning! */
5072 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5073 len = unorm_internalNormalize(normSource, normSourceLen,
5074 source, len,
5075 UNORM_FCD, FALSE,
5076 status);
5077 if(*status == U_BUFFER_OVERFLOW_ERROR) {
5078 normSourceLen = len;
5079 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5080 if(normSource == NULL) {
5081 *status = U_MEMORY_ALLOCATION_ERROR;
5082 return 0;
5083 }
5084 *status = U_ZERO_ERROR;
5085 len = unorm_internalNormalize(normSource, normSourceLen,
5086 source, len,
5087 UNORM_FCD, FALSE,
5088 status);
5089 }
5090
5091 if(U_FAILURE(*status)) {
5092 return 0;
5093 }
5094 source = normSource;
5095 }
5096
5097 collIterate s;
5098 IInit_collIterate(coll, (UChar *)source, len, &s);
5099 if(source == normSource) {
5100 s.flags &= ~UCOL_ITER_NORM;
5101 }
5102
5103 if(resultLength == 0 || primaries == NULL) {
5104 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5105 if(normSource != normBuffer) {
5106 uprv_free(normSource);
5107 }
5108 return t;
5109 }
5110
5111 uint8_t *primarySafeEnd = primaries + resultLength - 2;
5112
5113 uint32_t minBufferSize = UCOL_MAX_BUFFER;
5114
5115 uint8_t *primStart = primaries;
5116 uint8_t *secStart = secondaries;
5117 uint8_t *terStart = tertiaries;
5118
5119 uint32_t order = 0;
5120
5121 uint8_t primary1 = 0;
5122 uint8_t primary2 = 0;
5123 uint8_t secondary = 0;
5124 uint8_t tertiary = 0;
5125 uint8_t caseSwitch = coll->caseSwitch;
5126 uint8_t tertiaryMask = coll->tertiaryMask;
5127 int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5128 uint8_t tertiaryTop = coll->tertiaryTop;
5129 uint8_t tertiaryBottom = coll->tertiaryBottom;
5130 uint8_t tertiaryCommon = coll->tertiaryCommon;
5131
5132 uint32_t prevBuffSize = 0;
5133
5134 UBool finished = FALSE;
5135 UBool notIsContinuation = FALSE;
5136
5137 uint32_t count2 = 0, count3 = 0;
5138 uint8_t leadPrimary = 0;
5139
5140 for(;;) {
5141 for(i=prevBuffSize; i<minBufferSize; ++i) {
5142
5143 order = ucol_IGetNextCE(coll, &s, status);
5144
5145 if(order == 0) {
5146 continue;
5147 }
5148
5149 if(order == UCOL_NO_MORE_CES) {
5150 finished = TRUE;
5151 break;
5152 }
5153
5154 notIsContinuation = !isContinuation(order);
5155
5156 if(notIsContinuation) {
5157 tertiary = (uint8_t)((order & tertiaryMask));
5158 } else {
5159 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5160 }
5161 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5162 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5163 primary1 = (uint8_t)(order >> 8);
5164
5165 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5166 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
5167 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
5168 /* regular and simple sortkey calc */
5169 if(primary1 != UCOL_IGNORABLE) {
5170 if(notIsContinuation) {
5171 if(leadPrimary == primary1) {
5172 *primaries++ = primary2;
5173 } else {
5174 if(leadPrimary != 0) {
5175 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5176 }
5177 if(primary2 == UCOL_IGNORABLE) {
5178 /* one byter, not compressed */
5179 *primaries++ = primary1;
5180 leadPrimary = 0;
5181 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5182 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5183 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5184 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5185 /* not compressible */
5186 leadPrimary = 0;
5187 *primaries++ = primary1;
5188 *primaries++ = primary2;
5189 } else { /* compress */
5190 *primaries++ = leadPrimary = primary1;
5191 *primaries++ = primary2;
5192 }
5193 }
5194 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5195 *primaries++ = primary1;
5196 if(primary2 != UCOL_IGNORABLE) {
5197 *primaries++ = primary2; /* second part */
5198 }
5199 }
5200 }
5201
5202 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5203 /* This is compression code. */
5204 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5205 ++count2;
5206 } else {
5207 if (count2 > 0) {
5208 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5209 while (count2 > UCOL_TOP_COUNT2) {
5210 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5211 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5212 }
5213 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5214 } else {
5215 while (count2 > UCOL_BOT_COUNT2) {
5216 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5217 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5218 }
5219 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5220 }
5221 count2 = 0;
5222 }
5223 *secondaries++ = secondary;
5224 }
5225 }
5226
5227 if(notIsContinuation) {
5228 tertiary ^= caseSwitch;
5229 }
5230
5231 if(tertiary > 0) {
5232 /* This is compression code. */
5233 /* sequence size check is included in the if clause */
5234 if (tertiary == tertiaryCommon && notIsContinuation) {
5235 ++count3;
5236 } else {
5237 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5238 tertiary += tertiaryAddition;
5239 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5240 tertiary -= tertiaryAddition;
5241 }
5242 if (count3 > 0) {
5243 if ((tertiary > tertiaryCommon)) {
5244 while (count3 > coll->tertiaryTopCount) {
5245 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5246 count3 -= (uint32_t)coll->tertiaryTopCount;
5247 }
5248 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5249 } else {
5250 while (count3 > coll->tertiaryBottomCount) {
5251 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5252 count3 -= (uint32_t)coll->tertiaryBottomCount;
5253 }
5254 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5255 }
5256 count3 = 0;
5257 }
5258 *tertiaries++ = tertiary;
5259 }
5260 }
5261
5262 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5263 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5264 IInit_collIterate(coll, (UChar *)source, len, &s);
5265 if(source == normSource) {
5266 s.flags &= ~UCOL_ITER_NORM;
5267 }
5268 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5269 *status = U_BUFFER_OVERFLOW_ERROR;
5270 finished = TRUE;
5271 break;
5272 } else { /* It's much nicer if we can actually reallocate */
5273 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5274 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5275 if(U_SUCCESS(*status)) {
5276 *result = primStart;
5277 primarySafeEnd = primStart + resultLength - 2;
5278 } else {
5279 IInit_collIterate(coll, (UChar *)source, len, &s);
5280 if(source == normSource) {
5281 s.flags &= ~UCOL_ITER_NORM;
5282 }
5283 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5284 finished = TRUE;
5285 break;
5286 }
5287 }
5288 }
5289 }
5290 if(finished) {
5291 break;
5292 } else {
5293 prevBuffSize = minBufferSize;
5294 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5295 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5296 minBufferSize *= 2;
5297 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5298 IInit_collIterate(coll, (UChar *)source, len, &s);
5299 if(source == normSource) {
5300 s.flags &= ~UCOL_ITER_NORM;
5301 }
5302 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5303 break;
5304 }
5305 }
5306 }
5307
5308 if(U_SUCCESS(*status)) {
5309 sortKeySize += (primaries - primStart);
5310 /* we have done all the CE's, now let's put them together to form a key */
5311 if (count2 > 0) {
5312 while (count2 > UCOL_BOT_COUNT2) {
5313 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5314 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5315 }
5316 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5317 }
5318 uint32_t secsize = secondaries-secStart;
5319 sortKeySize += secsize;
5320 if(sortKeySize <= resultLength) {
5321 *(primaries++) = UCOL_LEVELTERMINATOR;
5322 uprv_memcpy(primaries, secStart, secsize);
5323 primaries += secsize;
5324 } else {
5325 if(allocateSKBuffer == TRUE) {
5326 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5327 if(U_SUCCESS(*status)) {
5328 *(primaries++) = UCOL_LEVELTERMINATOR;
5329 *result = primStart;
5330 uprv_memcpy(primaries, secStart, secsize);
5331 }
5332 } else {
5333 *status = U_BUFFER_OVERFLOW_ERROR;
5334 }
5335 }
5336
5337 if (count3 > 0) {
5338 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5339 while (count3 >= coll->tertiaryTopCount) {
5340 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5341 count3 -= (uint32_t)coll->tertiaryTopCount;
5342 }
5343 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5344 } else {
5345 while (count3 > coll->tertiaryBottomCount) {
5346 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5347 count3 -= (uint32_t)coll->tertiaryBottomCount;
5348 }
5349 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5350 }
5351 }
5352 uint32_t tersize = tertiaries - terStart;
5353 sortKeySize += tersize;
5354 if(sortKeySize <= resultLength) {
5355 *(primaries++) = UCOL_LEVELTERMINATOR;
5356 uprv_memcpy(primaries, terStart, tersize);
5357 primaries += tersize;
5358 } else {
5359 if(allocateSKBuffer == TRUE) {
5360 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5361 if(U_SUCCESS(*status)) {
5362 *result = primStart;
5363 *(primaries++) = UCOL_LEVELTERMINATOR;
5364 uprv_memcpy(primaries, terStart, tersize);
5365 }
5366 } else {
5367 *status = U_MEMORY_ALLOCATION_ERROR;
5368 }
5369 }
5370
5371 *(primaries++) = '\0';
5372 }
5373
5374 if(terStart != tert) {
5375 uprv_free(terStart);
5376 uprv_free(secStart);
5377 }
5378
5379 if(normSource != normBuffer) {
5380 uprv_free(normSource);
5381 }
5382
5383 if(allocateSKBuffer == TRUE) {
5384 *result = (uint8_t*)uprv_malloc(sortKeySize);
5385 /* test for NULL */
5386 if (*result == NULL) {
5387 *status = U_MEMORY_ALLOCATION_ERROR;
5388 return sortKeySize;
5389 }
5390 uprv_memcpy(*result, primStart, sortKeySize);
5391 if(primStart != prim) {
5392 uprv_free(primStart);
5393 }
5394 }
5395
5396 return sortKeySize;
5397 }
5398
5399 static inline
5400 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5401 UBool notIsContinuation = !isContinuation(CE);
5402 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5403 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5404 || (!notIsContinuation && *wasShifted))
5405 || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5406 // The stuff below should probably be in the sortkey code... maybe not...
5407 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5408 /* we should just completely ignore it */
5409 *wasShifted = TRUE;
5410 //continue;
5411 }
5412 //*wasShifted = TRUE;
5413 return TRUE;
5414 } else {
5415 *wasShifted = FALSE;
5416 return FALSE;
5417 }
5418 }
5419 static inline
5420 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5421 if(level < maxLevel) {
5422 dest[i++] = UCOL_LEVELTERMINATOR;
5423 } else {
5424 dest[i++] = 0;
5425 }
5426 }
5427
/**
 * Level identifiers used while generating a partial sort key.
 * The explicit values are kept because they have to fit the three-bit
 * level field described by UCOL_PSK_LEVEL_MASK.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of the sort key; will just produce zeros */
    UCOL_PSK_NULL       = 7,
    UCOL_PSK_LIMIT
};
5440
/**
 * Layout of the collation state word kept in state[1].
 * To read a field, shift the word right by its *_SHIFT value and AND the
 * result with the matching *_MASK value.
 */
enum {
    /** level identifier: one of the UCOL_PSK_* level values, three bits */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary already written. Can only
     * be 0 or 1, since we get up to two bytes from primary or quaternary.
     * The same bit also denotes that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** was the last value shifted - 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * How many French bytes have we already written - up to 4 bytes.
     * When we do French we need to reverse secondary values. However,
     * continuations need to stay the same. So if you had abc1c2c3de,
     * you need to have edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** two-bit field located right above the French byte count */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /** 19-bit field starting at bit 9 */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5466
// Macro calculating the number of expansion CEs available: the distance
// between the CEpos and toReturn members of s. The expansion is fully
// parenthesized so that it stays one operand inside a larger expression
// (e.g. 2 * uprv_numAvailableExpCEs(s)).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5469
5470
5471 /** main sortkey part procedure. On the first call,
5472 * you should pass in a collator, an iterator, an empty state
5473 * (state[0] == state[1] == 0), a buffer to hold the results,
5474 * the number of bytes you need and an error code pointer.
5475 * Make sure your buffer is big enough to hold the wanted
5476 * number of sortkey bytes. I don't check.
5477 * The only meaningful status you can get back is
5478 * U_BUFFER_OVERFLOW_ERROR, which basically means that you
5479 * have been dealt a raw deal and that you probably won't
5480 * be able to use partial sortkey generation for this
5481 * particular combination of string and collator. This
5482 * is highly unlikely, but you should still check the error code.
5483 * Any other status means that you're not in a sane situation
5484 * anymore. After the first call, preserve state values and
5485 * use them on subsequent calls to obtain more bytes of a sortkey.
5486 * Use until the number of bytes written is smaller than the requested
5487 * number of bytes. Generated sortkey is not compatible with the
5488 * one generated by ucol_getSortKey, as we don't do any compression.
5489 * However, levels are still terminated by a 1 (one) and the sortkey
5490 * is terminated by a 0 (zero). Identical level is the same as in the
5491 * regular sortkey - internal bocu-1 implementation is used.
5492 * For the curious, although you cannot do much about this, here is
5493 * the structure of state words.
5494 * state[0] - iterator state. Depends on the iterator implementation,
5495 * but allows the iterator to continue where it stopped in
5496 * the last iteration.
5497 * state[1] - collation processing state. Here is the distribution
5498 * of the bits:
5499 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5500 * quaternary, quin (we don't use this one), identical and
5501 * null (producing only zeroes - first one to terminate the
5502 * sortkey and subsequent to fill the buffer).
5503 * 3 - byte count. Number of bytes written on the primary level.
5504 * 4 - was shifted. Whether the previous iteration finished in the
5505 * shifted state.
5506 * 5, 6 - French continuation bytes written. See the comment in the enum
5507 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on
5508 * the identical level.
5509 * 9..27 - CEs consumed. Number of getCE or next32 operations performed
5510 * since the last successful update of the iterator state.
5511 */
5512 U_CAPI int32_t U_EXPORT2
5513 ucol_nextSortKeyPart(const UCollator *coll,
5514 UCharIterator *iter,
5515 uint32_t state[2],
5516 uint8_t *dest, int32_t count,
5517 UErrorCode *status) {
5518 /* error checking */
5519 if(status==NULL || U_FAILURE(*status)) {
5520 return 0;
5521 }
5522 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5523 if( coll==NULL || iter==NULL ||
5524 state==NULL ||
5525 count<0 || (count>0 && dest==NULL)
5526 ) {
5527 *status=U_ILLEGAL_ARGUMENT_ERROR;
5528 }
5529
5530 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5531 coll, iter, state[0], state[1], dest, count);
5532
5533 if(count==0) {
5534 /* nothing to do */
5535 UTRACE_EXIT_VALUE(0);
5536 return 0;
5537 }
5538 /** Setting up situation according to the state we got from the previous iteration */
5539 // The state of the iterator from the previous invocation
5540 uint32_t iterState = state[0];
5541 // Has the last iteration ended in the shifted state
5542 UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5543 // What is the current level of the sortkey?
5544 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5545 // Have we written only one byte from a two byte primary in the previous iteration?
5546 // Also on secondary level - have we finished with the French secondary?
5547 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5548 // number of bytes in the continuation buffer for French
5549 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5550 // Number of bytes already written from a bocsu sequence. Since
5551 // the longes bocsu sequence is 4 long, this can be up to 3.
5552 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5553 // Number of elements that need to be consumed in this iteration because
5554 // the iterator returned UITER_NO_STATE at the end of the last iteration,
5555 // so we had to save the last valid state.
5556 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5557
5558 /** values that depend on the collator attributes */
5559 // strength of the collator.
5560 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5561 // maximal level of the partial sortkey. Need to take whether case level is done
5562 int32_t maxLevel = 0;
5563 if(strength < UCOL_TERTIARY) {
5564 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5565 maxLevel = UCOL_PSK_CASE;
5566 } else {
5567 maxLevel = strength;
5568 }
5569 } else {
5570 if(strength == UCOL_TERTIARY) {
5571 maxLevel = UCOL_PSK_TERTIARY;
5572 } else if(strength == UCOL_QUATERNARY) {
5573 maxLevel = UCOL_PSK_QUATERNARY;
5574 } else { // identical
5575 maxLevel = UCOL_IDENTICAL;
5576 }
5577 }
5578 // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5579 uint8_t UCOL_HIRAGANA_QUAD =
5580 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5581 // Boundary value that decides whether a CE is shifted or not
5582 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5583 // Are we doing French collation?
5584 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5585
5586 /** initializing the collation state */
5587 UBool notIsContinuation = FALSE;
5588 uint32_t CE = UCOL_NO_MORE_CES;
5589
5590 collIterate s;
5591 IInit_collIterate(coll, NULL, -1, &s);
5592 s.iterator = iter;
5593 s.flags |= UCOL_USE_ITERATOR;
5594 // This variable tells us whether we have produced some other levels in this iteration
5595 // before we moved to the identical level. In that case, we need to switch the
5596 // type of the iterator.
5597 UBool doingIdenticalFromStart = FALSE;
5598 // Normalizing iterator
5599 // The division for the array length may truncate the array size to
5600 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5601 // for all platforms anyway.
5602 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5603 UNormIterator *normIter = NULL;
5604 // If the normalization is turned on for the collator and we are below identical level
5605 // we will use a FCD normalizing iterator
5606 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5607 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5608 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5609 s.flags &= ~UCOL_ITER_NORM;
5610 if(U_FAILURE(*status)) {
5611 UTRACE_EXIT_STATUS(*status);
5612 return 0;
5613 }
5614 } else if(level == UCOL_PSK_IDENTICAL) {
5615 // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5616 // will be updating the state - and this cannot be done on an ordinary iterator.
5617 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5618 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5619 s.flags &= ~UCOL_ITER_NORM;
5620 if(U_FAILURE(*status)) {
5621 UTRACE_EXIT_STATUS(*status);
5622 return 0;
5623 }
5624 doingIdenticalFromStart = TRUE;
5625 }
5626
5627 // This is the tentative new state of the iterator. The problem
5628 // is that the iterator might return an undefined state, in
5629 // which case we should save the last valid state and increase
5630 // the iterator skip value.
5631 uint32_t newState = 0;
5632
5633 // First, we set the iterator to the last valid position
5634 // from the last iteration. This was saved in state[0].
5635 if(iterState == 0) {
5636 /* initial state */
5637 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5638 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5639 } else {
5640 s.iterator->move(s.iterator, 0, UITER_START);
5641 }
5642 } else {
5643 /* reset to previous state */
5644 s.iterator->setState(s.iterator, iterState, status);
5645 if(U_FAILURE(*status)) {
5646 UTRACE_EXIT_STATUS(*status);
5647 return 0;
5648 }
5649 }
5650
5651
5652
5653 // This variable tells us whether we can attempt to update the state
5654 // of iterator. Situations where we don't want to update iterator state
5655 // are the existence of expansion CEs that are not yet processed, and
5656 // finishing the case level without enough space in the buffer to insert
5657 // a level terminator.
5658 UBool canUpdateState = TRUE;
5659
5660 // Consume all the CEs that were consumed at the end of the previous
5661 // iteration without updating the iterator state. On identical level,
5662 // consume the code points.
5663 int32_t counter = cces;
5664 if(level < UCOL_PSK_IDENTICAL) {
5665 while(counter-->0) {
5666 // If we're doing French and we are on the secondary level,
5667 // we go backwards.
5668 if(level == UCOL_PSK_SECONDARY && doingFrench) {
5669 CE = ucol_IGetPrevCE(coll, &s, status);
5670 } else {
5671 CE = ucol_IGetNextCE(coll, &s, status);
5672 }
5673 if(CE==UCOL_NO_MORE_CES) {
5674 /* should not happen */
5675 *status=U_INTERNAL_PROGRAM_ERROR;
5676 UTRACE_EXIT_STATUS(*status);
5677 return 0;
5678 }
5679 if(uprv_numAvailableExpCEs(s)) {
5680 canUpdateState = FALSE;
5681 }
5682 }
5683 } else {
5684 while(counter-->0) {
5685 uiter_next32(s.iterator);
5686 }
5687 }
5688
5689 // French secondary needs to know whether the iterator state of zero came from previous level OR
5690 // from a new invocation...
5691 UBool wasDoingPrimary = FALSE;
5692 // destination buffer byte counter. When this guy
5693 // gets to count, we're done with the iteration
5694 int32_t i = 0;
5695 // used to count the zero bytes written after we
5696 // have finished with the sort key
5697 int32_t j = 0;
5698
5699
5700 // Hm.... I think we're ready to plunge in. Basic story is as following:
5701 // we have a fall through case based on level. This is used for initial
5702 // positioning on iteration start. Every level processor contains a
5703 // for(;;) which will be broken when we exhaust all the CEs. Other
5704 // way to exit is a goto saveState, which happens when we have filled
5705 // out our buffer.
5706 switch(level) {
5707 case UCOL_PSK_PRIMARY:
5708 wasDoingPrimary = TRUE;
5709 for(;;) {
5710 if(i==count) {
5711 goto saveState;
5712 }
5713 // We should save the state only if we
5714 // are sure that we are done with the
5715 // previous iterator state
5716 if(canUpdateState && byteCountOrFrenchDone == 0) {
5717 newState = s.iterator->getState(s.iterator);
5718 if(newState != UITER_NO_STATE) {
5719 iterState = newState;
5720 cces = 0;
5721 }
5722 }
5723 CE = ucol_IGetNextCE(coll, &s, status);
5724 cces++;
5725 if(CE==UCOL_NO_MORE_CES) {
5726 // Add the level separator
5727 terminatePSKLevel(level, maxLevel, i, dest);
5728 byteCountOrFrenchDone=0;
5729 // Restart the iteration an move to the
5730 // second level
5731 s.iterator->move(s.iterator, 0, UITER_START);
5732 cces = 0;
5733 level = UCOL_PSK_SECONDARY;
5734 break;
5735 }
5736 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5737 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5738 if(CE != 0) {
5739 if(byteCountOrFrenchDone == 0) {
5740 // get the second byte of primary
5741 dest[i++]=(uint8_t)(CE >> 8);
5742 } else {
5743 byteCountOrFrenchDone = 0;
5744 }
5745 if((CE &=0xff)!=0) {
5746 if(i==count) {
5747 /* overflow */
5748 byteCountOrFrenchDone = 1;
5749 cces--;
5750 goto saveState;
5751 }
5752 dest[i++]=(uint8_t)CE;
5753 }
5754 }
5755 }
5756 if(uprv_numAvailableExpCEs(s)) {
5757 canUpdateState = FALSE;
5758 } else {
5759 canUpdateState = TRUE;
5760 }
5761 }
5762 /* fall through to next level */
5763 case UCOL_PSK_SECONDARY:
5764 if(strength >= UCOL_SECONDARY) {
5765 if(!doingFrench) {
5766 for(;;) {
5767 if(i == count) {
5768 goto saveState;
5769 }
5770 // We should save the state only if we
5771 // are sure that we are done with the
5772 // previous iterator state
5773 if(canUpdateState) {
5774 newState = s.iterator->getState(s.iterator);
5775 if(newState != UITER_NO_STATE) {
5776 iterState = newState;
5777 cces = 0;
5778 }
5779 }
5780 CE = ucol_IGetNextCE(coll, &s, status);
5781 cces++;
5782 if(CE==UCOL_NO_MORE_CES) {
5783 // Add the level separator
5784 terminatePSKLevel(level, maxLevel, i, dest);
5785 byteCountOrFrenchDone = 0;
5786 // Restart the iteration an move to the
5787 // second level
5788 s.iterator->move(s.iterator, 0, UITER_START);
5789 cces = 0;
5790 level = UCOL_PSK_CASE;
5791 break;
5792 }
5793 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5794 CE >>= 8; /* get secondary */
5795 if(CE != 0) {
5796 dest[i++]=(uint8_t)CE;
5797 }
5798 }
5799 if(uprv_numAvailableExpCEs(s)) {
5800 canUpdateState = FALSE;
5801 } else {
5802 canUpdateState = TRUE;
5803 }
5804 }
5805 } else { // French secondary processing
5806 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5807 int32_t frenchIndex = 0;
5808 // Here we are going backwards.
5809 // If the iterator is at the beggining, it should be
5810 // moved to end.
5811 if(wasDoingPrimary) {
5812 s.iterator->move(s.iterator, 0, UITER_LIMIT);
5813 cces = 0;
5814 }
5815 for(;;) {
5816 if(i == count) {
5817 goto saveState;
5818 }
5819 if(canUpdateState) {
5820 newState = s.iterator->getState(s.iterator);
5821 if(newState != UITER_NO_STATE) {
5822 iterState = newState;
5823 cces = 0;
5824 }
5825 }
5826 CE = ucol_IGetPrevCE(coll, &s, status);
5827 cces++;
5828 if(CE==UCOL_NO_MORE_CES) {
5829 // Add the level separator
5830 terminatePSKLevel(level, maxLevel, i, dest);
5831 byteCountOrFrenchDone = 0;
5832 // Restart the iteration an move to the next level
5833 s.iterator->move(s.iterator, 0, UITER_START);
5834 level = UCOL_PSK_CASE;
5835 break;
5836 }
5837 if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5838 // reverse when we get a first non-continuation CE.
5839 CE >>= 8;
5840 frenchBuff[frenchIndex++] = (uint8_t)CE;
5841 } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5842 CE >>= 8; /* get secondary */
5843 if(!frenchIndex) {
5844 if(CE != 0) {
5845 dest[i++]=(uint8_t)CE;
5846 }
5847 } else {
5848 frenchBuff[frenchIndex++] = (uint8_t)CE;
5849 frenchIndex -= usedFrench;
5850 usedFrench = 0;
5851 while(i < count && frenchIndex) {
5852 dest[i++] = frenchBuff[--frenchIndex];
5853 usedFrench++;
5854 }
5855 }
5856 }
5857 if(uprv_numAvailableExpCEs(s)) {
5858 canUpdateState = FALSE;
5859 } else {
5860 canUpdateState = TRUE;
5861 }
5862 }
5863 }
5864 } else {
5865 level = UCOL_PSK_CASE;
5866 }
5867 /* fall through to next level */
5868 case UCOL_PSK_CASE:
5869 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5870 uint32_t caseShift = UCOL_CASE_SHIFT_START;
5871 uint8_t caseByte = UCOL_CASE_BYTE_START;
5872 uint8_t caseBits = 0;
5873
5874 for(;;) {
5875 if(i == count) {
5876 goto saveState;
5877 }
5878 // We should save the state only if we
5879 // are sure that we are done with the
5880 // previous iterator state
5881 if(canUpdateState) {
5882 newState = s.iterator->getState(s.iterator);
5883 if(newState != UITER_NO_STATE) {
5884 iterState = newState;
5885 cces = 0;
5886 }
5887 }
5888 CE = ucol_IGetNextCE(coll, &s, status);
5889 cces++;
5890 if(CE==UCOL_NO_MORE_CES) {
5891 // On the case level we might have an unfinished
5892 // case byte. Add one if it's started.
5893 if(caseShift != UCOL_CASE_SHIFT_START) {
5894 dest[i++] = caseByte;
5895 }
5896 cces = 0;
5897 // We have finished processing CEs on this level.
5898 // However, we don't know if we have enough space
5899 // to add a case level terminator.
5900 if(i < count) {
5901 // Add the level separator
5902 terminatePSKLevel(level, maxLevel, i, dest);
5903 // Restart the iteration and move to the
5904 // next level
5905 s.iterator->move(s.iterator, 0, UITER_START);
5906 level = UCOL_PSK_TERTIARY;
5907 } else {
5908 canUpdateState = FALSE;
5909 }
5910 break;
5911 }
5912
5913 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5914 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5915 // do the case level if we need to do it. We don't want to calculate
5916 // case level for primary ignorables if we have only primary strength and case level
5917 // otherwise we would break well formedness of CEs
5918 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5919 caseBits = (uint8_t)(CE & 0xC0);
5920 // this copies the case level logic from the
5921 // sort key generation code
5922 if(CE != 0) {
5923 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5924 if((caseBits & 0xC0) == 0) {
5925 caseByte |= 1 << (--caseShift);
5926 } else {
5927 caseByte |= 0 << (--caseShift);
5928 /* second bit */
5929 if(caseShift == 0) {
5930 dest[i++] = caseByte;
5931 caseShift = UCOL_CASE_SHIFT_START;
5932 caseByte = UCOL_CASE_BYTE_START;
5933 }
5934 caseByte |= ((caseBits>>6)&1) << (--caseShift);
5935 }
5936 } else {
5937 if((caseBits & 0xC0) == 0) {
5938 caseByte |= 0 << (--caseShift);
5939 } else {
5940 caseByte |= 1 << (--caseShift);
5941 /* second bit */
5942 if(caseShift == 0) {
5943 dest[i++] = caseByte;
5944 caseShift = UCOL_CASE_SHIFT_START;
5945 caseByte = UCOL_CASE_BYTE_START;
5946 }
5947 caseByte |= ((caseBits>>7)&1) << (--caseShift);
5948 }
5949 }
5950 }
5951
5952 }
5953 }
5954 // Not sure this is correct for the case level - revisit
5955 if(uprv_numAvailableExpCEs(s)) {
5956 canUpdateState = FALSE;
5957 } else {
5958 canUpdateState = TRUE;
5959 }
5960 }
5961 } else {
5962 level = UCOL_PSK_TERTIARY;
5963 }
5964 /* fall through to next level */
5965 case UCOL_PSK_TERTIARY:
5966 if(strength >= UCOL_TERTIARY) {
5967 for(;;) {
5968 if(i == count) {
5969 goto saveState;
5970 }
5971 // We should save the state only if we
5972 // are sure that we are done with the
5973 // previous iterator state
5974 if(canUpdateState) {
5975 newState = s.iterator->getState(s.iterator);
5976 if(newState != UITER_NO_STATE) {
5977 iterState = newState;
5978 cces = 0;
5979 }
5980 }
5981 CE = ucol_IGetNextCE(coll, &s, status);
5982 cces++;
5983 if(CE==UCOL_NO_MORE_CES) {
5984 // Add the level separator
5985 terminatePSKLevel(level, maxLevel, i, dest);
5986 byteCountOrFrenchDone = 0;
5987 // Restart the iteration an move to the
5988 // second level
5989 s.iterator->move(s.iterator, 0, UITER_START);
5990 cces = 0;
5991 level = UCOL_PSK_QUATERNARY;
5992 break;
5993 }
5994 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5995 notIsContinuation = !isContinuation(CE);
5996
5997 if(notIsContinuation) {
5998 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5999 CE ^= coll->caseSwitch;
6000 CE &= coll->tertiaryMask;
6001 } else {
6002 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6003 }
6004
6005 if(CE != 0) {
6006 dest[i++]=(uint8_t)CE;
6007 }
6008 }
6009 if(uprv_numAvailableExpCEs(s)) {
6010 canUpdateState = FALSE;
6011 } else {
6012 canUpdateState = TRUE;
6013 }
6014 }
6015 } else {
6016 // if we're not doing tertiary
6017 // skip to the end
6018 level = UCOL_PSK_NULL;
6019 }
6020 /* fall through to next level */
6021 case UCOL_PSK_QUATERNARY:
6022 if(strength >= UCOL_QUATERNARY) {
6023 for(;;) {
6024 if(i == count) {
6025 goto saveState;
6026 }
6027 // We should save the state only if we
6028 // are sure that we are done with the
6029 // previous iterator state
6030 if(canUpdateState) {
6031 newState = s.iterator->getState(s.iterator);
6032 if(newState != UITER_NO_STATE) {
6033 iterState = newState;
6034 cces = 0;
6035 }
6036 }
6037 CE = ucol_IGetNextCE(coll, &s, status);
6038 cces++;
6039 if(CE==UCOL_NO_MORE_CES) {
6040 // Add the level separator
6041 terminatePSKLevel(level, maxLevel, i, dest);
6042 //dest[i++] = UCOL_LEVELTERMINATOR;
6043 byteCountOrFrenchDone = 0;
6044 // Restart the iteration an move to the
6045 // second level
6046 s.iterator->move(s.iterator, 0, UITER_START);
6047 cces = 0;
6048 level = UCOL_PSK_QUIN;
6049 break;
6050 }
6051 if(isShiftedCE(CE, LVT, &wasShifted)) {
6052 CE >>= 16; /* get primary */
6053 if(CE != 0) {
6054 if(byteCountOrFrenchDone == 0) {
6055 dest[i++]=(uint8_t)(CE >> 8);
6056 } else {
6057 byteCountOrFrenchDone = 0;
6058 }
6059 if((CE &=0xff)!=0) {
6060 if(i==count) {
6061 /* overflow */
6062 byteCountOrFrenchDone = 1;
6063 goto saveState;
6064 }
6065 dest[i++]=(uint8_t)CE;
6066 }
6067 }
6068 } else {
6069 notIsContinuation = !isContinuation(CE);
6070 if(notIsContinuation) {
6071 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6072 dest[i++] = UCOL_HIRAGANA_QUAD;
6073 } else {
6074 dest[i++] = 0xFF;
6075 }
6076 }
6077 }
6078 if(uprv_numAvailableExpCEs(s)) {
6079 canUpdateState = FALSE;
6080 } else {
6081 canUpdateState = TRUE;
6082 }
6083 }
6084 } else {
6085 // if we're not doing quaternary
6086 // skip to the end
6087 level = UCOL_PSK_NULL;
6088 }
6089 /* fall through to next level */
6090 case UCOL_PSK_QUIN:
6091 level = UCOL_PSK_IDENTICAL;
6092 /* fall through to next level */
6093 case UCOL_PSK_IDENTICAL:
6094 if(strength >= UCOL_IDENTICAL) {
6095 UChar32 first, second;
6096 int32_t bocsuBytesWritten = 0;
6097 // We always need to do identical on
6098 // the NFD form of the string.
6099 if(normIter == NULL) {
6100 // we arrived from the level below and
6101 // normalization was not turned on.
6102 // therefore, we need to make a fresh NFD iterator
6103 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6104 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6105 } else if(!doingIdenticalFromStart) {
6106 // there is an iterator, but we did some other levels.
6107 // therefore, we have a FCD iterator - need to make
6108 // a NFD one.
6109 // normIter being at the beginning does not guarantee
6110 // that the underlying iterator is at the beginning
6111 iter->move(iter, 0, UITER_START);
6112 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6113 }
6114 // At this point we have a NFD iterator that is positioned
6115 // in the right place
6116 if(U_FAILURE(*status)) {
6117 UTRACE_EXIT_STATUS(*status);
6118 return 0;
6119 }
6120 first = uiter_previous32(s.iterator);
6121 // maybe we're at the start of the string
6122 if(first == U_SENTINEL) {
6123 first = 0;
6124 } else {
6125 uiter_next32(s.iterator);
6126 }
6127
6128 j = 0;
6129 for(;;) {
6130 if(i == count) {
6131 if(j+1 < bocsuBytesWritten) {
6132 bocsuBytesUsed = j+1;
6133 }
6134 goto saveState;
6135 }
6136
6137 // On identical level, we will always save
6138 // the state if we reach this point, since
6139 // we don't depend on getNextCE for content
6140 // all the content is in our buffer and we
6141 // already either stored the full buffer OR
6142 // otherwise we won't arrive here.
6143 newState = s.iterator->getState(s.iterator);
6144 if(newState != UITER_NO_STATE) {
6145 iterState = newState;
6146 cces = 0;
6147 }
6148
6149 uint8_t buff[4];
6150 second = uiter_next32(s.iterator);
6151 cces++;
6152
6153 // end condition for identical level
6154 if(second == U_SENTINEL) {
6155 terminatePSKLevel(level, maxLevel, i, dest);
6156 level = UCOL_PSK_NULL;
6157 break;
6158 }
6159 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6160 first = second;
6161
6162 j = 0;
6163 if(bocsuBytesUsed != 0) {
6164 while(bocsuBytesUsed-->0) {
6165 j++;
6166 }
6167 }
6168
6169 while(i < count && j < bocsuBytesWritten) {
6170 dest[i++] = buff[j++];
6171 }
6172 }
6173
6174 } else {
6175 level = UCOL_PSK_NULL;
6176 }
6177 /* fall through to next level */
6178 case UCOL_PSK_NULL:
6179 j = i;
6180 while(j<count) {
6181 dest[j++]=0;
6182 }
6183 break;
6184 default:
6185 *status = U_INTERNAL_PROGRAM_ERROR;
6186 UTRACE_EXIT_STATUS(*status);
6187 return 0;
6188 }
6189
6190 saveState:
6191 // Now we need to return stuff. First we want to see whether we have
6192 // done everything for the current state of iterator.
6193 if(byteCountOrFrenchDone
6194 || canUpdateState == FALSE
6195 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) {
6196 // Any of above mean that the previous transaction
6197 // wasn't finished and that we should store the
6198 // previous iterator state.
6199 state[0] = iterState;
6200 } else {
6201 // The transaction is complete. We will continue in the next iteration.
6202 state[0] = s.iterator->getState(s.iterator);
6203 cces = 0;
6204 }
6205 // Store the number of bocsu bytes written.
6206 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6207 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6208 }
6209 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6210
6211 // Next we put in the level of comparison
6212 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6213
6214 // If we are doing French, we need to store whether we have just finished the French level
6215 if(level == UCOL_PSK_SECONDARY && doingFrench) {
6216 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6217 } else {
6218 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6219 }
6220
6221 // Was the latest CE shifted
6222 if(wasShifted) {
6223 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6224 }
6225 // Check for cces overflow
6226 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6227 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6228 }
6229 // Store cces
6230 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6231
6232 // Check for French overflow
6233 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6234 *status = U_INDEX_OUTOFBOUNDS_ERROR;
6235 }
6236 // Store number of bytes written in the French secondary continuation sequence
6237 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6238
6239
6240 // If we have used normalizing iterator, get rid of it
6241 if(normIter != NULL) {
6242 unorm_closeIter(normIter);
6243 }
6244
6245 // Return number of meaningful sortkey bytes.
6246 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6247 dest,i, state[0], state[1]);
6248 UTRACE_EXIT_VALUE(i);
6249 return i;
6250 }
6251
6252 /**
6253 * Produce a bound for a given sortkey and a number of levels.
6254 */
6255 U_CAPI int32_t U_EXPORT2
6256 ucol_getBound(const uint8_t *source,
6257 int32_t sourceLength,
6258 UColBoundMode boundType,
6259 uint32_t noOfLevels,
6260 uint8_t *result,
6261 int32_t resultLength,
6262 UErrorCode *status) {
6263 // consistency checks
6264 if(status == NULL || U_FAILURE(*status)) {
6265 return 0;
6266 }
6267 if(source == NULL) {
6268 *status = U_ILLEGAL_ARGUMENT_ERROR;
6269 return 0;
6270 }
6271
6272 int32_t sourceIndex = 0;
6273 // Scan the string until we skip enough of the key OR reach the end of the key
6274 do {
6275 sourceIndex++;
6276 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6277 noOfLevels--;
6278 }
6279 } while (noOfLevels > 0
6280 && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6281
6282 if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6283 && noOfLevels > 0) {
6284 *status = U_SORT_KEY_TOO_SHORT_WARNING;
6285 }
6286
6287
6288 // READ ME: this code assumes that the values for boundType
6289 // enum will not changes. They are set so that the enum value
6290 // corresponds to the number of extra bytes each bound type
6291 // needs.
6292 if(result != NULL && resultLength >= sourceIndex+boundType) {
6293 uprv_memcpy(result, source, sourceIndex);
6294 switch(boundType) {
6295 // Lower bound just gets terminated. No extra bytes
6296 case UCOL_BOUND_LOWER: // = 0
6297 break;
6298 // Upper bound needs one extra byte
6299 case UCOL_BOUND_UPPER: // = 1
6300 result[sourceIndex++] = 2;
6301 break;
6302 // Upper long bound needs two extra bytes
6303 case UCOL_BOUND_UPPER_LONG: // = 2
6304 result[sourceIndex++] = 0xFF;
6305 result[sourceIndex++] = 0xFF;
6306 break;
6307 default:
6308 *status = U_ILLEGAL_ARGUMENT_ERROR;
6309 return 0;
6310 }
6311 result[sourceIndex++] = 0;
6312
6313 return sourceIndex;
6314 } else {
6315 return sourceIndex+boundType+1;
6316 }
6317 }
6318
6319 /****************************************************************************/
6320 /* Following are the functions that deal with the properties of a collator */
6321 /* there are new APIs and some compatibility APIs */
6322 /****************************************************************************/
6323
6324 static inline void
6325 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6326 int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6327 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6328 UBool reverseSecondary = FALSE;
6329 if(!isContinuation(CE)) {
6330 tertiary = (uint8_t)((CE & coll->tertiaryMask));
6331 tertiary ^= coll->caseSwitch;
6332 reverseSecondary = TRUE;
6333 } else {
6334 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6335 tertiary &= UCOL_REMOVE_CASE;
6336 reverseSecondary = FALSE;
6337 }
6338
6339 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6340 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6341 primary1 = (uint8_t)(CE >> 8);
6342
6343 if(primary1 != 0) {
6344 coll->latinOneCEs[ch] |= (primary1 << *primShift);
6345 *primShift -= 8;
6346 }
6347 if(primary2 != 0) {
6348 if(*primShift < 0) {
6349 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6350 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6351 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6352 return;
6353 }
6354 coll->latinOneCEs[ch] |= (primary2 << *primShift);
6355 *primShift -= 8;
6356 }
6357 if(secondary != 0) {
6358 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6359 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6360 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6361 } else { // normal case
6362 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6363 }
6364 *secShift -= 8;
6365 }
6366 if(tertiary != 0) {
6367 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6368 *terShift -= 8;
6369 }
6370 }
6371
6372 static inline UBool
6373 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6374 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6375 if(newTable == NULL) {
6376 *status = U_MEMORY_ALLOCATION_ERROR;
6377 coll->latinOneFailed = TRUE;
6378 return FALSE;
6379 }
6380 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6381 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6382 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6383 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6384 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6385 coll->latinOneTableLen = size;
6386 uprv_free(coll->latinOneCEs);
6387 coll->latinOneCEs = newTable;
6388 return TRUE;
6389 }
6390
6391 static UBool
6392 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6393 UBool result = TRUE;
6394 if(coll->latinOneCEs == NULL) {
6395 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6396 if(coll->latinOneCEs == NULL) {
6397 *status = U_MEMORY_ALLOCATION_ERROR;
6398 return FALSE;
6399 }
6400 coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6401 }
6402 UChar ch = 0;
6403 UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6404 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6405
6406 int32_t primShift = 24, secShift = 24, terShift = 24;
6407 uint32_t CE = 0;
6408 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6409
6410 // TODO: make safe if you get more than you wanted...
6411 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6412 primShift = 24; secShift = 24; terShift = 24;
6413 if(ch < 0x100) {
6414 CE = coll->latinOneMapping[ch];
6415 } else {
6416 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6417 if(CE == UCOL_NOT_FOUND && coll->UCA) {
6418 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6419 }
6420 }
6421 if(CE < UCOL_NOT_FOUND) {
6422 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6423 } else {
6424 switch (getCETag(CE)) {
6425 case EXPANSION_TAG:
6426 case DIGIT_TAG:
6427 ucol_setText(it, &ch, 1, status);
6428 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6429 if(primShift < 0 || secShift < 0 || terShift < 0) {
6430 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6431 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6432 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6433 break;
6434 }
6435 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6436 }
6437 break;
6438 case CONTRACTION_TAG:
6439 // here is the trick
6440 // F2 is contraction. We do something very similar to contractions
6441 // but have two indices, one in the real contraction table and the
6442 // other to where we stuffed things. This hopes that we don't have
6443 // many contractions (this should work for latin-1 tables).
6444 {
6445 if((CE & 0x00FFF000) != 0) {
6446 *status = U_UNSUPPORTED_ERROR;
6447 goto cleanup_after_failure;
6448 }
6449
6450 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6451
6452 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6453
6454 coll->latinOneCEs[ch] = CE;
6455 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6456 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6457
6458 // We're going to jump into contraction table, pick the elements
6459 // and use them
6460 do {
6461 CE = *(coll->contractionCEs +
6462 (UCharOffset - coll->contractionIndex));
6463 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6464 uint32_t size;
6465 uint32_t i; /* general counter */
6466 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6467 size = getExpansionCount(CE);
6468 //CE = *CEOffset++;
6469 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6470 for(i = 0; i<size; i++) {
6471 if(primShift < 0 || secShift < 0 || terShift < 0) {
6472 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6473 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6474 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6475 break;
6476 }
6477 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6478 }
6479 } else { /* else, we do */
6480 while(*CEOffset != 0) {
6481 if(primShift < 0 || secShift < 0 || terShift < 0) {
6482 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6483 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6484 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6485 break;
6486 }
6487 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6488 }
6489 }
6490 contractionOffset++;
6491 } else if(CE < UCOL_NOT_FOUND) {
6492 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6493 } else {
6494 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6495 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6496 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6497 contractionOffset++;
6498 }
6499 UCharOffset++;
6500 primShift = 24; secShift = 24; terShift = 24;
6501 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6502 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6503 goto cleanup_after_failure;
6504 }
6505 }
6506 } while(*UCharOffset != 0xFFFF);
6507 }
6508 break;
6509 default:
6510 goto cleanup_after_failure;
6511 }
6512 }
6513 }
6514 // compact table
6515 if(contractionOffset < coll->latinOneTableLen) {
6516 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6517 goto cleanup_after_failure;
6518 }
6519 }
6520 ucol_closeElements(it);
6521 return result;
6522
6523 cleanup_after_failure:
6524 // status should already be set before arriving here.
6525 coll->latinOneFailed = TRUE;
6526 ucol_closeElements(it);
6527 return FALSE;
6528 }
6529
6530 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6531 if(U_SUCCESS(*status)) {
6532 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6533 coll->caseSwitch = UCOL_CASE_SWITCH;
6534 } else {
6535 coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6536 }
6537
6538 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6539 coll->tertiaryMask = UCOL_REMOVE_CASE;
6540 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6541 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6542 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6543 coll->tertiaryBottom = UCOL_COMMON_BOT3;
6544 } else {
6545 coll->tertiaryMask = UCOL_KEEP_CASE;
6546 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6547 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6548 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6549 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6550 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6551 } else {
6552 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6553 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6554 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6555 }
6556 }
6557
6558 /* Set the compression values */
6559 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6560 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6561 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6562
6563 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6564 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6565 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6566 } else {
6567 coll->sortKeyGen = ucol_calcSortKey;
6568 }
6569 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6570 && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6571 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6572 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6573 //fprintf(stderr, "F");
6574 coll->latinOneUse = TRUE;
6575 } else {
6576 coll->latinOneUse = FALSE;
6577 }
6578 if(*status == U_UNSUPPORTED_ERROR) {
6579 *status = U_ZERO_ERROR;
6580 }
6581 } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6582 coll->latinOneUse = TRUE;
6583 }
6584 } else {
6585 coll->latinOneUse = FALSE;
6586 }
6587 }
6588 }
6589
6590 U_CAPI uint32_t U_EXPORT2
6591 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6592 if(U_FAILURE(*status) || coll == NULL) {
6593 return 0;
6594 }
6595 if(len == -1) {
6596 len = u_strlen(varTop);
6597 }
6598 if(len == 0) {
6599 *status = U_ILLEGAL_ARGUMENT_ERROR;
6600 return 0;
6601 }
6602
6603 collIterate s;
6604 IInit_collIterate(coll, varTop, len, &s);
6605
6606 uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6607
6608 /* here we check if we have consumed all characters */
6609 /* you can put in either one character or a contraction */
6610 /* you shouldn't put more... */
6611 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6612 *status = U_CE_NOT_FOUND_ERROR;
6613 return 0;
6614 }
6615
6616 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6617
6618 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6619 *status = U_PRIMARY_TOO_LONG_ERROR;
6620 return 0;
6621 }
6622 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6623 coll->variableTopValueisDefault = FALSE;
6624 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6625 }
6626
6627 return CE & UCOL_PRIMARYMASK;
6628 }
6629
6630 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6631 if(U_FAILURE(*status) || coll == NULL) {
6632 return 0;
6633 }
6634 return coll->variableTopValue<<16;
6635 }
6636
6637 U_CAPI void U_EXPORT2
6638 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6639 if(U_FAILURE(*status) || coll == NULL) {
6640 return;
6641 }
6642
6643 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6644 coll->variableTopValueisDefault = FALSE;
6645 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6646 }
6647 }
6648 /* Attribute setter API */
6649 U_CAPI void U_EXPORT2
6650 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6651 if(U_FAILURE(*status) || coll == NULL) {
6652 return;
6653 }
6654 UColAttributeValue oldFrench = coll->frenchCollation;
6655 UColAttributeValue oldCaseFirst = coll->caseFirst;
6656 switch(attr) {
6657 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6658 if(value == UCOL_ON) {
6659 coll->numericCollation = UCOL_ON;
6660 coll->numericCollationisDefault = FALSE;
6661 } else if (value == UCOL_OFF) {
6662 coll->numericCollation = UCOL_OFF;
6663 coll->numericCollationisDefault = FALSE;
6664 } else if (value == UCOL_DEFAULT) {
6665 coll->numericCollationisDefault = TRUE;
6666 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6667 } else {
6668 *status = U_ILLEGAL_ARGUMENT_ERROR;
6669 }
6670 break;
6671 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6672 if(value == UCOL_ON) {
6673 coll->hiraganaQ = UCOL_ON;
6674 coll->hiraganaQisDefault = FALSE;
6675 } else if (value == UCOL_OFF) {
6676 coll->hiraganaQ = UCOL_OFF;
6677 coll->hiraganaQisDefault = FALSE;
6678 } else if (value == UCOL_DEFAULT) {
6679 coll->hiraganaQisDefault = TRUE;
6680 coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6681 } else {
6682 *status = U_ILLEGAL_ARGUMENT_ERROR;
6683 }
6684 break;
6685 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6686 if(value == UCOL_ON) {
6687 coll->frenchCollation = UCOL_ON;
6688 coll->frenchCollationisDefault = FALSE;
6689 } else if (value == UCOL_OFF) {
6690 coll->frenchCollation = UCOL_OFF;
6691 coll->frenchCollationisDefault = FALSE;
6692 } else if (value == UCOL_DEFAULT) {
6693 coll->frenchCollationisDefault = TRUE;
6694 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6695 } else {
6696 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6697 }
6698 break;
6699 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6700 if(value == UCOL_SHIFTED) {
6701 coll->alternateHandling = UCOL_SHIFTED;
6702 coll->alternateHandlingisDefault = FALSE;
6703 } else if (value == UCOL_NON_IGNORABLE) {
6704 coll->alternateHandling = UCOL_NON_IGNORABLE;
6705 coll->alternateHandlingisDefault = FALSE;
6706 } else if (value == UCOL_DEFAULT) {
6707 coll->alternateHandlingisDefault = TRUE;
6708 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6709 } else {
6710 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6711 }
6712 break;
6713 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6714 if(value == UCOL_LOWER_FIRST) {
6715 coll->caseFirst = UCOL_LOWER_FIRST;
6716 coll->caseFirstisDefault = FALSE;
6717 } else if (value == UCOL_UPPER_FIRST) {
6718 coll->caseFirst = UCOL_UPPER_FIRST;
6719 coll->caseFirstisDefault = FALSE;
6720 } else if (value == UCOL_OFF) {
6721 coll->caseFirst = UCOL_OFF;
6722 coll->caseFirstisDefault = FALSE;
6723 } else if (value == UCOL_DEFAULT) {
6724 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6725 coll->caseFirstisDefault = TRUE;
6726 } else {
6727 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6728 }
6729 break;
6730 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6731 if(value == UCOL_ON) {
6732 coll->caseLevel = UCOL_ON;
6733 coll->caseLevelisDefault = FALSE;
6734 } else if (value == UCOL_OFF) {
6735 coll->caseLevel = UCOL_OFF;
6736 coll->caseLevelisDefault = FALSE;
6737 } else if (value == UCOL_DEFAULT) {
6738 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6739 coll->caseLevelisDefault = TRUE;
6740 } else {
6741 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6742 }
6743 break;
6744 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6745 if(value == UCOL_ON) {
6746 coll->normalizationMode = UCOL_ON;
6747 coll->normalizationModeisDefault = FALSE;
6748 } else if (value == UCOL_OFF) {
6749 coll->normalizationMode = UCOL_OFF;
6750 coll->normalizationModeisDefault = FALSE;
6751 } else if (value == UCOL_DEFAULT) {
6752 coll->normalizationModeisDefault = TRUE;
6753 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6754 } else {
6755 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6756 }
6757 break;
6758 case UCOL_STRENGTH: /* attribute for strength */
6759 if (value == UCOL_DEFAULT) {
6760 coll->strengthisDefault = TRUE;
6761 coll->strength = (UColAttributeValue)coll->options->strength;
6762 } else if (value <= UCOL_IDENTICAL) {
6763 coll->strengthisDefault = FALSE;
6764 coll->strength = value;
6765 } else {
6766 *status = U_ILLEGAL_ARGUMENT_ERROR ;
6767 }
6768 break;
6769 case UCOL_ATTRIBUTE_COUNT:
6770 default:
6771 *status = U_ILLEGAL_ARGUMENT_ERROR;
6772 break;
6773 }
6774 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6775 coll->latinOneRegenTable = TRUE;
6776 } else {
6777 coll->latinOneRegenTable = FALSE;
6778 }
6779 ucol_updateInternalState(coll, status);
6780 }
6781
6782 U_CAPI UColAttributeValue U_EXPORT2
6783 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6784 if(U_FAILURE(*status) || coll == NULL) {
6785 return UCOL_DEFAULT;
6786 }
6787 switch(attr) {
6788 case UCOL_NUMERIC_COLLATION:
6789 return coll->numericCollation;
6790 case UCOL_HIRAGANA_QUATERNARY_MODE:
6791 return coll->hiraganaQ;
6792 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6793 return coll->frenchCollation;
6794 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6795 return coll->alternateHandling;
6796 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6797 return coll->caseFirst;
6798 case UCOL_CASE_LEVEL: /* do we have an extra case level */
6799 return coll->caseLevel;
6800 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6801 return coll->normalizationMode;
6802 case UCOL_STRENGTH: /* attribute for strength */
6803 return coll->strength;
6804 case UCOL_ATTRIBUTE_COUNT:
6805 default:
6806 *status = U_ILLEGAL_ARGUMENT_ERROR;
6807 break;
6808 }
6809 return UCOL_DEFAULT;
6810 }
6811
6812 U_CAPI void U_EXPORT2
6813 ucol_setStrength( UCollator *coll,
6814 UCollationStrength strength)
6815 {
6816 UErrorCode status = U_ZERO_ERROR;
6817 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6818 }
6819
6820 U_CAPI UCollationStrength U_EXPORT2
6821 ucol_getStrength(const UCollator *coll)
6822 {
6823 UErrorCode status = U_ZERO_ERROR;
6824 return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6825 }
6826
6827 /****************************************************************************/
6828 /* Following are misc functions */
6829 /* there are new APIs and some compatibility APIs */
6830 /****************************************************************************/
6831
6832 U_CAPI void U_EXPORT2
6833 ucol_getVersion(const UCollator* coll,
6834 UVersionInfo versionInfo)
6835 {
6836 /* RunTime version */
6837 uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6838 /* Builder version*/
6839 uint8_t bdVersion = coll->image->version[0];
6840
6841 /* Charset Version. Need to get the version from cnv files
6842 * makeconv should populate cnv files with version and
6843 * an api has to be provided in ucnv.h to obtain this version
6844 */
6845 uint8_t csVersion = 0;
6846
6847 /* combine the version info */
6848 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6849
6850 /* Tailoring rules */
6851 versionInfo[0] = (uint8_t)(cmbVersion>>8);
6852 versionInfo[1] = (uint8_t)cmbVersion;
6853 versionInfo[2] = coll->image->version[1];
6854 if(coll->UCA) {
6855 versionInfo[3] = coll->UCA->image->UCAVersion[0];
6856 } else {
6857 versionInfo[3] = 0;
6858 }
6859 }
6860
6861
6862 /* This internal API checks whether a character is tailored or not */
6863 U_CAPI UBool U_EXPORT2
6864 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6865 uint32_t CE = UCOL_NOT_FOUND;
6866 const UChar *ContractionStart = NULL;
6867 if(U_SUCCESS(*status) && coll != NULL) {
6868 if(coll == coll->UCA) {
6869 return FALSE;
6870 } else if(u < 0x100) { /* latin-1 */
6871 CE = coll->latinOneMapping[u];
6872 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6873 return FALSE;
6874 }
6875 } else { /* regular */
6876 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6877 }
6878
6879 if(isContraction(CE)) {
6880 ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6881 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6882 }
6883
6884 if(CE == UCOL_NOT_FOUND) {
6885 return FALSE;
6886 } else {
6887 return TRUE;
6888 }
6889 } else {
6890 return FALSE;
6891 }
6892 }
6893
6894
6895 /****************************************************************************/
6896 /* Following are the string compare functions */
6897 /* */
6898 /****************************************************************************/
6899
6900
6901 /* ucol_checkIdent internal function. Does byte level string compare. */
6902 /* Used by strcoll if strength == identical and strings */
6903 /* are otherwise equal. Moved out-of-line because this */
6904 /* is a rare case. */
6905 /* */
6906 /* Comparison must be done on NFD normalized strings. */
6907 /* FCD is not good enough. */
6908 /* */
6909 /* TODO: make an incremental NFD Comparison function, which could */
6910 /* be of general use */
6911
6912 static
6913 UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6914 {
6915
6916 // TODO: When we have an UChar iterator, we need to access the whole string. One
6917 // useful modification would be a UChar iterator extract API, since reset next next...
6918 // is not optimal.
6919 // TODO: Handle long strings. Do the same in compareUsingSortKeys.
6920
6921 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6922 // of same type, but that doesn't really mean that it will stay that way.
6923
6924 // The division for the array length may truncate the array size to
6925 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6926 // for all platforms anyway.
6927 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6928 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6929 //UChar sStackBuf[256], tStackBuf[256];
6930 //int32_t sBufSize = 256, tBufSize = 256;
6931 int32_t comparison;
6932 int32_t sLen = 0;
6933 UChar *sBuf = NULL;
6934 int32_t tLen = 0;
6935 UChar *tBuf = NULL;
6936 UBool freeSBuf = FALSE, freeTBuf = FALSE;
6937
6938 if (sColl->flags & UCOL_USE_ITERATOR) {
6939 UNormIterator *sNIt = NULL, *tNIt = NULL;
6940 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6941 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6942 sColl->iterator->move(sColl->iterator, 0, UITER_START);
6943 tColl->iterator->move(tColl->iterator, 0, UITER_START);
6944 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6945 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6946 comparison = u_strCompareIter(sIt, tIt, TRUE);
6947 unorm_closeIter(sNIt);
6948 unorm_closeIter(tNIt);
6949 } else {
6950 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
6951 sBuf = sColl->string;
6952 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
6953 tBuf = tColl->string;
6954
6955 if (normalize) {
6956 *status = U_ZERO_ERROR;
6957 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
6958 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6959 sBuf, sLen,
6960 FALSE, 0,
6961 status);
6962 if(*status == U_BUFFER_OVERFLOW_ERROR) {
6963 if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
6964 &sColl->writableBuffer,
6965 (int32_t *)&sColl->writableBufSize, sLen,
6966 0)
6967 ) {
6968 *status = U_MEMORY_ALLOCATION_ERROR;
6969 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
6970 }
6971 *status = U_ZERO_ERROR;
6972 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6973 sBuf, sLen,
6974 FALSE, 0,
6975 status);
6976 }
6977 if(freeSBuf) {
6978 uprv_free(sBuf);
6979 freeSBuf = FALSE;
6980 }
6981 sBuf = sColl->writableBuffer;
6982 if (sBuf != sColl->stackWritableBuffer) {
6983 sColl->flags |= UCOL_ITER_ALLOCATED;
6984 }
6985 }
6986
6987 *status = U_ZERO_ERROR;
6988 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
6989 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
6990 tBuf, tLen,
6991 FALSE, 0,
6992 status);
6993 if(*status == U_BUFFER_OVERFLOW_ERROR) {
6994 if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
6995 &tColl->writableBuffer,
6996 (int32_t *)&tColl->writableBufSize, tLen,
6997 0)
6998 ) {
6999 *status = U_MEMORY_ALLOCATION_ERROR;
7000 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7001 }
7002 *status = U_ZERO_ERROR;
7003 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7004 tBuf, tLen,
7005 FALSE, 0,
7006 status);
7007 }
7008 if(freeTBuf) {
7009 uprv_free(tBuf);
7010 freeTBuf = FALSE;
7011 }
7012 tBuf = tColl->writableBuffer;
7013 if (tBuf != tColl->stackWritableBuffer) {
7014 tColl->flags |= UCOL_ITER_ALLOCATED;
7015 }
7016 }
7017 }
7018
7019 if (sLen == -1 && tLen == -1) {
7020 comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7021 } else {
7022 if (sLen == -1) {
7023 sLen = u_strlen(sBuf);
7024 }
7025 if (tLen == -1) {
7026 tLen = u_strlen(tBuf);
7027 }
7028 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7029 if (comparison == 0) {
7030 comparison = sLen - tLen;
7031 }
7032 }
7033 }
7034
7035 if (comparison < 0) {
7036 return UCOL_LESS;
7037 } else if (comparison == 0) {
7038 return UCOL_EQUAL;
7039 } else /* comparison > 0 */ {
7040 return UCOL_GREATER;
7041 }
7042 }
7043
7044 /* CEBuf - A struct and some inline functions to handle the saving */
7045 /* of CEs in a buffer within ucol_strcoll */
7046
7047 #define UCOL_CEBUF_SIZE 512
7048 typedef struct ucol_CEBuf {
7049 uint32_t *buf;
7050 uint32_t *endp;
7051 uint32_t *pos;
7052 uint32_t localArray[UCOL_CEBUF_SIZE];
7053 } ucol_CEBuf;
7054
7055
7056 static
7057 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7058 (b)->buf = (b)->pos = (b)->localArray;
7059 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7060 }
7061
7062 static
7063 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7064 uint32_t oldSize;
7065 uint32_t newSize;
7066 uint32_t *newBuf;
7067
7068 ci->flags |= UCOL_ITER_ALLOCATED;
7069 oldSize = b->pos - b->buf;
7070 newSize = oldSize * 2;
7071 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7072 if(newBuf != NULL) {
7073 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7074 if (b->buf != b->localArray) {
7075 uprv_free(b->buf);
7076 }
7077 b->buf = newBuf;
7078 b->endp = b->buf + newSize;
7079 b->pos = b->buf + oldSize;
7080 }
7081 }
7082
7083 static
7084 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7085 if (b->pos == b->endp) {
7086 ucol_CEBuf_Expand(b, ci);
7087 }
7088 *(b)->pos++ = ce;
7089 }
7090
7091 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7092 /* It is used when compare gets in trouble and needs to bail out */
7093 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7094 collIterate *tColl,
7095 UErrorCode *status)
7096 {
7097 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7098 uint8_t *sourceKeyP = sourceKey;
7099 uint8_t *targetKeyP = targetKey;
7100 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7101 const UCollator *coll = sColl->coll;
7102 UChar *source = NULL;
7103 UChar *target = NULL;
7104 int32_t result = UCOL_EQUAL;
7105 UChar sStackBuf[256], tStackBuf[256];
7106 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7107 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7108
7109 // TODO: Handle long strings. Do the same in ucol_checkIdent.
7110 if(sColl->flags & UCOL_USE_ITERATOR) {
7111 sColl->iterator->move(sColl->iterator, 0, UITER_START);
7112 tColl->iterator->move(tColl->iterator, 0, UITER_START);
7113 source = sStackBuf;
7114 UChar *sBufp = source;
7115 target = tStackBuf;
7116 UChar *tBufp = target;
7117 while(sColl->iterator->hasNext(sColl->iterator)) {
7118 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7119 }
7120 while(tColl->iterator->hasNext(tColl->iterator)) {
7121 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7122 }
7123 sourceLength = sBufp - source;
7124 targetLength = tBufp - target;
7125 } else { // no iterators
7126 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7127 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7128 source = sColl->string;
7129 target = tColl->string;
7130 }
7131
7132
7133
7134 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7135 if(sourceKeyLen > UCOL_MAX_BUFFER) {
7136 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7137 if(sourceKeyP == NULL) {
7138 *status = U_MEMORY_ALLOCATION_ERROR;
7139 goto cleanup_and_do_compare;
7140 }
7141 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7142 }
7143
7144 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7145 if(targetKeyLen > UCOL_MAX_BUFFER) {
7146 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7147 if(targetKeyP == NULL) {
7148 *status = U_MEMORY_ALLOCATION_ERROR;
7149 goto cleanup_and_do_compare;
7150 }
7151 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7152 }
7153
7154 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7155
7156 cleanup_and_do_compare:
7157 if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7158 uprv_free(sourceKeyP);
7159 }
7160
7161 if(targetKeyP != NULL && targetKeyP != targetKey) {
7162 uprv_free(targetKeyP);
7163 }
7164
7165 if(result<0) {
7166 return UCOL_LESS;
7167 } else if(result>0) {
7168 return UCOL_GREATER;
7169 } else {
7170 return UCOL_EQUAL;
7171 }
7172 }
7173
7174
7175 static inline UCollationResult
7176 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7177 // const UCollator *coll,
7178 // const UChar *source,
7179 // int32_t sourceLength,
7180 // const UChar *target,
7181 // int32_t targetLength,
7182 UErrorCode *status)
7183 {
7184 U_ALIGN_CODE(16);
7185
7186 const UCollator *coll = sColl->coll;
7187
7188
7189 // setting up the collator parameters
7190 UColAttributeValue strength = coll->strength;
7191 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY);
7192
7193 UBool checkSecTer = initialCheckSecTer;
7194 UBool checkTertiary = (strength >= UCOL_TERTIARY);
7195 UBool checkQuad = (strength >= UCOL_QUATERNARY);
7196 UBool checkIdent = (strength == UCOL_IDENTICAL);
7197 UBool checkCase = (coll->caseLevel == UCOL_ON);
7198 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7199 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7200 UBool qShifted = shifted && checkQuad;
7201 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7202
7203 if(doHiragana && shifted) {
7204 return (ucol_compareUsingSortKeys(sColl, tColl, status));
7205 }
7206 uint8_t caseSwitch = coll->caseSwitch;
7207 uint8_t tertiaryMask = coll->tertiaryMask;
7208
7209 // This is the lowest primary value that will not be ignored if shifted
7210 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7211
7212 UCollationResult result = UCOL_EQUAL;
7213 UCollationResult hirResult = UCOL_EQUAL;
7214
7215 // Preparing the CE buffers. They will be filled during the primary phase
7216 ucol_CEBuf sCEs;
7217 ucol_CEBuf tCEs;
7218 UCOL_INIT_CEBUF(&sCEs);
7219 UCOL_INIT_CEBUF(&tCEs);
7220
7221 uint32_t secS = 0, secT = 0;
7222 uint32_t sOrder=0, tOrder=0;
7223
7224 // Non shifted primary processing is quite simple
7225 if(!shifted) {
7226 for(;;) {
7227
7228 // We fetch CEs until we hit a non ignorable primary or end.
7229 do {
7230 // We get the next CE
7231 sOrder = ucol_IGetNextCE(coll, sColl, status);
7232 // Stuff it in the buffer
7233 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7234 // And keep just the primary part.
7235 sOrder &= UCOL_PRIMARYMASK;
7236 } while(sOrder == 0);
7237
7238 // see the comments on the above block
7239 do {
7240 tOrder = ucol_IGetNextCE(coll, tColl, status);
7241 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7242 tOrder &= UCOL_PRIMARYMASK;
7243 } while(tOrder == 0);
7244
7245 // if both primaries are the same
7246 if(sOrder == tOrder) {
7247 // and there are no more CEs, we advance to the next level
7248 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7249 break;
7250 }
7251 if(doHiragana && hirResult == UCOL_EQUAL) {
7252 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7253 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7254 ? UCOL_LESS:UCOL_GREATER;
7255 }
7256 }
7257 } else {
7258 // if two primaries are different, we are done
7259 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER;
7260 goto commonReturn;
7261 }
7262 } // no primary difference... do the rest from the buffers
7263 } else { // shifted - do a slightly more complicated processing :)
7264 for(;;) {
7265 UBool sInShifted = FALSE;
7266 UBool tInShifted = FALSE;
7267 // This version of code can be refactored. However, it seems easier to understand this way.
7268 // Source loop. Sam as the target loop.
7269 for(;;) {
7270 sOrder = ucol_IGetNextCE(coll, sColl, status);
7271 if(sOrder == UCOL_NO_MORE_CES) {
7272 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7273 break;
7274 } else if(sOrder == 0
7275 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7276 /* UCA amendment - ignore ignorables that follow shifted code points */
7277 continue;
7278 } else if(isContinuation(sOrder)) {
7279 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7280 if(sInShifted) {
7281 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7282 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7283 continue;
7284 } else {
7285 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7286 break;
7287 }
7288 } else { /* Just lower level values */
7289 if(sInShifted) {
7290 continue;
7291 } else {
7292 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7293 continue;
7294 }
7295 }
7296 } else { /* regular */
7297 if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7298 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7299 break;
7300 } else {
7301 if((sOrder & UCOL_PRIMARYMASK) > 0) {
7302 sInShifted = TRUE;
7303 sOrder &= UCOL_PRIMARYMASK;
7304 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7305 continue;
7306 } else {
7307 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7308 sInShifted = FALSE;
7309 continue;
7310 }
7311 }
7312 }
7313 }
7314 sOrder &= UCOL_PRIMARYMASK;
7315 sInShifted = FALSE;
7316
7317 for(;;) {
7318 tOrder = ucol_IGetNextCE(coll, tColl, status);
7319 if(tOrder == UCOL_NO_MORE_CES) {
7320 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7321 break;
7322 } else if(tOrder == 0
7323 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7324 /* UCA amendment - ignore ignorables that follow shifted code points */
7325 continue;
7326 } else if(isContinuation(tOrder)) {
7327 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7328 if(tInShifted) {
7329 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7330 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7331 continue;
7332 } else {
7333 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7334 break;
7335 }
7336 } else { /* Just lower level values */
7337 if(tInShifted) {
7338 continue;
7339 } else {
7340 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7341 continue;
7342 }
7343 }
7344 } else { /* regular */
7345 if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7346 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7347 break;
7348 } else {
7349 if((tOrder & UCOL_PRIMARYMASK) > 0) {
7350 tInShifted = TRUE;
7351 tOrder &= UCOL_PRIMARYMASK;
7352 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7353 continue;
7354 } else {
7355 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7356 tInShifted = FALSE;
7357 continue;
7358 }
7359 }
7360 }
7361 }
7362 tOrder &= UCOL_PRIMARYMASK;
7363 tInShifted = FALSE;
7364
7365 if(sOrder == tOrder) {
7366 /*
7367 if(doHiragana && hirResult == UCOL_EQUAL) {
7368 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7369 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7370 ? UCOL_LESS:UCOL_GREATER;
7371 }
7372 }
7373 */
7374 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7375 break;
7376 } else {
7377 sOrder = 0; tOrder = 0;
7378 continue;
7379 }
7380 } else {
7381 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7382 goto commonReturn;
7383 }
7384 } /* no primary difference... do the rest from the buffers */
7385 }
7386
7387 /* now, we're gonna reexamine collected CEs */
7388 uint32_t *sCE;
7389 uint32_t *tCE;
7390
7391 /* This is the secondary level of comparison */
7392 if(checkSecTer) {
7393 if(!isFrenchSec) { /* normal */
7394 sCE = sCEs.buf;
7395 tCE = tCEs.buf;
7396 for(;;) {
7397 while (secS == 0) {
7398 secS = *(sCE++) & UCOL_SECONDARYMASK;
7399 }
7400
7401 while(secT == 0) {
7402 secT = *(tCE++) & UCOL_SECONDARYMASK;
7403 }
7404
7405 if(secS == secT) {
7406 if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7407 break;
7408 } else {
7409 secS = 0; secT = 0;
7410 continue;
7411 }
7412 } else {
7413 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7414 goto commonReturn;
7415 }
7416 }
7417 } else { /* do the French */
7418 uint32_t *sCESave = NULL;
7419 uint32_t *tCESave = NULL;
7420 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7421 tCE = tCEs.pos-2;
7422 for(;;) {
7423 while (secS == 0 && sCE >= sCEs.buf) {
7424 if(sCESave == 0) {
7425 secS = *(sCE--);
7426 if(isContinuation(secS)) {
7427 while(isContinuation(secS = *(sCE--)));
7428 /* after this, secS has the start of continuation, and sCEs points before that */
7429 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7430 sCE+=2; /* need to point to the first continuation CP */
7431 /* However, now you can just continue doing stuff */
7432 }
7433 } else {
7434 secS = *(sCE++);
7435 if(!isContinuation(secS)) { /* This means we have finished with this cont */
7436 sCE = sCESave; /* reset the pointer to before continuation */
7437 sCESave = 0;
7438 continue;
7439 }
7440 }
7441 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7442 }
7443
7444 while(secT == 0 && tCE >= tCEs.buf) {
7445 if(tCESave == 0) {
7446 secT = *(tCE--);
7447 if(isContinuation(secT)) {
7448 while(isContinuation(secT = *(tCE--)));
7449 /* after this, secS has the start of continuation, and sCEs points before that */
7450 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7451 tCE+=2; /* need to point to the first continuation CP */
7452 /* However, now you can just continue doing stuff */
7453 }
7454 } else {
7455 secT = *(tCE++);
7456 if(!isContinuation(secT)) { /* This means we have finished with this cont */
7457 tCE = tCESave; /* reset the pointer to before continuation */
7458 tCESave = 0;
7459 continue;
7460 }
7461 }
7462 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7463 }
7464
7465 if(secS == secT) {
7466 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7467 break;
7468 } else {
7469 secS = 0; secT = 0;
7470 continue;
7471 }
7472 } else {
7473 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7474 goto commonReturn;
7475 }
7476 }
7477 }
7478 }
7479
7480 /* doing the case bit */
7481 if(checkCase) {
7482 sCE = sCEs.buf;
7483 tCE = tCEs.buf;
7484 for(;;) {
7485 while((secS & UCOL_REMOVE_CASE) == 0) {
7486 if(!isContinuation(*sCE++)) {
7487 secS =*(sCE-1);
7488 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7489 // primary ignorables should not be considered on the case level when the strength is primary
7490 // otherwise, the CEs stop being well-formed
7491 secS &= UCOL_TERT_CASE_MASK;
7492 secS ^= caseSwitch;
7493 } else {
7494 secS = 0;
7495 }
7496 } else {
7497 secS = 0;
7498 }
7499 }
7500
7501 while((secT & UCOL_REMOVE_CASE) == 0) {
7502 if(!isContinuation(*tCE++)) {
7503 secT = *(tCE-1);
7504 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7505 // primary ignorables should not be considered on the case level when the strength is primary
7506 // otherwise, the CEs stop being well-formed
7507 secT &= UCOL_TERT_CASE_MASK;
7508 secT ^= caseSwitch;
7509 } else {
7510 secT = 0;
7511 }
7512 } else {
7513 secT = 0;
7514 }
7515 }
7516
7517 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7518 result = UCOL_LESS;
7519 goto commonReturn;
7520 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7521 result = UCOL_GREATER;
7522 goto commonReturn;
7523 }
7524
7525 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7526 break;
7527 } else {
7528 secS = 0;
7529 secT = 0;
7530 }
7531 }
7532 }
7533
7534 /* Tertiary level */
7535 if(checkTertiary) {
7536 secS = 0;
7537 secT = 0;
7538 sCE = sCEs.buf;
7539 tCE = tCEs.buf;
7540 for(;;) {
7541 while((secS & UCOL_REMOVE_CASE) == 0) {
7542 secS = *(sCE++) & tertiaryMask;
7543 if(!isContinuation(secS)) {
7544 secS ^= caseSwitch;
7545 } else {
7546 secS &= UCOL_REMOVE_CASE;
7547 }
7548 }
7549
7550 while((secT & UCOL_REMOVE_CASE) == 0) {
7551 secT = *(tCE++) & tertiaryMask;
7552 if(!isContinuation(secT)) {
7553 secT ^= caseSwitch;
7554 } else {
7555 secT &= UCOL_REMOVE_CASE;
7556 }
7557 }
7558
7559 if(secS == secT) {
7560 if((secS & UCOL_REMOVE_CASE) == 1) {
7561 break;
7562 } else {
7563 secS = 0; secT = 0;
7564 continue;
7565 }
7566 } else {
7567 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7568 goto commonReturn;
7569 }
7570 }
7571 }
7572
7573
7574 if(qShifted /*checkQuad*/) {
7575 UBool sInShifted = TRUE;
7576 UBool tInShifted = TRUE;
7577 secS = 0;
7578 secT = 0;
7579 sCE = sCEs.buf;
7580 tCE = tCEs.buf;
7581 for(;;) {
7582 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7583 secS = *(sCE++);
7584 if(isContinuation(secS)) {
7585 if(!sInShifted) {
7586 continue;
7587 }
7588 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7589 secS = UCOL_PRIMARYMASK;
7590 sInShifted = FALSE;
7591 } else {
7592 sInShifted = TRUE;
7593 }
7594 }
7595 secS &= UCOL_PRIMARYMASK;
7596
7597
7598 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7599 secT = *(tCE++);
7600 if(isContinuation(secT)) {
7601 if(!tInShifted) {
7602 continue;
7603 }
7604 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7605 secT = UCOL_PRIMARYMASK;
7606 tInShifted = FALSE;
7607 } else {
7608 tInShifted = TRUE;
7609 }
7610 }
7611 secT &= UCOL_PRIMARYMASK;
7612
7613 if(secS == secT) {
7614 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7615 break;
7616 } else {
7617 secS = 0; secT = 0;
7618 continue;
7619 }
7620 } else {
7621 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7622 goto commonReturn;
7623 }
7624 }
7625 } else if(doHiragana && hirResult != UCOL_EQUAL) {
7626 // If we're fine on quaternaries, we might be different
7627 // on Hiragana. This, however, might fail us in shifted.
7628 result = hirResult;
7629 goto commonReturn;
7630 }
7631
7632 /* For IDENTICAL comparisons, we use a bitwise character comparison */
7633 /* as a tiebreaker if all else is equal. */
7634 /* Getting here should be quite rare - strings are not identical - */
7635 /* that is checked first, but compared == through all other checks. */
7636 if(checkIdent)
7637 {
7638 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7639 result = ucol_checkIdent(sColl, tColl, TRUE, status);
7640 }
7641
7642 commonReturn:
7643 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7644 freeHeapWritableBuffer(sColl);
7645 freeHeapWritableBuffer(tColl);
7646
7647 if (sCEs.buf != sCEs.localArray ) {
7648 uprv_free(sCEs.buf);
7649 }
7650 if (tCEs.buf != tCEs.localArray ) {
7651 uprv_free(tCEs.buf);
7652 }
7653 }
7654
7655 return result;
7656 }
7657
7658
7659 static inline uint32_t
7660 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7661 uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7662 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7663 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7664 int32_t offset = 1;
7665 UChar schar = 0, tchar = 0;
7666
7667 for(;;) {
7668 if(len == -1) {
7669 if(s[*index] == 0) { // end of string
7670 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7671 } else {
7672 schar = s[*index];
7673 }
7674 } else {
7675 if(*index == len) {
7676 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7677 } else {
7678 schar = s[*index];
7679 }
7680 }
7681
7682 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7683 offset++;
7684 }
7685
7686 if (schar == tchar) {
7687 (*index)++;
7688 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7689 }
7690 else
7691 {
7692 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7693 return UCOL_BAIL_OUT_CE;
7694 }
7695 // skip completely ignorables
7696 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7697 if(isZeroCE == 0) { // we have to ignore completely ignorables
7698 (*index)++;
7699 continue;
7700 }
7701
7702 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7703 }
7704 }
7705 }
7706
7707
7708 /**
7709 * This is a fast strcoll, geared towards text in Latin-1.
7710 * It supports contractions of size two, French secondaries
7711 * and case switching. You can use it with strengths primary
7712 * to tertiary. It does not support shifted and case level.
7713 * It relies on the table build by setupLatin1Table. If it
7714 * doesn't understand something, it will go to the regular
7715 * strcoll.
7716 */
7717 static inline UCollationResult
7718 ucol_strcollUseLatin1( const UCollator *coll,
7719 const UChar *source,
7720 int32_t sLen,
7721 const UChar *target,
7722 int32_t tLen,
7723 UErrorCode *status)
7724 {
7725 U_ALIGN_CODE(16);
7726 int32_t strength = coll->strength;
7727
7728 int32_t sIndex = 0, tIndex = 0;
7729 UChar sChar = 0, tChar = 0;
7730 uint32_t sOrder=0, tOrder=0;
7731
7732 UBool endOfSource = FALSE;
7733
7734 uint32_t *elements = coll->latinOneCEs;
7735
7736 UBool haveContractions = FALSE; // if we have contractions in our string
7737 // we cannot do French secondary
7738
7739 // Do the primary level
7740 for(;;) {
7741 while(sOrder==0) { // this loop skips primary ignorables
7742 // sOrder=getNextlatinOneCE(source);
7743 if(sLen==-1) { // handling zero terminated strings
7744 sChar=source[sIndex++];
7745 if(sChar==0) {
7746 endOfSource = TRUE;
7747 break;
7748 }
7749 } else { // handling strings with known length
7750 if(sIndex==sLen) {
7751 endOfSource = TRUE;
7752 break;
7753 }
7754 sChar=source[sIndex++];
7755 }
7756 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7757 //fprintf(stderr, "R");
7758 goto returnRegular;
7759 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7760 }
7761 sOrder = elements[sChar];
7762 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7763 // specials can basically be either contractions or bail-out signs. If we get anything
7764 // else, we'll bail out anywasy
7765 if(getCETag(sOrder) == CONTRACTION_TAG) {
7766 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7767 haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7768 // However, if there are contractions in the table, but we always use just one char,
7769 // we might be able to do French. This should be checked out.
7770 }
7771 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7772 //fprintf(stderr, "S");
7773 goto returnRegular;
7774 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7775 }
7776 }
7777 }
7778
7779 while(tOrder==0) { // this loop skips primary ignorables
7780 // tOrder=getNextlatinOneCE(target);
7781 if(tLen==-1) { // handling zero terminated strings
7782 tChar=target[tIndex++];
7783 if(tChar==0) {
7784 if(endOfSource) { // this is different than source loop,
7785 // as we already know that source loop is done here,
7786 // so we can either finish the primary loop if both
7787 // strings are done or anounce the result if only
7788 // target is done. Same below.
7789 goto endOfPrimLoop;
7790 } else {
7791 return UCOL_GREATER;
7792 }
7793 }
7794 } else { // handling strings with known length
7795 if(tIndex==tLen) {
7796 if(endOfSource) {
7797 goto endOfPrimLoop;
7798 } else {
7799 return UCOL_GREATER;
7800 }
7801 }
7802 tChar=target[tIndex++];
7803 }
7804 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7805 //fprintf(stderr, "R");
7806 goto returnRegular;
7807 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7808 }
7809 tOrder = elements[tChar];
7810 if(tOrder >= UCOL_NOT_FOUND) {
7811 // Handling specials, see the comments for source
7812 if(getCETag(tOrder) == CONTRACTION_TAG) {
7813 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7814 haveContractions = TRUE;
7815 }
7816 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7817 //fprintf(stderr, "S");
7818 goto returnRegular;
7819 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7820 }
7821 }
7822 }
7823 if(endOfSource) { // source is finished, but target is not, say the result.
7824 return UCOL_LESS;
7825 }
7826
7827 if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7828 sOrder = 0; tOrder = 0;
7829 continue;
7830 } else {
7831 // compare current top bytes
7832 if(((sOrder^tOrder)&0xFF000000)!=0) {
7833 // top bytes differ, return difference
7834 if(sOrder < tOrder) {
7835 return UCOL_LESS;
7836 } else if(sOrder > tOrder) {
7837 return UCOL_GREATER;
7838 }
7839 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7840 // since we must return enum value
7841 }
7842
7843 // top bytes match, continue with following bytes
7844 sOrder<<=8;
7845 tOrder<<=8;
7846 }
7847 }
7848
7849 endOfPrimLoop:
7850 // after primary loop, we definitely know the sizes of strings,
7851 // so we set it and use simpler loop for secondaries and tertiaries
7852 sLen = sIndex; tLen = tIndex;
7853 if(strength >= UCOL_SECONDARY) {
7854 // adjust the table beggining
7855 elements += coll->latinOneTableLen;
7856 endOfSource = FALSE;
7857
7858 if(coll->frenchCollation == UCOL_OFF) { // non French
7859 // This loop is a simplified copy of primary loop
7860 // at this point we know that whole strings are latin-1, so we don't
7861 // check for that. We also know that we only have contractions as
7862 // specials.
7863 sIndex = 0; tIndex = 0;
7864 for(;;) {
7865 while(sOrder==0) {
7866 if(sIndex==sLen) {
7867 endOfSource = TRUE;
7868 break;
7869 }
7870 sChar=source[sIndex++];
7871 sOrder = elements[sChar];
7872 if(sOrder > UCOL_NOT_FOUND) {
7873 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7874 }
7875 }
7876
7877 while(tOrder==0) {
7878 if(tIndex==tLen) {
7879 if(endOfSource) {
7880 goto endOfSecLoop;
7881 } else {
7882 return UCOL_GREATER;
7883 }
7884 }
7885 tChar=target[tIndex++];
7886 tOrder = elements[tChar];
7887 if(tOrder > UCOL_NOT_FOUND) {
7888 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7889 }
7890 }
7891 if(endOfSource) {
7892 return UCOL_LESS;
7893 }
7894
7895 if(sOrder == tOrder) {
7896 sOrder = 0; tOrder = 0;
7897 continue;
7898 } else {
7899 // see primary loop for comments on this
7900 if(((sOrder^tOrder)&0xFF000000)!=0) {
7901 if(sOrder < tOrder) {
7902 return UCOL_LESS;
7903 } else if(sOrder > tOrder) {
7904 return UCOL_GREATER;
7905 }
7906 }
7907 sOrder<<=8;
7908 tOrder<<=8;
7909 }
7910 }
7911 } else { // French
7912 if(haveContractions) { // if we have contractions, we have to bail out
7913 // since we don't really know how to handle them here
7914 goto returnRegular;
7915 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7916 }
7917 // For French, we go backwards
7918 sIndex = sLen; tIndex = tLen;
7919 for(;;) {
7920 while(sOrder==0) {
7921 if(sIndex==0) {
7922 endOfSource = TRUE;
7923 break;
7924 }
7925 sChar=source[--sIndex];
7926 sOrder = elements[sChar];
7927 // don't even look for contractions
7928 }
7929
7930 while(tOrder==0) {
7931 if(tIndex==0) {
7932 if(endOfSource) {
7933 goto endOfSecLoop;
7934 } else {
7935 return UCOL_GREATER;
7936 }
7937 }
7938 tChar=target[--tIndex];
7939 tOrder = elements[tChar];
7940 // don't even look for contractions
7941 }
7942 if(endOfSource) {
7943 return UCOL_LESS;
7944 }
7945
7946 if(sOrder == tOrder) {
7947 sOrder = 0; tOrder = 0;
7948 continue;
7949 } else {
7950 // see the primary loop for comments
7951 if(((sOrder^tOrder)&0xFF000000)!=0) {
7952 if(sOrder < tOrder) {
7953 return UCOL_LESS;
7954 } else if(sOrder > tOrder) {
7955 return UCOL_GREATER;
7956 }
7957 }
7958 sOrder<<=8;
7959 tOrder<<=8;
7960 }
7961 }
7962 }
7963 }
7964
7965 endOfSecLoop:
7966 if(strength >= UCOL_TERTIARY) {
7967 // tertiary loop is the same as secondary (except no French)
7968 elements += coll->latinOneTableLen;
7969 sIndex = 0; tIndex = 0;
7970 endOfSource = FALSE;
7971 for(;;) {
7972 while(sOrder==0) {
7973 if(sIndex==sLen) {
7974 endOfSource = TRUE;
7975 break;
7976 }
7977 sChar=source[sIndex++];
7978 sOrder = elements[sChar];
7979 if(sOrder > UCOL_NOT_FOUND) {
7980 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7981 }
7982 }
7983 while(tOrder==0) {
7984 if(tIndex==tLen) {
7985 if(endOfSource) {
7986 return UCOL_EQUAL; // if both strings are at the end, they are equal
7987 } else {
7988 return UCOL_GREATER;
7989 }
7990 }
7991 tChar=target[tIndex++];
7992 tOrder = elements[tChar];
7993 if(tOrder > UCOL_NOT_FOUND) {
7994 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7995 }
7996 }
7997 if(endOfSource) {
7998 return UCOL_LESS;
7999 }
8000 if(sOrder == tOrder) {
8001 sOrder = 0; tOrder = 0;
8002 continue;
8003 } else {
8004 if(((sOrder^tOrder)&0xff000000)!=0) {
8005 if(sOrder < tOrder) {
8006 return UCOL_LESS;
8007 } else if(sOrder > tOrder) {
8008 return UCOL_GREATER;
8009 }
8010 }
8011 sOrder<<=8;
8012 tOrder<<=8;
8013 }
8014 }
8015 }
8016 return UCOL_EQUAL;
8017
8018 returnRegular:
8019 // Preparing the context objects for iterating over strings
8020 collIterate sColl, tColl;
8021
8022 IInit_collIterate(coll, source, sLen, &sColl);
8023 IInit_collIterate(coll, target, tLen, &tColl);
8024 return ucol_strcollRegular(&sColl, &tColl, status);
8025 }
8026
8027
8028 U_CAPI UCollationResult U_EXPORT2
8029 ucol_strcollIter( const UCollator *coll,
8030 UCharIterator *sIter,
8031 UCharIterator *tIter,
8032 UErrorCode *status) {
8033 if(!status || U_FAILURE(*status)) {
8034 return UCOL_EQUAL;
8035 }
8036
8037 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8038 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8039
8040 if (sIter == tIter) {
8041 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8042 return UCOL_EQUAL;
8043 }
8044 if(sIter == NULL || tIter == NULL || coll == NULL) {
8045 *status = U_ILLEGAL_ARGUMENT_ERROR;
8046 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8047 return UCOL_EQUAL;
8048 }
8049
8050 UCollationResult result = UCOL_EQUAL;
8051
8052 // Preparing the context objects for iterating over strings
8053 collIterate sColl, tColl;
8054 // The division for the array length may truncate the array size to
8055 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8056 // for all platforms anyway.
8057 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8058 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8059 UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8060
8061 IInit_collIterate(coll, NULL, -1, &sColl);
8062 sColl.iterator = sIter;
8063 sColl.flags |= UCOL_USE_ITERATOR;
8064 IInit_collIterate(coll, NULL, -1, &tColl);
8065 tColl.flags |= UCOL_USE_ITERATOR;
8066 tColl.iterator = tIter;
8067
8068 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8069 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8070 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8071 sColl.flags &= ~UCOL_ITER_NORM;
8072
8073 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8074 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8075 tColl.flags &= ~UCOL_ITER_NORM;
8076 }
8077
8078 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8079
8080 while((sChar = sColl.iterator->next(sColl.iterator)) ==
8081 (tChar = tColl.iterator->next(tColl.iterator))) {
8082 if(sChar == U_SENTINEL) {
8083 result = UCOL_EQUAL;
8084 goto end_compare;
8085 }
8086 }
8087
8088 if(sChar == U_SENTINEL) {
8089 tChar = tColl.iterator->previous(tColl.iterator);
8090 }
8091
8092 if(tChar == U_SENTINEL) {
8093 sChar = sColl.iterator->previous(sColl.iterator);
8094 }
8095
8096 sChar = sColl.iterator->previous(sColl.iterator);
8097 tChar = tColl.iterator->previous(tColl.iterator);
8098
8099 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8100 {
8101 // We are stopped in the middle of a contraction.
8102 // Scan backwards through the == part of the string looking for the start of the contraction.
8103 // It doesn't matter which string we scan, since they are the same in this region.
8104 do
8105 {
8106 sChar = sColl.iterator->previous(sColl.iterator);
8107 tChar = tColl.iterator->previous(tColl.iterator);
8108 }
8109 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8110 }
8111
8112
8113 if(U_SUCCESS(*status)) {
8114 result = ucol_strcollRegular(&sColl, &tColl, status);
8115 }
8116
8117 end_compare:
8118 if(sNormIter || tNormIter) {
8119 unorm_closeIter(sNormIter);
8120 unorm_closeIter(tNormIter);
8121 }
8122
8123 UTRACE_EXIT_VALUE_STATUS(result, *status)
8124 return result;
8125 }
8126
8127
8128
8129 /* */
8130 /* ucol_strcoll Main public API string comparison function */
8131 /* */
8132 U_CAPI UCollationResult U_EXPORT2
8133 ucol_strcoll( const UCollator *coll,
8134 const UChar *source,
8135 int32_t sourceLength,
8136 const UChar *target,
8137 int32_t targetLength) {
8138 U_ALIGN_CODE(16);
8139
8140 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8141 if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8142 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8143 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8144 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8145 }
8146
8147 UErrorCode status = U_ZERO_ERROR;
8148 if(source == NULL || target == NULL) {
8149 // do not crash, but return. Should have
8150 // status argument to return error.
8151 UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
8152 return UCOL_EQUAL;
8153 }
8154 collIterate sColl, tColl;
8155
8156 /* Scan the strings. Find: */
8157 /* The length of any leading portion that is equal */
8158 /* Whether they are exactly equal. (in which case we just return) */
8159 const UChar *pSrc = source;
8160 const UChar *pTarg = target;
8161 int32_t equalLength;
8162
8163 if (sourceLength == -1 && targetLength == -1) {
8164 // Both strings are null terminated.
8165 // Check for them being the same string, and scan through
8166 // any leading equal portion.
8167 if (source==target) {
8168 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8169 return UCOL_EQUAL;
8170 }
8171
8172 for (;;) {
8173 if ( *pSrc != *pTarg || *pSrc == 0) {
8174 break;
8175 }
8176 pSrc++;
8177 pTarg++;
8178 }
8179 if (*pSrc == 0 && *pTarg == 0) {
8180 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8181 return UCOL_EQUAL;
8182 }
8183 equalLength = pSrc - source;
8184 }
8185 else
8186 {
8187 // One or both strings has an explicit length.
8188 /* check if source and target are same strings */
8189
8190 if (source==target && sourceLength==targetLength) {
8191 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8192 return UCOL_EQUAL;
8193 }
8194 const UChar *pSrcEnd = source + sourceLength;
8195 const UChar *pTargEnd = target + targetLength;
8196
8197
8198 // Scan while the strings are bitwise ==, or until one is exhausted.
8199 for (;;) {
8200 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8201 break;
8202 }
8203 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8204 break;
8205 }
8206 if (*pSrc != *pTarg) {
8207 break;
8208 }
8209 pSrc++;
8210 pTarg++;
8211 }
8212 equalLength = pSrc - source;
8213
8214 // If we made it all the way through both strings, we are done. They are ==
8215 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. */
8216 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) { /* and also at end of dest string */
8217 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8218 return UCOL_EQUAL;
8219 }
8220 }
8221 if (equalLength > 0) {
8222 /* There is an identical portion at the beginning of the two strings. */
8223 /* If the identical portion ends within a contraction or a comibining */
8224 /* character sequence, back up to the start of that sequence. */
8225 pSrc = source + equalLength; /* point to the first differing chars */
8226 pTarg = target + equalLength;
8227 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8228 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8229 {
8230 // We are stopped in the middle of a contraction.
8231 // Scan backwards through the == part of the string looking for the start of the contraction.
8232 // It doesn't matter which string we scan, since they are the same in this region.
8233 do
8234 {
8235 equalLength--;
8236 pSrc--;
8237 }
8238 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8239 }
8240
8241 source += equalLength;
8242 target += equalLength;
8243 if (sourceLength > 0) {
8244 sourceLength -= equalLength;
8245 }
8246 if (targetLength > 0) {
8247 targetLength -= equalLength;
8248 }
8249 }
8250
8251 UCollationResult returnVal;
8252 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8253 // Preparing the context objects for iterating over strings
8254 IInit_collIterate(coll, source, sourceLength, &sColl);
8255 IInit_collIterate(coll, target, targetLength, &tColl);
8256 returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8257 } else {
8258 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8259 }
8260 UTRACE_EXIT_VALUE(returnVal);
8261 return returnVal;
8262 }
8263
8264 /* convenience function for comparing strings */
8265 U_CAPI UBool U_EXPORT2
8266 ucol_greater( const UCollator *coll,
8267 const UChar *source,
8268 int32_t sourceLength,
8269 const UChar *target,
8270 int32_t targetLength)
8271 {
8272 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8273 == UCOL_GREATER);
8274 }
8275
8276 /* convenience function for comparing strings */
8277 U_CAPI UBool U_EXPORT2
8278 ucol_greaterOrEqual( const UCollator *coll,
8279 const UChar *source,
8280 int32_t sourceLength,
8281 const UChar *target,
8282 int32_t targetLength)
8283 {
8284 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8285 != UCOL_LESS);
8286 }
8287
8288 /* convenience function for comparing strings */
8289 U_CAPI UBool U_EXPORT2
8290 ucol_equal( const UCollator *coll,
8291 const UChar *source,
8292 int32_t sourceLength,
8293 const UChar *target,
8294 int32_t targetLength)
8295 {
8296 return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8297 == UCOL_EQUAL);
8298 }
8299
8300 U_CAPI void U_EXPORT2
8301 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8302 if(coll && coll->UCA) {
8303 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8304 }
8305 }
8306
8307 U_CAPI int32_t U_EXPORT2
8308 ucol_cloneBinary(const UCollator *coll,
8309 uint8_t *buffer, int32_t capacity,
8310 UErrorCode *status)
8311 {
8312 int32_t length = 0;
8313 if(U_FAILURE(*status)) {
8314 return length;
8315 }
8316 if(capacity < 0) {
8317 *status = U_ILLEGAL_ARGUMENT_ERROR;
8318 return length;
8319 }
8320 if(coll->hasRealData == TRUE) {
8321 length = coll->image->size;
8322 if(length <= capacity) {
8323 uprv_memcpy(buffer, coll->image, length);
8324 } else {
8325 *status = U_BUFFER_OVERFLOW_ERROR;
8326 }
8327 } else {
8328 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
8329 if(length <= capacity) {
8330 /* build the UCATableHeader with minimal entries */
8331 /* do not copy the header from the UCA file because its values are wrong! */
8332 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
8333
8334 /* reset everything */
8335 uprv_memset(buffer, 0, length);
8336
8337 /* set the tailoring-specific values */
8338 UCATableHeader *myData = (UCATableHeader *)buffer;
8339 myData->size = length;
8340
8341 /* offset for the options, the only part of the data that is present after the header */
8342 myData->options = sizeof(UCATableHeader);
8343
8344 /* need to always set the expansion value for an upper bound of the options */
8345 myData->expansion = myData->options + sizeof(UColOptionSet);
8346
8347 myData->magic = UCOL_HEADER_MAGIC;
8348 myData->isBigEndian = U_IS_BIG_ENDIAN;
8349 myData->charSetFamily = U_CHARSET_FAMILY;
8350
8351 /* copy UCA's version; genrb will override all but the builder version with tailoring data */
8352 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
8353
8354 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
8355 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
8356 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
8357 myData->jamoSpecial = coll->image->jamoSpecial;
8358
8359 /* copy the collator options */
8360 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
8361 } else {
8362 *status = U_BUFFER_OVERFLOW_ERROR;
8363 }
8364 }
8365 return length;
8366 }
8367
8368 U_CAPI void U_EXPORT2
8369 ucol_forgetUCA(void)
8370 {
8371 _staticUCA = NULL;
8372 UCA_DATA_MEM = NULL;
8373 }
8374
8375 #endif /* #if !UCONFIG_NO_COLLATION */
8376