icuSources/i18n/ucol.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 1996-2006, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  ucol.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 * Modification history
  12 * Date        Name      Comments
  13 * 1996-1999   various members of ICU team maintained C API for collation framework
  14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
  15 * 03/01/2001  synwee    Added maxexpansion functionality.
  16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "uassert.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/coleitr.h"
  25 #include "unicode/unorm.h"
  26 #include "unicode/udata.h"
  27 #include "unicode/ustring.h"
  28
  29 #include "ucol_imp.h"
  30 #include "ucol_elm.h"
  31 #include "bocsu.h"
  32
  33 #include "unormimp.h"
  34 #include "unorm_it.h"
  35 #include "umutex.h"
  36 #include "cmemory.h"
  37 #include "ucln_in.h"
  38 #include "cstring.h"
  39 #include "utracimp.h"
  40 #include "putilimp.h"
  41
  42 #ifdef UCOL_DEBUG
  43 #include <stdio.h>
  44 #endif
  45
  46 U_NAMESPACE_USE
  47
  48 /* Shift and mask constants for trie manipulation (added by synwee). */
  49 #define STAGE_1_SHIFT_            10
  50 #define STAGE_2_SHIFT_            4
  51 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
  52 #define STAGE_3_MASK_             0xF
  53 #define LAST_BYTE_MASK_           0xFF
  54 #define SECOND_LAST_BYTE_SHIFT_   8
  55
  56 #define ZERO_CC_LIMIT_            0xC0
  57
  58 // The static UCA collator. There is only one, and collators do not use it.
  59 // It is referenced only in ucol_initUCA and ucol_cleanup.
  60 static UCollator* _staticUCA = NULL;
  61 // Static pointer to the udata memory. Initialized in ucol_initUCA and
  62 // used for cleanup in ucol_cleanup.
  63 static UDataMemory* UCA_DATA_MEM = NULL;
  64
  65 // This is a static pointer to the normalizer's fcdTrieIndex.
  66 // It is always the same between calls to u_cleanup,
  67 // and therefore writing to it is not synchronized.
  68 // It is cleaned up in ucol_cleanup.
  69 static const uint16_t *fcdTrieIndex=NULL;
  70
  71 // These are values from the UCA that are required for
  72 // implicit generation and for suppressing sort key compression.
  73 // They should normally be taken from the UCA, but if one
  74 // is running without the UCA, that could be a problem.
  75 static int32_t maxRegularPrimary  = 0xA0;
  76 static int32_t minImplicitPrimary = 0xE0;
  77 static int32_t maxImplicitPrimary = 0xE4;
  78
  79 U_CDECL_BEGIN
  80 static UBool U_CALLCONV
  81 isAcceptableUCA(void * /*context*/,
  82              const char * /*type*/, const char * /*name*/,
  83              const UDataInfo *pInfo){
  84   /* context, type & name are intentionally not used */
  85     if( pInfo->size>=20 &&
  86         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  87         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  88         pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 &&   /* dataFormat="UCol" */
  89         pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
  90         pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
  91         pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
  92         pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
  93         pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
  94         //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
  95         //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
  96         //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
  97         ) {
  98         UVersionInfo UCDVersion;
  99         u_getUnicodeVersion(UCDVersion);
 100         if(pInfo->dataVersion[0]==UCDVersion[0] &&
 101           pInfo->dataVersion[1]==UCDVersion[1]) { // &&
 102         //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
 103         //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
 104           return TRUE;
 105         } else {
 106           return FALSE;
 107         }
 108     } else {
 109         return FALSE;
 110     }
 111 }
 112
 113
 114 static int32_t U_CALLCONV
 115 _getFoldingOffset(uint32_t data) {
 116     return (int32_t)(data&0xFFFFFF);
 117 }
 118
 119 U_CDECL_END
 120
 121 static
 122 inline void  IInit_collIterate(const UCollator *collator, const UChar *sourceString,
 123                               int32_t sourceLen, collIterate *s) {
 124     (s)->string = (s)->pos = (UChar *)(sourceString);
 125     (s)->origFlags = 0;
 126     (s)->flags = 0;
 127     if (sourceLen >= 0) {
 128         s->flags |= UCOL_ITER_HASLEN;
 129         (s)->endp = (UChar *)sourceString+sourceLen;
 130     }
 131     else {
 132         /* change to enable easier checking for end of string for fcdpositon */
 133         (s)->endp = NULL;
 134     }
 135     (s)->CEpos = (s)->toReturn = (s)->CEs;
 136     (s)->writableBuffer = (s)->stackWritableBuffer;
 137     (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
 138     (s)->coll = (collator);
 139     (s)->fcdPosition = 0;
 140     if(collator->normalizationMode == UCOL_ON) {
 141         (s)->flags |= UCOL_ITER_NORM;
 142     }
 143     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
 144       (s)->flags |= UCOL_HIRAGANA_Q;
 145     }
 146     (s)->iterator = NULL;
 147     //(s)->iteratorIndex = 0;
 148 }
 149
 150 U_CAPI void  U_EXPORT2
 151 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
 152                              int32_t sourceLen, collIterate *s){
 153     /* Out-of-line version for use from other files. */
 154     IInit_collIterate(collator, sourceString, sourceLen, s);
 155 }
 156
 157
 158 /**
 159 * Backup the state of the collIterate struct data
 160 * @param data collIterate to backup
 161 * @param backup storage
 162 */
 163 static
 164 inline void backupState(const collIterate *data, collIterateState *backup)
 165 {
 166     backup->fcdPosition = data->fcdPosition;
 167     backup->flags       = data->flags;
 168     backup->origFlags   = data->origFlags;
 169     backup->pos         = data->pos;
 170     backup->bufferaddress = data->writableBuffer;
 171     backup->buffersize    = data->writableBufSize;
 172     backup->iteratorMove = 0;
 173     backup->iteratorIndex = 0;
 174     if(data->iterator != NULL) {
 175         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
 176         backup->iteratorIndex = data->iterator->getState(data->iterator);
 177         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
 178         if(backup->iteratorIndex == UITER_NO_STATE) {
 179             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
 180                 backup->iteratorMove++;
 181                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
 182             }
 183             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 184         }
 185     }
 186 }
 187
 188 /**
 189 * Loads the state into the collIterate struct data
 190 * @param data collIterate to backup
 191 * @param backup storage
 192 * @param forwards boolean to indicate if forwards iteration is used,
 193 *        false indicates backwards iteration
 194 */
 195 static
 196 inline void loadState(collIterate *data, const collIterateState *backup,
 197                       UBool        forwards)
 198 {
 199     UErrorCode status = U_ZERO_ERROR;
 200     data->flags       = backup->flags;
 201     data->origFlags   = backup->origFlags;
 202     if(data->iterator != NULL) {
 203         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
 204         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
 205         if(backup->iteratorMove != 0) {
 206             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 207         }
 208     }
 209     data->pos         = backup->pos;
 210     if ((data->flags & UCOL_ITER_INNORMBUF) &&
 211         data->writableBuffer != backup->bufferaddress) {
 212         /*
 213         this is when a new buffer has been reallocated and we'll have to
 214         calculate the new position.
 215         note the new buffer has to contain the contents of the old buffer.
 216         */
 217         if (forwards) {
 218             data->pos = data->writableBuffer +
 219                                          (data->pos - backup->bufferaddress);
 220         }
 221         else {
 222             /* backwards direction */
 223             uint32_t temp = backup->buffersize -
 224                                   (data->pos - backup->bufferaddress);
 225             data->pos = data->writableBuffer + (data->writableBufSize - temp);
 226         }
 227     }
 228     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
 229         /*
 230         this is alittle tricky.
 231         if we are initially not in the normalization buffer, even if we
 232         normalize in the later stage, the data in the buffer will be
 233         ignored, since we skip back up to the data string.
 234         however if we are already in the normalization buffer, any
 235         further normalization will pull data into the normalization
 236         buffer and modify the fcdPosition.
 237         since we are keeping the data in the buffer for use, the
 238         fcdPosition can not be reverted back.
 239         arrgghh....
 240         */
 241         data->fcdPosition = backup->fcdPosition;
 242     }
 243 }
 244
 245
 246 /*
 247 * collIter_eos()
 248 *     Checks for a collIterate being positioned at the end of
 249 *     its source string.
 250 *
 251 */
 252 static
 253 inline UBool collIter_eos(collIterate *s) {
 254     if(s->flags & UCOL_USE_ITERATOR) {
 255       return !(s->iterator->hasNext(s->iterator));
 256     }
 257     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
 258         // Null terminated string, but not at null, so not at end.
 259         //   Whether in main or normalization buffer doesn't matter.
 260         return FALSE;
 261     }
 262
 263     // String with length.  Can't be in normalization buffer, which is always
 264     //  null termintated.
 265     if (s->flags & UCOL_ITER_HASLEN) {
 266         return (s->pos == s->endp);
 267     }
 268
 269     // We are at a null termination, could be either normalization buffer or main string.
 270     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
 271         // At null at end of main string.
 272         return TRUE;
 273     }
 274
 275     // At null at end of normalization buffer.  Need to check whether there there are
 276     //   any characters left in the main buffer.
 277     if(s->origFlags & UCOL_USE_ITERATOR) {
 278       return !(s->iterator->hasNext(s->iterator));
 279     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
 280         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
 281         return (*s->fcdPosition == 0);
 282     }
 283     else {
 284         // Main string with an end pointer.
 285         return s->fcdPosition == s->endp;
 286     }
 287 }
 288
 289 /*
 290 * collIter_bos()
 291 *     Checks for a collIterate being positioned at the start of
 292 *     its source string.
 293 *
 294 */
 295 static
 296 inline UBool collIter_bos(collIterate *source) {
 297   // if we're going backwards, we need to know whether there is more in the
 298   // iterator, even if we are in the side buffer
 299   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 300     return !source->iterator->hasPrevious(source->iterator);
 301   }
 302   if (source->pos <= source->string ||
 303       ((source->flags & UCOL_ITER_INNORMBUF) &&
 304       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
 305     return TRUE;
 306   }
 307   return FALSE;
 308 }
 309
 310 static
 311 inline UBool collIter_SimpleBos(collIterate *source) {
 312   // if we're going backwards, we need to know whether there is more in the
 313   // iterator, even if we are in the side buffer
 314   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 315     return !source->iterator->hasPrevious(source->iterator);
 316   }
 317   if (source->pos == source->string) {
 318     return TRUE;
 319   }
 320   return FALSE;
 321 }
 322     //return (data->pos == data->string) ||
 323
 324
 325 /**
 326 * Checks and free writable buffer if it is not the original stack buffer
 327 * in collIterate. This function does not reassign the writable buffer.
 328 * @param data collIterate struct to determine and free the writable buffer
 329 */
 330 static
 331 inline void freeHeapWritableBuffer(collIterate *data)
 332 {
 333     if (data->writableBuffer != data->stackWritableBuffer) {
 334         uprv_free(data->writableBuffer);
 335     }
 336 }
 337
 338
 339 /****************************************************************************/
 340 /* Following are the open/close functions                                   */
 341 /*                                                                          */
 342 /****************************************************************************/
 343
 344 static UCollator*
 345 ucol_initFromBinary(const uint8_t *bin, int32_t length,
 346                 const UCollator *base,
 347                 UCollator *fillIn,
 348                 UErrorCode *status)
 349 {
 350     UCollator *result = fillIn;
 351     if(U_FAILURE(*status)) {
 352         return NULL;
 353     }
 354     /*
 355     if(base == NULL) {
 356         // we don't support null base yet
 357         *status = U_ILLEGAL_ARGUMENT_ERROR;
 358         return NULL;
 359     }
 360     */
 361     // We need these and we could be running without UCA
 362     uprv_uca_initImplicitConstants(0, 0, status);
 363     UCATableHeader *colData = (UCATableHeader *)bin;
 364     // do we want version check here? We're trying to figure out whether collators are compatible
 365     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
 366         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
 367         colData->version[0] != UCOL_BUILDER_VERSION)
 368     {
 369         *status = U_COLLATOR_VERSION_MISMATCH;
 370         return NULL;
 371     }
 372     else {
 373         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
 374             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
 375             if(U_FAILURE(*status)){
 376                 return NULL;
 377             }
 378             result->hasRealData = TRUE;
 379         }
 380         else {
 381             if(base) {
 382                 result = ucol_initCollator(base->image, result, base, status);
 383                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
 384                 if(U_FAILURE(*status)){
 385                     return NULL;
 386                 }
 387                 result->hasRealData = FALSE;
 388             }
 389             else {
 390                 *status = U_USELESS_COLLATOR_ERROR;
 391                 return NULL;
 392             }
 393         }
 394         result->freeImageOnClose = FALSE;
 395     }
 396     result->validLocale = NULL;
 397     result->requestedLocale = NULL;
 398     result->rules = NULL;
 399     result->rulesLength = 0;
 400     result->freeRulesOnClose = FALSE;
 401     result->rb = NULL;
 402     result->elements = NULL;
 403     return result;
 404 }
 405
 406 U_CAPI UCollator* U_EXPORT2
 407 ucol_openBinary(const uint8_t *bin, int32_t length,
 408                 const UCollator *base,
 409                 UErrorCode *status)
 410 {
 411     return ucol_initFromBinary(bin, length, base, NULL, status);
 412 }
 413
 414 U_CAPI UCollator* U_EXPORT2
 415 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
 416 {
 417     UCollator * localCollator;
 418     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
 419     char *stackBufferChars = (char *)stackBuffer;
 420     int32_t imageSize = 0;
 421     int32_t rulesSize = 0;
 422     int32_t rulesPadding = 0;
 423     uint8_t *image;
 424     UChar *rules;
 425     UBool colAllocated = FALSE;
 426     UBool imageAllocated = FALSE;
 427
 428     if (status == NULL || U_FAILURE(*status)){
 429         return 0;
 430     }
 431     if ((stackBuffer && !pBufferSize) || !coll){
 432        *status = U_ILLEGAL_ARGUMENT_ERROR;
 433         return 0;
 434     }
 435     if (coll->rules && coll->freeRulesOnClose) {
 436         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
 437         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
 438         bufferSizeNeeded += rulesSize + rulesPadding;
 439     }
 440
 441     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
 442         *pBufferSize =  bufferSizeNeeded;
 443         return 0;
 444     }
 445
 446     /* Pointers on 64-bit platforms need to be aligned
 447      * on a 64-bit boundry in memory.
 448      */
 449     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
 450         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
 451         if (*pBufferSize > offsetUp) {
 452             *pBufferSize -= offsetUp;
 453             stackBufferChars += offsetUp;
 454         }
 455         else {
 456             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
 457             *pBufferSize = 1;
 458         }
 459     }
 460     stackBuffer = (void *)stackBufferChars;
 461
 462     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
 463         /* allocate one here...*/
 464         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
 465         colAllocated = TRUE;
 466         if (U_SUCCESS(*status)) {
 467             *status = U_SAFECLONE_ALLOCATED_WARNING;
 468         }
 469     }
 470     localCollator = (UCollator *)stackBufferChars;
 471     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
 472     {
 473         UErrorCode tempStatus = U_ZERO_ERROR;
 474         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
 475     }
 476     if (coll->freeImageOnClose) {
 477         image = (uint8_t *)uprv_malloc(imageSize);
 478         ucol_cloneBinary(coll, image, imageSize, status);
 479         imageAllocated = TRUE;
 480     }
 481     else {
 482         image = (uint8_t *)coll->image;
 483     }
 484     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
 485     if (U_FAILURE(*status)) {
 486         return NULL;
 487     }
 488
 489     if (coll->rules) {
 490         if (coll->freeRulesOnClose) {
 491             localCollator->rules = u_strcpy(rules, coll->rules);
 492             //bufferEnd += rulesSize;
 493         }
 494         else {
 495             localCollator->rules = coll->rules;
 496         }
 497         localCollator->freeRulesOnClose = FALSE;
 498         localCollator->rulesLength = coll->rulesLength;
 499     }
 500
 501     int32_t i;
 502     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
 503         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
 504     }
 505     localCollator->requestedLocale = NULL; // zero copies of pointers
 506     localCollator->validLocale = NULL;
 507     localCollator->rb = NULL;
 508     localCollator->elements = NULL;
 509     localCollator->freeOnClose = colAllocated;
 510     localCollator->freeImageOnClose = imageAllocated;
 511     return localCollator;
 512 }
 513
 514 U_CAPI void U_EXPORT2
 515 ucol_close(UCollator *coll)
 516 {
 517     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
 518     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
 519     if(coll != NULL) {
 520         // these are always owned by each UCollator struct,
 521         // so we always free them
 522         if(coll->validLocale != NULL) {
 523             uprv_free(coll->validLocale);
 524         }
 525         if(coll->requestedLocale != NULL) {
 526             uprv_free(coll->requestedLocale);
 527         }
 528         if(coll->resCleaner != NULL) {
 529             coll->resCleaner(coll);
 530         }
 531         if(coll->latinOneCEs != NULL) {
 532             uprv_free(coll->latinOneCEs);
 533         }
 534         if(coll->options != NULL && coll->freeOptionsOnClose) {
 535             uprv_free(coll->options);
 536         }
 537         if(coll->rules != NULL && coll->freeRulesOnClose) {
 538             uprv_free((UChar *)coll->rules);
 539         }
 540         if(coll->image != NULL && coll->freeImageOnClose) {
 541             uprv_free((UCATableHeader *)coll->image);
 542         }
 543
 544         /* Here, it would be advisable to close: */
 545         /* - UData for UCA (unless we stuff it in the root resb */
 546         /* Again, do we need additional housekeeping... HMMM! */
 547         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
 548         if(coll->freeOnClose){
 549             /* for safeClone, if freeOnClose is FALSE,
 550             don't free the other instance data */
 551             uprv_free(coll);
 552         }
 553     }
 554     UTRACE_EXIT();
 555 }
 556
 557 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
 558 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
 559 U_CAPI uint8_t* U_EXPORT2
 560 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
 561 {
 562   uint8_t *result = NULL;
 563   if(U_FAILURE(*status)) {
 564     return NULL;
 565   }
 566   if(coll->hasRealData == TRUE) {
 567     *length = coll->image->size;
 568     result = (uint8_t *)uprv_malloc(*length);
 569     /* test for NULL */
 570     if (result == NULL) {
 571         *status = U_MEMORY_ALLOCATION_ERROR;
 572         return NULL;
 573     }
 574     uprv_memcpy(result, coll->image, *length);
 575   } else {
 576     *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 577     result = (uint8_t *)uprv_malloc(*length);
 578     /* test for NULL */
 579     if (result == NULL) {
 580         *status = U_MEMORY_ALLOCATION_ERROR;
 581         return NULL;
 582     }
 583
 584     /* build the UCATableHeader with minimal entries */
 585     /* do not copy the header from the UCA file because its values are wrong! */
 586     /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 587
 588     /* reset everything */
 589     uprv_memset(result, 0, *length);
 590
 591     /* set the tailoring-specific values */
 592     UCATableHeader *myData = (UCATableHeader *)result;
 593     myData->size = *length;
 594
 595     /* offset for the options, the only part of the data that is present after the header */
 596     myData->options = sizeof(UCATableHeader);
 597
 598     /* need to always set the expansion value for an upper bound of the options */
 599     myData->expansion = myData->options + sizeof(UColOptionSet);
 600
 601     myData->magic = UCOL_HEADER_MAGIC;
 602     myData->isBigEndian = U_IS_BIG_ENDIAN;
 603     myData->charSetFamily = U_CHARSET_FAMILY;
 604
 605     /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 606     uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 607
 608     uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 609     uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 610     uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 611     myData->jamoSpecial = coll->image->jamoSpecial;
 612
 613     /* copy the collator options */
 614     uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 615   }
 616   return result;
 617 }
 618
 619 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 620   if(U_FAILURE(*status)) {
 621     return;
 622   }
 623     result->caseFirst = (UColAttributeValue)opts->caseFirst;
 624     result->caseLevel = (UColAttributeValue)opts->caseLevel;
 625     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
 626     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
 627     result->strength = (UColAttributeValue)opts->strength;
 628     result->variableTopValue = opts->variableTopValue;
 629     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
 630     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
 631     result->numericCollation = (UColAttributeValue)opts->numericCollation;
 632
 633     result->caseFirstisDefault = TRUE;
 634     result->caseLevelisDefault = TRUE;
 635     result->frenchCollationisDefault = TRUE;
 636     result->normalizationModeisDefault = TRUE;
 637     result->strengthisDefault = TRUE;
 638     result->variableTopValueisDefault = TRUE;
 639     result->hiraganaQisDefault = TRUE;
 640     result->numericCollationisDefault = TRUE;
 641
 642     ucol_updateInternalState(result, status);
 643
 644     result->options = opts;
 645 }
 646
 647
 648 /**
 649 * Approximate determination if a character is at a contraction end.
 650 * Guaranteed to be TRUE if a character is at the end of a contraction,
 651 * otherwise it is not deterministic.
 652 * @param c character to be determined
 653 * @param coll collator
 654 */
 655 static
 656 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 657     if (U16_IS_TRAIL(c)) {
 658       return TRUE;
 659     }
 660
 661     if (c < coll->minContrEndCP) {
 662         return FALSE;
 663     }
 664
 665     int32_t  hash = c;
 666     uint8_t  htbyte;
 667     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 668         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 669     }
 670     htbyte = coll->contrEndCP[hash>>3];
 671     return (((htbyte >> (hash & 7)) & 1) == 1);
 672 }
 673
 674
 675
 676 /*
 677 *   i_getCombiningClass()
 678 *        A fast, at least partly inline version of u_getCombiningClass()
 679 *        This is a candidate for further optimization.  Used heavily
 680 *        in contraction processing.
 681 */
 682 static
 683 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
 684     uint8_t sCC = 0;
 685     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
 686         sCC = u_getCombiningClass(c);
 687     }
 688     return sCC;
 689 }
 690
 691 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
 692     UChar c;
 693     UCollator *result = fillIn;
 694     if(U_FAILURE(*status) || image == NULL) {
 695         return NULL;
 696     }
 697
 698     if(result == NULL) {
 699         result = (UCollator *)uprv_malloc(sizeof(UCollator));
 700         if(result == NULL) {
 701             *status = U_MEMORY_ALLOCATION_ERROR;
 702             return result;
 703         }
 704         result->freeOnClose = TRUE;
 705     } else {
 706         result->freeOnClose = FALSE;
 707     }
 708
 709     result->image = image;
 710     result->mapping.getFoldingOffset = _getFoldingOffset;
 711     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
 712     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
 713     if(U_FAILURE(*status)) {
 714         if(result->freeOnClose == TRUE) {
 715             uprv_free(result);
 716             result = NULL;
 717         }
 718         return result;
 719     }
 720
 721     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
 722     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
 723     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
 724     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
 725     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
 726
 727     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
 728     result->freeOptionsOnClose = FALSE;
 729
 730     /* set attributes */
 731     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
 732     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
 733     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
 734     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
 735     result->strength = (UColAttributeValue)result->options->strength;
 736     result->variableTopValue = result->options->variableTopValue;
 737     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
 738     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
 739     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
 740
 741     result->caseFirstisDefault = TRUE;
 742     result->caseLevelisDefault = TRUE;
 743     result->frenchCollationisDefault = TRUE;
 744     result->normalizationModeisDefault = TRUE;
 745     result->strengthisDefault = TRUE;
 746     result->variableTopValueisDefault = TRUE;
 747     result->alternateHandlingisDefault = TRUE;
 748     result->hiraganaQisDefault = TRUE;
 749     result->numericCollationisDefault = TRUE;
 750
 751     /*result->scriptOrder = NULL;*/
 752
 753     result->rules = NULL;
 754     result->rulesLength = 0;
 755
 756     /* get the version info from UCATableHeader and populate the Collator struct*/
 757     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
 758     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
 759     result->dataVersion[2] = 0;
 760     result->dataVersion[3] = 0;
 761
 762     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
 763     result->minUnsafeCP = 0;
 764     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
 765         if (ucol_unsafeCP(c, result)) break;
 766     }
 767     result->minUnsafeCP = c;
 768
 769     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
 770     result->minContrEndCP = 0;
 771     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
 772         if (ucol_contractionEndCP(c, result)) break;
 773     }
 774     result->minContrEndCP = c;
 775
 776     /* max expansion tables */
 777     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
 778                                          result->image->endExpansionCE);
 779     result->lastEndExpansionCE = result->endExpansionCE +
 780                                  result->image->endExpansionCECount - 1;
 781     result->expansionCESize = (uint8_t*)result->image +
 782                                                result->image->expansionCESize;
 783
 784
 785     //result->errorCode = *status;
 786
 787     result->latinOneCEs = NULL;
 788
 789     result->latinOneRegenTable = FALSE;
 790     result->latinOneFailed = FALSE;
 791     result->UCA = UCA;
 792     result->resCleaner = NULL;
 793
 794     ucol_updateInternalState(result, status);
 795
 796
 797     return result;
 798 }
 799
 800 /* new Mark's code */
 801
 802 /**
 803  * For generation of Implicit CEs
 804  * @author Davis
 805  *
 806  * Cleaned up so that changes can be made more easily.
 807  * Old values:
 808 # First Implicit: E26A792D
 809 # Last Implicit: E3DC70C0
 810 # First CJK: E0030300
 811 # Last CJK: E0A9DD00
 812 # First CJK_A: E0A9DF00
 813 # Last CJK_A: E0DE3100
 814  */
 815 /* Following is a port of Mark's code for new treatment of implicits.
 816  * It is positioned here, since ucol_initUCA need to initialize the
 817  * variables below according to the data in the fractional UCA.
 818  */
 819
 820 /**
 821     * Function used to:
 822     * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 823     * b) bump any non-CJK characters by 10FFFF.
 824     * The relevant blocks are:
 825     * A:    4E00..9FFF; CJK Unified Ideographs
 826     *       F900..FAFF; CJK Compatibility Ideographs
 827     * B:    3400..4DBF; CJK Unified Ideographs Extension A
 828     *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
 829     * As long as
 830     *   no new B characters are allocated between 4E00 and FAFF, and
 831     *   no new A characters are outside of this range,
 832     * (very high probability) this simple code will work.
 833     * The reordered blocks are:
 834     * Block1 is CJK
 835     * Block2 is CJK_COMPAT_USED
 836     * Block3 is CJK_A
 837     * (all contiguous)
 838     * Any other CJK gets its normal code point
 839     * Any non-CJK gets +10FFFF
 840     * When we reorder Block1, we make sure that it is at the very start,
 841     * so that it will use a 3-byte form.
    * Warning: we only pick up the compatibility characters that are
    * NOT decomposed, so that block is smaller!
 844     */
 845
 846 // CONSTANTS
 847 static const UChar32
 848     NON_CJK_OFFSET = 0x110000,
 849     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
 850
 851 /**
 852  * Precomputed by constructor
 853  */
 854 static int32_t
 855     final3Multiplier = 0,
 856     final4Multiplier = 0,
 857     final3Count = 0,
 858     final4Count = 0,
 859     medialCount = 0,
 860     min3Primary = 0,
 861     min4Primary = 0,
 862     max4Primary = 0,
 863     minTrail = 0,
 864     maxTrail = 0,
 865     max3Trail = 0,
 866     max4Trail = 0,
 867     min4Boundary = 0;
 868
 869 static const UChar32
 870     CJK_BASE = 0x4E00,
 871     CJK_LIMIT = 0x9FFF+1,
 872     CJK_COMPAT_USED_BASE = 0xFA0E,
 873     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
 874     CJK_A_BASE = 0x3400,
 875     CJK_A_LIMIT = 0x4DBF+1,
 876     CJK_B_BASE = 0x20000,
 877     CJK_B_LIMIT = 0x2A6DF+1;
 878
 879 static UChar32 swapCJK(UChar32 i) {
 880
 881     if (i >= CJK_BASE) {
 882         if (i < CJK_LIMIT)              return i - CJK_BASE;
 883
 884         if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
 885
 886         if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
 887                                                 + (CJK_LIMIT - CJK_BASE);
 888         if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
 889
 890         if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
 891
 892         return i + NON_CJK_OFFSET;  // non-CJK
 893     }
 894     if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
 895
 896     if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
 897                                                 + (CJK_LIMIT - CJK_BASE)
 898                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
 899     return i + NON_CJK_OFFSET; // non-CJK
 900 }
 901
 902 U_CAPI UChar32 U_EXPORT2
 903 uprv_uca_getRawFromCodePoint(UChar32 i) {
 904     return swapCJK(i)+1;
 905 }
 906
 907 U_CAPI UChar32 U_EXPORT2
 908 uprv_uca_getCodePointFromRaw(UChar32 i) {
 909     i--;
 910     UChar32 result = 0;
 911     if(i >= NON_CJK_OFFSET) {
 912         result = i - NON_CJK_OFFSET;
 913     } else if(i >= CJK_B_BASE) {
 914         result = i;
 915     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
 916         if(i < CJK_LIMIT - CJK_BASE) {
 917             result = i + CJK_BASE;
 918         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
 919             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
 920         } else {
 921             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
 922         }
 923     } else {
 924         result = -1;
 925     }
 926     return result;
 927 }
 928
 929 // GET IMPLICIT PRIMARY WEIGHTS
 930 // Return value is left justified primary key
 931 U_CAPI uint32_t U_EXPORT2
 932 uprv_uca_getImplicitFromRaw(UChar32 cp) {
 933     /*
 934     if (cp < 0 || cp > UCOL_MAX_INPUT) {
 935         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
 936     }
 937     */
 938     int32_t last0 = cp - min4Boundary;
 939     if (last0 < 0) {
 940         int32_t last1 = cp / final3Count;
 941         last0 = cp % final3Count;
 942
 943         int32_t last2 = last1 / medialCount;
 944         last1 %= medialCount;
 945
 946         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
 947         last1 = minTrail + last1; // offset
 948         last2 = min3Primary + last2; // offset
 949         /*
 950         if (last2 >= min4Primary) {
 951             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
 952         }
 953         */
 954         return (last2 << 24) + (last1 << 16) + (last0 << 8);
 955     } else {
 956         int32_t last1 = last0 / final4Count;
 957         last0 %= final4Count;
 958
 959         int32_t last2 = last1 / medialCount;
 960         last1 %= medialCount;
 961
 962         int32_t last3 = last2 / medialCount;
 963         last2 %= medialCount;
 964
 965         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
 966         last1 = minTrail + last1; // offset
 967         last2 = minTrail + last2; // offset
 968         last3 = min4Primary + last3; // offset
 969         /*
 970         if (last3 > max4Primary) {
 971             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
 972         }
 973         */
 974         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
 975     }
 976 }
 977
 978 U_CAPI uint32_t U_EXPORT2
 979 uprv_uca_getImplicitPrimary(UChar32 cp) {
 980     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
 981
 982     cp = swapCJK(cp);
 983     cp++;
 984     // we now have a range of numbers from 0 to 21FFFF.
 985
 986     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
 987
 988     return uprv_uca_getImplicitFromRaw(cp);
 989 }
 990
 991 /**
 992  * Converts implicit CE into raw integer ("code point")
 993  * @param implicit
 994  * @return -1 if illegal format
 995  */
 996 U_CAPI UChar32 U_EXPORT2
 997 uprv_uca_getRawFromImplicit(uint32_t implicit) {
 998     UChar32 result;
 999     UChar32 b3 = implicit & 0xFF;
1000     implicit >>= 8;
1001     UChar32 b2 = implicit & 0xFF;
1002     implicit >>= 8;
1003     UChar32 b1 = implicit & 0xFF;
1004     implicit >>= 8;
1005     UChar32 b0 = implicit & 0xFF;
1006
1007     // simple parameter checks
1008     if (b0 < min3Primary || b0 > max4Primary
1009       || b1 < minTrail || b1 > maxTrail) return -1;
1010     // normal offsets
1011     b1 -= minTrail;
1012
1013     // take care of the final values, and compose
1014     if (b0 < min4Primary) {
1015         if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1016         b2 -= minTrail;
1017         UChar32 remainder = b2 % final3Multiplier;
1018         if (remainder != 0) return -1;
1019         b0 -= min3Primary;
1020         b2 /= final3Multiplier;
1021         result = ((b0 * medialCount) + b1) * final3Count + b2;
1022     } else {
1023          if (b2 < minTrail || b2 > maxTrail
1024         || b3 < minTrail || b3 > max4Trail) return -1;
1025         b2 -= minTrail;
1026         b3 -= minTrail;
1027         UChar32 remainder = b3 % final4Multiplier;
1028         if (remainder != 0) return -1;
1029         b3 /= final4Multiplier;
1030         b0 -= min4Primary;
1031         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1032     }
1033     // final check
1034     if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1035     return result;
1036 }
1037
1038
/**
 * Divides a by b, rounding the quotient up; for positive a and b this is
 * ceil(a / b).  It is computed as 1 + (a-1)/b with truncating division, so
 * a == 0 yields 1 rather than 0.
 */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int truncated = (a - 1) / b;
    return truncated + 1;
}
1042
1043 /* this function is either called from initUCA or from genUCA before
1044  * doing canonical closure for the UCA.
1045  */
1046
1047 /**
1048  * Set up to generate implicits.
1049  * @param minPrimary
1050  * @param maxPrimary
1051  * @param minTrail final byte
1052  * @param maxTrail final byte
1053  * @param gap3 the gap we leave for tailoring for 3-byte forms
1054  * @param gap4 the gap we leave for tailoring for 4-byte forms
1055  */
1056 static void initImplicitConstants(int minPrimary, int maxPrimary,
1057                                     int minTrailIn, int maxTrailIn,
1058                                     int gap3, int primaries3count,
1059                                     UErrorCode *status) {
1060     // some simple parameter checks
1061     if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1062         *status = U_ILLEGAL_ARGUMENT_ERROR;
1063         return;
1064     };
1065     if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1066         *status = U_ILLEGAL_ARGUMENT_ERROR;
1067         return;
1068     };
1069     if (primaries3count < 1) {
1070         *status = U_ILLEGAL_ARGUMENT_ERROR;
1071         return;
1072     };
1073
1074     minTrail = minTrailIn;
1075     maxTrail = maxTrailIn;
1076
1077     min3Primary = minPrimary;
1078     max4Primary = maxPrimary;
1079     // compute constants for use later.
1080     // number of values we can use in trailing bytes
1081     // leave room for empty values between AND above, e.g. if gap = 2
1082     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1083     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1084     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1085     final3Multiplier = gap3 + 1;
1086     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1087     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1088
1089     // medials can use full range
1090     medialCount = (maxTrail - minTrail + 1);
1091     // find out how many values fit in each form
1092     int32_t threeByteCount = medialCount * final3Count;
1093     // now determine where the 3/4 boundary is.
1094     // we use 3 bytes below the boundary, and 4 above
1095     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1096     int32_t primaries4count = primariesAvailable - primaries3count;
1097
1098
1099     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1100     min4Primary = minPrimary + primaries3count;
1101     min4Boundary = min3ByteCoverage;
1102     // Now expand out the multiplier for the 4 bytes, and redo.
1103
1104     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1105     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1106     //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1107     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1108     //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1109     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1110     //if (DEBUG) System.out.println("expandedGap: " + gap4);
1111     if (gap4 < 1) {
1112         *status = U_ILLEGAL_ARGUMENT_ERROR;
1113         return;
1114     }
1115     final4Multiplier = gap4 + 1;
1116     final4Count = neededPerFinalByte;
1117     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1118     /*
1119     if (DEBUG) {
1120         System.out.println("final4Count: " + final4Count);
1121         for (int counter = 0; counter <= final4Count; ++counter) {
1122             int value = minTrail + (1 + counter)*final4Multiplier;
1123             System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1124         }
1125     }
1126     */
1127 }
1128
1129     /**
1130      * Supply parameters for generating implicit CEs
1131      */
1132 U_CAPI void U_EXPORT2
1133 uprv_uca_initImplicitConstants(int32_t, int32_t, UErrorCode *status) {
1134     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1135     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1136   initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1137 }
1138
1139 U_CDECL_BEGIN
1140 static UBool U_CALLCONV
1141 ucol_cleanup(void)
1142 {
1143     if (UCA_DATA_MEM) {
1144         udata_close(UCA_DATA_MEM);
1145         UCA_DATA_MEM = NULL;
1146     }
1147     if (_staticUCA) {
1148         ucol_close(_staticUCA);
1149         _staticUCA = NULL;
1150     }
1151     fcdTrieIndex = NULL;
1152     return TRUE;
1153 }
1154 U_CDECL_END
1155
1156 /* do not close UCA returned by ucol_initUCA! */
1157 UCollator *
1158 ucol_initUCA(UErrorCode *status) {
1159     if(U_FAILURE(*status)) {
1160         return NULL;
1161     }
1162     umtx_lock(NULL);
1163     UBool f = (_staticUCA == NULL);
1164     umtx_unlock(NULL);
1165
1166     if(f) {
1167         UCollator *newUCA = NULL;
1168         UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1169
1170         if(U_FAILURE(*status)) {
1171             if (result) {
1172                 udata_close(result);
1173             }
1174             uprv_free(newUCA);
1175         }
1176
1177         // init FCD data
1178         if (fcdTrieIndex == NULL) {
1179             fcdTrieIndex = unorm_getFCDTrie(status);
1180             ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1181         }
1182
1183         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1184             newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
1185             if(U_SUCCESS(*status)){
1186                 newUCA->rb = NULL;
1187                 newUCA->elements = NULL;
1188                 newUCA->validLocale = NULL;
1189                 newUCA->requestedLocale = NULL;
1190                 newUCA->hasRealData = FALSE; // real data lives in .dat file...
1191                 newUCA->freeImageOnClose = FALSE;
1192                 umtx_lock(NULL);
1193                 if(_staticUCA == NULL) {
1194                     _staticUCA = newUCA;
1195                     UCA_DATA_MEM = result;
1196                     result = NULL;
1197                     newUCA = NULL;
1198                 }
1199                 umtx_unlock(NULL);
1200
1201                 if(newUCA != NULL) {
1202                     udata_close(result);
1203                     uprv_free(newUCA);
1204                 }
1205                 else {
1206                     ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1207                 }
1208                 // Initalize variables for implicit generation
1209                 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1210                 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1211                 //_staticUCA->mapping.getFoldingOffset = _getFoldingOffset;
1212             }else{
1213                 udata_close(result);
1214                 uprv_free(newUCA);
1215                 _staticUCA= NULL;
1216             }
1217         }
1218     }
1219     return _staticUCA;
1220 }
1221
1222
1223 /*    collIterNormalize     Incremental Normalization happens here.                       */
1224 /*                          pick up the range of chars identifed by FCD,                  */
1225 /*                          normalize it into the collIterate's writable buffer,          */
1226 /*                          switch the collIterate's state to use the writable buffer.    */
1227 /*                                                                                        */
1228 static
1229 void collIterNormalize(collIterate *collationSource)
1230 {
1231     UErrorCode  status = U_ZERO_ERROR;
1232
1233     int32_t    normLen;
1234     UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1235     UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1236
1237     normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1238                               srcP, (int32_t)(endP - srcP),
1239                               FALSE, 0,
1240                               &status);
1241     if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1242         // reallocate and terminate
1243         if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1244                                    &collationSource->writableBuffer,
1245                                    (int32_t *)&collationSource->writableBufSize, normLen + 1,
1246                                    0)
1247         ) {
1248 #ifdef UCOL_DEBUG
1249             fprintf(stderr, "collIterNormalize(), out of memory\n");
1250 #endif
1251             return;
1252         }
1253         status = U_ZERO_ERROR;
1254         normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1255                                   srcP, (int32_t)(endP - srcP),
1256                                   FALSE, 0,
1257                                   &status);
1258     }
1259     if (U_FAILURE(status)) {
1260 #ifdef UCOL_DEBUG
1261         fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1262 #endif
1263         return;
1264     }
1265
1266   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1267       collationSource->flags |= UCOL_ITER_ALLOCATED;
1268   }
1269   collationSource->pos        = collationSource->writableBuffer;
1270   collationSource->origFlags  = collationSource->flags;
1271   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1272   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1273 }
1274
1275
1276 // This function takes the iterator and extracts normalized stuff up to the next boundary
1277 // It is similar in the end results to the collIterNormalize, but for the cases when we
1278 // use an iterator
1279 static
1280 inline void normalizeIterator(collIterate *collationSource) {
1281   UErrorCode status = U_ZERO_ERROR;
1282   UBool wasNormalized = FALSE;
1283   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1284   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1285   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1286     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1287   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1288     // reallocate and terminate
1289     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1290                                &collationSource->writableBuffer,
1291                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1292                                0)
1293     ) {
1294     #ifdef UCOL_DEBUG
1295         fprintf(stderr, "normalizeIterator(), out of memory\n");
1296     #endif
1297         return;
1298     }
1299     status = U_ZERO_ERROR;
1300     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1301     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1302     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1303     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1304   }
1305   // Terminate the buffer - we already checked that it is big enough
1306   collationSource->writableBuffer[normLen] = 0;
1307   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1308       collationSource->flags |= UCOL_ITER_ALLOCATED;
1309   }
1310   collationSource->pos        = collationSource->writableBuffer;
1311   collationSource->origFlags  = collationSource->flags;
1312   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1313   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1314 }
1315
1316
/* Incremental FCD check and normalize                                                    */
/*   Called from getNextCE when normalization state is suspect.                           */
/*   When entering, the state is known to be this:                                        */
/*      o   We are working in the main buffer of the collIterate, not the side            */
/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
/*          so we won't get here.                                                         */
/*      o   The leading combining class from the current character is 0 or                */
/*          the trailing combining class of the previous char was zero.                   */
/*          True because the previous call to this function will have always exited       */
/*          that way, and we get called for every char where cc might be non-zero.        */
/*                                                                                        */
/*   The scan starts at the char just before collationSource->pos.                        */
/*   On return, collationSource->fcdPosition points just past the chars that were         */
/*   examined and accepted as part of the sequence.                                       */
/*   Returns TRUE if some char in the sequence has a leading combining class that is      */
/*   lower than the trailing combining class of the char before it, FALSE otherwise.      */
static
inline UBool collIterFCD(collIterate *collationSource) {
    UChar       c, c2;
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    srcP = collationSource->pos-1;

    // endP == NULL stands for "no length given"; in that case the scan below
    //   has no explicit end pointer to compare against.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    c = *srcP++;
    /* trie access */
    // The 16-bit fcd value is used with the leading combining class in its upper
    //   byte and the trailing combining class in its lower byte.
    fcd = unorm_getFCD16(fcdTrieIndex, c);
    if (fcd != 0) {
        if (U16_IS_LEAD(c)) {
            // Lead surrogate: the real value needs the trail surrogate as well.
            //   Without a following trail surrogate the value is treated as 0.
            if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                ++srcP;
                fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
            } else {
                fcd = 0;
            }
        }

        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                // Remembered so that the char can be given back if it turns out
                //   not to belong to the sequence.
                const UChar *savedSrcP = srcP;

                c = *srcP++;
                /* trie access */
                fcd = unorm_getFCD16(fcdTrieIndex, c);
                if (fcd != 0 && U16_IS_LEAD(c)) {
                    if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
                        ++srcP;
                        fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
                    } else {
                        fcd = 0;
                    }
                }
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the sequence is not FCD.
                //   Keep scanning anyway so that the end of the sequence is found.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}
1399
1400 /****************************************************************************/
1401 /* Following are the CE retrieval functions                                 */
1402 /*                                                                          */
1403 /****************************************************************************/
1404
/* Forward declarations of the implicit CE helpers.                         */
/* getImplicit() is called by ucol_IGetNextCE() below when no CE was found  */
/* for a character; the definitions are not in this part of the file.       */
static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1407
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                     */
/* coll             collator whose mappings are consulted first; its   */
/*                  UCA member, if set, is the fallback for chars      */
/*                  above 0xFF                                         */
/* collationSource  iteration state: text position, flags and the      */
/*                  buffer of CEs that are waiting to be returned      */
/* status           passed on to ucol_prv_getSpecialCE()               */
/* Returns the next CE, or UCOL_NO_MORE_CES at the end of the text.    */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
      order = *(collationSource->toReturn++);                         /* if so, return them */
      if(collationSource->CEpos == collationSource->toReturn) {
        /* The buffer has been used up: reset both pointers to its start */
        collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
      }
      return order;
    }

    UChar ch = 0;

    for (;;)                           /* Loop handles case when incremental normalize switches   */
    {                                  /*   to or from the side buffer / original string, and we  */
                                       /*   need to start again to get the next character.        */

        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
        {
            // The source string is null terminated and we're not working from the side buffer,
            //   and we're not normalizing.  This is the fast path.
            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
            ch = *collationSource->pos++;
            if (ch != 0) {
                break;
            }
            else {
                return UCOL_NO_MORE_CES;
            }
        }

        if (collationSource->flags & UCOL_ITER_HASLEN) {
            // Normal path for strings when length is specified.
            //   (We can't be in side buffer because it is always null terminated.)
            if (collationSource->pos >= collationSource->endp) {
                // Ran off of the end of the main source string.  We're done.
                return UCOL_NO_MORE_CES;
            }
            ch = *collationSource->pos++;
        }
        else if(collationSource->flags & UCOL_USE_ITERATOR) {
            // The text comes from a character iterator; U_SENTINEL marks its end.
            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
            if(iterCh == U_SENTINEL) {
              return UCOL_NO_MORE_CES;
            }
            ch = (UChar)iterCh;
        }
        else
        {
            // Null terminated string.
            ch = *collationSource->pos++;
            if (ch == 0) {
                // Ran off end of buffer.
                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                    // Ran off end of main string. backing up one character.
                    collationSource->pos--;
                    return UCOL_NO_MORE_CES;
                }
                else
                {
                    // Hit null in the normalize side buffer.
                    // Usually this means the end of the normalized data,
                    // except for one odd case: a null followed by combining chars,
                    //   which is the case if we are at the start of the buffer.
                    if (collationSource->pos == collationSource->writableBuffer+1) {
                        break;
                    }

                    //  Null marked end of side buffer.
                    //   Revert to the main string and
                    //   loop back to top to try again to get a character.
                    collationSource->pos   = collationSource->fcdPosition;
                    collationSource->flags = collationSource->origFlags;
                    continue;
                }
            }
        }

        // When UCOL_HIRAGANA_Q is set, record in UCOL_WAS_HIRAGANA whether this char is one
        //   of U+3040..U+3094, U+309D or U+309E.
        if(collationSource->flags&UCOL_HIRAGANA_Q) {
          if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
            collationSource->flags |= UCOL_WAS_HIRAGANA;
          } else {
            collationSource->flags &= ~UCOL_WAS_HIRAGANA;
          }
        }

        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
            break;
        }

        if (collationSource->fcdPosition >= collationSource->pos) {
            // An earlier FCD check has already covered the current character.
            // We can go ahead and process this char.
            break;
        }

        if (ch < ZERO_CC_LIMIT_ ) {
            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
            break;
        }

        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
            // We need to peek at the next character in order to tell if we are FCD
            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                // We are at the last char of source string.
                //  It is always OK for FCD check.
                break;
            }

            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                break;
            }
        }


        // Need a more complete FCD check and possible normalization.
        if (collIterFCD(collationSource)) {
            collIterNormalize(collationSource);
        }
        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
            //  No normalization was needed.  Go ahead and process the char we already had.
            break;
        }

        // Some normalization happened.  Next loop iteration will pick up a char
        //   from the normalization buffer.

    }   // end for (;;)


      // A character is in ch.  Look up its CE; values above UCOL_NOT_FOUND are special
      //   and are resolved by ucol_prv_getSpecialCE().
      if (ch <= 0xFF) {
          /*  For latin-1 characters we never need to fall back to the UCA table        */
          /*    because all of the UCA data is replicated in the latinOneMapping array  */
          order = coll->latinOneMapping[ch];
          if (order > UCOL_NOT_FOUND) {
              order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
          }
      }
      else
      {
          order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
          if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
              order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
          }
          if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
            order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);

            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
              order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
            }
          }
      }
      /* Still nothing: let getImplicit() produce the CE */
      if(order == UCOL_NOT_FOUND) {
        order = getImplicit(ch, collationSource);
      }
      return order; /* return the CE */
}
1573
1574 /* ucol_getNextCE, out-of-line version for use from other files.   */
1575 U_CAPI uint32_t  U_EXPORT2
1576 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1577     return ucol_IGetNextCE(coll, collationSource, status);
1578 }
1579
1580
1581 /**
1582 * Incremental previous normalization happens here. Pick up the range of chars
1583 * identifed by FCD, normalize it into the collIterate's writable buffer,
1584 * switch the collIterate's state to use the writable buffer.
1585 * @param data collation iterator data
1586 */
1587 static
1588 void collPrevIterNormalize(collIterate *data)
1589 {
1590     UErrorCode status  = U_ZERO_ERROR;
1591     UChar      *pEnd   = data->pos;         /* End normalize + 1 */
1592     UChar      *pStart;
1593     uint32_t    normLen;
1594     UChar      *pStartNorm;
1595
1596     /* Start normalize */
1597     if (data->fcdPosition == NULL) {
1598         pStart = data->string;
1599     }
1600     else {
1601         pStart = data->fcdPosition + 1;
1602     }
1603
1604     normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1605                               data->writableBuffer, 0, &status);
1606
1607     if (data->writableBufSize <= normLen) {
1608             freeHeapWritableBuffer(data);
1609             data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1610                                                         sizeof(UChar));
1611             if(data->writableBuffer == NULL) { // something is wrong here, return
1612               return;
1613             }
1614             data->flags |= UCOL_ITER_ALLOCATED;
1615             /* to handle the zero termination */
1616             data->writableBufSize = normLen + 1;
1617     }
1618             status = U_ZERO_ERROR;
1619     /*
1620     this puts the null termination infront of the normalized string instead
1621     of the end
1622     */
1623     pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1624     *(pStartNorm - 1) = 0;
1625     unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1626                     normLen, &status);
1627
1628     data->pos        = data->writableBuffer + data->writableBufSize;
1629     data->origFlags  = data->flags;
1630     data->flags     |= UCOL_ITER_INNORMBUF;
1631     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1632 }
1633
1634
1635 /**
1636 * Incremental FCD check for previous iteration and normalize. Called from
1637 * getPrevCE when normalization state is suspect.
1638 * When entering, the state is known to be this:
1639 * o  We are working in the main buffer of the collIterate, not the side
1640 *    writable buffer. When in the side buffer, normalization mode is always
1641 *    off, so we won't get here.
1642 * o  The leading combining class from the current character is 0 or the
1643 *    trailing combining class of the previous char was zero.
1644 *    True because the previous call to this function will have always exited
1645 *    that way, and we get called for every char where cc might be non-zero.
1646 * @param data collation iterate struct
1647 * @return normalization status, TRUE for normalization to be done, FALSE
1648 *         otherwise
1649 */
1650 static
1651 inline UBool collPrevIterFCD(collIterate *data)
1652 {
1653     const UChar *src, *start;
1654     UChar       c, c2;
1655     uint8_t     leadingCC;
1656     uint8_t     trailingCC = 0;
1657     uint16_t    fcd;
1658     UBool       result = FALSE;
1659
1660     start = data->string;
1661     src = data->pos + 1;
1662
1663     /* Get the trailing combining class of the current character. */
1664     c = *--src;
1665     if (!U16_IS_SURROGATE(c)) {
1666         fcd = unorm_getFCD16(fcdTrieIndex, c);
1667     } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1668         --src;
1669         fcd = unorm_getFCD16(fcdTrieIndex, c2);
1670         if (fcd != 0) {
1671             fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1672         }
1673     } else /* unpaired surrogate */ {
1674         fcd = 0;
1675     }
1676
1677     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1678
1679     if (leadingCC != 0) {
1680         /*
1681         The current char has a non-zero leading combining class.
1682         Scan backward until we find a char with a trailing cc of zero.
1683         */
1684         for (;;)
1685         {
1686             if (start == src) {
1687                 data->fcdPosition = NULL;
1688                 return result;
1689             }
1690
1691             c = *--src;
1692             if (!U16_IS_SURROGATE(c)) {
1693                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1694             } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1695                 --src;
1696                 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1697                 if (fcd != 0) {
1698                     fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1699                 }
1700             } else /* unpaired surrogate */ {
1701                 fcd = 0;
1702             }
1703
1704             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1705
1706             if (trailingCC == 0) {
1707                 break;
1708             }
1709
1710             if (leadingCC < trailingCC) {
1711                 result = TRUE;
1712             }
1713
1714             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1715         }
1716     }
1717
1718     data->fcdPosition = (UChar *)src;
1719
1720     return result;
1721 }
1722
1723 /** gets a character from the string at a given offset
1724  *  Handles both normal and iterative cases.
1725  *  No error checking - caller beware!
1726  */
1727 inline static
1728 UChar peekCharacter(collIterate *source, int32_t offset) {
1729   if(source->pos != NULL) {
1730     return *(source->pos + offset);
1731   } else if(source->iterator != NULL) {
1732     if(offset != 0) {
1733       source->iterator->move(source->iterator, offset, UITER_CURRENT);
1734       UChar toReturn = (UChar)source->iterator->next(source->iterator);
1735       source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1736       return toReturn;
1737     } else {
1738       return (UChar)source->iterator->current(source->iterator);
1739     }
1740   } else {
1741     return (UChar)U_SENTINEL;
1742   }
1743 }
1744
1745 /**
1746 * Determines if we are at the start of the data string in the backwards
1747 * collation iterator
1748 * @param data collation iterator
1749 * @return TRUE if we are at the start
1750 */
1751 static
1752 inline UBool isAtStartPrevIterate(collIterate *data) {
1753   if(data->pos == NULL && data->iterator != NULL) {
1754     return !data->iterator->hasPrevious(data->iterator);
1755   }
1756   //return (collIter_bos(data)) ||
1757   return (data->pos == data->string) ||
1758             ((data->flags & UCOL_ITER_INNORMBUF) &&
1759             *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1760 }
1761
1762 static
1763 inline void goBackOne(collIterate *data) {
1764 # if 0
1765   // somehow, it looks like we need to keep iterator synced up
1766   // at all times, as above.
1767   if(data->pos) {
1768     data->pos--;
1769   }
1770   if(data->iterator) {
1771     data->iterator->previous(data->iterator);
1772   }
1773 #endif
1774   if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1775     data->iterator->previous(data->iterator);
1776   }
1777   if(data->pos) {
1778     data->pos --;
1779   }
1780 }
1781
1782 /**
1783 * Inline function that gets a simple CE.
1784 * So what it does is that it will first check the expansion buffer. If the
1785 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1786 * is different from the string pointer, we return the collation element at the
1787 * return pointer and decrement it.
1788 * For more complicated CEs it resorts to getComplicatedCE.
1789 * @param coll collator data
1790 * @param data collation iterator struct
1791 * @param status error status
1792 */
1793 static
1794 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1795                                UErrorCode *status)
1796 {
1797     uint32_t result = (uint32_t)UCOL_NULLORDER;
1798     if (data->toReturn > data->CEs) {
1799         data->toReturn --;
1800         result = *(data->toReturn);
1801         if (data->CEs == data->toReturn) {
1802             data->CEpos = data->toReturn;
1803         }
1804     }
1805     else {
1806         UChar ch = 0;
1807         /*
1808         Loop handles case when incremental normalize switches to or from the
1809         side buffer / original string, and we need to start again to get the
1810         next character.
1811         */
1812         for (;;) {
1813             if (data->flags & UCOL_ITER_HASLEN) {
1814                 /*
1815                 Normal path for strings when length is specified.
1816                 Not in side buffer because it is always null terminated.
1817                 */
1818                 if (data->pos <= data->string) {
1819                     /* End of the main source string */
1820                     return UCOL_NO_MORE_CES;
1821                 }
1822                 data->pos --;
1823                 ch = *data->pos;
1824             }
1825             // we are using an iterator to go back. Pray for us!
1826             else if (data->flags & UCOL_USE_ITERATOR) {
1827               UChar32 iterCh = data->iterator->previous(data->iterator);
1828               if(iterCh == U_SENTINEL) {
1829                 return UCOL_NO_MORE_CES;
1830               } else {
1831                 ch = (UChar)iterCh;
1832               }
1833             }
1834             else {
1835                 data->pos --;
1836                 ch = *data->pos;
1837                 /* we are in the side buffer. */
1838                 if (ch == 0) {
1839                     /*
1840                     At the start of the normalize side buffer.
1841                     Go back to string.
1842                     Because pointer points to the last accessed character,
1843                     hence we have to increment it by one here.
1844                     */
1845                     if (data->fcdPosition == NULL) {
1846                         data->pos = data->string;
1847                         return UCOL_NO_MORE_CES;
1848                     }
1849                     else {
1850                         data->pos   = data->fcdPosition + 1;
1851                     }
1852                     data->flags = data->origFlags;
1853                     continue;
1854                 }
1855             }
1856
1857             if(data->flags&UCOL_HIRAGANA_Q) {
1858               if(ch>=0x3040 && ch<=0x309f) {
1859                 data->flags |= UCOL_WAS_HIRAGANA;
1860               } else {
1861                 data->flags &= ~UCOL_WAS_HIRAGANA;
1862               }
1863             }
1864
1865             /*
1866             * got a character to determine if there's fcd and/or normalization
1867             * stuff to do.
1868             * if the current character is not fcd.
1869             * if current character is at the start of the string
1870             * Trailing combining class == 0.
1871             * Note if pos is in the writablebuffer, norm is always 0
1872             */
1873             if (ch < ZERO_CC_LIMIT_ ||
1874               // this should propel us out of the loop in the iterator case
1875                 (data->flags & UCOL_ITER_NORM) == 0 ||
1876                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1877                 || data->string == data->pos) {
1878                 break;
1879             }
1880
1881             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1882                 /* if next character is FCD */
1883                 if (data->pos == data->string) {
1884                     /* First char of string is always OK for FCD check */
1885                     break;
1886                 }
1887
1888                 /* Not first char of string, do the FCD fast test */
1889                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1890                     break;
1891                 }
1892             }
1893
1894             /* Need a more complete FCD check and possible normalization. */
1895             if (collPrevIterFCD(data)) {
1896                 collPrevIterNormalize(data);
1897             }
1898
1899             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1900                 /*  No normalization. Go ahead and process the char. */
1901                 break;
1902             }
1903
1904             /*
1905             Some normalization happened.
1906             Next loop picks up a char from the normalization buffer.
1907             */
1908         }
1909
1910         /* attempt to handle contractions, after removal of the backwards
1911         contraction
1912         */
1913         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1914           result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1915         } else {
1916           if (ch <= 0xFF) {
1917             result = coll->latinOneMapping[ch];
1918           }
1919           else {
1920             result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1921           }
1922           if (result > UCOL_NOT_FOUND) {
1923             result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1924           }
1925           if (result == UCOL_NOT_FOUND) { // Not found in master list
1926             if (!isAtStartPrevIterate(data) &&
1927               ucol_contractionEndCP(ch, data->coll)) {
1928                 result = UCOL_CONTRACTION;
1929             } else {
1930               if(coll->UCA) {
1931                 result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1932               }
1933             }
1934
1935             if (result > UCOL_NOT_FOUND) {
1936               if(coll->UCA) {
1937                 result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
1938               }
1939             }
1940           }
1941         }
1942         if(result == UCOL_NOT_FOUND) {
1943           result = getPrevImplicit(ch, data);
1944         }
1945     }
1946     return result;
1947 }
1948
1949
1950 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
1951 U_CAPI uint32_t  U_EXPORT2
1952 ucol_getPrevCE(const UCollator *coll, collIterate *data,
1953                         UErrorCode *status) {
1954     return ucol_IGetPrevCE(coll, data, status);
1955 }
1956
1957
1958 /* this should be connected to special Jamo handling */
1959 U_CAPI uint32_t  U_EXPORT2
1960 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
1961   collIterate colIt;
1962   uint32_t order;
1963   IInit_collIterate(coll, &u, 1, &colIt);
1964   order = ucol_IGetNextCE(coll, &colIt, status);
1965   /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
1966   return order;
1967 }
1968
1969 /**
1970 * Inserts the argument character into the end of the buffer pushing back the
1971 * null terminator.
1972 * @param data collIterate struct data
1973 * @param pNull pointer to the null termination
1974 * @param ch character to be appended
1975 * @return the position of the new addition
1976 */
1977 static
1978 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1979 {
1980           uint32_t  size    = data->writableBufSize;
1981           UChar    *newbuffer;
1982     const uint32_t  incsize = 5;
1983
1984     if ((data->writableBuffer + size) > (pNull + 1)) {
1985         *pNull = ch;
1986         *(pNull + 1) = 0;
1987         return pNull;
1988     }
1989
1990     /*
1991     buffer will always be null terminated at the end.
1992     giving extra space since it is likely that more characters will be added.
1993     */
1994     size += incsize;
1995     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
1996     if(newbuffer != NULL) { // something wrong, but no status
1997       uprv_memcpy(newbuffer, data->writableBuffer,
1998                   data->writableBufSize * sizeof(UChar));
1999
2000       freeHeapWritableBuffer(data);
2001       data->writableBufSize = size;
2002       data->writableBuffer  = newbuffer;
2003
2004       newbuffer        = newbuffer + data->writableBufSize;
2005       *newbuffer       = ch;
2006       *(newbuffer + 1) = 0;
2007     }
2008     return newbuffer;
2009 }
2010
2011 /**
2012 * Inserts the argument string into the end of the buffer pushing back the
2013 * null terminator.
2014 * @param data collIterate struct data
2015 * @param pNull pointer to the null termination
2016 * @param string to be appended
2017 * @param length of the string to be appended
2018 * @return the position of the new addition
2019 */
2020 static
2021 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2022                                int32_t length)
2023 {
2024     uint32_t  size = pNull - data->writableBuffer;
2025     UChar    *newbuffer;
2026
2027     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2028         uprv_memcpy(pNull, str, length * sizeof(UChar));
2029         *(pNull + length) = 0;
2030         return pNull;
2031     }
2032
2033     /*
2034     buffer will always be null terminated at the end.
2035     giving extra space since it is likely that more characters will be added.
2036     */
2037     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2038     if(newbuffer != NULL) {
2039       uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2040       uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2041
2042       freeHeapWritableBuffer(data);
2043       data->writableBufSize = size + length + 1;
2044       data->writableBuffer  = newbuffer;
2045     }
2046
2047     return newbuffer;
2048 }
2049
2050 /**
2051 * Special normalization function for contraction in the forwards iterator.
2052 * This normalization sequence will place the current character at source->pos
2053 * and its following normalized sequence into the buffer.
2054 * The fcd position, pos will be changed.
2055 * pos will now point to positions in the buffer.
2056 * Flags will be changed accordingly.
2057 * @param data collation iterator data
2058 */
2059 static
2060 inline void normalizeNextContraction(collIterate *data)
2061 {
2062     UChar      *buffer     = data->writableBuffer;
2063     uint32_t    buffersize = data->writableBufSize;
2064     uint32_t    strsize;
2065     UErrorCode  status     = U_ZERO_ERROR;
2066     /* because the pointer points to the next character */
2067     UChar      *pStart     = data->pos - 1;
2068     UChar      *pEnd;
2069     uint32_t    normLen;
2070     UChar      *pStartNorm;
2071
2072     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2073         *data->writableBuffer = *(pStart - 1);
2074         strsize               = 1;
2075     }
2076     else {
2077         strsize = u_strlen(data->writableBuffer);
2078     }
2079
2080     pEnd = data->fcdPosition;
2081
2082     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2083                               &status);
2084
2085     if (buffersize <= normLen + strsize) {
2086         uint32_t  size = strsize + normLen + 1;
2087         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2088         if(temp != NULL) {
2089           uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2090           freeHeapWritableBuffer(data);
2091           data->writableBuffer = temp;
2092           data->writableBufSize = size;
2093           data->flags |= UCOL_ITER_ALLOCATED;
2094         }
2095     }
2096
2097     status            = U_ZERO_ERROR;
2098     pStartNorm        = buffer + strsize;
2099     /* null-termination will be added here */
2100     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2101                     normLen + 1, &status);
2102
2103     data->pos        = data->writableBuffer + strsize;
2104     data->origFlags  = data->flags;
2105     data->flags     |= UCOL_ITER_INNORMBUF;
2106     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2107 }
2108
/**
* Contraction character management function that returns the next character
* for the forwards iterator.
* Does nothing if the next character is in buffer and not the first character
* in it.
* Else it checks next character in data string to see if it is normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
*
* NOTE(review): the pointers returned by insertBufferEnd are used without a
* NULL check here, although insertBufferEnd returns NULL when it cannot
* allocate a larger buffer.
* @param data collation element iterator data
* @return next character
*/
static
inline UChar getNextNormalizedChar(collIterate *data)
{
    UChar  nextch;
    UChar  ch;
    // Here we need to add the iterator code. One problem is the way
    // end of string is handled. If we just return next char, it could
    // be the sentinel. Most of the cases already check for this, but we
    // need to be sure.
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
         /* if no normalization and not in buffer. */
      if(data->flags & UCOL_USE_ITERATOR) {
         return (UChar)data->iterator->next(data->iterator);
      } else {
         return *(data->pos ++);
      }
    }

    //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
      //normalizeIterator(data);
    //}

    /* position of the side buffer's terminator, remembered when we switch
       back to reading from the source string further below */
    UChar  *pEndWritableBuffer = NULL;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    if ((innormbuf && *data->pos != 0) ||
        (data->fcdPosition != NULL && !innormbuf &&
        data->pos < data->fcdPosition)) {
        /*
        if next character is in normalized buffer, no further normalization
        is required
        */
        return *(data->pos ++);
    }

    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if (data->pos + 1 == data->endp) {
            /* last character of the string: nothing follows it to check */
            return *(data->pos ++);
        }
    }
    else {
        if (innormbuf) {
          // inside the normalization buffer, but at the end
          // (since we encountered zero). This means, in the
          // case we're using char iterator, that we need to
          // do another round of normalization.
          //if(data->origFlags & UCOL_USE_ITERATOR) {
            // we need to restore original flags,
            // otherwise, we'll lose them
            //data->flags = data->origFlags;
            //normalizeIterator(data);
            //return *(data->pos++);
          //} else {
            /*
            in writable buffer, at this point fcdPosition can not be
            pointing to the end of the data string. see contracting tag.
            */
          if(data->fcdPosition) {
            if (*(data->fcdPosition + 1) == 0 ||
                data->fcdPosition + 1 == data->endp) {
                /* at the end of the string, dump it into the normalizer */
                data->pos = insertBufferEnd(data, data->pos,
                                            *(data->fcdPosition)) + 1;
                return *(data->fcdPosition ++);
            }
            /* continue reading from the source string at fcdPosition, but
               remember where the side buffer ends so text can be appended */
            pEndWritableBuffer = data->pos;
            data->pos = data->fcdPosition;
          } else if(data->origFlags & UCOL_USE_ITERATOR) {
            // if we are here, we're using a normalizing iterator.
            // we should just continue further.
            data->flags = data->origFlags;
            data->pos = NULL;
            return (UChar)data->iterator->next(data->iterator);
          }
          // NOTE(review): if fcdPosition is NULL and the iterator is not in
          // use, pEndWritableBuffer stays NULL and is later handed to
          // insertBufferEnd - confirm this state cannot occur.
          //}
        }
        else {
            /* null-terminated source string: last character before the NUL */
            if (*(data->pos + 1) == 0) {
                return *(data->pos ++);
            }
        }
    }

    /* read the current character and peek at the one following it */
    ch = *data->pos ++;
    nextch = *data->pos;

    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
        (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
         ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
            /*
            Need a more complete FCD check and possible normalization.
            normalize substring will be appended to buffer
            */
        if (collIterFCD(data)) {
            normalizeNextContraction(data);
            return *(data->pos ++);
        }
        else if (innormbuf) {
            /* fcdposition shifted even when there's no normalization, if we
            don't input the rest into this, we'll get the wrong position when
            we reach the end of the writableBuffer */
            int32_t length = data->fcdPosition - data->pos + 1;
            data->pos = insertBufferEnd(data, pEndWritableBuffer,
                                        data->pos - 1, length);
            return *(data->pos ++);
        }
    }

    if (innormbuf) {
        /*
        no normalization is to be done hence only one character will be
        appended to the buffer.
        */
        data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
    }

    /* points back to the pos in string */
    return ch;
}
2244
2245
2246
2247 /**
2248 * Function to copy the buffer into writableBuffer and sets the fcd position to
2249 * the correct position
2250 * @param source data string source
2251 * @param buffer character buffer
2252 * @param tempdb current position in buffer that has been used up
2253 */
2254 static
2255 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2256                                      UChar *tempdb)
2257 {
2258     /* okay confusing part here. to ensure that the skipped characters are
2259     considered later, we need to place it in the appropriate position in the
2260     normalization buffer and reassign the pos pointer. simple case if pos
2261     reside in string, simply copy to normalization buffer and
2262     fcdposition = pos, pos = start of normalization buffer. if pos in
2263     normalization buffer, we'll insert the copy infront of pos and point pos
2264     to the start of the normalization buffer. why am i doing these copies?
2265     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2266     not require any changes, which be really painful. */
2267     uint32_t length = u_strlen(buffer);;
2268     if (source->flags & UCOL_ITER_INNORMBUF) {
2269         u_strcpy(tempdb, source->pos);
2270     }
2271     else {
2272         source->fcdPosition  = source->pos;
2273         source->origFlags    = source->flags;
2274         source->flags       |= UCOL_ITER_INNORMBUF;
2275         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2276     }
2277
2278     if (length >= source->writableBufSize) {
2279         freeHeapWritableBuffer(source);
2280         source->writableBuffer =
2281                      (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2282         if(source->writableBuffer == NULL) {
2283           return;
2284         }
2285         source->writableBufSize = length;
2286     }
2287
2288     u_strcpy(source->writableBuffer, buffer);
2289     source->pos = source->writableBuffer;
2290 }
2291
2292 /**
2293 * Function to get the discontiguos collation element within the source.
2294 * Note this function will set the position to the appropriate places.
2295 * @param coll current collator used
2296 * @param source data string source
2297 * @param constart index to the start character in the contraction table
2298 * @return discontiguos collation element offset
2299 */
2300 static
2301 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2302                                 const UChar *constart)
2303 {
2304     /* source->pos currently points to the second combining character after
2305        the start character */
2306           UChar   *temppos      = source->pos;
2307           UChar    buffer[4*UCOL_MAX_BUFFER];
2308           UChar   *tempdb       = buffer;
2309     const UChar   *tempconstart = constart;
2310           uint8_t  tempflags    = source->flags;
2311           UBool    multicontraction = FALSE;
2312           UChar   *tempbufferpos = 0;
2313           collIterateState discState;
2314
2315           backupState(source, &discState);
2316
2317     //*tempdb = *(source->pos - 1);
2318           *tempdb = peekCharacter(source, -1);
2319     tempdb ++;
2320     while (TRUE) {
2321         UChar    *UCharOffset;
2322         UChar     schar,
2323                   tchar;
2324         uint32_t  result;
2325
2326         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2327             || (peekCharacter(source, 0) == 0  &&
2328             //|| (*source->pos == 0  &&
2329                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2330                  source->fcdPosition == NULL ||
2331                  source->fcdPosition == source->endp ||
2332                  *(source->fcdPosition) == 0 ||
2333                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2334                  /* end of string in null terminated string or stopped by a
2335                  null character, note fcd does not always point to a base
2336                  character after the discontiguos change */
2337                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2338                  //u_getCombiningClass(*(source->pos)) == 0) {
2339             //constart = (UChar *)coll->image + getContractOffset(CE);
2340             if (multicontraction) {
2341                 *tempbufferpos = 0;
2342                 source->pos    = temppos - 1;
2343                 setDiscontiguosAttribute(source, buffer, tempdb);
2344                 return *(coll->contractionCEs +
2345                                     (tempconstart - coll->contractionIndex));
2346             }
2347             constart = tempconstart;
2348             break;
2349         }
2350
2351         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2352         schar = getNextNormalizedChar(source);
2353
2354         while (schar > (tchar = *UCharOffset)) {
2355             UCharOffset++;
2356         }
2357
2358         if (schar != tchar) {
2359             /* not the correct codepoint. we stuff the current codepoint into
2360             the discontiguos buffer and try the next character */
2361             *tempdb = schar;
2362             tempdb ++;
2363             continue;
2364         }
2365         else {
2366             if (u_getCombiningClass(schar) ==
2367                 u_getCombiningClass(peekCharacter(source, -2))) {
2368                 //u_getCombiningClass(*(source->pos - 2))) {
2369                 *tempdb = schar;
2370                 tempdb ++;
2371                 continue;
2372             }
2373             result = *(coll->contractionCEs +
2374                                       (UCharOffset - coll->contractionIndex));
2375         }
2376         *tempdb = 0;
2377
2378         if (result == UCOL_NOT_FOUND) {
2379           break;
2380         } else if (isContraction(result)) {
2381             /* this is a multi-contraction*/
2382             tempconstart = (UChar *)coll->image + getContractOffset(result);
2383             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2384                 != UCOL_NOT_FOUND) {
2385                 multicontraction = TRUE;
2386                 temppos       = source->pos + 1;
2387                 tempbufferpos = buffer + u_strlen(buffer);
2388             }
2389         } else {
2390             setDiscontiguosAttribute(source, buffer, tempdb);
2391             return result;
2392         }
2393     }
2394
2395     /* no problems simply reverting just like that,
2396     if we are in string before getting into this function, points back to
2397     string hence no problem.
2398     if we are in normalization buffer before getting into this function,
2399     since we'll never use another normalization within this function, we
2400     know that fcdposition points to a base character. the normalization buffer
2401     never change, hence this revert works. */
2402     loadState(source, &discState, TRUE);
2403     goBackOne(source);
2404
2405     //source->pos   = temppos - 1;
2406     source->flags = tempflags;
2407     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2408 }
2409
2410 static
2411 inline UBool isNonChar(UChar32 cp) {
2412   if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2413     return TRUE;
2414   }
2415   return FALSE;
2416 }
2417
2418 /* now uses Mark's getImplicitPrimary code */
2419 static
2420 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2421   if(isNonChar(cp)) {
2422     return 0;
2423   }
2424   uint32_t r = uprv_uca_getImplicitPrimary(cp);
2425   *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2426   return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2427 }
2428
2429 /**
2430 * Inserts the argument character into the front of the buffer replacing the
2431 * front null terminator.
2432 * @param data collation element iterator data
2433 * @param pNull pointer to the null terminator
2434 * @param ch character to be appended
2435 * @return positon of added character
2436 */
2437 static
2438 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2439 {
2440           uint32_t  size    = data->writableBufSize;
2441           UChar    *end;
2442           UChar    *newbuffer;
2443     const uint32_t  incsize = 5;
2444
2445     if (pNull > data->writableBuffer + 1) {
2446         *pNull       = ch;
2447         *(pNull - 1) = 0;
2448         return pNull;
2449     }
2450
2451     /*
2452     buffer will always be null terminated infront.
2453     giving extra space since it is likely that more characters will be added.
2454     */
2455     size += incsize;
2456     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2457     if(newbuffer == NULL) {
2458       return NULL;
2459     }
2460     end = newbuffer + incsize;
2461     uprv_memcpy(end, data->writableBuffer,
2462                 data->writableBufSize * sizeof(UChar));
2463     *end       = ch;
2464     *(end - 1) = 0;
2465
2466     freeHeapWritableBuffer(data);
2467
2468     data->writableBufSize = size;
2469     data->writableBuffer  = newbuffer;
2470     return end;
2471 }
2472
2473 /**
2474 * Special normalization function for contraction in the previous iterator.
2475 * This normalization sequence will place the current character at source->pos
2476 * and its following normalized sequence into the buffer.
2477 * The fcd position, pos will be changed.
2478 * pos will now point to positions in the buffer.
2479 * Flags will be changed accordingly.
2480 * @param data collation iterator data
2481 */
2482 static
2483 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2484 {
2485     UChar      *buffer     = data->writableBuffer;
2486     uint32_t    buffersize = data->writableBufSize;
2487     uint32_t    nulltermsize;
2488     UErrorCode  localstatus = U_ZERO_ERROR;
2489     UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2490     UChar      *pStart;
2491     uint32_t    normLen;
2492     UChar      *pStartNorm;
2493
2494     if (data->flags & UCOL_ITER_HASLEN) {
2495         /*
2496         normalization buffer not used yet, we'll pull down the next
2497         character into the end of the buffer
2498         */
2499         *(buffer + (buffersize - 1)) = *(data->pos + 1);
2500         nulltermsize                  = buffersize - 1;
2501     }
2502     else {
2503         nulltermsize = buffersize;
2504         UChar *temp = buffer + (nulltermsize - 1);
2505         while (*(temp --) != 0) {
2506             nulltermsize --;
2507         }
2508     }
2509
2510     /* Start normalize */
2511     if (data->fcdPosition == NULL) {
2512         pStart = data->string;
2513     }
2514     else {
2515         pStart = data->fcdPosition + 1;
2516     }
2517
2518     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2519                               &localstatus);
2520
2521     if (nulltermsize <= normLen) {
2522         uint32_t  size = buffersize - nulltermsize + normLen + 1;
2523         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2524         if (temp == NULL) {
2525             *status = U_MEMORY_ALLOCATION_ERROR;
2526             return;
2527         }
2528         nulltermsize   = normLen + 1;
2529         uprv_memcpy(temp + normLen, buffer,
2530                     sizeof(UChar) * (buffersize - nulltermsize));
2531         freeHeapWritableBuffer(data);
2532         data->writableBuffer = temp;
2533         data->writableBufSize = size;
2534     }
2535
2536     /*
2537     this puts the null termination infront of the normalized string instead
2538     of the end
2539     */
2540     pStartNorm   = buffer + (nulltermsize - normLen);
2541     *(pStartNorm - 1) = 0;
2542     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2543                     status);
2544
2545     data->pos        = data->writableBuffer + nulltermsize;
2546     data->origFlags  = data->flags;
2547     data->flags     |= UCOL_ITER_INNORMBUF;
2548     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2549 }
2550
/**
* Contraction character management function that returns the previous character
* for the backwards iterator.
* Does nothing if the previous character is in buffer and not the first
* character in it.
* Else it checks previous character in data string to see if it is
* normalizable.
* If it is not, the character is simply copied into the buffer, else
* the whole normalized substring is copied into the buffer, including the
* current character.
* @param data collation element iterator data
* @param status error code; only handed on to normalizePrevContraction when
*        a substring has to be normalized
* @return previous character
*/
static
inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
{
    UChar  prevch;
    UChar  ch;
    UChar *start;
    UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
    /* stays NULL unless the "in writable buffer" branch below sets it */
    UChar *pNull = NULL;
    if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
        (innormbuf && *(data->pos - 1) != 0)) {
        /*
        if no normalization.
        if previous character is in normalized buffer, no further normalization
        is required
        */
      if(data->flags & UCOL_USE_ITERATOR) {
        /* iterator-backed source: step back by one, then return what next() yields */
        data->iterator->move(data->iterator, -1, UITER_CURRENT);
        return (UChar)data->iterator->next(data->iterator);
      } else {
        return *(data->pos - 1);
      }
    }

    start = data->pos;
    if (data->flags & UCOL_ITER_HASLEN) {
        /* in data string */
        if ((start - 1) == data->string) {
            /* previous character is the first one of the string: nothing precedes it */
            return *(start - 1);
        }
        start --;
        ch     = *start;
        prevch = *(start - 1);
    }
    else {
        /*
        in writable buffer, at this point fcdPosition can not be NULL.
        see contracting tag.
        */
        if (data->fcdPosition == data->string) {
            /* at the start of the string, just dump it into the normalizer */
            /* NOTE(review): the return value of insertBufferFront is ignored
               here; if it had to reallocate (or failed and returned NULL),
               data->pos would still refer to the old buffer - confirm */
            insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
            data->fcdPosition = NULL;
            return *(data->pos - 1);
        }
        pNull  = data->pos - 1;
        start  = data->fcdPosition;
        ch     = *start;
        prevch = *(start - 1);
    }
    /*
    * if the current character is not fcd.
    * Trailing combining class == 0.
    */
    /* NOTE(review): on the writable-buffer path start was just set to
       data->fcdPosition, so "fcdPosition > start" cannot hold there and this
       check is only reachable from the HASLEN path - confirm this is intended */
    if (data->fcdPosition > start &&
       (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
    {
        /*
        Need a more complete FCD check and possible normalization.
        normalize substring will be appended to buffer
        */
        UChar *backuppos = data->pos;
        data->pos = start;
        if (collPrevIterFCD(data)) {
            normalizePrevContraction(data, status);
            return *(data->pos - 1);
        }
        /* FCD check passed without normalization: restore pos, bump fcdPosition */
        data->pos = backuppos;
        data->fcdPosition ++;
    }

    if (innormbuf) {
    /*
    no normalization is to be done hence only one character will be
    appended to the buffer.
    */
        /* NOTE(review): pNull is still NULL when the HASLEN branch was taken
           above; presumably HASLEN and INNORMBUF are never both set - verify.
           The return value of insertBufferFront is not checked here either. */
        insertBufferFront(data, pNull, ch);
        data->fcdPosition --;
    }

    return ch;
}
2645
2646 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2647 /* It is called by getNextCE */
2648
2649 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2650   collIterateState entryState;
2651   backupState(source, &entryState);
2652   UChar32 cp = ch;
2653
2654   for (;;) {
2655     // This loop will repeat only in the case of contractions, and only when a contraction
2656     //   is found and the first CE resulting from that contraction is itself a special
2657     //   (an expansion, for example.)  All other special CE types are fully handled the
2658     //   first time through, and the loop exits.
2659
2660     const uint32_t *CEOffset = NULL;
2661     switch(getCETag(CE)) {
2662     case NOT_FOUND_TAG:
2663       /* This one is not found, and we'll let somebody else bother about it... no more games */
2664       return CE;
2665     case SURROGATE_TAG:
2666       /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2667       /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2668       /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2669       /* we return 0 (completely ignorable - per UCA specification */
2670       {
2671         UChar trail;
2672         collIterateState state;
2673         backupState(source, &state);
2674         if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2675           // we chould have stepped one char forward and it might have turned that it
2676           // was not a trail surrogate. In that case, we have to backup.
2677           loadState(source, &state, TRUE);
2678           return 0;
2679         } else {
2680           /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2681           CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
2682           if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2683             // We need to backup
2684             loadState(source, &state, TRUE);
2685             return CE;
2686           }
2687           // calculate the supplementary code point value, if surrogate was not tailored
2688           cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2689         }
2690       }
2691       break;
2692     case SPEC_PROC_TAG:
2693       {
2694         // Special processing is getting a CE that is preceded by a certain prefix
2695         // Currently this is only needed for optimizing Japanese length and iteration marks.
2696         // When we encouter a special processing tag, we go backwards and try to see if
2697         // we have a match.
2698         // Contraction tables are used - so the whole process is not unlike contraction.
2699         // prefix data is stored backwards in the table.
2700         const UChar *UCharOffset;
2701         UChar schar, tchar;
2702         collIterateState prefixState;
2703         backupState(source, &prefixState);
2704         loadState(source, &entryState, TRUE);
2705         goBackOne(source); // We want to look at the point where we entered - actually one
2706         // before that...
2707
2708         for(;;) {
2709         // This loop will run once per source string character, for as long as we
2710         //  are matching a potential contraction sequence
2711
2712           // First we position ourselves at the begining of contraction sequence
2713           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2714           if (collIter_bos(source)) {
2715             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2716             break;
2717           }
2718           schar = getPrevNormalizedChar(source, status);
2719           goBackOne(source);
2720
2721           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2722             UCharOffset++;
2723           }
2724
2725           if (schar == tchar) {
2726               // Found the source string char in the table.
2727               //  Pick up the corresponding CE from the table.
2728               CE = *(coll->contractionCEs +
2729                   (UCharOffset - coll->contractionIndex));
2730           }
2731           else
2732           {
2733               // Source string char was not in the table.
2734               //   We have not found the prefix.
2735               CE = *(coll->contractionCEs +
2736                   (ContractionStart - coll->contractionIndex));
2737           }
2738
2739           if(!isPrefix(CE)) {
2740               // The source string char was in the contraction table, and the corresponding
2741               //   CE is not a prefix CE.  We found the prefix, break
2742               //   out of loop, this CE will end up being returned.  This is the normal
2743               //   way out of prefix handling when the source actually contained
2744               //   the prefix.
2745               break;
2746           }
2747         }
2748         if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2749           loadState(source, &prefixState, TRUE);
2750           if(source->origFlags & UCOL_USE_ITERATOR) {
2751             source->flags = source->origFlags;
2752           }
2753         } else { // prefix search was a failure, we have to backup all the way to the start
2754           loadState(source, &entryState, TRUE);
2755         }
2756       break;
2757       }
2758     case CONTRACTION_TAG:
2759       {
2760       /* This should handle contractions */
2761       collIterateState state;
2762       backupState(source, &state);
2763       uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2764       const UChar *UCharOffset;
2765       UChar schar, tchar;
2766
2767       for (;;) {
2768         /* This loop will run once per source string character, for as long as we     */
2769         /*  are matching a potential contraction sequence                  */
2770
2771         /* First we position ourselves at the begining of contraction sequence */
2772         const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2773
2774         if (collIter_eos(source)) {
2775             // Ran off the end of the source string.
2776             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2777             // So we'll pick whatever we have at the point...
2778             if (CE == UCOL_NOT_FOUND) {
2779                 // back up the source over all the chars we scanned going into this contraction.
2780                 CE = firstCE;
2781                 loadState(source, &state, TRUE);
2782                 if(source->origFlags & UCOL_USE_ITERATOR) {
2783                     source->flags = source->origFlags;
2784                 }
2785             }
2786             break;
2787         }
2788
2789         uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2790         uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2791
2792         schar = getNextNormalizedChar(source);
2793         while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2794           UCharOffset++;
2795         }
2796
2797         if (schar == tchar) {
2798             // Found the source string char in the contraction table.
2799             //  Pick up the corresponding CE from the table.
2800             CE = *(coll->contractionCEs +
2801                 (UCharOffset - coll->contractionIndex));
2802         }
2803         else
2804         {
2805             // Source string char was not in contraction table.
2806             //   Unless we have a discontiguous contraction, we have finished
2807             //   with this contraction.
2808             UChar32 miss = schar;
2809             if(U16_IS_LEAD(schar)) { // in order to do the proper detection, we
2810               // need to see if we're dealing with a supplementary
2811               miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2812             }
2813
2814             uint8_t sCC;
2815             if (miss < 0x300 ||
2816                 maxCC == 0 ||
2817                 (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2818                 sCC>maxCC ||
2819                 (allSame != 0 && sCC == maxCC) ||
2820                 collIter_eos(source)) {
2821                     //  Contraction can not be discontiguous.
2822                     goBackOne(source);  // back up the source string by one,
2823                                         //  because  the character we just looked at was
2824                                         //  not part of the contraction.   */
2825                     if(U_IS_SUPPLEMENTARY(miss)) {
2826                       goBackOne(source);
2827                     }
2828                     CE = *(coll->contractionCEs +
2829                         (ContractionStart - coll->contractionIndex));
2830             } else {
2831                 //
2832                 // Contraction is possibly discontiguous.
2833                 //   Scan more of source string looking for a match
2834                 //
2835                 UChar tempchar;
2836                 /* find the next character if schar is not a base character
2837                     and we are not yet at the end of the string */
2838                 tempchar = getNextNormalizedChar(source);
2839                 // probably need another supplementary thingie here
2840                 goBackOne(source);
2841                 if (i_getCombiningClass(tempchar, coll) == 0) {
2842                     goBackOne(source);
2843                     if(U_IS_SUPPLEMENTARY(miss)) {
2844                       goBackOne(source);
2845                     }
2846                     /* Spit out the last char of the string, wasn't tasty enough */
2847                     CE = *(coll->contractionCEs +
2848                         (ContractionStart - coll->contractionIndex));
2849                 } else {
2850                     CE = getDiscontiguous(coll, source, ContractionStart);
2851                 }
2852             }
2853         } // else after if(schar == tchar)
2854
2855         if(CE == UCOL_NOT_FOUND) {
2856             /* The Source string did not match the contraction that we were checking.  */
2857             /*  Back up the source position to undo the effects of having partially    */
2858             /*   scanned through what ultimately proved to not be a contraction.       */
2859           loadState(source, &state, TRUE);
2860           CE = firstCE;
2861           break;
2862         }
2863
2864         if(!isContraction(CE)) {
2865             // The source string char was in the contraction table, and the corresponding
2866             //   CE is not a contraction CE.  We completed the contraction, break
2867             //   out of loop, this CE will end up being returned.  This is the normal
2868             //   way out of contraction handling when the source actually contained
2869             //   the contraction.
2870             break;
2871         }
2872
2873
2874         // The source string char was in the contraction table, and the corresponding
2875         //   CE is IS  a contraction CE.  We will continue looping to check the source
2876         //   string for the remaining chars in the contraction.
2877         uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2878         if(tempCE != UCOL_NOT_FOUND) {
2879             // We have scanned a a section of source string for which there is a
2880             //  CE from the contraction table.  Remember the CE and scan position, so
2881             //  that we can return to this point if further scanning fails to
2882             //  match a longer contraction sequence.
2883             firstCE = tempCE;
2884
2885             goBackOne(source);
2886             backupState(source, &state);
2887             getNextNormalizedChar(source);
2888
2889             // Another way to do this is:
2890             //collIterateState tempState;
2891             //backupState(source, &tempState);
2892             //goBackOne(source);
2893             //backupState(source, &state);
2894             //loadState(source, &tempState, TRUE);
2895
2896             // The problem is that for incomplete contractions we have to remember the previous
2897             // position. Before, the only thing I needed to do was state.pos--;
2898             // After iterator introduction and especially after introduction of normalizing
2899             // iterators, it became much more difficult to decrease the saved state.
2900             // I'm not yet sure which of the two methods above is faster.
2901         }
2902       } // for(;;)
2903       break;
2904       } // case CONTRACTION_TAG:
2905     case LONG_PRIMARY_TAG:
2906       {
2907         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2908         CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2909         return CE;
2910       }
2911     case EXPANSION_TAG:
2912       {
2913       /* This should handle expansion. */
2914       /* NOTE: we can encounter both continuations and expansions in an expansion! */
2915       /* I have to decide where continuations are going to be dealt with */
2916       uint32_t size;
2917       uint32_t i;    /* general counter */
2918       CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2919       size = getExpansionCount(CE);
2920       CE = *CEOffset++;
2921       if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2922         for(i = 1; i<size; i++) {
2923           *(source->CEpos++) = *CEOffset++;
2924         }
2925       } else { /* else, we do */
2926         while(*CEOffset != 0) {
2927           *(source->CEpos++) = *CEOffset++;
2928         }
2929       }
2930       return CE;
2931       }
2932     case DIGIT_TAG:
2933       {
2934       /*
2935          We do a check to see if we want to collate digits as numbers; if so we generate
2936          a custom collation key. Otherwise we pull out the value stored in the expansion table.
2937       */
2938       uint32_t size;
2939       uint32_t i;    /* general counter */
2940
2941       if (source->coll->numericCollation == UCOL_ON){
2942         collIterateState digitState = {0,0,0,0,0,0,0,0};
2943         UChar32 char32 = 0;
2944
2945         uint32_t digIndx = 0;
2946         uint32_t endIndex = 0;
2947         uint32_t trailingZeroIndex = 0;
2948
2949         uint32_t primWeight = 0;
2950
2951         int32_t digVal = 0;
2952         uint8_t collateVal = 0;
2953
2954         UBool nonZeroValReached = FALSE;
2955
2956         uint8_t *numTempBuf;
2957         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2958         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2959
2960         numTempBuf = stackNumTempBuf;
2961         /*
2962              We parse the source string until we hit a char that's NOT a digit.
2963             Use this u_charDigitValue. This might be slow because we have to
2964             handle surrogates...
2965         */
2966 /*
2967         if (U16_IS_LEAD(ch)){
2968           if (!collIter_eos(source)) {
2969             backupState(source, &digitState);
2970             UChar trail = getNextNormalizedChar(source);
2971             if(U16_IS_TRAIL(trail)) {
2972               char32 = U16_GET_SUPPLEMENTARY(ch, trail);
2973             } else {
2974               loadState(source, &digitState, TRUE);
2975               char32 = ch;
2976             }
2977           } else {
2978             char32 = ch;
2979           }
2980         } else {
2981           char32 = ch;
2982         }
2983         digVal = u_charDigitValue(char32);
2984 */
2985         digVal = u_charDigitValue(cp); // if we have arrived here, we have
2986         // already processed possible supplementaries that trigered the digit tag -
2987         // all supplementaries are marked in the UCA.
2988         /*
2989             We  pad a zero in front of the first element anyways. This takes
2990             care of the (probably) most common case where people are sorting things followed
2991             by a single digit
2992         */
2993         digIndx++;
2994         for(;;){
2995             // Make sure we have enough space.
2996             if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2997             {
2998                 numTempBufSize *= 2;
2999                 if (numTempBuf == stackNumTempBuf){
3000                     numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3001                     uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3002                 } else {
3003                     uprv_realloc(numTempBuf, numTempBufSize);
3004                 }
3005             }
3006
3007             // Skipping over leading zeroes.
3008             if (digVal != 0) {
3009                 nonZeroValReached = TRUE;
3010             }
3011             if (nonZeroValReached) {
3012                 /*
3013                     We parse the digit string into base 100 numbers (this fits into a byte).
3014                     We only add to the buffer in twos, thus if we are parsing an odd character,
3015                     that serves as the 'tens' digit while the if we are parsing an even one, that
3016                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3017                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3018                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3019                     than all the other bytes.
3020                  */
3021
3022                 if (digIndx % 2 == 1){
3023                     collateVal += (uint8_t)digVal;
3024
3025                     // We don't enter the low-order-digit case unless we've already seen
3026                     // the high order, or for the first digit, which is always non-zero.
3027                     if (collateVal != 0)
3028                         trailingZeroIndex = 0;
3029
3030                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3031                     collateVal = 0;
3032                 }
3033                 else{
3034                     // We drop the collation value into the buffer so if we need to do
3035                     // a "front patch" we don't have to check to see if we're hitting the
3036                     // last element.
3037                     collateVal = (uint8_t)(digVal * 10);
3038
3039                     // Check for trailing zeroes.
3040                     if (collateVal == 0)
3041                     {
3042                         if (!trailingZeroIndex)
3043                             trailingZeroIndex = (digIndx/2) + 2;
3044                     }
3045                     else
3046                         trailingZeroIndex = 0;
3047
3048                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3049                 }
3050                 digIndx++;
3051             }
3052
3053             // Get next character.
3054             if (!collIter_eos(source)){
3055                 ch = getNextNormalizedChar(source);
3056                 if (U16_IS_LEAD(ch)){
3057                   if (!collIter_eos(source)) {
3058                     backupState(source, &digitState);
3059                     UChar trail = getNextNormalizedChar(source);
3060                     if(U16_IS_TRAIL(trail)) {
3061                       char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3062                     } else {
3063                       loadState(source, &digitState, TRUE);
3064                       char32 = ch;
3065                     }
3066                   }
3067                 } else {
3068                   char32 = ch;
3069                 }
3070
3071                 if ((digVal = u_charDigitValue(char32)) == -1){
3072                     // Resetting position to point to the next unprocessed char. We
3073                     // overshot it when doing our test/set for numbers.
3074                   if (char32 > 0xFFFF) { // For surrogates.
3075                     loadState(source, &digitState, TRUE);
3076                     //goBackOne(source);
3077                   }
3078                   goBackOne(source);
3079                   break;
3080                 }
3081             } else {
3082               break;
3083             }
3084         }
3085
3086         if (nonZeroValReached == FALSE){
3087             digIndx = 2;
3088             numTempBuf[2] = 6;
3089         }
3090
3091         endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3092         if (digIndx % 2 != 0){
3093             /*
3094                 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3095                 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3096                 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3097                 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3098             */
3099
3100             for(i = 2; i < endIndex; i++){
3101                 numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3102                                     (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3103             }
3104             --digIndx;
3105         }
3106
3107         // Subtract one off of the last byte.
3108         numTempBuf[endIndex-1] -= 1;
3109
3110         /*
3111             We want to skip over the first two slots in the buffer. The first slot
3112             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3113             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3114         */
3115         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3116         numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3117
3118         // Now transfer the collation key to our collIterate struct.
3119         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3120           size = ((endIndex+1) & ~1)/2;
3121           CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3122                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3123                 UCOL_BYTE_COMMON; // Tertiary weight.
3124           i = 2; // Reset the index into the buffer.
3125           while(i < endIndex)
3126           {
3127             primWeight = numTempBuf[i++] << 8;
3128             if ( i < endIndex)
3129                 primWeight |= numTempBuf[i++];
3130             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3131           }
3132
3133           if (numTempBuf != stackNumTempBuf)
3134             uprv_free(numTempBuf);
3135       } else {
3136         // no numeric mode, we'll just switch to whatever we stashed and continue
3137           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3138           CE = *CEOffset++;
3139           break;
3140       }
3141       return CE;
3142       }
3143     /* various implicits optimization */
3144     // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3145     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3146       //return getImplicit(cp, source, 0x04000000);
3147       return getImplicit(cp, source);
3148     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3149       /* UCA is filled with these. Tailorings are NOT_FOUND */
3150       //return getImplicit(cp, source, 0);
3151       return getImplicit(cp, source);
3152     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3153       return 0; /* broken surrogate sequence */
3154     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3155       UChar nextChar;
3156       if( source->flags & UCOL_USE_ITERATOR) {
3157         if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3158           cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3159           source->iterator->next(source->iterator);
3160           return getImplicit(cp, source);
3161         }  else {
3162           return 0;
3163         }
3164       } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3165         U_IS_TRAIL((nextChar=*source->pos))) {
3166         cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3167         source->pos++;
3168         return getImplicit(cp, source);
3169       } else {
3170         return 0; /* completely ignorable */
3171       }
3172     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3173       {
3174         const uint32_t
3175           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3176         //const uint32_t LCount = 19;
3177         const uint32_t VCount = 21;
3178         const uint32_t TCount = 28;
3179         //const uint32_t NCount = VCount * TCount;   // 588
3180         //const uint32_t SCount = LCount * NCount;   // 11172
3181         uint32_t L = ch - SBase;
3182
3183         // divide into pieces
3184
3185         uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3186         L /= TCount;
3187         uint32_t V = L % VCount;
3188         L /= VCount;
3189
3190         // offset them
3191
3192         L += LBase;
3193         V += VBase;
3194         T += TBase;
3195
3196         // return the first CE, but first put the rest into the expansion buffer
3197         if (!source->coll->image->jamoSpecial) { // FAST PATH
3198
3199           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3200           if (T != TBase) {
3201               *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3202           }
3203
3204           return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3205
3206         } else { // Jamo is Special
3207           // Since Hanguls pass the FCD check, it is
3208           // guaranteed that we won't be in
3209           // the normalization buffer if something like this happens
3210           // However, if we are using a uchar iterator and normalization
3211           // is ON, the Hangul that lead us here is going to be in that
3212           // normalization buffer. Here we want to restore the uchar
3213           // iterator state and pull out of the normalization buffer
3214           if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3215             source->flags = source->origFlags; // restore the iterator
3216             source->pos = NULL;
3217           }
3218           // Move Jamos into normalization buffer
3219           source->writableBuffer[0] = (UChar)L;
3220           source->writableBuffer[1] = (UChar)V;
3221           if (T != TBase) {
3222             source->writableBuffer[2] = (UChar)T;
3223             source->writableBuffer[3] = 0;
3224           } else {
3225             source->writableBuffer[2] = 0;
3226           }
3227
3228           source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3229                                                          //   after exhausting the writableBuffer
3230           source->pos   = source->writableBuffer;
3231           source->origFlags         = source->flags;
3232           source->flags            |= UCOL_ITER_INNORMBUF;
3233           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3234
3235           return(UCOL_IGNORABLE);
3236         }
3237       }
3238     case CHARSET_TAG:
3239     /* not yet implemented */
3240       /* probably after 1.8 */
3241       return UCOL_NOT_FOUND;
3242     default:
3243       *status = U_INTERNAL_PROGRAM_ERROR;
3244       CE=0;
3245       break;
3246     }
3247     if (CE <= UCOL_NOT_FOUND) break;
3248   }
3249   return CE;
3250 }
3251
3252
3253 /* now uses Mark's getImplicitPrimary code */
3254 static
3255 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3256   if(isNonChar(cp)) {
3257     return 0;
3258   }
3259
3260   uint32_t r = uprv_uca_getImplicitPrimary(cp);
3261
3262   *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3263   collationSource->toReturn = collationSource->CEpos;
3264   return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3265 }
3266
3267 /**
3268  * This function handles the special CEs like contractions, expansions,
3269  * surrogates, Thai.
3270  * It is called by getPrevCE.
3271  */
3272 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3273                           collIterate *source,
3274                           UErrorCode *status)
3275 {
3276   const uint32_t *CEOffset    = NULL;
3277         UChar    *UCharOffset = NULL;
3278         UChar    schar;
3279   const UChar    *constart    = NULL;
3280         uint32_t size;
3281         UChar    buffer[UCOL_MAX_BUFFER];
3282         uint32_t *endCEBuffer;
3283         UChar   *strbuffer;
3284         int32_t noChars = 0;
3285
3286   for(;;)
3287   {
3288     /* the only ces that loops are thai and contractions */
3289     switch (getCETag(CE))
3290     {
3291     case NOT_FOUND_TAG:  /* this tag always returns */
3292       return CE;
3293     case SURROGATE_TAG:  /* This is a surrogate pair */
3294       /* essentialy an engaged lead surrogate. */
3295       /* if you have encountered it here, it means that a */
3296       /* broken sequence was encountered and this is an error */
3297       return 0;
3298     case SPEC_PROC_TAG:
3299       {
3300         // Special processing is getting a CE that is preceded by a certain prefix
3301         // Currently this is only needed for optimizing Japanese length and iteration marks.
3302         // When we encouter a special processing tag, we go backwards and try to see if
3303         // we have a match.
3304         // Contraction tables are used - so the whole process is not unlike contraction.
3305         // prefix data is stored backwards in the table.
3306         const UChar *UCharOffset;
3307         UChar schar, tchar;
3308         collIterateState prefixState;
3309         backupState(source, &prefixState);
3310         for(;;) {
3311         // This loop will run once per source string character, for as long as we
3312         //  are matching a potential contraction sequence
3313
3314           // First we position ourselves at the begining of contraction sequence
3315           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3316
3317           if (collIter_bos(source)) {
3318             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3319             break;
3320           }
3321           schar = getPrevNormalizedChar(source, status);
3322           goBackOne(source);
3323
3324           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3325             UCharOffset++;
3326           }
3327
3328           if (schar == tchar) {
3329               // Found the source string char in the table.
3330               //  Pick up the corresponding CE from the table.
3331               CE = *(coll->contractionCEs +
3332                   (UCharOffset - coll->contractionIndex));
3333           }
3334           else
3335           {
3336               // if there is a completely ignorable code point in the middle of
3337               // a prefix, we need to act as if it's not there
3338               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3339               // lone surrogates cannot be set to zero as it would break other processing
3340               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3341               // it's easy for BMP code points
3342               if(isZeroCE == 0) {
3343                 continue;
3344               } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3345                 // for supplementary code points, we have to check the next one
3346                 // situations where we are going to ignore
3347                 // 1. beginning of the string: schar is a lone surrogate
3348                 // 2. schar is a lone surrogate
3349                 // 3. schar is a trail surrogate in a valid surrogate sequence
3350                 //    that is explicitly set to zero.
3351                 if (!collIter_bos(source)) {
3352                   UChar lead;
3353                   if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3354                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3355                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3356                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3357                       if(finalCE == 0) {
3358                         // this is a real, assigned completely ignorable code point
3359                         goBackOne(source);
3360                         continue;
3361                       }
3362                     }
3363                   } else {
3364                     // lone surrogate, completely ignorable
3365                     continue;
3366                   }
3367                 } else {
3368                   // lone surrogate at the beggining, completely ignorable
3369                   continue;
3370                 }
3371               }
3372               // Source string char was not in the table.
3373               //   We have not found the prefix.
3374               CE = *(coll->contractionCEs +
3375                   (ContractionStart - coll->contractionIndex));
3376           }
3377
3378           if(!isPrefix(CE)) {
3379               // The source string char was in the contraction table, and the corresponding
3380               //   CE is not a prefix CE.  We found the prefix, break
3381               //   out of loop, this CE will end up being returned.  This is the normal
3382               //   way out of prefix handling when the source actually contained
3383               //   the prefix.
3384               break;
3385           }
3386         }
3387       loadState(source, &prefixState, TRUE);
3388       break;
3389       }
3390
3391     case CONTRACTION_TAG:
3392         /* to ensure that the backwards and forwards iteration matches, we
3393         take the current region of most possible match and pass it through
3394         the forward iteration. this will ensure that the obstinate problem of
3395         overlapping contractions will not occur.
3396         */
3397         schar = peekCharacter(source, 0);
3398         constart = (UChar *)coll->image + getContractOffset(CE);
3399         if (isAtStartPrevIterate(source)
3400             /* commented away contraction end checks after adding the checks
3401             in getPrevCE  */) {
3402             /* start of string or this is not the end of any contraction */
3403             CE = *(coll->contractionCEs +
3404                      (constart - coll->contractionIndex));
3405             break;
3406         }
3407         strbuffer = buffer;
3408         UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3409         *(UCharOffset --) = 0;
3410         noChars = 0;
3411         // have to swap thai characters
3412         while (ucol_unsafeCP(schar, coll)) {
3413             *(UCharOffset) = schar;
3414             noChars++;
3415             UCharOffset --;
3416             schar = getPrevNormalizedChar(source, status);
3417             goBackOne(source);
3418             // TODO: when we exhaust the contraction buffer,
3419             // it needs to get reallocated. The problem is
3420             // that the size depends on the string which is
3421             // not iterated over. However, since we're travelling
3422             // backwards, we already had to set the iterator at
3423             // the end - so we might as well know where we are?
3424             if (UCharOffset + 1 == buffer) {
3425                 /* we have exhausted the buffer */
3426               int32_t newsize = 0;
3427               if(source->pos) { // actually dealing with a position
3428                 newsize = source->pos - source->string + 1;
3429               } else { // iterator
3430                 newsize = 4 * UCOL_MAX_BUFFER;
3431               }
3432                 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3433                                              (newsize + UCOL_MAX_BUFFER));
3434                 /* test for NULL */
3435                 if (strbuffer == NULL) {
3436                     *status = U_MEMORY_ALLOCATION_ERROR;
3437                     return UCOL_NO_MORE_CES;
3438                 }
3439                 UCharOffset = strbuffer + newsize;
3440                 uprv_memcpy(UCharOffset, buffer,
3441                                              UCOL_MAX_BUFFER * sizeof(UChar));
3442                 UCharOffset --;
3443             }
3444             if ((source->pos && (source->pos == source->string ||
3445                 ((source->flags & UCOL_ITER_INNORMBUF) &&
3446                 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3447                 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3448                 break;
3449             }
3450         }
3451         /* adds the initial base character to the string */
3452         *(UCharOffset) = schar;
3453         noChars++;
3454
3455         /* a new collIterate is used to simplify things, since using the current
3456         collIterate will mean that the forward and backwards iteration will
3457         share and change the same buffers. we don't want to get into that. */
3458         collIterate temp;
3459         //IInit_collIterate(coll, UCharOffset, -1, &temp);
3460         IInit_collIterate(coll, UCharOffset, noChars, &temp);
3461         temp.flags &= ~UCOL_ITER_NORM;
3462
3463         CE = ucol_IGetNextCE(coll, &temp, status);
3464         endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3465         while (CE != UCOL_NO_MORE_CES) {
3466             *(source->CEpos ++) = CE;
3467             if (source->CEpos == endCEBuffer) {
3468                 /* ran out of CE space, bail.
3469                 there's no guarantee of the right character position after
3470                 this bail*/
3471                 *status = U_BUFFER_OVERFLOW_ERROR;
3472                 source->CEpos = source->CEs;
3473                 freeHeapWritableBuffer(&temp);
3474                 if (strbuffer != buffer) {
3475                     uprv_free(strbuffer);
3476                 }
3477                 return (uint32_t)UCOL_NULLORDER;
3478             }
3479             CE = ucol_IGetNextCE(coll, &temp, status);
3480         }
3481         freeHeapWritableBuffer(&temp);
3482         if (strbuffer != buffer) {
3483             uprv_free(strbuffer);
3484         }
3485         source->toReturn = source->CEpos - 1;
3486         if (source->toReturn == source->CEs) {
3487             source->CEpos = source->CEs;
3488         }
3489         return *(source->toReturn);
3490     case LONG_PRIMARY_TAG:
3491       {
3492         *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3493         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3494         source->toReturn = source->CEpos - 1;
3495         return *(source->toReturn);
3496       }
3497     case EXPANSION_TAG: /* this tag always returns */
3498       /*
3499       This should handle expansion.
3500       NOTE: we can encounter both continuations and expansions in an expansion!
3501       I have to decide where continuations are going to be dealt with
3502       */
3503       /* find the offset to expansion table */
3504       CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3505       size     = getExpansionCount(CE);
3506       if (size != 0) {
3507         /*
3508         if there are less than 16 elements in expansion, we don't terminate
3509         */
3510         uint32_t count;
3511         for (count = 0; count < size; count++) {
3512           *(source->CEpos ++) = *CEOffset++;
3513         }
3514       }
3515       else {
3516         /* else, we do */
3517         while (*CEOffset != 0) {
3518           *(source->CEpos ++) = *CEOffset ++;
3519         }
3520       }
3521       source->toReturn = source->CEpos - 1;
3522       // in case of one element expansion, we
3523       // want to immediately return CEpos
3524       if(source->toReturn == source->CEs) {
3525         source->CEpos = source->CEs;
3526       }
3527       return *(source->toReturn);
3528      case DIGIT_TAG:
3529       {
3530       /*
3531          We do a check to see if we want to collate digits as numbers; if so we generate
3532          a custom collation key. Otherwise we pull out the value stored in the expansion table.
3533       */
3534       //uint32_t size;
3535       uint32_t i;    /* general counter */
3536
3537       if (source->coll->numericCollation == UCOL_ON){
3538         collIterateState state = {0,0,0,0,0,0,0,0};
3539         UChar32 char32 = 0;
3540
3541         uint32_t digIndx = 0;
3542         uint32_t endIndex = 0;
3543         uint32_t leadingZeroIndex = 0;
3544         uint32_t trailingZeroCount = 0;
3545
3546         uint32_t primWeight = 0;
3547
3548         int32_t digVal = 0;
3549         uint8_t collateVal = 0;
3550
3551         UBool nonZeroValReached = FALSE;
3552
3553         uint8_t *numTempBuf;
3554         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3555         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3556
3557         numTempBuf = stackNumTempBuf;
3558         /*
3559              We parse the source string until we hit a char that's NOT a digit.
3560             Use this u_charDigitValue. This might be slow because we have to
3561             handle surrogates...
3562         */
3563
3564         if (U16_IS_TRAIL (ch)){
3565             if (!collIter_bos(source)){
3566               UChar lead = getPrevNormalizedChar(source, status);
3567               if(U16_IS_LEAD(lead)) {
3568                 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3569                 goBackOne(source);
3570               } else {
3571                 char32 = ch;
3572               }
3573             } else {
3574                 char32 = ch;
3575             }
3576         } else {
3577             char32 = ch;
3578         }
3579         digVal = u_charDigitValue(char32);
3580
3581         for(;;){
3582         // Make sure we have enough space.
3583         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3584         {
3585             numTempBufSize *= 2;
3586             if (numTempBuf == stackNumTempBuf){
3587                 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3588                 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3589             }else
3590                 uprv_realloc(numTempBuf, numTempBufSize);
3591         }
3592
3593             // Skip over trailing zeroes, and keep a count of them.
3594             if (digVal != 0)
3595                     nonZeroValReached = TRUE;
3596             if (nonZeroValReached){
3597                 /*
3598                     We parse the digit string into base 100 numbers (this fits into a byte).
3599                     We only add to the buffer in twos, thus if we are parsing an odd character,
3600                     that serves as the 'tens' digit while the if we are parsing an even one, that
3601                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3602                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3603                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3604                     than all the other bytes.
3605
3606                     Since we're doing in this reverse we want to put the first digit encountered into the
3607                     ones place and the second digit encountered into the tens place.
3608                  */
3609
3610                 if ((digIndx + trailingZeroCount) % 2 == 1){
3611                     // High-order digit case (tens place)
3612                     collateVal += (uint8_t)(digVal * 10);
3613
3614                     // We cannot set leadingZeroIndex unless it has been set for the
3615                     // low-order digit. Therefore, all we can do for the high-order
3616                     // digit is turn it off, never on.
3617                     // The only time we will have a high digit without a low is for
3618                     // the very first non-zero digit, so no zero check is necessary.
3619                     if (collateVal != 0)
3620                         leadingZeroIndex = 0;
3621
3622                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3623                     collateVal = 0;
3624                 }
3625                 else{
3626                     // Low-order digit case (ones place)
3627                     collateVal = (uint8_t)digVal;
3628
3629                     // Check for leading zeroes.
3630                     if (collateVal == 0)
3631                     {
3632                         if (!leadingZeroIndex)
3633                             leadingZeroIndex = (digIndx/2) + 2;
3634                     }
3635                     else
3636                         leadingZeroIndex = 0;
3637
3638                     // No need to write to buffer; the case of a last odd digit
3639                     // is handled below.
3640                 }
3641                 ++digIndx;
3642             }
3643             else
3644                 ++trailingZeroCount;
3645
3646             if (!collIter_bos(source)){
3647                 ch = getPrevNormalizedChar(source, status);
3648                 //goBackOne(source);
3649                 if (U16_IS_TRAIL(ch)){
3650                     backupState(source, &state);
3651                     if (!collIter_bos(source))
3652                     {
3653                         goBackOne(source);
3654                         UChar lead = getPrevNormalizedChar(source, status);
3655                         if(U16_IS_LEAD(lead)) {
3656                           char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3657                         } else {
3658                           loadState(source, &state, FALSE);
3659                           char32 = ch;
3660                         }
3661                     }
3662                 }
3663                 else
3664                     char32 = ch;
3665
3666                 if ((digVal = u_charDigitValue(char32)) == -1){
3667                   if (char32 > 0xFFFF) {// For surrogates.
3668                     loadState(source, &state, FALSE);
3669                   }
3670                     // Don't need to "reverse" the goBackOne call,
3671                     // as this points to the next position to process..
3672                     //if (char32 > 0xFFFF) // For surrogates.
3673                         //getNextNormalizedChar(source);
3674                     break;
3675                 }
3676                 goBackOne(source);
3677             }else
3678                 break;
3679         }
3680
3681         if (nonZeroValReached == FALSE){
3682             digIndx = 2;
3683             trailingZeroCount = 0;
3684             numTempBuf[2] = 6;
3685         }
3686
3687         if ((digIndx + trailingZeroCount) % 2 != 0){
3688                 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3689             digIndx += 1;       // The implicit leading zero
3690             }
3691         if (trailingZeroCount % 2 != 0){
3692             // We had to consume one trailing zero for the low digit
3693             // of the least significant byte
3694             digIndx += 1;       // The trailing zero not in the exponent
3695             trailingZeroCount -= 1;
3696         }
3697
3698         endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3699
3700         // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3701         numTempBuf[2] -= 1;
3702
3703         /*
3704             We want to skip over the first two slots in the buffer. The first slot
3705             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3706             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3707             The exponent must be adjusted by the number of leading zeroes, and the number of
3708             trailing zeroes.
3709         */
3710         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3711         uint32_t exponent = (digIndx+trailingZeroCount)/2;
3712         if (leadingZeroIndex)
3713             exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3714         numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3715
3716         // Now transfer the collation key to our collIterate struct.
3717         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3718         //size = ((endIndex+1) & ~1)/2;
3719           *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3720                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3721                 UCOL_BYTE_COMMON; // Tertiary weight.
3722           i = endIndex - 1; // Reset the index into the buffer.
3723           while(i >= 2)
3724           {
3725             primWeight = numTempBuf[i--] << 8;
3726             if ( i >= 2)
3727                 primWeight |= numTempBuf[i--];
3728             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3729           }
3730           if (numTempBuf != stackNumTempBuf)
3731             uprv_free(numTempBuf);
3732
3733           source->toReturn = source->CEpos -1;
3734           return *(source->toReturn);
3735       }
3736       else {
3737           CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3738           CE = *(CEOffset++);
3739           break;
3740       }
3741       }
3742     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3743       {
3744         const uint32_t
3745           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3746         //const uint32_t LCount = 19;
3747         const uint32_t VCount = 21;
3748         const uint32_t TCount = 28;
3749         //const uint32_t NCount = VCount * TCount;   /* 588 */
3750         //const uint32_t SCount = LCount * NCount;   /* 11172 */
3751
3752         uint32_t L = ch - SBase;
3753         /*
3754         divide into pieces.
3755         we do it in this order since some compilers can do % and / in one
3756         operation
3757         */
3758         uint32_t T = L % TCount;
3759         L /= TCount;
3760         uint32_t V = L % VCount;
3761         L /= VCount;
3762
3763         /* offset them */
3764         L += LBase;
3765         V += VBase;
3766         T += TBase;
3767
3768         /*
3769         return the first CE, but first put the rest into the expansion buffer
3770         */
3771         if (!source->coll->image->jamoSpecial)
3772         {
3773           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3774           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3775           if (T != TBase)
3776             *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3777
3778           source->toReturn = source->CEpos - 1;
3779           return *(source->toReturn);
3780         } else {
3781           // Since Hanguls pass the FCD check, it is
3782           // guaranteed that we won't be in
3783           // the normalization buffer if something like this happens
3784           // Move Jamos into normalization buffer
3785           /*
3786           Move the Jamos into the
3787           normalization buffer
3788           */
3789           UChar *tempbuffer = source->writableBuffer +
3790                               (source->writableBufSize - 1);
3791           *(tempbuffer) = 0;
3792           if (T != TBase) {
3793             *(tempbuffer - 1) = (UChar)T;
3794             *(tempbuffer - 2) = (UChar)V;
3795             *(tempbuffer - 3) = (UChar)L;
3796             *(tempbuffer - 4) = 0;
3797           } else {
3798             *(tempbuffer - 1) = (UChar)V;
3799             *(tempbuffer - 2) = (UChar)L;
3800             *(tempbuffer - 3) = 0;
3801           }
3802
3803           /*
3804           Indicate where to continue in main input string after exhausting
3805           the writableBuffer
3806           */
3807           if (source->pos  == source->string) {
3808             source->fcdPosition = NULL;
3809           } else {
3810             source->fcdPosition       = source->pos-1;
3811           }
3812
3813           source->pos               = tempbuffer;
3814           source->origFlags         = source->flags;
3815           source->flags            |= UCOL_ITER_INNORMBUF;
3816           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3817
3818           return(UCOL_IGNORABLE);
3819         }
3820       }
3821     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3822       return 0; /* broken surrogate sequence */
3823     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3824     {
3825       UChar32 cp = 0;
3826       UChar  prevChar;
3827       UChar *prev;
3828       if (isAtStartPrevIterate(source)) {
3829           /* we are at the start of the string, wrong place to be at */
3830           return 0;
3831       }
3832       if (source->pos != source->writableBuffer) {
3833           prev     = source->pos - 1;
3834       } else {
3835           prev     = source->fcdPosition;
3836       }
3837       prevChar = *prev;
3838
3839       /* Handles Han and Supplementary characters here.*/
3840       if (U16_IS_LEAD(prevChar)) {
3841         cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3842         source->pos = prev;
3843       } else {
3844         return 0; /* completely ignorable */
3845       }
3846       return getPrevImplicit(cp, source);
3847     }
3848     // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3849     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3850       return getPrevImplicit(ch, source);
3851     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3852       return getPrevImplicit(ch, source);
3853       /* UCA is filled with these. Tailorings are NOT_FOUND */
3854     /* not yet implemented */
3855     case CHARSET_TAG:  /* this tag always returns */
3856       /* probably after 1.8 */
3857       return UCOL_NOT_FOUND;
3858     default:           /* this tag always returns */
3859       *status = U_INTERNAL_PROGRAM_ERROR;
3860       CE=0;
3861       break;
3862     }
3863     if (CE <= UCOL_NOT_FOUND) {
3864       break;
3865     }
3866   }
3867   return CE;
3868 }
3869
3870 /* This should really be a macro        */
3871 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3872 /* anyway */
3873 static
3874 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3875 #ifdef UCOL_DEBUG
3876   fprintf(stderr, ".");
3877 #endif
3878   uint8_t *newStart = NULL;
3879   uint32_t offset = *secondaries-secStart;
3880
3881   if(secStart==second) {
3882     newStart=(uint8_t*)uprv_malloc(newSize);
3883     if(newStart==NULL) {
3884       *status = U_MEMORY_ALLOCATION_ERROR;
3885       return NULL;
3886     }
3887     uprv_memcpy(newStart, secStart, *secondaries-secStart);
3888   } else {
3889     newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3890     if(newStart==NULL) {
3891       *status = U_MEMORY_ALLOCATION_ERROR;
3892       return NULL;
3893     }
3894   }
3895   *secondaries=newStart+offset;
3896   *secSize=newSize;
3897   return newStart;
3898 }
3899
3900
3901 /* This should really be a macro                                                                      */
3902 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
3903 /* secondaries in French                                                                              */
3904 /*
3905 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
3906   uint8_t temp;
3907   while(start<end) {
3908     temp = *start;
3909     *start++ = *end;
3910     *end-- = temp;
3911   }
3912 }
3913 */
3914
/*
 * Reverses, in place, the elements of type TYPE in the inclusive range
 * [start, end].
 * start and end must be modifiable pointer lvalues: the macro walks them
 * toward each other and leaves them where they met or crossed.
 * Expands to a braced block, so it is used as a statement.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
  TYPE swapTmp; \
  for(; (start)<(end); ++(start), --(end)) { \
    swapTmp = *(start); \
    *(start) = *(end); \
    *(end) = swapTmp; \
  } \
}
3923
3924 /****************************************************************************/
3925 /* Following are the sortkey generation functions                           */
3926 /*                                                                          */
3927 /****************************************************************************/
3928
3929 /**
3930  * Merge two sort keys.
3931  * This is useful, for example, to combine sort keys from first and last names
3932  * to sort such pairs.
3933  * Merged sort keys consider on each collation level the first part first entirely,
3934  * then the second one.
3935  * It is possible to merge multiple sort keys by consecutively merging
3936  * another one with the intermediate result.
3937  *
3938  * The length of the merge result is the sum of the lengths of the input sort keys
3939  * minus 1.
3940  *
3941  * @param src1 the first sort key
3942  * @param src1Length the length of the first sort key, including the zero byte at the end;
3943  *        can be -1 if the function is to find the length
3944  * @param src2 the second sort key
3945  * @param src2Length the length of the second sort key, including the zero byte at the end;
3946  *        can be -1 if the function is to find the length
3947  * @param dest the buffer where the merged sort key is written,
3948  *        can be NULL if destCapacity==0
3949  * @param destCapacity the number of bytes in the dest buffer
3950  * @return the length of the merged sort key, src1Length+src2Length-1;
3951  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3952  *         in which cases the contents of dest is undefined
3953  *
3954  * @draft
3955  */
3956 U_CAPI int32_t U_EXPORT2
3957 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3958                    const uint8_t *src2, int32_t src2Length,
3959                    uint8_t *dest, int32_t destCapacity) {
3960     int32_t destLength;
3961     uint8_t b;
3962
3963     /* check arguments */
3964     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
3965         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
3966         destCapacity<0 || (destCapacity>0 && dest==NULL)
3967     ) {
3968         /* error, attempt to write a zero byte and return 0 */
3969         if(dest!=NULL && destCapacity>0) {
3970             *dest=0;
3971         }
3972         return 0;
3973     }
3974
3975     /* check lengths and capacity */
3976     if(src1Length<0) {
3977         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
3978     }
3979     if(src2Length<0) {
3980         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
3981     }
3982
3983     destLength=src1Length+src2Length-1;
3984     if(destLength>destCapacity) {
3985         /* the merged sort key does not fit into the destination */
3986         return destLength;
3987     }
3988
3989     /* merge the sort keys with the same number of levels */
3990     while(*src1!=0 && *src2!=0) { /* while both have another level */
3991         /* copy level from src1 not including 00 or 01 */
3992         while((b=*src1)>=2) {
3993             ++src1;
3994             *dest++=b;
3995         }
3996
3997         /* add a 02 merge separator */
3998         *dest++=2;
3999
4000         /* copy level from src2 not including 00 or 01 */
4001         while((b=*src2)>=2) {
4002             ++src2;
4003             *dest++=b;
4004         }
4005
4006         /* if both sort keys have another level, then add a 01 level separator and continue */
4007         if(*src1==1 && *src2==1) {
4008             ++src1;
4009             ++src2;
4010             *dest++=1;
4011         }
4012     }
4013
4014     /*
4015      * here, at least one sort key is finished now, but the other one
4016      * might have some contents left from containing more levels;
4017      * that contents is just appended to the result
4018      */
4019     if(*src1!=0) {
4020         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4021         src2=src1;
4022     }
4023     /* append src2, "the other, unfinished sort key" */
4024     uprv_strcpy((char *)dest, (const char *)src2);
4025
4026     /* trust that neither sort key contained illegally embedded zero bytes */
4027     return destLength;
4028 }
4029
4030 /* sortkey API */
4031 U_CAPI int32_t U_EXPORT2
4032 ucol_getSortKey(const    UCollator    *coll,
4033         const    UChar        *source,
4034         int32_t        sourceLength,
4035         uint8_t        *result,
4036         int32_t        resultLength)
4037 {
4038   UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4039   if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4040       int32_t actualSrcLen = sourceLength;
4041       if (actualSrcLen==-1 && source!=NULL) {
4042           actualSrcLen = u_strlen(source);
4043       }
4044       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen);
4045   }
4046
4047   UErrorCode status = U_ZERO_ERROR;
4048   int32_t keySize   = 0;
4049
4050   if(source != NULL) {
4051       // source == NULL is actually an error situation, but we would need to
4052       // have an error code to return it. Until we introduce a new
4053       // API, it stays like this
4054
4055       /* this uses the function pointer that is set in updateinternalstate */
4056       /* currently, there are two funcs: */
4057       /*ucol_calcSortKey(...);*/
4058       /*ucol_calcSortKeySimpleTertiary(...);*/
4059
4060       keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4061       //((UCollator *)coll)->errorCode = status; /*semantically const */
4062   }
4063   UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4064   UTRACE_EXIT_STATUS(status);
4065   return keySize;
4066 }
4067
4068 /* this function is called by the C++ API for sortkey generation */
4069 U_CFUNC int32_t
4070 ucol_getSortKeyWithAllocation(const UCollator *coll,
4071                               const UChar *source, int32_t sourceLength,
4072                               uint8_t **pResult,
4073                               UErrorCode *pErrorCode) {
4074     *pResult = 0;
4075     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4076 }
4077
4078 #define UCOL_FSEC_BUF_SIZE 256
4079
4080 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4081 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
4082 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4083     UErrorCode status = U_ZERO_ERROR;
4084     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4085     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4086     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4087     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4088     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4089     UBool  doCase = (coll->caseLevel == UCOL_ON);
4090     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4091     //UBool  qShifted = shifted  && (compareQuad == 0);
4092     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4093     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4094     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4095     uint8_t *fSecs = fSecsBuff;
4096     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4097     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4098
4099     uint32_t variableTopValue = coll->variableTopValue;
4100     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4101     if(doHiragana) {
4102       UCOL_COMMON_BOT4++;
4103       /* allocate one more space for hiragana */
4104     }
4105     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4106
4107     uint32_t order = UCOL_NO_MORE_CES;
4108     uint8_t primary1 = 0;
4109     uint8_t primary2 = 0;
4110     uint8_t secondary = 0;
4111     uint8_t tertiary = 0;
4112     int32_t caseShift = 0;
4113     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4114
4115     uint8_t caseSwitch = coll->caseSwitch;
4116     uint8_t tertiaryMask = coll->tertiaryMask;
4117     uint8_t tertiaryCommon = coll->tertiaryCommon;
4118
4119     UBool wasShifted = FALSE;
4120     UBool notIsContinuation = FALSE;
4121     uint8_t leadPrimary = 0;
4122
4123
4124     for(;;) {
4125           order = ucol_IGetNextCE(coll, s, &status);
4126           if(order == UCOL_NO_MORE_CES) {
4127               break;
4128           }
4129
4130           if(order == 0) {
4131             continue;
4132           }
4133
4134           notIsContinuation = !isContinuation(order);
4135
4136
4137           if(notIsContinuation) {
4138             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4139           } else {
4140             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4141           }
4142           secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4143           primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4144           primary1 = (uint8_t)(order >> 8);
4145
4146
4147           if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4148             || (!notIsContinuation && wasShifted))
4149             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4150             /* and other ignorables should be removed if following a shifted code point */
4151             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4152                                 /* we should just completely ignore it */
4153               continue;
4154             }
4155             if(compareQuad == 0) {
4156               if(c4 > 0) {
4157                 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4158                 c4 = 0;
4159               }
4160               currentSize++;
4161               if(primary2 != 0) {
4162                 currentSize++;
4163               }
4164             }
4165             wasShifted = TRUE;
4166           } else {
4167             wasShifted = FALSE;
4168             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4169             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4170             /* calculate sortkey size */
4171             if(primary1 != UCOL_IGNORABLE) {
4172               if(notIsContinuation) {
4173                 if(leadPrimary == primary1) {
4174                   currentSize++;
4175                 } else {
4176                   if(leadPrimary != 0) {
4177                     currentSize++;
4178                   }
4179                   if(primary2 == UCOL_IGNORABLE) {
4180                   /* one byter, not compressed */
4181                       currentSize++;
4182                       leadPrimary = 0;
4183                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4184                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4185                       //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4186                       (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4187                   /* not compressible */
4188                       leadPrimary = 0;
4189                       currentSize+=2;
4190                   } else { /* compress */
4191                       leadPrimary = primary1;
4192                       currentSize+=2;
4193                   }
4194                 }
4195               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4196                 currentSize++;
4197                 if(primary2 != UCOL_IGNORABLE) {
4198                   currentSize++;
4199                 }
4200               }
4201             }
4202
4203             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4204               if(!isFrenchSec){
4205                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4206                   c2++;
4207                 } else {
4208                   if(c2 > 0) {
4209                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4210                       currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4211                     } else {
4212                       currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4213                     }
4214                     c2 = 0;
4215                   }
4216                   currentSize++;
4217                 }
4218               } else {
4219                 fSecs[fSecsLen++] = secondary;
4220                 if(fSecsLen == fSecsMaxLen) {
4221                   if(fSecs == fSecsBuff) {
4222                     fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4223                   } else {
4224                     fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4225                   }
4226                   if(fSecs == NULL) {
4227                     status = U_MEMORY_ALLOCATION_ERROR;
4228                     return -1;
4229                   }
4230                   fSecsMaxLen *= 2;
4231                 }
4232                 if(notIsContinuation) {
4233                   if (frenchStartPtr != NULL) {
4234                       /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4235                     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4236                     frenchStartPtr = NULL;
4237                   }
4238                 } else {
4239                   if (frenchStartPtr == NULL) {
4240                     frenchStartPtr = fSecs+fSecsLen-2;
4241                   }
4242                   frenchEndPtr = fSecs+fSecsLen-1;
4243                 }
4244               }
4245             }
4246
4247             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4248                 // do the case level if we need to do it. We don't want to calculate
4249                 // case level for primary ignorables if we have only primary strength and case level
4250                 // otherwise we would break well formedness of CEs
4251               if (caseShift  == 0) {
4252                 currentSize++;
4253                 caseShift = UCOL_CASE_SHIFT_START;
4254               }
4255               if((tertiary&0x3F) > 0 && notIsContinuation) {
4256                 caseShift--;
4257                 if((tertiary &0xC0) != 0) {
4258                   if (caseShift  == 0) {
4259                     currentSize++;
4260                     caseShift = UCOL_CASE_SHIFT_START;
4261                   }
4262                   caseShift--;
4263                 }
4264               }
4265             } else {
4266               if(notIsContinuation) {
4267                 tertiary ^= caseSwitch;
4268               }
4269             }
4270
4271             tertiary &= tertiaryMask;
4272             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4273               if (tertiary == tertiaryCommon && notIsContinuation) {
4274                 c3++;
4275               } else {
4276                 if(c3 > 0) {
4277                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4278                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4279                     currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4280                   } else {
4281                     currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4282                   }
4283                   c3 = 0;
4284                 }
4285                 currentSize++;
4286               }
4287             }
4288
4289             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4290               if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4291                 if(c4>0) { // Close this part
4292                   currentSize += (c4/UCOL_BOT_COUNT4)+1;
4293                   c4 = 0;
4294                 }
4295                 currentSize++; // Add the Hiragana
4296               } else { // This wasn't Hiragana, so we can continue adding stuff
4297                 c4++;
4298               }
4299             }
4300
4301           }
4302     }
4303
4304     if(!isFrenchSec){
4305       if(c2 > 0) {
4306         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4307       }
4308     } else {
4309       uint32_t i = 0;
4310       if(frenchStartPtr != NULL) {
4311         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4312       }
4313       for(i = 0; i<fSecsLen; i++) {
4314         secondary = *(fSecs+fSecsLen-i-1);
4315         /* This is compression code. */
4316         if (secondary == UCOL_COMMON2) {
4317           ++c2;
4318         } else {
4319           if(c2 > 0) {
4320             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4321               currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4322             } else {
4323               currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4324             }
4325             c2 = 0;
4326           }
4327           currentSize++;
4328         }
4329       }
4330       if(c2 > 0) {
4331         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4332       }
4333       if(fSecs != fSecsBuff) {
4334         uprv_free(fSecs);
4335       }
4336     }
4337
4338     if(c3 > 0) {
4339       currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4340     }
4341
4342     if(c4 > 0  && compareQuad == 0) {
4343       currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4344     }
4345
4346     if(compareIdent) {
4347       currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4348     }
4349     return currentSize;
4350
4351 }
4352
4353 static
4354 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4355   if (caseShift  == 0) {
4356     *(*cases)++ = UCOL_CASE_BYTE_START;
4357     caseShift = UCOL_CASE_SHIFT_START;
4358   }
4359 }
4360
// Bounds-checked append.
// size is always incremented, so it ends up holding the number of bytes we
// wanted to write; value itself is stored (and primaries advanced) only while
// primaries is still below limit.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
  ++size;
  if(primaries >= limit) {
    return;  // no room left: count the byte but drop it
  }
  *primaries = value;
  ++primaries;
}
4370
4371 // Packs the secondary buffer when processing French locale. Adds the terminator.
4372 static
4373 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4374   uint8_t secondary;
4375   int32_t count2 = 0;
4376   uint32_t i = 0, size = 0;
4377   // we use i here since the key size already accounts for terminators, so we'll discard the increment
4378   addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4379   /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4380   if(frenchStartPtr != NULL) {
4381     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4382   }
4383   for(i = 0; i<*secsize; i++) {
4384     secondary = *(secondaries-i-1);
4385     /* This is compression code. */
4386     if (secondary == UCOL_COMMON2) {
4387       ++count2;
4388     } else {
4389       if (count2 > 0) {
4390         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4391           while (count2 > UCOL_TOP_COUNT2) {
4392             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4393             count2 -= (uint32_t)UCOL_TOP_COUNT2;
4394           }
4395           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4396         } else {
4397           while (count2 > UCOL_BOT_COUNT2) {
4398             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4399             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4400           }
4401           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4402         }
4403         count2 = 0;
4404       }
4405       addWithIncrement(primaries, primEnd, size, secondary);
4406     }
4407   }
4408   if (count2 > 0) {
4409     while (count2 > UCOL_BOT_COUNT2) {
4410       addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4411       count2 -= (uint32_t)UCOL_BOT_COUNT2;
4412     }
4413     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4414   }
4415   *secsize = size;
4416   return primaries;
4417 }
4418
4419 /* This is the sortkey work horse function */
4420 U_CFUNC int32_t U_CALLCONV
4421 ucol_calcSortKey(const    UCollator    *coll,
4422         const    UChar        *source,
4423         int32_t        sourceLength,
4424         uint8_t        **result,
4425         uint32_t        resultLength,
4426         UBool allocateSKBuffer,
4427         UErrorCode *status)
4428 {
4429     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4430
4431     uint32_t i = 0; /* general purpose counter */
4432
4433     /* Stack allocated buffers for buffers we use */
4434     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4435
4436     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4437
4438     if(U_FAILURE(*status)) {
4439       return 0;
4440     }
4441
4442     if(primaries == NULL && allocateSKBuffer == TRUE) {
4443         primaries = *result = prim;
4444         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4445     }
4446
4447     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4448       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4449
4450     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4451
4452     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4453     UChar *normSource = normBuffer;
4454     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4455
4456     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4457
4458     UColAttributeValue strength = coll->strength;
4459
4460     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4461     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4462     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4463     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4464     UBool  doCase = (coll->caseLevel == UCOL_ON);
4465     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4466     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4467     //UBool  qShifted = shifted && (compareQuad == 0);
4468     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4469     /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4470
4471     uint32_t variableTopValue = coll->variableTopValue;
4472     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4473     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4474     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4475     uint8_t UCOL_HIRAGANA_QUAD = 0;
4476     if(doHiragana) {
4477       UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4478       /* allocate one more space for hiragana, value for hiragana */
4479     }
4480     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4481
4482     /* support for special features like caselevel and funky secondaries */
4483     uint8_t *frenchStartPtr = NULL;
4484     uint8_t *frenchEndPtr = NULL;
4485     uint32_t caseShift = 0;
4486
4487     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4488
4489     /* If we need to normalize, we'll do it all at once at the beginning! */
4490     UNormalizationMode normMode;
4491     if(compareIdent) {
4492         normMode = UNORM_NFD;
4493     } else if(coll->normalizationMode != UCOL_OFF) {
4494         normMode = UNORM_FCD;
4495     } else {
4496         normMode = UNORM_NONE;
4497     }
4498
4499     if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4500         len = unorm_internalNormalize(normSource, normSourceLen,
4501                                       source, len,
4502                                       normMode, FALSE,
4503                                       status);
4504         if(*status == U_BUFFER_OVERFLOW_ERROR) {
4505             normSourceLen = len;
4506             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4507             if(normSource == NULL) {
4508                 *status = U_MEMORY_ALLOCATION_ERROR;
4509                 return 0;
4510             }
4511             *status = U_ZERO_ERROR;
4512             len = unorm_internalNormalize(normSource, normSourceLen,
4513                                           source, len,
4514                                           normMode, FALSE,
4515                                           status);
4516         }
4517
4518         if(U_FAILURE(*status)) {
4519             return 0;
4520         }
4521         source = normSource;
4522     }
4523
4524     collIterate s;
4525     IInit_collIterate(coll, (UChar *)source, len, &s);
4526     if(source == normSource) {
4527         s.flags &= ~UCOL_ITER_NORM;
4528     }
4529
4530     if(resultLength == 0 || primaries == NULL) {
4531       int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4532       if(normSource != normBuffer) {
4533           uprv_free(normSource);
4534       }
4535       return keyLen;
4536     }
4537     uint8_t *primarySafeEnd = primaries + resultLength - 1;
4538     if(strength > UCOL_PRIMARY) {
4539         primarySafeEnd--;
4540     }
4541
4542     uint32_t minBufferSize = UCOL_MAX_BUFFER;
4543
4544     uint8_t *primStart = primaries;
4545     uint8_t *secStart = secondaries;
4546     uint8_t *terStart = tertiaries;
4547     uint8_t *caseStart = cases;
4548     uint8_t *quadStart = quads;
4549
4550     uint32_t order = 0;
4551
4552     uint8_t primary1 = 0;
4553     uint8_t primary2 = 0;
4554     uint8_t secondary = 0;
4555     uint8_t tertiary = 0;
4556     uint8_t caseSwitch = coll->caseSwitch;
4557     uint8_t tertiaryMask = coll->tertiaryMask;
4558     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4559     uint8_t tertiaryTop = coll->tertiaryTop;
4560     uint8_t tertiaryBottom = coll->tertiaryBottom;
4561     uint8_t tertiaryCommon = coll->tertiaryCommon;
4562     uint8_t caseBits = 0;
4563
4564     UBool finished = FALSE;
4565     UBool wasShifted = FALSE;
4566     UBool notIsContinuation = FALSE;
4567
4568     uint32_t prevBuffSize = 0;
4569
4570     uint32_t count2 = 0, count3 = 0, count4 = 0;
4571     uint8_t leadPrimary = 0;
4572
4573     for(;;) {
4574         for(i=prevBuffSize; i<minBufferSize; ++i) {
4575
4576             order = ucol_IGetNextCE(coll, &s, status);
4577             if(order == UCOL_NO_MORE_CES) {
4578                 finished = TRUE;
4579                 break;
4580             }
4581
4582             if(order == 0) {
4583               continue;
4584             }
4585
4586             notIsContinuation = !isContinuation(order);
4587
4588             if(notIsContinuation) {
4589               tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4590             } else {
4591               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4592             }
4593
4594             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4595             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4596             primary1 = (uint8_t)(order >> 8);
4597
4598             /*if(notIsContinuation && scriptOrder != NULL) {
4599               primary1 = scriptOrder[primary1];
4600             }*/
4601
4602             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4603               || (!notIsContinuation && wasShifted))
4604               || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4605               /* and other ignorables should be removed if following a shifted code point */
4606               if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4607                                   /* we should just completely ignore it */
4608                 continue;
4609               }
4610               if(compareQuad == 0) {
4611                 if(count4 > 0) {
4612                   while (count4 > UCOL_BOT_COUNT4) {
4613                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4614                     count4 -= UCOL_BOT_COUNT4;
4615                   }
4616                   *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4617                   count4 = 0;
4618                 }
4619                 /* We are dealing with a variable and we're treating them as shifted */
4620                 /* This is a shifted ignorable */
4621                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4622                   *quads++ = primary1;
4623                 }
4624                 if(primary2 != 0) {
4625                   *quads++ = primary2;
4626                 }
4627               }
4628               wasShifted = TRUE;
4629             } else {
4630               wasShifted = FALSE;
4631               /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4632               /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will be zero with a non-zero primary1. */
4633               /* regular and simple sortkey calc */
4634               if(primary1 != UCOL_IGNORABLE) {
4635                 if(notIsContinuation) {
4636                   if(leadPrimary == primary1) {
4637                     *primaries++ = primary2;
4638                   } else {
4639                     if(leadPrimary != 0) {
4640                       *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4641                     }
4642                     if(primary2 == UCOL_IGNORABLE) {
4643                     /* one byter, not compressed */
4644                         *primaries++ = primary1;
4645                         leadPrimary = 0;
4646                     } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4647                         //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4648                        (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
4649                     /* not compressible */
4650                         leadPrimary = 0;
4651                         *primaries++ = primary1;
4652                         *primaries++ = primary2;
4653                     } else { /* compress */
4654                         *primaries++ = leadPrimary = primary1;
4655                         *primaries++ = primary2;
4656                     }
4657                   }
4658                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4659                   *primaries++ = primary1;
4660                   if(primary2 != UCOL_IGNORABLE) {
4661                     *primaries++ = primary2; /* second part */
4662                   }
4663                 }
4664               }
4665
4666             if(secondary > compareSec) {
4667               if(!isFrenchSec) {
4668                 /* This is compression code. */
4669                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4670                   ++count2;
4671                 } else {
4672                   if (count2 > 0) {
4673                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4674                       while (count2 > UCOL_TOP_COUNT2) {
4675                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4676                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4677                       }
4678                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4679                     } else {
4680                       while (count2 > UCOL_BOT_COUNT2) {
4681                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4682                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4683                       }
4684                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4685                     }
4686                     count2 = 0;
4687                   }
4688                   *secondaries++ = secondary;
4689                 }
4690               } else {
4691                   *secondaries++ = secondary;
4692                   /* Do the special handling for French secondaries */
4693                   /* We need to get continuation elements and do intermediate restore */
4694                   /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4695                   if(notIsContinuation) {
4696                     if (frenchStartPtr != NULL) {
4697                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4698                       uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4699                       frenchStartPtr = NULL;
4700                     }
4701                   } else {
4702                     if (frenchStartPtr == NULL) {
4703                       frenchStartPtr = secondaries - 2;
4704                     }
4705                     frenchEndPtr = secondaries-1;
4706                   }
4707                 }
4708               }
4709
4710               if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4711                 // do the case level if we need to do it. We don't want to calculate
4712                 // case level for primary ignorables if we have only primary strength and case level
4713                 // otherwise we would break well formedness of CEs
4714                 doCaseShift(&cases, caseShift);
4715                 if(notIsContinuation) {
4716                   caseBits = (uint8_t)(tertiary & 0xC0);
4717
4718                   if(tertiary != 0) {
4719                     if(coll->caseFirst == UCOL_UPPER_FIRST) {
4720                       if((caseBits & 0xC0) == 0) {
4721                         *(cases-1) |= 1 << (--caseShift);
4722                       } else {
4723                         *(cases-1) |= 0 << (--caseShift);
4724                         /* second bit */
4725                         doCaseShift(&cases, caseShift);
4726                         *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4727                       }
4728                     } else {
4729                       if((caseBits & 0xC0) == 0) {
4730                         *(cases-1) |= 0 << (--caseShift);
4731                       } else {
4732                         *(cases-1) |= 1 << (--caseShift);
4733                         /* second bit */
4734                         doCaseShift(&cases, caseShift);
4735                         *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4736                       }
4737                     }
4738                   }
4739
4740                 }
4741               } else {
4742                 if(notIsContinuation) {
4743                   tertiary ^= caseSwitch;
4744                 }
4745               }
4746
4747               tertiary &= tertiaryMask;
4748               if(tertiary > compareTer) {
4749                 /* This is compression code. */
4750                 /* sequence size check is included in the if clause */
4751                 if (tertiary == tertiaryCommon && notIsContinuation) {
4752                   ++count3;
4753                 } else {
4754                   if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4755                     tertiary += tertiaryAddition;
4756                   } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4757                     tertiary -= tertiaryAddition;
4758                   }
4759                   if (count3 > 0) {
4760                     if ((tertiary > tertiaryCommon)) {
4761                       while (count3 > coll->tertiaryTopCount) {
4762                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4763                         count3 -= (uint32_t)coll->tertiaryTopCount;
4764                       }
4765                       *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4766                     } else {
4767                       while (count3 > coll->tertiaryBottomCount) {
4768                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4769                         count3 -= (uint32_t)coll->tertiaryBottomCount;
4770                       }
4771                       *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4772                     }
4773                     count3 = 0;
4774                   }
4775                   *tertiaries++ = tertiary;
4776                 }
4777               }
4778
4779               if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4780                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4781                   if(count4>0) { // Close this part
4782                     while (count4 > UCOL_BOT_COUNT4) {
4783                       *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4784                       count4 -= UCOL_BOT_COUNT4;
4785                     }
4786                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4787                     count4 = 0;
4788                   }
4789                   *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4790                 } else { // This wasn't Hiragana, so we can continue adding stuff
4791                   count4++;
4792                 }
4793               }
4794             }
4795
4796             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4797               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4798                 IInit_collIterate(coll, (UChar *)source, len, &s);
4799                 if(source == normSource) {
4800                     s.flags &= ~UCOL_ITER_NORM;
4801                 }
4802                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4803                 *status = U_BUFFER_OVERFLOW_ERROR;
4804                 finished = TRUE;
4805                 break;
4806               } else { /* It's much nicer if we can actually reallocate */
4807                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4808                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4809                 if(U_SUCCESS(*status)) {
4810                   *result = primStart;
4811                   primarySafeEnd = primStart + resultLength - 1;
4812                   if(strength > UCOL_PRIMARY) {
4813                       primarySafeEnd--;
4814                   }
4815                 } else {
4816                   IInit_collIterate(coll, (UChar *)source, len, &s);
4817                   if(source == normSource) {
4818                       s.flags &= ~UCOL_ITER_NORM;
4819                   }
4820                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4821                   finished = TRUE;
4822                   break;
4823                 }
4824               }
4825             }
4826         }
4827         if(finished) {
4828             break;
4829         } else {
4830           prevBuffSize = minBufferSize;
4831           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4832           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4833           caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4834           quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4835           minBufferSize *= 2;
4836           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4837             IInit_collIterate(coll, (UChar *)source, len, &s);
4838             if(source == normSource) {
4839                 s.flags &= ~UCOL_ITER_NORM;
4840             }
4841             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4842             break;
4843           }
4844         }
4845     }
4846
4847     /* Here, we are generally done with processing */
4848     /* bailing out would not be too productive */
4849
4850     if(U_SUCCESS(*status)) {
4851       sortKeySize += (primaries - primStart);
4852       /* we have done all the CE's, now let's put them together to form a key */
4853       if(compareSec == 0) {
4854         if (count2 > 0) {
4855           while (count2 > UCOL_BOT_COUNT2) {
4856             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4857             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4858           }
4859           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4860         }
4861         uint32_t secsize = secondaries-secStart;
4862         if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4863           sortKeySize += secsize;
4864           if(sortKeySize <= resultLength) {
4865             *(primaries++) = UCOL_LEVELTERMINATOR;
4866             uprv_memcpy(primaries, secStart, secsize);
4867             primaries += secsize;
4868           } else {
4869             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4870               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4871               if(U_SUCCESS(*status)) {
4872                 *result = primStart;
4873                 *(primaries++) = UCOL_LEVELTERMINATOR;
4874                 uprv_memcpy(primaries, secStart, secsize);
4875                 primaries += secsize;
4876               }
4877             } else {
4878               *status = U_BUFFER_OVERFLOW_ERROR;
4879             }
4880           }
4881         } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4882           uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4883           sortKeySize += secsize;
4884           if(sortKeySize <= resultLength) { // if we managed to pack fine
4885             primaries = newPrim; // update the primary pointer
4886           } else { // overflow, need to reallocate and redo
4887             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4888               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4889               if(U_SUCCESS(*status)) {
4890                 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4891               }
4892             } else {
4893               *status = U_BUFFER_OVERFLOW_ERROR;
4894             }
4895           }
4896         }
4897       }
4898
4899       if(doCase) {
4900         uint32_t casesize = cases - caseStart;
4901         sortKeySize += casesize;
4902         if(sortKeySize <= resultLength) {
4903           *(primaries++) = UCOL_LEVELTERMINATOR;
4904           uprv_memcpy(primaries, caseStart, casesize);
4905           primaries += casesize;
4906         } else {
4907           if(allocateSKBuffer == TRUE) {
4908             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4909             if(U_SUCCESS(*status)) {
4910               *result = primStart;
4911               *(primaries++) = UCOL_LEVELTERMINATOR;
4912               uprv_memcpy(primaries, caseStart, casesize);
4913             }
4914           } else {
4915             *status = U_BUFFER_OVERFLOW_ERROR;
4916           }
4917         }
4918       }
4919
4920       if(compareTer == 0) {
4921         if (count3 > 0) {
4922           if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4923             while (count3 >= coll->tertiaryTopCount) {
4924               *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4925               count3 -= (uint32_t)coll->tertiaryTopCount;
4926             }
4927             *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4928           } else {
4929             while (count3 > coll->tertiaryBottomCount) {
4930               *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4931               count3 -= (uint32_t)coll->tertiaryBottomCount;
4932             }
4933             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4934           }
4935         }
4936         uint32_t tersize = tertiaries - terStart;
4937         sortKeySize += tersize;
4938         if(sortKeySize <= resultLength) {
4939           *(primaries++) = UCOL_LEVELTERMINATOR;
4940           uprv_memcpy(primaries, terStart, tersize);
4941           primaries += tersize;
4942         } else {
4943           if(allocateSKBuffer == TRUE) {
4944             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4945             if(U_SUCCESS(*status)) {
4946               *result = primStart;
4947               *(primaries++) = UCOL_LEVELTERMINATOR;
4948               uprv_memcpy(primaries, terStart, tersize);
4949             }
4950           } else {
4951             *status = U_BUFFER_OVERFLOW_ERROR;
4952           }
4953         }
4954
4955         if(compareQuad == 0/*qShifted == TRUE*/) {
4956             if(count4 > 0) {
4957               while (count4 > UCOL_BOT_COUNT4) {
4958                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4959                 count4 -= UCOL_BOT_COUNT4;
4960               }
4961               *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4962             }
4963             uint32_t quadsize = quads - quadStart;
4964             sortKeySize += quadsize;
4965             if(sortKeySize <= resultLength) {
4966               *(primaries++) = UCOL_LEVELTERMINATOR;
4967               uprv_memcpy(primaries, quadStart, quadsize);
4968               primaries += quadsize;
4969             } else {
4970               if(allocateSKBuffer == TRUE) {
4971                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4972                 if(U_SUCCESS(*status)) {
4973                   *result = primStart;
4974                   *(primaries++) = UCOL_LEVELTERMINATOR;
4975                   uprv_memcpy(primaries, quadStart, quadsize);
4976                 }
4977               } else {
4978                 *status = U_BUFFER_OVERFLOW_ERROR;
4979               }
4980             }
4981         }
4982
4983         if(compareIdent) {
4984           sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4985           if(sortKeySize <= resultLength) {
4986             *(primaries++) = UCOL_LEVELTERMINATOR;
4987             primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
4988           } else {
4989             if(allocateSKBuffer == TRUE) {
4990               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
4991               if(U_SUCCESS(*status)) {
4992                 *result = primStart;
4993                 *(primaries++) = UCOL_LEVELTERMINATOR;
4994                 u_writeIdenticalLevelRun(s.string, len, primaries);
4995               }
4996             } else {
4997               *status = U_BUFFER_OVERFLOW_ERROR;
4998             }
4999           }
5000         }
5001       }
5002       *(primaries++) = '\0';
5003     }
5004
5005     if(terStart != tert) {
5006         uprv_free(terStart);
5007         uprv_free(secStart);
5008         uprv_free(caseStart);
5009         uprv_free(quadStart);
5010     }
5011
5012     if(normSource != normBuffer) {
5013         uprv_free(normSource);
5014     }
5015
5016     if(allocateSKBuffer == TRUE) {
5017       *result = (uint8_t*)uprv_malloc(sortKeySize);
5018       /* test for NULL */
5019       if (*result == NULL) {
5020         *status = U_MEMORY_ALLOCATION_ERROR;
5021         return sortKeySize;
5022       }
5023       uprv_memcpy(*result, primStart, sortKeySize);
5024       if(primStart != prim) {
5025         uprv_free(primStart);
5026       }
5027     }
5028
5029     return sortKeySize;
5030 }
5031
5032
5033 U_CFUNC int32_t U_CALLCONV
5034 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5035         const    UChar        *source,
5036         int32_t        sourceLength,
5037         uint8_t        **result,
5038         uint32_t        resultLength,
5039         UBool allocateSKBuffer,
5040         UErrorCode *status)
5041 {
5042     U_ALIGN_CODE(16);
5043
5044     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5045     uint32_t i = 0; /* general purpose counter */
5046
5047     /* Stack allocated buffers for buffers we use */
5048     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5049
5050     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5051
5052     if(U_FAILURE(*status)) {
5053       return 0;
5054     }
5055
5056     if(primaries == NULL && allocateSKBuffer == TRUE) {
5057         primaries = *result = prim;
5058         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5059     }
5060
5061     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5062
5063     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5064
5065     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5066     UChar *normSource = normBuffer;
5067     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5068
5069     int32_t len =  sourceLength;
5070
5071     /* If we need to normalize, we'll do it all at once at the beginning! */
5072     if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5073         len = unorm_internalNormalize(normSource, normSourceLen,
5074                                       source, len,
5075                                       UNORM_FCD, FALSE,
5076                                       status);
5077         if(*status == U_BUFFER_OVERFLOW_ERROR) {
5078             normSourceLen = len;
5079             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5080             if(normSource == NULL) {
5081                 *status = U_MEMORY_ALLOCATION_ERROR;
5082                 return 0;
5083             }
5084             *status = U_ZERO_ERROR;
5085             len = unorm_internalNormalize(normSource, normSourceLen,
5086                                           source, len,
5087                                           UNORM_FCD, FALSE,
5088                                           status);
5089         }
5090
5091         if(U_FAILURE(*status)) {
5092             return 0;
5093         }
5094         source = normSource;
5095     }
5096
5097     collIterate s;
5098     IInit_collIterate(coll, (UChar *)source, len, &s);
5099     if(source == normSource) {
5100         s.flags &= ~UCOL_ITER_NORM;
5101     }
5102
5103     if(resultLength == 0 || primaries == NULL) {
5104         int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5105         if(normSource != normBuffer) {
5106             uprv_free(normSource);
5107         }
5108         return t;
5109     }
5110
5111     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5112
5113     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5114
5115     uint8_t *primStart = primaries;
5116     uint8_t *secStart = secondaries;
5117     uint8_t *terStart = tertiaries;
5118
5119     uint32_t order = 0;
5120
5121     uint8_t primary1 = 0;
5122     uint8_t primary2 = 0;
5123     uint8_t secondary = 0;
5124     uint8_t tertiary = 0;
5125     uint8_t caseSwitch = coll->caseSwitch;
5126     uint8_t tertiaryMask = coll->tertiaryMask;
5127     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5128     uint8_t tertiaryTop = coll->tertiaryTop;
5129     uint8_t tertiaryBottom = coll->tertiaryBottom;
5130     uint8_t tertiaryCommon = coll->tertiaryCommon;
5131
5132     uint32_t prevBuffSize = 0;
5133
5134     UBool finished = FALSE;
5135     UBool notIsContinuation = FALSE;
5136
5137     uint32_t count2 = 0, count3 = 0;
5138     uint8_t leadPrimary = 0;
5139
5140     for(;;) {
5141         for(i=prevBuffSize; i<minBufferSize; ++i) {
5142
5143             order = ucol_IGetNextCE(coll, &s, status);
5144
5145             if(order == 0) {
5146               continue;
5147             }
5148
5149             if(order == UCOL_NO_MORE_CES) {
5150                 finished = TRUE;
5151                 break;
5152             }
5153
5154             notIsContinuation = !isContinuation(order);
5155
5156             if(notIsContinuation) {
5157               tertiary = (uint8_t)((order & tertiaryMask));
5158             } else {
5159               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5160             }
5161             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5162             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5163             primary1 = (uint8_t)(order >> 8);
5164
5165             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5166             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5167             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5168             /* regular and simple sortkey calc */
5169             if(primary1 != UCOL_IGNORABLE) {
5170               if(notIsContinuation) {
5171                 if(leadPrimary == primary1) {
5172                   *primaries++ = primary2;
5173                 } else {
5174                   if(leadPrimary != 0) {
5175                     *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5176                   }
5177                   if(primary2 == UCOL_IGNORABLE) {
5178                   /* one byter, not compressed */
5179                       *primaries++ = primary1;
5180                       leadPrimary = 0;
5181                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5182                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5183                       //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5184                       (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5185                   /* not compressible */
5186                       leadPrimary = 0;
5187                       *primaries++ = primary1;
5188                       *primaries++ = primary2;
5189                   } else { /* compress */
5190                       *primaries++ = leadPrimary = primary1;
5191                       *primaries++ = primary2;
5192                   }
5193                 }
5194               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5195                 *primaries++ = primary1;
5196                 if(primary2 != UCOL_IGNORABLE) {
5197                   *primaries++ = primary2; /* second part */
5198                 }
5199               }
5200             }
5201
5202             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5203               /* This is compression code. */
5204               if (secondary == UCOL_COMMON2 && notIsContinuation) {
5205                 ++count2;
5206               } else {
5207                 if (count2 > 0) {
5208                   if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5209                     while (count2 > UCOL_TOP_COUNT2) {
5210                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5211                       count2 -= (uint32_t)UCOL_TOP_COUNT2;
5212                     }
5213                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5214                   } else {
5215                     while (count2 > UCOL_BOT_COUNT2) {
5216                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5217                       count2 -= (uint32_t)UCOL_BOT_COUNT2;
5218                     }
5219                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5220                   }
5221                   count2 = 0;
5222                 }
5223                 *secondaries++ = secondary;
5224               }
5225             }
5226
5227             if(notIsContinuation) {
5228               tertiary ^= caseSwitch;
5229             }
5230
5231               if(tertiary > 0) {
5232               /* This is compression code. */
5233               /* sequence size check is included in the if clause */
5234               if (tertiary == tertiaryCommon && notIsContinuation) {
5235                 ++count3;
5236               } else {
5237                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5238                   tertiary += tertiaryAddition;
5239                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5240                   tertiary -= tertiaryAddition;
5241                 }
5242                 if (count3 > 0) {
5243                   if ((tertiary > tertiaryCommon)) {
5244                     while (count3 > coll->tertiaryTopCount) {
5245                       *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5246                       count3 -= (uint32_t)coll->tertiaryTopCount;
5247                     }
5248                     *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5249                   } else {
5250                     while (count3 > coll->tertiaryBottomCount) {
5251                       *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5252                       count3 -= (uint32_t)coll->tertiaryBottomCount;
5253                     }
5254                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5255                   }
5256                   count3 = 0;
5257                 }
5258                 *tertiaries++ = tertiary;
5259               }
5260             }
5261
5262             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5263               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5264                 IInit_collIterate(coll, (UChar *)source, len, &s);
5265                 if(source == normSource) {
5266                     s.flags &= ~UCOL_ITER_NORM;
5267                 }
5268                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5269                 *status = U_BUFFER_OVERFLOW_ERROR;
5270                 finished = TRUE;
5271                 break;
5272               } else { /* It's much nicer if we can actually reallocate */
5273                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5274                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5275                 if(U_SUCCESS(*status)) {
5276                   *result = primStart;
5277                   primarySafeEnd = primStart + resultLength - 2;
5278                 } else {
5279                   IInit_collIterate(coll, (UChar *)source, len, &s);
5280                   if(source == normSource) {
5281                       s.flags &= ~UCOL_ITER_NORM;
5282                   }
5283                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5284                   finished = TRUE;
5285                   break;
5286                 }
5287               }
5288             }
5289         }
5290         if(finished) {
5291             break;
5292         } else {
5293           prevBuffSize = minBufferSize;
5294           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5295           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5296           minBufferSize *= 2;
5297           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5298             IInit_collIterate(coll, (UChar *)source, len, &s);
5299             if(source == normSource) {
5300                 s.flags &= ~UCOL_ITER_NORM;
5301             }
5302             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5303             break;
5304           }
5305         }
5306     }
5307
5308     if(U_SUCCESS(*status)) {
5309       sortKeySize += (primaries - primStart);
5310       /* we have done all the CE's, now let's put them together to form a key */
5311       if (count2 > 0) {
5312         while (count2 > UCOL_BOT_COUNT2) {
5313           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5314           count2 -= (uint32_t)UCOL_BOT_COUNT2;
5315         }
5316         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5317       }
5318       uint32_t secsize = secondaries-secStart;
5319       sortKeySize += secsize;
5320       if(sortKeySize <= resultLength) {
5321         *(primaries++) = UCOL_LEVELTERMINATOR;
5322         uprv_memcpy(primaries, secStart, secsize);
5323         primaries += secsize;
5324       } else {
5325         if(allocateSKBuffer == TRUE) {
5326           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5327           if(U_SUCCESS(*status)) {
5328             *(primaries++) = UCOL_LEVELTERMINATOR;
5329             *result = primStart;
5330             uprv_memcpy(primaries, secStart, secsize);
5331           }
5332         } else {
5333           *status = U_BUFFER_OVERFLOW_ERROR;
5334         }
5335       }
5336
5337       if (count3 > 0) {
5338         if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5339           while (count3 >= coll->tertiaryTopCount) {
5340             *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5341             count3 -= (uint32_t)coll->tertiaryTopCount;
5342           }
5343           *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5344         } else {
5345           while (count3 > coll->tertiaryBottomCount) {
5346             *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5347             count3 -= (uint32_t)coll->tertiaryBottomCount;
5348           }
5349           *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5350         }
5351       }
5352       uint32_t tersize = tertiaries - terStart;
5353       sortKeySize += tersize;
5354       if(sortKeySize <= resultLength) {
5355         *(primaries++) = UCOL_LEVELTERMINATOR;
5356         uprv_memcpy(primaries, terStart, tersize);
5357         primaries += tersize;
5358       } else {
5359         if(allocateSKBuffer == TRUE) {
5360           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5361           if(U_SUCCESS(*status)) {
5362             *result = primStart;
5363             *(primaries++) = UCOL_LEVELTERMINATOR;
5364             uprv_memcpy(primaries, terStart, tersize);
5365           }
5366         } else {
5367           *status = U_MEMORY_ALLOCATION_ERROR;
5368         }
5369       }
5370
5371       *(primaries++) = '\0';
5372     }
5373
5374     if(terStart != tert) {
5375         uprv_free(terStart);
5376         uprv_free(secStart);
5377     }
5378
5379     if(normSource != normBuffer) {
5380         uprv_free(normSource);
5381     }
5382
5383     if(allocateSKBuffer == TRUE) {
5384       *result = (uint8_t*)uprv_malloc(sortKeySize);
5385       /* test for NULL */
5386       if (*result == NULL) {
5387         *status = U_MEMORY_ALLOCATION_ERROR;
5388         return sortKeySize;
5389       }
5390       uprv_memcpy(*result, primStart, sortKeySize);
5391       if(primStart != prim) {
5392         uprv_free(primStart);
5393       }
5394     }
5395
5396     return sortKeySize;
5397 }
5398
5399 static inline
5400 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5401   UBool notIsContinuation = !isContinuation(CE);
5402   uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5403   if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5404     || (!notIsContinuation && *wasShifted))
5405     || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5406     // The stuff below should probably be in the sortkey code... maybe not...
5407     if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5408                         /* we should just completely ignore it */
5409       *wasShifted = TRUE;
5410       //continue;
5411     }
5412     //*wasShifted = TRUE;
5413     return TRUE;
5414   } else {
5415     *wasShifted = FALSE;
5416     return FALSE;
5417   }
5418 }
5419 static inline
5420 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5421   if(level < maxLevel) {
5422     dest[i++] = UCOL_LEVELTERMINATOR;
5423   } else {
5424     dest[i++] = 0;
5425   }
5426 }
5427
/** Level identifiers used by partial sort key generation. */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    UCOL_PSK_QUIN       = 5, /** extra level, not used - it only exists because the level field has three bits */
    UCOL_PSK_IDENTICAL  = 6,
    UCOL_PSK_NULL       = 7, /** end of the sort key; this level just produces zeros */
    UCOL_PSK_LIMIT           /** one past the last level identifier */
};
5440
/**
 * Layout of the collation processing state that is stored in state[1].
 * To read a field, shift the state word right by the field's *_SHIFT value
 * and AND the result with the field's *_MASK value.
 */
enum {
    /** Level identifier: holds one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK = 0x7,

    /**
     * Number of bytes of a primary or quaternary that were already written.
     * Can only be 0 or 1, because a primary or quaternary yields at most two
     * bytes. The same bit also records that the French secondary level is
     * finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 0x1,

    /** Boolean: was the last value shifted? */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK = 0x1,

    /**
     * Number of French bytes already written; the field is two bits wide.
     * French processing reverses the secondary values, but continuations have
     * to keep their order: abc1c2c3de has to become edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK = 0x3,

    /** Number of bytes of a bocsu sequence already used; the field is two bits wide. */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK = 0x3,

    /** Number of CEs consumed; the mask is 19 bits wide. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
};
5466
// Macro calculating the number of expansion CEs available: the distance
// between the CEpos and toReturn members of s. The whole expansion is
// parenthesized so the macro behaves as a single value inside any larger
// expression (for example after '!' or next to '*').
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5469
5470
5471 /** main sortkey part procedure. On the first call,
5472  *  you should pass in a collator, an iterator, empty state
5473  *  state[0] == state[1] == 0, a buffer to hold results
5474  *  number of bytes you need and an error code pointer.
5475  *  Make sure your buffer is big enough to hold the wanted
5476  *  number of sortkey bytes. I don't check.
5477  *  The only meaningful status you can get back is
5478  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5479  *  have been dealt a raw deal and that you probably won't
5480  *  be able to use partial sortkey generation for this
5481  *  particular combination of string and collator. This
5482  *  is highly unlikely, but you should still check the error code.
5483  *  Any other status means that you're not in a sane situation
5484  *  anymore. After the first call, preserve state values and
5485  *  use them on subsequent calls to obtain more bytes of a sortkey.
5486  *  Use until the number of bytes written is smaller than the requested
5487  *  number of bytes. Generated sortkey is not compatible with the
5488  *  one generated by ucol_getSortKey, as we don't do any compression.
5489  *  However, levels are still terminated by a 1 (one) and the sortkey
5490  *  is terminated by a 0 (zero). Identical level is the same as in the
5491  *  regular sortkey - internal bocu-1 implementation is used.
5492  *  For curious, although you cannot do much about this, here is
5493  *  the structure of state words.
5494  *  state[0] - iterator state. Depends on the iterator implementation,
5495  *             but allows the iterator to continue where it stopped in
5496  *             the last iteration.
5497  *  state[1] - collation processing state. Here is the distribution
5498  *             of the bits:
5499  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5500  *             quaternary, quin (we don't use this one), identical and
5501  *             null (producing only zeroes - first one to terminate the
5502  *             sortkey and subsequent to fill the buffer).
5503  *   3       - byte count. Number of bytes written on the primary level.
5504  *   4       - was shifted. Whether the previous iteration finished in the
5505  *             shifted state.
5506  *   5, 6    - French continuation bytes written. See the comment in the enum
5507  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5508  *             the identical level.
5509  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
5510  *             since the last successful update of the iterator state.
5511  */
5512 U_CAPI int32_t U_EXPORT2
5513 ucol_nextSortKeyPart(const UCollator *coll,
5514                      UCharIterator *iter,
5515                      uint32_t state[2],
5516                      uint8_t *dest, int32_t count,
5517                      UErrorCode *status) {
5518     /* error checking */
5519     if(status==NULL || U_FAILURE(*status)) {
5520         return 0;
5521     }
5522     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5523     if( coll==NULL || iter==NULL ||
5524         state==NULL ||
5525         count<0 || (count>0 && dest==NULL)
5526     ) {
5527         *status=U_ILLEGAL_ARGUMENT_ERROR;
5528     }
5529
5530     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5531                   coll, iter, state[0], state[1], dest, count);
5532
5533     if(count==0) {
5534         /* nothing to do */
5535         UTRACE_EXIT_VALUE(0);
5536         return 0;
5537     }
5538     /** Setting up situation according to the state we got from the previous iteration */
5539     // The state of the iterator from the previous invocation
5540     uint32_t iterState = state[0];
5541     // Has the last iteration ended in the shifted state
5542     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5543     // What is the current level of the sortkey?
5544     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5545     // Have we written only one byte from a two byte primary in the previous iteration?
5546     // Also on secondary level - have we finished with the French secondary?
5547     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5548     // number of bytes in the continuation buffer for French
5549     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5550     // Number of bytes already written from a bocsu sequence. Since
5551     // the longes bocsu sequence is 4 long, this can be up to 3.
5552     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5553     // Number of elements that need to be consumed in this iteration because
5554     // the iterator returned UITER_NO_STATE at the end of the last iteration,
5555     // so we had to save the last valid state.
5556     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5557
5558     /** values that depend on the collator attributes */
5559     // strength of the collator.
5560     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5561     // maximal level of the partial sortkey. Need to take whether case level is done
5562     int32_t maxLevel = 0;
5563     if(strength < UCOL_TERTIARY) {
5564       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5565         maxLevel = UCOL_PSK_CASE;
5566       } else {
5567         maxLevel = strength;
5568       }
5569     } else {
5570         if(strength == UCOL_TERTIARY) {
5571           maxLevel = UCOL_PSK_TERTIARY;
5572         } else if(strength == UCOL_QUATERNARY) {
5573           maxLevel = UCOL_PSK_QUATERNARY;
5574         } else { // identical
5575           maxLevel = UCOL_IDENTICAL;
5576         }
5577     }
5578     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5579     uint8_t UCOL_HIRAGANA_QUAD =
5580       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5581     // Boundary value that decides whether a CE is shifted or not
5582     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5583     // Are we doing French collation?
5584     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5585
5586     /** initializing the collation state */
5587     UBool notIsContinuation = FALSE;
5588     uint32_t CE = UCOL_NO_MORE_CES;
5589
5590     collIterate s;
5591     IInit_collIterate(coll, NULL, -1, &s);
5592     s.iterator = iter;
5593     s.flags |= UCOL_USE_ITERATOR;
5594     // This variable tells us whether we have produced some other levels in this iteration
5595     // before we moved to the identical level. In that case, we need to switch the
5596     // type of the iterator.
5597     UBool doingIdenticalFromStart = FALSE;
5598     // Normalizing iterator
5599     // The division for the array length may truncate the array size to
5600     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5601     // for all platforms anyway.
5602     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5603     UNormIterator *normIter = NULL;
5604     // If the normalization is turned on for the collator and we are below identical level
5605     // we will use a FCD normalizing iterator
5606     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5607       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5608       s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5609       s.flags &= ~UCOL_ITER_NORM;
5610       if(U_FAILURE(*status)) {
5611         UTRACE_EXIT_STATUS(*status);
5612         return 0;
5613       }
5614     } else if(level == UCOL_PSK_IDENTICAL) {
5615       // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5616       // will be updating the state - and this cannot be done on an ordinary iterator.
5617       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5618       s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5619       s.flags &= ~UCOL_ITER_NORM;
5620       if(U_FAILURE(*status)) {
5621         UTRACE_EXIT_STATUS(*status);
5622         return 0;
5623       }
5624       doingIdenticalFromStart = TRUE;
5625     }
5626
5627     // This is the tentative new state of the iterator. The problem
5628     // is that the iterator might return an undefined state, in
5629     // which case we should save the last valid state and increase
5630     // the iterator skip value.
5631     uint32_t newState = 0;
5632
5633     // First, we set the iterator to the last valid position
5634     // from the last iteration. This was saved in state[0].
5635     if(iterState == 0) {
5636       /* initial state */
5637       if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5638         s.iterator->move(s.iterator, 0, UITER_LIMIT);
5639       } else {
5640         s.iterator->move(s.iterator, 0, UITER_START);
5641       }
5642     } else {
5643         /* reset to previous state */
5644       s.iterator->setState(s.iterator, iterState, status);
5645       if(U_FAILURE(*status)) {
5646           UTRACE_EXIT_STATUS(*status);
5647           return 0;
5648       }
5649     }
5650
5651
5652
5653     // This variable tells us whether we can attempt to update the state
5654     // of iterator. Situations where we don't want to update iterator state
5655     // are the existence of expansion CEs that are not yet processed, and
5656     // finishing the case level without enough space in the buffer to insert
5657     // a level terminator.
5658     UBool canUpdateState = TRUE;
5659
5660     // Consume all the CEs that were consumed at the end of the previous
5661     // iteration without updating the iterator state. On identical level,
5662     // consume the code points.
5663     int32_t counter = cces;
5664     if(level < UCOL_PSK_IDENTICAL) {
5665       while(counter-->0) {
5666         // If we're doing French and we are on the secondary level,
5667         // we go backwards.
5668         if(level == UCOL_PSK_SECONDARY && doingFrench) {
5669           CE = ucol_IGetPrevCE(coll, &s, status);
5670         } else {
5671           CE = ucol_IGetNextCE(coll, &s, status);
5672         }
5673         if(CE==UCOL_NO_MORE_CES) {
5674           /* should not happen */
5675           *status=U_INTERNAL_PROGRAM_ERROR;
5676           UTRACE_EXIT_STATUS(*status);
5677           return 0;
5678         }
5679         if(uprv_numAvailableExpCEs(s)) {
5680           canUpdateState = FALSE;
5681         }
5682       }
5683     } else {
5684       while(counter-->0) {
5685         uiter_next32(s.iterator);
5686       }
5687     }
5688
5689     // French secondary needs to know whether the iterator state of zero came from previous level OR
5690     // from a new invocation...
5691     UBool wasDoingPrimary = FALSE;
5692     // destination buffer byte counter. When this guy
5693     // gets to count, we're done with the iteration
5694     int32_t i = 0;
5695     // used to count the zero bytes written after we
5696     // have finished with the sort key
5697     int32_t j = 0;
5698
5699
5700     // Hm.... I think we're ready to plunge in. Basic story is as following:
5701     // we have a fall through case based on level. This is used for initial
5702     // positioning on iteration start. Every level processor contains a
5703     // for(;;) which will be broken when we exhaust all the CEs. Other
5704     // way to exit is a goto saveState, which happens when we have filled
5705     // out our buffer.
5706     switch(level) {
5707     case UCOL_PSK_PRIMARY:
5708       wasDoingPrimary = TRUE;
5709       for(;;) {
5710           if(i==count) {
5711               goto saveState;
5712           }
5713           // We should save the state only if we
5714           // are sure that we are done with the
5715           // previous iterator state
5716           if(canUpdateState && byteCountOrFrenchDone == 0) {
5717             newState = s.iterator->getState(s.iterator);
5718             if(newState != UITER_NO_STATE) {
5719               iterState = newState;
5720               cces = 0;
5721             }
5722           }
5723           CE = ucol_IGetNextCE(coll, &s, status);
5724           cces++;
5725           if(CE==UCOL_NO_MORE_CES) {
5726               // Add the level separator
5727               terminatePSKLevel(level, maxLevel, i, dest);
5728               byteCountOrFrenchDone=0;
5729               // Restart the iteration an move to the
5730               // second level
5731               s.iterator->move(s.iterator, 0, UITER_START);
5732               cces = 0;
5733               level = UCOL_PSK_SECONDARY;
5734               break;
5735           }
5736           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5737             CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5738             if(CE != 0) {
5739               if(byteCountOrFrenchDone == 0) {
5740                 // get the second byte of primary
5741                 dest[i++]=(uint8_t)(CE >> 8);
5742               } else {
5743                 byteCountOrFrenchDone = 0;
5744               }
5745               if((CE &=0xff)!=0) {
5746                   if(i==count) {
5747                       /* overflow */
5748                       byteCountOrFrenchDone = 1;
5749                       cces--;
5750                       goto saveState;
5751                   }
5752                   dest[i++]=(uint8_t)CE;
5753               }
5754             }
5755           }
5756           if(uprv_numAvailableExpCEs(s)) {
5757             canUpdateState = FALSE;
5758           } else {
5759             canUpdateState = TRUE;
5760           }
5761       }
5762       /* fall through to next level */
5763     case UCOL_PSK_SECONDARY:
5764       if(strength >= UCOL_SECONDARY) {
5765         if(!doingFrench) {
5766           for(;;) {
5767             if(i == count) {
5768               goto saveState;
5769             }
5770             // We should save the state only if we
5771             // are sure that we are done with the
5772             // previous iterator state
5773             if(canUpdateState) {
5774               newState = s.iterator->getState(s.iterator);
5775               if(newState != UITER_NO_STATE) {
5776                 iterState = newState;
5777                 cces = 0;
5778               }
5779             }
5780             CE = ucol_IGetNextCE(coll, &s, status);
5781             cces++;
5782             if(CE==UCOL_NO_MORE_CES) {
5783                 // Add the level separator
5784                 terminatePSKLevel(level, maxLevel, i, dest);
5785                 byteCountOrFrenchDone = 0;
5786                 // Restart the iteration an move to the
5787                 // second level
5788                 s.iterator->move(s.iterator, 0, UITER_START);
5789                 cces = 0;
5790                 level = UCOL_PSK_CASE;
5791                 break;
5792             }
5793             if(!isShiftedCE(CE, LVT, &wasShifted)) {
5794               CE >>= 8; /* get secondary */
5795               if(CE != 0) {
5796                 dest[i++]=(uint8_t)CE;
5797               }
5798             }
5799             if(uprv_numAvailableExpCEs(s)) {
5800               canUpdateState = FALSE;
5801             } else {
5802               canUpdateState = TRUE;
5803             }
5804           }
5805         } else { // French secondary processing
5806           uint8_t frenchBuff[UCOL_MAX_BUFFER];
5807           int32_t frenchIndex = 0;
5808           // Here we are going backwards.
5809           // If the iterator is at the beggining, it should be
5810           // moved to end.
5811           if(wasDoingPrimary) {
5812             s.iterator->move(s.iterator, 0, UITER_LIMIT);
5813             cces = 0;
5814           }
5815           for(;;) {
5816             if(i == count) {
5817               goto saveState;
5818             }
5819             if(canUpdateState) {
5820               newState = s.iterator->getState(s.iterator);
5821               if(newState != UITER_NO_STATE) {
5822                 iterState = newState;
5823                 cces = 0;
5824               }
5825             }
5826             CE = ucol_IGetPrevCE(coll, &s, status);
5827             cces++;
5828             if(CE==UCOL_NO_MORE_CES) {
5829                 // Add the level separator
5830                 terminatePSKLevel(level, maxLevel, i, dest);
5831                 byteCountOrFrenchDone = 0;
5832                 // Restart the iteration an move to the next level
5833                 s.iterator->move(s.iterator, 0, UITER_START);
5834                 level = UCOL_PSK_CASE;
5835                 break;
5836             }
5837             if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5838               // reverse when we get a first non-continuation CE.
5839               CE >>= 8;
5840               frenchBuff[frenchIndex++] = (uint8_t)CE;
5841             } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5842               CE >>= 8; /* get secondary */
5843               if(!frenchIndex) {
5844                 if(CE != 0) {
5845                   dest[i++]=(uint8_t)CE;
5846                 }
5847               } else {
5848                 frenchBuff[frenchIndex++] = (uint8_t)CE;
5849                 frenchIndex -= usedFrench;
5850                 usedFrench = 0;
5851                 while(i < count && frenchIndex) {
5852                   dest[i++] = frenchBuff[--frenchIndex];
5853                   usedFrench++;
5854                 }
5855               }
5856             }
5857             if(uprv_numAvailableExpCEs(s)) {
5858               canUpdateState = FALSE;
5859             } else {
5860               canUpdateState = TRUE;
5861             }
5862           }
5863         }
5864       } else {
5865         level = UCOL_PSK_CASE;
5866       }
5867         /* fall through to next level */
5868     case UCOL_PSK_CASE:
5869       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5870         uint32_t caseShift = UCOL_CASE_SHIFT_START;
5871         uint8_t caseByte = UCOL_CASE_BYTE_START;
5872         uint8_t caseBits = 0;
5873
5874         for(;;) {
5875           if(i == count) {
5876             goto saveState;
5877           }
5878           // We should save the state only if we
5879           // are sure that we are done with the
5880           // previous iterator state
5881           if(canUpdateState) {
5882             newState = s.iterator->getState(s.iterator);
5883             if(newState != UITER_NO_STATE) {
5884               iterState = newState;
5885               cces = 0;
5886             }
5887           }
5888           CE = ucol_IGetNextCE(coll, &s, status);
5889           cces++;
5890           if(CE==UCOL_NO_MORE_CES) {
5891             // On the case level we might have an unfinished
5892             // case byte. Add one if it's started.
5893             if(caseShift != UCOL_CASE_SHIFT_START) {
5894               dest[i++] = caseByte;
5895             }
5896             cces = 0;
5897             // We have finished processing CEs on this level.
5898             // However, we don't know if we have enough space
5899             // to add a case level terminator.
5900             if(i < count) {
5901               // Add the level separator
5902               terminatePSKLevel(level, maxLevel, i, dest);
5903               // Restart the iteration and move to the
5904               // next level
5905               s.iterator->move(s.iterator, 0, UITER_START);
5906               level = UCOL_PSK_TERTIARY;
5907             } else {
5908               canUpdateState = FALSE;
5909             }
5910             break;
5911           }
5912
5913           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5914             if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5915                 // do the case level if we need to do it. We don't want to calculate
5916                 // case level for primary ignorables if we have only primary strength and case level
5917                 // otherwise we would break well formedness of CEs
5918               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5919               caseBits = (uint8_t)(CE & 0xC0);
5920               // this copies the case level logic from the
5921               // sort key generation code
5922               if(CE != 0) {
5923                 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5924                   if((caseBits & 0xC0) == 0) {
5925                     caseByte |= 1 << (--caseShift);
5926                   } else {
5927                     caseByte |= 0 << (--caseShift);
5928                     /* second bit */
5929                     if(caseShift == 0) {
5930                       dest[i++] = caseByte;
5931                       caseShift = UCOL_CASE_SHIFT_START;
5932                       caseByte = UCOL_CASE_BYTE_START;
5933                     }
5934                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
5935                   }
5936                 } else {
5937                   if((caseBits & 0xC0) == 0) {
5938                     caseByte |= 0 << (--caseShift);
5939                   } else {
5940                     caseByte |= 1 << (--caseShift);
5941                     /* second bit */
5942                     if(caseShift == 0) {
5943                       dest[i++] = caseByte;
5944                       caseShift = UCOL_CASE_SHIFT_START;
5945                       caseByte = UCOL_CASE_BYTE_START;
5946                     }
5947                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
5948                   }
5949                 }
5950               }
5951
5952             }
5953           }
5954           // Not sure this is correct for the case level - revisit
5955           if(uprv_numAvailableExpCEs(s)) {
5956             canUpdateState = FALSE;
5957           } else {
5958             canUpdateState = TRUE;
5959           }
5960         }
5961       } else {
5962         level = UCOL_PSK_TERTIARY;
5963       }
5964         /* fall through to next level */
5965     case UCOL_PSK_TERTIARY:
5966       if(strength >= UCOL_TERTIARY) {
5967         for(;;) {
5968           if(i == count) {
5969             goto saveState;
5970           }
5971           // We should save the state only if we
5972           // are sure that we are done with the
5973           // previous iterator state
5974           if(canUpdateState) {
5975             newState = s.iterator->getState(s.iterator);
5976             if(newState != UITER_NO_STATE) {
5977               iterState = newState;
5978               cces = 0;
5979             }
5980           }
5981           CE = ucol_IGetNextCE(coll, &s, status);
5982           cces++;
5983           if(CE==UCOL_NO_MORE_CES) {
5984               // Add the level separator
5985               terminatePSKLevel(level, maxLevel, i, dest);
5986               byteCountOrFrenchDone = 0;
5987               // Restart the iteration an move to the
5988               // second level
5989               s.iterator->move(s.iterator, 0, UITER_START);
5990               cces = 0;
5991               level = UCOL_PSK_QUATERNARY;
5992               break;
5993           }
5994           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5995             notIsContinuation = !isContinuation(CE);
5996
5997             if(notIsContinuation) {
5998               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5999               CE ^= coll->caseSwitch;
6000               CE &= coll->tertiaryMask;
6001             } else {
6002               CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6003             }
6004
6005             if(CE != 0) {
6006               dest[i++]=(uint8_t)CE;
6007             }
6008           }
6009           if(uprv_numAvailableExpCEs(s)) {
6010             canUpdateState = FALSE;
6011           } else {
6012             canUpdateState = TRUE;
6013           }
6014         }
6015       } else {
6016         // if we're not doing tertiary
6017         // skip to the end
6018         level = UCOL_PSK_NULL;
6019       }
6020         /* fall through to next level */
6021     case UCOL_PSK_QUATERNARY:
6022       if(strength >= UCOL_QUATERNARY) {
6023         for(;;) {
6024           if(i == count) {
6025             goto saveState;
6026           }
6027           // We should save the state only if we
6028           // are sure that we are done with the
6029           // previous iterator state
6030           if(canUpdateState) {
6031             newState = s.iterator->getState(s.iterator);
6032             if(newState != UITER_NO_STATE) {
6033               iterState = newState;
6034               cces = 0;
6035             }
6036           }
6037           CE = ucol_IGetNextCE(coll, &s, status);
6038           cces++;
6039           if(CE==UCOL_NO_MORE_CES) {
6040               // Add the level separator
6041               terminatePSKLevel(level, maxLevel, i, dest);
6042               //dest[i++] = UCOL_LEVELTERMINATOR;
6043               byteCountOrFrenchDone = 0;
6044               // Restart the iteration an move to the
6045               // second level
6046               s.iterator->move(s.iterator, 0, UITER_START);
6047               cces = 0;
6048               level = UCOL_PSK_QUIN;
6049               break;
6050           }
6051           if(isShiftedCE(CE, LVT, &wasShifted)) {
6052             CE >>= 16; /* get primary */
6053             if(CE != 0) {
6054               if(byteCountOrFrenchDone == 0) {
6055                 dest[i++]=(uint8_t)(CE >> 8);
6056               } else {
6057                 byteCountOrFrenchDone = 0;
6058               }
6059               if((CE &=0xff)!=0) {
6060                   if(i==count) {
6061                       /* overflow */
6062                       byteCountOrFrenchDone = 1;
6063                       goto saveState;
6064                   }
6065                   dest[i++]=(uint8_t)CE;
6066               }
6067             }
6068           } else {
6069             notIsContinuation = !isContinuation(CE);
6070             if(notIsContinuation) {
6071               if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6072                 dest[i++] = UCOL_HIRAGANA_QUAD;
6073               } else {
6074                 dest[i++] = 0xFF;
6075               }
6076             }
6077           }
6078           if(uprv_numAvailableExpCEs(s)) {
6079             canUpdateState = FALSE;
6080           } else {
6081             canUpdateState = TRUE;
6082           }
6083         }
6084       } else {
6085         // if we're not doing quaternary
6086         // skip to the end
6087         level = UCOL_PSK_NULL;
6088       }
6089         /* fall through to next level */
6090     case UCOL_PSK_QUIN:
6091       level = UCOL_PSK_IDENTICAL;
6092         /* fall through to next level */
6093     case UCOL_PSK_IDENTICAL:
6094       if(strength >= UCOL_IDENTICAL) {
6095         UChar32 first, second;
6096         int32_t bocsuBytesWritten = 0;
6097         // We always need to do identical on
6098         // the NFD form of the string.
6099         if(normIter == NULL) {
6100           // we arrived from the level below and
6101           // normalization was not turned on.
6102           // therefore, we need to make a fresh NFD iterator
6103           normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6104           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6105         } else if(!doingIdenticalFromStart) {
6106           // there is an iterator, but we did some other levels.
6107           // therefore, we have a FCD iterator - need to make
6108           // a NFD one.
6109           // normIter being at the beginning does not guarantee
6110           // that the underlying iterator is at the beginning
6111           iter->move(iter, 0, UITER_START);
6112           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6113         }
6114         // At this point we have a NFD iterator that is positioned
6115         // in the right place
6116         if(U_FAILURE(*status)) {
6117           UTRACE_EXIT_STATUS(*status);
6118           return 0;
6119         }
6120         first = uiter_previous32(s.iterator);
6121         // maybe we're at the start of the string
6122         if(first == U_SENTINEL) {
6123           first = 0;
6124         } else {
6125           uiter_next32(s.iterator);
6126         }
6127
6128         j = 0;
6129         for(;;) {
6130           if(i == count) {
6131             if(j+1 < bocsuBytesWritten) {
6132               bocsuBytesUsed = j+1;
6133             }
6134             goto saveState;
6135           }
6136
6137           // On identical level, we will always save
6138           // the state if we reach this point, since
6139           // we don't depend on getNextCE for content
6140           // all the content is in our buffer and we
6141           // already either stored the full buffer OR
6142           // otherwise we won't arrive here.
6143           newState = s.iterator->getState(s.iterator);
6144           if(newState != UITER_NO_STATE) {
6145             iterState = newState;
6146             cces = 0;
6147           }
6148
6149           uint8_t buff[4];
6150           second = uiter_next32(s.iterator);
6151           cces++;
6152
6153           // end condition for identical level
6154           if(second == U_SENTINEL) {
6155             terminatePSKLevel(level, maxLevel, i, dest);
6156             level = UCOL_PSK_NULL;
6157             break;
6158           }
6159           bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6160           first = second;
6161
6162           j = 0;
6163           if(bocsuBytesUsed != 0) {
6164             while(bocsuBytesUsed-->0) {
6165               j++;
6166             }
6167           }
6168
6169           while(i < count && j < bocsuBytesWritten) {
6170             dest[i++] = buff[j++];
6171           }
6172         }
6173
6174       } else {
6175         level = UCOL_PSK_NULL;
6176       }
6177         /* fall through to next level */
6178     case UCOL_PSK_NULL:
6179       j = i;
6180       while(j<count) {
6181           dest[j++]=0;
6182       }
6183       break;
6184     default:
6185       *status = U_INTERNAL_PROGRAM_ERROR;
6186       UTRACE_EXIT_STATUS(*status);
6187       return 0;
6188     }
6189
6190 saveState:
6191     // Now we need to return stuff. First we want to see whether we have
6192     // done everything for the current state of iterator.
6193     if(byteCountOrFrenchDone
6194     || canUpdateState == FALSE
6195     || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) {
6196       // Any of above mean that the previous transaction
6197       // wasn't finished and that we should store the
6198       // previous iterator state.
6199       state[0] = iterState;
6200     } else {
6201       // The transaction is complete. We will continue in the next iteration.
6202         state[0] = s.iterator->getState(s.iterator);
6203         cces = 0;
6204     }
6205     // Store the number of bocsu bytes written.
6206     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6207       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6208     }
6209     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6210
6211     // Next we put in the level of comparison
6212     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6213
6214     // If we are doing French, we need to store whether we have just finished the French level
6215     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6216       state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6217     } else {
6218       state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6219     }
6220
6221     // Was the latest CE shifted
6222     if(wasShifted) {
6223       state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6224     }
6225     // Check for cces overflow
6226     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6227       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6228     }
6229     // Store cces
6230     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6231
6232     // Check for French overflow
6233     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6234       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6235     }
6236     // Store number of bytes written in the French secondary continuation sequence
6237     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6238
6239
6240     // If we have used normalizing iterator, get rid of it
6241     if(normIter != NULL) {
6242       unorm_closeIter(normIter);
6243     }
6244
6245     // Return number of meaningful sortkey bytes.
6246     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6247                   dest,i, state[0], state[1]);
6248     UTRACE_EXIT_VALUE(i);
6249     return i;
6250 }
6251
6252 /**
6253  * Produce a bound for a given sortkey and a number of levels.
6254  */
6255 U_CAPI int32_t U_EXPORT2
6256 ucol_getBound(const uint8_t       *source,
6257         int32_t             sourceLength,
6258         UColBoundMode       boundType,
6259         uint32_t            noOfLevels,
6260         uint8_t             *result,
6261         int32_t             resultLength,
6262         UErrorCode          *status) {
6263   // consistency checks
6264   if(status == NULL || U_FAILURE(*status)) {
6265     return 0;
6266   }
6267   if(source == NULL) {
6268     *status = U_ILLEGAL_ARGUMENT_ERROR;
6269     return 0;
6270   }
6271
6272   int32_t sourceIndex = 0;
6273   // Scan the string until we skip enough of the key OR reach the end of the key
6274   do {
6275     sourceIndex++;
6276     if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6277       noOfLevels--;
6278     }
6279   } while (noOfLevels > 0
6280     && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6281
6282   if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6283     && noOfLevels > 0) {
6284     *status = U_SORT_KEY_TOO_SHORT_WARNING;
6285   }
6286
6287
6288   // READ ME: this code assumes that the values for boundType
6289   // enum will not changes. They are set so that the enum value
6290   // corresponds to the number of extra bytes each bound type
6291   // needs.
6292   if(result != NULL && resultLength >= sourceIndex+boundType) {
6293     uprv_memcpy(result, source, sourceIndex);
6294     switch(boundType) {
6295     // Lower bound just gets terminated. No extra bytes
6296     case UCOL_BOUND_LOWER: // = 0
6297       break;
6298     // Upper bound needs one extra byte
6299     case UCOL_BOUND_UPPER: // = 1
6300       result[sourceIndex++] = 2;
6301       break;
6302     // Upper long bound needs two extra bytes
6303     case UCOL_BOUND_UPPER_LONG: // = 2
6304       result[sourceIndex++] = 0xFF;
6305       result[sourceIndex++] = 0xFF;
6306       break;
6307     default:
6308       *status = U_ILLEGAL_ARGUMENT_ERROR;
6309       return 0;
6310     }
6311     result[sourceIndex++] = 0;
6312
6313     return sourceIndex;
6314   } else {
6315     return sourceIndex+boundType+1;
6316   }
6317 }
6318
6319 /****************************************************************************/
6320 /* Following are the functions that deal with the properties of a collator  */
6321 /* there are new APIs and some compatibility APIs                           */
6322 /****************************************************************************/
6323
6324 static inline void
6325 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6326                     int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6327   uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6328   UBool reverseSecondary = FALSE;
6329   if(!isContinuation(CE)) {
6330     tertiary = (uint8_t)((CE & coll->tertiaryMask));
6331     tertiary ^= coll->caseSwitch;
6332     reverseSecondary = TRUE;
6333   } else {
6334     tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6335     tertiary &= UCOL_REMOVE_CASE;
6336     reverseSecondary = FALSE;
6337   }
6338
6339   secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6340   primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6341   primary1 = (uint8_t)(CE >> 8);
6342
6343   if(primary1 != 0) {
6344     coll->latinOneCEs[ch] |= (primary1 << *primShift);
6345     *primShift -= 8;
6346   }
6347   if(primary2 != 0) {
6348     if(*primShift < 0) {
6349       coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6350       coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6351       coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6352       return;
6353     }
6354     coll->latinOneCEs[ch] |= (primary2 << *primShift);
6355     *primShift -= 8;
6356   }
6357   if(secondary != 0) {
6358     if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6359       coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6360       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6361     } else { // normal case
6362       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6363     }
6364     *secShift -= 8;
6365   }
6366   if(tertiary != 0) {
6367     coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6368     *terShift -= 8;
6369   }
6370 }
6371
6372 static inline UBool
6373 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6374     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6375     if(newTable == NULL) {
6376       *status = U_MEMORY_ALLOCATION_ERROR;
6377       coll->latinOneFailed = TRUE;
6378       return FALSE;
6379     }
6380     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6381     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6382     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6383     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6384     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6385     coll->latinOneTableLen = size;
6386     uprv_free(coll->latinOneCEs);
6387     coll->latinOneCEs = newTable;
6388     return TRUE;
6389 }
6390
6391 static UBool
6392 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6393   UBool result = TRUE;
6394   if(coll->latinOneCEs == NULL) {
6395     coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6396     if(coll->latinOneCEs == NULL) {
6397       *status = U_MEMORY_ALLOCATION_ERROR;
6398       return FALSE;
6399     }
6400     coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6401   }
6402   UChar ch = 0;
6403   UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6404   uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6405
6406   int32_t primShift = 24, secShift = 24, terShift = 24;
6407   uint32_t CE = 0;
6408   int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6409
6410   // TODO: make safe if you get more than you wanted...
6411   for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6412     primShift = 24; secShift = 24; terShift = 24;
6413     if(ch < 0x100) {
6414       CE = coll->latinOneMapping[ch];
6415     } else {
6416       CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6417       if(CE == UCOL_NOT_FOUND && coll->UCA) {
6418         CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6419       }
6420     }
6421     if(CE < UCOL_NOT_FOUND) {
6422       ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6423     } else {
6424       switch (getCETag(CE)) {
6425       case EXPANSION_TAG:
6426       case DIGIT_TAG:
6427         ucol_setText(it, &ch, 1, status);
6428         while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6429           if(primShift < 0 || secShift < 0 || terShift < 0) {
6430             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6431             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6432             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6433             break;
6434           }
6435           ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6436         }
6437         break;
6438       case CONTRACTION_TAG:
6439         // here is the trick
6440         // F2 is contraction. We do something very similar to contractions
6441         // but have two indices, one in the real contraction table and the
6442         // other to where we stuffed things. This hopes that we don't have
6443         // many contractions (this should work for latin-1 tables).
6444         {
6445           if((CE & 0x00FFF000) != 0) {
6446             *status = U_UNSUPPORTED_ERROR;
6447             goto cleanup_after_failure;
6448           }
6449
6450           const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6451
6452           CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6453
6454           coll->latinOneCEs[ch] = CE;
6455           coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6456           coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6457
6458           // We're going to jump into contraction table, pick the elements
6459           // and use them
6460           do {
6461               CE = *(coll->contractionCEs +
6462                   (UCharOffset - coll->contractionIndex));
6463               if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6464                 uint32_t size;
6465                 uint32_t i;    /* general counter */
6466                 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6467                 size = getExpansionCount(CE);
6468                 //CE = *CEOffset++;
6469                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6470                   for(i = 0; i<size; i++) {
6471                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6472                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6473                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6474                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6475                       break;
6476                     }
6477                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6478                   }
6479                 } else { /* else, we do */
6480                   while(*CEOffset != 0) {
6481                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6482                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6483                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6484                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6485                       break;
6486                     }
6487                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6488                   }
6489                 }
6490                 contractionOffset++;
6491               } else if(CE < UCOL_NOT_FOUND) {
6492                 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6493               } else {
6494                 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6495                 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6496                 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6497                 contractionOffset++;
6498               }
6499               UCharOffset++;
6500               primShift = 24; secShift = 24; terShift = 24;
6501               if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6502                 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6503                   goto cleanup_after_failure;
6504                 }
6505               }
6506           } while(*UCharOffset != 0xFFFF);
6507         }
6508         break;
6509       default:
6510         goto cleanup_after_failure;
6511       }
6512     }
6513   }
6514   // compact table
6515   if(contractionOffset < coll->latinOneTableLen) {
6516     if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6517       goto cleanup_after_failure;
6518     }
6519   }
6520   ucol_closeElements(it);
6521   return result;
6522
6523 cleanup_after_failure:
6524   // status should already be set before arriving here.
6525   coll->latinOneFailed = TRUE;
6526   ucol_closeElements(it);
6527   return FALSE;
6528 }
6529
6530 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6531   if(U_SUCCESS(*status)) {
6532     if(coll->caseFirst == UCOL_UPPER_FIRST) {
6533       coll->caseSwitch = UCOL_CASE_SWITCH;
6534     } else {
6535       coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6536     }
6537
6538     if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6539       coll->tertiaryMask = UCOL_REMOVE_CASE;
6540       coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6541       coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6542       coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6543       coll->tertiaryBottom = UCOL_COMMON_BOT3;
6544     } else {
6545       coll->tertiaryMask = UCOL_KEEP_CASE;
6546       coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6547       if(coll->caseFirst == UCOL_UPPER_FIRST) {
6548         coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6549         coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6550         coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6551       } else {
6552         coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6553         coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6554         coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6555       }
6556     }
6557
6558     /* Set the compression values */
6559     uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6560     coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6561     coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6562
6563     if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6564       && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6565       coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6566     } else {
6567       coll->sortKeyGen = ucol_calcSortKey;
6568     }
6569     if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6570       && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6571       if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6572         if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6573           //fprintf(stderr, "F");
6574           coll->latinOneUse = TRUE;
6575         } else {
6576           coll->latinOneUse = FALSE;
6577         }
6578         if(*status == U_UNSUPPORTED_ERROR) {
6579           *status = U_ZERO_ERROR;
6580         }
6581       } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6582         coll->latinOneUse = TRUE;
6583       }
6584     } else {
6585       coll->latinOneUse = FALSE;
6586     }
6587   }
6588 }
6589
6590 U_CAPI uint32_t  U_EXPORT2
6591 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6592   if(U_FAILURE(*status) || coll == NULL) {
6593     return 0;
6594   }
6595   if(len == -1) {
6596     len = u_strlen(varTop);
6597   }
6598   if(len == 0) {
6599     *status = U_ILLEGAL_ARGUMENT_ERROR;
6600     return 0;
6601   }
6602
6603   collIterate s;
6604   IInit_collIterate(coll, varTop, len, &s);
6605
6606   uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6607
6608   /* here we check if we have consumed all characters */
6609   /* you can put in either one character or a contraction */
6610   /* you shouldn't put more... */
6611   if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6612     *status = U_CE_NOT_FOUND_ERROR;
6613     return 0;
6614   }
6615
6616   uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6617
6618   if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6619     *status = U_PRIMARY_TOO_LONG_ERROR;
6620     return 0;
6621   }
6622   if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6623     coll->variableTopValueisDefault = FALSE;
6624     coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6625   }
6626
6627   return CE & UCOL_PRIMARYMASK;
6628 }
6629
6630 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6631   if(U_FAILURE(*status) || coll == NULL) {
6632     return 0;
6633   }
6634   return coll->variableTopValue<<16;
6635 }
6636
6637 U_CAPI void  U_EXPORT2
6638 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6639   if(U_FAILURE(*status) || coll == NULL) {
6640     return;
6641   }
6642
6643   if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6644       coll->variableTopValueisDefault = FALSE;
6645       coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6646   }
6647 }
6648 /* Attribute setter API */
6649 U_CAPI void  U_EXPORT2
6650 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6651     if(U_FAILURE(*status) || coll == NULL) {
6652       return;
6653     }
6654     UColAttributeValue oldFrench = coll->frenchCollation;
6655     UColAttributeValue oldCaseFirst = coll->caseFirst;
6656     switch(attr) {
6657     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6658       if(value == UCOL_ON) {
6659         coll->numericCollation = UCOL_ON;
6660         coll->numericCollationisDefault = FALSE;
6661       } else if (value == UCOL_OFF) {
6662         coll->numericCollation = UCOL_OFF;
6663         coll->numericCollationisDefault = FALSE;
6664       } else if (value == UCOL_DEFAULT) {
6665         coll->numericCollationisDefault = TRUE;
6666         coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6667       } else {
6668         *status = U_ILLEGAL_ARGUMENT_ERROR;
6669       }
6670       break;
6671     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6672       if(value == UCOL_ON) {
6673         coll->hiraganaQ = UCOL_ON;
6674         coll->hiraganaQisDefault = FALSE;
6675       } else if (value == UCOL_OFF) {
6676         coll->hiraganaQ = UCOL_OFF;
6677         coll->hiraganaQisDefault = FALSE;
6678       } else if (value == UCOL_DEFAULT) {
6679         coll->hiraganaQisDefault = TRUE;
6680         coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6681       } else {
6682         *status = U_ILLEGAL_ARGUMENT_ERROR;
6683       }
6684       break;
6685     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6686         if(value == UCOL_ON) {
6687             coll->frenchCollation = UCOL_ON;
6688             coll->frenchCollationisDefault = FALSE;
6689         } else if (value == UCOL_OFF) {
6690             coll->frenchCollation = UCOL_OFF;
6691             coll->frenchCollationisDefault = FALSE;
6692         } else if (value == UCOL_DEFAULT) {
6693             coll->frenchCollationisDefault = TRUE;
6694             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6695         } else {
6696             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6697         }
6698         break;
6699     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6700         if(value == UCOL_SHIFTED) {
6701             coll->alternateHandling = UCOL_SHIFTED;
6702             coll->alternateHandlingisDefault = FALSE;
6703         } else if (value == UCOL_NON_IGNORABLE) {
6704             coll->alternateHandling = UCOL_NON_IGNORABLE;
6705             coll->alternateHandlingisDefault = FALSE;
6706         } else if (value == UCOL_DEFAULT) {
6707             coll->alternateHandlingisDefault = TRUE;
6708             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6709         } else {
6710             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6711         }
6712         break;
6713     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6714         if(value == UCOL_LOWER_FIRST) {
6715             coll->caseFirst = UCOL_LOWER_FIRST;
6716             coll->caseFirstisDefault = FALSE;
6717         } else if (value == UCOL_UPPER_FIRST) {
6718             coll->caseFirst = UCOL_UPPER_FIRST;
6719             coll->caseFirstisDefault = FALSE;
6720         } else if (value == UCOL_OFF) {
6721           coll->caseFirst = UCOL_OFF;
6722           coll->caseFirstisDefault = FALSE;
6723         } else if (value == UCOL_DEFAULT) {
6724             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6725             coll->caseFirstisDefault = TRUE;
6726         } else {
6727             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6728         }
6729         break;
6730     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6731         if(value == UCOL_ON) {
6732             coll->caseLevel = UCOL_ON;
6733             coll->caseLevelisDefault = FALSE;
6734         } else if (value == UCOL_OFF) {
6735             coll->caseLevel = UCOL_OFF;
6736             coll->caseLevelisDefault = FALSE;
6737         } else if (value == UCOL_DEFAULT) {
6738             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6739             coll->caseLevelisDefault = TRUE;
6740         } else {
6741             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6742         }
6743         break;
6744     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6745         if(value == UCOL_ON) {
6746             coll->normalizationMode = UCOL_ON;
6747             coll->normalizationModeisDefault = FALSE;
6748         } else if (value == UCOL_OFF) {
6749             coll->normalizationMode = UCOL_OFF;
6750             coll->normalizationModeisDefault = FALSE;
6751         } else if (value == UCOL_DEFAULT) {
6752             coll->normalizationModeisDefault = TRUE;
6753             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6754         } else {
6755             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6756         }
6757         break;
6758     case UCOL_STRENGTH:         /* attribute for strength */
6759         if (value == UCOL_DEFAULT) {
6760             coll->strengthisDefault = TRUE;
6761             coll->strength = (UColAttributeValue)coll->options->strength;
6762         } else if (value <= UCOL_IDENTICAL) {
6763             coll->strengthisDefault = FALSE;
6764             coll->strength = value;
6765         } else {
6766             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6767         }
6768         break;
6769     case UCOL_ATTRIBUTE_COUNT:
6770     default:
6771         *status = U_ILLEGAL_ARGUMENT_ERROR;
6772         break;
6773     }
6774     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6775       coll->latinOneRegenTable = TRUE;
6776     } else {
6777       coll->latinOneRegenTable = FALSE;
6778     }
6779     ucol_updateInternalState(coll, status);
6780 }
6781
6782 U_CAPI UColAttributeValue  U_EXPORT2
6783 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6784     if(U_FAILURE(*status) || coll == NULL) {
6785       return UCOL_DEFAULT;
6786     }
6787     switch(attr) {
6788     case UCOL_NUMERIC_COLLATION:
6789       return coll->numericCollation;
6790     case UCOL_HIRAGANA_QUATERNARY_MODE:
6791       return coll->hiraganaQ;
6792     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6793         return coll->frenchCollation;
6794     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6795         return coll->alternateHandling;
6796     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6797         return coll->caseFirst;
6798     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6799         return coll->caseLevel;
6800     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6801         return coll->normalizationMode;
6802     case UCOL_STRENGTH:         /* attribute for strength */
6803         return coll->strength;
6804     case UCOL_ATTRIBUTE_COUNT:
6805     default:
6806         *status = U_ILLEGAL_ARGUMENT_ERROR;
6807         break;
6808     }
6809     return UCOL_DEFAULT;
6810 }
6811
6812 U_CAPI void U_EXPORT2
6813 ucol_setStrength(    UCollator                *coll,
6814             UCollationStrength        strength)
6815 {
6816   UErrorCode status = U_ZERO_ERROR;
6817   ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6818 }
6819
6820 U_CAPI UCollationStrength U_EXPORT2
6821 ucol_getStrength(const UCollator *coll)
6822 {
6823   UErrorCode status = U_ZERO_ERROR;
6824   return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6825 }
6826
6827 /****************************************************************************/
6828 /* Following are misc functions                                             */
6829 /* there are new APIs and some compatibility APIs                           */
6830 /****************************************************************************/
6831
6832 U_CAPI void U_EXPORT2
6833 ucol_getVersion(const UCollator* coll,
6834                 UVersionInfo versionInfo)
6835 {
6836     /* RunTime version  */
6837     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6838     /* Builder version*/
6839     uint8_t bdVersion = coll->image->version[0];
6840
6841     /* Charset Version. Need to get the version from cnv files
6842      * makeconv should populate cnv files with version and
6843      * an api has to be provided in ucnv.h to obtain this version
6844      */
6845     uint8_t csVersion = 0;
6846
6847     /* combine the version info */
6848     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6849
6850     /* Tailoring rules */
6851     versionInfo[0] = (uint8_t)(cmbVersion>>8);
6852     versionInfo[1] = (uint8_t)cmbVersion;
6853     versionInfo[2] = coll->image->version[1];
6854     if(coll->UCA) {
6855         versionInfo[3] = coll->UCA->image->UCAVersion[0];
6856     } else {
6857         versionInfo[3] = 0;
6858     }
6859 }
6860
6861
6862 /* This internal API checks whether a character is tailored or not */
6863 U_CAPI UBool  U_EXPORT2
6864 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6865   uint32_t CE = UCOL_NOT_FOUND;
6866   const UChar *ContractionStart = NULL;
6867   if(U_SUCCESS(*status) && coll != NULL) {
6868     if(coll == coll->UCA) {
6869       return FALSE;
6870     } else if(u < 0x100) { /* latin-1 */
6871       CE = coll->latinOneMapping[u];
6872       if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6873         return FALSE;
6874       }
6875     } else { /* regular */
6876       CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6877     }
6878
6879     if(isContraction(CE)) {
6880       ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6881       CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6882     }
6883
6884     if(CE == UCOL_NOT_FOUND) {
6885       return FALSE;
6886     } else {
6887       return TRUE;
6888     }
6889   } else {
6890     return FALSE;
6891   }
6892 }
6893
6894
6895 /****************************************************************************/
6896 /* Following are the string compare functions                               */
6897 /*                                                                          */
6898 /****************************************************************************/
6899
6900
6901 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6902 /*                     Used by strcoll if strength == identical and strings  */
6903 /*                     are otherwise equal.  Moved out-of-line because this  */
6904 /*                     is a rare case.                                       */
6905 /*                                                                           */
6906 /*                     Comparison must be done on NFD normalized strings.    */
6907 /*                     FCD is not good enough.                               */
6908 /*                                                                           */
6909 /*      TODO:  make an incremental NFD Comparison function, which could      */
6910 /*             be of general use                                             */
6911
6912 static
6913 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6914 {
6915
6916   // TODO: When we have an UChar iterator, we need to access the whole string. One
6917   // useful modification would be a UChar iterator extract API, since reset next next...
6918   // is not optimal.
6919   // TODO: Handle long strings. Do the same in compareUsingSortKeys.
6920
6921   // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6922   // of same type, but that doesn't really mean that it will stay that way.
6923
6924     // The division for the array length may truncate the array size to
6925     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6926     // for all platforms anyway.
6927     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6928     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6929     //UChar sStackBuf[256], tStackBuf[256];
6930     //int32_t sBufSize = 256, tBufSize = 256;
6931     int32_t            comparison;
6932     int32_t          sLen        = 0;
6933     UChar            *sBuf       = NULL;
6934     int32_t          tLen        = 0;
6935     UChar            *tBuf       = NULL;
6936     UBool freeSBuf = FALSE, freeTBuf = FALSE;
6937
6938     if (sColl->flags & UCOL_USE_ITERATOR) {
6939       UNormIterator *sNIt = NULL, *tNIt = NULL;
6940       sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
6941       tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
6942       sColl->iterator->move(sColl->iterator, 0, UITER_START);
6943       tColl->iterator->move(tColl->iterator, 0, UITER_START);
6944       UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
6945       UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
6946       comparison = u_strCompareIter(sIt, tIt, TRUE);
6947       unorm_closeIter(sNIt);
6948       unorm_closeIter(tNIt);
6949     } else {
6950       sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
6951       sBuf = sColl->string;
6952       tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
6953       tBuf = tColl->string;
6954
6955       if (normalize) {
6956           *status = U_ZERO_ERROR;
6957           if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
6958               sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6959                                      sBuf, sLen,
6960                                      FALSE, 0,
6961                                      status);
6962               if(*status == U_BUFFER_OVERFLOW_ERROR) {
6963                   if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
6964                                              &sColl->writableBuffer,
6965                                              (int32_t *)&sColl->writableBufSize, sLen,
6966                                              0)
6967                   ) {
6968                       *status = U_MEMORY_ALLOCATION_ERROR;
6969                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
6970                   }
6971                   *status = U_ZERO_ERROR;
6972                   sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
6973                                          sBuf, sLen,
6974                                          FALSE, 0,
6975                                          status);
6976               }
6977               if(freeSBuf) {
6978                 uprv_free(sBuf);
6979                 freeSBuf = FALSE;
6980               }
6981               sBuf = sColl->writableBuffer;
6982               if (sBuf != sColl->stackWritableBuffer) {
6983                   sColl->flags |= UCOL_ITER_ALLOCATED;
6984               }
6985           }
6986
6987           *status = U_ZERO_ERROR;
6988           if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
6989               tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
6990                                      tBuf, tLen,
6991                                      FALSE, 0,
6992                                      status);
6993               if(*status == U_BUFFER_OVERFLOW_ERROR) {
6994                   if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
6995                                              &tColl->writableBuffer,
6996                                              (int32_t *)&tColl->writableBufSize, tLen,
6997                                              0)
6998                   ) {
6999                       *status = U_MEMORY_ALLOCATION_ERROR;
7000                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7001                   }
7002                   *status = U_ZERO_ERROR;
7003                   tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7004                                          tBuf, tLen,
7005                                          FALSE, 0,
7006                                          status);
7007               }
7008               if(freeTBuf) {
7009                 uprv_free(tBuf);
7010                 freeTBuf = FALSE;
7011               }
7012               tBuf = tColl->writableBuffer;
7013               if (tBuf != tColl->stackWritableBuffer) {
7014                   tColl->flags |= UCOL_ITER_ALLOCATED;
7015               }
7016           }
7017       }
7018
7019       if (sLen == -1 && tLen == -1) {
7020           comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7021       } else {
7022           if (sLen == -1) {
7023               sLen = u_strlen(sBuf);
7024           }
7025           if (tLen == -1) {
7026               tLen = u_strlen(tBuf);
7027           }
7028           comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7029           if (comparison == 0) {
7030               comparison = sLen - tLen;
7031           }
7032       }
7033     }
7034
7035     if (comparison < 0) {
7036         return UCOL_LESS;
7037     } else if (comparison == 0) {
7038         return UCOL_EQUAL;
7039     } else /* comparison > 0 */ {
7040         return UCOL_GREATER;
7041     }
7042 }
7043
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */

/* Number of CEs held by the array embedded in ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;       /* start of the storage in use: localArray, or a heap block after ucol_CEBuf_Expand */
    uint32_t    *endp;      /* one past the last usable slot of buf */
    uint32_t    *pos;       /* next slot to be written by UCOL_CEBUF_PUT */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* initial storage, set up by UCOL_INIT_CEBUF */
} ucol_CEBuf;
7054
7055
7056 static
7057 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7058     (b)->buf = (b)->pos = (b)->localArray;
7059     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7060 }
7061
7062 static
7063 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7064     uint32_t  oldSize;
7065     uint32_t  newSize;
7066     uint32_t  *newBuf;
7067
7068     ci->flags |= UCOL_ITER_ALLOCATED;
7069     oldSize = b->pos - b->buf;
7070     newSize = oldSize * 2;
7071     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7072     if(newBuf != NULL) {
7073       uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7074       if (b->buf != b->localArray) {
7075           uprv_free(b->buf);
7076       }
7077       b->buf = newBuf;
7078       b->endp = b->buf + newSize;
7079       b->pos  = b->buf + oldSize;
7080     }
7081 }
7082
7083 static
7084 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7085     if (b->pos == b->endp) {
7086         ucol_CEBuf_Expand(b, ci);
7087 }
7088     *(b)->pos++ = ce;
7089 }
7090
7091 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7092 /* It is used when compare gets in trouble and needs to bail out                     */
7093 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7094                                                   collIterate *tColl,
7095                                                   UErrorCode *status)
7096 {
7097     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7098     uint8_t *sourceKeyP = sourceKey;
7099     uint8_t *targetKeyP = targetKey;
7100     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7101     const UCollator *coll = sColl->coll;
7102     UChar *source = NULL;
7103     UChar *target = NULL;
7104     int32_t result = UCOL_EQUAL;
7105     UChar sStackBuf[256], tStackBuf[256];
7106     int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7107     int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7108
7109     // TODO: Handle long strings. Do the same in ucol_checkIdent.
7110     if(sColl->flags & UCOL_USE_ITERATOR) {
7111         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7112         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7113         source = sStackBuf;
7114         UChar *sBufp = source;
7115         target = tStackBuf;
7116         UChar *tBufp = target;
7117         while(sColl->iterator->hasNext(sColl->iterator)) {
7118             *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7119         }
7120         while(tColl->iterator->hasNext(tColl->iterator)) {
7121             *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7122         }
7123         sourceLength = sBufp - source;
7124         targetLength = tBufp - target;
7125     } else { // no iterators
7126         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7127         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7128         source = sColl->string;
7129         target = tColl->string;
7130     }
7131
7132
7133
7134     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7135     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7136         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7137         if(sourceKeyP == NULL) {
7138             *status = U_MEMORY_ALLOCATION_ERROR;
7139             goto cleanup_and_do_compare;
7140         }
7141         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7142     }
7143
7144     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7145     if(targetKeyLen > UCOL_MAX_BUFFER) {
7146         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7147         if(targetKeyP == NULL) {
7148             *status = U_MEMORY_ALLOCATION_ERROR;
7149             goto cleanup_and_do_compare;
7150         }
7151         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7152     }
7153
7154     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7155
7156 cleanup_and_do_compare:
7157     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7158         uprv_free(sourceKeyP);
7159     }
7160
7161     if(targetKeyP != NULL && targetKeyP != targetKey) {
7162         uprv_free(targetKeyP);
7163     }
7164
7165     if(result<0) {
7166         return UCOL_LESS;
7167     } else if(result>0) {
7168         return UCOL_GREATER;
7169     } else {
7170         return UCOL_EQUAL;
7171     }
7172 }
7173
7174
7175 static inline UCollationResult
7176 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7177 //              const UCollator    *coll,
7178 //              const UChar        *source,
7179 //              int32_t            sourceLength,
7180 //              const UChar        *target,
7181 //              int32_t            targetLength,
7182               UErrorCode *status)
7183 {
7184     U_ALIGN_CODE(16);
7185
7186     const UCollator *coll = sColl->coll;
7187
7188
7189     // setting up the collator parameters
7190     UColAttributeValue strength = coll->strength;
7191     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7192
7193     UBool checkSecTer = initialCheckSecTer;
7194     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7195     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7196     UBool checkIdent = (strength == UCOL_IDENTICAL);
7197     UBool checkCase = (coll->caseLevel == UCOL_ON);
7198     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7199     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7200     UBool qShifted = shifted && checkQuad;
7201     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7202
7203     if(doHiragana && shifted) {
7204       return (ucol_compareUsingSortKeys(sColl, tColl, status));
7205     }
7206     uint8_t caseSwitch = coll->caseSwitch;
7207     uint8_t tertiaryMask = coll->tertiaryMask;
7208
7209     // This is the lowest primary value that will not be ignored if shifted
7210     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7211
7212     UCollationResult result = UCOL_EQUAL;
7213     UCollationResult hirResult = UCOL_EQUAL;
7214
7215     // Preparing the CE buffers. They will be filled during the primary phase
7216     ucol_CEBuf   sCEs;
7217     ucol_CEBuf   tCEs;
7218     UCOL_INIT_CEBUF(&sCEs);
7219     UCOL_INIT_CEBUF(&tCEs);
7220
7221     uint32_t secS = 0, secT = 0;
7222     uint32_t sOrder=0, tOrder=0;
7223
7224     // Non shifted primary processing is quite simple
7225     if(!shifted) {
7226       for(;;) {
7227
7228         // We fetch CEs until we hit a non ignorable primary or end.
7229         do {
7230           // We get the next CE
7231           sOrder = ucol_IGetNextCE(coll, sColl, status);
7232           // Stuff it in the buffer
7233           UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7234           // And keep just the primary part.
7235           sOrder &= UCOL_PRIMARYMASK;
7236         } while(sOrder == 0);
7237
7238         // see the comments on the above block
7239         do {
7240           tOrder = ucol_IGetNextCE(coll, tColl, status);
7241           UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7242           tOrder &= UCOL_PRIMARYMASK;
7243         } while(tOrder == 0);
7244
7245         // if both primaries are the same
7246         if(sOrder == tOrder) {
7247             // and there are no more CEs, we advance to the next level
7248             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7249               break;
7250             }
7251             if(doHiragana && hirResult == UCOL_EQUAL) {
7252               if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7253                 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7254                   ? UCOL_LESS:UCOL_GREATER;
7255               }
7256             }
7257         } else {
7258             // if two primaries are different, we are done
7259             result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7260             goto commonReturn;
7261         }
7262       } // no primary difference... do the rest from the buffers
7263     } else { // shifted - do a slightly more complicated processing :)
7264       for(;;) {
7265         UBool sInShifted = FALSE;
7266         UBool tInShifted = FALSE;
7267         // This version of code can be refactored. However, it seems easier to understand this way.
7268         // Source loop. Sam as the target loop.
7269         for(;;) {
7270           sOrder = ucol_IGetNextCE(coll, sColl, status);
7271           if(sOrder == UCOL_NO_MORE_CES) {
7272             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7273             break;
7274           } else if(sOrder == 0
7275             || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7276             /* UCA amendment - ignore ignorables that follow shifted code points */
7277             continue;
7278           } else if(isContinuation(sOrder)) {
7279             if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7280               if(sInShifted) {
7281                 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7282                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7283                 continue;
7284               } else {
7285                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7286                 break;
7287               }
7288             } else { /* Just lower level values */
7289               if(sInShifted) {
7290                 continue;
7291               } else {
7292                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7293                 continue;
7294               }
7295             }
7296           } else { /* regular */
7297             if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7298               UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7299               break;
7300             } else {
7301               if((sOrder & UCOL_PRIMARYMASK) > 0) {
7302                 sInShifted = TRUE;
7303                 sOrder &= UCOL_PRIMARYMASK;
7304                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7305                 continue;
7306               } else {
7307                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7308                 sInShifted = FALSE;
7309                 continue;
7310               }
7311             }
7312           }
7313         }
7314         sOrder &= UCOL_PRIMARYMASK;
7315         sInShifted = FALSE;
7316
7317         for(;;) {
7318           tOrder = ucol_IGetNextCE(coll, tColl, status);
7319           if(tOrder == UCOL_NO_MORE_CES) {
7320             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7321             break;
7322           } else if(tOrder == 0
7323             || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7324             /* UCA amendment - ignore ignorables that follow shifted code points */
7325             continue;
7326           } else if(isContinuation(tOrder)) {
7327             if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7328               if(tInShifted) {
7329                 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7330                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7331                 continue;
7332               } else {
7333                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7334                 break;
7335               }
7336             } else { /* Just lower level values */
7337               if(tInShifted) {
7338                 continue;
7339               } else {
7340                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7341                 continue;
7342               }
7343             }
7344           } else { /* regular */
7345             if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7346               UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7347               break;
7348             } else {
7349               if((tOrder & UCOL_PRIMARYMASK) > 0) {
7350                 tInShifted = TRUE;
7351                 tOrder &= UCOL_PRIMARYMASK;
7352                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7353                 continue;
7354               } else {
7355                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7356                 tInShifted = FALSE;
7357                 continue;
7358               }
7359             }
7360           }
7361         }
7362         tOrder &= UCOL_PRIMARYMASK;
7363         tInShifted = FALSE;
7364
7365         if(sOrder == tOrder) {
7366           /*
7367             if(doHiragana && hirResult == UCOL_EQUAL) {
7368               if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7369                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7370                   ? UCOL_LESS:UCOL_GREATER;
7371               }
7372             }
7373           */
7374             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7375               break;
7376             } else {
7377               sOrder = 0; tOrder = 0;
7378               continue;
7379             }
7380         } else {
7381             result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7382             goto commonReturn;
7383         }
7384       } /* no primary difference... do the rest from the buffers */
7385     }
7386
7387     /* now, we're gonna reexamine collected CEs */
7388     uint32_t    *sCE;
7389     uint32_t    *tCE;
7390
7391     /* This is the secondary level of comparison */
7392     if(checkSecTer) {
7393       if(!isFrenchSec) { /* normal */
7394         sCE = sCEs.buf;
7395         tCE = tCEs.buf;
7396         for(;;) {
7397           while (secS == 0) {
7398             secS = *(sCE++) & UCOL_SECONDARYMASK;
7399           }
7400
7401           while(secT == 0) {
7402               secT = *(tCE++) & UCOL_SECONDARYMASK;
7403           }
7404
7405           if(secS == secT) {
7406             if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7407               break;
7408             } else {
7409               secS = 0; secT = 0;
7410               continue;
7411             }
7412           } else {
7413                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7414                goto commonReturn;
7415           }
7416         }
7417       } else { /* do the French */
7418         uint32_t *sCESave = NULL;
7419         uint32_t *tCESave = NULL;
7420         sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7421         tCE = tCEs.pos-2;
7422         for(;;) {
7423           while (secS == 0 && sCE >= sCEs.buf) {
7424             if(sCESave == 0) {
7425               secS = *(sCE--);
7426               if(isContinuation(secS)) {
7427                 while(isContinuation(secS = *(sCE--)));
7428                 /* after this, secS has the start of continuation, and sCEs points before that */
7429                 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7430                 sCE+=2;  /* need to point to the first continuation CP */
7431                 /* However, now you can just continue doing stuff */
7432               }
7433             } else {
7434               secS = *(sCE++);
7435               if(!isContinuation(secS)) { /* This means we have finished with this cont */
7436                 sCE = sCESave;            /* reset the pointer to before continuation */
7437                 sCESave = 0;
7438                 continue;
7439               }
7440             }
7441             secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7442           }
7443
7444           while(secT == 0 && tCE >= tCEs.buf) {
7445             if(tCESave == 0) {
7446               secT = *(tCE--);
7447               if(isContinuation(secT)) {
7448                 while(isContinuation(secT = *(tCE--)));
7449                 /* after this, secS has the start of continuation, and sCEs points before that */
7450                 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7451                 tCE+=2;  /* need to point to the first continuation CP */
7452                 /* However, now you can just continue doing stuff */
7453               }
7454             } else {
7455               secT = *(tCE++);
7456               if(!isContinuation(secT)) { /* This means we have finished with this cont */
7457                 tCE = tCESave;          /* reset the pointer to before continuation */
7458                 tCESave = 0;
7459                 continue;
7460               }
7461             }
7462             secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7463           }
7464
7465           if(secS == secT) {
7466             if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7467               break;
7468             } else {
7469               secS = 0; secT = 0;
7470               continue;
7471             }
7472           } else {
7473               result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7474               goto commonReturn;
7475           }
7476         }
7477       }
7478     }
7479
7480     /* doing the case bit */
7481     if(checkCase) {
7482       sCE = sCEs.buf;
7483       tCE = tCEs.buf;
7484       for(;;) {
7485         while((secS & UCOL_REMOVE_CASE) == 0) {
7486           if(!isContinuation(*sCE++)) {
7487             secS =*(sCE-1);
7488             if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7489             // primary ignorables should not be considered on the case level when the strength is primary
7490             // otherwise, the CEs stop being well-formed
7491               secS &= UCOL_TERT_CASE_MASK;
7492               secS ^= caseSwitch;
7493             } else {
7494               secS = 0;
7495             }
7496           } else {
7497             secS = 0;
7498           }
7499         }
7500
7501         while((secT & UCOL_REMOVE_CASE) == 0) {
7502           if(!isContinuation(*tCE++)) {
7503             secT = *(tCE-1);
7504             if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7505             // primary ignorables should not be considered on the case level when the strength is primary
7506             // otherwise, the CEs stop being well-formed
7507               secT &= UCOL_TERT_CASE_MASK;
7508               secT ^= caseSwitch;
7509             } else {
7510               secT = 0;
7511             }
7512           } else {
7513             secT = 0;
7514           }
7515         }
7516
7517         if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7518           result = UCOL_LESS;
7519           goto commonReturn;
7520         } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7521           result = UCOL_GREATER;
7522           goto commonReturn;
7523         }
7524
7525         if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7526           break;
7527         } else {
7528           secS = 0;
7529           secT = 0;
7530         }
7531       }
7532     }
7533
7534     /* Tertiary level */
7535     if(checkTertiary) {
7536       secS = 0;
7537       secT = 0;
7538       sCE = sCEs.buf;
7539       tCE = tCEs.buf;
7540       for(;;) {
7541         while((secS & UCOL_REMOVE_CASE) == 0) {
7542           secS = *(sCE++) & tertiaryMask;
7543           if(!isContinuation(secS)) {
7544             secS ^= caseSwitch;
7545           } else {
7546             secS &= UCOL_REMOVE_CASE;
7547           }
7548         }
7549
7550         while((secT & UCOL_REMOVE_CASE)  == 0) {
7551           secT = *(tCE++) & tertiaryMask;
7552           if(!isContinuation(secT)) {
7553             secT ^= caseSwitch;
7554           } else {
7555             secT &= UCOL_REMOVE_CASE;
7556           }
7557         }
7558
7559         if(secS == secT) {
7560           if((secS & UCOL_REMOVE_CASE) == 1) {
7561             break;
7562           } else {
7563             secS = 0; secT = 0;
7564             continue;
7565           }
7566         } else {
7567             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7568             goto commonReturn;
7569         }
7570       }
7571     }
7572
7573
7574     if(qShifted /*checkQuad*/) {
7575       UBool sInShifted = TRUE;
7576       UBool tInShifted = TRUE;
7577       secS = 0;
7578       secT = 0;
7579       sCE = sCEs.buf;
7580       tCE = tCEs.buf;
7581       for(;;) {
7582         while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7583           secS = *(sCE++);
7584           if(isContinuation(secS)) {
7585             if(!sInShifted) {
7586               continue;
7587             }
7588           } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7589             secS = UCOL_PRIMARYMASK;
7590             sInShifted = FALSE;
7591           } else {
7592             sInShifted = TRUE;
7593           }
7594         }
7595         secS &= UCOL_PRIMARYMASK;
7596
7597
7598         while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7599           secT = *(tCE++);
7600           if(isContinuation(secT)) {
7601             if(!tInShifted) {
7602               continue;
7603             }
7604           } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7605             secT = UCOL_PRIMARYMASK;
7606             tInShifted = FALSE;
7607           } else {
7608             tInShifted = TRUE;
7609           }
7610         }
7611         secT &= UCOL_PRIMARYMASK;
7612
7613         if(secS == secT) {
7614           if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7615             break;
7616           } else {
7617             secS = 0; secT = 0;
7618             continue;
7619           }
7620         } else {
7621             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7622             goto commonReturn;
7623         }
7624       }
7625     } else if(doHiragana && hirResult != UCOL_EQUAL) {
7626       // If we're fine on quaternaries, we might be different
7627       // on Hiragana. This, however, might fail us in shifted.
7628       result = hirResult;
7629       goto commonReturn;
7630     }
7631
7632     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7633     /*  as a tiebreaker if all else is equal.                                */
7634     /*  Getting here  should be quite rare - strings are not identical -     */
7635     /*     that is checked first, but compared == through all other checks.  */
7636     if(checkIdent)
7637     {
7638         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7639         result = ucol_checkIdent(sColl, tColl, TRUE, status);
7640     }
7641
7642 commonReturn:
7643     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7644         freeHeapWritableBuffer(sColl);
7645         freeHeapWritableBuffer(tColl);
7646
7647         if (sCEs.buf != sCEs.localArray ) {
7648             uprv_free(sCEs.buf);
7649         }
7650         if (tCEs.buf != tCEs.localArray ) {
7651             uprv_free(tCEs.buf);
7652         }
7653     }
7654
7655     return result;
7656 }
7657
7658
7659 static inline uint32_t
7660 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7661                           uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7662   const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7663   int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7664   int32_t offset = 1;
7665   UChar schar = 0, tchar = 0;
7666
7667   for(;;) {
7668     if(len == -1) {
7669       if(s[*index] == 0) { // end of string
7670         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7671       } else {
7672         schar = s[*index];
7673       }
7674     } else {
7675       if(*index == len) {
7676         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7677       } else {
7678         schar = s[*index];
7679       }
7680     }
7681
7682     while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7683       offset++;
7684     }
7685
7686     if (schar == tchar) {
7687       (*index)++;
7688       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7689     }
7690     else
7691     {
7692       if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7693         return UCOL_BAIL_OUT_CE;
7694       }
7695       // skip completely ignorables
7696       uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7697       if(isZeroCE == 0) { // we have to ignore completely ignorables
7698         (*index)++;
7699         continue;
7700       }
7701
7702       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7703     }
7704   }
7705 }
7706
7707
7708 /**
7709  * This is a fast strcoll, geared towards text in Latin-1.
7710  * It supports contractions of size two, French secondaries
7711  * and case switching. You can use it with strengths primary
7712  * to tertiary. It does not support shifted and case level.
7713  * It relies on the table built by setupLatin1Table. If it
7714  * doesn't understand something, it will go to the regular
7715  * strcoll.
      *
      * The comparison runs one pass per level (primary, then secondary, then
      * tertiary, as far as coll->strength asks for) and returns as soon as a
      * level decides the order.
      *
      * @param coll   collator; strength, frenchCollation, latinOneCEs and
      *               latinOneTableLen are read here
      * @param source source string
      * @param sLen   length of source, or -1 if it is zero terminated
      * @param target target string
      * @param tLen   length of target, or -1 if it is zero terminated
      * @param status only used on the fallback path, where it is handed to
      *               ucol_strcollRegular
      * @return UCOL_LESS, UCOL_GREATER or UCOL_EQUAL from the fast path,
      *         otherwise whatever ucol_strcollRegular returns
7716  */
7717 static inline UCollationResult
7718 ucol_strcollUseLatin1( const UCollator    *coll,
7719               const UChar        *source,
7720               int32_t            sLen,
7721               const UChar        *target,
7722               int32_t            tLen,
7723               UErrorCode *status)
7724 {
7725     U_ALIGN_CODE(16);
7726     int32_t strength = coll->strength;
7727
7728     int32_t sIndex = 0, tIndex = 0;
7729     UChar sChar = 0, tChar = 0;
7730     uint32_t sOrder=0, tOrder=0;
7731
7732     UBool endOfSource = FALSE;
7733
         // elements starts at the primary section of the table; it is moved forward
         // by coll->latinOneTableLen before the secondary and again before the
         // tertiary pass
7734     uint32_t *elements = coll->latinOneCEs;
7735
7736     UBool haveContractions = FALSE; // if we have contractions in our string
7737                                     // we cannot do French secondary
7738
7739     // Do the primary level
7740     for(;;) {
7741       while(sOrder==0) { // this loop skips primary ignorables
7742         // sOrder=getNextlatinOneCE(source);
7743         if(sLen==-1) {   // handling zero terminated strings
7744           sChar=source[sIndex++];
7745           if(sChar==0) {
7746             endOfSource = TRUE;
7747             break;
7748           }
7749         } else {        // handling strings with known length
7750           if(sIndex==sLen) {
7751             endOfSource = TRUE;
7752             break;
7753           }
7754           sChar=source[sIndex++];
7755         }
7756         if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7757           //fprintf(stderr, "R");
7758           goto returnRegular;
7759           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7760         }
7761         sOrder = elements[sChar];
7762         if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7763           // specials can basically be either contractions or bail-out signs. If we get anything
7764           // else, we'll bail out anyway
7765           if(getCETag(sOrder) == CONTRACTION_TAG) {
7766             sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7767             haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7768             // However, if there are contractions in the table, but we always use just one char,
7769             // we might be able to do French. This should be checked out.
7770           }
7771           if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7772             //fprintf(stderr, "S");
7773             goto returnRegular;
7774             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7775           }
7776         }
7777       }
7778
7779       while(tOrder==0) {  // this loop skips primary ignorables
7780         // tOrder=getNextlatinOneCE(target);
7781         if(tLen==-1) {    // handling zero terminated strings
7782           tChar=target[tIndex++];
7783           if(tChar==0) {
7784             if(endOfSource) { // this is different than source loop,
7785               // as we already know that source loop is done here,
7786               // so we can either finish the primary loop if both
7787               // strings are done or announce the result if only
7788               // target is done. Same below.
7789               goto endOfPrimLoop;
7790             } else {
7791               return UCOL_GREATER;
7792             }
7793           }
7794         } else {          // handling strings with known length
7795           if(tIndex==tLen) {
7796             if(endOfSource) {
7797               goto endOfPrimLoop;
7798             } else {
7799               return UCOL_GREATER;
7800             }
7801           }
7802           tChar=target[tIndex++];
7803         }
7804         if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7805           //fprintf(stderr, "R");
7806           goto returnRegular;
7807           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7808         }
7809         tOrder = elements[tChar];
7810         if(tOrder >= UCOL_NOT_FOUND) {
7811           // Handling specials, see the comments for source
7812           if(getCETag(tOrder) == CONTRACTION_TAG) {
7813             tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7814             haveContractions = TRUE;
7815           }
7816           if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7817             //fprintf(stderr, "S");
7818             goto returnRegular;
7819             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7820           }
7821         }
7822       }
7823       if(endOfSource) { // source is finished, but target is not, say the result.
7824           return UCOL_LESS;
7825       }
7826
7827       if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7828         sOrder = 0; tOrder = 0;
7829         continue;
7830       } else {
7831         // compare current top bytes
7832         if(((sOrder^tOrder)&0xFF000000)!=0) {
7833           // top bytes differ, return difference
7834           if(sOrder < tOrder) {
7835             return UCOL_LESS;
7836           } else if(sOrder > tOrder) {
7837             return UCOL_GREATER;
7838           }
7839           // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7840           // since we must return enum value
7841         }
7842
7843         // top bytes match, continue with following bytes
             // (a side whose shifted value becomes 0 fetches its next CE in the
             // matching while loop above; the other side keeps its remaining bytes)
7844         sOrder<<=8;
7845         tOrder<<=8;
7846       }
7847     }
7848
7849 endOfPrimLoop:
7850     // after primary loop, we definitely know the sizes of strings,
7851     // so we set it and use simpler loop for secondaries and tertiaries
         // This label is only reached by goto, with both strings exhausted and
         // sOrder == tOrder == 0. For zero terminated input the index was already
         // post-incremented past the terminator, so the length stored here
         // includes it - presumably harmless because the table entry for U+0000
         // is expected to be 0; TODO confirm against setupLatin1Table.
7852     sLen = sIndex; tLen = tIndex;
7853     if(strength >= UCOL_SECONDARY) {
7854       // adjust the table beginning
7855       elements += coll->latinOneTableLen;
7856       endOfSource = FALSE;
7857
7858       if(coll->frenchCollation == UCOL_OFF) { // non French
7859         // This loop is a simplified copy of primary loop
7860         // at this point we know that whole strings are latin-1, so we don't
7861         // check for that. We also know that we only have contractions as
7862         // specials.
7863         sIndex = 0; tIndex = 0;
7864         for(;;) {
7865           while(sOrder==0) {
7866             if(sIndex==sLen) {
7867               endOfSource = TRUE;
7868               break;
7869             }
7870             sChar=source[sIndex++];
7871             sOrder = elements[sChar];
                 // NOTE(review): the primary loop tests >= UCOL_NOT_FOUND, this
                 // one tests > - confirm the difference is intentional
7872             if(sOrder > UCOL_NOT_FOUND) {
7873               sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7874             }
7875           }
7876
7877           while(tOrder==0) {
7878             if(tIndex==tLen) {
7879               if(endOfSource) {
7880                 goto endOfSecLoop;
7881               } else {
7882                 return UCOL_GREATER;
7883               }
7884             }
7885             tChar=target[tIndex++];
7886             tOrder = elements[tChar];
7887             if(tOrder > UCOL_NOT_FOUND) {
7888               tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7889             }
7890           }
7891           if(endOfSource) {
7892               return UCOL_LESS;
7893           }
7894
7895           if(sOrder == tOrder) {
7896             sOrder = 0; tOrder = 0;
7897             continue;
7898           } else {
7899             // see primary loop for comments on this
7900             if(((sOrder^tOrder)&0xFF000000)!=0) {
7901               if(sOrder < tOrder) {
7902                 return UCOL_LESS;
7903               } else if(sOrder > tOrder) {
7904                 return UCOL_GREATER;
7905               }
7906             }
7907             sOrder<<=8;
7908             tOrder<<=8;
7909           }
7910         }
7911       } else { // French
7912         if(haveContractions) { // if we have contractions, we have to bail out
7913           // since we don't really know how to handle them here
               // (sLen/tLen were overwritten at endOfPrimLoop, so the regular
               // path receives those values rather than the caller's)
7914           goto returnRegular;
7915           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7916         }
7917         // For French, we go backwards
7918         sIndex = sLen; tIndex = tLen;
7919         for(;;) {
7920           while(sOrder==0) {
7921             if(sIndex==0) {
7922               endOfSource = TRUE;
7923               break;
7924             }
7925             sChar=source[--sIndex];
7926             sOrder = elements[sChar];
7927             // don't even look for contractions
7928           }
7929
7930           while(tOrder==0) {
7931             if(tIndex==0) {
7932               if(endOfSource) {
7933                 goto endOfSecLoop;
7934               } else {
7935                 return UCOL_GREATER;
7936               }
7937             }
7938             tChar=target[--tIndex];
7939             tOrder = elements[tChar];
7940             // don't even look for contractions
7941           }
7942           if(endOfSource) {
7943               return UCOL_LESS;
7944           }
7945
7946           if(sOrder == tOrder) {
7947             sOrder = 0; tOrder = 0;
7948             continue;
7949           } else {
7950             // see the primary loop for comments
7951             if(((sOrder^tOrder)&0xFF000000)!=0) {
7952               if(sOrder < tOrder) {
7953                 return UCOL_LESS;
7954               } else if(sOrder > tOrder) {
7955                 return UCOL_GREATER;
7956               }
7957             }
7958             sOrder<<=8;
7959             tOrder<<=8;
7960           }
7961         }
7962       }
7963     }
7964
7965 endOfSecLoop:
7966     if(strength >= UCOL_TERTIARY) {
7967       // tertiary loop is the same as secondary (except no French)
7968       elements += coll->latinOneTableLen;
7969       sIndex = 0; tIndex = 0;
7970       endOfSource = FALSE;
7971       for(;;) {
7972         while(sOrder==0) {
7973           if(sIndex==sLen) {
7974             endOfSource = TRUE;
7975             break;
7976           }
7977           sChar=source[sIndex++];
7978           sOrder = elements[sChar];
7979           if(sOrder > UCOL_NOT_FOUND) {
7980             sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
7981           }
7982         }
7983         while(tOrder==0) {
7984           if(tIndex==tLen) {
7985             if(endOfSource) {
7986               return UCOL_EQUAL; // if both strings are at the end, they are equal
7987             } else {
7988               return UCOL_GREATER;
7989             }
7990           }
7991           tChar=target[tIndex++];
7992           tOrder = elements[tChar];
7993           if(tOrder > UCOL_NOT_FOUND) {
7994             tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
7995           }
7996         }
7997         if(endOfSource) {
7998             return UCOL_LESS;
7999         }
8000         if(sOrder == tOrder) {
8001           sOrder = 0; tOrder = 0;
8002           continue;
8003         } else {
8004           if(((sOrder^tOrder)&0xff000000)!=0) {
8005             if(sOrder < tOrder) {
8006               return UCOL_LESS;
8007             } else if(sOrder > tOrder) {
8008               return UCOL_GREATER;
8009             }
8010           }
8011           sOrder<<=8;
8012           tOrder<<=8;
8013         }
8014       }
8015     }
         // strength below tertiary: every requested level compared equal
8016     return UCOL_EQUAL;
8017
8018 returnRegular:
8019     // Preparing the context objects for iterating over strings
         // (the fast path gave up; the whole comparison is redone by the regular code)
8020     collIterate sColl, tColl;
8021
8022     IInit_collIterate(coll, source, sLen, &sColl);
8023     IInit_collIterate(coll, target, tLen, &tColl);
8024     return ucol_strcollRegular(&sColl, &tColl, status);
8025 }
8026
8027
8028 U_CAPI UCollationResult U_EXPORT2
8029 ucol_strcollIter( const UCollator    *coll,
8030                  UCharIterator *sIter,
8031                  UCharIterator *tIter,
8032                  UErrorCode         *status) {
8033   if(!status || U_FAILURE(*status)) {
8034     return UCOL_EQUAL;
8035   }
8036
8037   UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8038   UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8039
8040   if (sIter == tIter) {
8041     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8042     return UCOL_EQUAL;
8043   }
8044   if(sIter == NULL || tIter == NULL || coll == NULL) {
8045     *status = U_ILLEGAL_ARGUMENT_ERROR;
8046     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8047     return UCOL_EQUAL;
8048   }
8049
8050   UCollationResult result = UCOL_EQUAL;
8051
8052   // Preparing the context objects for iterating over strings
8053   collIterate sColl, tColl;
8054   // The division for the array length may truncate the array size to
8055   // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8056   // for all platforms anyway.
8057   UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8058   UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8059   UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8060
8061   IInit_collIterate(coll, NULL, -1, &sColl);
8062   sColl.iterator = sIter;
8063   sColl.flags |= UCOL_USE_ITERATOR;
8064   IInit_collIterate(coll, NULL, -1, &tColl);
8065   tColl.flags |= UCOL_USE_ITERATOR;
8066   tColl.iterator = tIter;
8067
8068   if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8069     sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8070     sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8071     sColl.flags &= ~UCOL_ITER_NORM;
8072
8073     tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8074     tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8075     tColl.flags &= ~UCOL_ITER_NORM;
8076   }
8077
8078   UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8079
8080   while((sChar = sColl.iterator->next(sColl.iterator)) ==
8081     (tChar = tColl.iterator->next(tColl.iterator))) {
8082     if(sChar == U_SENTINEL) {
8083       result = UCOL_EQUAL;
8084       goto end_compare;
8085     }
8086   }
8087
8088   if(sChar == U_SENTINEL) {
8089     tChar = tColl.iterator->previous(tColl.iterator);
8090   }
8091
8092   if(tChar == U_SENTINEL) {
8093     sChar = sColl.iterator->previous(sColl.iterator);
8094   }
8095
8096   sChar = sColl.iterator->previous(sColl.iterator);
8097   tChar = tColl.iterator->previous(tColl.iterator);
8098
8099   if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8100   {
8101       // We are stopped in the middle of a contraction.
8102       // Scan backwards through the == part of the string looking for the start of the contraction.
8103       //   It doesn't matter which string we scan, since they are the same in this region.
8104       do
8105       {
8106         sChar = sColl.iterator->previous(sColl.iterator);
8107         tChar = tColl.iterator->previous(tColl.iterator);
8108       }
8109       while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8110   }
8111
8112
8113   if(U_SUCCESS(*status)) {
8114     result = ucol_strcollRegular(&sColl, &tColl, status);
8115   }
8116
8117 end_compare:
8118   if(sNormIter || tNormIter) {
8119     unorm_closeIter(sNormIter);
8120     unorm_closeIter(tNormIter);
8121   }
8122
8123   UTRACE_EXIT_VALUE_STATUS(result, *status)
8124   return result;
8125 }
8126
8127
8128
8129 /*                                                                      */
8130 /* ucol_strcoll     Main public API string comparison function          */
8131 /*                                                                      */
8132 U_CAPI UCollationResult U_EXPORT2
8133 ucol_strcoll( const UCollator    *coll,
8134               const UChar        *source,
8135               int32_t            sourceLength,
8136               const UChar        *target,
8137               int32_t            targetLength) {
8138     U_ALIGN_CODE(16);
8139
8140     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8141     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8142       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8143       UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8144       UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8145     }
8146
8147     UErrorCode status = U_ZERO_ERROR;
8148     if(source == NULL || target == NULL) {
8149       // do not crash, but return. Should have
8150       // status argument to return error.
8151       UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
8152       return UCOL_EQUAL;
8153     }
8154       collIterate sColl, tColl;
8155
8156     /* Scan the strings.  Find:                                                             */
8157     /*    The length of any leading portion that is equal                                   */
8158     /*    Whether they are exactly equal.  (in which case we just return)                   */
8159     const UChar    *pSrc    = source;
8160     const UChar    *pTarg   = target;
8161     int32_t        equalLength;
8162
8163     if (sourceLength == -1 && targetLength == -1) {
8164         // Both strings are null terminated.
8165         //    Check for them being the same string, and scan through
8166         //    any leading equal portion.
8167         if (source==target) {
8168             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8169             return UCOL_EQUAL;
8170         }
8171
8172         for (;;) {
8173             if ( *pSrc != *pTarg || *pSrc == 0) {
8174                 break;
8175             }
8176             pSrc++;
8177             pTarg++;
8178         }
8179         if (*pSrc == 0 && *pTarg == 0) {
8180             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8181             return UCOL_EQUAL;
8182         }
8183         equalLength = pSrc - source;
8184     }
8185     else
8186     {
8187         // One or both strings has an explicit length.
8188         /* check if source and target are same strings */
8189
8190         if (source==target  && sourceLength==targetLength) {
8191             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8192             return UCOL_EQUAL;
8193         }
8194         const UChar    *pSrcEnd = source + sourceLength;
8195         const UChar    *pTargEnd = target + targetLength;
8196
8197
8198         // Scan while the strings are bitwise ==, or until one is exhausted.
8199             for (;;) {
8200                 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8201                     break;
8202                 }
8203                 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8204                     break;
8205                 }
8206                 if (*pSrc != *pTarg) {
8207                     break;
8208                 }
8209                 pSrc++;
8210                 pTarg++;
8211             }
8212             equalLength = pSrc - source;
8213
8214             // If we made it all the way through both strings, we are done.  They are ==
8215             if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8216                 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))  {  /* and also at end of dest string                  */
8217                 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8218                 return UCOL_EQUAL;
8219             }
8220     }
8221     if (equalLength > 0) {
8222         /* There is an identical portion at the beginning of the two strings.        */
8223         /*   If the identical portion ends within a contraction or a comibining      */
8224         /*   character sequence, back up to the start of that sequence.              */
8225         pSrc  = source + equalLength;        /* point to the first differing chars   */
8226         pTarg = target + equalLength;
8227         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8228             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8229         {
8230             // We are stopped in the middle of a contraction.
8231             // Scan backwards through the == part of the string looking for the start of the contraction.
8232             //   It doesn't matter which string we scan, since they are the same in this region.
8233             do
8234             {
8235                 equalLength--;
8236                 pSrc--;
8237             }
8238             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8239         }
8240
8241         source += equalLength;
8242         target += equalLength;
8243         if (sourceLength > 0) {
8244             sourceLength -= equalLength;
8245         }
8246         if (targetLength > 0) {
8247             targetLength -= equalLength;
8248         }
8249     }
8250
8251     UCollationResult  returnVal;
8252     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8253       // Preparing the context objects for iterating over strings
8254       IInit_collIterate(coll, source, sourceLength, &sColl);
8255       IInit_collIterate(coll, target, targetLength, &tColl);
8256       returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8257     } else {
8258       returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8259     }
8260     UTRACE_EXIT_VALUE(returnVal);
8261     return returnVal;
8262 }
8263
8264 /* convenience function for comparing strings */
8265 U_CAPI UBool U_EXPORT2
8266 ucol_greater(    const    UCollator        *coll,
8267         const    UChar            *source,
8268         int32_t            sourceLength,
8269         const    UChar            *target,
8270         int32_t            targetLength)
8271 {
8272   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8273       == UCOL_GREATER);
8274 }
8275
8276 /* convenience function for comparing strings */
8277 U_CAPI UBool U_EXPORT2
8278 ucol_greaterOrEqual(    const    UCollator    *coll,
8279             const    UChar        *source,
8280             int32_t        sourceLength,
8281             const    UChar        *target,
8282             int32_t        targetLength)
8283 {
8284   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8285       != UCOL_LESS);
8286 }
8287
8288 /* convenience function for comparing strings */
8289 U_CAPI UBool U_EXPORT2
8290 ucol_equal(        const    UCollator        *coll,
8291             const    UChar            *source,
8292             int32_t            sourceLength,
8293             const    UChar            *target,
8294             int32_t            targetLength)
8295 {
8296   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8297       == UCOL_EQUAL);
8298 }
8299
8300 U_CAPI void U_EXPORT2
8301 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8302   if(coll && coll->UCA) {
8303     uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8304   }
8305 }
8306
8307 U_CAPI int32_t U_EXPORT2
8308 ucol_cloneBinary(const UCollator *coll,
8309                  uint8_t *buffer, int32_t capacity,
8310                  UErrorCode *status)
8311 {
8312     int32_t length = 0;
8313     if(U_FAILURE(*status)) {
8314         return length;
8315     }
8316     if(capacity < 0) {
8317       *status = U_ILLEGAL_ARGUMENT_ERROR;
8318       return length;
8319     }
8320     if(coll->hasRealData == TRUE) {
8321         length = coll->image->size;
8322         if(length <= capacity) {
8323             uprv_memcpy(buffer, coll->image, length);
8324         } else {
8325             *status = U_BUFFER_OVERFLOW_ERROR;
8326         }
8327     } else {
8328         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
8329         if(length <= capacity) {
8330             /* build the UCATableHeader with minimal entries */
8331             /* do not copy the header from the UCA file because its values are wrong! */
8332             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
8333
8334             /* reset everything */
8335             uprv_memset(buffer, 0, length);
8336
8337             /* set the tailoring-specific values */
8338             UCATableHeader *myData = (UCATableHeader *)buffer;
8339             myData->size = length;
8340
8341             /* offset for the options, the only part of the data that is present after the header */
8342             myData->options = sizeof(UCATableHeader);
8343
8344             /* need to always set the expansion value for an upper bound of the options */
8345             myData->expansion = myData->options + sizeof(UColOptionSet);
8346
8347             myData->magic = UCOL_HEADER_MAGIC;
8348             myData->isBigEndian = U_IS_BIG_ENDIAN;
8349             myData->charSetFamily = U_CHARSET_FAMILY;
8350
8351             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
8352             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
8353
8354             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
8355             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
8356             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
8357             myData->jamoSpecial = coll->image->jamoSpecial;
8358
8359             /* copy the collator options */
8360             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
8361         } else {
8362             *status = U_BUFFER_OVERFLOW_ERROR;
8363         }
8364     }
8365     return length;
8366 }
8367
8368 U_CAPI void U_EXPORT2
8369 ucol_forgetUCA(void)
8370 {
8371   _staticUCA = NULL;
8372   UCA_DATA_MEM = NULL;
8373 }
8374
8375 #endif /* #if !UCONFIG_NO_COLLATION */
8376