icuSources/i18n/ucol.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 1996-2009, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  ucol.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 * Modification history
  12 * Date        Name      Comments
  13 * 1996-1999   various members of ICU team maintained C API for collation framework
  14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
  15 * 03/01/2001  synwee    Added maxexpansion functionality.
  16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_COLLATION
  22
  23 #include "unicode/coleitr.h"
  24 #include "unicode/unorm.h"
  25 #include "unicode/udata.h"
  26 #include "unicode/ustring.h"
  27
  28 #include "ucol_imp.h"
  29 #include "bocsu.h"
  30
  31 #include "unormimp.h"
  32 #include "unorm_it.h"
  33 #include "umutex.h"
  34 #include "cmemory.h"
  35 #include "ucln_in.h"
  36 #include "cstring.h"
  37 #include "utracimp.h"
  38 #include "putilimp.h"
  39 #include "uassert.h"
  40
  41 #ifdef UCOL_DEBUG
  42 #include <stdio.h>
  43 #endif
  44
  45 U_NAMESPACE_USE
  46
  /*
   * Bit-manipulation constants, added by synwee for trie manipulation.
   * NOTE(review): none of these macros is used in the portion of the file
   * reviewed here; the meaning suggested by each name (stage shifts/masks,
   * byte mask/shift) should be confirmed against the code that uses them.
   */
  #define STAGE_1_SHIFT_            10
  #define STAGE_2_SHIFT_            4
  #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
  #define STAGE_3_MASK_             0xF
  #define LAST_BYTE_MASK_           0xFF
  #define SECOND_LAST_BYTE_SHIFT_   8

  /* NOTE(review): presumably a code unit limit related to zero combining class -- confirm against its users. */
  #define ZERO_CC_LIMIT_            0xC0
  56
  // This is a static pointer to the normalizer's fcdTrieIndex.
  // It is filled in by ucol_initCollator() (from unorm_getFCDTrie()) the first
  // time it is found to be NULL, and reset to NULL by ucol_cleanup().
  // It is always the same between calls to u_cleanup
  // and therefore writing to it is not synchronized.
  static const uint16_t *fcdTrieIndex=NULL;
  62
  // These are values from the UCA that are required for
  // implicit generation and for suppressing sort key compression.
  // They should regularly be in the UCA, but if one
  // is running without the UCA, it could be a problem.
  static const int32_t maxRegularPrimary  = 0xA0;
  static const int32_t minImplicitPrimary = 0xE0;
  static const int32_t maxImplicitPrimary = 0xE4;
  70
  71 U_CDECL_BEGIN
  72 static UBool U_CALLCONV
  73 ucol_cleanup(void)
  74 {
  75     fcdTrieIndex = NULL;
  76     return TRUE;
  77 }
  78
  79 static int32_t U_CALLCONV
  80 _getFoldingOffset(uint32_t data) {
  81     return (int32_t)(data&0xFFFFFF);
  82 }
  83
  84 U_CDECL_END
  85
  86 static
  87 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
  88                               int32_t sourceLen, collIterate *s)
  89 {
  90     (s)->string = (s)->pos = (UChar *)(sourceString);
  91     (s)->origFlags = 0;
  92     (s)->flags = 0;
  93     if (sourceLen >= 0) {
  94         s->flags |= UCOL_ITER_HASLEN;
  95         (s)->endp = (UChar *)sourceString+sourceLen;
  96     }
  97     else {
  98         /* change to enable easier checking for end of string for fcdpositon */
  99         (s)->endp = NULL;
 100     }
 101     (s)->extendCEs = NULL;
 102     (s)->extendCEsSize = 0;
 103     (s)->CEpos = (s)->toReturn = (s)->CEs;
 104     (s)->offsetBuffer = NULL;
 105     (s)->offsetBufferSize = 0;
 106     (s)->offsetReturn = (s)->offsetStore = NULL;
 107     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
 108     (s)->writableBuffer = (s)->stackWritableBuffer;
 109     (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
 110     (s)->coll = (collator);
 111     (s)->fcdPosition = 0;
 112     if(collator->normalizationMode == UCOL_ON) {
 113         (s)->flags |= UCOL_ITER_NORM;
 114     }
 115     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
 116         (s)->flags |= UCOL_HIRAGANA_Q;
 117     }
 118     (s)->iterator = NULL;
 119     //(s)->iteratorIndex = 0;
 120 }
 121
 122 U_CAPI void  U_EXPORT2
 123 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
 124                              int32_t sourceLen, collIterate *s){
 125     /* Out-of-line version for use from other files. */
 126     IInit_collIterate(collator, sourceString, sourceLen, s);
 127 }
 128
 129
 130 /**
 131 * Backup the state of the collIterate struct data
 132 * @param data collIterate to backup
 133 * @param backup storage
 134 */
 135 static
 136 inline void backupState(const collIterate *data, collIterateState *backup)
 137 {
 138     backup->fcdPosition = data->fcdPosition;
 139     backup->flags       = data->flags;
 140     backup->origFlags   = data->origFlags;
 141     backup->pos         = data->pos;
 142     backup->bufferaddress = data->writableBuffer;
 143     backup->buffersize    = data->writableBufSize;
 144     backup->iteratorMove = 0;
 145     backup->iteratorIndex = 0;
 146     if(data->iterator != NULL) {
 147         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
 148         backup->iteratorIndex = data->iterator->getState(data->iterator);
 149         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
 150         if(backup->iteratorIndex == UITER_NO_STATE) {
 151             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
 152                 backup->iteratorMove++;
 153                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
 154             }
 155             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 156         }
 157     }
 158 }
 159
 160 /**
 161 * Loads the state into the collIterate struct data
 162 * @param data collIterate to backup
 163 * @param backup storage
 164 * @param forwards boolean to indicate if forwards iteration is used,
 165 *        false indicates backwards iteration
 166 */
 167 static
 168 inline void loadState(collIterate *data, const collIterateState *backup,
 169                       UBool        forwards)
 170 {
 171     UErrorCode status = U_ZERO_ERROR;
 172     data->flags       = backup->flags;
 173     data->origFlags   = backup->origFlags;
 174     if(data->iterator != NULL) {
 175         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
 176         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
 177         if(backup->iteratorMove != 0) {
 178             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 179         }
 180     }
 181     data->pos         = backup->pos;
 182
 183     if ((data->flags & UCOL_ITER_INNORMBUF) &&
 184         data->writableBuffer != backup->bufferaddress) {
 185         /*
 186         this is when a new buffer has been reallocated and we'll have to
 187         calculate the new position.
 188         note the new buffer has to contain the contents of the old buffer.
 189         */
 190         if (forwards) {
 191             data->pos = data->writableBuffer +
 192                                          (data->pos - backup->bufferaddress);
 193         }
 194         else {
 195             /* backwards direction */
 196             uint32_t temp = backup->buffersize -
 197                                   (data->pos - backup->bufferaddress);
 198             data->pos = data->writableBuffer + (data->writableBufSize - temp);
 199         }
 200     }
 201     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
 202         /*
 203         this is alittle tricky.
 204         if we are initially not in the normalization buffer, even if we
 205         normalize in the later stage, the data in the buffer will be
 206         ignored, since we skip back up to the data string.
 207         however if we are already in the normalization buffer, any
 208         further normalization will pull data into the normalization
 209         buffer and modify the fcdPosition.
 210         since we are keeping the data in the buffer for use, the
 211         fcdPosition can not be reverted back.
 212         arrgghh....
 213         */
 214         data->fcdPosition = backup->fcdPosition;
 215     }
 216 }
 217
 218
 219 /*
 220 * collIter_eos()
 221 *     Checks for a collIterate being positioned at the end of
 222 *     its source string.
 223 *
 224 */
 225 static
 226 inline UBool collIter_eos(collIterate *s) {
 227     if(s->flags & UCOL_USE_ITERATOR) {
 228       return !(s->iterator->hasNext(s->iterator));
 229     }
 230     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
 231         // Null terminated string, but not at null, so not at end.
 232         //   Whether in main or normalization buffer doesn't matter.
 233         return FALSE;
 234     }
 235
 236     // String with length.  Can't be in normalization buffer, which is always
 237     //  null termintated.
 238     if (s->flags & UCOL_ITER_HASLEN) {
 239         return (s->pos == s->endp);
 240     }
 241
 242     // We are at a null termination, could be either normalization buffer or main string.
 243     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
 244         // At null at end of main string.
 245         return TRUE;
 246     }
 247
 248     // At null at end of normalization buffer.  Need to check whether there there are
 249     //   any characters left in the main buffer.
 250     if(s->origFlags & UCOL_USE_ITERATOR) {
 251       return !(s->iterator->hasNext(s->iterator));
 252     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
 253         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
 254         return (*s->fcdPosition == 0);
 255     }
 256     else {
 257         // Main string with an end pointer.
 258         return s->fcdPosition == s->endp;
 259     }
 260 }
 261
 262 /*
 263 * collIter_bos()
 264 *     Checks for a collIterate being positioned at the start of
 265 *     its source string.
 266 *
 267 */
 268 static
 269 inline UBool collIter_bos(collIterate *source) {
 270   // if we're going backwards, we need to know whether there is more in the
 271   // iterator, even if we are in the side buffer
 272   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 273     return !source->iterator->hasPrevious(source->iterator);
 274   }
 275   if (source->pos <= source->string ||
 276       ((source->flags & UCOL_ITER_INNORMBUF) &&
 277       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
 278     return TRUE;
 279   }
 280   return FALSE;
 281 }
 282
 283 /*static
 284 inline UBool collIter_SimpleBos(collIterate *source) {
 285   // if we're going backwards, we need to know whether there is more in the
 286   // iterator, even if we are in the side buffer
 287   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 288     return !source->iterator->hasPrevious(source->iterator);
 289   }
 290   if (source->pos == source->string) {
 291     return TRUE;
 292   }
 293   return FALSE;
 294 }*/
 295     //return (data->pos == data->string) ||
 296
 297
 298 /**
 299 * Checks and free writable buffer if it is not the original stack buffer
 300 * in collIterate. This function does not reassign the writable buffer.
 301 * @param data collIterate struct to determine and free the writable buffer
 302 */
 303 static
 304 inline void freeHeapWritableBuffer(collIterate *data)
 305 {
 306     if (data->writableBuffer != data->stackWritableBuffer) {
 307         uprv_free(data->writableBuffer);
 308     }
 309 }
 310
 311
 312 /****************************************************************************/
 313 /* Following are the open/close functions                                   */
 314 /*                                                                          */
 315 /****************************************************************************/
 316
 317 static UCollator*
 318 ucol_initFromBinary(const uint8_t *bin, int32_t length,
 319                 const UCollator *base,
 320                 UCollator *fillIn,
 321                 UErrorCode *status)
 322 {
 323     UCollator *result = fillIn;
 324     if(U_FAILURE(*status)) {
 325         return NULL;
 326     }
 327     /*
 328     if(base == NULL) {
 329         // we don't support null base yet
 330         *status = U_ILLEGAL_ARGUMENT_ERROR;
 331         return NULL;
 332     }
 333     */
 334     // We need these and we could be running without UCA
 335     uprv_uca_initImplicitConstants(status);
 336     UCATableHeader *colData = (UCATableHeader *)bin;
 337     // do we want version check here? We're trying to figure out whether collators are compatible
 338     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
 339         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
 340         colData->version[0] != UCOL_BUILDER_VERSION)
 341     {
 342         *status = U_COLLATOR_VERSION_MISMATCH;
 343         return NULL;
 344     }
 345     else {
 346         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
 347             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
 348             if(U_FAILURE(*status)){
 349                 return NULL;
 350             }
 351             result->hasRealData = TRUE;
 352         }
 353         else {
 354             if(base) {
 355                 result = ucol_initCollator(base->image, result, base, status);
 356                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
 357                 if(U_FAILURE(*status)){
 358                     return NULL;
 359                 }
 360                 result->hasRealData = FALSE;
 361             }
 362             else {
 363                 *status = U_USELESS_COLLATOR_ERROR;
 364                 return NULL;
 365             }
 366         }
 367         result->freeImageOnClose = FALSE;
 368     }
 369     result->actualLocale = NULL;
 370     result->validLocale = NULL;
 371     result->requestedLocale = NULL;
 372     result->rules = NULL;
 373     result->rulesLength = 0;
 374     result->freeRulesOnClose = FALSE;
 375     result->ucaRules = NULL;
 376     return result;
 377 }
 378
 379 U_CAPI UCollator* U_EXPORT2
 380 ucol_openBinary(const uint8_t *bin, int32_t length,
 381                 const UCollator *base,
 382                 UErrorCode *status)
 383 {
 384     return ucol_initFromBinary(bin, length, base, NULL, status);
 385 }
 386
 387 U_CAPI int32_t U_EXPORT2
 388 ucol_cloneBinary(const UCollator *coll,
 389                  uint8_t *buffer, int32_t capacity,
 390                  UErrorCode *status)
 391 {
 392     int32_t length = 0;
 393     if(U_FAILURE(*status)) {
 394         return length;
 395     }
 396     if(capacity < 0) {
 397         *status = U_ILLEGAL_ARGUMENT_ERROR;
 398         return length;
 399     }
 400     if(coll->hasRealData == TRUE) {
 401         length = coll->image->size;
 402         if(length <= capacity) {
 403             uprv_memcpy(buffer, coll->image, length);
 404         } else {
 405             *status = U_BUFFER_OVERFLOW_ERROR;
 406         }
 407     } else {
 408         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 409         if(length <= capacity) {
 410             /* build the UCATableHeader with minimal entries */
 411             /* do not copy the header from the UCA file because its values are wrong! */
 412             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 413
 414             /* reset everything */
 415             uprv_memset(buffer, 0, length);
 416
 417             /* set the tailoring-specific values */
 418             UCATableHeader *myData = (UCATableHeader *)buffer;
 419             myData->size = length;
 420
 421             /* offset for the options, the only part of the data that is present after the header */
 422             myData->options = sizeof(UCATableHeader);
 423
 424             /* need to always set the expansion value for an upper bound of the options */
 425             myData->expansion = myData->options + sizeof(UColOptionSet);
 426
 427             myData->magic = UCOL_HEADER_MAGIC;
 428             myData->isBigEndian = U_IS_BIG_ENDIAN;
 429             myData->charSetFamily = U_CHARSET_FAMILY;
 430
 431             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 432             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 433
 434             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 435             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 436             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 437             myData->jamoSpecial = coll->image->jamoSpecial;
 438
 439             /* copy the collator options */
 440             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 441         } else {
 442             *status = U_BUFFER_OVERFLOW_ERROR;
 443         }
 444     }
 445     return length;
 446 }
 447
 448 U_CAPI UCollator* U_EXPORT2
 449 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
 450 {
 451     UCollator * localCollator;
 452     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
 453     char *stackBufferChars = (char *)stackBuffer;
 454     int32_t imageSize = 0;
 455     int32_t rulesSize = 0;
 456     int32_t rulesPadding = 0;
 457     uint8_t *image;
 458     UChar *rules;
 459     UBool colAllocated = FALSE;
 460     UBool imageAllocated = FALSE;
 461
 462     if (status == NULL || U_FAILURE(*status)){
 463         return 0;
 464     }
 465     if ((stackBuffer && !pBufferSize) || !coll){
 466        *status = U_ILLEGAL_ARGUMENT_ERROR;
 467         return 0;
 468     }
 469     if (coll->rules && coll->freeRulesOnClose) {
 470         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
 471         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
 472         bufferSizeNeeded += rulesSize + rulesPadding;
 473     }
 474
 475     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
 476         *pBufferSize =  bufferSizeNeeded;
 477         return 0;
 478     }
 479
 480     /* Pointers on 64-bit platforms need to be aligned
 481      * on a 64-bit boundry in memory.
 482      */
 483     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
 484         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
 485         if (*pBufferSize > offsetUp) {
 486             *pBufferSize -= offsetUp;
 487             stackBufferChars += offsetUp;
 488         }
 489         else {
 490             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
 491             *pBufferSize = 1;
 492         }
 493     }
 494     stackBuffer = (void *)stackBufferChars;
 495
 496     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
 497         /* allocate one here...*/
 498         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
 499         // Null pointer check.
 500         if (stackBufferChars == NULL) {
 501             *status = U_MEMORY_ALLOCATION_ERROR;
 502             return NULL;
 503         }
 504         colAllocated = TRUE;
 505         if (U_SUCCESS(*status)) {
 506             *status = U_SAFECLONE_ALLOCATED_WARNING;
 507         }
 508     }
 509     localCollator = (UCollator *)stackBufferChars;
 510     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
 511     {
 512         UErrorCode tempStatus = U_ZERO_ERROR;
 513         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
 514     }
 515     if (coll->freeImageOnClose) {
 516         image = (uint8_t *)uprv_malloc(imageSize);
 517         // Null pointer check
 518         if (image == NULL) {
 519             *status = U_MEMORY_ALLOCATION_ERROR;
 520             return NULL;
 521         }
 522         ucol_cloneBinary(coll, image, imageSize, status);
 523         imageAllocated = TRUE;
 524     }
 525     else {
 526         image = (uint8_t *)coll->image;
 527     }
 528     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
 529     if (U_FAILURE(*status)) {
 530         return NULL;
 531     }
 532
 533     if (coll->rules) {
 534         if (coll->freeRulesOnClose) {
 535             localCollator->rules = u_strcpy(rules, coll->rules);
 536             //bufferEnd += rulesSize;
 537         }
 538         else {
 539             localCollator->rules = coll->rules;
 540         }
 541         localCollator->freeRulesOnClose = FALSE;
 542         localCollator->rulesLength = coll->rulesLength;
 543     }
 544
 545     int32_t i;
 546     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
 547         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
 548     }
 549     // zero copies of pointers
 550     localCollator->actualLocale = NULL;
 551     localCollator->validLocale = NULL;
 552     localCollator->requestedLocale = NULL;
 553     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
 554     localCollator->freeOnClose = colAllocated;
 555     localCollator->freeImageOnClose = imageAllocated;
 556     return localCollator;
 557 }
 558
 559 U_CAPI void U_EXPORT2
 560 ucol_close(UCollator *coll)
 561 {
 562     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
 563     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
 564     if(coll != NULL) {
 565         // these are always owned by each UCollator struct,
 566         // so we always free them
 567         if(coll->validLocale != NULL) {
 568             uprv_free(coll->validLocale);
 569         }
 570         if(coll->actualLocale != NULL) {
 571             uprv_free(coll->actualLocale);
 572         }
 573         if(coll->requestedLocale != NULL) {
 574             uprv_free(coll->requestedLocale);
 575         }
 576         if(coll->latinOneCEs != NULL) {
 577             uprv_free(coll->latinOneCEs);
 578         }
 579         if(coll->options != NULL && coll->freeOptionsOnClose) {
 580             uprv_free(coll->options);
 581         }
 582         if(coll->rules != NULL && coll->freeRulesOnClose) {
 583             uprv_free((UChar *)coll->rules);
 584         }
 585         if(coll->image != NULL && coll->freeImageOnClose) {
 586             uprv_free((UCATableHeader *)coll->image);
 587         }
 588
 589         /* Here, it would be advisable to close: */
 590         /* - UData for UCA (unless we stuff it in the root resb */
 591         /* Again, do we need additional housekeeping... HMMM! */
 592         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
 593         if(coll->freeOnClose){
 594             /* for safeClone, if freeOnClose is FALSE,
 595             don't free the other instance data */
 596             uprv_free(coll);
 597         }
 598     }
 599     UTRACE_EXIT();
 600 }
 601
 602 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
 603 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
 604 U_CFUNC uint8_t* U_EXPORT2
 605 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
 606 {
 607     uint8_t *result = NULL;
 608     if(U_FAILURE(*status)) {
 609         return NULL;
 610     }
 611     if(coll->hasRealData == TRUE) {
 612         *length = coll->image->size;
 613         result = (uint8_t *)uprv_malloc(*length);
 614         /* test for NULL */
 615         if (result == NULL) {
 616             *status = U_MEMORY_ALLOCATION_ERROR;
 617             return NULL;
 618         }
 619         uprv_memcpy(result, coll->image, *length);
 620     } else {
 621         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 622         result = (uint8_t *)uprv_malloc(*length);
 623         /* test for NULL */
 624         if (result == NULL) {
 625             *status = U_MEMORY_ALLOCATION_ERROR;
 626             return NULL;
 627         }
 628
 629         /* build the UCATableHeader with minimal entries */
 630         /* do not copy the header from the UCA file because its values are wrong! */
 631         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 632
 633         /* reset everything */
 634         uprv_memset(result, 0, *length);
 635
 636         /* set the tailoring-specific values */
 637         UCATableHeader *myData = (UCATableHeader *)result;
 638         myData->size = *length;
 639
 640         /* offset for the options, the only part of the data that is present after the header */
 641         myData->options = sizeof(UCATableHeader);
 642
 643         /* need to always set the expansion value for an upper bound of the options */
 644         myData->expansion = myData->options + sizeof(UColOptionSet);
 645
 646         myData->magic = UCOL_HEADER_MAGIC;
 647         myData->isBigEndian = U_IS_BIG_ENDIAN;
 648         myData->charSetFamily = U_CHARSET_FAMILY;
 649
 650         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 651         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 652
 653         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 654         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 655         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 656         myData->jamoSpecial = coll->image->jamoSpecial;
 657
 658         /* copy the collator options */
 659         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 660     }
 661     return result;
 662 }
 663
 664 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 665     if(U_FAILURE(*status)) {
 666         return;
 667     }
 668     result->caseFirst = (UColAttributeValue)opts->caseFirst;
 669     result->caseLevel = (UColAttributeValue)opts->caseLevel;
 670     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
 671     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
 672     result->strength = (UColAttributeValue)opts->strength;
 673     result->variableTopValue = opts->variableTopValue;
 674     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
 675     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
 676     result->numericCollation = (UColAttributeValue)opts->numericCollation;
 677
 678     result->caseFirstisDefault = TRUE;
 679     result->caseLevelisDefault = TRUE;
 680     result->frenchCollationisDefault = TRUE;
 681     result->normalizationModeisDefault = TRUE;
 682     result->strengthisDefault = TRUE;
 683     result->variableTopValueisDefault = TRUE;
 684     result->hiraganaQisDefault = TRUE;
 685     result->numericCollationisDefault = TRUE;
 686
 687     ucol_updateInternalState(result, status);
 688
 689     result->options = opts;
 690 }
 691
 692
 693 /**
 694 * Approximate determination if a character is at a contraction end.
 695 * Guaranteed to be TRUE if a character is at the end of a contraction,
 696 * otherwise it is not deterministic.
 697 * @param c character to be determined
 698 * @param coll collator
 699 */
 700 static
 701 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 702     if (c < coll->minContrEndCP) {
 703         return FALSE;
 704     }
 705
 706     int32_t  hash = c;
 707     uint8_t  htbyte;
 708     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 709         if (U16_IS_TRAIL(c)) {
 710             return TRUE;
 711         }
 712         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 713     }
 714     htbyte = coll->contrEndCP[hash>>3];
 715     return (((htbyte >> (hash & 7)) & 1) == 1);
 716 }
 717
 718
 719
 720 /*
 721 *   i_getCombiningClass()
 722 *        A fast, at least partly inline version of u_getCombiningClass()
 723 *        This is a candidate for further optimization.  Used heavily
 724 *        in contraction processing.
 725 */
 726 static
 727 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
 728     uint8_t sCC = 0;
 729     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
 730         sCC = u_getCombiningClass(c);
 731     }
 732     return sCC;
 733 }
 734
 735 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
 736     UChar c;
 737     UCollator *result = fillIn;
 738     if(U_FAILURE(*status) || image == NULL) {
 739         return NULL;
 740     }
 741
 742     if(result == NULL) {
 743         result = (UCollator *)uprv_malloc(sizeof(UCollator));
 744         if(result == NULL) {
 745             *status = U_MEMORY_ALLOCATION_ERROR;
 746             return result;
 747         }
 748         result->freeOnClose = TRUE;
 749     } else {
 750         result->freeOnClose = FALSE;
 751     }
 752
 753     // init FCD data
 754     if (fcdTrieIndex == NULL) {
 755         // The result is constant, until the library is reloaded.
 756         fcdTrieIndex = unorm_getFCDTrie(status);
 757         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
 758     }
 759
 760     result->image = image;
 761     result->mapping.getFoldingOffset = _getFoldingOffset;
 762     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
 763     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
 764     if(U_FAILURE(*status)) {
 765         if(result->freeOnClose == TRUE) {
 766             uprv_free(result);
 767             result = NULL;
 768         }
 769         return result;
 770     }
 771
 772     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
 773     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
 774     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
 775     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
 776     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
 777
 778     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
 779     result->freeOptionsOnClose = FALSE;
 780
 781     /* set attributes */
 782     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
 783     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
 784     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
 785     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
 786     result->strength = (UColAttributeValue)result->options->strength;
 787     result->variableTopValue = result->options->variableTopValue;
 788     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
 789     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
 790     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
 791
 792     result->caseFirstisDefault = TRUE;
 793     result->caseLevelisDefault = TRUE;
 794     result->frenchCollationisDefault = TRUE;
 795     result->normalizationModeisDefault = TRUE;
 796     result->strengthisDefault = TRUE;
 797     result->variableTopValueisDefault = TRUE;
 798     result->alternateHandlingisDefault = TRUE;
 799     result->hiraganaQisDefault = TRUE;
 800     result->numericCollationisDefault = TRUE;
 801
 802     /*result->scriptOrder = NULL;*/
 803
 804     result->rules = NULL;
 805     result->rulesLength = 0;
 806     result->freeRulesOnClose = FALSE;
 807
 808     /* get the version info from UCATableHeader and populate the Collator struct*/
 809     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
 810     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
 811     result->dataVersion[2] = 0;
 812     result->dataVersion[3] = 0;
 813
 814     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
 815     result->minUnsafeCP = 0;
 816     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
 817         if (ucol_unsafeCP(c, result)) break;
 818     }
 819     result->minUnsafeCP = c;
 820
 821     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
 822     result->minContrEndCP = 0;
 823     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
 824         if (ucol_contractionEndCP(c, result)) break;
 825     }
 826     result->minContrEndCP = c;
 827
 828     /* max expansion tables */
 829     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
 830                                          result->image->endExpansionCE);
 831     result->lastEndExpansionCE = result->endExpansionCE +
 832                                  result->image->endExpansionCECount - 1;
 833     result->expansionCESize = (uint8_t*)result->image +
 834                                                result->image->expansionCESize;
 835
 836
 837     //result->errorCode = *status;
 838
 839     result->latinOneCEs = NULL;
 840
 841     result->latinOneRegenTable = FALSE;
 842     result->latinOneFailed = FALSE;
 843     result->UCA = UCA;
 844
 845     ucol_updateInternalState(result, status);
 846
 847     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
 848     result->ucaRules = NULL;
 849     result->actualLocale = NULL;
 850     result->validLocale = NULL;
 851     result->requestedLocale = NULL;
 852     result->hasRealData = FALSE; // real data lives in .dat file...
 853     result->freeImageOnClose = FALSE;
 854
 855     return result;
 856 }
 857
 858 /* new Mark's code */
 859
 860 /**
 861  * For generation of Implicit CEs
 862  * @author Davis
 863  *
 864  * Cleaned up so that changes can be made more easily.
 865  * Old values:
 866 # First Implicit: E26A792D
 867 # Last Implicit: E3DC70C0
 868 # First CJK: E0030300
 869 # Last CJK: E0A9DD00
 870 # First CJK_A: E0A9DF00
 871 # Last CJK_A: E0DE3100
 872  */
 873 /* Following is a port of Mark's code for new treatment of implicits.
 874  * It is positioned here, since ucol_initUCA need to initialize the
 875  * variables below according to the data in the fractional UCA.
 876  */
 877
 878 /**
 879  * Function used to:
 880  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
 881  * b) bump any non-CJK characters by 10FFFF.
 882  * The relevant blocks are:
 883  * A:    4E00..9FFF; CJK Unified Ideographs
 884  *       F900..FAFF; CJK Compatibility Ideographs
 885  * B:    3400..4DBF; CJK Unified Ideographs Extension A
 886  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
 887  * As long as
 888  *   no new B characters are allocated between 4E00 and FAFF, and
 889  *   no new A characters are outside of this range,
 890  * (very high probability) this simple code will work.
 891  * The reordered blocks are:
 892  * Block1 is CJK
 893  * Block2 is CJK_COMPAT_USED
 894  * Block3 is CJK_A
 895  * (all contiguous)
 896  * Any other CJK gets its normal code point
 897  * Any non-CJK gets +10FFFF
 898  * When we reorder Block1, we make sure that it is at the very start,
 899  * so that it will use a 3-byte form.
 900  * Warning: we only pick up the compatibility characters that are
 901  * NOT decomposed, so that block is smaller!
 902  */
 903
 904 // CONSTANTS
 905 static const UChar32
 906     NON_CJK_OFFSET = 0x110000,
 907     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
 908
 909 /**
 910  * Precomputed by constructor
 911  */
 912 static int32_t
 913     final3Multiplier = 0,
 914     final4Multiplier = 0,
 915     final3Count = 0,
 916     final4Count = 0,
 917     medialCount = 0,
 918     min3Primary = 0,
 919     min4Primary = 0,
 920     max4Primary = 0,
 921     minTrail = 0,
 922     maxTrail = 0,
 923     max3Trail = 0,
 924     max4Trail = 0,
 925     min4Boundary = 0;
 926
 927 static const UChar32
 928     CJK_BASE = 0x4E00,
 929     CJK_LIMIT = 0x9FFF+1,
 930     CJK_COMPAT_USED_BASE = 0xFA0E,
 931     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
 932     CJK_A_BASE = 0x3400,
 933     CJK_A_LIMIT = 0x4DBF+1,
 934     CJK_B_BASE = 0x20000,
 935     CJK_B_LIMIT = 0x2A6DF+1;
 936
 937 static UChar32 swapCJK(UChar32 i) {
 938
 939     if (i >= CJK_BASE) {
 940         if (i < CJK_LIMIT)              return i - CJK_BASE;
 941
 942         if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
 943
 944         if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
 945                                                 + (CJK_LIMIT - CJK_BASE);
 946         if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
 947
 948         if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
 949
 950         return i + NON_CJK_OFFSET;  // non-CJK
 951     }
 952     if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
 953
 954     if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
 955                                                 + (CJK_LIMIT - CJK_BASE)
 956                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
 957     return i + NON_CJK_OFFSET; // non-CJK
 958 }
 959
 960 U_CAPI UChar32 U_EXPORT2
 961 uprv_uca_getRawFromCodePoint(UChar32 i) {
 962     return swapCJK(i)+1;
 963 }
 964
 965 U_CAPI UChar32 U_EXPORT2
 966 uprv_uca_getCodePointFromRaw(UChar32 i) {
 967     i--;
 968     UChar32 result = 0;
 969     if(i >= NON_CJK_OFFSET) {
 970         result = i - NON_CJK_OFFSET;
 971     } else if(i >= CJK_B_BASE) {
 972         result = i;
 973     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
 974         if(i < CJK_LIMIT - CJK_BASE) {
 975             result = i + CJK_BASE;
 976         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
 977             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
 978         } else {
 979             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
 980         }
 981     } else {
 982         result = -1;
 983     }
 984     return result;
 985 }
 986
 987 // GET IMPLICIT PRIMARY WEIGHTS
 988 // Return value is left justified primary key
 989 U_CAPI uint32_t U_EXPORT2
 990 uprv_uca_getImplicitFromRaw(UChar32 cp) {
 991     /*
 992     if (cp < 0 || cp > UCOL_MAX_INPUT) {
 993         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
 994     }
 995     */
 996     int32_t last0 = cp - min4Boundary;
 997     if (last0 < 0) {
 998         int32_t last1 = cp / final3Count;
 999         last0 = cp % final3Count;
1000
1001         int32_t last2 = last1 / medialCount;
1002         last1 %= medialCount;
1003
1004         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1005         last1 = minTrail + last1; // offset
1006         last2 = min3Primary + last2; // offset
1007         /*
1008         if (last2 >= min4Primary) {
1009             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1010         }
1011         */
1012         return (last2 << 24) + (last1 << 16) + (last0 << 8);
1013     } else {
1014         int32_t last1 = last0 / final4Count;
1015         last0 %= final4Count;
1016
1017         int32_t last2 = last1 / medialCount;
1018         last1 %= medialCount;
1019
1020         int32_t last3 = last2 / medialCount;
1021         last2 %= medialCount;
1022
1023         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1024         last1 = minTrail + last1; // offset
1025         last2 = minTrail + last2; // offset
1026         last3 = min4Primary + last3; // offset
1027         /*
1028         if (last3 > max4Primary) {
1029             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1030         }
1031         */
1032         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1033     }
1034 }
1035
1036 static uint32_t U_EXPORT2
1037 uprv_uca_getImplicitPrimary(UChar32 cp) {
1038     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1039
1040     cp = swapCJK(cp);
1041     cp++;
1042     // we now have a range of numbers from 0 to 21FFFF.
1043
1044     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1045
1046     return uprv_uca_getImplicitFromRaw(cp);
1047 }
1048
1049 /**
1050  * Converts implicit CE into raw integer ("code point")
1051  * @param implicit
1052  * @return -1 if illegal format
1053  */
1054 U_CAPI UChar32 U_EXPORT2
1055 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1056     UChar32 result;
1057     UChar32 b3 = implicit & 0xFF;
1058     UChar32 b2 = (implicit >> 8) & 0xFF;
1059     UChar32 b1 = (implicit >> 16) & 0xFF;
1060     UChar32 b0 = (implicit >> 24) & 0xFF;
1061
1062     // simple parameter checks
1063     if (b0 < min3Primary || b0 > max4Primary
1064         || b1 < minTrail || b1 > maxTrail)
1065         return -1;
1066     // normal offsets
1067     b1 -= minTrail;
1068
1069     // take care of the final values, and compose
1070     if (b0 < min4Primary) {
1071         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1072             return -1;
1073         b2 -= minTrail;
1074         UChar32 remainder = b2 % final3Multiplier;
1075         if (remainder != 0)
1076             return -1;
1077         b0 -= min3Primary;
1078         b2 /= final3Multiplier;
1079         result = ((b0 * medialCount) + b1) * final3Count + b2;
1080     } else {
1081         if (b2 < minTrail || b2 > maxTrail
1082             || b3 < minTrail || b3 > max4Trail)
1083             return -1;
1084         b2 -= minTrail;
1085         b3 -= minTrail;
1086         UChar32 remainder = b3 % final4Multiplier;
1087         if (remainder != 0)
1088             return -1;
1089         b3 /= final4Multiplier;
1090         b0 -= min4Primary;
1091         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1092     }
1093     // final check
1094     if (result < 0 || result > UCOL_MAX_INPUT)
1095         return -1;
1096     return result;
1097 }
1098
1099
/* Integer division of a by b, computed as one more than (a-1)/b. */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeGroupsBelow = (a - 1) / b;
    return wholeGroupsBelow + 1;
}
1103
1104 /* this function is either called from initUCA or from genUCA before
1105  * doing canonical closure for the UCA.
1106  */
1107
1108 /**
1109  * Set up to generate implicits.
1110  * @param minPrimary
1111  * @param maxPrimary
1112  * @param minTrail final byte
1113  * @param maxTrail final byte
1114  * @param gap3 the gap we leave for tailoring for 3-byte forms
1115  * @param gap4 the gap we leave for tailoring for 4-byte forms
1116  */
1117 static void initImplicitConstants(int minPrimary, int maxPrimary,
1118                                     int minTrailIn, int maxTrailIn,
1119                                     int gap3, int primaries3count,
1120                                     UErrorCode *status) {
1121     // some simple parameter checks
1122     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1123         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1124         || (primaries3count < 1))
1125     {
1126         *status = U_ILLEGAL_ARGUMENT_ERROR;
1127         return;
1128     };
1129
1130     minTrail = minTrailIn;
1131     maxTrail = maxTrailIn;
1132
1133     min3Primary = minPrimary;
1134     max4Primary = maxPrimary;
1135     // compute constants for use later.
1136     // number of values we can use in trailing bytes
1137     // leave room for empty values between AND above, e.g. if gap = 2
1138     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1139     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1140     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1141     final3Multiplier = gap3 + 1;
1142     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1143     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1144
1145     // medials can use full range
1146     medialCount = (maxTrail - minTrail + 1);
1147     // find out how many values fit in each form
1148     int32_t threeByteCount = medialCount * final3Count;
1149     // now determine where the 3/4 boundary is.
1150     // we use 3 bytes below the boundary, and 4 above
1151     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1152     int32_t primaries4count = primariesAvailable - primaries3count;
1153
1154
1155     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1156     min4Primary = minPrimary + primaries3count;
1157     min4Boundary = min3ByteCoverage;
1158     // Now expand out the multiplier for the 4 bytes, and redo.
1159
1160     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1161     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1162     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1163     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1164     if (gap4 < 1) {
1165         *status = U_ILLEGAL_ARGUMENT_ERROR;
1166         return;
1167     }
1168     final4Multiplier = gap4 + 1;
1169     final4Count = neededPerFinalByte;
1170     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1171 }
1172
1173     /**
1174      * Supply parameters for generating implicit CEs
1175      */
1176 U_CAPI void U_EXPORT2
1177 uprv_uca_initImplicitConstants(UErrorCode *status) {
1178     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1179     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1180     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1181 }
1182
1183
1184 /*    collIterNormalize     Incremental Normalization happens here.                       */
1185 /*                          pick up the range of chars identified by FCD,                 */
1186 /*                          normalize it into the collIterate's writable buffer,          */
1187 /*                          switch the collIterate's state to use the writable buffer.    */
1188 /*                                                                                        */
1189 static
1190 void collIterNormalize(collIterate *collationSource)
1191 {
1192     UErrorCode  status = U_ZERO_ERROR;
1193
1194     int32_t    normLen;
1195     UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1196     UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1197
1198     normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1199                               srcP, (int32_t)(endP - srcP),
1200                               FALSE, 0,
1201                               &status);
1202     if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1203         // reallocate and terminate
1204         if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1205                                    &collationSource->writableBuffer,
1206                                    (int32_t *)&collationSource->writableBufSize, normLen + 1,
1207                                    0)
1208         ) {
1209 #ifdef UCOL_DEBUG
1210             fprintf(stderr, "collIterNormalize(), out of memory\n");
1211 #endif
1212             return;
1213         }
1214         status = U_ZERO_ERROR;
1215         normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1216                                   srcP, (int32_t)(endP - srcP),
1217                                   FALSE, 0,
1218                                   &status);
1219     }
1220     if (U_FAILURE(status)) {
1221 #ifdef UCOL_DEBUG
1222         fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1223 #endif
1224         return;
1225     }
1226
1227     if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1228         collationSource->flags |= UCOL_ITER_ALLOCATED;
1229     }
1230     collationSource->pos        = collationSource->writableBuffer;
1231     collationSource->origFlags  = collationSource->flags;
1232     collationSource->flags     |= UCOL_ITER_INNORMBUF;
1233     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1234 }
1235
1236
1237 // This function takes the iterator and extracts normalized stuff up to the next boundary
1238 // It is similar in the end results to the collIterNormalize, but for the cases when we
1239 // use an iterator
1240 /*static
1241 inline void normalizeIterator(collIterate *collationSource) {
1242   UErrorCode status = U_ZERO_ERROR;
1243   UBool wasNormalized = FALSE;
1244   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1245   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1246   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1247     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1248   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1249     // reallocate and terminate
1250     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1251                                &collationSource->writableBuffer,
1252                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1253                                0)
1254     ) {
1255     #ifdef UCOL_DEBUG
1256         fprintf(stderr, "normalizeIterator(), out of memory\n");
1257     #endif
1258         return;
1259     }
1260     status = U_ZERO_ERROR;
1261     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1262     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1263     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1264     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1265   }
1266   // Terminate the buffer - we already checked that it is big enough
1267   collationSource->writableBuffer[normLen] = 0;
1268   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1269       collationSource->flags |= UCOL_ITER_ALLOCATED;
1270   }
1271   collationSource->pos        = collationSource->writableBuffer;
1272   collationSource->origFlags  = collationSource->flags;
1273   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1274   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1275 }*/
1276
1277
1278 /* Incremental FCD check and normalize                                                    */
1279 /*   Called from getNextCE when normalization state is suspect.                           */
1280 /*   When entering, the state is known to be this:                                        */
1281 /*      o   We are working in the main buffer of the collIterate, not the side            */
1282 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1283 /*          so we won't get here.                                                         */
1284 /*      o   The leading combining class from the current character is 0 or                */
1285 /*          the trailing combining class of the previous char was zero.                   */
1286 /*          True because the previous call to this function will have always exited       */
1287 /*          that way, and we get called for every char where cc might be non-zero.        */
1288 static
1289 inline UBool collIterFCD(collIterate *collationSource) {
1290     UChar       c, c2;
1291     const UChar *srcP, *endP;
1292     uint8_t     leadingCC;
1293     uint8_t     prevTrailingCC = 0;
1294     uint16_t    fcd;
1295     UBool       needNormalize = FALSE;
1296
1297     srcP = collationSource->pos-1;
1298
1299     if (collationSource->flags & UCOL_ITER_HASLEN) {
1300         endP = collationSource->endp;
1301     } else {
1302         endP = NULL;
1303     }
1304
1305     // Get the trailing combining class of the current character.  If it's zero,
1306     //   we are OK.
1307     c = *srcP++;
1308     /* trie access */
1309     fcd = unorm_getFCD16(fcdTrieIndex, c);
1310     if (fcd != 0) {
1311         if (U16_IS_LEAD(c)) {
1312             if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1313                 ++srcP;
1314                 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1315             } else {
1316                 fcd = 0;
1317             }
1318         }
1319
1320         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1321
1322         if (prevTrailingCC != 0) {
1323             // The current char has a non-zero trailing CC.  Scan forward until we find
1324             //   a char with a leading cc of zero.
1325             while (endP == NULL || srcP != endP)
1326             {
1327                 const UChar *savedSrcP = srcP;
1328
1329                 c = *srcP++;
1330                 /* trie access */
1331                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1332                 if (fcd != 0 && U16_IS_LEAD(c)) {
1333                     if ((endP == NULL || srcP != endP) && U16_IS_TRAIL(c2=*srcP)) {
1334                         ++srcP;
1335                         fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1336                     } else {
1337                         fcd = 0;
1338                     }
1339                 }
1340                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1341                 if (leadingCC == 0) {
1342                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1343                                            //   back up over it.  (Could be surrogate pair!)
1344                     break;
1345                 }
1346
1347                 if (leadingCC < prevTrailingCC) {
1348                     needNormalize = TRUE;
1349                 }
1350
1351                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1352             }
1353         }
1354     }
1355
1356     collationSource->fcdPosition = (UChar *)srcP;
1357
1358     return needNormalize;
1359 }
1360
1361 /****************************************************************************/
1362 /* Following are the CE retrieval functions                                 */
1363 /*                                                                          */
1364 /****************************************************************************/
1365
1366 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1367 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1368
1369 /* there should be a macro version of this function in the header file */
1370 /* This is the first function that tries to fetch a collation element  */
1371 /* If it's not successful or it encounters a more difficult situation  */
1372 /* some more sophisticated and slower functions are invoked            */
1373 static
1374 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1375     uint32_t order = 0;
1376     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1377         order = *(collationSource->toReturn++);                         /* if so, return them */
1378         if(collationSource->CEpos == collationSource->toReturn) {
1379             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1380         }
1381         return order;
1382     }
1383
1384     UChar ch = 0;
1385     collationSource->offsetReturn = NULL;
1386
1387     for (;;)                           /* Loop handles case when incremental normalize switches   */
1388     {                                  /*   to or from the side buffer / original string, and we  */
1389         /*   need to start again to get the next character.        */
1390
1391         if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1392         {
1393             // The source string is null terminated and we're not working from the side buffer,
1394             //   and we're not normalizing.  This is the fast path.
1395             //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1396             ch = *collationSource->pos++;
1397             if (ch != 0) {
1398                 break;
1399             }
1400             else {
1401                 return UCOL_NO_MORE_CES;
1402             }
1403         }
1404
1405         if (collationSource->flags & UCOL_ITER_HASLEN) {
1406             // Normal path for strings when length is specified.
1407             //   (We can't be in side buffer because it is always null terminated.)
1408             if (collationSource->pos >= collationSource->endp) {
1409                 // Ran off of the end of the main source string.  We're done.
1410                 return UCOL_NO_MORE_CES;
1411             }
1412             ch = *collationSource->pos++;
1413         }
1414         else if(collationSource->flags & UCOL_USE_ITERATOR) {
1415             UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1416             if(iterCh == U_SENTINEL) {
1417                 return UCOL_NO_MORE_CES;
1418             }
1419             ch = (UChar)iterCh;
1420         }
1421         else
1422         {
1423             // Null terminated string.
1424             ch = *collationSource->pos++;
1425             if (ch == 0) {
1426                 // Ran off end of buffer.
1427                 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1428                     // Ran off end of main string. backing up one character.
1429                     collationSource->pos--;
1430                     return UCOL_NO_MORE_CES;
1431                 }
1432                 else
1433                 {
1434                     // Hit null in the normalize side buffer.
1435                     // Usually this means the end of the normalized data,
1436                     // except for one odd case: a null followed by combining chars,
1437                     //   which is the case if we are at the start of the buffer.
1438                     if (collationSource->pos == collationSource->writableBuffer+1) {
1439                         break;
1440                     }
1441
1442                     //  Null marked end of side buffer.
1443                     //   Revert to the main string and
1444                     //   loop back to top to try again to get a character.
1445                     collationSource->pos   = collationSource->fcdPosition;
1446                     collationSource->flags = collationSource->origFlags;
1447                     continue;
1448                 }
1449             }
1450         }
1451
1452         if(collationSource->flags&UCOL_HIRAGANA_Q) {
1453             /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1454              * based on whether the previous codepoint was Hiragana or Katakana.
1455              */
1456             if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1457                     ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1458                 collationSource->flags |= UCOL_WAS_HIRAGANA;
1459             } else {
1460                 collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1461             }
1462         }
1463
1464         // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1465         //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1466         if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1467             break;
1468         }
1469
1470         if (collationSource->fcdPosition >= collationSource->pos) {
1471             // An earlier FCD check has already covered the current character.
1472             // We can go ahead and process this char.
1473             break;
1474         }
1475
1476         if (ch < ZERO_CC_LIMIT_ ) {
1477             // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1478             break;
1479         }
1480
1481         if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1482             // We need to peek at the next character in order to tell if we are FCD
1483             if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1484                 // We are at the last char of source string.
1485                 //  It is always OK for FCD check.
1486                 break;
1487             }
1488
1489             // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1490             if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1491                 break;
1492             }
1493         }
1494
1495
1496         // Need a more complete FCD check and possible normalization.
1497         if (collIterFCD(collationSource)) {
1498             collIterNormalize(collationSource);
1499         }
1500         if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1501             //  No normalization was needed.  Go ahead and process the char we already had.
1502             break;
1503         }
1504
1505         // Some normalization happened.  Next loop iteration will pick up a char
1506         //   from the normalization buffer.
1507
1508     }   // end for (;;)
1509
1510
1511     if (ch <= 0xFF) {
1512         /*  For latin-1 characters we never need to fall back to the UCA table        */
1513         /*    because all of the UCA data is replicated in the latinOneMapping array  */
1514         order = coll->latinOneMapping[ch];
1515         if (order > UCOL_NOT_FOUND) {
1516             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1517         }
1518     }
1519     else
1520     {
1521         order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1522         if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1523             order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1524         }
1525         if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1526             /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1527             order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1528
1529             if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1530                 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1531             }
1532         }
1533     }
1534     if(order == UCOL_NOT_FOUND) {
1535         order = getImplicit(ch, collationSource);
1536     }
1537     return order; /* return the CE */
1538 }
1539
1540 /* ucol_getNextCE, out-of-line version for use from other files.   */
1541 U_CAPI uint32_t  U_EXPORT2
1542 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1543     return ucol_IGetNextCE(coll, collationSource, status);
1544 }
1545
1546
1547 /**
1548 * Incremental previous normalization happens here. Pick up the range of chars
1549 * identifed by FCD, normalize it into the collIterate's writable buffer,
1550 * switch the collIterate's state to use the writable buffer.
1551 * @param data collation iterator data
1552 */
1553 static
1554 void collPrevIterNormalize(collIterate *data)
1555 {
1556     UErrorCode status  = U_ZERO_ERROR;
1557     UChar      *pEnd   = data->pos;         /* End normalize + 1 */
1558     UChar      *pStart;
1559     uint32_t    normLen;
1560     UChar      *pStartNorm;
1561
1562     /* Start normalize */
1563     if (data->fcdPosition == NULL) {
1564         pStart = data->string;
1565     }
1566     else {
1567         pStart = data->fcdPosition + 1;
1568     }
1569
1570     normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1571                               data->writableBuffer, 0, &status);
1572
1573     if (data->writableBufSize <= normLen) {
1574             freeHeapWritableBuffer(data);
1575             data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1576                                                         sizeof(UChar));
1577             if(data->writableBuffer == NULL) { // something is wrong here, return
1578                 data->writableBufSize = 0;     // Reset writableBufSize
1579                 return;
1580             }
1581             data->flags |= UCOL_ITER_ALLOCATED;
1582             /* to handle the zero termination */
1583             data->writableBufSize = normLen + 1;
1584     }
1585             status = U_ZERO_ERROR;
1586     /*
1587     this puts the null termination infront of the normalized string instead
1588     of the end
1589     */
1590     pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1591     *(pStartNorm - 1) = 0;
1592     unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1593                     normLen, &status);
1594
1595     if (data->offsetBuffer == NULL) {
1596         int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
1597
1598         data->offsetBufferSize = len;
1599         data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
1600         data->offsetStore = data->offsetBuffer;
1601     } else if(data->offsetBufferSize < (int32_t) normLen) {
1602         int32_t storeIX = data->offsetStore - data->offsetBuffer;
1603         int32_t *tob    = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
1604
1605         if (tob != NULL) {
1606             data->offsetBuffer = tob;
1607             data->offsetStore = &data->offsetBuffer[storeIX];
1608             data->offsetBufferSize = normLen + 1;
1609         }
1610     }
1611
1612     /*
1613      * The usual case at this point is that we've got a base
1614      * character followed by marks that were normalized. If
1615      * fcdPosition is NULL, that means that we backed up to
1616      * the beginning of the string and there's no base character.
1617      *
1618      * Forward processing will usually normalize when it sees
1619      * the first mark, so that mark will get it's natural offset
1620      * and the rest will get the offset of the character following
1621      * the marks. The base character will also get its natural offset.
1622      *
1623      * We write the offset of the base character, if there is one,
1624      * followed by the offset of the first mark and then the offsets
1625      * of the rest of the marks.
1626      */
1627     int32_t firstMarkOffset = 0;
1628     int32_t trailOffset     = data->pos - data->string + 1;
1629     int32_t trailCount      = normLen - 1;
1630
1631     if (data->fcdPosition != NULL) {
1632         int32_t baseOffset = data->fcdPosition - data->string;
1633         UChar   baseChar   = *data->fcdPosition;
1634
1635         firstMarkOffset = baseOffset + 1;
1636
1637         /*
1638          * If the base character is the start of a contraction, forward processing
1639          * will normalize the marks while checking for the contraction, which means
1640          * that the offset of the first mark will the same as the other marks.
1641          *
1642          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1643          */
1644         if (baseChar >= 0x100) {
1645             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1646
1647             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1648                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1649             }
1650
1651             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1652                 firstMarkOffset = trailOffset;
1653             }
1654         }
1655
1656         *(data->offsetStore++) = baseOffset;
1657     }
1658
1659     *(data->offsetStore++) = firstMarkOffset;
1660
1661     for (int32_t i = 0; i < trailCount; i += 1) {
1662         *(data->offsetStore++) = trailOffset;
1663     }
1664
1665     data->offsetRepeatValue = trailOffset;
1666
1667     data->offsetReturn = data->offsetStore - 1;
1668     if (data->offsetReturn == data->offsetBuffer) {
1669         data->offsetStore = data->offsetBuffer;
1670     }
1671
1672     data->pos        = data->writableBuffer + data->writableBufSize;
1673     data->origFlags  = data->flags;
1674     data->flags     |= UCOL_ITER_INNORMBUF;
1675     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1676 }
1677
1678
1679 /**
1680 * Incremental FCD check for previous iteration and normalize. Called from
1681 * getPrevCE when normalization state is suspect.
1682 * When entering, the state is known to be this:
1683 * o  We are working in the main buffer of the collIterate, not the side
1684 *    writable buffer. When in the side buffer, normalization mode is always
1685 *    off, so we won't get here.
1686 * o  The leading combining class from the current character is 0 or the
1687 *    trailing combining class of the previous char was zero.
1688 *    True because the previous call to this function will have always exited
1689 *    that way, and we get called for every char where cc might be non-zero.
1690 * @param data collation iterate struct
1691 * @return normalization status, TRUE for normalization to be done, FALSE
1692 *         otherwise
1693 */
1694 static
1695 inline UBool collPrevIterFCD(collIterate *data)
1696 {
1697     const UChar *src, *start;
1698     UChar       c, c2;
1699     uint8_t     leadingCC;
1700     uint8_t     trailingCC = 0;
1701     uint16_t    fcd;
1702     UBool       result = FALSE;
1703
1704     start = data->string;
1705     src = data->pos + 1;
1706
1707     /* Get the trailing combining class of the current character. */
1708     c = *--src;
1709     if (!U16_IS_SURROGATE(c)) {
1710         fcd = unorm_getFCD16(fcdTrieIndex, c);
1711     } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1712         --src;
1713         fcd = unorm_getFCD16(fcdTrieIndex, c2);
1714         if (fcd != 0) {
1715             fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1716         }
1717     } else /* unpaired surrogate */ {
1718         fcd = 0;
1719     }
1720
1721     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1722
1723     if (leadingCC != 0) {
1724         /*
1725         The current char has a non-zero leading combining class.
1726         Scan backward until we find a char with a trailing cc of zero.
1727         */
1728         for (;;)
1729         {
1730             if (start == src) {
1731                 data->fcdPosition = NULL;
1732                 return result;
1733             }
1734
1735             c = *--src;
1736             if (!U16_IS_SURROGATE(c)) {
1737                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1738             } else if (U16_IS_TRAIL(c) && start < src && U16_IS_LEAD(c2 = *(src - 1))) {
1739                 --src;
1740                 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1741                 if (fcd != 0) {
1742                     fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1743                 }
1744             } else /* unpaired surrogate */ {
1745                 fcd = 0;
1746             }
1747
1748             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1749
1750             if (trailingCC == 0) {
1751                 break;
1752             }
1753
1754             if (leadingCC < trailingCC) {
1755                 result = TRUE;
1756             }
1757
1758             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1759         }
1760     }
1761
1762     data->fcdPosition = (UChar *)src;
1763
1764     return result;
1765 }
1766
1767 /** gets a character from the string at a given offset
1768  *  Handles both normal and iterative cases.
1769  *  No error checking - caller beware!
1770  */
1771 inline static
1772 UChar peekCharacter(collIterate *source, int32_t offset) {
1773     if(source->pos != NULL) {
1774         return *(source->pos + offset);
1775     } else if(source->iterator != NULL) {
1776         if(offset != 0) {
1777             source->iterator->move(source->iterator, offset, UITER_CURRENT);
1778             UChar toReturn = (UChar)source->iterator->next(source->iterator);
1779             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1780             return toReturn;
1781         } else {
1782             return (UChar)source->iterator->current(source->iterator);
1783         }
1784     } else {
1785         return (UChar)U_SENTINEL;
1786     }
1787 }
1788
1789 /**
1790 * Determines if we are at the start of the data string in the backwards
1791 * collation iterator
1792 * @param data collation iterator
1793 * @return TRUE if we are at the start
1794 */
1795 static
1796 inline UBool isAtStartPrevIterate(collIterate *data) {
1797     if(data->pos == NULL && data->iterator != NULL) {
1798         return !data->iterator->hasPrevious(data->iterator);
1799     }
1800     //return (collIter_bos(data)) ||
1801     return (data->pos == data->string) ||
1802               ((data->flags & UCOL_ITER_INNORMBUF) &&
1803               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1804 }
1805
1806 static
1807 inline void goBackOne(collIterate *data) {
1808 # if 0
1809     // somehow, it looks like we need to keep iterator synced up
1810     // at all times, as above.
1811     if(data->pos) {
1812         data->pos--;
1813     }
1814     if(data->iterator) {
1815         data->iterator->previous(data->iterator);
1816     }
1817 #endif
1818     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1819         data->iterator->previous(data->iterator);
1820     }
1821     if(data->pos) {
1822         data->pos --;
1823     }
1824 }
1825
1826 /**
1827 * Inline function that gets a simple CE.
1828 * So what it does is that it will first check the expansion buffer. If the
1829 * expansion buffer is not empty, ie the end pointer to the expansion buffer
1830 * is different from the string pointer, we return the collation element at the
1831 * return pointer and decrement it.
1832 * For more complicated CEs it resorts to getComplicatedCE.
1833 * @param coll collator data
1834 * @param data collation iterator struct
1835 * @param status error status
1836 */
1837 static
1838 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1839                                UErrorCode *status)
1840 {
1841     uint32_t result = (uint32_t)UCOL_NULLORDER;
1842
1843     if (data->offsetReturn != NULL) {
1844         if (data->offsetRepeatCount > 0) {
1845                 data->offsetRepeatCount -= 1;
1846         } else {
1847             if (data->offsetReturn == data->offsetBuffer) {
1848                 data->offsetReturn = NULL;
1849                 data->offsetStore  = data->offsetBuffer;
1850             } else {
1851                 data->offsetReturn -= 1;
1852             }
1853         }
1854     }
1855
1856     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
1857             (!data->extendCEs && data->toReturn > data->CEs))
1858     {
1859         data->toReturn -= 1;
1860         result = *(data->toReturn);
1861         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
1862             data->CEpos = data->toReturn;
1863         }
1864     }
1865     else {
1866         UChar ch = 0;
1867
1868         /*
1869         Loop handles case when incremental normalize switches to or from the
1870         side buffer / original string, and we need to start again to get the
1871         next character.
1872         */
1873         for (;;) {
1874             if (data->flags & UCOL_ITER_HASLEN) {
1875                 /*
1876                 Normal path for strings when length is specified.
1877                 Not in side buffer because it is always null terminated.
1878                 */
1879                 if (data->pos <= data->string) {
1880                     /* End of the main source string */
1881                     return UCOL_NO_MORE_CES;
1882                 }
1883                 data->pos --;
1884                 ch = *data->pos;
1885             }
1886             // we are using an iterator to go back. Pray for us!
1887             else if (data->flags & UCOL_USE_ITERATOR) {
1888               UChar32 iterCh = data->iterator->previous(data->iterator);
1889               if(iterCh == U_SENTINEL) {
1890                 return UCOL_NO_MORE_CES;
1891               } else {
1892                 ch = (UChar)iterCh;
1893               }
1894             }
1895             else {
1896                 data->pos --;
1897                 ch = *data->pos;
1898                 /* we are in the side buffer. */
1899                 if (ch == 0) {
1900                     /*
1901                     At the start of the normalize side buffer.
1902                     Go back to string.
1903                     Because pointer points to the last accessed character,
1904                     hence we have to increment it by one here.
1905                     */
1906                     data->flags = data->origFlags;
1907                     data->offsetRepeatValue = 0;
1908
1909                      if (data->fcdPosition == NULL) {
1910                         data->pos = data->string;
1911                         return UCOL_NO_MORE_CES;
1912                     }
1913                     else {
1914                         data->pos   = data->fcdPosition + 1;
1915                     }
1916
1917                    continue;
1918                 }
1919             }
1920
1921             if(data->flags&UCOL_HIRAGANA_Q) {
1922               if(ch>=0x3040 && ch<=0x309f) {
1923                 data->flags |= UCOL_WAS_HIRAGANA;
1924               } else {
1925                 data->flags &= ~UCOL_WAS_HIRAGANA;
1926               }
1927             }
1928
1929             /*
1930             * got a character to determine if there's fcd and/or normalization
1931             * stuff to do.
1932             * if the current character is not fcd.
1933             * if current character is at the start of the string
1934             * Trailing combining class == 0.
1935             * Note if pos is in the writablebuffer, norm is always 0
1936             */
1937             if (ch < ZERO_CC_LIMIT_ ||
1938               // this should propel us out of the loop in the iterator case
1939                 (data->flags & UCOL_ITER_NORM) == 0 ||
1940                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1941                 || data->string == data->pos) {
1942                 break;
1943             }
1944
1945             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1946                 /* if next character is FCD */
1947                 if (data->pos == data->string) {
1948                     /* First char of string is always OK for FCD check */
1949                     break;
1950                 }
1951
1952                 /* Not first char of string, do the FCD fast test */
1953                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1954                     break;
1955                 }
1956             }
1957
1958             /* Need a more complete FCD check and possible normalization. */
1959             if (collPrevIterFCD(data)) {
1960                 collPrevIterNormalize(data);
1961             }
1962
1963             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1964                 /*  No normalization. Go ahead and process the char. */
1965                 break;
1966             }
1967
1968             /*
1969             Some normalization happened.
1970             Next loop picks up a char from the normalization buffer.
1971             */
1972         }
1973
1974         /* attempt to handle contractions, after removal of the backwards
1975         contraction
1976         */
1977         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1978             result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1979         } else {
1980             if (ch <= 0xFF) {
1981                 result = coll->latinOneMapping[ch];
1982             }
1983             else {
1984                 result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1985             }
1986             if (result > UCOL_NOT_FOUND) {
1987                 result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1988             }
1989             if (result == UCOL_NOT_FOUND) { // Not found in master list
1990                 if (!isAtStartPrevIterate(data) &&
1991                     ucol_contractionEndCP(ch, data->coll))
1992                 {
1993                     result = UCOL_CONTRACTION;
1994                 } else {
1995                     if(coll->UCA) {
1996                         result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1997                     }
1998                 }
1999
2000                 if (result > UCOL_NOT_FOUND) {
2001                     if(coll->UCA) {
2002                         result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2003                     }
2004                 }
2005             }
2006         }
2007
2008         if(result == UCOL_NOT_FOUND) {
2009             result = getPrevImplicit(ch, data);
2010         }
2011     }
2012
2013     return result;
2014 }
2015
2016
2017 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
2018 U_CFUNC uint32_t  U_EXPORT2
2019 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2020                         UErrorCode *status) {
2021     return ucol_IGetPrevCE(coll, data, status);
2022 }
2023
2024
2025 /* this should be connected to special Jamo handling */
2026 U_CFUNC uint32_t  U_EXPORT2
2027 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2028     collIterate colIt;
2029     uint32_t order;
2030     IInit_collIterate(coll, &u, 1, &colIt);
2031     order = ucol_IGetNextCE(coll, &colIt, status);
2032     /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2033     return order;
2034 }
2035
2036 /**
2037 * Inserts the argument character into the end of the buffer pushing back the
2038 * null terminator.
2039 * @param data collIterate struct data
2040 * @param pNull pointer to the null termination
2041 * @param ch character to be appended
2042 * @return the position of the new addition
2043 */
2044 static
2045 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
2046 {
2047     uint32_t  size    = data->writableBufSize;
2048     UChar    *newbuffer;
2049     static const uint32_t  INCSIZE = 5;
2050
2051     if ((data->writableBuffer + size) > (pNull + 1)) {
2052         *pNull = ch;
2053         *(pNull + 1) = 0;
2054         return pNull;
2055     }
2056
2057     /*
2058     buffer will always be null terminated at the end.
2059     giving extra space since it is likely that more characters will be added.
2060     */
2061     size += INCSIZE;
2062     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2063     if(newbuffer != NULL) { // something wrong, but no status
2064         uprv_memcpy(newbuffer, data->writableBuffer,
2065             data->writableBufSize * sizeof(UChar));
2066
2067         freeHeapWritableBuffer(data);
2068         data->writableBufSize = size;
2069         data->writableBuffer  = newbuffer;
2070
2071         newbuffer        = newbuffer + data->writableBufSize;
2072         *newbuffer       = ch;
2073         *(newbuffer + 1) = 0;
2074     }
2075     return newbuffer;
2076 }
2077
2078 /**
2079 * Inserts the argument string into the end of the buffer pushing back the
2080 * null terminator.
2081 * @param data collIterate struct data
2082 * @param pNull pointer to the null termination
2083 * @param string to be appended
2084 * @param length of the string to be appended
2085 * @return the position of the new addition
2086 */
2087 static
2088 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2089                                int32_t length)
2090 {
2091     uint32_t  size = pNull - data->writableBuffer;
2092     UChar    *newbuffer;
2093
2094     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2095         uprv_memcpy(pNull, str, length * sizeof(UChar));
2096         *(pNull + length) = 0;
2097         return pNull;
2098     }
2099
2100     /*
2101     buffer will always be null terminated at the end.
2102     giving extra space since it is likely that more characters will be added.
2103     */
2104     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2105     if(newbuffer != NULL) {
2106       uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2107       uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
2108
2109       freeHeapWritableBuffer(data);
2110       data->writableBufSize = size + length + 1;
2111       data->writableBuffer  = newbuffer;
2112     }
2113
2114     return newbuffer;
2115 }
2116
2117 /**
2118 * Special normalization function for contraction in the forwards iterator.
2119 * This normalization sequence will place the current character at source->pos
2120 * and its following normalized sequence into the buffer.
2121 * The fcd position, pos will be changed.
2122 * pos will now point to positions in the buffer.
2123 * Flags will be changed accordingly.
2124 * @param data collation iterator data
2125 */
2126 static
2127 inline void normalizeNextContraction(collIterate *data)
2128 {
2129     UChar      *buffer     = data->writableBuffer;
2130     uint32_t    buffersize = data->writableBufSize;
2131     uint32_t    strsize;
2132     UErrorCode  status     = U_ZERO_ERROR;
2133     /* because the pointer points to the next character */
2134     UChar      *pStart     = data->pos - 1;
2135     UChar      *pEnd;
2136     uint32_t    normLen;
2137     UChar      *pStartNorm;
2138
2139     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2140         *data->writableBuffer = *(pStart - 1);
2141         strsize               = 1;
2142     }
2143     else {
2144         strsize = u_strlen(data->writableBuffer);
2145     }
2146
2147     pEnd = data->fcdPosition;
2148
2149     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2150                               &status);
2151
2152     if (buffersize <= normLen + strsize) {
2153         uint32_t  size = strsize + normLen + 1;
2154         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2155         if(temp != NULL) {
2156           uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2157           freeHeapWritableBuffer(data);
2158           data->writableBuffer = temp;
2159           data->writableBufSize = size;
2160           data->flags |= UCOL_ITER_ALLOCATED;
2161         } else {
2162             return; // Avoid writing past bound of buffer->writableBuffer.
2163         }
2164     }
2165
2166     status            = U_ZERO_ERROR;
2167     pStartNorm        = buffer + strsize;
2168     /* null-termination will be added here */
2169     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2170                     normLen + 1, &status);
2171
2172     data->pos        = data->writableBuffer + strsize;
2173     data->origFlags  = data->flags;
2174     data->flags     |= UCOL_ITER_INNORMBUF;
2175     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2176 }
2177
2178 /**
2179 * Contraction character management function that returns the next character
2180 * for the forwards iterator.
2181 * Does nothing if the next character is in buffer and not the first character
2182 * in it.
2183 * Else it checks next character in data string to see if it is normalizable.
2184 * If it is not, the character is simply copied into the buffer, else
2185 * the whole normalized substring is copied into the buffer, including the
2186 * current character.
2187 * @param data collation element iterator data
2188 * @return next character
2189 */
2190 static
2191 inline UChar getNextNormalizedChar(collIterate *data)
2192 {
2193     UChar  nextch;
2194     UChar  ch;
2195     // Here we need to add the iterator code. One problem is the way
2196     // end of string is handled. If we just return next char, it could
2197     // be the sentinel. Most of the cases already check for this, but we
2198     // need to be sure.
2199     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2200          /* if no normalization and not in buffer. */
2201       if(data->flags & UCOL_USE_ITERATOR) {
2202          return (UChar)data->iterator->next(data->iterator);
2203       } else {
2204          return *(data->pos ++);
2205       }
2206     }
2207
2208     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2209       //normalizeIterator(data);
2210     //}
2211
2212     UChar  *pEndWritableBuffer = NULL;
2213     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2214     if ((innormbuf && *data->pos != 0) ||
2215         (data->fcdPosition != NULL && !innormbuf &&
2216         data->pos < data->fcdPosition)) {
2217         /*
2218         if next character is in normalized buffer, no further normalization
2219         is required
2220         */
2221         return *(data->pos ++);
2222     }
2223
2224     if (data->flags & UCOL_ITER_HASLEN) {
2225         /* in data string */
2226         if (data->pos + 1 == data->endp) {
2227             return *(data->pos ++);
2228         }
2229     }
2230     else {
2231         if (innormbuf) {
2232           // inside the normalization buffer, but at the end
2233           // (since we encountered zero). This means, in the
2234           // case we're using char iterator, that we need to
2235           // do another round of normalization.
2236           //if(data->origFlags & UCOL_USE_ITERATOR) {
2237             // we need to restore original flags,
2238             // otherwise, we'll lose them
2239             //data->flags = data->origFlags;
2240             //normalizeIterator(data);
2241             //return *(data->pos++);
2242           //} else {
2243             /*
2244             in writable buffer, at this point fcdPosition can not be
2245             pointing to the end of the data string. see contracting tag.
2246             */
2247           if(data->fcdPosition) {
2248             if (*(data->fcdPosition + 1) == 0 ||
2249                 data->fcdPosition + 1 == data->endp) {
2250                 /* at the end of the string, dump it into the normalizer */
2251                 data->pos = insertBufferEnd(data, data->pos,
2252                                             *(data->fcdPosition)) + 1;
2253                 // Check if data->pos received a null pointer
2254                 if (data->pos == NULL) {
2255                     return (UChar)-1; // Return to indicate error.
2256                 }
2257                 return *(data->fcdPosition ++);
2258             }
2259             pEndWritableBuffer = data->pos;
2260             data->pos = data->fcdPosition;
2261           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2262             // if we are here, we're using a normalizing iterator.
2263             // we should just continue further.
2264             data->flags = data->origFlags;
2265             data->pos = NULL;
2266             return (UChar)data->iterator->next(data->iterator);
2267           }
2268           //}
2269         }
2270         else {
2271             if (*(data->pos + 1) == 0) {
2272                 return *(data->pos ++);
2273             }
2274         }
2275     }
2276
2277     ch = *data->pos ++;
2278     nextch = *data->pos;
2279
2280     /*
2281     * if the current character is not fcd.
2282     * Trailing combining class == 0.
2283     */
2284     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2285         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2286          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2287             /*
2288             Need a more complete FCD check and possible normalization.
2289             normalize substring will be appended to buffer
2290             */
2291         if (collIterFCD(data)) {
2292             normalizeNextContraction(data);
2293             return *(data->pos ++);
2294         }
2295         else if (innormbuf) {
2296             /* fcdposition shifted even when there's no normalization, if we
2297             don't input the rest into this, we'll get the wrong position when
2298             we reach the end of the writableBuffer */
2299             int32_t length = data->fcdPosition - data->pos + 1;
2300             data->pos = insertBufferEnd(data, pEndWritableBuffer,
2301                                         data->pos - 1, length);
2302             // Check if data->pos received a null pointer
2303             if (data->pos == NULL) {
2304                 return (UChar)-1; // Return to indicate error.
2305             }
2306             return *(data->pos ++);
2307         }
2308     }
2309
2310     if (innormbuf) {
2311         /*
2312         no normalization is to be done hence only one character will be
2313         appended to the buffer.
2314         */
2315         data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2316         // Check if data->pos received a null pointer
2317         if (data->pos == NULL) {
2318             return (UChar)-1; // Return to indicate error.
2319         }
2320     }
2321
2322     /* points back to the pos in string */
2323     return ch;
2324 }
2325
2326
2327
2328 /**
2329 * Function to copy the buffer into writableBuffer and sets the fcd position to
2330 * the correct position
2331 * @param source data string source
2332 * @param buffer character buffer
2333 * @param tempdb current position in buffer that has been used up
2334 */
2335 static
2336 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2337                                      UChar *tempdb)
2338 {
2339     /* okay confusing part here. to ensure that the skipped characters are
2340     considered later, we need to place it in the appropriate position in the
2341     normalization buffer and reassign the pos pointer. simple case if pos
2342     reside in string, simply copy to normalization buffer and
2343     fcdposition = pos, pos = start of normalization buffer. if pos in
2344     normalization buffer, we'll insert the copy infront of pos and point pos
2345     to the start of the normalization buffer. why am i doing these copies?
2346     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2347     not require any changes, which be really painful. */
2348     uint32_t length = u_strlen(buffer);;
2349     if (source->flags & UCOL_ITER_INNORMBUF) {
2350         u_strcpy(tempdb, source->pos);
2351     }
2352     else {
2353         source->fcdPosition  = source->pos;
2354         source->origFlags    = source->flags;
2355         source->flags       |= UCOL_ITER_INNORMBUF;
2356         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2357     }
2358
2359     if (length >= source->writableBufSize) {
2360         freeHeapWritableBuffer(source);
2361         source->writableBuffer =
2362                      (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2363         if(source->writableBuffer == NULL) {
2364             source->writableBufSize = 0; // Reset size
2365             return;
2366         }
2367         source->writableBufSize = length;
2368     }
2369
2370     u_strcpy(source->writableBuffer, buffer);
2371     source->pos = source->writableBuffer;
2372 }
2373
2374 /**
2375 * Function to get the discontiguos collation element within the source.
2376 * Note this function will set the position to the appropriate places.
2377 * @param coll current collator used
2378 * @param source data string source
2379 * @param constart index to the start character in the contraction table
2380 * @return discontiguos collation element offset
2381 */
2382 static
2383 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2384                                 const UChar *constart)
2385 {
2386     /* source->pos currently points to the second combining character after
2387        the start character */
2388           UChar   *temppos      = source->pos;
2389           UChar    buffer[4*UCOL_MAX_BUFFER];
2390           UChar   *tempdb       = buffer;
2391     const UChar   *tempconstart = constart;
2392           uint8_t  tempflags    = source->flags;
2393           UBool    multicontraction = FALSE;
2394           UChar   *tempbufferpos = 0;
2395           collIterateState discState;
2396
2397           backupState(source, &discState);
2398
2399     //*tempdb = *(source->pos - 1);
2400     *tempdb = peekCharacter(source, -1);
2401     tempdb++;
2402     for (;;) {
2403         UChar    *UCharOffset;
2404         UChar     schar,
2405                   tchar;
2406         uint32_t  result;
2407
2408         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2409             || (peekCharacter(source, 0) == 0  &&
2410             //|| (*source->pos == 0  &&
2411                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2412                  source->fcdPosition == NULL ||
2413                  source->fcdPosition == source->endp ||
2414                  *(source->fcdPosition) == 0 ||
2415                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2416                  /* end of string in null terminated string or stopped by a
2417                  null character, note fcd does not always point to a base
2418                  character after the discontiguos change */
2419                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2420                  //u_getCombiningClass(*(source->pos)) == 0) {
2421             //constart = (UChar *)coll->image + getContractOffset(CE);
2422             if (multicontraction) {
2423                 *tempbufferpos = 0;
2424                 source->pos    = temppos - 1;
2425                 setDiscontiguosAttribute(source, buffer, tempdb);
2426                 return *(coll->contractionCEs +
2427                                     (tempconstart - coll->contractionIndex));
2428             }
2429             constart = tempconstart;
2430             break;
2431         }
2432
2433         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2434         schar = getNextNormalizedChar(source);
2435
2436         while (schar > (tchar = *UCharOffset)) {
2437             UCharOffset++;
2438         }
2439
2440         if (schar != tchar) {
2441             /* not the correct codepoint. we stuff the current codepoint into
2442             the discontiguos buffer and try the next character */
2443             *tempdb = schar;
2444             tempdb ++;
2445             continue;
2446         }
2447         else {
2448             if (u_getCombiningClass(schar) ==
2449                 u_getCombiningClass(peekCharacter(source, -2))) {
2450                 //u_getCombiningClass(*(source->pos - 2))) {
2451                 *tempdb = schar;
2452                 tempdb ++;
2453                 continue;
2454             }
2455             result = *(coll->contractionCEs +
2456                                       (UCharOffset - coll->contractionIndex));
2457         }
2458         *tempdb = 0;
2459
2460         if (result == UCOL_NOT_FOUND) {
2461           break;
2462         } else if (isContraction(result)) {
2463             /* this is a multi-contraction*/
2464             tempconstart = (UChar *)coll->image + getContractOffset(result);
2465             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2466                 != UCOL_NOT_FOUND) {
2467                 multicontraction = TRUE;
2468                 temppos       = source->pos + 1;
2469                 tempbufferpos = buffer + u_strlen(buffer);
2470             }
2471         } else {
2472             setDiscontiguosAttribute(source, buffer, tempdb);
2473             return result;
2474         }
2475     }
2476
2477     /* no problems simply reverting just like that,
2478     if we are in string before getting into this function, points back to
2479     string hence no problem.
2480     if we are in normalization buffer before getting into this function,
2481     since we'll never use another normalization within this function, we
2482     know that fcdposition points to a base character. the normalization buffer
2483     never change, hence this revert works. */
2484     loadState(source, &discState, TRUE);
2485     goBackOne(source);
2486
2487     //source->pos   = temppos - 1;
2488     source->flags = tempflags;
2489     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2490 }
2491
2492 static
2493 inline UBool isNonChar(UChar32 cp) {
2494     return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
2495 }
2496
2497 /* now uses Mark's getImplicitPrimary code */
2498 static
2499 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2500     if(isNonChar(cp)) {
2501         return 0;
2502     }
2503     uint32_t r = uprv_uca_getImplicitPrimary(cp);
2504     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2505     collationSource->offsetRepeatCount += 1;
2506     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2507 }
2508
2509 /**
2510 * Inserts the argument character into the front of the buffer replacing the
2511 * front null terminator.
2512 * @param data collation element iterator data
2513 * @param pNull pointer to the null terminator
2514 * @param ch character to be appended
2515 * @return positon of added character
2516 */
2517 static
2518 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2519 {
2520     uint32_t  size    = data->writableBufSize;
2521     UChar    *end;
2522     UChar    *newbuffer;
2523     static const uint32_t  INCSIZE = 5;
2524
2525     if (pNull > data->writableBuffer + 1) {
2526         *pNull       = ch;
2527         *(pNull - 1) = 0;
2528         return pNull;
2529     }
2530
2531     /*
2532     buffer will always be null terminated infront.
2533     giving extra space since it is likely that more characters will be added.
2534     */
2535     size += INCSIZE;
2536     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2537     if(newbuffer == NULL) {
2538         return NULL;
2539     }
2540     end = newbuffer + INCSIZE;
2541     uprv_memcpy(end, data->writableBuffer,
2542                 data->writableBufSize * sizeof(UChar));
2543     *end       = ch;
2544     *(end - 1) = 0;
2545
2546     freeHeapWritableBuffer(data);
2547
2548     data->writableBufSize = size;
2549     data->writableBuffer  = newbuffer;
2550     return end;
2551 }
2552
2553 /**
2554 * Special normalization function for contraction in the previous iterator.
2555 * This normalization sequence will place the current character at source->pos
2556 * and its following normalized sequence into the buffer.
2557 * The fcd position, pos will be changed.
2558 * pos will now point to positions in the buffer.
2559 * Flags will be changed accordingly.
2560 * @param data collation iterator data
2561 */
2562 static
2563 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2564 {
2565     uint32_t    nulltermsize;
2566     UErrorCode  localstatus = U_ZERO_ERROR;
2567     UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2568     UChar      *pStart;
2569     uint32_t    normLen;
2570     UChar      *pStartNorm;
2571
2572     if (data->flags & UCOL_ITER_HASLEN) {
2573         /*
2574         normalization buffer not used yet, we'll pull down the next
2575         character into the end of the buffer
2576         */
2577         *(data->writableBuffer + (data->writableBufSize - 1)) = *(data->pos + 1);
2578         nulltermsize                  = data->writableBufSize - 1;
2579     }
2580     else {
2581         nulltermsize = data->writableBufSize;
2582         UChar *temp = data->writableBuffer + (nulltermsize - 1);
2583         while (*(temp --) != 0) {
2584             nulltermsize --;
2585         }
2586     }
2587
2588     /* Start normalize */
2589     if (data->fcdPosition == NULL) {
2590         pStart = data->string;
2591     }
2592     else {
2593         pStart = data->fcdPosition + 1;
2594     }
2595
2596     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, data->writableBuffer, 0,
2597                               &localstatus);
2598
2599     if (nulltermsize <= normLen) {
2600         uint32_t  size = data->writableBufSize - nulltermsize + normLen + 1;
2601         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2602         if (temp == NULL) {
2603             *status = U_MEMORY_ALLOCATION_ERROR;
2604             return;
2605         }
2606         nulltermsize   = normLen + 1;
2607         uprv_memcpy(temp + normLen, data->writableBuffer,
2608                     sizeof(UChar) * (data->writableBufSize - nulltermsize));
2609         freeHeapWritableBuffer(data);
2610         data->writableBuffer = temp;
2611         data->writableBufSize = size;
2612     }
2613
2614     /*
2615     this puts the null termination infront of the normalized string instead
2616     of the end
2617     */
2618     pStartNorm   = data->writableBuffer + (nulltermsize - normLen);
2619     *(pStartNorm - 1) = 0;
2620     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2621                     status);
2622
2623     data->pos        = data->writableBuffer + nulltermsize;
2624     data->origFlags  = data->flags;
2625     data->flags     |= UCOL_ITER_INNORMBUF;
2626     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2627 }
2628
2629 /**
2630 * Contraction character management function that returns the previous character
2631 * for the backwards iterator.
2632 * Does nothing if the previous character is in buffer and not the first
2633 * character in it.
2634 * Else it checks previous character in data string to see if it is
2635 * normalizable.
2636 * If it is not, the character is simply copied into the buffer, else
2637 * the whole normalized substring is copied into the buffer, including the
2638 * current character.
2639 * @param data collation element iterator data
2640 * @return previous character
2641 */
2642 static
2643 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2644 {
2645     UChar  prevch;
2646     UChar  ch;
2647     UChar *start;
2648     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2649     UChar *pNull = NULL;
2650     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2651         (innormbuf && *(data->pos - 1) != 0)) {
2652         /*
2653         if no normalization.
2654         if previous character is in normalized buffer, no further normalization
2655         is required
2656         */
2657       if(data->flags & UCOL_USE_ITERATOR) {
2658         data->iterator->move(data->iterator, -1, UITER_CURRENT);
2659         return (UChar)data->iterator->next(data->iterator);
2660       } else {
2661         return *(data->pos - 1);
2662       }
2663     }
2664
2665     start = data->pos;
2666     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2667         /* in data string */
2668         if ((start - 1) == data->string) {
2669             return *(start - 1);
2670         }
2671         start --;
2672         ch     = *start;
2673         prevch = *(start - 1);
2674     }
2675     else {
2676         /*
2677         in writable buffer, at this point fcdPosition can not be NULL.
2678         see contracting tag.
2679         */
2680         if (data->fcdPosition == data->string) {
2681             /* at the start of the string, just dump it into the normalizer */
2682             insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2683             data->fcdPosition = NULL;
2684             return *(data->pos - 1);
2685         }
2686         pNull  = data->pos - 1;
2687         start  = data->fcdPosition;
2688         ch     = *start;
2689         prevch = *(start - 1);
2690     }
2691     /*
2692     * if the current character is not fcd.
2693     * Trailing combining class == 0.
2694     */
2695     if (data->fcdPosition > start &&
2696        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2697     {
2698         /*
2699         Need a more complete FCD check and possible normalization.
2700         normalize substring will be appended to buffer
2701         */
2702         UChar *backuppos = data->pos;
2703         data->pos = start;
2704         if (collPrevIterFCD(data)) {
2705             normalizePrevContraction(data, status);
2706             return *(data->pos - 1);
2707         }
2708         data->pos = backuppos;
2709         data->fcdPosition ++;
2710     }
2711
2712     if (innormbuf) {
2713     /*
2714     no normalization is to be done hence only one character will be
2715     appended to the buffer.
2716     */
2717         insertBufferFront(data, pNull, ch);
2718         data->fcdPosition --;
2719     }
2720
2721     return ch;
2722 }
2723
2724 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2725 /* It is called by getNextCE */
2726
2727 /* The following should be even */
2728 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2729
2730 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2731     collIterateState entryState;
2732     backupState(source, &entryState);
2733     UChar32 cp = ch;
2734
2735     for (;;) {
2736         // This loop will repeat only in the case of contractions, and only when a contraction
2737         //   is found and the first CE resulting from that contraction is itself a special
2738         //   (an expansion, for example.)  All other special CE types are fully handled the
2739         //   first time through, and the loop exits.
2740
2741         const uint32_t *CEOffset = NULL;
2742         switch(getCETag(CE)) {
2743         case NOT_FOUND_TAG:
2744             /* This one is not found, and we'll let somebody else bother about it... no more games */
2745             return CE;
2746         case SPEC_PROC_TAG:
2747             {
2748                 // Special processing is getting a CE that is preceded by a certain prefix
2749                 // Currently this is only needed for optimizing Japanese length and iteration marks.
2750                 // When we encouter a special processing tag, we go backwards and try to see if
2751                 // we have a match.
2752                 // Contraction tables are used - so the whole process is not unlike contraction.
2753                 // prefix data is stored backwards in the table.
2754                 const UChar *UCharOffset;
2755                 UChar schar, tchar;
2756                 collIterateState prefixState;
2757                 backupState(source, &prefixState);
2758                 loadState(source, &entryState, TRUE);
2759                 goBackOne(source); // We want to look at the point where we entered - actually one
2760                 // before that...
2761
2762                 for(;;) {
2763                     // This loop will run once per source string character, for as long as we
2764                     //  are matching a potential contraction sequence
2765
2766                     // First we position ourselves at the begining of contraction sequence
2767                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2768                     if (collIter_bos(source)) {
2769                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2770                         break;
2771                     }
2772                     schar = getPrevNormalizedChar(source, status);
2773                     goBackOne(source);
2774
2775                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2776                         UCharOffset++;
2777                     }
2778
2779                     if (schar == tchar) {
2780                         // Found the source string char in the table.
2781                         //  Pick up the corresponding CE from the table.
2782                         CE = *(coll->contractionCEs +
2783                             (UCharOffset - coll->contractionIndex));
2784                     }
2785                     else
2786                     {
2787                         // Source string char was not in the table.
2788                         //   We have not found the prefix.
2789                         CE = *(coll->contractionCEs +
2790                             (ContractionStart - coll->contractionIndex));
2791                     }
2792
2793                     if(!isPrefix(CE)) {
2794                         // The source string char was in the contraction table, and the corresponding
2795                         //   CE is not a prefix CE.  We found the prefix, break
2796                         //   out of loop, this CE will end up being returned.  This is the normal
2797                         //   way out of prefix handling when the source actually contained
2798                         //   the prefix.
2799                         break;
2800                     }
2801                 }
2802                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2803                     loadState(source, &prefixState, TRUE);
2804                     if(source->origFlags & UCOL_USE_ITERATOR) {
2805                         source->flags = source->origFlags;
2806                     }
2807                 } else { // prefix search was a failure, we have to backup all the way to the start
2808                     loadState(source, &entryState, TRUE);
2809                 }
2810                 break;
2811             }
2812         case CONTRACTION_TAG:
2813             {
2814                 /* This should handle contractions */
2815                 collIterateState state;
2816                 backupState(source, &state);
2817                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2818                 const UChar *UCharOffset;
2819                 UChar schar, tchar;
2820
2821                 for (;;) {
2822                     /* This loop will run once per source string character, for as long as we     */
2823                     /*  are matching a potential contraction sequence                  */
2824
2825                     /* First we position ourselves at the begining of contraction sequence */
2826                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2827
2828                     if (collIter_eos(source)) {
2829                         // Ran off the end of the source string.
2830                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2831                         // So we'll pick whatever we have at the point...
2832                         if (CE == UCOL_NOT_FOUND) {
2833                             // back up the source over all the chars we scanned going into this contraction.
2834                             CE = firstCE;
2835                             loadState(source, &state, TRUE);
2836                             if(source->origFlags & UCOL_USE_ITERATOR) {
2837                                 source->flags = source->origFlags;
2838                             }
2839                         }
2840                         break;
2841                     }
2842
2843                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2844                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2845
2846                     schar = getNextNormalizedChar(source);
2847                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2848                         UCharOffset++;
2849                     }
2850
2851                     if (schar == tchar) {
2852                         // Found the source string char in the contraction table.
2853                         //  Pick up the corresponding CE from the table.
2854                         CE = *(coll->contractionCEs +
2855                             (UCharOffset - coll->contractionIndex));
2856                     }
2857                     else
2858                     {
2859                         // Source string char was not in contraction table.
2860                         //   Unless we have a discontiguous contraction, we have finished
2861                         //   with this contraction.
2862                         // in order to do the proper detection, we
2863                         // need to see if we're dealing with a supplementary
2864                         /* We test whether the next two char are surrogate pairs.
2865                         * This test is done if the iterator is not NULL.
2866                         * If there is no surrogate pair, the iterator
2867                         * goes back one if needed. */
2868                         UChar32 miss = schar;
2869                         if (source->iterator) {
2870                             UChar32 surrNextChar; /* the next char in the iteration to test */
2871                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2872                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2873                                 prevPos = source->iterator->index;
2874                                 surrNextChar = getNextNormalizedChar(source);
2875                                 if (U16_IS_TRAIL(surrNextChar)) {
2876                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2877                                 } else if (prevPos < source->iterator->index){
2878                                     goBackOne(source);
2879                                 }
2880                             }
2881                         } else if (U16_IS_LEAD(schar)) {
2882                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2883                         }
2884
2885                         uint8_t sCC;
2886                         if (miss < 0x300 ||
2887                             maxCC == 0 ||
2888                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2889                             sCC>maxCC ||
2890                             (allSame != 0 && sCC == maxCC) ||
2891                             collIter_eos(source))
2892                         {
2893                             //  Contraction can not be discontiguous.
2894                             goBackOne(source);  // back up the source string by one,
2895                             //  because  the character we just looked at was
2896                             //  not part of the contraction.   */
2897                             if(U_IS_SUPPLEMENTARY(miss)) {
2898                                 goBackOne(source);
2899                             }
2900                             CE = *(coll->contractionCEs +
2901                                 (ContractionStart - coll->contractionIndex));
2902                         } else {
2903                             //
2904                             // Contraction is possibly discontiguous.
2905                             //   Scan more of source string looking for a match
2906                             //
2907                             UChar tempchar;
2908                             /* find the next character if schar is not a base character
2909                             and we are not yet at the end of the string */
2910                             tempchar = getNextNormalizedChar(source);
2911                             // probably need another supplementary thingie here
2912                             goBackOne(source);
2913                             if (i_getCombiningClass(tempchar, coll) == 0) {
2914                                 goBackOne(source);
2915                                 if(U_IS_SUPPLEMENTARY(miss)) {
2916                                     goBackOne(source);
2917                                 }
2918                                 /* Spit out the last char of the string, wasn't tasty enough */
2919                                 CE = *(coll->contractionCEs +
2920                                     (ContractionStart - coll->contractionIndex));
2921                             } else {
2922                                 CE = getDiscontiguous(coll, source, ContractionStart);
2923                             }
2924                         }
2925                     } // else after if(schar == tchar)
2926
2927                     if(CE == UCOL_NOT_FOUND) {
2928                         /* The Source string did not match the contraction that we were checking.  */
2929                         /*  Back up the source position to undo the effects of having partially    */
2930                         /*   scanned through what ultimately proved to not be a contraction.       */
2931                         loadState(source, &state, TRUE);
2932                         CE = firstCE;
2933                         break;
2934                     }
2935
2936                     if(!isContraction(CE)) {
2937                         // The source string char was in the contraction table, and the corresponding
2938                         //   CE is not a contraction CE.  We completed the contraction, break
2939                         //   out of loop, this CE will end up being returned.  This is the normal
2940                         //   way out of contraction handling when the source actually contained
2941                         //   the contraction.
2942                         break;
2943                     }
2944
2945
2946                     // The source string char was in the contraction table, and the corresponding
2947                     //   CE is IS  a contraction CE.  We will continue looping to check the source
2948                     //   string for the remaining chars in the contraction.
2949                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2950                     if(tempCE != UCOL_NOT_FOUND) {
2951                         // We have scanned a a section of source string for which there is a
2952                         //  CE from the contraction table.  Remember the CE and scan position, so
2953                         //  that we can return to this point if further scanning fails to
2954                         //  match a longer contraction sequence.
2955                         firstCE = tempCE;
2956
2957                         goBackOne(source);
2958                         backupState(source, &state);
2959                         getNextNormalizedChar(source);
2960
2961                         // Another way to do this is:
2962                         //collIterateState tempState;
2963                         //backupState(source, &tempState);
2964                         //goBackOne(source);
2965                         //backupState(source, &state);
2966                         //loadState(source, &tempState, TRUE);
2967
2968                         // The problem is that for incomplete contractions we have to remember the previous
2969                         // position. Before, the only thing I needed to do was state.pos--;
2970                         // After iterator introduction and especially after introduction of normalizing
2971                         // iterators, it became much more difficult to decrease the saved state.
2972                         // I'm not yet sure which of the two methods above is faster.
2973                     }
2974                 } // for(;;)
2975                 break;
2976             } // case CONTRACTION_TAG:
2977         case LONG_PRIMARY_TAG:
2978             {
2979                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2980                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2981                 source->offsetRepeatCount += 1;
2982                 return CE;
2983             }
2984         case EXPANSION_TAG:
2985             {
2986                 /* This should handle expansion. */
2987                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
2988                 /* I have to decide where continuations are going to be dealt with */
2989                 uint32_t size;
2990                 uint32_t i;    /* general counter */
2991
2992                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2993                 size = getExpansionCount(CE);
2994                 CE = *CEOffset++;
2995               //source->offsetRepeatCount = -1;
2996
2997                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2998                     for(i = 1; i<size; i++) {
2999                         *(source->CEpos++) = *CEOffset++;
3000                         source->offsetRepeatCount += 1;
3001                     }
3002                 } else { /* else, we do */
3003                     while(*CEOffset != 0) {
3004                         *(source->CEpos++) = *CEOffset++;
3005                         source->offsetRepeatCount += 1;
3006                     }
3007                 }
3008
3009                 return CE;
3010             }
3011         case DIGIT_TAG:
3012             {
3013                 /*
3014                 We do a check to see if we want to collate digits as numbers; if so we generate
3015                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3016                 */
3017                 //uint32_t size;
3018                 uint32_t i;    /* general counter */
3019
3020                 if (source->coll->numericCollation == UCOL_ON){
3021                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3022                     UChar32 char32 = 0;
3023                     int32_t digVal = 0;
3024
3025                     uint32_t digIndx = 0;
3026                     uint32_t endIndex = 0;
3027                     uint32_t trailingZeroIndex = 0;
3028
3029                     uint8_t collateVal = 0;
3030
3031                     UBool nonZeroValReached = FALSE;
3032
3033                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3034                     /*
3035                          We parse the source string until we hit a char that's NOT a digit.
3036                         Use this u_charDigitValue. This might be slow because we have to
3037                         handle surrogates...
3038                     */
3039             /*
3040                     if (U16_IS_LEAD(ch)){
3041                       if (!collIter_eos(source)) {
3042                         backupState(source, &digitState);
3043                         UChar trail = getNextNormalizedChar(source);
3044                         if(U16_IS_TRAIL(trail)) {
3045                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3046                         } else {
3047                           loadState(source, &digitState, TRUE);
3048                           char32 = ch;
3049                         }
3050                       } else {
3051                         char32 = ch;
3052                       }
3053                     } else {
3054                       char32 = ch;
3055                     }
3056                     digVal = u_charDigitValue(char32);
3057             */
3058                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
3059                     // already processed possible supplementaries that trigered the digit tag -
3060                     // all supplementaries are marked in the UCA.
3061                     /*
3062                         We  pad a zero in front of the first element anyways. This takes
3063                         care of the (probably) most common case where people are sorting things followed
3064                         by a single digit
3065                     */
3066                     digIndx++;
3067                     for(;;){
3068                         // Make sure we have enough space. No longer needed;
3069                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3070                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3071                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3072
3073                         // Skipping over leading zeroes.
3074                         if (digVal != 0) {
3075                             nonZeroValReached = TRUE;
3076                         }
3077                         if (nonZeroValReached) {
3078                             /*
3079                             We parse the digit string into base 100 numbers (this fits into a byte).
3080                             We only add to the buffer in twos, thus if we are parsing an odd character,
3081                             that serves as the 'tens' digit while the if we are parsing an even one, that
3082                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3083                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3084                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3085                             than all the other bytes.
3086                             */
3087
3088                             if (digIndx % 2 == 1){
3089                                 collateVal += (uint8_t)digVal;
3090
3091                                 // We don't enter the low-order-digit case unless we've already seen
3092                                 // the high order, or for the first digit, which is always non-zero.
3093                                 if (collateVal != 0)
3094                                     trailingZeroIndex = 0;
3095
3096                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3097                                 collateVal = 0;
3098                             }
3099                             else{
3100                                 // We drop the collation value into the buffer so if we need to do
3101                                 // a "front patch" we don't have to check to see if we're hitting the
3102                                 // last element.
3103                                 collateVal = (uint8_t)(digVal * 10);
3104
3105                                 // Check for trailing zeroes.
3106                                 if (collateVal == 0)
3107                                 {
3108                                     if (!trailingZeroIndex)
3109                                         trailingZeroIndex = (digIndx/2) + 2;
3110                                 }
3111                                 else
3112                                     trailingZeroIndex = 0;
3113
3114                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3115                             }
3116                             digIndx++;
3117                         }
3118
3119                         // Get next character.
3120                         if (!collIter_eos(source)){
3121                             ch = getNextNormalizedChar(source);
3122                             if (U16_IS_LEAD(ch)){
3123                                 if (!collIter_eos(source)) {
3124                                     backupState(source, &digitState);
3125                                     UChar trail = getNextNormalizedChar(source);
3126                                     if(U16_IS_TRAIL(trail)) {
3127                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3128                                     } else {
3129                                         loadState(source, &digitState, TRUE);
3130                                         char32 = ch;
3131                                     }
3132                                 }
3133                             } else {
3134                                 char32 = ch;
3135                             }
3136
3137                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3138                                 // Resetting position to point to the next unprocessed char. We
3139                                 // overshot it when doing our test/set for numbers.
3140                                 if (char32 > 0xFFFF) { // For surrogates.
3141                                     loadState(source, &digitState, TRUE);
3142                                     //goBackOne(source);
3143                                 }
3144                                 goBackOne(source);
3145                                 break;
3146                             }
3147                         } else {
3148                             break;
3149                         }
3150                     }
3151
3152                     if (nonZeroValReached == FALSE){
3153                         digIndx = 2;
3154                         numTempBuf[2] = 6;
3155                     }
3156
3157                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3158                     if (digIndx % 2 != 0){
3159                         /*
3160                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3161                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3162                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3163                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3164                         */
3165
3166                         for(i = 2; i < endIndex; i++){
3167                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3168                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3169                         }
3170                         --digIndx;
3171                     }
3172
3173                     // Subtract one off of the last byte.
3174                     numTempBuf[endIndex-1] -= 1;
3175
3176                     /*
3177                     We want to skip over the first two slots in the buffer. The first slot
3178                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3179                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3180                     */
3181                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3182                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3183
3184                     // Now transfer the collation key to our collIterate struct.
3185                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3186                     //size = ((endIndex+1) & ~1)/2;
3187                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3188                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3189                         UCOL_BYTE_COMMON; // Tertiary weight.
3190                     i = 2; // Reset the index into the buffer.
3191                     while(i < endIndex)
3192                     {
3193                         uint32_t primWeight = numTempBuf[i++] << 8;
3194                         if ( i < endIndex)
3195                             primWeight |= numTempBuf[i++];
3196                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3197                     }
3198
3199                 } else {
3200                     // no numeric mode, we'll just switch to whatever we stashed and continue
3201                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3202                     CE = *CEOffset++;
3203                     break;
3204                 }
3205                 return CE;
3206             }
3207             /* various implicits optimization */
3208         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3209             /* UCA is filled with these. Tailorings are NOT_FOUND */
3210             return getImplicit(cp, source);
3211         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3212             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3213             return getImplicit(cp, source);
3214         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3215             {
3216                 static const uint32_t
3217                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3218                 //const uint32_t LCount = 19;
3219                 static const uint32_t VCount = 21;
3220                 static const uint32_t TCount = 28;
3221                 //const uint32_t NCount = VCount * TCount;   // 588
3222                 //const uint32_t SCount = LCount * NCount;   // 11172
3223                 uint32_t L = ch - SBase;
3224
3225                 // divide into pieces
3226
3227                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3228                 L /= TCount;
3229                 uint32_t V = L % VCount;
3230                 L /= VCount;
3231
3232                 // offset them
3233
3234                 L += LBase;
3235                 V += VBase;
3236                 T += TBase;
3237
3238                 // return the first CE, but first put the rest into the expansion buffer
3239                 if (!source->coll->image->jamoSpecial) { // FAST PATH
3240
3241                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3242                     if (T != TBase) {
3243                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3244                     }
3245
3246                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3247
3248                 } else { // Jamo is Special
3249                     // Since Hanguls pass the FCD check, it is
3250                     // guaranteed that we won't be in
3251                     // the normalization buffer if something like this happens
3252                     // However, if we are using a uchar iterator and normalization
3253                     // is ON, the Hangul that lead us here is going to be in that
3254                     // normalization buffer. Here we want to restore the uchar
3255                     // iterator state and pull out of the normalization buffer
3256                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3257                         source->flags = source->origFlags; // restore the iterator
3258                         source->pos = NULL;
3259                     }
3260                     // Move Jamos into normalization buffer
3261                     source->writableBuffer[0] = (UChar)L;
3262                     source->writableBuffer[1] = (UChar)V;
3263                     if (T != TBase) {
3264                         source->writableBuffer[2] = (UChar)T;
3265                         source->writableBuffer[3] = 0;
3266                     } else {
3267                         source->writableBuffer[2] = 0;
3268                     }
3269
3270                     source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3271                     //   after exhausting the writableBuffer
3272                     source->pos   = source->writableBuffer;
3273                     source->origFlags   = source->flags;
3274                     source->flags       |= UCOL_ITER_INNORMBUF;
3275                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3276
3277                     return(UCOL_IGNORABLE);
3278                 }
3279             }
3280         case SURROGATE_TAG:
3281             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3282             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3283             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3284             /* we return 0 (completely ignorable - per UCA specification */
3285             {
3286                 UChar trail;
3287                 collIterateState state;
3288                 backupState(source, &state);
3289                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3290                     // we chould have stepped one char forward and it might have turned that it
3291                     // was not a trail surrogate. In that case, we have to backup.
3292                     loadState(source, &state, TRUE);
3293                     return 0;
3294                 } else {
3295                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3296                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3297                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3298                         // We need to backup
3299                         loadState(source, &state, TRUE);
3300                         return CE;
3301                     }
3302                     // calculate the supplementary code point value, if surrogate was not tailored
3303                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3304                 }
3305             }
3306             break;
3307         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3308             UChar nextChar;
3309             if( source->flags & UCOL_USE_ITERATOR) {
3310                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3311                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3312                     source->iterator->next(source->iterator);
3313                     return getImplicit(cp, source);
3314                 } else {
3315                     return 0;
3316                 }
3317             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3318                 U_IS_TRAIL((nextChar=*source->pos))) {
3319                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3320                     source->pos++;
3321                     return getImplicit(cp, source);
3322             } else {
3323                 return 0; /* completely ignorable */
3324             }
3325         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3326             return 0; /* broken surrogate sequence */
3327         case CHARSET_TAG:
3328             /* not yet implemented */
3329             /* probably after 1.8 */
3330             return UCOL_NOT_FOUND;
3331         default:
3332             *status = U_INTERNAL_PROGRAM_ERROR;
3333             CE=0;
3334             break;
3335     }
3336     if (CE <= UCOL_NOT_FOUND) break;
3337   }
3338   return CE;
3339 }
3340
3341
3342 /* now uses Mark's getImplicitPrimary code */
3343 static
3344 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3345     if(isNonChar(cp)) {
3346         return 0;
3347     }
3348
3349     uint32_t r = uprv_uca_getImplicitPrimary(cp);
3350
3351     *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3352     collationSource->toReturn = collationSource->CEpos;
3353
3354     if (collationSource->offsetBuffer == NULL) {
3355         collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3356         collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3357         collationSource->offsetStore = collationSource->offsetBuffer;
3358     }
3359
3360     // **** doesn't work if using iterator ****
3361     if (collationSource->flags & UCOL_ITER_INNORMBUF) {
3362       collationSource->offsetRepeatCount = 1;
3363     } else {
3364       int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
3365
3366       *(collationSource->offsetStore++) = firstOffset;
3367       *(collationSource->offsetStore++) = firstOffset + 1;
3368
3369         collationSource->offsetReturn = collationSource->offsetStore - 1;
3370         *(collationSource->offsetBuffer) = firstOffset;
3371         if (collationSource->offsetReturn == collationSource->offsetBuffer) {
3372             collationSource->offsetStore = collationSource->offsetBuffer;
3373         }
3374     }
3375
3376     return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3377 }
3378
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
3384 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3385                           collIterate *source,
3386                           UErrorCode *status)
3387 {
3388     const uint32_t *CEOffset    = NULL;
3389           UChar    *UCharOffset = NULL;
3390           UChar    schar;
3391     const UChar    *constart    = NULL;
3392           uint32_t size;
3393           UChar    buffer[UCOL_MAX_BUFFER];
3394           uint32_t *endCEBuffer;
3395           UChar   *strbuffer;
3396           int32_t noChars = 0;
3397           int32_t CECount = 0;
3398
3399     for(;;)
3400     {
3401         /* the only ces that loops are thai and contractions */
3402         switch (getCETag(CE))
3403         {
3404         case NOT_FOUND_TAG:  /* this tag always returns */
3405             return CE;
3406
3407         case SPEC_PROC_TAG:
3408             {
3409                 // Special processing is getting a CE that is preceded by a certain prefix
3410                 // Currently this is only needed for optimizing Japanese length and iteration marks.
3411                 // When we encouter a special processing tag, we go backwards and try to see if
3412                 // we have a match.
3413                 // Contraction tables are used - so the whole process is not unlike contraction.
3414                 // prefix data is stored backwards in the table.
3415                 const UChar *UCharOffset;
3416                 UChar schar, tchar;
3417                 collIterateState prefixState;
3418                 backupState(source, &prefixState);
3419                 for(;;) {
3420                     // This loop will run once per source string character, for as long as we
3421                     //  are matching a potential contraction sequence
3422
3423                     // First we position ourselves at the begining of contraction sequence
3424                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3425
3426                     if (collIter_bos(source)) {
3427                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3428                         break;
3429                     }
3430                     schar = getPrevNormalizedChar(source, status);
3431                     goBackOne(source);
3432
3433                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3434                         UCharOffset++;
3435                     }
3436
3437                     if (schar == tchar) {
3438                         // Found the source string char in the table.
3439                         //  Pick up the corresponding CE from the table.
3440                         CE = *(coll->contractionCEs +
3441                             (UCharOffset - coll->contractionIndex));
3442                     }
3443                     else
3444                     {
3445                         // if there is a completely ignorable code point in the middle of
3446                         // a prefix, we need to act as if it's not there
3447                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3448                         // lone surrogates cannot be set to zero as it would break other processing
3449                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3450                         // it's easy for BMP code points
3451                         if(isZeroCE == 0) {
3452                             continue;
3453                         } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
3454                             // for supplementary code points, we have to check the next one
3455                             // situations where we are going to ignore
3456                             // 1. beginning of the string: schar is a lone surrogate
3457                             // 2. schar is a lone surrogate
3458                             // 3. schar is a trail surrogate in a valid surrogate sequence
3459                             //    that is explicitly set to zero.
3460                             if (!collIter_bos(source)) {
3461                                 UChar lead;
3462                                 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3463                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3464                                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3465                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3466                                         if(finalCE == 0) {
3467                                             // this is a real, assigned completely ignorable code point
3468                                             goBackOne(source);
3469                                             continue;
3470                                         }
3471                                     }
3472                                 } else {
3473                                     // lone surrogate, completely ignorable
3474                                     continue;
3475                                 }
3476                             } else {
3477                                 // lone surrogate at the beggining, completely ignorable
3478                                 continue;
3479                             }
3480                         }
3481                         // Source string char was not in the table.
3482                         //   We have not found the prefix.
3483                         CE = *(coll->contractionCEs +
3484                             (ContractionStart - coll->contractionIndex));
3485                     }
3486
3487                     if(!isPrefix(CE)) {
3488                         // The source string char was in the contraction table, and the corresponding
3489                         //   CE is not a prefix CE.  We found the prefix, break
3490                         //   out of loop, this CE will end up being returned.  This is the normal
3491                         //   way out of prefix handling when the source actually contained
3492                         //   the prefix.
3493                         break;
3494                     }
3495                 }
3496                 loadState(source, &prefixState, TRUE);
3497                 break;
3498             }
3499
3500         case CONTRACTION_TAG:
3501             /* to ensure that the backwards and forwards iteration matches, we
3502             take the current region of most possible match and pass it through
3503             the forward iteration. this will ensure that the obstinate problem of
3504             overlapping contractions will not occur.
3505             */
3506             schar = peekCharacter(source, 0);
3507             constart = (UChar *)coll->image + getContractOffset(CE);
3508             if (isAtStartPrevIterate(source)
3509                 /* commented away contraction end checks after adding the checks
3510                 in getPrevCE  */) {
3511                     /* start of string or this is not the end of any contraction */
3512                     CE = *(coll->contractionCEs +
3513                         (constart - coll->contractionIndex));
3514                     break;
3515             }
3516             strbuffer = buffer;
3517             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3518             *(UCharOffset --) = 0;
3519             noChars = 0;
3520             // have to swap thai characters
3521             while (ucol_unsafeCP(schar, coll)) {
3522                 *(UCharOffset) = schar;
3523                 noChars++;
3524                 UCharOffset --;
3525                 schar = getPrevNormalizedChar(source, status);
3526                 goBackOne(source);
3527                 // TODO: when we exhaust the contraction buffer,
3528                 // it needs to get reallocated. The problem is
3529                 // that the size depends on the string which is
3530                 // not iterated over. However, since we're travelling
3531                 // backwards, we already had to set the iterator at
3532                 // the end - so we might as well know where we are?
3533                 if (UCharOffset + 1 == buffer) {
3534                     /* we have exhausted the buffer */
3535                     int32_t newsize = 0;
3536                     if(source->pos) { // actually dealing with a position
3537                         newsize = source->pos - source->string + 1;
3538                     } else { // iterator
3539                         newsize = 4 * UCOL_MAX_BUFFER;
3540                     }
3541                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3542                         (newsize + UCOL_MAX_BUFFER));
3543                     /* test for NULL */
3544                     if (strbuffer == NULL) {
3545                         *status = U_MEMORY_ALLOCATION_ERROR;
3546                         return UCOL_NO_MORE_CES;
3547                     }
3548                     UCharOffset = strbuffer + newsize;
3549                     uprv_memcpy(UCharOffset, buffer,
3550                         UCOL_MAX_BUFFER * sizeof(UChar));
3551                     UCharOffset --;
3552                 }
3553                 if ((source->pos && (source->pos == source->string ||
3554                     ((source->flags & UCOL_ITER_INNORMBUF) &&
3555                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3556                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3557                         break;
3558                 }
3559             }
3560             /* adds the initial base character to the string */
3561             *(UCharOffset) = schar;
3562             noChars++;
3563
3564             int32_t offsetBias;
3565
3566 #if 0
3567             if (source->offsetReturn != NULL) {
3568                 source->offsetStore = source->offsetReturn - noChars;
3569             }
3570
3571             // **** doesn't work if using iterator ****
3572             if (source->flags & UCOL_ITER_INNORMBUF) {
3573                 if (source->fcdPosition == NULL) {
3574                     offsetBias = 0;
3575                 } else {
3576                     offsetBias = (int32_t)(source->fcdPosition - source->string);
3577                 }
3578             } else {
3579                 offsetBias = (int32_t)(source->pos - source->string);
3580             }
3581
3582 #else
3583             // **** doesn't work if using iterator ****
3584             if (source->flags & UCOL_ITER_INNORMBUF) {
3585 #if 1
3586                 offsetBias = -1;
3587 #else
3588               if (source->fcdPosition == NULL) {
3589                   offsetBias = 0;
3590               } else {
3591                   offsetBias = (int32_t)(source->fcdPosition - source->string);
3592               }
3593 #endif
3594             } else {
3595                 offsetBias = (int32_t)(source->pos - source->string);
3596             }
3597 #endif
3598
3599             /* a new collIterate is used to simplify things, since using the current
3600             collIterate will mean that the forward and backwards iteration will
3601             share and change the same buffers. we don't want to get into that. */
3602             collIterate temp;
3603             int32_t rawOffset;
3604
3605             //IInit_collIterate(coll, UCharOffset, -1, &temp);
3606             IInit_collIterate(coll, UCharOffset, noChars, &temp);
3607             temp.flags &= ~UCOL_ITER_NORM;
3608
3609             rawOffset = temp.pos - temp.string; // should always be zero?
3610             CE = ucol_IGetNextCE(coll, &temp, status);
3611
3612             if (source->extendCEs) {
3613                 endCEBuffer = source->extendCEs + source->extendCEsSize;
3614                 CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t);
3615             } else {
3616                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3617                 CECount = (source->CEpos - source->CEs)/sizeof(uint32_t);
3618             }
3619
3620             if (source->offsetBuffer == NULL) {
3621                 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3622                 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3623                 source->offsetStore = source->offsetBuffer;
3624             }
3625
3626             while (CE != UCOL_NO_MORE_CES) {
3627                 *(source->CEpos ++) = CE;
3628
3629                 if (offsetBias >= 0) {
3630                     *(source->offsetStore ++) = rawOffset + offsetBias;
3631                 }
3632
3633                 CECount++;
3634                 if (source->CEpos == endCEBuffer) {
3635                     /* ran out of CE space, reallocate to new buffer.
3636                     If reallocation fails, reset pointers and bail out,
3637                     there's no guarantee of the right character position after
3638                     this bail*/
3639                     if (source->extendCEs == NULL) {
3640                         source->extendCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t) *
3641                             (source->extendCEsSize =UCOL_EXPAND_CE_BUFFER_SIZE + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3642                         if (source->extendCEs == NULL) {
3643                             // Handle error later.
3644                             CECount = -1;
3645                         } else {
3646                             source->extendCEs = (uint32_t *)uprv_memcpy(source->extendCEs, source->CEs, UCOL_EXPAND_CE_BUFFER_SIZE * sizeof(uint32_t));
3647                         }
3648                     } else {
3649                         uint32_t *tempBufCE = (uint32_t *)uprv_realloc(source->extendCEs,
3650                             sizeof(uint32_t) * (source->extendCEsSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3651                         if (tempBufCE == NULL) {
3652                             // Handle error later.
3653                             CECount = -1;
3654                         }
3655                         else {
3656                             source->extendCEs = tempBufCE;
3657                         }
3658                     }
3659
3660                     if (CECount == -1) {
3661                         *status = U_MEMORY_ALLOCATION_ERROR;
3662                         source->extendCEsSize = 0;
3663                         source->CEpos = source->CEs;
3664                         freeHeapWritableBuffer(&temp);
3665
3666                         if (strbuffer != buffer) {
3667                             uprv_free(strbuffer);
3668                         }
3669
3670                         return (uint32_t)UCOL_NULLORDER;
3671                     }
3672
3673                     source->CEpos = source->extendCEs + CECount;
3674                     endCEBuffer = source->extendCEs + source->extendCEsSize;
3675                 }
3676
3677                 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
3678                     int32_t  storeIX = source->offsetStore - source->offsetBuffer;
3679                     int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
3680                         sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
3681
3682                     if (tob != NULL) {
3683                         source->offsetBuffer = tob;
3684                         source->offsetStore = &source->offsetBuffer[storeIX];
3685                         source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
3686                     } else {
3687                         // memory error...
3688                         *status = U_MEMORY_ALLOCATION_ERROR;
3689                         source->CEpos = source->CEs;
3690                         freeHeapWritableBuffer(&temp);
3691
3692                         if (strbuffer != buffer) {
3693                             uprv_free(strbuffer);
3694                         }
3695
3696                         return (uint32_t) UCOL_NULLORDER;
3697                     }
3698                 }
3699
3700                 rawOffset = temp.pos - temp.string;
3701                 CE = ucol_IGetNextCE(coll, &temp, status);
3702             }
3703
3704             if (source->offsetRepeatValue != 0) {
3705                 if (CECount > noChars) {
3706                     source->offsetRepeatCount += temp.offsetRepeatCount;
3707                 } else {
3708                     // **** does this really skip the right offsets? ****
3709                     source->offsetReturn -= (noChars - CECount);
3710                 }
3711             }
3712
3713             freeHeapWritableBuffer(&temp);
3714
3715             if (strbuffer != buffer) {
3716                 uprv_free(strbuffer);
3717             }
3718
3719             if (offsetBias >= 0) {
3720                 source->offsetReturn = source->offsetStore - 1;
3721                 if (source->offsetReturn == source->offsetBuffer) {
3722                     source->offsetStore = source->offsetBuffer;
3723                 }
3724             }
3725
3726             source->toReturn = source->CEpos - 1;
3727             if (source->toReturn == source->CEs) {
3728                 source->CEpos = source->CEs;
3729             }
3730
3731             return *(source->toReturn);
3732
3733         case LONG_PRIMARY_TAG:
3734             {
3735                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3736                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3737                 source->toReturn = source->CEpos - 1;
3738
3739                 if (source->offsetBuffer == NULL) {
3740                     source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3741                     source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3742                     source->offsetStore = source->offsetBuffer;
3743                 }
3744
3745                 if (source->flags & UCOL_ITER_INNORMBUF) {
3746                     source->offsetRepeatCount = 1;
3747                 } else {
3748                   int32_t firstOffset = (int32_t)(source->pos - source->string);
3749
3750                   *(source->offsetStore++) = firstOffset;
3751                   *(source->offsetStore++) = firstOffset + 1;
3752
3753                     source->offsetReturn = source->offsetStore - 1;
3754                     *(source->offsetBuffer) = firstOffset;
3755                     if (source->offsetReturn == source->offsetBuffer) {
3756                         source->offsetStore = source->offsetBuffer;
3757                     }
3758                 }
3759
3760
3761                 return *(source->toReturn);
3762             }
3763
3764         case EXPANSION_TAG: /* this tag always returns */
3765             {
3766             /*
3767             This should handle expansion.
3768             NOTE: we can encounter both continuations and expansions in an expansion!
3769             I have to decide where continuations are going to be dealt with
3770             */
3771             int32_t firstOffset = (int32_t)(source->pos - source->string);
3772
3773             // **** doesn't work if using iterator ****
3774             if (source->offsetReturn != NULL) {
3775                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3776                     source->offsetStore = source->offsetBuffer;
3777                 }else {
3778                   firstOffset = -1;
3779                 }
3780             }
3781
3782             if (source->offsetBuffer == NULL) {
3783                 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
3784                 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
3785                 source->offsetStore = source->offsetBuffer;
3786             }
3787
3788             /* find the offset to expansion table */
3789             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3790             size     = getExpansionCount(CE);
3791             if (size != 0) {
3792                 /*
3793                 if there are less than 16 elements in expansion, we don't terminate
3794                 */
3795                 uint32_t count;
3796
3797                 for (count = 0; count < size; count++) {
3798                     *(source->CEpos ++) = *CEOffset++;
3799
3800                     if (firstOffset >= 0) {
3801                         *(source->offsetStore ++) = firstOffset + 1;
3802                     }
3803                 }
3804             } else {
3805                 /* else, we do */
3806                 while (*CEOffset != 0) {
3807                     *(source->CEpos ++) = *CEOffset ++;
3808
3809                     if (firstOffset >= 0) {
3810                         *(source->offsetStore ++) = firstOffset + 1;
3811                     }
3812                 }
3813             }
3814
3815             if (firstOffset >= 0) {
3816                 source->offsetReturn = source->offsetStore - 1;
3817                 *(source->offsetBuffer) = firstOffset;
3818                 if (source->offsetReturn == source->offsetBuffer) {
3819                     source->offsetStore = source->offsetBuffer;
3820                 }
3821             } else {
3822                 source->offsetRepeatCount += size - 1;
3823             }
3824
3825             source->toReturn = source->CEpos - 1;
3826             // in case of one element expansion, we
3827             // want to immediately return CEpos
3828             if(source->toReturn == source->CEs) {
3829                 source->CEpos = source->CEs;
3830             }
3831
3832             return *(source->toReturn);
3833             }
3834
3835         case DIGIT_TAG:
3836             {
3837                 /*
3838                 We do a check to see if we want to collate digits as numbers; if so we generate
3839                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3840                 */
3841                 //uint32_t size;
3842                 uint32_t i;    /* general counter */
3843
3844                 if (source->coll->numericCollation == UCOL_ON){
3845                     uint32_t digIndx = 0;
3846                     uint32_t endIndex = 0;
3847                     uint32_t leadingZeroIndex = 0;
3848                     uint32_t trailingZeroCount = 0;
3849
3850                     uint8_t collateVal = 0;
3851
3852                     UBool nonZeroValReached = FALSE;
3853
3854                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3855                     /*
3856                     We parse the source string until we hit a char that's NOT a digit.
3857                     Use this u_charDigitValue. This might be slow because we have to
3858                     handle surrogates...
3859                     */
3860                     /*
3861                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3862                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3863                     element we process when going backward. To determine how long that chunk might be, we may need to make
3864                     two passes through the loop that collects digits - one to see how long the string is (and how much is
3865                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3866                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3867                     element chunk after resetting the state to the initialState at the right side of the digit string.
3868                     */
3869                     uint32_t ceLimit = 0;
3870                     UChar initial_ch = ch;
3871                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3872                     backupState(source, &initialState);
3873
3874                     for(;;) {
3875                         collIterateState state = {0,0,0,0,0,0,0,0,0};
3876                         UChar32 char32 = 0;
3877                         int32_t digVal = 0;
3878
3879                         if (U16_IS_TRAIL (ch)) {
3880                             if (!collIter_bos(source)){
3881                                 UChar lead = getPrevNormalizedChar(source, status);
3882                                 if(U16_IS_LEAD(lead)) {
3883                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3884                                     goBackOne(source);
3885                                 } else {
3886                                     char32 = ch;
3887                                 }
3888                             } else {
3889                                 char32 = ch;
3890                             }
3891                         } else {
3892                             char32 = ch;
3893                         }
3894                         digVal = u_charDigitValue(char32);
3895
3896                         for(;;) {
3897                             // Make sure we have enough space. No longer needed;
3898                             // at this point the largest value of digIndx when we need to save data in numTempBuf
3899                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3900                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3901
3902                             // Skip over trailing zeroes, and keep a count of them.
3903                             if (digVal != 0)
3904                                 nonZeroValReached = TRUE;
3905
3906                             if (nonZeroValReached) {
3907                                 /*
3908                                 We parse the digit string into base 100 numbers (this fits into a byte).
3909                                 We only add to the buffer in twos, thus if we are parsing an odd character,
3910                                 that serves as the 'tens' digit while the if we are parsing an even one, that
3911                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3912                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3913                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3914                                 than all the other bytes.
3915
3916                                 Since we're doing in this reverse we want to put the first digit encountered into the
3917                                 ones place and the second digit encountered into the tens place.
3918                                 */
3919
3920                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
3921                                     // High-order digit case (tens place)
3922                                     collateVal += (uint8_t)(digVal * 10);
3923
3924                                     // We cannot set leadingZeroIndex unless it has been set for the
3925                                     // low-order digit. Therefore, all we can do for the high-order
3926                                     // digit is turn it off, never on.
3927                                     // The only time we will have a high digit without a low is for
3928                                     // the very first non-zero digit, so no zero check is necessary.
3929                                     if (collateVal != 0)
3930                                         leadingZeroIndex = 0;
3931
3932                                     // The first pass through, digIndx may exceed the limit, but in that case
3933                                     // we no longer care about numTempBuf contents since they will be discarded
3934                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3935                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3936                                     }
3937                                     collateVal = 0;
3938                                 } else {
3939                                     // Low-order digit case (ones place)
3940                                     collateVal = (uint8_t)digVal;
3941
3942                                     // Check for leading zeroes.
3943                                     if (collateVal == 0) {
3944                                         if (!leadingZeroIndex)
3945                                             leadingZeroIndex = (digIndx/2) + 2;
3946                                     } else
3947                                         leadingZeroIndex = 0;
3948
3949                                     // No need to write to buffer; the case of a last odd digit
3950                                     // is handled below.
3951                                 }
3952                                 ++digIndx;
3953                             } else
3954                                 ++trailingZeroCount;
3955
3956                             if (!collIter_bos(source)) {
3957                                 ch = getPrevNormalizedChar(source, status);
3958                                 //goBackOne(source);
3959                                 if (U16_IS_TRAIL(ch)) {
3960                                     backupState(source, &state);
3961                                     if (!collIter_bos(source)) {
3962                                         goBackOne(source);
3963                                         UChar lead = getPrevNormalizedChar(source, status);
3964
3965                                         if(U16_IS_LEAD(lead)) {
3966                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3967                                         } else {
3968                                             loadState(source, &state, FALSE);
3969                                             char32 = ch;
3970                                         }
3971                                     }
3972                                 } else
3973                                     char32 = ch;
3974
3975                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3976                                     if (char32 > 0xFFFF) {// For surrogates.
3977                                         loadState(source, &state, FALSE);
3978                                     }
3979                                     // Don't need to "reverse" the goBackOne call,
3980                                     // as this points to the next position to process..
3981                                     //if (char32 > 0xFFFF) // For surrogates.
3982                                     //getNextNormalizedChar(source);
3983                                     break;
3984                                 }
3985
3986                                 goBackOne(source);
3987                             }else
3988                                 break;
3989                         }
3990
3991                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3992                             // our collation element is not too big, go ahead and finish with it
3993                             break;
3994                         }
3995                         // our digit string is too long for a collation element;
3996                         // set the limit for it, reset the state and begin again
3997                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3998                         if ( ceLimit == 0 ) {
3999                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
4000                         }
4001                         ch = initial_ch;
4002                         loadState(source, &initialState, FALSE);
4003                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
4004                         collateVal = 0;
4005                         nonZeroValReached = FALSE;
4006                     }
4007
4008                     if (! nonZeroValReached) {
4009                         digIndx = 2;
4010                         trailingZeroCount = 0;
4011                         numTempBuf[2] = 6;
4012                     }
4013
4014                     if ((digIndx + trailingZeroCount) % 2 != 0) {
4015                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
4016                         digIndx += 1;       // The implicit leading zero
4017                     }
4018                     if (trailingZeroCount % 2 != 0) {
4019                         // We had to consume one trailing zero for the low digit
4020                         // of the least significant byte
4021                         digIndx += 1;       // The trailing zero not in the exponent
4022                         trailingZeroCount -= 1;
4023                     }
4024
4025                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
4026
4027                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4028                     numTempBuf[2] -= 1;
4029
4030                     /*
4031                     We want to skip over the first two slots in the buffer. The first slot
4032                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4033                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4034                     The exponent must be adjusted by the number of leading zeroes, and the number of
4035                     trailing zeroes.
4036                     */
4037                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
4038                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
4039                     if (leadingZeroIndex)
4040                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
4041                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
4042
4043                     // Now transfer the collation key to our collIterate struct.
4044                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4045                     //size = ((endIndex+1) & ~1)/2;
4046                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
4047                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
4048                         UCOL_BYTE_COMMON; // Tertiary weight.
4049                     i = endIndex - 1; // Reset the index into the buffer.
4050                     while(i >= 2) {
4051                         uint32_t primWeight = numTempBuf[i--] << 8;
4052                         if ( i >= 2)
4053                             primWeight |= numTempBuf[i--];
4054                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
4055                     }
4056
4057                     source->toReturn = source->CEpos -1;
4058                     return *(source->toReturn);
4059                 } else {
4060                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4061                     CE = *(CEOffset++);
4062                     break;
4063                 }
4064             }
4065
4066         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
4067             {
4068                 static const uint32_t
4069                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4070                 //const uint32_t LCount = 19;
4071                 static const uint32_t VCount = 21;
4072                 static const uint32_t TCount = 28;
4073                 //const uint32_t NCount = VCount * TCount;   /* 588 */
4074                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
4075
4076                 uint32_t L = ch - SBase;
4077                 /*
4078                 divide into pieces.
4079                 we do it in this order since some compilers can do % and / in one
4080                 operation
4081                 */
4082                 uint32_t T = L % TCount;
4083                 L /= TCount;
4084                 uint32_t V = L % VCount;
4085                 L /= VCount;
4086
4087                 /* offset them */
4088                 L += LBase;
4089                 V += VBase;
4090                 T += TBase;
4091
4092                 if (source->offsetBuffer == NULL) {
4093                     source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
4094                     source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
4095                     source->offsetStore = source->offsetBuffer;
4096                 }
4097
4098               int32_t firstOffset = (int32_t)(source->pos - source->string);
4099
4100               *(source->offsetStore++) = firstOffset;
4101
4102                 /*
4103                  * return the first CE, but first put the rest into the expansion buffer
4104                  */
4105                 if (!source->coll->image->jamoSpecial) {
4106                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4107                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4108                     *(source->offsetStore++) = firstOffset + 1;
4109
4110                     if (T != TBase) {
4111                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4112                         *(source->offsetStore++) = firstOffset + 1;
4113                     }
4114
4115                     source->toReturn = source->CEpos - 1;
4116
4117                     source->offsetReturn = source->offsetStore - 1;
4118                     if (source->offsetReturn == source->offsetBuffer) {
4119                         source->offsetStore = source->offsetBuffer;
4120                     }
4121
4122                     return *(source->toReturn);
4123                 } else {
4124                     // Since Hanguls pass the FCD check, it is
4125                     // guaranteed that we won't be in
4126                     // the normalization buffer if something like this happens
4127                     // Move Jamos into normalization buffer
4128                     /*
4129                     Move the Jamos into the
4130                     normalization buffer
4131                     */
4132                     UChar *tempbuffer = source->writableBuffer +
4133                         (source->writableBufSize - 1);
4134                     *(tempbuffer) = 0;
4135                     if (T != TBase) {
4136                         *(tempbuffer - 1) = (UChar)T;
4137                         *(tempbuffer - 2) = (UChar)V;
4138                         *(tempbuffer - 3) = (UChar)L;
4139                         *(tempbuffer - 4) = 0;
4140                     } else {
4141                         *(tempbuffer - 1) = (UChar)V;
4142                         *(tempbuffer - 2) = (UChar)L;
4143                         *(tempbuffer - 3) = 0;
4144                     }
4145
4146                     /*
4147                     Indicate where to continue in main input string after exhausting
4148                     the writableBuffer
4149                     */
4150                     if (source->pos  == source->string) {
4151                         source->fcdPosition = NULL;
4152                     } else {
4153                         source->fcdPosition       = source->pos-1;
4154                     }
4155
4156                     source->pos               = tempbuffer;
4157                     source->origFlags         = source->flags;
4158                     source->flags            |= UCOL_ITER_INNORMBUF;
4159                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4160
4161                     return(UCOL_IGNORABLE);
4162                 }
4163             }
4164
4165         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4166 #if 0
4167             if (source->offsetBuffer == NULL) {
4168                 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
4169                 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
4170                 source->offsetStore = source->offsetBuffer;
4171             }
4172
4173             // **** doesn't work if using iterator ****
4174             if (source->flags & UCOL_ITER_INNORMBUF) {
4175               source->offsetRepeatCount = 1;
4176             } else {
4177               int32_t firstOffset = (int32_t)(source->pos - source->string);
4178
4179               *(source->offsetStore++) = firstOffset;
4180               *(source->offsetStore++) = firstOffset + 1;
4181
4182                 source->offsetReturn = source->offsetStore - 1;
4183                 if (source->offsetReturn == source->offsetBuffer) {
4184                     source->offsetStore = source->offsetBuffer;
4185                 }
4186             }
4187 #endif
4188
4189             return getPrevImplicit(ch, source);
4190
4191             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4192         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4193             return getPrevImplicit(ch, source);
4194
4195         case SURROGATE_TAG:  /* This is a surrogate pair */
4196             /* essentialy an engaged lead surrogate. */
4197             /* if you have encountered it here, it means that a */
4198             /* broken sequence was encountered and this is an error */
4199             return 0;
4200
4201         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4202             return 0; /* broken surrogate sequence */
4203
4204         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4205             {
4206                 UChar32 cp = 0;
4207                 UChar  prevChar;
4208                 UChar *prev;
4209                 if (isAtStartPrevIterate(source)) {
4210                     /* we are at the start of the string, wrong place to be at */
4211                     return 0;
4212                 }
4213                 if (source->pos != source->writableBuffer) {
4214                     prev     = source->pos - 1;
4215                 } else {
4216                     prev     = source->fcdPosition;
4217                 }
4218                 prevChar = *prev;
4219
4220                 /* Handles Han and Supplementary characters here.*/
4221                 if (U16_IS_LEAD(prevChar)) {
4222                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4223                     source->pos = prev;
4224                 } else {
4225                     return 0; /* completely ignorable */
4226                 }
4227
4228                 return getPrevImplicit(cp, source);
4229             }
4230
4231             /* UCA is filled with these. Tailorings are NOT_FOUND */
4232             /* not yet implemented */
4233         case CHARSET_TAG:  /* this tag always returns */
4234             /* probably after 1.8 */
4235             return UCOL_NOT_FOUND;
4236
4237         default:           /* this tag always returns */
4238             *status = U_INTERNAL_PROGRAM_ERROR;
4239             CE=0;
4240             break;
4241         }
4242
4243         if (CE <= UCOL_NOT_FOUND) {
4244             break;
4245         }
4246     }
4247
4248     return CE;
4249 }
4250
4251 /* This should really be a macro        */
4252 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4253 /* anyway */
4254 static
4255 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4256 #ifdef UCOL_DEBUG
4257     fprintf(stderr, ".");
4258 #endif
4259     uint8_t *newStart = NULL;
4260     uint32_t offset = *secondaries-secStart;
4261
4262     if(secStart==second) {
4263         newStart=(uint8_t*)uprv_malloc(newSize);
4264         if(newStart==NULL) {
4265             *status = U_MEMORY_ALLOCATION_ERROR;
4266             return NULL;
4267         }
4268         uprv_memcpy(newStart, secStart, *secondaries-secStart);
4269     } else {
4270         newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4271         if(newStart==NULL) {
4272             *status = U_MEMORY_ALLOCATION_ERROR;
4273             /* Since we're reallocating, return original reference so we don't loose it. */
4274             return secStart;
4275         }
4276     }
4277     *secondaries=newStart+offset;
4278     *secSize=newSize;
4279     return newStart;
4280 }
4281
4282
4283 /* This should really be a macro                                                                      */
4284 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4285 /* secondaries in French                                                                              */
4286 /*
4287 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4288   uint8_t temp;
4289   while(start<end) {
4290     temp = *start;
4291     *start++ = *end;
4292     *end-- = temp;
4293   }
4294 }
4295 */
4296
/*
 * Reverses, in place, the elements of type TYPE in the inclusive range [start, end].
 * start and end must be modifiable pointer lvalues: they are advanced towards each
 * other and are left where they met (or crossed) when the macro finishes.
 * Wrapped in do { } while(0) so that the expansion is a single statement and can be
 * used safely in an unbraced if/else; the invocation must be followed by ';'.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
    TYPE tempA; \
    while((start)<(end)) { \
        tempA = *(start); \
        *(start)++ = *(end); \
        *(end)-- = tempA; \
    } \
} while(0)
4305
4306 /****************************************************************************/
4307 /* Following are the sortkey generation functions                           */
4308 /*                                                                          */
4309 /****************************************************************************/
4310
4311 /**
4312  * Merge two sort keys.
4313  * This is useful, for example, to combine sort keys from first and last names
4314  * to sort such pairs.
4315  * Merged sort keys consider on each collation level the first part first entirely,
4316  * then the second one.
4317  * It is possible to merge multiple sort keys by consecutively merging
4318  * another one with the intermediate result.
4319  *
4320  * The length of the merge result is the sum of the lengths of the input sort keys
4321  * minus 1.
4322  *
4323  * @param src1 the first sort key
4324  * @param src1Length the length of the first sort key, including the zero byte at the end;
4325  *        can be -1 if the function is to find the length
4326  * @param src2 the second sort key
4327  * @param src2Length the length of the second sort key, including the zero byte at the end;
4328  *        can be -1 if the function is to find the length
4329  * @param dest the buffer where the merged sort key is written,
4330  *        can be NULL if destCapacity==0
4331  * @param destCapacity the number of bytes in the dest buffer
4332  * @return the length of the merged sort key, src1Length+src2Length-1;
4333  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4334  *         in which cases the contents of dest is undefined
4335  *
4336  * @draft
4337  */
4338 U_CAPI int32_t U_EXPORT2
4339 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4340                    const uint8_t *src2, int32_t src2Length,
4341                    uint8_t *dest, int32_t destCapacity) {
4342     int32_t destLength;
4343     uint8_t b;
4344
4345     /* check arguments */
4346     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4347         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4348         destCapacity<0 || (destCapacity>0 && dest==NULL)
4349     ) {
4350         /* error, attempt to write a zero byte and return 0 */
4351         if(dest!=NULL && destCapacity>0) {
4352             *dest=0;
4353         }
4354         return 0;
4355     }
4356
4357     /* check lengths and capacity */
4358     if(src1Length<0) {
4359         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4360     }
4361     if(src2Length<0) {
4362         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4363     }
4364
4365     destLength=src1Length+src2Length-1;
4366     if(destLength>destCapacity) {
4367         /* the merged sort key does not fit into the destination */
4368         return destLength;
4369     }
4370
4371     /* merge the sort keys with the same number of levels */
4372     while(*src1!=0 && *src2!=0) { /* while both have another level */
4373         /* copy level from src1 not including 00 or 01 */
4374         while((b=*src1)>=2) {
4375             ++src1;
4376             *dest++=b;
4377         }
4378
4379         /* add a 02 merge separator */
4380         *dest++=2;
4381
4382         /* copy level from src2 not including 00 or 01 */
4383         while((b=*src2)>=2) {
4384             ++src2;
4385             *dest++=b;
4386         }
4387
4388         /* if both sort keys have another level, then add a 01 level separator and continue */
4389         if(*src1==1 && *src2==1) {
4390             ++src1;
4391             ++src2;
4392             *dest++=1;
4393         }
4394     }
4395
4396     /*
4397      * here, at least one sort key is finished now, but the other one
4398      * might have some contents left from containing more levels;
4399      * that contents is just appended to the result
4400      */
4401     if(*src1!=0) {
4402         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4403         src2=src1;
4404     }
4405     /* append src2, "the other, unfinished sort key" */
4406     uprv_strcpy((char *)dest, (const char *)src2);
4407
4408     /* trust that neither sort key contained illegally embedded zero bytes */
4409     return destLength;
4410 }
4411
4412 /* sortkey API */
4413 U_CAPI int32_t U_EXPORT2
4414 ucol_getSortKey(const    UCollator    *coll,
4415         const    UChar        *source,
4416         int32_t        sourceLength,
4417         uint8_t        *result,
4418         int32_t        resultLength)
4419 {
4420     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4421     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4422         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4423             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4424     }
4425
4426     UErrorCode status = U_ZERO_ERROR;
4427     int32_t keySize   = 0;
4428
4429     if(source != NULL) {
4430         // source == NULL is actually an error situation, but we would need to
4431         // have an error code to return it. Until we introduce a new
4432         // API, it stays like this
4433
4434         /* this uses the function pointer that is set in updateinternalstate */
4435         /* currently, there are two funcs: */
4436         /*ucol_calcSortKey(...);*/
4437         /*ucol_calcSortKeySimpleTertiary(...);*/
4438
4439         keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4440         //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) {
4441             // That's not good. Something unusual happened.
4442             // We don't know how much we initialized before we failed.
4443             // NULL terminate for safety.
4444             // We have no way say that we have generated a partial sort key.
4445             //result[0] = 0;
4446             //keySize = 0;
4447         //}
4448     }
4449     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4450     UTRACE_EXIT_STATUS(status);
4451     return keySize;
4452 }
4453
4454 /* this function is called by the C++ API for sortkey generation */
4455 U_CFUNC int32_t
4456 ucol_getSortKeyWithAllocation(const UCollator *coll,
4457                               const UChar *source, int32_t sourceLength,
4458                               uint8_t **pResult,
4459                               UErrorCode *pErrorCode) {
4460     *pResult = 0;
4461     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4462 }
4463
4464 #define UCOL_FSEC_BUF_SIZE 256
4465
4466 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4467 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
4468 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4469     UErrorCode status = U_ZERO_ERROR;
4470     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4471     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4472     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4473     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4474     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4475     UBool  doCase = (coll->caseLevel == UCOL_ON);
4476     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4477     //UBool  qShifted = shifted  && (compareQuad == 0);
4478     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4479     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4480     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4481     uint8_t *fSecs = fSecsBuff;
4482     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4483     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4484
4485     uint32_t variableTopValue = coll->variableTopValue;
4486     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4487     if(doHiragana) {
4488         UCOL_COMMON_BOT4++;
4489         /* allocate one more space for hiragana */
4490     }
4491     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4492
4493     uint32_t order = UCOL_NO_MORE_CES;
4494     uint8_t primary1 = 0;
4495     uint8_t primary2 = 0;
4496     uint8_t secondary = 0;
4497     uint8_t tertiary = 0;
4498     int32_t caseShift = 0;
4499     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4500
4501     uint8_t caseSwitch = coll->caseSwitch;
4502     uint8_t tertiaryMask = coll->tertiaryMask;
4503     uint8_t tertiaryCommon = coll->tertiaryCommon;
4504
4505     UBool wasShifted = FALSE;
4506     UBool notIsContinuation = FALSE;
4507     uint8_t leadPrimary = 0;
4508
4509
4510     for(;;) {
4511         order = ucol_IGetNextCE(coll, s, &status);
4512         if(order == UCOL_NO_MORE_CES) {
4513             break;
4514         }
4515
4516         if(order == 0) {
4517             continue;
4518         }
4519
4520         notIsContinuation = !isContinuation(order);
4521
4522
4523         if(notIsContinuation) {
4524             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4525         } else {
4526             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4527         }
4528         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4529         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4530         primary1 = (uint8_t)(order >> 8);
4531
4532
4533         if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4534             || (!notIsContinuation && wasShifted))
4535             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4536                 /* and other ignorables should be removed if following a shifted code point */
4537                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4538                     /* we should just completely ignore it */
4539                     continue;
4540                 }
4541                 if(compareQuad == 0) {
4542                     if(c4 > 0) {
4543                         currentSize += (c2/UCOL_BOT_COUNT4)+1;
4544                         c4 = 0;
4545                     }
4546                     currentSize++;
4547                     if(primary2 != 0) {
4548                         currentSize++;
4549                     }
4550                 }
4551                 wasShifted = TRUE;
4552         } else {
4553             wasShifted = FALSE;
4554             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4555             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4556             /* calculate sortkey size */
4557             if(primary1 != UCOL_IGNORABLE) {
4558                 if(notIsContinuation) {
4559                     if(leadPrimary == primary1) {
4560                         currentSize++;
4561                     } else {
4562                         if(leadPrimary != 0) {
4563                             currentSize++;
4564                         }
4565                         if(primary2 == UCOL_IGNORABLE) {
4566                             /* one byter, not compressed */
4567                             currentSize++;
4568                             leadPrimary = 0;
4569                         }
4570                         else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4571                             //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4572                             //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4573                             (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
4574                         {
4575                             /* not compressible */
4576                             leadPrimary = 0;
4577                             currentSize+=2;
4578                         }
4579                         else { /* compress */
4580                             leadPrimary = primary1;
4581                             currentSize+=2;
4582                         }
4583                     }
4584                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4585                     currentSize++;
4586                     if(primary2 != UCOL_IGNORABLE) {
4587                         currentSize++;
4588                     }
4589                 }
4590             }
4591
4592             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4593                 if(!isFrenchSec){
4594                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
4595                         c2++;
4596                     } else {
4597                         if(c2 > 0) {
4598                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4599                                 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4600                             } else {
4601                                 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4602                             }
4603                             c2 = 0;
4604                         }
4605                         currentSize++;
4606                     }
4607                 } else {
4608                     fSecs[fSecsLen++] = secondary;
4609                     if(fSecsLen == fSecsMaxLen) {
4610                         uint8_t *fSecsTemp;
4611                         if(fSecs == fSecsBuff) {
4612                             fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen);
4613                         } else {
4614                             fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4615                         }
4616                         if(fSecsTemp == NULL) {
4617                             status = U_MEMORY_ALLOCATION_ERROR;
4618                             return 0;
4619                         }
4620                         fSecs = fSecsTemp;
4621                         fSecsMaxLen *= 2;
4622                     }
4623                     if(notIsContinuation) {
4624                         if (frenchStartPtr != NULL) {
4625                             /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4626                             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4627                             frenchStartPtr = NULL;
4628                         }
4629                     } else {
4630                         if (frenchStartPtr == NULL) {
4631                             frenchStartPtr = fSecs+fSecsLen-2;
4632                         }
4633                         frenchEndPtr = fSecs+fSecsLen-1;
4634                     }
4635                 }
4636             }
4637
4638             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4639                 // do the case level if we need to do it. We don't want to calculate
4640                 // case level for primary ignorables if we have only primary strength and case level
4641                 // otherwise we would break well formedness of CEs
4642                 if (caseShift  == 0) {
4643                     currentSize++;
4644                     caseShift = UCOL_CASE_SHIFT_START;
4645                 }
4646                 if((tertiary&0x3F) > 0 && notIsContinuation) {
4647                     caseShift--;
4648                     if((tertiary &0xC0) != 0) {
4649                         if (caseShift  == 0) {
4650                             currentSize++;
4651                             caseShift = UCOL_CASE_SHIFT_START;
4652                         }
4653                         caseShift--;
4654                     }
4655                 }
4656             } else {
4657                 if(notIsContinuation) {
4658                     tertiary ^= caseSwitch;
4659                 }
4660             }
4661
4662             tertiary &= tertiaryMask;
4663             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4664                 if (tertiary == tertiaryCommon && notIsContinuation) {
4665                     c3++;
4666                 } else {
4667                     if(c3 > 0) {
4668                         if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4669                             || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4670                                 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4671                         } else {
4672                             currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4673                         }
4674                         c3 = 0;
4675                     }
4676                     currentSize++;
4677                 }
4678             }
4679
4680             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4681                 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4682                     if(c4>0) { // Close this part
4683                         currentSize += (c4/UCOL_BOT_COUNT4)+1;
4684                         c4 = 0;
4685                     }
4686                     currentSize++; // Add the Hiragana
4687                 } else { // This wasn't Hiragana, so we can continue adding stuff
4688                     c4++;
4689                 }
4690             }
4691         }
4692     }
4693
4694     if(!isFrenchSec){
4695         if(c2 > 0) {
4696             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4697         }
4698     } else {
4699         uint32_t i = 0;
4700         if(frenchStartPtr != NULL) {
4701             uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4702         }
4703         for(i = 0; i<fSecsLen; i++) {
4704             secondary = *(fSecs+fSecsLen-i-1);
4705             /* This is compression code. */
4706             if (secondary == UCOL_COMMON2) {
4707                 ++c2;
4708             } else {
4709                 if(c2 > 0) {
4710                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4711                         currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4712                     } else {
4713                         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4714                     }
4715                     c2 = 0;
4716                 }
4717                 currentSize++;
4718             }
4719         }
4720         if(c2 > 0) {
4721             currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4722         }
4723         if(fSecs != fSecsBuff) {
4724             uprv_free(fSecs);
4725         }
4726     }
4727
4728     if(c3 > 0) {
4729         currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4730     }
4731
4732     if(c4 > 0  && compareQuad == 0) {
4733         currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4734     }
4735
4736     if(compareIdent) {
4737         currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4738     }
4739     return currentSize;
4740 }
4741
4742 static
4743 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4744     if (caseShift  == 0) {
4745         *(*cases)++ = UCOL_CASE_BYTE_START;
4746         caseShift = UCOL_CASE_SHIFT_START;
4747     }
4748 }
4749
// Appends value at primaries when there is still room before limit and then moves
// primaries forward. size is incremented in either case, so that it reflects how many
// values we wanted to add, even if not all of them were stored.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
    ++size;
    if(primaries >= limit) {
        // no room left: only the count is kept up to date
        return;
    }
    *primaries = value;
    ++primaries;
}
4759
4760 // Packs the secondary buffer when processing French locale. Adds the terminator.
4761 static
4762 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4763     uint8_t secondary;
4764     int32_t count2 = 0;
4765     uint32_t i = 0, size = 0;
4766     // we use i here since the key size already accounts for terminators, so we'll discard the increment
4767     addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4768     /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4769     if(frenchStartPtr != NULL) {
4770         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4771     }
4772     for(i = 0; i<*secsize; i++) {
4773         secondary = *(secondaries-i-1);
4774         /* This is compression code. */
4775         if (secondary == UCOL_COMMON2) {
4776             ++count2;
4777         } else {
4778             if (count2 > 0) {
4779                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4780                     while (count2 > UCOL_TOP_COUNT2) {
4781                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4782                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4783                     }
4784                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4785                 } else {
4786                     while (count2 > UCOL_BOT_COUNT2) {
4787                         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4788                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4789                     }
4790                     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4791                 }
4792                 count2 = 0;
4793             }
4794             addWithIncrement(primaries, primEnd, size, secondary);
4795         }
4796     }
4797     if (count2 > 0) {
4798         while (count2 > UCOL_BOT_COUNT2) {
4799             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4800             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4801         }
4802         addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4803     }
4804     *secsize = size;
4805     return primaries;
4806 }
4807
4808 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4809
4810 /* This is the sortkey work horse function */
4811 U_CFUNC int32_t U_CALLCONV
4812 ucol_calcSortKey(const    UCollator    *coll,
4813         const    UChar        *source,
4814         int32_t        sourceLength,
4815         uint8_t        **result,
4816         uint32_t        resultLength,
4817         UBool allocateSKBuffer,
4818         UErrorCode *status)
4819 {
4820     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4821
4822     uint32_t i = 0; /* general purpose counter */
4823
4824     /* Stack allocated buffers for buffers we use */
4825     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4826
4827     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4828
4829     if(U_FAILURE(*status)) {
4830         return 0;
4831     }
4832
4833     if(primaries == NULL && allocateSKBuffer == TRUE) {
4834         primaries = *result = prim;
4835         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4836     }
4837
4838     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4839       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4840
4841     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4842
4843     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4844     UChar *normSource = normBuffer;
4845     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4846
4847     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4848
4849     UColAttributeValue strength = coll->strength;
4850
4851     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4852     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4853     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4854     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4855     UBool  doCase = (coll->caseLevel == UCOL_ON);
4856     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4857     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4858     //UBool  qShifted = shifted && (compareQuad == 0);
4859     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4860     /*const uint8_t *scriptOrder = coll->scriptOrder;*/
4861
4862     uint32_t variableTopValue = coll->variableTopValue;
4863     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4864     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4865     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4866     uint8_t UCOL_HIRAGANA_QUAD = 0;
4867     if(doHiragana) {
4868         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4869         /* allocate one more space for hiragana, value for hiragana */
4870     }
4871     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4872
4873     /* support for special features like caselevel and funky secondaries */
4874     uint8_t *frenchStartPtr = NULL;
4875     uint8_t *frenchEndPtr = NULL;
4876     uint32_t caseShift = 0;
4877
4878     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4879
4880     /* If we need to normalize, we'll do it all at once at the beginning! */
4881     UNormalizationMode normMode;
4882     if(compareIdent) {
4883         normMode = UNORM_NFD;
4884     } else if(coll->normalizationMode != UCOL_OFF) {
4885         normMode = UNORM_FCD;
4886     } else {
4887         normMode = UNORM_NONE;
4888     }
4889
4890     if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4891         len = unorm_internalNormalize(normSource, normSourceLen,
4892                                       source, len,
4893                                       normMode, FALSE,
4894                                       status);
4895         if(*status == U_BUFFER_OVERFLOW_ERROR) {
4896             normSourceLen = len;
4897             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4898             if(normSource == NULL) {
4899                 *status = U_MEMORY_ALLOCATION_ERROR;
4900                 return 0;
4901             }
4902             *status = U_ZERO_ERROR;
4903             len = unorm_internalNormalize(normSource, normSourceLen,
4904                                           source, len,
4905                                           normMode, FALSE,
4906                                           status);
4907         }
4908
4909         if(U_FAILURE(*status)) {
4910             return 0;
4911         }
4912         source = normSource;
4913     }
4914
4915     collIterate s;
4916     IInit_collIterate(coll, (UChar *)source, len, &s);
4917     if(source == normSource) {
4918         s.flags &= ~UCOL_ITER_NORM;
4919     }
4920
4921     if(resultLength == 0 || primaries == NULL) {
4922         int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4923         if(normSource != normBuffer) {
4924             uprv_free(normSource);
4925         }
4926         return keyLen;
4927     }
4928     uint8_t *primarySafeEnd = primaries + resultLength - 1;
4929     if(strength > UCOL_PRIMARY) {
4930         primarySafeEnd--;
4931     }
4932
4933     uint32_t minBufferSize = UCOL_MAX_BUFFER;
4934
4935     uint8_t *primStart = primaries;
4936     uint8_t *secStart = secondaries;
4937     uint8_t *terStart = tertiaries;
4938     uint8_t *caseStart = cases;
4939     uint8_t *quadStart = quads;
4940
4941     uint32_t order = 0;
4942
4943     uint8_t primary1 = 0;
4944     uint8_t primary2 = 0;
4945     uint8_t secondary = 0;
4946     uint8_t tertiary = 0;
4947     uint8_t caseSwitch = coll->caseSwitch;
4948     uint8_t tertiaryMask = coll->tertiaryMask;
4949     int8_t tertiaryAddition = coll->tertiaryAddition;
4950     uint8_t tertiaryTop = coll->tertiaryTop;
4951     uint8_t tertiaryBottom = coll->tertiaryBottom;
4952     uint8_t tertiaryCommon = coll->tertiaryCommon;
4953     uint8_t caseBits = 0;
4954
4955     UBool finished = FALSE;
4956     UBool wasShifted = FALSE;
4957     UBool notIsContinuation = FALSE;
4958
4959     uint32_t prevBuffSize = 0;
4960
4961     uint32_t count2 = 0, count3 = 0, count4 = 0;
4962     uint8_t leadPrimary = 0;
4963
4964     for(;;) {
4965         for(i=prevBuffSize; i<minBufferSize; ++i) {
4966
4967             order = ucol_IGetNextCE(coll, &s, status);
4968             if(order == UCOL_NO_MORE_CES) {
4969                 finished = TRUE;
4970                 break;
4971             }
4972
4973             if(order == 0) {
4974                 continue;
4975             }
4976
4977             notIsContinuation = !isContinuation(order);
4978
4979             if(notIsContinuation) {
4980                 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4981             } else {
4982                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4983             }
4984
4985             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4986             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4987             primary1 = (uint8_t)(order >> 8);
4988
4989             /*if(notIsContinuation && scriptOrder != NULL) {
4990             primary1 = scriptOrder[primary1];
4991             }*/
4992
4993             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4994                 || (!notIsContinuation && wasShifted))
4995                 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4996             {
4997                 /* and other ignorables should be removed if following a shifted code point */
4998                 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4999                     /* we should just completely ignore it */
5000                     continue;
5001                 }
5002                 if(compareQuad == 0) {
5003                     if(count4 > 0) {
5004                         while (count4 > UCOL_BOT_COUNT4) {
5005                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5006                             count4 -= UCOL_BOT_COUNT4;
5007                         }
5008                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5009                         count4 = 0;
5010                     }
5011                     /* We are dealing with a variable and we're treating them as shifted */
5012                     /* This is a shifted ignorable */
5013                     if(primary1 != 0) { /* we need to check this since we could be in continuation */
5014                         *quads++ = primary1;
5015                     }
5016                     if(primary2 != 0) {
5017                         *quads++ = primary2;
5018                     }
5019                 }
5020                 wasShifted = TRUE;
5021             } else {
5022                 wasShifted = FALSE;
5023                 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5024                 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will be zero with non zero primary1. */
5025                 /* regular and simple sortkey calc */
5026                 if(primary1 != UCOL_IGNORABLE) {
5027                     if(notIsContinuation) {
5028                         if(leadPrimary == primary1) {
5029                             *primaries++ = primary2;
5030                         } else {
5031                             if(leadPrimary != 0) {
5032                                 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5033                             }
5034                             if(primary2 == UCOL_IGNORABLE) {
5035                                 /* one byter, not compressed */
5036                                 *primaries++ = primary1;
5037                                 leadPrimary = 0;
5038                             } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5039                                 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5040                                 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5041                                     /* not compressible */
5042                                     leadPrimary = 0;
5043                                     *primaries++ = primary1;
5044                                     if(primaries <= primarySafeEnd) {
5045                                         *primaries++ = primary2;
5046                                     }
5047                             } else { /* compress */
5048                                 *primaries++ = leadPrimary = primary1;
5049                                 if(primaries <= primarySafeEnd) {
5050                                     *primaries++ = primary2;
5051                                 }
5052                             }
5053                         }
5054                     } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5055                         *primaries++ = primary1;
5056                         if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) {
5057                                 *primaries++ = primary2; /* second part */
5058                         }
5059                     }
5060                 }
5061
5062                 if(secondary > compareSec) {
5063                     if(!isFrenchSec) {
5064                         /* This is compression code. */
5065                         if (secondary == UCOL_COMMON2 && notIsContinuation) {
5066                             ++count2;
5067                         } else {
5068                             if (count2 > 0) {
5069                                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5070                                     while (count2 > UCOL_TOP_COUNT2) {
5071                                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5072                                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
5073                                     }
5074                                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5075                                 } else {
5076                                     while (count2 > UCOL_BOT_COUNT2) {
5077                                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5078                                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
5079                                     }
5080                                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5081                                 }
5082                                 count2 = 0;
5083                             }
5084                             *secondaries++ = secondary;
5085                         }
5086                     } else {
5087                         *secondaries++ = secondary;
5088                         /* Do the special handling for French secondaries */
5089                         /* We need to get continuation elements and do intermediate restore */
5090                         /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5091                         if(notIsContinuation) {
5092                             if (frenchStartPtr != NULL) {
5093                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5094                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
5095                                 frenchStartPtr = NULL;
5096                             }
5097                         } else {
5098                             if (frenchStartPtr == NULL) {
5099                                 frenchStartPtr = secondaries - 2;
5100                             }
5101                             frenchEndPtr = secondaries-1;
5102                         }
5103                     }
5104                 }
5105
5106                 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
5107                     // do the case level if we need to do it. We don't want to calculate
5108                     // case level for primary ignorables if we have only primary strength and case level
5109                     // otherwise we would break well formedness of CEs
5110                     doCaseShift(&cases, caseShift);
5111                     if(notIsContinuation) {
5112                         caseBits = (uint8_t)(tertiary & 0xC0);
5113
5114                         if(tertiary != 0) {
5115                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
5116                                 if((caseBits & 0xC0) == 0) {
5117                                     *(cases-1) |= 1 << (--caseShift);
5118                                 } else {
5119                                     *(cases-1) |= 0 << (--caseShift);
5120                                     /* second bit */
5121                                     doCaseShift(&cases, caseShift);
5122                                     *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
5123                                 }
5124                             } else {
5125                                 if((caseBits & 0xC0) == 0) {
5126                                     *(cases-1) |= 0 << (--caseShift);
5127                                 } else {
5128                                     *(cases-1) |= 1 << (--caseShift);
5129                                     /* second bit */
5130                                     doCaseShift(&cases, caseShift);
5131                                     *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
5132                                 }
5133                             }
5134                         }
5135
5136                     }
5137                 } else {
5138                     if(notIsContinuation) {
5139                         tertiary ^= caseSwitch;
5140                     }
5141                 }
5142
5143                 tertiary &= tertiaryMask;
5144                 if(tertiary > compareTer) {
5145                     /* This is compression code. */
5146                     /* sequence size check is included in the if clause */
5147                     if (tertiary == tertiaryCommon && notIsContinuation) {
5148                         ++count3;
5149                     } else {
5150                         if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5151                             tertiary += tertiaryAddition;
5152                         } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5153                             tertiary -= tertiaryAddition;
5154                         }
5155                         if (count3 > 0) {
5156                             if ((tertiary > tertiaryCommon)) {
5157                                 while (count3 > coll->tertiaryTopCount) {
5158                                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5159                                     count3 -= (uint32_t)coll->tertiaryTopCount;
5160                                 }
5161                                 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5162                             } else {
5163                                 while (count3 > coll->tertiaryBottomCount) {
5164                                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5165                                     count3 -= (uint32_t)coll->tertiaryBottomCount;
5166                                 }
5167                                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5168                             }
5169                             count3 = 0;
5170                         }
5171                         *tertiaries++ = tertiary;
5172                     }
5173                 }
5174
5175                 if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
5176                     if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5177                         if(count4>0) { // Close this part
5178                             while (count4 > UCOL_BOT_COUNT4) {
5179                                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5180                                 count4 -= UCOL_BOT_COUNT4;
5181                             }
5182                             *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5183                             count4 = 0;
5184                         }
5185                         *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5186                     } else { // This wasn't Hiragana, so we can continue adding stuff
5187                         count4++;
5188                     }
5189                 }
5190             }
5191
5192             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5193                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5194                     IInit_collIterate(coll, (UChar *)source, len, &s);
5195                     if(source == normSource) {
5196                         s.flags &= ~UCOL_ITER_NORM;
5197                     }
5198                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5199                     *status = U_BUFFER_OVERFLOW_ERROR;
5200                     finished = TRUE;
5201                     break;
5202                 } else { /* It's much nicer if we can actually reallocate */
5203                     int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
5204                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5205                     if(U_SUCCESS(*status)) {
5206                         *result = primStart;
5207                         primarySafeEnd = primStart + resultLength - 1;
5208                         if(strength > UCOL_PRIMARY) {
5209                             primarySafeEnd--;
5210                         }
5211                     } else {
5212                         /* We ran out of memory!? We can't recover. */
5213                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5214                         finished = TRUE;
5215                         break;
5216                     }
5217                 }
5218             }
5219         }
5220         if(finished) {
5221             break;
5222         } else {
5223             prevBuffSize = minBufferSize;
5224
5225             uint32_t frenchStartOffset = 0, frenchEndOffset = 0;
5226             if (frenchStartPtr != NULL) {
5227                 frenchStartOffset = frenchStartPtr - secStart;
5228                 frenchEndOffset = frenchEndPtr - secStart;
5229             }
5230             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5231             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5232             caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5233             quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5234             if(U_FAILURE(*status)) {
5235                 /* We ran out of memory!? We can't recover. */
5236                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5237                 break;
5238             }
5239             if (frenchStartPtr != NULL) {
5240                 frenchStartPtr = secStart + frenchStartOffset;
5241                 frenchEndPtr = secStart + frenchEndOffset;
5242             }
5243             minBufferSize *= 2;
5244         }
5245     }
5246
5247     /* Here, we are generally done with processing */
5248     /* bailing out would not be too productive */
5249
5250     if(U_SUCCESS(*status)) {
5251         sortKeySize += (primaries - primStart);
5252         /* we have done all the CE's, now let's put them together to form a key */
5253         if(compareSec == 0) {
5254             if (count2 > 0) {
5255                 while (count2 > UCOL_BOT_COUNT2) {
5256                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5257                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
5258                 }
5259                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5260             }
5261             uint32_t secsize = secondaries-secStart;
5262             if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5263                 sortKeySize += secsize;
5264                 if(sortKeySize <= resultLength) {
5265                     *(primaries++) = UCOL_LEVELTERMINATOR;
5266                     uprv_memcpy(primaries, secStart, secsize);
5267                     primaries += secsize;
5268                 } else {
5269                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5270                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5271                         if(U_SUCCESS(*status)) {
5272                             *result = primStart;
5273                             *(primaries++) = UCOL_LEVELTERMINATOR;
5274                             uprv_memcpy(primaries, secStart, secsize);
5275                             primaries += secsize;
5276                         }
5277                         else {
5278                             /* We ran out of memory!? We can't recover. */
5279                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5280                             goto cleanup;
5281                         }
5282                     } else {
5283                         *status = U_BUFFER_OVERFLOW_ERROR;
5284                     }
5285                 }
5286             } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5287                 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5288                 sortKeySize += secsize;
5289                 if(sortKeySize <= resultLength) { // if we managed to pack fine
5290                     primaries = newPrim; // update the primary pointer
5291                 } else { // overflow, need to reallocate and redo
5292                     if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5293                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5294                         if(U_SUCCESS(*status)) {
5295                             primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5296                         }
5297                         else {
5298                             /* We ran out of memory!? We can't recover. */
5299                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5300                             goto cleanup;
5301                         }
5302                     } else {
5303                         *status = U_BUFFER_OVERFLOW_ERROR;
5304                     }
5305                 }
5306             }
5307         }
5308
5309         if(doCase) {
5310             uint32_t casesize = cases - caseStart;
5311             sortKeySize += casesize;
5312             if(sortKeySize <= resultLength) {
5313                 *(primaries++) = UCOL_LEVELTERMINATOR;
5314                 uprv_memcpy(primaries, caseStart, casesize);
5315                 primaries += casesize;
5316             } else {
5317                 if(allocateSKBuffer == TRUE) {
5318                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5319                     if(U_SUCCESS(*status)) {
5320                         *result = primStart;
5321                         *(primaries++) = UCOL_LEVELTERMINATOR;
5322                         uprv_memcpy(primaries, caseStart, casesize);
5323                     }
5324                     else {
5325                         /* We ran out of memory!? We can't recover. */
5326                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5327                         goto cleanup;
5328                     }
5329                 } else {
5330                     *status = U_BUFFER_OVERFLOW_ERROR;
5331                 }
5332             }
5333         }
5334
5335         if(compareTer == 0) {
5336             if (count3 > 0) {
5337                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5338                     while (count3 >= coll->tertiaryTopCount) {
5339                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5340                         count3 -= (uint32_t)coll->tertiaryTopCount;
5341                     }
5342                     *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5343                 } else {
5344                     while (count3 > coll->tertiaryBottomCount) {
5345                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5346                         count3 -= (uint32_t)coll->tertiaryBottomCount;
5347                     }
5348                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5349                 }
5350             }
5351             uint32_t tersize = tertiaries - terStart;
5352             sortKeySize += tersize;
5353             if(sortKeySize <= resultLength) {
5354                 *(primaries++) = UCOL_LEVELTERMINATOR;
5355                 uprv_memcpy(primaries, terStart, tersize);
5356                 primaries += tersize;
5357             } else {
5358                 if(allocateSKBuffer == TRUE) {
5359                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5360                     if(U_SUCCESS(*status)) {
5361                         *result = primStart;
5362                         *(primaries++) = UCOL_LEVELTERMINATOR;
5363                         uprv_memcpy(primaries, terStart, tersize);
5364                     }
5365                     else {
5366                         /* We ran out of memory!? We can't recover. */
5367                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5368                         goto cleanup;
5369                     }
5370                 } else {
5371                     *status = U_BUFFER_OVERFLOW_ERROR;
5372                 }
5373             }
5374
5375             if(compareQuad == 0/*qShifted == TRUE*/) {
5376                 if(count4 > 0) {
5377                     while (count4 > UCOL_BOT_COUNT4) {
5378                         *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5379                         count4 -= UCOL_BOT_COUNT4;
5380                     }
5381                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5382                 }
5383                 uint32_t quadsize = quads - quadStart;
5384                 sortKeySize += quadsize;
5385                 if(sortKeySize <= resultLength) {
5386                     *(primaries++) = UCOL_LEVELTERMINATOR;
5387                     uprv_memcpy(primaries, quadStart, quadsize);
5388                     primaries += quadsize;
5389                 } else {
5390                     if(allocateSKBuffer == TRUE) {
5391                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5392                         if(U_SUCCESS(*status)) {
5393                             *result = primStart;
5394                             *(primaries++) = UCOL_LEVELTERMINATOR;
5395                             uprv_memcpy(primaries, quadStart, quadsize);
5396                         }
5397                         else {
5398                             /* We ran out of memory!? We can't recover. */
5399                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5400                             goto cleanup;
5401                         }
5402                     } else {
5403                         *status = U_BUFFER_OVERFLOW_ERROR;
5404                     }
5405                 }
5406             }
5407
5408             if(compareIdent) {
5409                 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5410                 if(sortKeySize <= resultLength) {
5411                     *(primaries++) = UCOL_LEVELTERMINATOR;
5412                     primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5413                 } else {
5414                     if(allocateSKBuffer == TRUE) {
5415                         primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5416                         if(U_SUCCESS(*status)) {
5417                             *result = primStart;
5418                             *(primaries++) = UCOL_LEVELTERMINATOR;
5419                             u_writeIdenticalLevelRun(s.string, len, primaries);
5420                         }
5421                         else {
5422                             /* We ran out of memory!? We can't recover. */
5423                             sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5424                             goto cleanup;
5425                         }
5426                     } else {
5427                         *status = U_BUFFER_OVERFLOW_ERROR;
5428                     }
5429                 }
5430             }
5431         }
5432         *(primaries++) = '\0';
5433     }
5434
5435     if(allocateSKBuffer == TRUE) {
5436         *result = (uint8_t*)uprv_malloc(sortKeySize);
5437         /* test for NULL */
5438         if (*result == NULL) {
5439             *status = U_MEMORY_ALLOCATION_ERROR;
5440             goto cleanup;
5441         }
5442         uprv_memcpy(*result, primStart, sortKeySize);
5443         if(primStart != prim) {
5444             uprv_free(primStart);
5445         }
5446     }
5447
5448 cleanup:
5449     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5450         /* NULL terminate for safety */
5451         **result = 0;
5452     }
5453     if(terStart != tert) {
5454         uprv_free(terStart);
5455         uprv_free(secStart);
5456         uprv_free(caseStart);
5457         uprv_free(quadStart);
5458     }
5459
5460     /* To avoid memory leak, free the offset buffer if necessary. */
5461     freeOffsetBuffer(&s);
5462
5463     if(normSource != normBuffer) {
5464         uprv_free(normSource);
5465     }
5466
5467     return sortKeySize;
5468 }
5469
5470
5471 U_CFUNC int32_t U_CALLCONV
5472 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5473         const    UChar        *source,
5474         int32_t        sourceLength,
5475         uint8_t        **result,
5476         uint32_t        resultLength,
5477         UBool allocateSKBuffer,
5478         UErrorCode *status)
5479 {
5480     U_ALIGN_CODE(16);
5481
5482     //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5483     uint32_t i = 0; /* general purpose counter */
5484
5485     /* Stack allocated buffers for buffers we use */
5486     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5487
5488     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5489
5490     if(U_FAILURE(*status)) {
5491         return 0;
5492     }
5493
5494     if(primaries == NULL && allocateSKBuffer == TRUE) {
5495         primaries = *result = prim;
5496         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5497     }
5498
5499     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5500
5501     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5502
5503     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5504     UChar *normSource = normBuffer;
5505     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5506
5507     int32_t len =  sourceLength;
5508
5509     /* If we need to normalize, we'll do it all at once at the beginning! */
5510     if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5511         len = unorm_internalNormalize(normSource, normSourceLen,
5512                                       source, len,
5513                                       UNORM_FCD, FALSE,
5514                                       status);
5515         if(*status == U_BUFFER_OVERFLOW_ERROR) {
5516             normSourceLen = len;
5517             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5518             if(normSource == NULL) {
5519                 *status = U_MEMORY_ALLOCATION_ERROR;
5520                 return 0;
5521             }
5522             *status = U_ZERO_ERROR;
5523             len = unorm_internalNormalize(normSource, normSourceLen,
5524                                           source, len,
5525                                           UNORM_FCD, FALSE,
5526                                           status);
5527             if(U_FAILURE(*status)) {
5528                 /* Should never happen. */
5529                 uprv_free(normSource);
5530                 normSource = normBuffer;
5531             }
5532         }
5533
5534         if(U_FAILURE(*status)) {
5535             return 0;
5536         }
5537         source = normSource;
5538     }
5539
5540     collIterate s;
5541     IInit_collIterate(coll, (UChar *)source, len, &s);
5542     if(source == normSource) {
5543         s.flags &= ~UCOL_ITER_NORM;
5544     }
5545
5546     if(resultLength == 0 || primaries == NULL) {
5547         int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5548         if(normSource != normBuffer) {
5549             uprv_free(normSource);
5550         }
5551         return t;
5552     }
5553
5554     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5555
5556     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5557
5558     uint8_t *primStart = primaries;
5559     uint8_t *secStart = secondaries;
5560     uint8_t *terStart = tertiaries;
5561
5562     uint32_t order = 0;
5563
5564     uint8_t primary1 = 0;
5565     uint8_t primary2 = 0;
5566     uint8_t secondary = 0;
5567     uint8_t tertiary = 0;
5568     uint8_t caseSwitch = coll->caseSwitch;
5569     uint8_t tertiaryMask = coll->tertiaryMask;
5570     int8_t tertiaryAddition = coll->tertiaryAddition;
5571     uint8_t tertiaryTop = coll->tertiaryTop;
5572     uint8_t tertiaryBottom = coll->tertiaryBottom;
5573     uint8_t tertiaryCommon = coll->tertiaryCommon;
5574
5575     uint32_t prevBuffSize = 0;
5576
5577     UBool finished = FALSE;
5578     UBool notIsContinuation = FALSE;
5579
5580     uint32_t count2 = 0, count3 = 0;
5581     uint8_t leadPrimary = 0;
5582
5583     for(;;) {
5584         for(i=prevBuffSize; i<minBufferSize; ++i) {
5585
5586             order = ucol_IGetNextCE(coll, &s, status);
5587
5588             if(order == 0) {
5589                 continue;
5590             }
5591
5592             if(order == UCOL_NO_MORE_CES) {
5593                 finished = TRUE;
5594                 break;
5595             }
5596
5597             notIsContinuation = !isContinuation(order);
5598
5599             if(notIsContinuation) {
5600                 tertiary = (uint8_t)((order & tertiaryMask));
5601             } else {
5602                 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5603             }
5604             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5605             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5606             primary1 = (uint8_t)(order >> 8);
5607
5608             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5609             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5610             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5611             /* regular and simple sortkey calc */
5612             if(primary1 != UCOL_IGNORABLE) {
5613                 if(notIsContinuation) {
5614                     if(leadPrimary == primary1) {
5615                         *primaries++ = primary2;
5616                     } else {
5617                         if(leadPrimary != 0) {
5618                             *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5619                         }
5620                         if(primary2 == UCOL_IGNORABLE) {
5621                             /* one byter, not compressed */
5622                             *primaries++ = primary1;
5623                             leadPrimary = 0;
5624                         } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5625                             //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5626                             //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5627                             (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
5628                                 /* not compressible */
5629                                 leadPrimary = 0;
5630                                 *primaries++ = primary1;
5631                                 *primaries++ = primary2;
5632                         } else { /* compress */
5633                             *primaries++ = leadPrimary = primary1;
5634                             *primaries++ = primary2;
5635                         }
5636                     }
5637                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5638                     *primaries++ = primary1;
5639                     if(primary2 != UCOL_IGNORABLE) {
5640                         *primaries++ = primary2; /* second part */
5641                     }
5642                 }
5643             }
5644
5645             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5646                 /* This is compression code. */
5647                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5648                     ++count2;
5649                 } else {
5650                     if (count2 > 0) {
5651                         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5652                             while (count2 > UCOL_TOP_COUNT2) {
5653                                 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5654                                 count2 -= (uint32_t)UCOL_TOP_COUNT2;
5655                             }
5656                             *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5657                         } else {
5658                             while (count2 > UCOL_BOT_COUNT2) {
5659                                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5660                                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5661                             }
5662                             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5663                         }
5664                         count2 = 0;
5665                     }
5666                     *secondaries++ = secondary;
5667                 }
5668             }
5669
5670             if(notIsContinuation) {
5671                 tertiary ^= caseSwitch;
5672             }
5673
5674             if(tertiary > 0) {
5675                 /* This is compression code. */
5676                 /* sequence size check is included in the if clause */
5677                 if (tertiary == tertiaryCommon && notIsContinuation) {
5678                     ++count3;
5679                 } else {
5680                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5681                         tertiary += tertiaryAddition;
5682                     } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5683                         tertiary -= tertiaryAddition;
5684                     }
5685                     if (count3 > 0) {
5686                         if ((tertiary > tertiaryCommon)) {
5687                             while (count3 > coll->tertiaryTopCount) {
5688                                 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5689                                 count3 -= (uint32_t)coll->tertiaryTopCount;
5690                             }
5691                             *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5692                         } else {
5693                             while (count3 > coll->tertiaryBottomCount) {
5694                                 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5695                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
5696                             }
5697                             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5698                         }
5699                         count3 = 0;
5700                     }
5701                     *tertiaries++ = tertiary;
5702                 }
5703             }
5704
5705             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5706                 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5707                     IInit_collIterate(coll, (UChar *)source, len, &s);
5708                     if(source == normSource) {
5709                         s.flags &= ~UCOL_ITER_NORM;
5710                     }
5711                     sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5712                     *status = U_BUFFER_OVERFLOW_ERROR;
5713                     finished = TRUE;
5714                     break;
5715                 } else { /* It's much nicer if we can actually reallocate */
5716                     int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5717                     primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5718                     if(U_SUCCESS(*status)) {
5719                         *result = primStart;
5720                         primarySafeEnd = primStart + resultLength - 2;
5721                     } else {
5722                         /* We ran out of memory!? We can't recover. */
5723                         sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5724                         finished = TRUE;
5725                         break;
5726                     }
5727                 }
5728             }
5729         }
5730         if(finished) {
5731             break;
5732         } else {
5733             prevBuffSize = minBufferSize;
5734             secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5735             terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5736             minBufferSize *= 2;
5737             if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5738                 /* We ran out of memory!? We can't recover. */
5739                 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5740                 break;
5741             }
5742         }
5743     }
5744
5745     if(U_SUCCESS(*status)) {
5746         sortKeySize += (primaries - primStart);
5747         /* we have done all the CE's, now let's put them together to form a key */
5748         if (count2 > 0) {
5749             while (count2 > UCOL_BOT_COUNT2) {
5750                 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5751                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5752             }
5753             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5754         }
5755         uint32_t secsize = secondaries-secStart;
5756         sortKeySize += secsize;
5757         if(sortKeySize <= resultLength) {
5758             *(primaries++) = UCOL_LEVELTERMINATOR;
5759             uprv_memcpy(primaries, secStart, secsize);
5760             primaries += secsize;
5761         } else {
5762             if(allocateSKBuffer == TRUE) {
5763                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5764                 if(U_SUCCESS(*status)) {
5765                     *(primaries++) = UCOL_LEVELTERMINATOR;
5766                     *result = primStart;
5767                     uprv_memcpy(primaries, secStart, secsize);
5768                 }
5769                 else {
5770                     /* We ran out of memory!? We can't recover. */
5771                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5772                     goto cleanup;
5773                 }
5774             } else {
5775                 *status = U_BUFFER_OVERFLOW_ERROR;
5776             }
5777         }
5778
5779         if (count3 > 0) {
5780             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5781                 while (count3 >= coll->tertiaryTopCount) {
5782                     *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5783                     count3 -= (uint32_t)coll->tertiaryTopCount;
5784                 }
5785                 *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5786             } else {
5787                 while (count3 > coll->tertiaryBottomCount) {
5788                     *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5789                     count3 -= (uint32_t)coll->tertiaryBottomCount;
5790                 }
5791                 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5792             }
5793         }
5794         uint32_t tersize = tertiaries - terStart;
5795         sortKeySize += tersize;
5796         if(sortKeySize <= resultLength) {
5797             *(primaries++) = UCOL_LEVELTERMINATOR;
5798             uprv_memcpy(primaries, terStart, tersize);
5799             primaries += tersize;
5800         } else {
5801             if(allocateSKBuffer == TRUE) {
5802                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5803                 if(U_SUCCESS(*status)) {
5804                     *result = primStart;
5805                     *(primaries++) = UCOL_LEVELTERMINATOR;
5806                     uprv_memcpy(primaries, terStart, tersize);
5807                 }
5808                 else {
5809                     /* We ran out of memory!? We can't recover. */
5810                     sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY;
5811                     goto cleanup;
5812                 }
5813             } else {
5814                 *status = U_MEMORY_ALLOCATION_ERROR;
5815             }
5816         }
5817
5818         *(primaries++) = '\0';
5819     }
5820
5821     if(allocateSKBuffer == TRUE) {
5822         *result = (uint8_t*)uprv_malloc(sortKeySize);
5823         /* test for NULL */
5824         if (*result == NULL) {
5825             *status = U_MEMORY_ALLOCATION_ERROR;
5826             goto cleanup;
5827         }
5828         uprv_memcpy(*result, primStart, sortKeySize);
5829         if(primStart != prim) {
5830             uprv_free(primStart);
5831         }
5832     }
5833
5834 cleanup:
5835     if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
5836         /* NULL terminate for safety */
5837         **result = 0;
5838     }
5839     if(terStart != tert) {
5840         uprv_free(terStart);
5841         uprv_free(secStart);
5842     }
5843
5844     /* To avoid memory leak, free the offset buffer if necessary. */
5845     freeOffsetBuffer(&s);
5846
5847     if(normSource != normBuffer) {
5848         uprv_free(normSource);
5849     }
5850
5851     return sortKeySize;
5852 }
5853
/**
 * Tells whether the collation element CE is to be handled as "shifted", and keeps
 * the caller's running shifted flag up to date.
 *
 * TRUE is returned in three cases:
 *   - LVT is non-zero, CE is not a continuation (per isContinuation()), its upper
 *     16 bits are <= LVT, and its top byte (primary1) is non-zero;
 *   - LVT is non-zero, CE is a continuation, and *wasShifted is already set;
 *   - *wasShifted is set and primary1 is zero (this case does not depend on LVT).
 *
 * @param CE         32-bit collation element to test
 * @param LVT        boundary compared against CE & 0xFFFF0000; 0 turns the first two cases off
 * @param wasShifted in/out flag carried from one CE to the next: set to TRUE when a
 *                   shifted CE has a non-zero primary1, left unchanged when a shifted CE
 *                   has primary1 == 0, and cleared to FALSE when the CE is not shifted
 * @return TRUE if CE is shifted, FALSE otherwise
 */
5854 static inline
5855 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5856     UBool notIsContinuation = !isContinuation(CE);
5857     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
         /* && binds tighter than ||, so this reads (LVT && (A || B)) || C */
5858     if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5859         || (!notIsContinuation && *wasShifted))
5860         || (*wasShifted && primary1 == 0)) /* amendment to the UCA: a CE with zero primary1 that follows a shifted CE is shifted as well */
5861     {
5862         // The stuff below should probably be in the sortkey code... maybe not...
5863         if(primary1 != 0) { /* shifted CE with a non-zero primary1: remember the shifted state for the CEs that follow */
5864             /* (when primary1 == 0 the flag is left as the caller passed it) */
5865             *wasShifted = TRUE;
5866             //continue;
5867         }
5868         //*wasShifted = TRUE;
5869         return TRUE;
5870     } else {
             /* not shifted: the run of shifted CEs, if any, ends here */
5871         *wasShifted = FALSE;
5872         return FALSE;
5873     }
5874 }
/**
 * Writes the one byte that ends a partial-sort-key level into dest[i] and advances i.
 * The byte is UCOL_LEVELTERMINATOR while level < maxLevel, and 0 otherwise.
 * No bounds check is made here; the caller has to make sure dest[i] is writable.
 *
 * @param level    level that has just been finished
 * @param maxLevel last level that will be produced
 * @param i        in/out write index into dest, incremented by one
 * @param dest     destination buffer
 */
5875 static inline
5876 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5877     if(level < maxLevel) {
             /* more levels follow: write the level separator */
5878         dest[i++] = UCOL_LEVELTERMINATOR;
5879     } else {
             /* level >= maxLevel: write a zero byte instead */
5880         dest[i++] = 0;
5881     }
5882 }
5883
5884 /** Enumeration of level identifiers for partial sort key generation.
      *  The current level is kept in the three-bit level field of state[1]
      *  (UCOL_PSK_LEVEL_SHIFT / UCOL_PSK_LEVEL_MASK), so the level values
      *  have to stay within 0..7.
      */
5885 enum {
5886   UCOL_PSK_PRIMARY = 0,
5887     UCOL_PSK_SECONDARY = 1,
5888     UCOL_PSK_CASE = 2,
5889     UCOL_PSK_TERTIARY = 3,
5890     UCOL_PSK_QUATERNARY = 4,
5891     UCOL_PSK_QUIN = 5,      /** This is an extra level, not used - but we have three bits to blow */
5892     UCOL_PSK_IDENTICAL = 6,
5893     UCOL_PSK_NULL = 7,      /** level for the end of sort key. Will just produce zeros */
5894     UCOL_PSK_LIMIT          /** one past the last level value (8); not a level itself */
5895 };
5896
5897 /** Collation state enum. A *_SHIFT value is how far to shift state[1] right
5898  *  to bring that piece of state into the low bits. The matching *_MASK value
5899  *  should then be ANDed with the shifted state. This data is stored in the
5900  *  state[1] field.
5901  */
5902 enum {
5903     UCOL_PSK_LEVEL_SHIFT = 0,      /** level identifier. stores an enum value from above */
5904     UCOL_PSK_LEVEL_MASK = 7,       /** three bits */
5905     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */
5906     UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1,
5907     /** can be only 0 or 1, since we get up to two bytes from primary or quaternary
5908      *  This field is also used to denote that the French secondary level is finished
5909      */
5910     UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */
5911     UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */
5912     UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */
5913     UCOL_PSK_USED_FRENCH_MASK = 3, /** two bits, i.e. a count of 0..3. See comment just below */
         /** NOTE(review): this comment previously read "up to 4 bytes", which two bits cannot hold - confirm the intended maximum */
5914     /** When we do French we need to reverse secondary values. However, continuations
5915      *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
5916      */
5917     UCOL_PSK_BOCSU_BYTES_SHIFT = 7, /** number of bytes already written from a bocsu sequence on the identical level */
5918     UCOL_PSK_BOCSU_BYTES_MASK = 3,  /** two bits, i.e. a count of 0..3 */
5919     UCOL_PSK_CONSUMED_CES_SHIFT = 9, /** CEs (or code points) consumed since the iterator state was last saved */
5920     UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF
         /** 19 bits. NOTE(review): with a shift of 9 this covers bits 9..27 of state[1],
          *  while the ucol_nextSortKeyPart comment describes the field as bits 9..31 - confirm
          *  which one is intended.
          */
5921 };
5922
// Macro calculating the number of expansion CEs available in collIterate s,
// i.e. the difference between s.CEpos and s.toReturn. The whole replacement
// list is parenthesized so that the macro stays one operand when it is used
// inside a larger expression (negation, comparison, multiplication, ...).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5925
5926
5927 /** main sortkey part procedure. On the first call,
5928  *  you should pass in a collator, an iterator, empty state
5929  *  state[0] == state[1] == 0, a buffer to hold results
5930  *  number of bytes you need and an error code pointer.
5931  *  Make sure your buffer is big enough to hold the wanted
5932  *  number of sortkey bytes. I don't check.
5933  *  The only meaningful status you can get back is
5934  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5935  *  have been dealt a raw deal and that you probably won't
5936  *  be able to use partial sortkey generation for this
5937  *  particular combination of string and collator. This
5938  *  is highly unlikely, but you should still check the error code.
5939  *  Any other status means that you're not in a sane situation
5940  *  anymore. After the first call, preserve state values and
5941  *  use them on subsequent calls to obtain more bytes of a sortkey.
5942  *  Use until the number of bytes written is smaller than the requested
5943  *  number of bytes. Generated sortkey is not compatible with the
5944  *  one generated by ucol_getSortKey, as we don't do any compression.
5945  *  However, levels are still terminated by a 1 (one) and the sortkey
5946  *  is terminated by a 0 (zero). Identical level is the same as in the
5947  *  regular sortkey - internal bocu-1 implementation is used.
5948  *  For the curious, although you cannot do much about this, here is
5949  *  the structure of state words.
5950  *  state[0] - iterator state. Depends on the iterator implementation,
5951  *             but allows the iterator to continue where it stopped in
5952  *             the last iteration.
5953  *  state[1] - collation processing state. Here is the distribution
5954  *             of the bits:
5955  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5956  *             quaternary, quin (we don't use this one), identical and
5957  *             null (producing only zeroes - first one to terminate the
5958  *             sortkey and subsequent to fill the buffer).
5959  *   3       - byte count. Number of bytes written on the primary level.
5960  *   4       - was shifted. Whether the previous iteration finished in the
5961  *             shifted state.
5962  *   5, 6    - French continuation bytes written. See the comment in the enum
5963  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5964  *             the identical level.
5965  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
5966  *             since the last successful update of the iterator state.
5967  */
5968 U_CAPI int32_t U_EXPORT2
5969 ucol_nextSortKeyPart(const UCollator *coll,
5970                      UCharIterator *iter,
5971                      uint32_t state[2],
5972                      uint8_t *dest, int32_t count,
5973                      UErrorCode *status)
5974 {
5975     /* error checking */
5976     if(status==NULL || U_FAILURE(*status)) {
5977         return 0;
5978     }
5979     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5980     if( coll==NULL || iter==NULL ||
5981         state==NULL ||
5982         count<0 || (count>0 && dest==NULL)
5983     ) {
5984         *status=U_ILLEGAL_ARGUMENT_ERROR;
5985         UTRACE_EXIT_STATUS(status);
5986         return 0;
5987     }
5988
5989     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5990                   coll, iter, state[0], state[1], dest, count);
5991
5992     if(count==0) {
5993         /* nothing to do */
5994         UTRACE_EXIT_VALUE(0);
5995         return 0;
5996     }
5997     /** Setting up situation according to the state we got from the previous iteration */
5998     // The state of the iterator from the previous invocation
5999     uint32_t iterState = state[0];
6000     // Has the last iteration ended in the shifted state
6001     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
6002     // What is the current level of the sortkey?
6003     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
6004     // Have we written only one byte from a two byte primary in the previous iteration?
6005     // Also on secondary level - have we finished with the French secondary?
6006     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
6007     // number of bytes in the continuation buffer for French
6008     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
6009     // Number of bytes already written from a bocsu sequence. Since
6010     // the longes bocsu sequence is 4 long, this can be up to 3.
6011     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
6012     // Number of elements that need to be consumed in this iteration because
6013     // the iterator returned UITER_NO_STATE at the end of the last iteration,
6014     // so we had to save the last valid state.
6015     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
6016
6017     /** values that depend on the collator attributes */
6018     // strength of the collator.
6019     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
6020     // maximal level of the partial sortkey. Need to take whether case level is done
6021     int32_t maxLevel = 0;
6022     if(strength < UCOL_TERTIARY) {
6023         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6024             maxLevel = UCOL_PSK_CASE;
6025         } else {
6026             maxLevel = strength;
6027         }
6028     } else {
6029         if(strength == UCOL_TERTIARY) {
6030             maxLevel = UCOL_PSK_TERTIARY;
6031         } else if(strength == UCOL_QUATERNARY) {
6032             maxLevel = UCOL_PSK_QUATERNARY;
6033         } else { // identical
6034             maxLevel = UCOL_IDENTICAL;
6035         }
6036     }
6037     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
6038     uint8_t UCOL_HIRAGANA_QUAD =
6039       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
6040     // Boundary value that decides whether a CE is shifted or not
6041     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
6042     // Are we doing French collation?
6043     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
6044
6045     /** initializing the collation state */
6046     UBool notIsContinuation = FALSE;
6047     uint32_t CE = UCOL_NO_MORE_CES;
6048
6049     collIterate s;
6050     IInit_collIterate(coll, NULL, -1, &s);
6051     s.iterator = iter;
6052     s.flags |= UCOL_USE_ITERATOR;
6053     // This variable tells us whether we have produced some other levels in this iteration
6054     // before we moved to the identical level. In that case, we need to switch the
6055     // type of the iterator.
6056     UBool doingIdenticalFromStart = FALSE;
6057     // Normalizing iterator
6058     // The division for the array length may truncate the array size to
6059     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6060     // for all platforms anyway.
6061     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6062     UNormIterator *normIter = NULL;
6063     // If the normalization is turned on for the collator and we are below identical level
6064     // we will use a FCD normalizing iterator
6065     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
6066         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6067         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
6068         s.flags &= ~UCOL_ITER_NORM;
6069         if(U_FAILURE(*status)) {
6070             UTRACE_EXIT_STATUS(*status);
6071             return 0;
6072         }
6073     } else if(level == UCOL_PSK_IDENTICAL) {
6074         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6075         // will be updating the state - and this cannot be done on an ordinary iterator.
6076         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6077         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6078         s.flags &= ~UCOL_ITER_NORM;
6079         if(U_FAILURE(*status)) {
6080             UTRACE_EXIT_STATUS(*status);
6081             return 0;
6082         }
6083         doingIdenticalFromStart = TRUE;
6084     }
6085
6086     // This is the tentative new state of the iterator. The problem
6087     // is that the iterator might return an undefined state, in
6088     // which case we should save the last valid state and increase
6089     // the iterator skip value.
6090     uint32_t newState = 0;
6091
6092     // First, we set the iterator to the last valid position
6093     // from the last iteration. This was saved in state[0].
6094     if(iterState == 0) {
6095         /* initial state */
6096         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
6097             s.iterator->move(s.iterator, 0, UITER_LIMIT);
6098         } else {
6099             s.iterator->move(s.iterator, 0, UITER_START);
6100         }
6101     } else {
6102         /* reset to previous state */
6103         s.iterator->setState(s.iterator, iterState, status);
6104         if(U_FAILURE(*status)) {
6105             UTRACE_EXIT_STATUS(*status);
6106             return 0;
6107         }
6108     }
6109
6110
6111
6112     // This variable tells us whether we can attempt to update the state
6113     // of iterator. Situations where we don't want to update iterator state
6114     // are the existence of expansion CEs that are not yet processed, and
6115     // finishing the case level without enough space in the buffer to insert
6116     // a level terminator.
6117     UBool canUpdateState = TRUE;
6118
6119     // Consume all the CEs that were consumed at the end of the previous
6120     // iteration without updating the iterator state. On identical level,
6121     // consume the code points.
6122     int32_t counter = cces;
6123     if(level < UCOL_PSK_IDENTICAL) {
6124         while(counter-->0) {
6125             // If we're doing French and we are on the secondary level,
6126             // we go backwards.
6127             if(level == UCOL_PSK_SECONDARY && doingFrench) {
6128                 CE = ucol_IGetPrevCE(coll, &s, status);
6129             } else {
6130                 CE = ucol_IGetNextCE(coll, &s, status);
6131             }
6132             if(CE==UCOL_NO_MORE_CES) {
6133                 /* should not happen */
6134                 *status=U_INTERNAL_PROGRAM_ERROR;
6135                 UTRACE_EXIT_STATUS(*status);
6136                 return 0;
6137             }
6138             if(uprv_numAvailableExpCEs(s)) {
6139                 canUpdateState = FALSE;
6140             }
6141         }
6142     } else {
6143         while(counter-->0) {
6144             uiter_next32(s.iterator);
6145         }
6146     }
6147
6148     // French secondary needs to know whether the iterator state of zero came from previous level OR
6149     // from a new invocation...
6150     UBool wasDoingPrimary = FALSE;
6151     // destination buffer byte counter. When this guy
6152     // gets to count, we're done with the iteration
6153     int32_t i = 0;
6154     // used to count the zero bytes written after we
6155     // have finished with the sort key
6156     int32_t j = 0;
6157
6158
6159     // Hm.... I think we're ready to plunge in. Basic story is as following:
6160     // we have a fall through case based on level. This is used for initial
6161     // positioning on iteration start. Every level processor contains a
6162     // for(;;) which will be broken when we exhaust all the CEs. Other
6163     // way to exit is a goto saveState, which happens when we have filled
6164     // out our buffer.
6165     switch(level) {
6166     case UCOL_PSK_PRIMARY:
6167         wasDoingPrimary = TRUE;
6168         for(;;) {
6169             if(i==count) {
6170                 goto saveState;
6171             }
6172             // We should save the state only if we
6173             // are sure that we are done with the
6174             // previous iterator state
6175             if(canUpdateState && byteCountOrFrenchDone == 0) {
6176                 newState = s.iterator->getState(s.iterator);
6177                 if(newState != UITER_NO_STATE) {
6178                     iterState = newState;
6179                     cces = 0;
6180                 }
6181             }
6182             CE = ucol_IGetNextCE(coll, &s, status);
6183             cces++;
6184             if(CE==UCOL_NO_MORE_CES) {
6185                 // Add the level separator
6186                 terminatePSKLevel(level, maxLevel, i, dest);
6187                 byteCountOrFrenchDone=0;
6188                 // Restart the iteration an move to the
6189                 // second level
6190                 s.iterator->move(s.iterator, 0, UITER_START);
6191                 cces = 0;
6192                 level = UCOL_PSK_SECONDARY;
6193                 break;
6194             }
6195             if(!isShiftedCE(CE, LVT, &wasShifted)) {
6196                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6197                 if(CE != 0) {
6198                     if(byteCountOrFrenchDone == 0) {
6199                         // get the second byte of primary
6200                         dest[i++]=(uint8_t)(CE >> 8);
6201                     } else {
6202                         byteCountOrFrenchDone = 0;
6203                     }
6204                     if((CE &=0xff)!=0) {
6205                         if(i==count) {
6206                             /* overflow */
6207                             byteCountOrFrenchDone = 1;
6208                             cces--;
6209                             goto saveState;
6210                         }
6211                         dest[i++]=(uint8_t)CE;
6212                     }
6213                 }
6214             }
6215             if(uprv_numAvailableExpCEs(s)) {
6216                 canUpdateState = FALSE;
6217             } else {
6218                 canUpdateState = TRUE;
6219             }
6220         }
6221         /* fall through to next level */
6222     case UCOL_PSK_SECONDARY:
6223         if(strength >= UCOL_SECONDARY) {
6224             if(!doingFrench) {
6225                 for(;;) {
6226                     if(i == count) {
6227                         goto saveState;
6228                     }
6229                     // We should save the state only if we
6230                     // are sure that we are done with the
6231                     // previous iterator state
6232                     if(canUpdateState) {
6233                         newState = s.iterator->getState(s.iterator);
6234                         if(newState != UITER_NO_STATE) {
6235                             iterState = newState;
6236                             cces = 0;
6237                         }
6238                     }
6239                     CE = ucol_IGetNextCE(coll, &s, status);
6240                     cces++;
6241                     if(CE==UCOL_NO_MORE_CES) {
6242                         // Add the level separator
6243                         terminatePSKLevel(level, maxLevel, i, dest);
6244                         byteCountOrFrenchDone = 0;
6245                         // Restart the iteration an move to the
6246                         // second level
6247                         s.iterator->move(s.iterator, 0, UITER_START);
6248                         cces = 0;
6249                         level = UCOL_PSK_CASE;
6250                         break;
6251                     }
6252                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
6253                         CE >>= 8; /* get secondary */
6254                         if(CE != 0) {
6255                             dest[i++]=(uint8_t)CE;
6256                         }
6257                     }
6258                     if(uprv_numAvailableExpCEs(s)) {
6259                         canUpdateState = FALSE;
6260                     } else {
6261                         canUpdateState = TRUE;
6262                     }
6263                 }
6264             } else { // French secondary processing
6265                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
6266                 int32_t frenchIndex = 0;
6267                 // Here we are going backwards.
6268                 // If the iterator is at the beggining, it should be
6269                 // moved to end.
6270                 if(wasDoingPrimary) {
6271                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
6272                     cces = 0;
6273                 }
6274                 for(;;) {
6275                     if(i == count) {
6276                         goto saveState;
6277                     }
6278                     if(canUpdateState) {
6279                         newState = s.iterator->getState(s.iterator);
6280                         if(newState != UITER_NO_STATE) {
6281                             iterState = newState;
6282                             cces = 0;
6283                         }
6284                     }
6285                     CE = ucol_IGetPrevCE(coll, &s, status);
6286                     cces++;
6287                     if(CE==UCOL_NO_MORE_CES) {
6288                         // Add the level separator
6289                         terminatePSKLevel(level, maxLevel, i, dest);
6290                         byteCountOrFrenchDone = 0;
6291                         // Restart the iteration an move to the next level
6292                         s.iterator->move(s.iterator, 0, UITER_START);
6293                         level = UCOL_PSK_CASE;
6294                         break;
6295                     }
6296                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6297                         // reverse when we get a first non-continuation CE.
6298                         CE >>= 8;
6299                         frenchBuff[frenchIndex++] = (uint8_t)CE;
6300                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6301                         CE >>= 8; /* get secondary */
6302                         if(!frenchIndex) {
6303                             if(CE != 0) {
6304                                 dest[i++]=(uint8_t)CE;
6305                             }
6306                         } else {
6307                             frenchBuff[frenchIndex++] = (uint8_t)CE;
6308                             frenchIndex -= usedFrench;
6309                             usedFrench = 0;
6310                             while(i < count && frenchIndex) {
6311                                 dest[i++] = frenchBuff[--frenchIndex];
6312                                 usedFrench++;
6313                             }
6314                         }
6315                     }
6316                     if(uprv_numAvailableExpCEs(s)) {
6317                         canUpdateState = FALSE;
6318                     } else {
6319                         canUpdateState = TRUE;
6320                     }
6321                 }
6322             }
6323         } else {
6324             level = UCOL_PSK_CASE;
6325         }
6326         /* fall through to next level */
6327     case UCOL_PSK_CASE:
6328         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6329             uint32_t caseShift = UCOL_CASE_SHIFT_START;
6330             uint8_t caseByte = UCOL_CASE_BYTE_START;
6331             uint8_t caseBits = 0;
6332
6333             for(;;) {
6334                 if(i == count) {
6335                     goto saveState;
6336                 }
6337                 // We should save the state only if we
6338                 // are sure that we are done with the
6339                 // previous iterator state
6340                 if(canUpdateState) {
6341                     newState = s.iterator->getState(s.iterator);
6342                     if(newState != UITER_NO_STATE) {
6343                         iterState = newState;
6344                         cces = 0;
6345                     }
6346                 }
6347                 CE = ucol_IGetNextCE(coll, &s, status);
6348                 cces++;
6349                 if(CE==UCOL_NO_MORE_CES) {
6350                     // On the case level we might have an unfinished
6351                     // case byte. Add one if it's started.
6352                     if(caseShift != UCOL_CASE_SHIFT_START) {
6353                         dest[i++] = caseByte;
6354                     }
6355                     cces = 0;
6356                     // We have finished processing CEs on this level.
6357                     // However, we don't know if we have enough space
6358                     // to add a case level terminator.
6359                     if(i < count) {
6360                         // Add the level separator
6361                         terminatePSKLevel(level, maxLevel, i, dest);
6362                         // Restart the iteration and move to the
6363                         // next level
6364                         s.iterator->move(s.iterator, 0, UITER_START);
6365                         level = UCOL_PSK_TERTIARY;
6366                     } else {
6367                         canUpdateState = FALSE;
6368                     }
6369                     break;
6370                 }
6371
6372                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6373                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
6374                         // do the case level if we need to do it. We don't want to calculate
6375                         // case level for primary ignorables if we have only primary strength and case level
6376                         // otherwise we would break well formedness of CEs
6377                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6378                         caseBits = (uint8_t)(CE & 0xC0);
6379                         // this copies the case level logic from the
6380                         // sort key generation code
6381                         if(CE != 0) {
6382                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
6383                                 if((caseBits & 0xC0) == 0) {
6384                                     caseByte |= 1 << (--caseShift);
6385                                 } else {
6386                                     caseByte |= 0 << (--caseShift);
6387                                     /* second bit */
6388                                     if(caseShift == 0) {
6389                                         dest[i++] = caseByte;
6390                                         caseShift = UCOL_CASE_SHIFT_START;
6391                                         caseByte = UCOL_CASE_BYTE_START;
6392                                     }
6393                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
6394                                 }
6395                             } else {
6396                                 if((caseBits & 0xC0) == 0) {
6397                                     caseByte |= 0 << (--caseShift);
6398                                 } else {
6399                                     caseByte |= 1 << (--caseShift);
6400                                     /* second bit */
6401                                     if(caseShift == 0) {
6402                                         dest[i++] = caseByte;
6403                                         caseShift = UCOL_CASE_SHIFT_START;
6404                                         caseByte = UCOL_CASE_BYTE_START;
6405                                     }
6406                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
6407                                 }
6408                             }
6409                         }
6410
6411                     }
6412                 }
6413                 // Not sure this is correct for the case level - revisit
6414                 if(uprv_numAvailableExpCEs(s)) {
6415                     canUpdateState = FALSE;
6416                 } else {
6417                     canUpdateState = TRUE;
6418                 }
6419             }
6420         } else {
6421             level = UCOL_PSK_TERTIARY;
6422         }
6423         /* fall through to next level */
6424     case UCOL_PSK_TERTIARY:
6425         if(strength >= UCOL_TERTIARY) {
6426             for(;;) {
6427                 if(i == count) {
6428                     goto saveState;
6429                 }
6430                 // We should save the state only if we
6431                 // are sure that we are done with the
6432                 // previous iterator state
6433                 if(canUpdateState) {
6434                     newState = s.iterator->getState(s.iterator);
6435                     if(newState != UITER_NO_STATE) {
6436                         iterState = newState;
6437                         cces = 0;
6438                     }
6439                 }
6440                 CE = ucol_IGetNextCE(coll, &s, status);
6441                 cces++;
6442                 if(CE==UCOL_NO_MORE_CES) {
6443                     // Add the level separator
6444                     terminatePSKLevel(level, maxLevel, i, dest);
6445                     byteCountOrFrenchDone = 0;
6446                     // Restart the iteration an move to the
6447                     // second level
6448                     s.iterator->move(s.iterator, 0, UITER_START);
6449                     cces = 0;
6450                     level = UCOL_PSK_QUATERNARY;
6451                     break;
6452                 }
6453                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
6454                     notIsContinuation = !isContinuation(CE);
6455
6456                     if(notIsContinuation) {
6457                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6458                         CE ^= coll->caseSwitch;
6459                         CE &= coll->tertiaryMask;
6460                     } else {
6461                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6462                     }
6463
6464                     if(CE != 0) {
6465                         dest[i++]=(uint8_t)CE;
6466                     }
6467                 }
6468                 if(uprv_numAvailableExpCEs(s)) {
6469                     canUpdateState = FALSE;
6470                 } else {
6471                     canUpdateState = TRUE;
6472                 }
6473             }
6474         } else {
6475             // if we're not doing tertiary
6476             // skip to the end
6477             level = UCOL_PSK_NULL;
6478         }
6479         /* fall through to next level */
6480     case UCOL_PSK_QUATERNARY:
6481         if(strength >= UCOL_QUATERNARY) {
6482             for(;;) {
6483                 if(i == count) {
6484                     goto saveState;
6485                 }
6486                 // We should save the state only if we
6487                 // are sure that we are done with the
6488                 // previous iterator state
6489                 if(canUpdateState) {
6490                     newState = s.iterator->getState(s.iterator);
6491                     if(newState != UITER_NO_STATE) {
6492                         iterState = newState;
6493                         cces = 0;
6494                     }
6495                 }
6496                 CE = ucol_IGetNextCE(coll, &s, status);
6497                 cces++;
6498                 if(CE==UCOL_NO_MORE_CES) {
6499                     // Add the level separator
6500                     terminatePSKLevel(level, maxLevel, i, dest);
6501                     //dest[i++] = UCOL_LEVELTERMINATOR;
6502                     byteCountOrFrenchDone = 0;
6503                     // Restart the iteration an move to the
6504                     // second level
6505                     s.iterator->move(s.iterator, 0, UITER_START);
6506                     cces = 0;
6507                     level = UCOL_PSK_QUIN;
6508                     break;
6509                 }
6510                 if(CE==0)
6511                     continue;
6512                 if(isShiftedCE(CE, LVT, &wasShifted)) {
6513                     CE >>= 16; /* get primary */
6514                     if(CE != 0) {
6515                         if(byteCountOrFrenchDone == 0) {
6516                             dest[i++]=(uint8_t)(CE >> 8);
6517                         } else {
6518                             byteCountOrFrenchDone = 0;
6519                         }
6520                         if((CE &=0xff)!=0) {
6521                             if(i==count) {
6522                                 /* overflow */
6523                                 byteCountOrFrenchDone = 1;
6524                                 goto saveState;
6525                             }
6526                             dest[i++]=(uint8_t)CE;
6527                         }
6528                     }
6529                 } else {
6530                     notIsContinuation = !isContinuation(CE);
6531                     if(notIsContinuation) {
6532                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6533                             dest[i++] = UCOL_HIRAGANA_QUAD;
6534                         } else {
6535                             dest[i++] = 0xFF;
6536                         }
6537                     }
6538                 }
6539                 if(uprv_numAvailableExpCEs(s)) {
6540                     canUpdateState = FALSE;
6541                 } else {
6542                     canUpdateState = TRUE;
6543                 }
6544             }
6545         } else {
6546             // if we're not doing quaternary
6547             // skip to the end
6548             level = UCOL_PSK_NULL;
6549         }
6550         /* fall through to next level */
6551     case UCOL_PSK_QUIN:
6552         level = UCOL_PSK_IDENTICAL;
6553         /* fall through to next level */
6554     case UCOL_PSK_IDENTICAL:
6555         if(strength >= UCOL_IDENTICAL) {
6556             UChar32 first, second;
6557             int32_t bocsuBytesWritten = 0;
6558             // We always need to do identical on
6559             // the NFD form of the string.
6560             if(normIter == NULL) {
6561                 // we arrived from the level below and
6562                 // normalization was not turned on.
6563                 // therefore, we need to make a fresh NFD iterator
6564                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6565                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6566             } else if(!doingIdenticalFromStart) {
6567                 // there is an iterator, but we did some other levels.
6568                 // therefore, we have a FCD iterator - need to make
6569                 // a NFD one.
6570                 // normIter being at the beginning does not guarantee
6571                 // that the underlying iterator is at the beginning
6572                 iter->move(iter, 0, UITER_START);
6573                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6574             }
6575             // At this point we have a NFD iterator that is positioned
6576             // in the right place
6577             if(U_FAILURE(*status)) {
6578                 UTRACE_EXIT_STATUS(*status);
6579                 return 0;
6580             }
6581             first = uiter_previous32(s.iterator);
6582             // maybe we're at the start of the string
6583             if(first == U_SENTINEL) {
6584                 first = 0;
6585             } else {
6586                 uiter_next32(s.iterator);
6587             }
6588
6589             j = 0;
6590             for(;;) {
6591                 if(i == count) {
6592                     if(j+1 < bocsuBytesWritten) {
6593                         bocsuBytesUsed = j+1;
6594                     }
6595                     goto saveState;
6596                 }
6597
6598                 // On identical level, we will always save
6599                 // the state if we reach this point, since
6600                 // we don't depend on getNextCE for content
6601                 // all the content is in our buffer and we
6602                 // already either stored the full buffer OR
6603                 // otherwise we won't arrive here.
6604                 newState = s.iterator->getState(s.iterator);
6605                 if(newState != UITER_NO_STATE) {
6606                     iterState = newState;
6607                     cces = 0;
6608                 }
6609
6610                 uint8_t buff[4];
6611                 second = uiter_next32(s.iterator);
6612                 cces++;
6613
6614                 // end condition for identical level
6615                 if(second == U_SENTINEL) {
6616                     terminatePSKLevel(level, maxLevel, i, dest);
6617                     level = UCOL_PSK_NULL;
6618                     break;
6619                 }
6620                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6621                 first = second;
6622
6623                 j = 0;
6624                 if(bocsuBytesUsed != 0) {
6625                     while(bocsuBytesUsed-->0) {
6626                         j++;
6627                     }
6628                 }
6629
6630                 while(i < count && j < bocsuBytesWritten) {
6631                     dest[i++] = buff[j++];
6632                 }
6633             }
6634
6635         } else {
6636             level = UCOL_PSK_NULL;
6637         }
6638         /* fall through to next level */
6639     case UCOL_PSK_NULL:
6640         j = i;
6641         while(j<count) {
6642             dest[j++]=0;
6643         }
6644         break;
6645     default:
6646         *status = U_INTERNAL_PROGRAM_ERROR;
6647         UTRACE_EXIT_STATUS(*status);
6648         return 0;
6649     }
6650
6651 saveState:
6652     // Now we need to return stuff. First we want to see whether we have
6653     // done everything for the current state of iterator.
6654     if(byteCountOrFrenchDone
6655         || canUpdateState == FALSE
6656         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6657     {
6658         // Any of above mean that the previous transaction
6659         // wasn't finished and that we should store the
6660         // previous iterator state.
6661         state[0] = iterState;
6662     } else {
6663         // The transaction is complete. We will continue in the next iteration.
6664         state[0] = s.iterator->getState(s.iterator);
6665         cces = 0;
6666     }
6667     // Store the number of bocsu bytes written.
6668     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6669         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6670     }
6671     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6672
6673     // Next we put in the level of comparison
6674     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6675
6676     // If we are doing French, we need to store whether we have just finished the French level
6677     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6678         state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6679     } else {
6680         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6681     }
6682
6683     // Was the latest CE shifted
6684     if(wasShifted) {
6685         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6686     }
6687     // Check for cces overflow
6688     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6689         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6690     }
6691     // Store cces
6692     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6693
6694     // Check for French overflow
6695     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6696         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6697     }
6698     // Store number of bytes written in the French secondary continuation sequence
6699     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6700
6701
6702     // If we have used normalizing iterator, get rid of it
6703     if(normIter != NULL) {
6704         unorm_closeIter(normIter);
6705     }
6706
6707     /* To avoid memory leak, free the offset buffer if necessary. */
6708     freeOffsetBuffer(&s);
6709
6710     // Return number of meaningful sortkey bytes.
6711     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6712                   dest,i, state[0], state[1]);
6713     UTRACE_EXIT_VALUE(i);
6714     return i;
6715 }
6716
6717 /**
6718  * Produce a bound for a given sortkey and a number of levels.
6719  */
6720 U_CAPI int32_t U_EXPORT2
6721 ucol_getBound(const uint8_t       *source,
6722         int32_t             sourceLength,
6723         UColBoundMode       boundType,
6724         uint32_t            noOfLevels,
6725         uint8_t             *result,
6726         int32_t             resultLength,
6727         UErrorCode          *status)
6728 {
6729     // consistency checks
6730     if(status == NULL || U_FAILURE(*status)) {
6731         return 0;
6732     }
6733     if(source == NULL) {
6734         *status = U_ILLEGAL_ARGUMENT_ERROR;
6735         return 0;
6736     }
6737
6738     int32_t sourceIndex = 0;
6739     // Scan the string until we skip enough of the key OR reach the end of the key
6740     do {
6741         sourceIndex++;
6742         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6743             noOfLevels--;
6744         }
6745     } while (noOfLevels > 0
6746         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6747
6748     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6749         && noOfLevels > 0) {
6750             *status = U_SORT_KEY_TOO_SHORT_WARNING;
6751     }
6752
6753
6754     // READ ME: this code assumes that the values for boundType
6755     // enum will not changes. They are set so that the enum value
6756     // corresponds to the number of extra bytes each bound type
6757     // needs.
6758     if(result != NULL && resultLength >= sourceIndex+boundType) {
6759         uprv_memcpy(result, source, sourceIndex);
6760         switch(boundType) {
6761             // Lower bound just gets terminated. No extra bytes
6762         case UCOL_BOUND_LOWER: // = 0
6763             break;
6764             // Upper bound needs one extra byte
6765         case UCOL_BOUND_UPPER: // = 1
6766             result[sourceIndex++] = 2;
6767             break;
6768             // Upper long bound needs two extra bytes
6769         case UCOL_BOUND_UPPER_LONG: // = 2
6770             result[sourceIndex++] = 0xFF;
6771             result[sourceIndex++] = 0xFF;
6772             break;
6773         default:
6774             *status = U_ILLEGAL_ARGUMENT_ERROR;
6775             return 0;
6776         }
6777         result[sourceIndex++] = 0;
6778
6779         return sourceIndex;
6780     } else {
6781         return sourceIndex+boundType+1;
6782     }
6783 }
6784
6785 /****************************************************************************/
6786 /* Following are the functions that deal with the properties of a collator  */
6787 /* there are new APIs and some compatibility APIs                           */
6788 /****************************************************************************/
6789
6790 static inline void
6791 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6792                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
6793 {
6794     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6795     UBool reverseSecondary = FALSE;
6796     if(!isContinuation(CE)) {
6797         tertiary = (uint8_t)((CE & coll->tertiaryMask));
6798         tertiary ^= coll->caseSwitch;
6799         reverseSecondary = TRUE;
6800     } else {
6801         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6802         tertiary &= UCOL_REMOVE_CASE;
6803         reverseSecondary = FALSE;
6804     }
6805
6806     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6807     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6808     primary1 = (uint8_t)(CE >> 8);
6809
6810     if(primary1 != 0) {
6811         coll->latinOneCEs[ch] |= (primary1 << *primShift);
6812         *primShift -= 8;
6813     }
6814     if(primary2 != 0) {
6815         if(*primShift < 0) {
6816             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6817             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6818             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6819             return;
6820         }
6821         coll->latinOneCEs[ch] |= (primary2 << *primShift);
6822         *primShift -= 8;
6823     }
6824     if(secondary != 0) {
6825         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6826             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6827             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6828         } else { // normal case
6829             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6830         }
6831         *secShift -= 8;
6832     }
6833     if(tertiary != 0) {
6834         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6835         *terShift -= 8;
6836     }
6837 }
6838
6839 static inline UBool
6840 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6841     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6842     if(newTable == NULL) {
6843       *status = U_MEMORY_ALLOCATION_ERROR;
6844       coll->latinOneFailed = TRUE;
6845       return FALSE;
6846     }
6847     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6848     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6849     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6850     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6851     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6852     coll->latinOneTableLen = size;
6853     uprv_free(coll->latinOneCEs);
6854     coll->latinOneCEs = newTable;
6855     return TRUE;
6856 }
6857
6858 static UBool
6859 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6860     UBool result = TRUE;
6861     if(coll->latinOneCEs == NULL) {
6862         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6863         if(coll->latinOneCEs == NULL) {
6864             *status = U_MEMORY_ALLOCATION_ERROR;
6865             return FALSE;
6866         }
6867         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6868     }
6869     UChar ch = 0;
6870     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6871     // Check for null pointer
6872     if (U_FAILURE(*status)) {
6873         return FALSE;
6874     }
6875     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6876
6877     int32_t primShift = 24, secShift = 24, terShift = 24;
6878     uint32_t CE = 0;
6879     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6880
6881     // TODO: make safe if you get more than you wanted...
6882     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6883         primShift = 24; secShift = 24; terShift = 24;
6884         if(ch < 0x100) {
6885             CE = coll->latinOneMapping[ch];
6886         } else {
6887             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6888             if(CE == UCOL_NOT_FOUND && coll->UCA) {
6889                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6890             }
6891         }
6892         if(CE < UCOL_NOT_FOUND) {
6893             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6894         } else {
6895             switch (getCETag(CE)) {
6896             case EXPANSION_TAG:
6897             case DIGIT_TAG:
6898                 ucol_setText(it, &ch, 1, status);
6899                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6900                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6901                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6902                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6903                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6904                         break;
6905                     }
6906                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6907                 }
6908                 break;
6909             case CONTRACTION_TAG:
6910                 // here is the trick
6911                 // F2 is contraction. We do something very similar to contractions
6912                 // but have two indices, one in the real contraction table and the
6913                 // other to where we stuffed things. This hopes that we don't have
6914                 // many contractions (this should work for latin-1 tables).
6915                 {
6916                     if((CE & 0x00FFF000) != 0) {
6917                         *status = U_UNSUPPORTED_ERROR;
6918                         goto cleanup_after_failure;
6919                     }
6920
6921                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6922
6923                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6924
6925                     coll->latinOneCEs[ch] = CE;
6926                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6927                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6928
6929                     // We're going to jump into contraction table, pick the elements
6930                     // and use them
6931                     do {
6932                         CE = *(coll->contractionCEs +
6933                             (UCharOffset - coll->contractionIndex));
6934                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6935                             uint32_t size;
6936                             uint32_t i;    /* general counter */
6937                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6938                             size = getExpansionCount(CE);
6939                             //CE = *CEOffset++;
6940                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6941                                 for(i = 0; i<size; i++) {
6942                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6943                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6944                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6945                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6946                                         break;
6947                                     }
6948                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6949                                 }
6950                             } else { /* else, we do */
6951                                 while(*CEOffset != 0) {
6952                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6953                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6954                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6955                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6956                                         break;
6957                                     }
6958                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6959                                 }
6960                             }
6961                             contractionOffset++;
6962                         } else if(CE < UCOL_NOT_FOUND) {
6963                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6964                         } else {
6965                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6966                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6967                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6968                             contractionOffset++;
6969                         }
6970                         UCharOffset++;
6971                         primShift = 24; secShift = 24; terShift = 24;
6972                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6973                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6974                                 goto cleanup_after_failure;
6975                             }
6976                         }
6977                     } while(*UCharOffset != 0xFFFF);
6978                 }
6979                 break;;
6980             case SPEC_PROC_TAG:
6981                 {
6982                     // 0xB7 is a precontext character defined in UCA5.1, a special
6983                     // handle is implemeted in order to save LatinOne table for
6984                     // most locales.
6985                     if (ch==0xb7) {
6986                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6987                     }
6988                     else {
6989                         goto cleanup_after_failure;
6990                     }
6991                 }
6992                 break;
6993             default:
6994                 goto cleanup_after_failure;
6995             }
6996         }
6997     }
6998     // compact table
6999     if(contractionOffset < coll->latinOneTableLen) {
7000         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
7001             goto cleanup_after_failure;
7002         }
7003     }
7004     ucol_closeElements(it);
7005     return result;
7006
7007 cleanup_after_failure:
7008     // status should already be set before arriving here.
7009     coll->latinOneFailed = TRUE;
7010     ucol_closeElements(it);
7011     return FALSE;
7012 }
7013
7014 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
7015     if(U_SUCCESS(*status)) {
7016         if(coll->caseFirst == UCOL_UPPER_FIRST) {
7017             coll->caseSwitch = UCOL_CASE_SWITCH;
7018         } else {
7019             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
7020         }
7021
7022         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
7023             coll->tertiaryMask = UCOL_REMOVE_CASE;
7024             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7025             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
7026             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
7027             coll->tertiaryBottom = UCOL_COMMON_BOT3;
7028         } else {
7029             coll->tertiaryMask = UCOL_KEEP_CASE;
7030             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
7031             if(coll->caseFirst == UCOL_UPPER_FIRST) {
7032                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
7033                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
7034                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
7035             } else {
7036                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7037                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
7038                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
7039             }
7040         }
7041
7042         /* Set the compression values */
7043         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
7044         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
7045         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
7046
7047         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
7048             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
7049         {
7050             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
7051         } else {
7052             coll->sortKeyGen = ucol_calcSortKey;
7053         }
7054         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
7055             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
7056         {
7057             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
7058                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
7059                     //fprintf(stderr, "F");
7060                     coll->latinOneUse = TRUE;
7061                 } else {
7062                     coll->latinOneUse = FALSE;
7063                 }
7064                 if(*status == U_UNSUPPORTED_ERROR) {
7065                     *status = U_ZERO_ERROR;
7066                 }
7067             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7068                 coll->latinOneUse = TRUE;
7069             }
7070         } else {
7071             coll->latinOneUse = FALSE;
7072         }
7073     }
7074 }
7075
7076 U_CAPI uint32_t  U_EXPORT2
7077 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
7078     if(U_FAILURE(*status) || coll == NULL) {
7079         return 0;
7080     }
7081     if(len == -1) {
7082         len = u_strlen(varTop);
7083     }
7084     if(len == 0) {
7085         *status = U_ILLEGAL_ARGUMENT_ERROR;
7086         return 0;
7087     }
7088
7089     collIterate s;
7090     IInit_collIterate(coll, varTop, len, &s);
7091
7092     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
7093
7094     /* here we check if we have consumed all characters */
7095     /* you can put in either one character or a contraction */
7096     /* you shouldn't put more... */
7097     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
7098         *status = U_CE_NOT_FOUND_ERROR;
7099         return 0;
7100     }
7101
7102     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
7103
7104     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
7105         *status = U_PRIMARY_TOO_LONG_ERROR;
7106         return 0;
7107     }
7108     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
7109         coll->variableTopValueisDefault = FALSE;
7110         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
7111     }
7112
7113     /* To avoid memory leak, free the offset buffer if necessary. */
7114     freeOffsetBuffer(&s);
7115
7116     return CE & UCOL_PRIMARYMASK;
7117 }
7118
7119 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
7120     if(U_FAILURE(*status) || coll == NULL) {
7121         return 0;
7122     }
7123     return coll->variableTopValue<<16;
7124 }
7125
7126 U_CAPI void  U_EXPORT2
7127 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
7128     if(U_FAILURE(*status) || coll == NULL) {
7129         return;
7130     }
7131
7132     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
7133         coll->variableTopValueisDefault = FALSE;
7134         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
7135     }
7136 }
7137 /* Attribute setter API */
7138 U_CAPI void  U_EXPORT2
7139 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
7140     if(U_FAILURE(*status) || coll == NULL) {
7141       return;
7142     }
7143     UColAttributeValue oldFrench = coll->frenchCollation;
7144     UColAttributeValue oldCaseFirst = coll->caseFirst;
7145     switch(attr) {
7146     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
7147         if(value == UCOL_ON) {
7148             coll->numericCollation = UCOL_ON;
7149             coll->numericCollationisDefault = FALSE;
7150         } else if (value == UCOL_OFF) {
7151             coll->numericCollation = UCOL_OFF;
7152             coll->numericCollationisDefault = FALSE;
7153         } else if (value == UCOL_DEFAULT) {
7154             coll->numericCollationisDefault = TRUE;
7155             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
7156         } else {
7157             *status = U_ILLEGAL_ARGUMENT_ERROR;
7158         }
7159         break;
7160     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
7161         if(value == UCOL_ON) {
7162             coll->hiraganaQ = UCOL_ON;
7163             coll->hiraganaQisDefault = FALSE;
7164         } else if (value == UCOL_OFF) {
7165             coll->hiraganaQ = UCOL_OFF;
7166             coll->hiraganaQisDefault = FALSE;
7167         } else if (value == UCOL_DEFAULT) {
7168             coll->hiraganaQisDefault = TRUE;
7169             coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
7170         } else {
7171             *status = U_ILLEGAL_ARGUMENT_ERROR;
7172         }
7173         break;
7174     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7175         if(value == UCOL_ON) {
7176             coll->frenchCollation = UCOL_ON;
7177             coll->frenchCollationisDefault = FALSE;
7178         } else if (value == UCOL_OFF) {
7179             coll->frenchCollation = UCOL_OFF;
7180             coll->frenchCollationisDefault = FALSE;
7181         } else if (value == UCOL_DEFAULT) {
7182             coll->frenchCollationisDefault = TRUE;
7183             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7184         } else {
7185             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7186         }
7187         break;
7188     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7189         if(value == UCOL_SHIFTED) {
7190             coll->alternateHandling = UCOL_SHIFTED;
7191             coll->alternateHandlingisDefault = FALSE;
7192         } else if (value == UCOL_NON_IGNORABLE) {
7193             coll->alternateHandling = UCOL_NON_IGNORABLE;
7194             coll->alternateHandlingisDefault = FALSE;
7195         } else if (value == UCOL_DEFAULT) {
7196             coll->alternateHandlingisDefault = TRUE;
7197             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7198         } else {
7199             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7200         }
7201         break;
7202     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7203         if(value == UCOL_LOWER_FIRST) {
7204             coll->caseFirst = UCOL_LOWER_FIRST;
7205             coll->caseFirstisDefault = FALSE;
7206         } else if (value == UCOL_UPPER_FIRST) {
7207             coll->caseFirst = UCOL_UPPER_FIRST;
7208             coll->caseFirstisDefault = FALSE;
7209         } else if (value == UCOL_OFF) {
7210             coll->caseFirst = UCOL_OFF;
7211             coll->caseFirstisDefault = FALSE;
7212         } else if (value == UCOL_DEFAULT) {
7213             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7214             coll->caseFirstisDefault = TRUE;
7215         } else {
7216             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7217         }
7218         break;
7219     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7220         if(value == UCOL_ON) {
7221             coll->caseLevel = UCOL_ON;
7222             coll->caseLevelisDefault = FALSE;
7223         } else if (value == UCOL_OFF) {
7224             coll->caseLevel = UCOL_OFF;
7225             coll->caseLevelisDefault = FALSE;
7226         } else if (value == UCOL_DEFAULT) {
7227             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7228             coll->caseLevelisDefault = TRUE;
7229         } else {
7230             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7231         }
7232         break;
7233     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7234         if(value == UCOL_ON) {
7235             coll->normalizationMode = UCOL_ON;
7236             coll->normalizationModeisDefault = FALSE;
7237         } else if (value == UCOL_OFF) {
7238             coll->normalizationMode = UCOL_OFF;
7239             coll->normalizationModeisDefault = FALSE;
7240         } else if (value == UCOL_DEFAULT) {
7241             coll->normalizationModeisDefault = TRUE;
7242             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7243         } else {
7244             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7245         }
7246         break;
7247     case UCOL_STRENGTH:         /* attribute for strength */
7248         if (value == UCOL_DEFAULT) {
7249             coll->strengthisDefault = TRUE;
7250             coll->strength = (UColAttributeValue)coll->options->strength;
7251         } else if (value <= UCOL_IDENTICAL) {
7252             coll->strengthisDefault = FALSE;
7253             coll->strength = value;
7254         } else {
7255             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7256         }
7257         break;
7258     case UCOL_ATTRIBUTE_COUNT:
7259     default:
7260         *status = U_ILLEGAL_ARGUMENT_ERROR;
7261         break;
7262     }
7263     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7264         coll->latinOneRegenTable = TRUE;
7265     } else {
7266         coll->latinOneRegenTable = FALSE;
7267     }
7268     ucol_updateInternalState(coll, status);
7269 }
7270
7271 U_CAPI UColAttributeValue  U_EXPORT2
7272 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7273     if(U_FAILURE(*status) || coll == NULL) {
7274       return UCOL_DEFAULT;
7275     }
7276     switch(attr) {
7277     case UCOL_NUMERIC_COLLATION:
7278       return coll->numericCollation;
7279     case UCOL_HIRAGANA_QUATERNARY_MODE:
7280       return coll->hiraganaQ;
7281     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7282         return coll->frenchCollation;
7283     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7284         return coll->alternateHandling;
7285     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7286         return coll->caseFirst;
7287     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7288         return coll->caseLevel;
7289     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7290         return coll->normalizationMode;
7291     case UCOL_STRENGTH:         /* attribute for strength */
7292         return coll->strength;
7293     case UCOL_ATTRIBUTE_COUNT:
7294     default:
7295         *status = U_ILLEGAL_ARGUMENT_ERROR;
7296         break;
7297     }
7298     return UCOL_DEFAULT;
7299 }
7300
7301 U_CAPI void U_EXPORT2
7302 ucol_setStrength(    UCollator                *coll,
7303             UCollationStrength        strength)
7304 {
7305     UErrorCode status = U_ZERO_ERROR;
7306     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7307 }
7308
7309 U_CAPI UCollationStrength U_EXPORT2
7310 ucol_getStrength(const UCollator *coll)
7311 {
7312     UErrorCode status = U_ZERO_ERROR;
7313     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7314 }
7315
7316 /****************************************************************************/
7317 /* Following are misc functions                                             */
7318 /* there are new APIs and some compatibility APIs                           */
7319 /****************************************************************************/
7320
7321 U_CAPI void U_EXPORT2
7322 ucol_getVersion(const UCollator* coll,
7323                 UVersionInfo versionInfo)
7324 {
7325     /* RunTime version  */
7326     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7327     /* Builder version*/
7328     uint8_t bdVersion = coll->image->version[0];
7329
7330     /* Charset Version. Need to get the version from cnv files
7331      * makeconv should populate cnv files with version and
7332      * an api has to be provided in ucnv.h to obtain this version
7333      */
7334     uint8_t csVersion = 0;
7335
7336     /* combine the version info */
7337     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7338
7339     /* Tailoring rules */
7340     versionInfo[0] = (uint8_t)(cmbVersion>>8);
7341     versionInfo[1] = (uint8_t)cmbVersion;
7342     versionInfo[2] = coll->image->version[1];
7343     if(coll->UCA) {
7344         versionInfo[3] = coll->UCA->image->UCAVersion[0];
7345     } else {
7346         versionInfo[3] = 0;
7347     }
7348 }
7349
7350
7351 /* This internal API checks whether a character is tailored or not */
7352 U_CAPI UBool  U_EXPORT2
7353 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7354     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
7355         return FALSE;
7356     }
7357
7358     uint32_t CE = UCOL_NOT_FOUND;
7359     const UChar *ContractionStart = NULL;
7360     if(u < 0x100) { /* latin-1 */
7361         CE = coll->latinOneMapping[u];
7362         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7363             return FALSE;
7364         }
7365     } else { /* regular */
7366         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
7367     }
7368
7369     if(isContraction(CE)) {
7370         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7371         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7372     }
7373
7374     return (UBool)(CE != UCOL_NOT_FOUND);
7375 }
7376
7377
7378 /****************************************************************************/
7379 /* Following are the string compare functions                               */
7380 /*                                                                          */
7381 /****************************************************************************/
7382
7383
7384 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
7385 /*                     Used by strcoll if strength == identical and strings  */
7386 /*                     are otherwise equal.  Moved out-of-line because this  */
7387 /*                     is a rare case.                                       */
7388 /*                                                                           */
7389 /*                     Comparison must be done on NFD normalized strings.    */
7390 /*                     FCD is not good enough.                               */
7391 /*                                                                           */
7392 /*      TODO:  make an incremental NFD Comparison function, which could      */
7393 /*             be of general use                                             */
7394
7395 static
7396 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7397 {
7398
7399   // TODO: When we have an UChar iterator, we need to access the whole string. One
7400   // useful modification would be a UChar iterator extract API, since reset next next...
7401   // is not optimal.
7402   // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7403
7404   // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7405   // of same type, but that doesn't really mean that it will stay that way.
7406
7407     // The division for the array length may truncate the array size to
7408     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7409     // for all platforms anyway.
7410     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7411     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7412     //UChar sStackBuf[256], tStackBuf[256];
7413     //int32_t sBufSize = 256, tBufSize = 256;
7414     int32_t            comparison;
7415     int32_t          sLen        = 0;
7416     UChar            *sBuf       = NULL;
7417     int32_t          tLen        = 0;
7418     UChar            *tBuf       = NULL;
7419     UBool freeSBuf = FALSE, freeTBuf = FALSE;
7420
7421     if (sColl->flags & UCOL_USE_ITERATOR) {
7422         UNormIterator *sNIt = NULL, *tNIt = NULL;
7423         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7424         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7425         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7426         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7427         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7428         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7429         comparison = u_strCompareIter(sIt, tIt, TRUE);
7430         unorm_closeIter(sNIt);
7431         unorm_closeIter(tNIt);
7432     } else {
7433         sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7434         sBuf = sColl->string;
7435         tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7436         tBuf = tColl->string;
7437
7438         if (normalize) {
7439             *status = U_ZERO_ERROR;
7440             if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7441                 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7442                                     sBuf, sLen,
7443                                     FALSE, 0,
7444                                     status);
7445                 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7446                     if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7447                         &sColl->writableBuffer,
7448                         (int32_t *)&sColl->writableBufSize, sLen,
7449                         0)
7450                         )
7451                     {
7452                         *status = U_MEMORY_ALLOCATION_ERROR;
7453                         return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7454                     }
7455                     *status = U_ZERO_ERROR;
7456                     sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7457                                         sBuf, sLen,
7458                                         FALSE, 0,
7459                                         status);
7460                 }
7461                 if(freeSBuf) {
7462                     uprv_free(sBuf);
7463                     freeSBuf = FALSE;
7464                 }
7465                 sBuf = sColl->writableBuffer;
7466                 if (sBuf != sColl->stackWritableBuffer) {
7467                     sColl->flags |= UCOL_ITER_ALLOCATED;
7468                 }
7469             }
7470
7471             *status = U_ZERO_ERROR;
7472             if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7473                 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7474                                     tBuf, tLen,
7475                                     FALSE, 0,
7476                                     status);
7477                 if(*status == U_BUFFER_OVERFLOW_ERROR) {
7478                     if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7479                         &tColl->writableBuffer,
7480                         (int32_t *)&tColl->writableBufSize, tLen,
7481                         0)
7482                         )
7483                     {
7484                         *status = U_MEMORY_ALLOCATION_ERROR;
7485                         return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7486                     }
7487                     *status = U_ZERO_ERROR;
7488                     tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7489                                         tBuf, tLen,
7490                                         FALSE, 0,
7491                                         status);
7492                 }
7493                 if(freeTBuf) {
7494                     uprv_free(tBuf);
7495                     freeTBuf = FALSE;
7496                 }
7497                 tBuf = tColl->writableBuffer;
7498                 if (tBuf != tColl->stackWritableBuffer) {
7499                     tColl->flags |= UCOL_ITER_ALLOCATED;
7500                 }
7501             }
7502         }
7503
7504         if (sLen == -1 && tLen == -1) {
7505             comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7506         } else {
7507             if (sLen == -1) {
7508                 sLen = u_strlen(sBuf);
7509             }
7510             if (tLen == -1) {
7511                 tLen = u_strlen(tBuf);
7512             }
7513             comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7514             if (comparison == 0) {
7515                 comparison = sLen - tLen;
7516             }
7517         }
7518     }
7519
7520     if (comparison < 0) {
7521         return UCOL_LESS;
7522     } else if (comparison == 0) {
7523         return UCOL_EQUAL;
7524     } else /* comparison > 0 */ {
7525         return UCOL_GREATER;
7526     }
7527 }
7528
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */

/* number of CEs that fit into the array embedded in a ucol_CEBuf */
#define UCOL_CEBUF_SIZE 512

struct ucol_CEBuf {
    uint32_t    *buf;                           /* start of the storage currently in use */
    uint32_t    *endp;                          /* one past the last usable slot of buf  */
    uint32_t    *pos;                           /* next slot to be written               */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* embedded storage                      */
};
typedef struct ucol_CEBuf ucol_CEBuf;
7539
7540
7541 static
7542 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7543     (b)->buf = (b)->pos = (b)->localArray;
7544     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7545 }
7546
7547 static
7548 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7549     uint32_t  oldSize;
7550     uint32_t  newSize;
7551     uint32_t  *newBuf;
7552
7553     ci->flags |= UCOL_ITER_ALLOCATED;
7554     oldSize = b->pos - b->buf;
7555     newSize = oldSize * 2;
7556     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7557     if(newBuf == NULL) {
7558         *status = U_MEMORY_ALLOCATION_ERROR;
7559     }
7560     else {
7561         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7562         if (b->buf != b->localArray) {
7563             uprv_free(b->buf);
7564         }
7565         b->buf = newBuf;
7566         b->endp = b->buf + newSize;
7567         b->pos  = b->buf + oldSize;
7568     }
7569 }
7570
7571 static
7572 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7573     if (b->pos == b->endp) {
7574         ucol_CEBuf_Expand(b, ci, status);
7575     }
7576     if (U_SUCCESS(*status)) {
7577         *(b)->pos++ = ce;
7578     }
7579 }
7580
7581 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7582 /* It is used when compare gets in trouble and needs to bail out                     */
7583 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7584                                                   collIterate *tColl,
7585                                                   UErrorCode *status)
7586 {
7587     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7588     uint8_t *sourceKeyP = sourceKey;
7589     uint8_t *targetKeyP = targetKey;
7590     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7591     const UCollator *coll = sColl->coll;
7592     UChar *source = NULL;
7593     UChar *target = NULL;
7594     int32_t result = UCOL_EQUAL;
7595     UChar sStackBuf[256], tStackBuf[256];
7596     int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7597     int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7598
7599     // TODO: Handle long strings. Do the same in ucol_checkIdent.
7600     if(sColl->flags & UCOL_USE_ITERATOR) {
7601         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7602         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7603         source = sStackBuf;
7604         UChar *sBufp = source;
7605         target = tStackBuf;
7606         UChar *tBufp = target;
7607         while(sColl->iterator->hasNext(sColl->iterator)) {
7608             *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7609         }
7610         while(tColl->iterator->hasNext(tColl->iterator)) {
7611             *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7612         }
7613         sourceLength = sBufp - source;
7614         targetLength = tBufp - target;
7615     } else { // no iterators
7616         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7617         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7618         source = sColl->string;
7619         target = tColl->string;
7620     }
7621
7622
7623
7624     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7625     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7626         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7627         if(sourceKeyP == NULL) {
7628             *status = U_MEMORY_ALLOCATION_ERROR;
7629             goto cleanup_and_do_compare;
7630         }
7631         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7632     }
7633
7634     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7635     if(targetKeyLen > UCOL_MAX_BUFFER) {
7636         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7637         if(targetKeyP == NULL) {
7638             *status = U_MEMORY_ALLOCATION_ERROR;
7639             goto cleanup_and_do_compare;
7640         }
7641         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7642     }
7643
7644     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7645
7646 cleanup_and_do_compare:
7647     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7648         uprv_free(sourceKeyP);
7649     }
7650
7651     if(targetKeyP != NULL && targetKeyP != targetKey) {
7652         uprv_free(targetKeyP);
7653     }
7654
7655     if(result<0) {
7656         return UCOL_LESS;
7657     } else if(result>0) {
7658         return UCOL_GREATER;
7659     } else {
7660         return UCOL_EQUAL;
7661     }
7662 }
7663
7664
/*
 * ucol_strcollRegular - the regular, level-by-level comparison of two collation iterators.
 *
 * The collator is taken from sColl->coll. During the primary pass the collation
 * elements (CEs) returned by ucol_IGetNextCE for each side are stored in two CE
 * buffers (sCEs, tCEs); the secondary, case, tertiary and quaternary passes then
 * re-read those buffers instead of iterating over the text again. The first level
 * that shows a difference decides the result. With UCOL_IDENTICAL strength a
 * complete tie is handed to ucol_checkIdent.
 *
 * If hiraganaQ is on at quaternary (or higher) strength and alternate handling is
 * shifted, the whole comparison is delegated to ucol_compareUsingSortKeys.
 *
 * sColl, tColl - source and target iterators.
 * status       - only passed on to the helpers/macros used here; this function
 *                never tests it itself.
 * Returns UCOL_LESS, UCOL_GREATER or UCOL_EQUAL (or the delegated helper's result).
 */
7665 static inline UCollationResult
7666 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7667 //              const UCollator    *coll,
7668 //              const UChar        *source,
7669 //              int32_t            sourceLength,
7670 //              const UChar        *target,
7671 //              int32_t            targetLength,
7672               UErrorCode *status)
7673 {
7674     U_ALIGN_CODE(16);
7675
7676     const UCollator *coll = sColl->coll;
7677
7678
7679     // setting up the collator parameters
7680     UColAttributeValue strength = coll->strength;
7681     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7682
7683     UBool checkSecTer = initialCheckSecTer;
7684     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7685     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7686     UBool checkIdent = (strength == UCOL_IDENTICAL);
7687     UBool checkCase = (coll->caseLevel == UCOL_ON);
7688     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7689     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7690     UBool qShifted = shifted && checkQuad;
7691     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7692
          // Hiragana tracking is only implemented in the non-shifted primary loop below
          // (it is commented out in the shifted one), so this combination is delegated.
7693     if(doHiragana && shifted) {
7694         return (ucol_compareUsingSortKeys(sColl, tColl, status));
7695     }
7696     uint8_t caseSwitch = coll->caseSwitch;
7697     uint8_t tertiaryMask = coll->tertiaryMask;
7698
7699     // This is the lowest primary value that will not be ignored if shifted
7700     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7701
7702     UCollationResult result = UCOL_EQUAL;
7703     UCollationResult hirResult = UCOL_EQUAL;
7704
7705     // Preparing the CE buffers. They will be filled during the primary phase
7706     ucol_CEBuf   sCEs;
7707     ucol_CEBuf   tCEs;
7708     UCOL_INIT_CEBUF(&sCEs);
7709     UCOL_INIT_CEBUF(&tCEs);
7710
7711     uint32_t secS = 0, secT = 0;
7712     uint32_t sOrder=0, tOrder=0;
7713
7714     // Non shifted primary processing is quite simple
7715     if(!shifted) {
7716         for(;;) {
7717
7718             // We fetch CEs until we hit a non ignorable primary or end.
7719             do {
7720                 // We get the next CE
7721                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7722                 // Stuff it in the buffer
7723                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7724                 // And keep just the primary part.
7725                 sOrder &= UCOL_PRIMARYMASK;
7726             } while(sOrder == 0);
7727
7728             // see the comments on the above block
7729             do {
7730                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7731                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7732                 tOrder &= UCOL_PRIMARYMASK;
7733             } while(tOrder == 0);
7734
7735             // if both primaries are the same
7736             if(sOrder == tOrder) {
7737                 // and there are no more CEs, we advance to the next level
7738                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7739                     break;
7740                 }
                      // Record only the first position at which exactly one side has
                      // UCOL_WAS_HIRAGANA set; the side with the flag sorts first. The value
                      // is used after the tertiary level if nothing else differed.
7741                 if(doHiragana && hirResult == UCOL_EQUAL) {
7742                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7743                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7744                             ? UCOL_LESS:UCOL_GREATER;
7745                     }
7746                 }
7747             } else {
7748                 // if two primaries are different, we are done
7749                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7750                 goto commonReturn;
7751             }
7752         } // no primary difference... do the rest from the buffers
7753     } else { // shifted - do a slightly more complicated processing :)
7754         for(;;) {
7755             UBool sInShifted = FALSE;
7756             UBool tInShifted = FALSE;
7757             // This version of code can be refactored. However, it seems easier to understand this way.
7758             // Source loop. Same as the target loop.
                  // Each fetched CE is handled as one of:
                  //  - end marker: stored, inner loop ends
                  //  - zero CE, or CE without a primary while in a shifted run: dropped
                  //  - continuation with a primary: inside a shifted run it is stored reduced to
                  //    its primary bits plus 0xC0 and fetching goes on; otherwise it is stored
                  //    and becomes the primary to compare
                  //  - continuation without a primary: dropped inside a shifted run, stored otherwise
                  //  - regular CE with primary above LVT: stored and becomes the primary to compare
                  //  - regular CE with primary in (0, LVT]: starts a shifted run, stored reduced
                  //    to its primary bits
                  //  - regular CE without a primary: stored, ends the shifted run
7759             for(;;) {
7760                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7761                 if(sOrder == UCOL_NO_MORE_CES) {
7762                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7763                     break;
7764                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7765                     /* UCA amendment - ignore ignorables that follow shifted code points */
7766                     continue;
7767                 } else if(isContinuation(sOrder)) {
7768                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7769                         if(sInShifted) {
                                  // NOTE(review): 0xC0 is presumably the continuation marker bits - confirm
7770                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7771                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7772                             continue;
7773                         } else {
7774                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7775                             break;
7776                         }
7777                     } else { /* Just lower level values */
7778                         if(sInShifted) {
7779                             continue;
7780                         } else {
7781                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7782                             continue;
7783                         }
7784                     }
7785                 } else { /* regular */
7786                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7787                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7788                         break;
7789                     } else {
7790                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
7791                             sInShifted = TRUE;
7792                             sOrder &= UCOL_PRIMARYMASK;
7793                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7794                             continue;
7795                         } else {
7796                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7797                             sInShifted = FALSE;
7798                             continue;
7799                         }
7800                     }
7801                 }
7802             }
7803             sOrder &= UCOL_PRIMARYMASK;
7804             sInShifted = FALSE;
7805
                  // Target loop: identical to the source loop above, operating on tColl/tCEs.
7806             for(;;) {
7807                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7808                 if(tOrder == UCOL_NO_MORE_CES) {
7809                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7810                     break;
7811                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7812                     /* UCA amendment - ignore ignorables that follow shifted code points */
7813                     continue;
7814                 } else if(isContinuation(tOrder)) {
7815                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7816                         if(tInShifted) {
7817                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7818                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7819                             continue;
7820                         } else {
7821                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7822                             break;
7823                         }
7824                     } else { /* Just lower level values */
7825                         if(tInShifted) {
7826                             continue;
7827                         } else {
7828                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7829                             continue;
7830                         }
7831                     }
7832                 } else { /* regular */
7833                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7834                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7835                         break;
7836                     } else {
7837                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
7838                             tInShifted = TRUE;
7839                             tOrder &= UCOL_PRIMARYMASK;
7840                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7841                             continue;
7842                         } else {
7843                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7844                             tInShifted = FALSE;
7845                             continue;
7846                         }
7847                     }
7848                 }
7849             }
7850             tOrder &= UCOL_PRIMARYMASK;
7851             tInShifted = FALSE;
7852
7853             if(sOrder == tOrder) {
7854                 /*
7855                 if(doHiragana && hirResult == UCOL_EQUAL) {
7856                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7857                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7858                 ? UCOL_LESS:UCOL_GREATER;
7859                 }
7860                 }
7861                 */
7862                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7863                     break;
7864                 } else {
7865                     sOrder = 0;
7866                     tOrder = 0;
7867                     continue;
7868                 }
7869             } else {
7870                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7871                 goto commonReturn;
7872             }
7873         } /* no primary difference... do the rest from the buffers */
7874     }
7875
7876     /* now, we're gonna reexamine collected CEs */
7877     uint32_t    *sCE;
7878     uint32_t    *tCE;
7879
7880     /* This is the secondary level of comparison */
7881     if(checkSecTer) {
7882         if(!isFrenchSec) { /* normal */
7883             sCE = sCEs.buf;
7884             tCE = tCEs.buf;
7885             for(;;) {
                      // No bounds check in these two scans: they stop only at a CE with a
                      // non-zero secondary. NOTE(review): presumably the stored end marker
                      // guarantees that (cf. UCOL_NO_MORE_CES_SECONDARY below) - confirm.
7886                 while (secS == 0) {
7887                     secS = *(sCE++) & UCOL_SECONDARYMASK;
7888                 }
7889
7890                 while(secT == 0) {
7891                     secT = *(tCE++) & UCOL_SECONDARYMASK;
7892                 }
7893
7894                 if(secS == secT) {
7895                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7896                         break;
7897                     } else {
7898                         secS = 0; secT = 0;
7899                         continue;
7900                     }
7901                 } else {
7902                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7903                     goto commonReturn;
7904                 }
7905             }
7906         } else { /* do the French */
                  // French secondaries are compared from the end of the buffers towards the
                  // start. A run of continuation CEs met on the way back is rewound to its
                  // first CE and then read forwards (xCESave != 0 marks that mode) before the
                  // backward walk resumes in front of the run.
                  // NOTE(review): starting at pos-2 presumably skips the end marker at pos-1 - confirm.
7907             uint32_t *sCESave = NULL;
7908             uint32_t *tCESave = NULL;
7909             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7910             tCE = tCEs.pos-2;
7911             for(;;) {
7912                 while (secS == 0 && sCE >= sCEs.buf) {
7913                     if(sCESave == 0) {
7914                         secS = *(sCE--);
7915                         if(isContinuation(secS)) {
7916                             while(isContinuation(secS = *(sCE--)))
7917                                 ;
7918                             /* after this, secS has the start of continuation, and sCE points before that */
7919                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7920                             sCE+=2;  /* need to point to the first continuation CP */
7921                             /* However, now you can just continue doing stuff */
7922                         }
7923                     } else {
7924                         secS = *(sCE++);
7925                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
7926                             sCE = sCESave;            /* reset the pointer to before continuation */
7927                             sCESave = 0;
7928                             continue;
7929                         }
7930                     }
7931                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7932                 }
7933
7934                 while(secT == 0 && tCE >= tCEs.buf) {
7935                     if(tCESave == 0) {
7936                         secT = *(tCE--);
7937                         if(isContinuation(secT)) {
7938                             while(isContinuation(secT = *(tCE--)))
7939                                 ;
7940                             /* after this, secT has the start of continuation, and tCE points before that */
7941                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7942                             tCE+=2;  /* need to point to the first continuation CP */
7943                             /* However, now you can just continue doing stuff */
7944                         }
7945                     } else {
7946                         secT = *(tCE++);
7947                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
7948                             tCE = tCESave;          /* reset the pointer to before continuation */
7949                             tCESave = 0;
7950                             continue;
7951                         }
7952                     }
7953                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7954                 }
7955
7956                 if(secS == secT) {
                          // done when the end marker is seen or both walks have run off the front
7957                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7958                         break;
7959                     } else {
7960                         secS = 0; secT = 0;
7961                         continue;
7962                     }
7963                 } else {
7964                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7965                     goto commonReturn;
7966                 }
7967             }
7968         }
7969     }
7970
7971     /* doing the case bit */
          // NOTE(review): secS/secT are not reset before this pass (the tertiary pass below
          // does reset them); this relies on whatever the secondary pass left in them having
          // no bits inside UCOL_REMOVE_CASE - confirm.
7972     if(checkCase) {
7973         sCE = sCEs.buf;
7974         tCE = tCEs.buf;
7975         for(;;) {
                  // continuation CEs contribute nothing on the case level (treated as 0)
7976             while((secS & UCOL_REMOVE_CASE) == 0) {
7977                 if(!isContinuation(*sCE++)) {
7978                     secS =*(sCE-1);
7979                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7980                         // primary ignorables should not be considered on the case level when the strength is primary
7981                         // otherwise, the CEs stop being well-formed
7982                         secS &= UCOL_TERT_CASE_MASK;
7983                         secS ^= caseSwitch;
7984                     } else {
7985                         secS = 0;
7986                     }
7987                 } else {
7988                     secS = 0;
7989                 }
7990             }
7991
7992             while((secT & UCOL_REMOVE_CASE) == 0) {
7993                 if(!isContinuation(*tCE++)) {
7994                     secT = *(tCE-1);
7995                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7996                         // primary ignorables should not be considered on the case level when the strength is primary
7997                         // otherwise, the CEs stop being well-formed
7998                         secT &= UCOL_TERT_CASE_MASK;
7999                         secT ^= caseSwitch;
8000                     } else {
8001                         secT = 0;
8002                     }
8003                 } else {
8004                     secT = 0;
8005                 }
8006             }
8007
                  // only the case bits are compared here; the remaining tertiary bits are
                  // used solely to detect the end marker
8008             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
8009                 result = UCOL_LESS;
8010                 goto commonReturn;
8011             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
8012                 result = UCOL_GREATER;
8013                 goto commonReturn;
8014             }
8015
8016             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
8017                 break;
8018             } else {
8019                 secS = 0;
8020                 secT = 0;
8021             }
8022         }
8023     }
8024
8025     /* Tertiary level */
8026     if(checkTertiary) {
8027         secS = 0;
8028         secT = 0;
8029         sCE = sCEs.buf;
8030         tCE = tCEs.buf;
8031         for(;;) {
                  // caseSwitch is applied to regular CEs only; continuations just lose their case bits
8032             while((secS & UCOL_REMOVE_CASE) == 0) {
8033                 secS = *(sCE++) & tertiaryMask;
8034                 if(!isContinuation(secS)) {
8035                     secS ^= caseSwitch;
8036                 } else {
8037                     secS &= UCOL_REMOVE_CASE;
8038                 }
8039             }
8040
8041             while((secT & UCOL_REMOVE_CASE)  == 0) {
8042                 secT = *(tCE++) & tertiaryMask;
8043                 if(!isContinuation(secT)) {
8044                     secT ^= caseSwitch;
8045                 } else {
8046                     secT &= UCOL_REMOVE_CASE;
8047                 }
8048             }
8049
8050             if(secS == secT) {
                      // NOTE(review): the literal 1 is presumably the tertiary weight of the end
                      // marker (the case pass uses UCOL_NO_MORE_CES_TERTIARY for this) - confirm.
8051                 if((secS & UCOL_REMOVE_CASE) == 1) {
8052                     break;
8053                 } else {
8054                     secS = 0; secT = 0;
8055                     continue;
8056                 }
8057             } else {
8058                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8059                 goto commonReturn;
8060             }
8061         }
8062     }
8063
8064
          // Quaternary level (shifted only): a CE that is above LVT or has no primary
          // contributes UCOL_PRIMARYMASK, any other regular CE contributes its own primary.
8065     if(qShifted /*checkQuad*/) {
8066         UBool sInShifted = TRUE;
8067         UBool tInShifted = TRUE;
8068         secS = 0;
8069         secT = 0;
8070         sCE = sCEs.buf;
8071         tCE = tCEs.buf;
8072         for(;;) {
                  // && binds tighter than ||: (secS == 0 && secS != UCOL_NO_MORE_CES) || (continuation outside a shifted run)
8073             while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
8074                 secS = *(sCE++);
8075                 if(isContinuation(secS)) {
8076                     if(!sInShifted) {
8077                         continue;
8078                     }
8079                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
8080                     secS = UCOL_PRIMARYMASK;
8081                     sInShifted = FALSE;
8082                 } else {
8083                     sInShifted = TRUE;
8084                 }
8085             }
8086             secS &= UCOL_PRIMARYMASK;
8087
8088
8089             while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
8090                 secT = *(tCE++);
8091                 if(isContinuation(secT)) {
8092                     if(!tInShifted) {
8093                         continue;
8094                     }
8095                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
8096                     secT = UCOL_PRIMARYMASK;
8097                     tInShifted = FALSE;
8098                 } else {
8099                     tInShifted = TRUE;
8100                 }
8101             }
8102             secT &= UCOL_PRIMARYMASK;
8103
8104             if(secS == secT) {
8105                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
8106                     break;
8107                 } else {
8108                     secS = 0; secT = 0;
8109                     continue;
8110                 }
8111             } else {
8112                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8113                 goto commonReturn;
8114             }
8115         }
8116     } else if(doHiragana && hirResult != UCOL_EQUAL) {
8117         // If we're fine on quaternaries, we might be different
8118         // on Hiragana. This, however, might fail us in shifted.
8119         result = hirResult;
8120         goto commonReturn;
8121     }
8122
8123     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
8124     /*  as a tiebreaker if all else is equal.                                */
8125     /*  Getting here  should be quite rare - strings are not identical -     */
8126     /*     that is checked first, but compared == through all other checks.  */
8127     if(checkIdent)
8128     {
8129         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8130         result = ucol_checkIdent(sColl, tColl, TRUE, status);
8131     }
8132
8133 commonReturn:
          // Heap cleanup happens only when one of the iterators carries UCOL_ITER_ALLOCATED.
          // NOTE(review): presumably that flag is also set whenever a CE buffer is grown onto
          // the heap (see UCOL_CEBUF_PUT); otherwise the CE buffers would leak here - confirm.
8134     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
8135         freeHeapWritableBuffer(sColl);
8136         freeHeapWritableBuffer(tColl);
8137
8138         if (sCEs.buf != sCEs.localArray ) {
8139             uprv_free(sCEs.buf);
8140         }
8141         if (tCEs.buf != tCEs.localArray ) {
8142             uprv_free(tCEs.buf);
8143         }
8144     }
8145
8146     return result;
8147 }
8148
8149
8150 /**
8151  * Resolves a Latin-1 fast-path contraction CE against the code unit at s[*index].
8152  * On a match, *index is advanced past the matched unit and the contracted entry of
8153  * coll->latinOneCEs (in the row selected by strength) is returned. Otherwise the
8154  * plain, non-contracted entry is returned, or UCOL_BAIL_OUT_CE when the unit has
8155  * any of the bits 0xFF00 set. Units whose trie value is zero are consumed and the
8156  * lookup is retried with the following unit.
8157  * len == -1 means s is NUL-terminated; otherwise len is the length of s.
8158  */
8159 static inline uint32_t
8160 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
8161                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
8162 {
8163     // candidate code units for this contraction; entry 0 is never compared
8164     const UChar *candidates = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8165     // slot of the non-contracted CE within a row, and start of the row for strength
8166     const int32_t plainSlot = (CE & 0x00FFF000) >> 12;
8167     const int32_t rowStart = strength*coll->latinOneTableLen;
8168     int32_t pick = 1;  // keeps its value when an ignorable is skipped below
8169
8170     for(;;) {
8171         // end of text (NUL for len == -1, otherwise index == len): no contraction
8172         if((len == -1) ? (s[*index] == 0) : (*index == len)) {
8173             return coll->latinOneCEs[rowStart+plainSlot];
8174         }
8175         const UChar unit = s[*index];
8176
8177         // the candidates are expected to be ordered, so skip all smaller ones
8178         UChar candidate;
8179         while((candidate = candidates[pick]) < unit) {
8180             pick++;
8181         }
8182
8183         if(candidate == unit) {
8184             (*index)++;
8185             return coll->latinOneCEs[rowStart+plainSlot+pick];
8186         }
8187         if(unit & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8188             return UCOL_BAIL_OUT_CE;
8189         }
8190         // a zero trie value marks a completely ignorable unit: consume it and retry
8191         if(UTRIE_GET32_FROM_LEAD(&coll->mapping, unit) != 0) {
8192             return coll->latinOneCEs[rowStart+plainSlot];
8193         }
8194         (*index)++;
8195     }
8196     // not reached: every path out of the loop above returns
8197 }
8198
8199
8200 /**
8201  * This is a fast strcoll, geared towards text in Latin-1.
8202  * It supports contractions of size two, French secondaries
8203  * and case switching. You can use it with strengths primary
8204  * to tertiary. It does not support shifted and case level.
8205  * It relies on the table built by setupLatin1Table. If it
8206  * doesn't understand something, it will go to the regular
8207  * strcoll.
8208  */
8209 static inline UCollationResult
8210 ucol_strcollUseLatin1( const UCollator    *coll,
8211               const UChar        *source,
8212               int32_t            sLen,
8213               const UChar        *target,
8214               int32_t            tLen,
8215               UErrorCode *status)
8216 {
8217     U_ALIGN_CODE(16);
8218     int32_t strength = coll->strength;
8219
8220     int32_t sIndex = 0, tIndex = 0;
8221     UChar sChar = 0, tChar = 0;
8222     uint32_t sOrder=0, tOrder=0;
8223
8224     UBool endOfSource = FALSE;
8225
8226     uint32_t *elements = coll->latinOneCEs;
8227
8228     UBool haveContractions = FALSE; // if we have contractions in our string
8229                                     // we cannot do French secondary
8230
8231     // Do the primary level
8232     for(;;) {
8233         while(sOrder==0) { // this loop skips primary ignorables
8234             // sOrder=getNextlatinOneCE(source);
8235             if(sLen==-1) {   // handling zero terminated strings
8236                 sChar=source[sIndex++];
8237                 if(sChar==0) {
8238                     endOfSource = TRUE;
8239                     break;
8240                 }
8241             } else {        // handling strings with known length
8242                 if(sIndex==sLen) {
8243                     endOfSource = TRUE;
8244                     break;
8245                 }
8246                 sChar=source[sIndex++];
8247             }
8248             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8249                 //fprintf(stderr, "R");
8250                 goto returnRegular;
8251                 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8252             }
8253             sOrder = elements[sChar];
8254             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8255                 // specials can basically be either contractions or bail-out signs. If we get anything
8256                 // else, we'll bail out anywasy
8257                 if(getCETag(sOrder) == CONTRACTION_TAG) {
8258                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8259                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8260                     // However, if there are contractions in the table, but we always use just one char,
8261                     // we might be able to do French. This should be checked out.
8262                 }
8263                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8264                     //fprintf(stderr, "S");
8265                     goto returnRegular;
8266                     //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8267                 }
8268             }
8269         }
8270
8271         while(tOrder==0) {  // this loop skips primary ignorables
8272             // tOrder=getNextlatinOneCE(target);
8273             if(tLen==-1) {    // handling zero terminated strings
8274                 tChar=target[tIndex++];
8275                 if(tChar==0) {
8276                     if(endOfSource) { // this is different than source loop,
8277                         // as we already know that source loop is done here,
8278                         // so we can either finish the primary loop if both
8279                         // strings are done or anounce the result if only
8280                         // target is done. Same below.
8281                         goto endOfPrimLoop;
8282                     } else {
8283                         return UCOL_GREATER;
8284                     }
8285                 }
8286             } else {          // handling strings with known length
8287                 if(tIndex==tLen) {
8288                     if(endOfSource) {
8289                         goto endOfPrimLoop;
8290                     } else {
8291                         return UCOL_GREATER;
8292                     }
8293                 }
8294                 tChar=target[tIndex++];
8295             }
8296             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8297                 //fprintf(stderr, "R");
8298                 goto returnRegular;
8299                 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8300             }
8301             tOrder = elements[tChar];
8302             if(tOrder >= UCOL_NOT_FOUND) {
8303                 // Handling specials, see the comments for source
8304                 if(getCETag(tOrder) == CONTRACTION_TAG) {
8305                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8306                     haveContractions = TRUE;
8307                 }
8308                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8309                     //fprintf(stderr, "S");
8310                     goto returnRegular;
8311                     //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8312                 }
8313             }
8314         }
8315         if(endOfSource) { // source is finished, but target is not, say the result.
8316             return UCOL_LESS;
8317         }
8318
8319         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8320             sOrder = 0; tOrder = 0;
8321             continue;
8322         } else {
8323             // compare current top bytes
8324             if(((sOrder^tOrder)&0xFF000000)!=0) {
8325                 // top bytes differ, return difference
8326                 if(sOrder < tOrder) {
8327                     return UCOL_LESS;
8328                 } else if(sOrder > tOrder) {
8329                     return UCOL_GREATER;
8330                 }
8331                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8332                 // since we must return enum value
8333             }
8334
8335             // top bytes match, continue with following bytes
8336             sOrder<<=8;
8337             tOrder<<=8;
8338         }
8339     }
8340
8341 endOfPrimLoop:
8342     // after primary loop, we definitely know the sizes of strings,
8343     // so we set it and use simpler loop for secondaries and tertiaries
8344     sLen = sIndex; tLen = tIndex;
8345     if(strength >= UCOL_SECONDARY) {
8346         // adjust the table beggining
8347         elements += coll->latinOneTableLen;
8348         endOfSource = FALSE;
8349
8350         if(coll->frenchCollation == UCOL_OFF) { // non French
8351             // This loop is a simplified copy of primary loop
8352             // at this point we know that whole strings are latin-1, so we don't
8353             // check for that. We also know that we only have contractions as
8354             // specials.
8355             sIndex = 0; tIndex = 0;
8356             for(;;) {
8357                 while(sOrder==0) {
8358                     if(sIndex==sLen) {
8359                         endOfSource = TRUE;
8360                         break;
8361                     }
8362                     sChar=source[sIndex++];
8363                     sOrder = elements[sChar];
8364                     if(sOrder > UCOL_NOT_FOUND) {
8365                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8366                     }
8367                 }
8368
8369                 while(tOrder==0) {
8370                     if(tIndex==tLen) {
8371                         if(endOfSource) {
8372                             goto endOfSecLoop;
8373                         } else {
8374                             return UCOL_GREATER;
8375                         }
8376                     }
8377                     tChar=target[tIndex++];
8378                     tOrder = elements[tChar];
8379                     if(tOrder > UCOL_NOT_FOUND) {
8380                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8381                     }
8382                 }
8383                 if(endOfSource) {
8384                     return UCOL_LESS;
8385                 }
8386
8387                 if(sOrder == tOrder) {
8388                     sOrder = 0; tOrder = 0;
8389                     continue;
8390                 } else {
8391                     // see primary loop for comments on this
8392                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8393                         if(sOrder < tOrder) {
8394                             return UCOL_LESS;
8395                         } else if(sOrder > tOrder) {
8396                             return UCOL_GREATER;
8397                         }
8398                     }
8399                     sOrder<<=8;
8400                     tOrder<<=8;
8401                 }
8402             }
8403         } else { // French
8404             if(haveContractions) { // if we have contractions, we have to bail out
8405                 // since we don't really know how to handle them here
8406                 goto returnRegular;
8407                 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8408             }
8409             // For French, we go backwards
8410             sIndex = sLen; tIndex = tLen;
8411             for(;;) {
8412                 while(sOrder==0) {
8413                     if(sIndex==0) {
8414                         endOfSource = TRUE;
8415                         break;
8416                     }
8417                     sChar=source[--sIndex];
8418                     sOrder = elements[sChar];
8419                     // don't even look for contractions
8420                 }
8421
8422                 while(tOrder==0) {
8423                     if(tIndex==0) {
8424                         if(endOfSource) {
8425                             goto endOfSecLoop;
8426                         } else {
8427                             return UCOL_GREATER;
8428                         }
8429                     }
8430                     tChar=target[--tIndex];
8431                     tOrder = elements[tChar];
8432                     // don't even look for contractions
8433                 }
8434                 if(endOfSource) {
8435                     return UCOL_LESS;
8436                 }
8437
8438                 if(sOrder == tOrder) {
8439                     sOrder = 0; tOrder = 0;
8440                     continue;
8441                 } else {
8442                     // see the primary loop for comments
8443                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8444                         if(sOrder < tOrder) {
8445                             return UCOL_LESS;
8446                         } else if(sOrder > tOrder) {
8447                             return UCOL_GREATER;
8448                         }
8449                     }
8450                     sOrder<<=8;
8451                     tOrder<<=8;
8452                 }
8453             }
8454         }
8455     }
8456
8457 endOfSecLoop:
8458     if(strength >= UCOL_TERTIARY) {
8459         // tertiary loop is the same as secondary (except no French)
8460         elements += coll->latinOneTableLen;
8461         sIndex = 0; tIndex = 0;
8462         endOfSource = FALSE;
8463         for(;;) {
8464             while(sOrder==0) {
8465                 if(sIndex==sLen) {
8466                     endOfSource = TRUE;
8467                     break;
8468                 }
8469                 sChar=source[sIndex++];
8470                 sOrder = elements[sChar];
8471                 if(sOrder > UCOL_NOT_FOUND) {
8472                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8473                 }
8474             }
8475             while(tOrder==0) {
8476                 if(tIndex==tLen) {
8477                     if(endOfSource) {
8478                         return UCOL_EQUAL; // if both strings are at the end, they are equal
8479                     } else {
8480                         return UCOL_GREATER;
8481                     }
8482                 }
8483                 tChar=target[tIndex++];
8484                 tOrder = elements[tChar];
8485                 if(tOrder > UCOL_NOT_FOUND) {
8486                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8487                 }
8488             }
8489             if(endOfSource) {
8490                 return UCOL_LESS;
8491             }
8492             if(sOrder == tOrder) {
8493                 sOrder = 0; tOrder = 0;
8494                 continue;
8495             } else {
8496                 if(((sOrder^tOrder)&0xff000000)!=0) {
8497                     if(sOrder < tOrder) {
8498                         return UCOL_LESS;
8499                     } else if(sOrder > tOrder) {
8500                         return UCOL_GREATER;
8501                     }
8502                 }
8503                 sOrder<<=8;
8504                 tOrder<<=8;
8505             }
8506         }
8507     }
8508     return UCOL_EQUAL;
8509
8510 returnRegular:
8511     // Preparing the context objects for iterating over strings
8512     collIterate sColl, tColl;
8513
8514     IInit_collIterate(coll, source, sLen, &sColl);
8515     IInit_collIterate(coll, target, tLen, &tColl);
8516     return ucol_strcollRegular(&sColl, &tColl, status);
8517 }
8518
8519
8520 U_CAPI UCollationResult U_EXPORT2
8521 ucol_strcollIter( const UCollator    *coll,
8522                  UCharIterator *sIter,
8523                  UCharIterator *tIter,
8524                  UErrorCode         *status)
8525 {
8526     if(!status || U_FAILURE(*status)) {
8527         return UCOL_EQUAL;
8528     }
8529
8530     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8531     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8532
8533     if (sIter == tIter) {
8534         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8535         return UCOL_EQUAL;
8536     }
8537     if(sIter == NULL || tIter == NULL || coll == NULL) {
8538         *status = U_ILLEGAL_ARGUMENT_ERROR;
8539         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8540         return UCOL_EQUAL;
8541     }
8542
8543     UCollationResult result = UCOL_EQUAL;
8544
8545     // Preparing the context objects for iterating over strings
8546     collIterate sColl, tColl;
8547     // The division for the array length may truncate the array size to
8548     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8549     // for all platforms anyway.
8550     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8551     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8552     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8553
8554     IInit_collIterate(coll, NULL, -1, &sColl);
8555     sColl.iterator = sIter;
8556     sColl.flags |= UCOL_USE_ITERATOR;
8557     IInit_collIterate(coll, NULL, -1, &tColl);
8558     tColl.flags |= UCOL_USE_ITERATOR;
8559     tColl.iterator = tIter;
8560
8561     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8562         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8563         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8564         sColl.flags &= ~UCOL_ITER_NORM;
8565
8566         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8567         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8568         tColl.flags &= ~UCOL_ITER_NORM;
8569     }
8570
8571     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8572
8573     while((sChar = sColl.iterator->next(sColl.iterator)) ==
8574         (tChar = tColl.iterator->next(tColl.iterator))) {
8575             if(sChar == U_SENTINEL) {
8576                 result = UCOL_EQUAL;
8577                 goto end_compare;
8578             }
8579     }
8580
8581     if(sChar == U_SENTINEL) {
8582         tChar = tColl.iterator->previous(tColl.iterator);
8583     }
8584
8585     if(tChar == U_SENTINEL) {
8586         sChar = sColl.iterator->previous(sColl.iterator);
8587     }
8588
8589     sChar = sColl.iterator->previous(sColl.iterator);
8590     tChar = tColl.iterator->previous(tColl.iterator);
8591
8592     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8593     {
8594         // We are stopped in the middle of a contraction.
8595         // Scan backwards through the == part of the string looking for the start of the contraction.
8596         //   It doesn't matter which string we scan, since they are the same in this region.
8597         do
8598         {
8599             sChar = sColl.iterator->previous(sColl.iterator);
8600             tChar = tColl.iterator->previous(tColl.iterator);
8601         }
8602         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8603     }
8604
8605
8606     if(U_SUCCESS(*status)) {
8607         result = ucol_strcollRegular(&sColl, &tColl, status);
8608     }
8609
8610 end_compare:
8611     if(sNormIter || tNormIter) {
8612         unorm_closeIter(sNormIter);
8613         unorm_closeIter(tNormIter);
8614     }
8615
8616     UTRACE_EXIT_VALUE_STATUS(result, *status)
8617     return result;
8618 }
8619
8620
8621 /*                                                                      */
8622 /* ucol_strcoll     Main public API string comparison function          */
8623 /*                                                                      */
8624 U_CAPI UCollationResult U_EXPORT2
8625 ucol_strcoll( const UCollator    *coll,
8626               const UChar        *source,
8627               int32_t            sourceLength,
8628               const UChar        *target,
8629               int32_t            targetLength)
8630 {
8631     U_ALIGN_CODE(16);
8632
8633     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8634     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8635         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8636         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8637         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8638     }
8639
8640     if(source == NULL || target == NULL) {
8641         // do not crash, but return. Should have
8642         // status argument to return error.
8643         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8644         return UCOL_EQUAL;
8645     }
8646
8647     /* Quick check if source and target are same strings. */
8648     /* They should either both be NULL terminated or the explicit length should be set on both. */
8649     if (source==target && sourceLength==targetLength) {
8650         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8651         return UCOL_EQUAL;
8652     }
8653
8654     /* Scan the strings.  Find:                                                             */
8655     /*    The length of any leading portion that is equal                                   */
8656     /*    Whether they are exactly equal.  (in which case we just return)                   */
8657     const UChar    *pSrc    = source;
8658     const UChar    *pTarg   = target;
8659     int32_t        equalLength;
8660
8661     if (sourceLength == -1 && targetLength == -1) {
8662         // Both strings are null terminated.
8663         //    Scan through any leading equal portion.
8664         while (*pSrc == *pTarg && *pSrc != 0) {
8665             pSrc++;
8666             pTarg++;
8667         }
8668         if (*pSrc == 0 && *pTarg == 0) {
8669             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8670             return UCOL_EQUAL;
8671         }
8672         equalLength = pSrc - source;
8673     }
8674     else
8675     {
8676         // One or both strings has an explicit length.
8677         const UChar    *pSrcEnd = source + sourceLength;
8678         const UChar    *pTargEnd = target + targetLength;
8679
8680         // Scan while the strings are bitwise ==, or until one is exhausted.
8681         for (;;) {
8682             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8683                 break;
8684             }
8685             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8686                 break;
8687             }
8688             if (*pSrc != *pTarg) {
8689                 break;
8690             }
8691             pSrc++;
8692             pTarg++;
8693         }
8694         equalLength = pSrc - source;
8695
8696         // If we made it all the way through both strings, we are done.  They are ==
8697         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8698             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8699         {
8700             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8701             return UCOL_EQUAL;
8702         }
8703     }
8704     if (equalLength > 0) {
8705         /* There is an identical portion at the beginning of the two strings.        */
8706         /*   If the identical portion ends within a contraction or a comibining      */
8707         /*   character sequence, back up to the start of that sequence.              */
8708
8709         // These values should already be set by the code above.
8710         //pSrc  = source + equalLength;        /* point to the first differing chars   */
8711         //pTarg = target + equalLength;
8712         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8713             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8714         {
8715             // We are stopped in the middle of a contraction.
8716             // Scan backwards through the == part of the string looking for the start of the contraction.
8717             //   It doesn't matter which string we scan, since they are the same in this region.
8718             do
8719             {
8720                 equalLength--;
8721                 pSrc--;
8722             }
8723             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8724         }
8725
8726         source += equalLength;
8727         target += equalLength;
8728         if (sourceLength > 0) {
8729             sourceLength -= equalLength;
8730         }
8731         if (targetLength > 0) {
8732             targetLength -= equalLength;
8733         }
8734     }
8735
8736     UErrorCode status = U_ZERO_ERROR;
8737     UCollationResult returnVal;
8738     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8739         collIterate sColl, tColl;
8740         // Preparing the context objects for iterating over strings
8741         IInit_collIterate(coll, source, sourceLength, &sColl);
8742         IInit_collIterate(coll, target, targetLength, &tColl);
8743         returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
8744     } else {
8745         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8746     }
8747     UTRACE_EXIT_VALUE(returnVal);
8748     return returnVal;
8749 }
8750
8751 /* convenience function for comparing strings */
8752 U_CAPI UBool U_EXPORT2
8753 ucol_greater(    const    UCollator        *coll,
8754         const    UChar            *source,
8755         int32_t            sourceLength,
8756         const    UChar            *target,
8757         int32_t            targetLength)
8758 {
8759     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8760         == UCOL_GREATER);
8761 }
8762
8763 /* convenience function for comparing strings */
8764 U_CAPI UBool U_EXPORT2
8765 ucol_greaterOrEqual(    const    UCollator    *coll,
8766             const    UChar        *source,
8767             int32_t        sourceLength,
8768             const    UChar        *target,
8769             int32_t        targetLength)
8770 {
8771     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8772         != UCOL_LESS);
8773 }
8774
8775 /* convenience function for comparing strings */
8776 U_CAPI UBool U_EXPORT2
8777 ucol_equal(        const    UCollator        *coll,
8778             const    UChar            *source,
8779             int32_t            sourceLength,
8780             const    UChar            *target,
8781             int32_t            targetLength)
8782 {
8783     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8784         == UCOL_EQUAL);
8785 }
8786
8787 U_CAPI void U_EXPORT2
8788 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8789     if(coll && coll->UCA) {
8790         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8791     }
8792 }
8793
8794 #endif /* #if !UCONFIG_NO_COLLATION */