icuSources/i18n/ucol.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 1996-2012, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  ucol.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 * Modification history
  12 * Date        Name      Comments
  13 * 1996-1999   various members of ICU team maintained C API for collation framework
  14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
  15 * 03/01/2001  synwee    Added maxexpansion functionality.
  16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
  17 */
  18
  19 #include "unicode/utypes.h"
  20
  21 #if !UCONFIG_NO_COLLATION
  22
  23 #include "unicode/bytestream.h"
  24 #include "unicode/coleitr.h"
  25 #include "unicode/unorm.h"
  26 #include "unicode/udata.h"
  27 #include "unicode/ustring.h"
  28 #include "unicode/utf8.h"
  29
  30 #include "ucol_imp.h"
  31 #include "bocsu.h"
  32
  33 #include "normalizer2impl.h"
  34 #include "unorm_it.h"
  35 #include "umutex.h"
  36 #include "cmemory.h"
  37 #include "ucln_in.h"
  38 #include "cstring.h"
  39 #include "utracimp.h"
  40 #include "putilimp.h"
  41 #include "uassert.h"
  42 #include "unicode/coll.h"
  43
  44 #ifdef UCOL_DEBUG
  45 #include <stdio.h>
  46 #endif
  47
  48 U_NAMESPACE_USE
  49
  50 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
  51
  52 #define LAST_BYTE_MASK_           0xFF
  53 #define SECOND_LAST_BYTE_SHIFT_   8
  54
  55 #define ZERO_CC_LIMIT_            0xC0
  56
  57 // These are static pointers to the NFC/NFD implementation instance.
  58 // Each of them is always the same between calls to u_cleanup
  59 // and therefore writing to it is not synchronized.
  60 // They are cleaned in ucol_cleanup
  61 static const Normalizer2 *g_nfd = NULL;
  62 static const Normalizer2Impl *g_nfcImpl = NULL;
  63
// These are values from the UCA that are required for
// implicit generation and for suppressing sort key compression.
// They should normally come from the UCA, but if one
// is running without the UCA, that could be a problem.
  68 static const int32_t maxRegularPrimary  = 0x7A;
  69 static const int32_t minImplicitPrimary = 0xE0;
  70 static const int32_t maxImplicitPrimary = 0xE4;
  71
  72 U_CDECL_BEGIN
  73 static UBool U_CALLCONV
  74 ucol_cleanup(void)
  75 {
  76     g_nfd = NULL;
  77     g_nfcImpl = NULL;
  78     return TRUE;
  79 }
  80
  81 static int32_t U_CALLCONV
  82 _getFoldingOffset(uint32_t data) {
  83     return (int32_t)(data&0xFFFFFF);
  84 }
  85
  86 U_CDECL_END
  87
  88 static inline
  89 UBool initializeNFD(UErrorCode *status) {
  90     if (g_nfd != NULL) {
  91         return TRUE;
  92     } else {
  93         // The result is constant, until the library is reloaded.
  94         g_nfd = Normalizer2Factory::getNFDInstance(*status);
  95         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
  96         return U_SUCCESS(*status);
  97     }
  98 }
  99
 100 // init FCD data
 101 static inline
 102 UBool initializeFCD(UErrorCode *status) {
 103     if (g_nfcImpl != NULL) {
 104         return TRUE;
 105     } else {
 106         // The result is constant, until the library is reloaded.
 107         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
 108         // Note: Alternatively, we could also store this pointer in each collIterate struct,
 109         // same as Normalizer2Factory::getImpl(collIterate->nfd).
 110         ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
 111         return U_SUCCESS(*status);
 112     }
 113 }
 114
 115 static
 116 inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
 117                               int32_t sourceLen, collIterate *s,
 118                               UErrorCode *status)
 119 {
 120     (s)->string = (s)->pos = sourceString;
 121     (s)->origFlags = 0;
 122     (s)->flags = 0;
 123     if (sourceLen >= 0) {
 124         s->flags |= UCOL_ITER_HASLEN;
 125         (s)->endp = (UChar *)sourceString+sourceLen;
 126     }
 127     else {
 128         /* change to enable easier checking for end of string for fcdpositon */
 129         (s)->endp = NULL;
 130     }
 131     (s)->extendCEs = NULL;
 132     (s)->extendCEsSize = 0;
 133     (s)->CEpos = (s)->toReturn = (s)->CEs;
 134     (s)->offsetBuffer = NULL;
 135     (s)->offsetBufferSize = 0;
 136     (s)->offsetReturn = (s)->offsetStore = NULL;
 137     (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
 138     (s)->coll = (collator);
 139     if (initializeNFD(status)) {
 140         (s)->nfd = g_nfd;
 141     } else {
 142         return;
 143     }
 144     (s)->fcdPosition = 0;
 145     if(collator->normalizationMode == UCOL_ON) {
 146         (s)->flags |= UCOL_ITER_NORM;
 147     }
 148     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
 149         (s)->flags |= UCOL_HIRAGANA_Q;
 150     }
 151     (s)->iterator = NULL;
 152     //(s)->iteratorIndex = 0;
 153 }
 154
/**
 * Exported, out-of-line entry point that forwards all of its arguments
 * unchanged to the file-local inline IInit_collIterate().
 * @param collator     collator the iteration is for
 * @param sourceString start of the text
 * @param sourceLen    length of the text, or negative for NUL-terminated text
 * @param s            the iterator state to initialize
 * @param status       in/out error code
 */
U_CAPI void  U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
                             int32_t sourceLen, collIterate *s,
                             UErrorCode *status) {
    /* Out-of-line version for use from other files. */
    IInit_collIterate(collator, sourceString, sourceLen, s, status);
}
 162
 163 U_CAPI collIterate * U_EXPORT2
 164 uprv_new_collIterate(UErrorCode *status) {
 165     if(U_FAILURE(*status)) {
 166         return NULL;
 167     }
 168     collIterate *s = new collIterate;
 169     if(s == NULL) {
 170         *status = U_MEMORY_ALLOCATION_ERROR;
 171         return NULL;
 172     }
 173     return s;
 174 }
 175
/**
 * Destroys a collIterate with operator delete (the counterpart of
 * uprv_new_collIterate(), which creates one with operator new).
 * @param s the object to delete
 */
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s) {
    delete s;
}
 180
 181 U_CAPI UBool U_EXPORT2
 182 uprv_collIterateAtEnd(collIterate *s) {
 183     return s == NULL || s->pos == s->endp;
 184 }
 185
 186 /**
 187 * Backup the state of the collIterate struct data
 188 * @param data collIterate to backup
 189 * @param backup storage
 190 */
 191 static
 192 inline void backupState(const collIterate *data, collIterateState *backup)
 193 {
 194     backup->fcdPosition = data->fcdPosition;
 195     backup->flags       = data->flags;
 196     backup->origFlags   = data->origFlags;
 197     backup->pos         = data->pos;
 198     backup->bufferaddress = data->writableBuffer.getBuffer();
 199     backup->buffersize    = data->writableBuffer.length();
 200     backup->iteratorMove = 0;
 201     backup->iteratorIndex = 0;
 202     if(data->iterator != NULL) {
 203         //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
 204         backup->iteratorIndex = data->iterator->getState(data->iterator);
 205         // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
 206         if(backup->iteratorIndex == UITER_NO_STATE) {
 207             while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
 208                 backup->iteratorMove++;
 209                 data->iterator->move(data->iterator, -1, UITER_CURRENT);
 210             }
 211             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 212         }
 213     }
 214 }
 215
 216 /**
 217 * Loads the state into the collIterate struct data
 218 * @param data collIterate to backup
 219 * @param backup storage
 220 * @param forwards boolean to indicate if forwards iteration is used,
 221 *        false indicates backwards iteration
 222 */
 223 static
 224 inline void loadState(collIterate *data, const collIterateState *backup,
 225                       UBool        forwards)
 226 {
 227     UErrorCode status = U_ZERO_ERROR;
 228     data->flags       = backup->flags;
 229     data->origFlags   = backup->origFlags;
 230     if(data->iterator != NULL) {
 231         //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
 232         data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
 233         if(backup->iteratorMove != 0) {
 234             data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 235         }
 236     }
 237     data->pos         = backup->pos;
 238
 239     if ((data->flags & UCOL_ITER_INNORMBUF) &&
 240         data->writableBuffer.getBuffer() != backup->bufferaddress) {
 241         /*
 242         this is when a new buffer has been reallocated and we'll have to
 243         calculate the new position.
 244         note the new buffer has to contain the contents of the old buffer.
 245         */
 246         if (forwards) {
 247             data->pos = data->writableBuffer.getTerminatedBuffer() +
 248                                          (data->pos - backup->bufferaddress);
 249         }
 250         else {
 251             /* backwards direction */
 252             int32_t temp = backup->buffersize -
 253                                   (int32_t)(data->pos - backup->bufferaddress);
 254             data->pos = data->writableBuffer.getTerminatedBuffer() + (data->writableBuffer.length() - temp);
 255         }
 256     }
 257     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
 258         /*
 259         this is alittle tricky.
 260         if we are initially not in the normalization buffer, even if we
 261         normalize in the later stage, the data in the buffer will be
 262         ignored, since we skip back up to the data string.
 263         however if we are already in the normalization buffer, any
 264         further normalization will pull data into the normalization
 265         buffer and modify the fcdPosition.
 266         since we are keeping the data in the buffer for use, the
 267         fcdPosition can not be reverted back.
 268         arrgghh....
 269         */
 270         data->fcdPosition = backup->fcdPosition;
 271     }
 272 }
 273
 274 static UBool
 275 reallocCEs(collIterate *data, int32_t newCapacity) {
 276     uint32_t *oldCEs = data->extendCEs;
 277     if(oldCEs == NULL) {
 278         oldCEs = data->CEs;
 279     }
 280     int32_t length = data->CEpos - oldCEs;
 281     uint32_t *newCEs = (uint32_t *)uprv_malloc(newCapacity * 4);
 282     if(newCEs == NULL) {
 283         return FALSE;
 284     }
 285     uprv_memcpy(newCEs, oldCEs, length * 4);
 286     uprv_free(data->extendCEs);
 287     data->extendCEs = newCEs;
 288     data->extendCEsSize = newCapacity;
 289     data->CEpos = newCEs + length;
 290     return TRUE;
 291 }
 292
 293 static UBool
 294 increaseCEsCapacity(collIterate *data) {
 295     int32_t oldCapacity;
 296     if(data->extendCEs != NULL) {
 297         oldCapacity = data->extendCEsSize;
 298     } else {
 299         oldCapacity = LENGTHOF(data->CEs);
 300     }
 301     return reallocCEs(data, 2 * oldCapacity);
 302 }
 303
 304 static UBool
 305 ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
 306     int32_t oldCapacity;
 307     if(data->extendCEs != NULL) {
 308         oldCapacity = data->extendCEsSize;
 309     } else {
 310         oldCapacity = LENGTHOF(data->CEs);
 311     }
 312     if(minCapacity <= oldCapacity) {
 313         return TRUE;
 314     }
 315     oldCapacity *= 2;
 316     return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
 317 }
 318
 319 void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
 320     if(U_FAILURE(errorCode)) {
 321         return;
 322     }
 323     int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
 324     U_ASSERT(length >= offsetBufferSize || offsetStore != NULL);
 325     if(length >= offsetBufferSize) {
 326         int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
 327         int32_t *newBuffer = static_cast<int32_t *>(uprv_malloc(newCapacity * 4));
 328         if(newBuffer == NULL) {
 329             errorCode = U_MEMORY_ALLOCATION_ERROR;
 330             return;
 331         }
 332         if(length > 0) {
 333             uprv_memcpy(newBuffer, offsetBuffer, length * 4);
 334         }
 335         uprv_free(offsetBuffer);
 336         offsetBuffer = newBuffer;
 337         offsetStore = offsetBuffer + length;
 338         offsetBufferSize = newCapacity;
 339     }
 340     *offsetStore++ = offset;
 341 }
 342
 343 /*
 344 * collIter_eos()
 345 *     Checks for a collIterate being positioned at the end of
 346 *     its source string.
 347 *
 348 */
 349 static
 350 inline UBool collIter_eos(collIterate *s) {
 351     if(s->flags & UCOL_USE_ITERATOR) {
 352       return !(s->iterator->hasNext(s->iterator));
 353     }
 354     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
 355         // Null terminated string, but not at null, so not at end.
 356         //   Whether in main or normalization buffer doesn't matter.
 357         return FALSE;
 358     }
 359
 360     // String with length.  Can't be in normalization buffer, which is always
 361     //  null termintated.
 362     if (s->flags & UCOL_ITER_HASLEN) {
 363         return (s->pos == s->endp);
 364     }
 365
 366     // We are at a null termination, could be either normalization buffer or main string.
 367     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
 368         // At null at end of main string.
 369         return TRUE;
 370     }
 371
 372     // At null at end of normalization buffer.  Need to check whether there there are
 373     //   any characters left in the main buffer.
 374     if(s->origFlags & UCOL_USE_ITERATOR) {
 375       return !(s->iterator->hasNext(s->iterator));
 376     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
 377         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
 378         return (*s->fcdPosition == 0);
 379     }
 380     else {
 381         // Main string with an end pointer.
 382         return s->fcdPosition == s->endp;
 383     }
 384 }
 385
 386 /*
 387 * collIter_bos()
 388 *     Checks for a collIterate being positioned at the start of
 389 *     its source string.
 390 *
 391 */
 392 static
 393 inline UBool collIter_bos(collIterate *source) {
 394   // if we're going backwards, we need to know whether there is more in the
 395   // iterator, even if we are in the side buffer
 396   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 397     return !source->iterator->hasPrevious(source->iterator);
 398   }
 399   if (source->pos <= source->string ||
 400       ((source->flags & UCOL_ITER_INNORMBUF) &&
 401       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
 402     return TRUE;
 403   }
 404   return FALSE;
 405 }
 406
 407 /*static
 408 inline UBool collIter_SimpleBos(collIterate *source) {
 409   // if we're going backwards, we need to know whether there is more in the
 410   // iterator, even if we are in the side buffer
 411   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 412     return !source->iterator->hasPrevious(source->iterator);
 413   }
 414   if (source->pos == source->string) {
 415     return TRUE;
 416   }
 417   return FALSE;
 418 }*/
 419     //return (data->pos == data->string) ||
 420
 421
 422 /****************************************************************************/
 423 /* Following are the open/close functions                                   */
 424 /*                                                                          */
 425 /****************************************************************************/
 426
 427 static UCollator*
 428 ucol_initFromBinary(const uint8_t *bin, int32_t length,
 429                 const UCollator *base,
 430                 UCollator *fillIn,
 431                 UErrorCode *status)
 432 {
 433     UCollator *result = fillIn;
 434     if(U_FAILURE(*status)) {
 435         return NULL;
 436     }
 437     /*
 438     if(base == NULL) {
 439         // we don't support null base yet
 440         *status = U_ILLEGAL_ARGUMENT_ERROR;
 441         return NULL;
 442     }
 443     */
 444     // We need these and we could be running without UCA
 445     uprv_uca_initImplicitConstants(status);
 446     UCATableHeader *colData = (UCATableHeader *)bin;
 447     // do we want version check here? We're trying to figure out whether collators are compatible
 448     if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
 449         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) ||
 450         colData->version[0] != UCOL_BUILDER_VERSION)
 451     {
 452         *status = U_COLLATOR_VERSION_MISMATCH;
 453         return NULL;
 454     }
 455     else {
 456         if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
 457             result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
 458             if(U_FAILURE(*status)){
 459                 return NULL;
 460             }
 461             result->hasRealData = TRUE;
 462         }
 463         else {
 464             if(base) {
 465                 result = ucol_initCollator(base->image, result, base, status);
 466                 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
 467                 if(U_FAILURE(*status)){
 468                     return NULL;
 469                 }
 470                 result->hasRealData = FALSE;
 471             }
 472             else {
 473                 *status = U_USELESS_COLLATOR_ERROR;
 474                 return NULL;
 475             }
 476         }
 477         result->freeImageOnClose = FALSE;
 478     }
 479     result->actualLocale = NULL;
 480     result->validLocale = NULL;
 481     result->requestedLocale = NULL;
 482     result->rules = NULL;
 483     result->rulesLength = 0;
 484     result->freeRulesOnClose = FALSE;
 485     result->ucaRules = NULL;
 486     return result;
 487 }
 488
 489 U_CAPI UCollator* U_EXPORT2
 490 ucol_openBinary(const uint8_t *bin, int32_t length,
 491                 const UCollator *base,
 492                 UErrorCode *status)
 493 {
 494     return ucol_initFromBinary(bin, length, base, NULL, status);
 495 }
 496
 497 U_CAPI int32_t U_EXPORT2
 498 ucol_cloneBinary(const UCollator *coll,
 499                  uint8_t *buffer, int32_t capacity,
 500                  UErrorCode *status)
 501 {
 502     int32_t length = 0;
 503     if(U_FAILURE(*status)) {
 504         return length;
 505     }
 506     if(capacity < 0) {
 507         *status = U_ILLEGAL_ARGUMENT_ERROR;
 508         return length;
 509     }
 510     if(coll->hasRealData == TRUE) {
 511         length = coll->image->size;
 512         if(length <= capacity) {
 513             uprv_memcpy(buffer, coll->image, length);
 514         } else {
 515             *status = U_BUFFER_OVERFLOW_ERROR;
 516         }
 517     } else {
 518         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 519         if(length <= capacity) {
 520             /* build the UCATableHeader with minimal entries */
 521             /* do not copy the header from the UCA file because its values are wrong! */
 522             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 523
 524             /* reset everything */
 525             uprv_memset(buffer, 0, length);
 526
 527             /* set the tailoring-specific values */
 528             UCATableHeader *myData = (UCATableHeader *)buffer;
 529             myData->size = length;
 530
 531             /* offset for the options, the only part of the data that is present after the header */
 532             myData->options = sizeof(UCATableHeader);
 533
 534             /* need to always set the expansion value for an upper bound of the options */
 535             myData->expansion = myData->options + sizeof(UColOptionSet);
 536
 537             myData->magic = UCOL_HEADER_MAGIC;
 538             myData->isBigEndian = U_IS_BIG_ENDIAN;
 539             myData->charSetFamily = U_CHARSET_FAMILY;
 540
 541             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 542             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 543
 544             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 545             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 546             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 547             myData->jamoSpecial = coll->image->jamoSpecial;
 548
 549             /* copy the collator options */
 550             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 551         } else {
 552             *status = U_BUFFER_OVERFLOW_ERROR;
 553         }
 554     }
 555     return length;
 556 }
 557
 558 U_CAPI UCollator* U_EXPORT2
 559 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
 560 {
 561     UCollator * localCollator;
 562     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
 563     char *stackBufferChars = (char *)stackBuffer;
 564     int32_t imageSize = 0;
 565     int32_t rulesSize = 0;
 566     int32_t rulesPadding = 0;
 567     int32_t defaultReorderCodesSize = 0;
 568     int32_t reorderCodesSize = 0;
 569     uint8_t *image;
 570     UChar *rules;
 571     int32_t* defaultReorderCodes;
 572     int32_t* reorderCodes;
 573     uint8_t* leadBytePermutationTable;
 574     UBool colAllocated = FALSE;
 575     UBool imageAllocated = FALSE;
 576
 577     if (status == NULL || U_FAILURE(*status)){
 578         return 0;
 579     }
 580     if ((stackBuffer && !pBufferSize) || !coll){
 581        *status = U_ILLEGAL_ARGUMENT_ERROR;
 582         return 0;
 583     }
 584
 585     if (coll->rules && coll->freeRulesOnClose) {
 586         rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar);
 587         rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar));
 588         bufferSizeNeeded += rulesSize + rulesPadding;
 589     }
 590     // no padding for alignment needed from here since the next two are 4 byte quantities
 591     if (coll->defaultReorderCodes) {
 592         defaultReorderCodesSize = coll->defaultReorderCodesLength * sizeof(int32_t);
 593         bufferSizeNeeded += defaultReorderCodesSize;
 594     }
 595     if (coll->reorderCodes) {
 596         reorderCodesSize = coll->reorderCodesLength * sizeof(int32_t);
 597         bufferSizeNeeded += reorderCodesSize;
 598     }
 599     if (coll->leadBytePermutationTable) {
 600         bufferSizeNeeded += 256 * sizeof(uint8_t);
 601     }
 602
 603     if (stackBuffer && *pBufferSize <= 0) { /* 'preflighting' request - set needed size into *pBufferSize */
 604         *pBufferSize =  bufferSizeNeeded;
 605         return 0;
 606     }
 607
 608     /* Pointers on 64-bit platforms need to be aligned
 609      * on a 64-bit boundry in memory.
 610      */
 611     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
 612         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
 613         if (*pBufferSize > offsetUp) {
 614             *pBufferSize -= offsetUp;
 615             stackBufferChars += offsetUp;
 616         }
 617         else {
 618             /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */
 619             *pBufferSize = 1;
 620         }
 621     }
 622     stackBuffer = (void *)stackBufferChars;
 623
 624     if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) {
 625         /* allocate one here...*/
 626         stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded);
 627         // Null pointer check.
 628         if (stackBufferChars == NULL) {
 629             *status = U_MEMORY_ALLOCATION_ERROR;
 630             return NULL;
 631         }
 632         colAllocated = TRUE;
 633         if (U_SUCCESS(*status)) {
 634             *status = U_SAFECLONE_ALLOCATED_WARNING;
 635         }
 636     }
 637     localCollator = (UCollator *)stackBufferChars;
 638     rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding);
 639     defaultReorderCodes = (int32_t*)((uint8_t*)rules + rulesSize);
 640     reorderCodes = (int32_t*)((uint8_t*)defaultReorderCodes + defaultReorderCodesSize);
 641     leadBytePermutationTable = (uint8_t*)reorderCodes + reorderCodesSize;
 642
 643     {
 644         UErrorCode tempStatus = U_ZERO_ERROR;
 645         imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus);
 646     }
 647     if (coll->freeImageOnClose) {
 648         image = (uint8_t *)uprv_malloc(imageSize);
 649         // Null pointer check
 650         if (image == NULL) {
 651             *status = U_MEMORY_ALLOCATION_ERROR;
 652             return NULL;
 653         }
 654         ucol_cloneBinary(coll, image, imageSize, status);
 655         imageAllocated = TRUE;
 656     }
 657     else {
 658         image = (uint8_t *)coll->image;
 659     }
 660     localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status);
 661     if (U_FAILURE(*status)) {
 662         return NULL;
 663     }
 664
 665     if (coll->rules) {
 666         if (coll->freeRulesOnClose) {
 667             localCollator->rules = u_strcpy(rules, coll->rules);
 668             //bufferEnd += rulesSize;
 669         }
 670         else {
 671             localCollator->rules = coll->rules;
 672         }
 673         localCollator->freeRulesOnClose = FALSE;
 674         localCollator->rulesLength = coll->rulesLength;
 675     }
 676
 677     // collator reordering
 678     if (coll->defaultReorderCodes) {
 679         localCollator->defaultReorderCodes =
 680             (int32_t*) uprv_memcpy(defaultReorderCodes, coll->defaultReorderCodes, coll->defaultReorderCodesLength * sizeof(int32_t));
 681         localCollator->defaultReorderCodesLength = coll->defaultReorderCodesLength;
 682         localCollator->freeDefaultReorderCodesOnClose = FALSE;
 683     }
 684     if (coll->reorderCodes) {
 685         localCollator->reorderCodes =
 686             (int32_t*)uprv_memcpy(reorderCodes, coll->reorderCodes, coll->reorderCodesLength * sizeof(int32_t));
 687         localCollator->reorderCodesLength = coll->reorderCodesLength;
 688         localCollator->freeReorderCodesOnClose = FALSE;
 689     }
 690     if (coll->leadBytePermutationTable) {
 691         localCollator->leadBytePermutationTable =
 692             (uint8_t*) uprv_memcpy(leadBytePermutationTable, coll->leadBytePermutationTable, 256);
 693         localCollator->freeLeadBytePermutationTableOnClose = FALSE;
 694     }
 695
 696     int32_t i;
 697     for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
 698         ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status);
 699     }
 700     // zero copies of pointers
 701     localCollator->actualLocale = NULL;
 702     localCollator->validLocale = NULL;
 703     localCollator->requestedLocale = NULL;
 704     localCollator->ucaRules = coll->ucaRules; // There should only be one copy here.
 705     localCollator->freeOnClose = colAllocated;
 706     localCollator->freeImageOnClose = imageAllocated;
 707     return localCollator;
 708 }
 709
 710 U_CAPI void U_EXPORT2
 711 ucol_close(UCollator *coll)
 712 {
 713     UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
 714     UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
 715     if(coll != NULL) {
 716         // these are always owned by each UCollator struct,
 717         // so we always free them
 718         if(coll->validLocale != NULL) {
 719             uprv_free(coll->validLocale);
 720         }
 721         if(coll->actualLocale != NULL) {
 722             uprv_free(coll->actualLocale);
 723         }
 724         if(coll->requestedLocale != NULL) {
 725             uprv_free(coll->requestedLocale);
 726         }
 727         if(coll->latinOneCEs != NULL) {
 728             uprv_free(coll->latinOneCEs);
 729         }
 730         if(coll->options != NULL && coll->freeOptionsOnClose) {
 731             uprv_free(coll->options);
 732         }
 733         if(coll->rules != NULL && coll->freeRulesOnClose) {
 734             uprv_free((UChar *)coll->rules);
 735         }
 736         if(coll->image != NULL && coll->freeImageOnClose) {
 737             uprv_free((UCATableHeader *)coll->image);
 738         }
 739
 740         if(coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
 741             uprv_free(coll->leadBytePermutationTable);
 742         }
 743         if(coll->defaultReorderCodes != NULL && coll->freeDefaultReorderCodesOnClose == TRUE) {
 744             uprv_free(coll->defaultReorderCodes);
 745         }
 746         if(coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
 747             uprv_free(coll->reorderCodes);
 748         }
 749
 750         if(coll->delegate != NULL) {
 751           delete (Collator*)coll->delegate;
 752         }
 753
 754         /* Here, it would be advisable to close: */
 755         /* - UData for UCA (unless we stuff it in the root resb */
 756         /* Again, do we need additional housekeeping... HMMM! */
 757         UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
 758         if(coll->freeOnClose){
 759             /* for safeClone, if freeOnClose is FALSE,
 760             don't free the other instance data */
 761             uprv_free(coll);
 762         }
 763     }
 764     UTRACE_EXIT();
 765 }
 766
 767 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
 768 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
 769 U_CFUNC uint8_t* U_EXPORT2
 770 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
 771 {
 772     uint8_t *result = NULL;
 773     if(U_FAILURE(*status)) {
 774         return NULL;
 775     }
 776     if(coll->hasRealData == TRUE) {
 777         *length = coll->image->size;
 778         result = (uint8_t *)uprv_malloc(*length);
 779         /* test for NULL */
 780         if (result == NULL) {
 781             *status = U_MEMORY_ALLOCATION_ERROR;
 782             return NULL;
 783         }
 784         uprv_memcpy(result, coll->image, *length);
 785     } else {
 786         *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 787         result = (uint8_t *)uprv_malloc(*length);
 788         /* test for NULL */
 789         if (result == NULL) {
 790             *status = U_MEMORY_ALLOCATION_ERROR;
 791             return NULL;
 792         }
 793
 794         /* build the UCATableHeader with minimal entries */
 795         /* do not copy the header from the UCA file because its values are wrong! */
 796         /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 797
 798         /* reset everything */
 799         uprv_memset(result, 0, *length);
 800
 801         /* set the tailoring-specific values */
 802         UCATableHeader *myData = (UCATableHeader *)result;
 803         myData->size = *length;
 804
 805         /* offset for the options, the only part of the data that is present after the header */
 806         myData->options = sizeof(UCATableHeader);
 807
 808         /* need to always set the expansion value for an upper bound of the options */
 809         myData->expansion = myData->options + sizeof(UColOptionSet);
 810
 811         myData->magic = UCOL_HEADER_MAGIC;
 812         myData->isBigEndian = U_IS_BIG_ENDIAN;
 813         myData->charSetFamily = U_CHARSET_FAMILY;
 814
 815         /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 816         uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 817
 818         uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 819         uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 820         uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 821         myData->jamoSpecial = coll->image->jamoSpecial;
 822
 823         /* copy the collator options */
 824         uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 825     }
 826     return result;
 827 }
 828
 829 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 830     if(U_FAILURE(*status)) {
 831         return;
 832     }
 833     result->caseFirst = (UColAttributeValue)opts->caseFirst;
 834     result->caseLevel = (UColAttributeValue)opts->caseLevel;
 835     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
 836     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
 837     if(result->normalizationMode == UCOL_ON && !initializeFCD(status)) {
 838         return;
 839     }
 840     result->strength = (UColAttributeValue)opts->strength;
 841     result->variableTopValue = opts->variableTopValue;
 842     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
 843     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
 844     result->numericCollation = (UColAttributeValue)opts->numericCollation;
 845     result->caseFirstisDefault = TRUE;
 846     result->caseLevelisDefault = TRUE;
 847     result->frenchCollationisDefault = TRUE;
 848     result->normalizationModeisDefault = TRUE;
 849     result->strengthisDefault = TRUE;
 850     result->variableTopValueisDefault = TRUE;
 851     result->alternateHandlingisDefault = TRUE;
 852     result->hiraganaQisDefault = TRUE;
 853     result->numericCollationisDefault = TRUE;
 854
 855     ucol_updateInternalState(result, status);
 856
 857     result->options = opts;
 858 }
 859
 860
 861 /**
 862 * Approximate determination if a character is at a contraction end.
 863 * Guaranteed to be TRUE if a character is at the end of a contraction,
 864 * otherwise it is not deterministic.
 865 * @param c character to be determined
 866 * @param coll collator
 867 */
 868 static
 869 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 870     if (c < coll->minContrEndCP) {
 871         return FALSE;
 872     }
 873
 874     int32_t  hash = c;
 875     uint8_t  htbyte;
 876     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 877         if (U16_IS_TRAIL(c)) {
 878             return TRUE;
 879         }
 880         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 881     }
 882     htbyte = coll->contrEndCP[hash>>3];
 883     return (((htbyte >> (hash & 7)) & 1) == 1);
 884 }
 885
 886
 887
 888 /*
 889 *   i_getCombiningClass()
 890 *        A fast, at least partly inline version of u_getCombiningClass()
 891 *        This is a candidate for further optimization.  Used heavily
 892 *        in contraction processing.
 893 */
 894 static
 895 inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) {
 896     uint8_t sCC = 0;
 897     if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) {
 898         sCC = u_getCombiningClass(c);
 899     }
 900     return sCC;
 901 }
 902
 903 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
 904     UChar c;
 905     UCollator *result = fillIn;
 906     if(U_FAILURE(*status) || image == NULL) {
 907         return NULL;
 908     }
 909
 910     if(result == NULL) {
 911         result = (UCollator *)uprv_malloc(sizeof(UCollator));
 912         if(result == NULL) {
 913             *status = U_MEMORY_ALLOCATION_ERROR;
 914             return result;
 915         }
 916         result->freeOnClose = TRUE;
 917     } else {
 918         result->freeOnClose = FALSE;
 919     }
 920
 921     result->delegate = NULL;
 922
 923     result->image = image;
 924     result->mapping.getFoldingOffset = _getFoldingOffset;
 925     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
 926     utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
 927     if(U_FAILURE(*status)) {
 928         if(result->freeOnClose == TRUE) {
 929             uprv_free(result);
 930             result = NULL;
 931         }
 932         return result;
 933     }
 934
 935     result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping);
 936     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
 937     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
 938     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
 939     result->rules = NULL;
 940     result->rulesLength = 0;
 941     result->freeRulesOnClose = FALSE;
 942     result->defaultReorderCodes = NULL;
 943     result->defaultReorderCodesLength = 0;
 944     result->freeDefaultReorderCodesOnClose = FALSE;
 945     result->reorderCodes = NULL;
 946     result->reorderCodesLength = 0;
 947     result->freeReorderCodesOnClose = FALSE;
 948     result->leadBytePermutationTable = NULL;
 949     result->freeLeadBytePermutationTableOnClose = FALSE;
 950
 951     /* get the version info from UCATableHeader and populate the Collator struct*/
 952     result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
 953     result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
 954     result->dataVersion[2] = 0;
 955     result->dataVersion[3] = 0;
 956
 957     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
 958     result->minUnsafeCP = 0;
 959     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
 960         if (ucol_unsafeCP(c, result)) break;
 961     }
 962     result->minUnsafeCP = c;
 963
 964     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
 965     result->minContrEndCP = 0;
 966     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
 967         if (ucol_contractionEndCP(c, result)) break;
 968     }
 969     result->minContrEndCP = c;
 970
 971     /* max expansion tables */
 972     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
 973                                          result->image->endExpansionCE);
 974     result->lastEndExpansionCE = result->endExpansionCE +
 975                                  result->image->endExpansionCECount - 1;
 976     result->expansionCESize = (uint8_t*)result->image +
 977                                                result->image->expansionCESize;
 978
 979
 980     //result->errorCode = *status;
 981
 982     result->latinOneCEs = NULL;
 983
 984     result->latinOneRegenTable = FALSE;
 985     result->latinOneFailed = FALSE;
 986     result->UCA = UCA;
 987
 988     /* Normally these will be set correctly later. This is the default if you use UCA or the default. */
 989     result->ucaRules = NULL;
 990     result->actualLocale = NULL;
 991     result->validLocale = NULL;
 992     result->requestedLocale = NULL;
 993     result->hasRealData = FALSE; // real data lives in .dat file...
 994     result->freeImageOnClose = FALSE;
 995
 996     /* set attributes */
 997     ucol_setOptionsFromHeader(
 998         result,
 999         (UColOptionSet*)((uint8_t*)result->image+result->image->options),
1000         status);
1001     result->freeOptionsOnClose = FALSE;
1002
1003     return result;
1004 }
1005
1006 /* new Mark's code */
1007
1008 /**
1009  * For generation of Implicit CEs
1010  * @author Davis
1011  *
1012  * Cleaned up so that changes can be made more easily.
1013  * Old values:
1014 # First Implicit: E26A792D
1015 # Last Implicit: E3DC70C0
1016 # First CJK: E0030300
1017 # Last CJK: E0A9DD00
1018 # First CJK_A: E0A9DF00
1019 # Last CJK_A: E0DE3100
1020  */
/* The following is a port of Mark's code for the new treatment of implicits.
 * It is positioned here because ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
1025
1026 /**
1027  * Function used to:
1028  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1029  * b) bump any non-CJK characters by 10FFFF.
1030  * The relevant blocks are:
1031  * A:    4E00..9FFF; CJK Unified Ideographs
1032  *       F900..FAFF; CJK Compatibility Ideographs
1033  * B:    3400..4DBF; CJK Unified Ideographs Extension A
1034  *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
1035  * As long as
1036  *   no new B characters are allocated between 4E00 and FAFF, and
1037  *   no new A characters are outside of this range,
1038  * (very high probability) this simple code will work.
1039  * The reordered blocks are:
1040  * Block1 is CJK
1041  * Block2 is CJK_COMPAT_USED
1042  * Block3 is CJK_A
1043  * (all contiguous)
1044  * Any other CJK gets its normal code point
1045  * Any non-CJK gets +10FFFF
1046  * When we reorder Block1, we make sure that it is at the very start,
1047  * so that it will use a 3-byte form.
 * Warning: we only pick up the compatibility characters that are
 * NOT decomposed, so that block is smaller!
1050  */
1051
1052 // CONSTANTS
1053 static const UChar32
1054     NON_CJK_OFFSET = 0x110000,
1055     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1056
/**
 * Parameters of the implicit-weight encoding.
 * All of them start out as 0 and are computed by initImplicitConstants().
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
1074
1075 static const UChar32
1076     // 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
1077     // 9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;  (Unicode 6.1)
1078     CJK_BASE = 0x4E00,
1079     CJK_LIMIT = 0x9FCC+1,
1080     // Unified CJK ideographs in the compatibility ideographs block.
1081     CJK_COMPAT_USED_BASE = 0xFA0E,
1082     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1083     // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
1084     // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
1085     CJK_A_BASE = 0x3400,
1086     CJK_A_LIMIT = 0x4DB5+1,
1087     // 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
1088     // 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
1089     CJK_B_BASE = 0x20000,
1090     CJK_B_LIMIT = 0x2A6D6+1,
1091     // 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
1092     // 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
1093     CJK_C_BASE = 0x2A700,
1094     CJK_C_LIMIT = 0x2B734+1,
1095     // 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
1096     // 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
1097     CJK_D_BASE = 0x2B740,
1098     CJK_D_LIMIT = 0x2B81D+1;
1099     // when adding to this list, look for all occurrences (in project)
1100     // of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
1101
1102 static UChar32 swapCJK(UChar32 i) {
1103     if (i < CJK_A_BASE) {
1104         // non-CJK
1105     } else if (i < CJK_A_LIMIT) {
1106         // Extension A has lower code points than the original Unihan+compat
1107         // but sorts higher.
1108         return i - CJK_A_BASE
1109                 + (CJK_LIMIT - CJK_BASE)
1110                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1111     } else if (i < CJK_BASE) {
1112         // non-CJK
1113     } else if (i < CJK_LIMIT) {
1114         return i - CJK_BASE;
1115     } else if (i < CJK_COMPAT_USED_BASE) {
1116         // non-CJK
1117     } else if (i < CJK_COMPAT_USED_LIMIT) {
1118         return i - CJK_COMPAT_USED_BASE
1119                 + (CJK_LIMIT - CJK_BASE);
1120     } else if (i < CJK_B_BASE) {
1121         // non-CJK
1122     } else if (i < CJK_B_LIMIT) {
1123         return i; // non-BMP-CJK
1124     } else if (i < CJK_C_BASE) {
1125         // non-CJK
1126     } else if (i < CJK_C_LIMIT) {
1127         return i; // non-BMP-CJK
1128     } else if (i < CJK_D_BASE) {
1129         // non-CJK
1130     } else if (i < CJK_D_LIMIT) {
1131         return i; // non-BMP-CJK
1132     }
1133     return i + NON_CJK_OFFSET; // non-CJK
1134 }
1135
1136 U_CAPI UChar32 U_EXPORT2
1137 uprv_uca_getRawFromCodePoint(UChar32 i) {
1138     return swapCJK(i)+1;
1139 }
1140
1141 U_CAPI UChar32 U_EXPORT2
1142 uprv_uca_getCodePointFromRaw(UChar32 i) {
1143     i--;
1144     UChar32 result = 0;
1145     if(i >= NON_CJK_OFFSET) {
1146         result = i - NON_CJK_OFFSET;
1147     } else if(i >= CJK_B_BASE) {
1148         result = i;
1149     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1150         if(i < CJK_LIMIT - CJK_BASE) {
1151             result = i + CJK_BASE;
1152         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1153             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1154         } else {
1155             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1156         }
1157     } else {
1158         result = -1;
1159     }
1160     return result;
1161 }
1162
1163 // GET IMPLICIT PRIMARY WEIGHTS
1164 // Return value is left justified primary key
1165 U_CAPI uint32_t U_EXPORT2
1166 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1167     /*
1168     if (cp < 0 || cp > UCOL_MAX_INPUT) {
1169         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1170     }
1171     */
1172     int32_t last0 = cp - min4Boundary;
1173     if (last0 < 0) {
1174         int32_t last1 = cp / final3Count;
1175         last0 = cp % final3Count;
1176
1177         int32_t last2 = last1 / medialCount;
1178         last1 %= medialCount;
1179
1180         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1181         last1 = minTrail + last1; // offset
1182         last2 = min3Primary + last2; // offset
1183         /*
1184         if (last2 >= min4Primary) {
1185             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1186         }
1187         */
1188         return (last2 << 24) + (last1 << 16) + (last0 << 8);
1189     } else {
1190         int32_t last1 = last0 / final4Count;
1191         last0 %= final4Count;
1192
1193         int32_t last2 = last1 / medialCount;
1194         last1 %= medialCount;
1195
1196         int32_t last3 = last2 / medialCount;
1197         last2 %= medialCount;
1198
1199         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1200         last1 = minTrail + last1; // offset
1201         last2 = minTrail + last2; // offset
1202         last3 = min4Primary + last3; // offset
1203         /*
1204         if (last3 > max4Primary) {
1205             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1206         }
1207         */
1208         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1209     }
1210 }
1211
1212 static uint32_t U_EXPORT2
1213 uprv_uca_getImplicitPrimary(UChar32 cp) {
1214    //fprintf(stdout, "Incoming: %04x\n", cp);
1215     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1216
1217     cp = swapCJK(cp);
1218     cp++;
1219     // we now have a range of numbers from 0 to 21FFFF.
1220
1221     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1222     //fprintf(stdout, "CJK swapped: %04x\n", cp);
1223
1224     return uprv_uca_getImplicitFromRaw(cp);
1225 }
1226
1227 /**
1228  * Converts implicit CE into raw integer ("code point")
1229  * @param implicit
1230  * @return -1 if illegal format
1231  */
1232 U_CAPI UChar32 U_EXPORT2
1233 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1234     UChar32 result;
1235     UChar32 b3 = implicit & 0xFF;
1236     UChar32 b2 = (implicit >> 8) & 0xFF;
1237     UChar32 b1 = (implicit >> 16) & 0xFF;
1238     UChar32 b0 = (implicit >> 24) & 0xFF;
1239
1240     // simple parameter checks
1241     if (b0 < min3Primary || b0 > max4Primary
1242         || b1 < minTrail || b1 > maxTrail)
1243         return -1;
1244     // normal offsets
1245     b1 -= minTrail;
1246
1247     // take care of the final values, and compose
1248     if (b0 < min4Primary) {
1249         if (b2 < minTrail || b2 > max3Trail || b3 != 0)
1250             return -1;
1251         b2 -= minTrail;
1252         UChar32 remainder = b2 % final3Multiplier;
1253         if (remainder != 0)
1254             return -1;
1255         b0 -= min3Primary;
1256         b2 /= final3Multiplier;
1257         result = ((b0 * medialCount) + b1) * final3Count + b2;
1258     } else {
1259         if (b2 < minTrail || b2 > maxTrail
1260             || b3 < minTrail || b3 > max4Trail)
1261             return -1;
1262         b2 -= minTrail;
1263         b3 -= minTrail;
1264         UChar32 remainder = b3 % final4Multiplier;
1265         if (remainder != 0)
1266             return -1;
1267         b3 /= final4Multiplier;
1268         b0 -= min4Primary;
1269         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1270     }
1271     // final check
1272     if (result < 0 || result > UCOL_MAX_INPUT)
1273         return -1;
1274     return result;
1275 }
1276
1277
/* Computes 1 + (a-1)/b with truncating integer division.  For positive a    */
/* and b this is a/b rounded up.  b must not be 0.                           */
static inline int32_t divideAndRoundUp(int a, int b) {
    const int quotientBelow = (a - 1) / b;
    return quotientBelow + 1;
}
1282 /* this function is either called from initUCA or from genUCA before
1283  * doing canonical closure for the UCA.
1284  */
1285
1286 /**
1287  * Set up to generate implicits.
1288  * Maintenance Note:  this function may end up being called more than once, due
1289  *                    to threading races during initialization.  Make sure that
1290  *                    none of the Constants is ever transiently assigned an
1291  *                    incorrect value.
1292  * @param minPrimary
1293  * @param maxPrimary
1294  * @param minTrail final byte
1295  * @param maxTrail final byte
1296  * @param gap3 the gap we leave for tailoring for 3-byte forms
1297  * @param gap4 the gap we leave for tailoring for 4-byte forms
1298  */
1299 static void initImplicitConstants(int minPrimary, int maxPrimary,
1300                                     int minTrailIn, int maxTrailIn,
1301                                     int gap3, int primaries3count,
1302                                     UErrorCode *status) {
1303     // some simple parameter checks
1304     if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF)
1305         || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF)
1306         || (primaries3count < 1))
1307     {
1308         *status = U_ILLEGAL_ARGUMENT_ERROR;
1309         return;
1310     };
1311
1312     minTrail = minTrailIn;
1313     maxTrail = maxTrailIn;
1314
1315     min3Primary = minPrimary;
1316     max4Primary = maxPrimary;
1317     // compute constants for use later.
1318     // number of values we can use in trailing bytes
1319     // leave room for empty values between AND above, e.g. if gap = 2
1320     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1321     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1322     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1323     final3Multiplier = gap3 + 1;
1324     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1325     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1326
1327     // medials can use full range
1328     medialCount = (maxTrail - minTrail + 1);
1329     // find out how many values fit in each form
1330     int32_t threeByteCount = medialCount * final3Count;
1331     // now determine where the 3/4 boundary is.
1332     // we use 3 bytes below the boundary, and 4 above
1333     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1334     int32_t primaries4count = primariesAvailable - primaries3count;
1335
1336
1337     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1338     min4Primary = minPrimary + primaries3count;
1339     min4Boundary = min3ByteCoverage;
1340     // Now expand out the multiplier for the 4 bytes, and redo.
1341
1342     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1343     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1344     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1345     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1346     if (gap4 < 1) {
1347         *status = U_ILLEGAL_ARGUMENT_ERROR;
1348         return;
1349     }
1350     final4Multiplier = gap4 + 1;
1351     final4Count = neededPerFinalByte;
1352     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1353 }
1354
1355     /**
1356      * Supply parameters for generating implicit CEs
1357      */
1358 U_CAPI void U_EXPORT2
1359 uprv_uca_initImplicitConstants(UErrorCode *status) {
1360     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1361     //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1362     initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status);
1363 }
1364
1365
1366 /*    collIterNormalize     Incremental Normalization happens here.                       */
1367 /*                          pick up the range of chars identifed by FCD,                  */
1368 /*                          normalize it into the collIterate's writable buffer,          */
1369 /*                          switch the collIterate's state to use the writable buffer.    */
1370 /*                                                                                        */
1371 static
1372 void collIterNormalize(collIterate *collationSource)
1373 {
1374     UErrorCode  status = U_ZERO_ERROR;
1375     const UChar *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1376     const UChar *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1377
1378     collationSource->nfd->normalize(UnicodeString(FALSE, srcP, (int32_t)(endP - srcP)),
1379                                     collationSource->writableBuffer,
1380                                     status);
1381     if (U_FAILURE(status)) {
1382 #ifdef UCOL_DEBUG
1383         fprintf(stderr, "collIterNormalize(), NFD failed, status = %s\n", u_errorName(status));
1384 #endif
1385         return;
1386     }
1387
1388     collationSource->pos        = collationSource->writableBuffer.getTerminatedBuffer();
1389     collationSource->origFlags  = collationSource->flags;
1390     collationSource->flags     |= UCOL_ITER_INNORMBUF;
1391     collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1392 }
1393
1394
1395 // This function takes the iterator and extracts normalized stuff up to the next boundary
1396 // It is similar in the end results to the collIterNormalize, but for the cases when we
1397 // use an iterator
1398 /*static
1399 inline void normalizeIterator(collIterate *collationSource) {
1400   UErrorCode status = U_ZERO_ERROR;
1401   UBool wasNormalized = FALSE;
1402   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1403   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1404   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1405     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1406   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1407     // reallocate and terminate
1408     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1409                                &collationSource->writableBuffer,
1410                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1411                                0)
1412     ) {
1413     #ifdef UCOL_DEBUG
1414         fprintf(stderr, "normalizeIterator(), out of memory\n");
1415     #endif
1416         return;
1417     }
1418     status = U_ZERO_ERROR;
1419     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1420     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1421     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1422     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1423   }
1424   // Terminate the buffer - we already checked that it is big enough
1425   collationSource->writableBuffer[normLen] = 0;
1426   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1427       collationSource->flags |= UCOL_ITER_ALLOCATED;
1428   }
1429   collationSource->pos        = collationSource->writableBuffer;
1430   collationSource->origFlags  = collationSource->flags;
1431   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1432   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1433 }*/
1434
1435
1436 /* Incremental FCD check and normalize                                                    */
1437 /*   Called from getNextCE when normalization state is suspect.                           */
1438 /*   When entering, the state is known to be this:                                        */
1439 /*      o   We are working in the main buffer of the collIterate, not the side            */
1440 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1441 /*          so we won't get here.                                                         */
1442 /*      o   The leading combining class from the current character is 0 or                */
1443 /*          the trailing combining class of the previous char was zero.                   */
1444 /*          True because the previous call to this function will have always exited       */
1445 /*          that way, and we get called for every char where cc might be non-zero.        */
1446 static
1447 inline UBool collIterFCD(collIterate *collationSource) {
1448     const UChar *srcP, *endP;
1449     uint8_t     leadingCC;
1450     uint8_t     prevTrailingCC = 0;
1451     uint16_t    fcd;
1452     UBool       needNormalize = FALSE;
1453
1454     srcP = collationSource->pos-1;
1455
1456     if (collationSource->flags & UCOL_ITER_HASLEN) {
1457         endP = collationSource->endp;
1458     } else {
1459         endP = NULL;
1460     }
1461
1462     // Get the trailing combining class of the current character. If it's zero, we are OK.
1463     fcd = g_nfcImpl->nextFCD16(srcP, endP);
1464     if (fcd != 0) {
1465         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1466
1467         if (prevTrailingCC != 0) {
1468             // The current char has a non-zero trailing CC.  Scan forward until we find
1469             //   a char with a leading cc of zero.
1470             while (endP == NULL || srcP != endP)
1471             {
1472                 const UChar *savedSrcP = srcP;
1473
1474                 fcd = g_nfcImpl->nextFCD16(srcP, endP);
1475                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1476                 if (leadingCC == 0) {
1477                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1478                                            //   back up over it.  (Could be surrogate pair!)
1479                     break;
1480                 }
1481
1482                 if (leadingCC < prevTrailingCC) {
1483                     needNormalize = TRUE;
1484                 }
1485
1486                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1487             }
1488         }
1489     }
1490
1491     collationSource->fcdPosition = (UChar *)srcP;
1492
1493     return needNormalize;
1494 }
1495
1496 /****************************************************************************/
1497 /* Following are the CE retrieval functions                                 */
1498 /*                                                                          */
1499 /****************************************************************************/
1500
1501 static uint32_t getImplicit(UChar32 cp, collIterate *collationSource);
1502 static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource);
1503
/* There should be a macro version of this function in the header file.  */
/* This is the first function that tries to fetch a collation element.   */
/* If it is not successful, or it encounters a more difficult situation, */
/* some more sophisticated and slower functions are invoked.             */
1508 static
1509 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1510     uint32_t order = 0;
1511     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1512         order = *(collationSource->toReturn++);                         /* if so, return them */
1513         if(collationSource->CEpos == collationSource->toReturn) {
1514             collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs;
1515         }
1516         return order;
1517     }
1518
1519     UChar ch = 0;
1520     collationSource->offsetReturn = NULL;
1521
1522     do {
1523         for (;;)                           /* Loop handles case when incremental normalize switches   */
1524         {                                  /*   to or from the side buffer / original string, and we  */
1525             /*   need to start again to get the next character.        */
1526
1527             if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1528             {
1529                 // The source string is null terminated and we're not working from the side buffer,
1530                 //   and we're not normalizing.  This is the fast path.
1531                 //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1532                 ch = *collationSource->pos++;
1533                 if (ch != 0) {
1534                     break;
1535                 }
1536                 else {
1537                     return UCOL_NO_MORE_CES;
1538                 }
1539             }
1540
1541             if (collationSource->flags & UCOL_ITER_HASLEN) {
1542                 // Normal path for strings when length is specified.
1543                 //   (We can't be in side buffer because it is always null terminated.)
1544                 if (collationSource->pos >= collationSource->endp) {
1545                     // Ran off of the end of the main source string.  We're done.
1546                     return UCOL_NO_MORE_CES;
1547                 }
1548                 ch = *collationSource->pos++;
1549             }
1550             else if(collationSource->flags & UCOL_USE_ITERATOR) {
1551                 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1552                 if(iterCh == U_SENTINEL) {
1553                     return UCOL_NO_MORE_CES;
1554                 }
1555                 ch = (UChar)iterCh;
1556             }
1557             else
1558             {
1559                 // Null terminated string.
1560                 ch = *collationSource->pos++;
1561                 if (ch == 0) {
1562                     // Ran off end of buffer.
1563                     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1564                         // Ran off end of main string. backing up one character.
1565                         collationSource->pos--;
1566                         return UCOL_NO_MORE_CES;
1567                     }
1568                     else
1569                     {
1570                         // Hit null in the normalize side buffer.
1571                         // Usually this means the end of the normalized data,
1572                         // except for one odd case: a null followed by combining chars,
1573                         //   which is the case if we are at the start of the buffer.
1574                         if (collationSource->pos == collationSource->writableBuffer.getBuffer()+1) {
1575                             break;
1576                         }
1577
1578                         //  Null marked end of side buffer.
1579                         //   Revert to the main string and
1580                         //   loop back to top to try again to get a character.
1581                         collationSource->pos   = collationSource->fcdPosition;
1582                         collationSource->flags = collationSource->origFlags;
1583                         continue;
1584                     }
1585                 }
1586             }
1587
1588             if(collationSource->flags&UCOL_HIRAGANA_Q) {
1589                 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag
1590                  * based on whether the previous codepoint was Hiragana or Katakana.
1591                  */
1592                 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) ||
1593                         ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) {
1594                     collationSource->flags |= UCOL_WAS_HIRAGANA;
1595                 } else {
1596                     collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1597                 }
1598             }
1599
1600             // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1601             //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1602             if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1603                 break;
1604             }
1605
1606             if (collationSource->fcdPosition >= collationSource->pos) {
1607                 // An earlier FCD check has already covered the current character.
1608                 // We can go ahead and process this char.
1609                 break;
1610             }
1611
1612             if (ch < ZERO_CC_LIMIT_ ) {
1613                 // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1614                 break;
1615             }
1616
1617             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1618                 // We need to peek at the next character in order to tell if we are FCD
1619                 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1620                     // We are at the last char of source string.
1621                     //  It is always OK for FCD check.
1622                     break;
1623                 }
1624
1625                 // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1626                 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1627                     break;
1628                 }
1629             }
1630
1631
1632             // Need a more complete FCD check and possible normalization.
1633             if (collIterFCD(collationSource)) {
1634                 collIterNormalize(collationSource);
1635             }
1636             if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1637                 //  No normalization was needed.  Go ahead and process the char we already had.
1638                 break;
1639             }
1640
1641             // Some normalization happened.  Next loop iteration will pick up a char
1642             //   from the normalization buffer.
1643
1644         }   // end for (;;)
1645
1646
1647         if (ch <= 0xFF) {
1648             /*  For latin-1 characters we never need to fall back to the UCA table        */
1649             /*    because all of the UCA data is replicated in the latinOneMapping array  */
1650             order = coll->latinOneMapping[ch];
1651             if (order > UCOL_NOT_FOUND) {
1652                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1653             }
1654         }
1655         else
1656         {
1657             // Always use UCA for Han, Hangul
1658             // (Han extension A is before main Han block)
1659             // **** Han compatibility chars ?? ****
1660             if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
1661                 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) {
1662                 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) {
1663                     // between the two target ranges; do normal lookup
1664                     // **** this range is YI, Modifier tone letters, ****
1665                     // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
1666                     // **** Latin-D might be tailored, so we need to ****
1667                     // **** do the normal lookup for these guys.     ****
1668                     order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1669                 } else {
1670                     // in one of the target ranges; use UCA
1671                     order = UCOL_NOT_FOUND;
1672                 }
1673             } else {
1674                 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
1675             }
1676
1677             if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1678                 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1679             }
1680
1681             if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1682                 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1683                 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
1684
1685                 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1686                     order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1687                 }
1688             }
1689         }
1690     } while ( order == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
1691
1692     if(order == UCOL_NOT_FOUND) {
1693         order = getImplicit(ch, collationSource);
1694     }
1695     return order; /* return the CE */
1696 }
1697
1698 /* ucol_getNextCE, out-of-line version for use from other files.   */
1699 U_CAPI uint32_t  U_EXPORT2
1700 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1701     return ucol_IGetNextCE(coll, collationSource, status);
1702 }
1703
1704
1705 /**
1706 * Incremental previous normalization happens here. Pick up the range of chars
1707 * identifed by FCD, normalize it into the collIterate's writable buffer,
1708 * switch the collIterate's state to use the writable buffer.
1709 * @param data collation iterator data
1710 */
1711 static
1712 void collPrevIterNormalize(collIterate *data)
1713 {
1714     UErrorCode status  = U_ZERO_ERROR;
1715     const UChar *pEnd   = data->pos;  /* End normalize + 1 */
1716     const UChar *pStart;
1717
1718     /* Start normalize */
1719     if (data->fcdPosition == NULL) {
1720         pStart = data->string;
1721     }
1722     else {
1723         pStart = data->fcdPosition + 1;
1724     }
1725
1726     int32_t normLen =
1727         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)((pEnd - pStart) + 1)),
1728                              data->writableBuffer,
1729                              status).
1730         length();
1731     if(U_FAILURE(status)) {
1732         return;
1733     }
1734     /*
1735     this puts the null termination infront of the normalized string instead
1736     of the end
1737     */
1738     data->writableBuffer.insert(0, (UChar)0);
1739
1740     /*
1741      * The usual case at this point is that we've got a base
1742      * character followed by marks that were normalized. If
1743      * fcdPosition is NULL, that means that we backed up to
1744      * the beginning of the string and there's no base character.
1745      *
1746      * Forward processing will usually normalize when it sees
1747      * the first mark, so that mark will get it's natural offset
1748      * and the rest will get the offset of the character following
1749      * the marks. The base character will also get its natural offset.
1750      *
1751      * We write the offset of the base character, if there is one,
1752      * followed by the offset of the first mark and then the offsets
1753      * of the rest of the marks.
1754      */
1755     int32_t firstMarkOffset = 0;
1756     int32_t trailOffset     = (int32_t)(data->pos - data->string + 1);
1757     int32_t trailCount      = normLen - 1;
1758
1759     if (data->fcdPosition != NULL) {
1760         int32_t baseOffset = (int32_t)(data->fcdPosition - data->string);
1761         UChar   baseChar   = *data->fcdPosition;
1762
1763         firstMarkOffset = baseOffset + 1;
1764
1765         /*
1766          * If the base character is the start of a contraction, forward processing
1767          * will normalize the marks while checking for the contraction, which means
1768          * that the offset of the first mark will the same as the other marks.
1769          *
1770          * **** THIS IS PROBABLY NOT A COMPLETE TEST ****
1771          */
1772         if (baseChar >= 0x100) {
1773             uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar);
1774
1775             if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) {
1776                 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar);
1777             }
1778
1779             if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) {
1780                 firstMarkOffset = trailOffset;
1781             }
1782         }
1783
1784         data->appendOffset(baseOffset, status);
1785     }
1786
1787     data->appendOffset(firstMarkOffset, status);
1788
1789     for (int32_t i = 0; i < trailCount; i += 1) {
1790         data->appendOffset(trailOffset, status);
1791     }
1792
1793     data->offsetRepeatValue = trailOffset;
1794
1795     data->offsetReturn = data->offsetStore - 1;
1796     if (data->offsetReturn == data->offsetBuffer) {
1797         data->offsetStore = data->offsetBuffer;
1798     }
1799
1800     data->pos        = data->writableBuffer.getTerminatedBuffer() + 1 + normLen;
1801     data->origFlags  = data->flags;
1802     data->flags     |= UCOL_ITER_INNORMBUF;
1803     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1804 }
1805
1806
1807 /**
1808 * Incremental FCD check for previous iteration and normalize. Called from
1809 * getPrevCE when normalization state is suspect.
1810 * When entering, the state is known to be this:
1811 * o  We are working in the main buffer of the collIterate, not the side
1812 *    writable buffer. When in the side buffer, normalization mode is always
1813 *    off, so we won't get here.
1814 * o  The leading combining class from the current character is 0 or the
1815 *    trailing combining class of the previous char was zero.
1816 *    True because the previous call to this function will have always exited
1817 *    that way, and we get called for every char where cc might be non-zero.
1818 * @param data collation iterate struct
1819 * @return normalization status, TRUE for normalization to be done, FALSE
1820 *         otherwise
1821 */
1822 static
1823 inline UBool collPrevIterFCD(collIterate *data)
1824 {
1825     const UChar *src, *start;
1826     uint8_t     leadingCC;
1827     uint8_t     trailingCC = 0;
1828     uint16_t    fcd;
1829     UBool       result = FALSE;
1830
1831     start = data->string;
1832     src = data->pos + 1;
1833
1834     /* Get the trailing combining class of the current character. */
1835     fcd = g_nfcImpl->previousFCD16(start, src);
1836
1837     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1838
1839     if (leadingCC != 0) {
1840         /*
1841         The current char has a non-zero leading combining class.
1842         Scan backward until we find a char with a trailing cc of zero.
1843         */
1844         for (;;)
1845         {
1846             if (start == src) {
1847                 data->fcdPosition = NULL;
1848                 return result;
1849             }
1850
1851             fcd = g_nfcImpl->previousFCD16(start, src);
1852
1853             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1854
1855             if (trailingCC == 0) {
1856                 break;
1857             }
1858
1859             if (leadingCC < trailingCC) {
1860                 result = TRUE;
1861             }
1862
1863             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1864         }
1865     }
1866
1867     data->fcdPosition = (UChar *)src;
1868
1869     return result;
1870 }
1871
1872 /** gets a code unit from the string at a given offset
1873  *  Handles both normal and iterative cases.
1874  *  No error checking - caller beware!
1875  */
1876 static inline
1877 UChar peekCodeUnit(collIterate *source, int32_t offset) {
1878     if(source->pos != NULL) {
1879         return *(source->pos + offset);
1880     } else if(source->iterator != NULL) {
1881         UChar32 c;
1882         if(offset != 0) {
1883             source->iterator->move(source->iterator, offset, UITER_CURRENT);
1884             c = source->iterator->next(source->iterator);
1885             source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1886         } else {
1887             c = source->iterator->current(source->iterator);
1888         }
1889         return c >= 0 ? (UChar)c : 0xfffd;  // If the caller works properly, we should never see c<0.
1890     } else {
1891         return 0xfffd;
1892     }
1893 }
1894
1895 // Code point version. Treats the offset as a _code point_ delta.
1896 // We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
1897 // We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
1898 static inline
1899 UChar32 peekCodePoint(collIterate *source, int32_t offset) {
1900     UChar32 c;
1901     if(source->pos != NULL) {
1902         const UChar *p = source->pos;
1903         if(offset >= 0) {
1904             // Skip forward over (offset-1) code points.
1905             while(--offset >= 0) {
1906                 if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
1907                     ++p;
1908                 }
1909             }
1910             // Read the code point there.
1911             c = *p++;
1912             UChar trail;
1913             if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
1914                 c = U16_GET_SUPPLEMENTARY(c, trail);
1915             }
1916         } else /* offset<0 */ {
1917             // Skip backward over (offset-1) code points.
1918             while(++offset < 0) {
1919                 if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
1920                     --p;
1921                 }
1922             }
1923             // Read the code point before that.
1924             c = *--p;
1925             UChar lead;
1926             if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
1927                 c = U16_GET_SUPPLEMENTARY(lead, c);
1928             }
1929         }
1930     } else if(source->iterator != NULL) {
1931         if(offset >= 0) {
1932             // Skip forward over (offset-1) code points.
1933             int32_t fwd = offset;
1934             while(fwd-- > 0) {
1935                 uiter_next32(source->iterator);
1936             }
1937             // Read the code point there.
1938             c = uiter_current32(source->iterator);
1939             // Return to the starting point, skipping backward over (offset-1) code points.
1940             while(offset-- > 0) {
1941                 uiter_previous32(source->iterator);
1942             }
1943         } else /* offset<0 */ {
1944             // Read backward, reading offset code points, remember only the last-read one.
1945             int32_t back = offset;
1946             do {
1947                 c = uiter_previous32(source->iterator);
1948             } while(++back < 0);
1949             // Return to the starting position, skipping forward over offset code points.
1950             do {
1951                 uiter_next32(source->iterator);
1952             } while(++offset < 0);
1953         }
1954     } else {
1955         c = U_SENTINEL;
1956     }
1957     return c;
1958 }
1959
1960 /**
1961 * Determines if we are at the start of the data string in the backwards
1962 * collation iterator
1963 * @param data collation iterator
1964 * @return TRUE if we are at the start
1965 */
1966 static
1967 inline UBool isAtStartPrevIterate(collIterate *data) {
1968     if(data->pos == NULL && data->iterator != NULL) {
1969         return !data->iterator->hasPrevious(data->iterator);
1970     }
1971     //return (collIter_bos(data)) ||
1972     return (data->pos == data->string) ||
1973               ((data->flags & UCOL_ITER_INNORMBUF) && (data->pos != NULL) &&
1974               *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1975 }
1976
1977 static
1978 inline void goBackOne(collIterate *data) {
1979 # if 0
1980     // somehow, it looks like we need to keep iterator synced up
1981     // at all times, as above.
1982     if(data->pos) {
1983         data->pos--;
1984     }
1985     if(data->iterator) {
1986         data->iterator->previous(data->iterator);
1987     }
1988 #endif
1989     if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1990         data->iterator->previous(data->iterator);
1991     }
1992     if(data->pos) {
1993         data->pos --;
1994     }
1995 }
1996
1997 /**
1998 * Inline function that gets a simple CE.
1999 * So what it does is that it will first check the expansion buffer. If the
2000 * expansion buffer is not empty, ie the end pointer to the expansion buffer
2001 * is different from the string pointer, we return the collation element at the
2002 * return pointer and decrement it.
2003 * For more complicated CEs it resorts to getComplicatedCE.
2004 * @param coll collator data
2005 * @param data collation iterator struct
2006 * @param status error status
2007 */
2008 static
2009 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
2010                                UErrorCode *status)
2011 {
2012     uint32_t result = (uint32_t)UCOL_NULLORDER;
2013
2014     if (data->offsetReturn != NULL) {
2015         if (data->offsetRepeatCount > 0) {
2016                 data->offsetRepeatCount -= 1;
2017         } else {
2018             if (data->offsetReturn == data->offsetBuffer) {
2019                 data->offsetReturn = NULL;
2020                 data->offsetStore  = data->offsetBuffer;
2021             } else {
2022                 data->offsetReturn -= 1;
2023             }
2024         }
2025     }
2026
2027     if ((data->extendCEs && data->toReturn > data->extendCEs) ||
2028             (!data->extendCEs && data->toReturn > data->CEs))
2029     {
2030         data->toReturn -= 1;
2031         result = *(data->toReturn);
2032         if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
2033             data->CEpos = data->toReturn;
2034         }
2035     }
2036     else {
2037         UChar ch = 0;
2038
2039         do {
2040             /*
2041             Loop handles case when incremental normalize switches to or from the
2042             side buffer / original string, and we need to start again to get the
2043             next character.
2044             */
2045             for (;;) {
2046                 if (data->flags & UCOL_ITER_HASLEN) {
2047                     /*
2048                     Normal path for strings when length is specified.
2049                     Not in side buffer because it is always null terminated.
2050                     */
2051                     if (data->pos <= data->string) {
2052                         /* End of the main source string */
2053                         return UCOL_NO_MORE_CES;
2054                     }
2055                     data->pos --;
2056                     ch = *data->pos;
2057                 }
2058                 // we are using an iterator to go back. Pray for us!
2059                 else if (data->flags & UCOL_USE_ITERATOR) {
2060                   UChar32 iterCh = data->iterator->previous(data->iterator);
2061                   if(iterCh == U_SENTINEL) {
2062                     return UCOL_NO_MORE_CES;
2063                   } else {
2064                     ch = (UChar)iterCh;
2065                   }
2066                 }
2067                 else {
2068                     data->pos --;
2069                     ch = *data->pos;
2070                     /* we are in the side buffer. */
2071                     if (ch == 0) {
2072                         /*
2073                         At the start of the normalize side buffer.
2074                         Go back to string.
2075                         Because pointer points to the last accessed character,
2076                         hence we have to increment it by one here.
2077                         */
2078                         data->flags = data->origFlags;
2079                         data->offsetRepeatValue = 0;
2080
2081                          if (data->fcdPosition == NULL) {
2082                             data->pos = data->string;
2083                             return UCOL_NO_MORE_CES;
2084                         }
2085                         else {
2086                             data->pos   = data->fcdPosition + 1;
2087                         }
2088
2089                        continue;
2090                     }
2091                 }
2092
2093                 if(data->flags&UCOL_HIRAGANA_Q) {
2094                   if(ch>=0x3040 && ch<=0x309f) {
2095                     data->flags |= UCOL_WAS_HIRAGANA;
2096                   } else {
2097                     data->flags &= ~UCOL_WAS_HIRAGANA;
2098                   }
2099                 }
2100
2101                 /*
2102                 * got a character to determine if there's fcd and/or normalization
2103                 * stuff to do.
2104                 * if the current character is not fcd.
2105                 * if current character is at the start of the string
2106                 * Trailing combining class == 0.
2107                 * Note if pos is in the writablebuffer, norm is always 0
2108                 */
2109                 if (ch < ZERO_CC_LIMIT_ ||
2110                   // this should propel us out of the loop in the iterator case
2111                     (data->flags & UCOL_ITER_NORM) == 0 ||
2112                     (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2113                     || data->string == data->pos) {
2114                     break;
2115                 }
2116
2117                 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2118                     /* if next character is FCD */
2119                     if (data->pos == data->string) {
2120                         /* First char of string is always OK for FCD check */
2121                         break;
2122                     }
2123
2124                     /* Not first char of string, do the FCD fast test */
2125                     if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2126                         break;
2127                     }
2128                 }
2129
2130                 /* Need a more complete FCD check and possible normalization. */
2131                 if (collPrevIterFCD(data)) {
2132                     collPrevIterNormalize(data);
2133                 }
2134
2135                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2136                     /*  No normalization. Go ahead and process the char. */
2137                     break;
2138                 }
2139
2140                 /*
2141                 Some normalization happened.
2142                 Next loop picks up a char from the normalization buffer.
2143                 */
2144             }
2145
2146             /* attempt to handle contractions, after removal of the backwards
2147             contraction
2148             */
2149             if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2150                 result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2151             } else {
2152                 if (ch <= 0xFF) {
2153                     result = coll->latinOneMapping[ch];
2154                 }
2155                 else {
2156                     // Always use UCA for [3400..9FFF], [AC00..D7AF]
2157                     // **** [FA0E..FA2F] ?? ****
2158                     if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
2159                         (ch >= 0x3400 && ch <= 0xD7AF)) {
2160                         if (ch > 0x9FFF && ch < 0xAC00) {
2161                             // between the two target ranges; do normal lookup
2162                             // **** this range is YI, Modifier tone letters, ****
2163                             // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
2164                             // **** Latin-D might be tailored, so we need to ****
2165                             // **** do the normal lookup for these guys.     ****
2166                              result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2167                         } else {
2168                             result = UCOL_NOT_FOUND;
2169                         }
2170                     } else {
2171                         result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
2172                     }
2173                 }
2174                 if (result > UCOL_NOT_FOUND) {
2175                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2176                 }
2177                 if (result == UCOL_NOT_FOUND) { // Not found in master list
2178                     if (!isAtStartPrevIterate(data) &&
2179                         ucol_contractionEndCP(ch, data->coll))
2180                     {
2181                         result = UCOL_CONTRACTION;
2182                     } else {
2183                         if(coll->UCA) {
2184                             result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
2185                         }
2186                     }
2187
2188                     if (result > UCOL_NOT_FOUND) {
2189                         if(coll->UCA) {
2190                             result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2191                         }
2192                     }
2193                 }
2194             }
2195         } while ( result == UCOL_IGNORABLE && ch >= UCOL_FIRST_HANGUL && ch <= UCOL_LAST_HANGUL );
2196
2197         if(result == UCOL_NOT_FOUND) {
2198             result = getPrevImplicit(ch, data);
2199         }
2200     }
2201
2202     return result;
2203 }
2204
2205
2206 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
2207 U_CFUNC uint32_t  U_EXPORT2
2208 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2209                         UErrorCode *status) {
2210     return ucol_IGetPrevCE(coll, data, status);
2211 }
2212
2213
2214 /* this should be connected to special Jamo handling */
2215 U_CFUNC uint32_t  U_EXPORT2
2216 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2217     collIterate colIt;
2218     IInit_collIterate(coll, &u, 1, &colIt, status);
2219     if(U_FAILURE(*status)) {
2220         return 0;
2221     }
2222     return ucol_IGetNextCE(coll, &colIt, status);
2223 }
2224
2225 /**
2226 * Inserts the argument character into the end of the buffer pushing back the
2227 * null terminator.
2228 * @param data collIterate struct data
2229 * @param ch character to be appended
2230 * @return the position of the new addition
2231 */
2232 static
2233 inline const UChar * insertBufferEnd(collIterate *data, UChar ch)
2234 {
2235     int32_t oldLength = data->writableBuffer.length();
2236     return data->writableBuffer.append(ch).getTerminatedBuffer() + oldLength;
2237 }
2238
2239 /**
2240 * Inserts the argument string into the end of the buffer pushing back the
2241 * null terminator.
2242 * @param data collIterate struct data
2243 * @param string to be appended
2244 * @param length of the string to be appended
2245 * @return the position of the new addition
2246 */
2247 static
2248 inline const UChar * insertBufferEnd(collIterate *data, const UChar *str, int32_t length)
2249 {
2250     int32_t oldLength = data->writableBuffer.length();
2251     return data->writableBuffer.append(str, length).getTerminatedBuffer() + oldLength;
2252 }
2253
2254 /**
2255 * Special normalization function for contraction in the forwards iterator.
2256 * This normalization sequence will place the current character at source->pos
2257 * and its following normalized sequence into the buffer.
2258 * The fcd position, pos will be changed.
2259 * pos will now point to positions in the buffer.
2260 * Flags will be changed accordingly.
2261 * @param data collation iterator data
2262 */
2263 static
2264 inline void normalizeNextContraction(collIterate *data)
2265 {
2266     int32_t     strsize;
2267     UErrorCode  status     = U_ZERO_ERROR;
2268     /* because the pointer points to the next character */
2269     const UChar *pStart    = data->pos - 1;
2270     const UChar *pEnd;
2271
2272     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2273         data->writableBuffer.setTo(*(pStart - 1));
2274         strsize               = 1;
2275     }
2276     else {
2277         strsize = data->writableBuffer.length();
2278     }
2279
2280     pEnd = data->fcdPosition;
2281
2282     data->writableBuffer.append(
2283         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)), status));
2284     if(U_FAILURE(status)) {
2285         return;
2286     }
2287
2288     data->pos        = data->writableBuffer.getTerminatedBuffer() + strsize;
2289     data->origFlags  = data->flags;
2290     data->flags     |= UCOL_ITER_INNORMBUF;
2291     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2292 }
2293
2294 /**
2295 * Contraction character management function that returns the next character
2296 * for the forwards iterator.
2297 * Does nothing if the next character is in buffer and not the first character
2298 * in it.
2299 * Else it checks next character in data string to see if it is normalizable.
2300 * If it is not, the character is simply copied into the buffer, else
2301 * the whole normalized substring is copied into the buffer, including the
2302 * current character.
     *
     * When neither UCOL_ITER_NORM nor UCOL_ITER_INNORMBUF is set, the character
     * is read directly from the character iterator (UCOL_USE_ITERATOR) or from
     * *pos, and pos is advanced.
     *
2303 * @param data collation element iterator data
2304 * @return next character; (UChar)-1 if appending to the normalization buffer
     *         yields a NULL pointer (data->pos is then NULL as well)
2305 */
2306 static
2307 inline UChar getNextNormalizedChar(collIterate *data)
2308 {
2309     UChar  nextch;
2310     UChar  ch;
2311     // Here we need to add the iterator code. One problem is the way
2312     // end of string is handled. If we just return next char, it could
2313     // be the sentinel. Most of the cases already check for this, but we
2314     // need to be sure.
2315     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2316          /* if no normalization and not in buffer. */
2317       if(data->flags & UCOL_USE_ITERATOR) {
2318          return (UChar)data->iterator->next(data->iterator);
2319       } else {
2320          return *(data->pos ++);
2321       }
2322     }
2323
2324     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2325       //normalizeIterator(data);
2326     //}
2327
2328     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2329     if ((innormbuf && *data->pos != 0) ||
2330         (data->fcdPosition != NULL && !innormbuf &&
2331         data->pos < data->fcdPosition)) {
2332         /*
2333         if next character is in normalized buffer, no further normalization
2334         is required
             (the same holds in the data string while pos is still before
             fcdPosition)
2335         */
2336         return *(data->pos ++);
2337     }
2338
2339     if (data->flags & UCOL_ITER_HASLEN) {
2340         /* in data string */
             /* last code unit of a length-delimited string: nothing follows it */
2341         if (data->pos + 1 == data->endp) {
2342             return *(data->pos ++);
2343         }
2344     }
2345     else {
2346         if (innormbuf) {
2347           // inside the normalization buffer, but at the end
2348           // (since we encountered zero). This means, in the
2349           // case we're using char iterator, that we need to
2350           // do another round of normalization.
2351           //if(data->origFlags & UCOL_USE_ITERATOR) {
2352             // we need to restore original flags,
2353             // otherwise, we'll lose them
2354             //data->flags = data->origFlags;
2355             //normalizeIterator(data);
2356             //return *(data->pos++);
2357           //} else {
2358             /*
2359             in writable buffer, at this point fcdPosition can not be
2360             pointing to the end of the data string. see contracting tag.
2361             */
2362           if(data->fcdPosition) {
2363             if (*(data->fcdPosition + 1) == 0 ||
2364                 data->fcdPosition + 1 == data->endp) {
2365                 /* at the end of the string, dump it into the normalizer */
2366                 const UChar *inserted = insertBufferEnd(data, *(data->fcdPosition));
                     // Offset only a non-NULL result: adding 1 first would turn a
                     // NULL into a non-NULL value and defeat the check below.
2367                 data->pos = (inserted != NULL) ? inserted + 1 : NULL;
2368                 if (data->pos == NULL) {
2369                     return (UChar)-1; // Return to indicate error.
2370                 }
2371                 return *(data->fcdPosition ++);
2372             }
                 /* buffer exhausted: resume in the data string at fcdPosition */
2373             data->pos = data->fcdPosition;
2374           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2375             // if we are here, we're using a normalizing iterator.
2376             // we should just continue further.
2377             data->flags = data->origFlags;
2378             data->pos = NULL;
2379             return (UChar)data->iterator->next(data->iterator);
2380           }
2381           //}
2382         }
2383         else {
                 /* NUL-terminated data string: last code unit before the terminator */
2384             if (*(data->pos + 1) == 0) {
2385                 return *(data->pos ++);
2386             }
2387         }
2388     }
2389
         /* ch is the character to return; nextch is only peeked at */
2390     ch = *data->pos ++;
2391     nextch = *data->pos;
2392
2393     /*
2394     * if the current character is not fcd.
2395     * Trailing combining class == 0.
2396     */
2397     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2398         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2399          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2400             /*
2401             Need a more complete FCD check and possible normalization.
2402             normalize substring will be appended to buffer
2403             */
2404         if (collIterFCD(data)) {
2405             normalizeNextContraction(data);
2406             return *(data->pos ++);
2407         }
2408         else if (innormbuf) {
2409             /* fcdposition shifted even when there's no normalization, if we
2410             don't input the rest into this, we'll get the wrong position when
2411             we reach the end of the writableBuffer */
2412             int32_t length = (int32_t)(data->fcdPosition - data->pos + 1);
2413             data->pos = insertBufferEnd(data, data->pos - 1, length);
2414             // Check if data->pos received a null pointer
2415             if (data->pos == NULL) {
2416                 return (UChar)-1; // Return to indicate error.
2417             }
2418             return *(data->pos ++);
2419         }
2420     }
2421
2422     if (innormbuf) {
2423         /*
2424         no normalization is to be done hence only one character will be
2425         appended to the buffer.
2426         */
2427         const UChar *inserted = insertBufferEnd(data, ch);
             // Offset only a non-NULL result so that the NULL check below can fire.
2428         data->pos = (inserted != NULL) ? inserted + 1 : NULL;
2429         if (data->pos == NULL) {
2430             return (UChar)-1; // Return to indicate error.
2431         }
2432     }
2433
2434     /* points back to the pos in string */
2435     return ch;
2436 }
2437
2438
2439
2440 /**
2441 * Copies the given buffer into writableBuffer and repositions the iterator
2442 * so that reading continues at the start of writableBuffer.
     * If the iterator was not yet in the normalization buffer, fcdPosition is set
     * to the current pos, the flags are saved in origFlags, and writableBuffer is
     * replaced by buffer. Otherwise the part of writableBuffer before pos is
     * replaced by buffer and the remainder is kept.
2443 * @param source data string source
2444 * @param buffer character buffer
2445 */
2446 static
2447 inline void setDiscontiguosAttribute(collIterate *source, const UnicodeString &buffer)
2448 {
2449     /* Confusing part here. To ensure that the skipped characters are
2450     considered later, we need to place them in the appropriate position in the
2451     normalization buffer and reassign the pos pointer. Simple case: if pos
2452     resides in the string, simply copy to the normalization buffer and set
2453     fcdposition = pos, pos = start of normalization buffer. If pos is in the
2454     normalization buffer, we insert the copy in front of pos and point pos
2455     to the start of the normalization buffer. Why these copies?
2456     So that the whole chunk of code in getNextCE, ucol_prv_getSpecialCE does
2457     not require any changes, which would be really painful. */
2458     if (source->flags & UCOL_ITER_INNORMBUF) {
             /* number of code units between the buffer start and pos */
2459         int32_t replaceLength = source->pos - source->writableBuffer.getBuffer();
2460         source->writableBuffer.replace(0, replaceLength, buffer);
2461     }
2462     else {
2463         source->fcdPosition  = source->pos;
2464         source->origFlags    = source->flags;
2465         source->flags       |= UCOL_ITER_INNORMBUF;
2466         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2467         source->writableBuffer = buffer;
2468     }
2469
         /* in both cases reading resumes at the first unit of writableBuffer */
2470     source->pos = source->writableBuffer.getTerminatedBuffer();
2471 }
2472
2473 /**
2474 * Function to get the discontiguous collation element within the source.
2475 * Note this function will set the position to the appropriate places.
     * Characters that are read but do not extend the contraction are collected
     * in a local buffer; when a match is returned that buffer is handed to
     * setDiscontiguosAttribute() so the skipped characters are read again later.
     * If no match is found, the iterator state saved on entry is restored.
2476 * @param coll current collator used
2477 * @param source data string source
2478 * @param constart index to the start character in the contraction table
2479 * @return CE read from coll->contractionCEs: the matched entry, or the entry
     *         at the contraction start when nothing matched
2480 */
2481 static
2482 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2483                                 const UChar *constart)
2484 {
2485     /* source->pos currently points to the second combining character after
2486        the start character */
2487           const UChar *temppos      = source->pos;
2488           UnicodeString buffer;
2489     const UChar   *tempconstart = constart;
2490           uint8_t  tempflags    = source->flags;
2491           UBool    multicontraction = FALSE;
2492           collIterateState discState;
2493
2494           backupState(source, &discState);
2495
         /* seed the skipped-character buffer with the code point before pos */
2496     buffer.setTo(peekCodePoint(source, -1));
2497     for (;;) {
2498         UChar    *UCharOffset;
2499         UChar     schar,
2500                   tchar;
2501         uint32_t  result;
2502
             /* Stop scanning when: the length-delimited string is exhausted; or
                the next code unit is NUL and there is nothing combining left to
                continue with at fcdPosition; or the next code point has
                combining class 0. */
2503         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2504             || (peekCodeUnit(source, 0) == 0  &&
2505             //|| (*source->pos == 0  &&
2506                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2507                  source->fcdPosition == NULL ||
2508                  source->fcdPosition == source->endp ||
2509                  *(source->fcdPosition) == 0 ||
2510                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2511                  /* end of string in null terminated string or stopped by a
2512                  null character, note fcd does not always point to a base
2513                  character after the discontiguous change */
2514                  u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
2515                  //u_getCombiningClass(*(source->pos)) == 0) {
2516             //constart = (UChar *)coll->image + getContractOffset(CE);
2517             if (multicontraction) {
                     /* a multi-contraction was in progress: return the CE stored
                        at its table start, resuming from the remembered position */
2518                 source->pos    = temppos - 1;
2519                 setDiscontiguosAttribute(source, buffer);
2520                 return *(coll->contractionCEs +
2521                                     (tempconstart - coll->contractionIndex));
2522             }
2523             constart = tempconstart;
2524             break;
2525         }
2526
2527         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2528         schar = getNextNormalizedChar(source);
2529
             /* table entries are scanned in increasing order until one is not
                smaller than schar */
2530         while (schar > (tchar = *UCharOffset)) {
2531             UCharOffset++;
2532         }
2533
2534         if (schar != tchar) {
2535             /* not the correct codepoint. we stuff the current codepoint into
2536             the discontiguous buffer and try the next character */
2537             buffer.append(schar);
2538             continue;
2539         }
2540         else {
                 /* schar is in the table, but it is skipped if it has the same
                    combining class as the code point at offset -2
                    (presumably the one just before schar -- confirm against
                    peekCodePoint) */
2541             if (u_getCombiningClass(schar) ==
2542                 u_getCombiningClass(peekCodePoint(source, -2))) {
2543                 buffer.append(schar);
2544                 continue;
2545             }
2546             result = *(coll->contractionCEs +
2547                                       (UCharOffset - coll->contractionIndex));
2548         }
2549
2550         if (result == UCOL_NOT_FOUND) {
2551           break;
2552         } else if (isContraction(result)) {
2553             /* this is a multi-contraction*/
2554             tempconstart = (UChar *)coll->image + getContractOffset(result);
2555             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2556                 != UCOL_NOT_FOUND) {
2557                 multicontraction = TRUE;
2558                 temppos       = source->pos + 1;
2559             }
2560         } else {
                 /* complete match: queue the skipped characters and return its CE */
2561             setDiscontiguosAttribute(source, buffer);
2562             return result;
2563         }
2564     }
2565
2566     /* no problems simply reverting just like that,
2567     if we are in string before getting into this function, points back to
2568     string hence no problem.
2569     if we are in normalization buffer before getting into this function,
2570     since we'll never use another normalization within this function, we
2571     know that fcdposition points to a base character. the normalization buffer
2572     never change, hence this revert works. */
2573     loadState(source, &discState, TRUE);
2574     goBackOne(source);
2575
2576     //source->pos   = temppos - 1;
2577     source->flags = tempflags;
2578     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2579 }
2580
2581 /* now uses Mark's getImplicitPrimary code.
        Computes the implicit primary for cp, stores a second CE built from its
        low 16 bits at *CEpos (advancing CEpos and offsetRepeatCount), and returns
        the first CE built from the bits selected by UCOL_PRIMARYMASK. */
2582 static
2583 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2584     uint32_t r = uprv_uca_getImplicitPrimary(cp);
         /* low half of r moved to the top 16 bits; 0xC0 is presumably the
            continuation marker (UCOL_CONTINUATION_MARKER) -- TODO confirm */
2585     *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2586     collationSource->offsetRepeatCount += 1;
         /* 0x0505 presumably stands for common secondary/tertiary bytes -- TODO confirm */
2587     return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2588 }
2589
2590 /**
2591 * Inserts the argument character into the front of the buffer replacing the
2592 * front null terminator.
     * The code unit at index 0 of writableBuffer is overwritten with ch, a new
     * NUL is then inserted in front of it, and data->pos is set to index 2 of
     * the buffer, i.e. just past ch.
2593 * @param data collation element iterator data
2594 * @param ch character to be inserted at the front
2595 */
2596 static
2597 inline void insertBufferFront(collIterate *data, UChar ch)
2598 {
2599     data->pos = data->writableBuffer.setCharAt(0, ch).insert(0, (UChar)0).getTerminatedBuffer() + 2;
2600 }
2601
2602 /**
2603 * Special normalization function for contraction in the previous iterator.
2604 * This normalization sequence will place the current character at source->pos
2605 * and its preceding normalized sequence into the buffer.
2606 * The fcd position, pos will be changed.
2607 * pos will now point to positions in the buffer.
2608 * Flags will be changed accordingly.
     * The buffer ends up as: a leading NUL, the normalized text, then the
     * previous tail (endOfBuffer); pos points just past the normalized text.
2609 * @param data collation iterator data
     * @param status error code set by the normalizer; on failure the function
     *        returns before pos and flags are updated
2610 */
2611 static
2612 inline void normalizePrevContraction(collIterate *data, UErrorCode *status)
2613 {
2614     const UChar *pEnd = data->pos + 1;         /* End normalize + 1 */
2615     const UChar *pStart;
2616
2617     UnicodeString endOfBuffer;
2618     if (data->flags & UCOL_ITER_HASLEN) {
2619         /*
2620         normalization buffer not used yet, we'll pull down the next
2621         character into the end of the buffer
2622         */
2623         endOfBuffer.setTo(*pEnd);
2624     }
2625     else {
2626         endOfBuffer.setTo(data->writableBuffer, 1);  // after the leading NUL
2627     }
2628
         /* normalize from the string start, or from just after fcdPosition */
2629     if (data->fcdPosition == NULL) {
2630         pStart = data->string;
2631     }
2632     else {
2633         pStart = data->fcdPosition + 1;
2634     }
         /* writableBuffer is the destination of normalize(), so its old content
            is gone after this call; that is why the tail was saved above */
2635     int32_t normLen =
2636         data->nfd->normalize(UnicodeString(FALSE, pStart, (int32_t)(pEnd - pStart)),
2637                              data->writableBuffer,
2638                              *status).
2639         length();
2640     if(U_FAILURE(*status)) {
2641         return;
2642     }
2643     /*
2644     this puts the null termination in front of the normalized string instead
2645     of the end
2646     */
2647     data->pos =
2648         data->writableBuffer.insert(0, (UChar)0).append(endOfBuffer).getTerminatedBuffer() +
2649         1 + normLen;
2650     data->origFlags  = data->flags;
2651     data->flags     |= UCOL_ITER_INNORMBUF;
2652     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2653 }
2654
2655 /**
2656 * Contraction character management function that returns the previous character
2657 * for the backwards iterator.
2658 * Does nothing if the previous character is in buffer and not the first
2659 * character in it.
2660 * Else it checks previous character in data string to see if it is
2661 * normalizable.
2662 * If it is not, the character is simply copied into the buffer, else
2663 * the whole normalized substring is copied into the buffer, including the
2664 * current character.
     * On the simple paths the character is only looked at: data->pos is not
     * decremented here, and a character iterator is moved back by one and then
     * advanced again.
2665 * @param data collation element iterator data
     * @param status error code passed on to normalizePrevContraction()
2666 * @return previous character
2667 */
2668 static
2669 inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status)
2670 {
2671     UChar  prevch;
2672     UChar  ch;
2673     const UChar *start;
2674     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2675     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2676         (innormbuf && *(data->pos - 1) != 0)) {
2677         /*
2678         if no normalization.
2679         if previous character is in normalized buffer, no further normalization
2680         is required
2681         */
2682       if(data->flags & UCOL_USE_ITERATOR) {
2683         data->iterator->move(data->iterator, -1, UITER_CURRENT);
2684         return (UChar)data->iterator->next(data->iterator);
2685       } else {
2686         return *(data->pos - 1);
2687       }
2688     }
2689
2690     start = data->pos;
2691     if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) {
2692         /* in data string */
             /* first code unit of the string: nothing precedes it to check */
2693         if ((start - 1) == data->string) {
2694             return *(start - 1);
2695         }
2696         start --;
2697         ch     = *start;
2698         prevch = *(start - 1);
2699     }
2700     else {
2701         /*
2702         in writable buffer, at this point fcdPosition can not be NULL.
2703         see contracting tag.
2704         */
2705         if (data->fcdPosition == data->string) {
2706             /* at the start of the string, just dump it into the normalizer */
2707             insertBufferFront(data, *(data->fcdPosition));
2708             data->fcdPosition = NULL;
2709             return *(data->pos - 1);
2710         }
2711         start  = data->fcdPosition;
2712         ch     = *start;
2713         prevch = *(start - 1);
2714     }
2715     /*
2716     * if the current character is not fcd.
2717     * Trailing combining class == 0.
         * NOTE(review): on the buffer path above start == fcdPosition, so the
         * fcdPosition > start test below can only hold on the data-string path.
2718     */
2719     if (data->fcdPosition > start &&
2720        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2721     {
2722         /*
2723         Need a more complete FCD check and possible normalization.
2724         normalize substring will be appended to buffer
2725         */
2726         const UChar *backuppos = data->pos;
2727         data->pos = start;
2728         if (collPrevIterFCD(data)) {
2729             normalizePrevContraction(data, status);
2730             return *(data->pos - 1);
2731         }
             /* no normalization needed: restore pos and step fcdPosition forward */
2732         data->pos = backuppos;
2733         data->fcdPosition ++;
2734     }
2735
2736     if (innormbuf) {
2737     /*
2738     no normalization is to be done hence only one character will be
2739     inserted at the front of the buffer.
2740     */
2741         insertBufferFront(data, ch);
2742         data->fcdPosition --;
2743     }
2744
2745     return ch;
2746 }
2747
2748 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2749 /* It is called by getNextCE */
2750
2751 /* The following should be even */
2752 #define UCOL_MAX_DIGITS_FOR_NUMBER 254
2753
2754 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2755     collIterateState entryState;
2756     backupState(source, &entryState);
2757     UChar32 cp = ch;
2758
2759     for (;;) {
2760         // This loop will repeat only in the case of contractions, and only when a contraction
2761         //   is found and the first CE resulting from that contraction is itself a special
2762         //   (an expansion, for example.)  All other special CE types are fully handled the
2763         //   first time through, and the loop exits.
2764
2765         const uint32_t *CEOffset = NULL;
2766         switch(getCETag(CE)) {
2767         case NOT_FOUND_TAG:
2768             /* This one is not found, and we'll let somebody else bother about it... no more games */
2769             return CE;
2770         case SPEC_PROC_TAG:
2771             {
2772                 // Special processing is getting a CE that is preceded by a certain prefix
2773                 // Currently this is only needed for optimizing Japanese length and iteration marks.
2774                 // When we encouter a special processing tag, we go backwards and try to see if
2775                 // we have a match.
2776                 // Contraction tables are used - so the whole process is not unlike contraction.
2777                 // prefix data is stored backwards in the table.
2778                 const UChar *UCharOffset;
2779                 UChar schar, tchar;
2780                 collIterateState prefixState;
2781                 backupState(source, &prefixState);
2782                 loadState(source, &entryState, TRUE);
2783                 goBackOne(source); // We want to look at the point where we entered - actually one
2784                 // before that...
2785
2786                 for(;;) {
2787                     // This loop will run once per source string character, for as long as we
2788                     //  are matching a potential contraction sequence
2789
2790                     // First we position ourselves at the begining of contraction sequence
2791                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2792                     if (collIter_bos(source)) {
2793                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2794                         break;
2795                     }
2796                     schar = getPrevNormalizedChar(source, status);
2797                     goBackOne(source);
2798
2799                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2800                         UCharOffset++;
2801                     }
2802
2803                     if (schar == tchar) {
2804                         // Found the source string char in the table.
2805                         //  Pick up the corresponding CE from the table.
2806                         CE = *(coll->contractionCEs +
2807                             (UCharOffset - coll->contractionIndex));
2808                     }
2809                     else
2810                     {
2811                         // Source string char was not in the table.
2812                         //   We have not found the prefix.
2813                         CE = *(coll->contractionCEs +
2814                             (ContractionStart - coll->contractionIndex));
2815                     }
2816
2817                     if(!isPrefix(CE)) {
2818                         // The source string char was in the contraction table, and the corresponding
2819                         //   CE is not a prefix CE.  We found the prefix, break
2820                         //   out of loop, this CE will end up being returned.  This is the normal
2821                         //   way out of prefix handling when the source actually contained
2822                         //   the prefix.
2823                         break;
2824                     }
2825                 }
2826                 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2827                     loadState(source, &prefixState, TRUE);
2828                     if(source->origFlags & UCOL_USE_ITERATOR) {
2829                         source->flags = source->origFlags;
2830                     }
2831                 } else { // prefix search was a failure, we have to backup all the way to the start
2832                     loadState(source, &entryState, TRUE);
2833                 }
2834                 break;
2835             }
2836         case CONTRACTION_TAG:
2837             {
2838                 /* This should handle contractions */
2839                 collIterateState state;
2840                 backupState(source, &state);
2841                 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND;
2842                 const UChar *UCharOffset;
2843                 UChar schar, tchar;
2844
2845                 for (;;) {
2846                     /* This loop will run once per source string character, for as long as we     */
2847                     /*  are matching a potential contraction sequence                  */
2848
2849                     /* First we position ourselves at the begining of contraction sequence */
2850                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2851
2852                     if (collIter_eos(source)) {
2853                         // Ran off the end of the source string.
2854                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2855                         // So we'll pick whatever we have at the point...
2856                         if (CE == UCOL_NOT_FOUND) {
2857                             // back up the source over all the chars we scanned going into this contraction.
2858                             CE = firstCE;
2859                             loadState(source, &state, TRUE);
2860                             if(source->origFlags & UCOL_USE_ITERATOR) {
2861                                 source->flags = source->origFlags;
2862                             }
2863                         }
2864                         break;
2865                     }
2866
2867                     uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2868                     uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2869
2870                     schar = getNextNormalizedChar(source);
2871                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2872                         UCharOffset++;
2873                     }
2874
2875                     if (schar == tchar) {
2876                         // Found the source string char in the contraction table.
2877                         //  Pick up the corresponding CE from the table.
2878                         CE = *(coll->contractionCEs +
2879                             (UCharOffset - coll->contractionIndex));
2880                     }
2881                     else
2882                     {
2883                         // Source string char was not in contraction table.
2884                         //   Unless we have a discontiguous contraction, we have finished
2885                         //   with this contraction.
2886                         // in order to do the proper detection, we
2887                         // need to see if we're dealing with a supplementary
2888                         /* We test whether the next two char are surrogate pairs.
2889                         * This test is done if the iterator is not NULL.
2890                         * If there is no surrogate pair, the iterator
2891                         * goes back one if needed. */
2892                         UChar32 miss = schar;
2893                         if (source->iterator) {
2894                             UChar32 surrNextChar; /* the next char in the iteration to test */
2895                             int32_t prevPos; /* holds the previous position before move forward of the source iterator */
2896                             if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) {
2897                                 prevPos = source->iterator->index;
2898                                 surrNextChar = getNextNormalizedChar(source);
2899                                 if (U16_IS_TRAIL(surrNextChar)) {
2900                                     miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar);
2901                                 } else if (prevPos < source->iterator->index){
2902                                     goBackOne(source);
2903                                 }
2904                             }
2905                         } else if (U16_IS_LEAD(schar)) {
2906                             miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source));
2907                         }
2908
2909                         uint8_t sCC;
2910                         if (miss < 0x300 ||
2911                             maxCC == 0 ||
2912                             (sCC = i_getCombiningClass(miss, coll)) == 0 ||
2913                             sCC>maxCC ||
2914                             (allSame != 0 && sCC == maxCC) ||
2915                             collIter_eos(source))
2916                         {
2917                             //  Contraction can not be discontiguous.
2918                             goBackOne(source);  // back up the source string by one,
2919                             //  because  the character we just looked at was
2920                             //  not part of the contraction.   */
2921                             if(U_IS_SUPPLEMENTARY(miss)) {
2922                                 goBackOne(source);
2923                             }
2924                             CE = *(coll->contractionCEs +
2925                                 (ContractionStart - coll->contractionIndex));
2926                         } else {
2927                             //
2928                             // Contraction is possibly discontiguous.
2929                             //   Scan more of source string looking for a match
2930                             //
2931                             UChar tempchar;
2932                             /* find the next character if schar is not a base character
2933                             and we are not yet at the end of the string */
2934                             tempchar = getNextNormalizedChar(source);
2935                             // probably need another supplementary thingie here
2936                             goBackOne(source);
2937                             if (i_getCombiningClass(tempchar, coll) == 0) {
2938                                 goBackOne(source);
2939                                 if(U_IS_SUPPLEMENTARY(miss)) {
2940                                     goBackOne(source);
2941                                 }
2942                                 /* Spit out the last char of the string, wasn't tasty enough */
2943                                 CE = *(coll->contractionCEs +
2944                                     (ContractionStart - coll->contractionIndex));
2945                             } else {
2946                                 CE = getDiscontiguous(coll, source, ContractionStart);
2947                             }
2948                         }
2949                     } // else after if(schar == tchar)
2950
2951                     if(CE == UCOL_NOT_FOUND) {
2952                         /* The Source string did not match the contraction that we were checking.  */
2953                         /*  Back up the source position to undo the effects of having partially    */
2954                         /*   scanned through what ultimately proved to not be a contraction.       */
2955                         loadState(source, &state, TRUE);
2956                         CE = firstCE;
2957                         break;
2958                     }
2959
2960                     if(!isContraction(CE)) {
2961                         // The source string char was in the contraction table, and the corresponding
2962                         //   CE is not a contraction CE.  We completed the contraction, break
2963                         //   out of loop, this CE will end up being returned.  This is the normal
2964                         //   way out of contraction handling when the source actually contained
2965                         //   the contraction.
2966                         break;
2967                     }
2968
2969
2970                     // The source string char was in the contraction table, and the corresponding
2971                     //   CE is IS  a contraction CE.  We will continue looping to check the source
2972                     //   string for the remaining chars in the contraction.
2973                     uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2974                     if(tempCE != UCOL_NOT_FOUND) {
2975                         // We have scanned a a section of source string for which there is a
2976                         //  CE from the contraction table.  Remember the CE and scan position, so
2977                         //  that we can return to this point if further scanning fails to
2978                         //  match a longer contraction sequence.
2979                         firstCE = tempCE;
2980
2981                         goBackOne(source);
2982                         backupState(source, &state);
2983                         getNextNormalizedChar(source);
2984
2985                         // Another way to do this is:
2986                         //collIterateState tempState;
2987                         //backupState(source, &tempState);
2988                         //goBackOne(source);
2989                         //backupState(source, &state);
2990                         //loadState(source, &tempState, TRUE);
2991
2992                         // The problem is that for incomplete contractions we have to remember the previous
2993                         // position. Before, the only thing I needed to do was state.pos--;
2994                         // After iterator introduction and especially after introduction of normalizing
2995                         // iterators, it became much more difficult to decrease the saved state.
2996                         // I'm not yet sure which of the two methods above is faster.
2997                     }
2998                 } // for(;;)
2999                 break;
3000             } // case CONTRACTION_TAG:
3001         case LONG_PRIMARY_TAG:
3002             {
3003                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3004                 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3005                 source->offsetRepeatCount += 1;
3006                 return CE;
3007             }
3008         case EXPANSION_TAG:
3009             {
3010                 /* This should handle expansion. */
3011                 /* NOTE: we can encounter both continuations and expansions in an expansion! */
3012                 /* I have to decide where continuations are going to be dealt with */
3013                 uint32_t size;
3014                 uint32_t i;    /* general counter */
3015
3016                 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3017                 size = getExpansionCount(CE);
3018                 CE = *CEOffset++;
3019               //source->offsetRepeatCount = -1;
3020
3021                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3022                     for(i = 1; i<size; i++) {
3023                         *(source->CEpos++) = *CEOffset++;
3024                         source->offsetRepeatCount += 1;
3025                     }
3026                 } else { /* else, we do */
3027                     while(*CEOffset != 0) {
3028                         *(source->CEpos++) = *CEOffset++;
3029                         source->offsetRepeatCount += 1;
3030                     }
3031                 }
3032
3033                 return CE;
3034             }
3035         case DIGIT_TAG:
3036             {
3037                 /*
3038                 We do a check to see if we want to collate digits as numbers; if so we generate
3039                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3040                 */
3041                 //uint32_t size;
3042                 uint32_t i;    /* general counter */
3043
3044                 if (source->coll->numericCollation == UCOL_ON){
3045                     collIterateState digitState = {0,0,0,0,0,0,0,0,0};
3046                     UChar32 char32 = 0;
3047                     int32_t digVal = 0;
3048
3049                     uint32_t digIndx = 0;
3050                     uint32_t endIndex = 0;
3051                     uint32_t trailingZeroIndex = 0;
3052
3053                     uint8_t collateVal = 0;
3054
3055                     UBool nonZeroValReached = FALSE;
3056
3057                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs.
3058                     /*
3059                          We parse the source string until we hit a char that's NOT a digit.
3060                         Use this u_charDigitValue. This might be slow because we have to
3061                         handle surrogates...
3062                     */
3063             /*
3064                     if (U16_IS_LEAD(ch)){
3065                       if (!collIter_eos(source)) {
3066                         backupState(source, &digitState);
3067                         UChar trail = getNextNormalizedChar(source);
3068                         if(U16_IS_TRAIL(trail)) {
3069                           char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3070                         } else {
3071                           loadState(source, &digitState, TRUE);
3072                           char32 = ch;
3073                         }
3074                       } else {
3075                         char32 = ch;
3076                       }
3077                     } else {
3078                       char32 = ch;
3079                     }
3080                     digVal = u_charDigitValue(char32);
3081             */
3082                     digVal = u_charDigitValue(cp); // if we have arrived here, we have
3083                     // already processed possible supplementaries that triggered the digit tag -
3084                     // all supplementaries are marked in the UCA.
3085                     /*
3086                         We  pad a zero in front of the first element anyways. This takes
3087                         care of the (probably) most common case where people are sorting things followed
3088                         by a single digit
3089                     */
3090                     digIndx++;
3091                     for(;;){
3092                         // Make sure we have enough space. No longer needed;
3093                         // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER
3094                         // (it has been pre-incremented) so we just ensure that numTempBuf is big enough
3095                         // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3).
3096
3097                         // Skipping over leading zeroes.
3098                         if (digVal != 0) {
3099                             nonZeroValReached = TRUE;
3100                         }
3101                         if (nonZeroValReached) {
3102                             /*
3103                             We parse the digit string into base 100 numbers (this fits into a byte).
3104                             We only add to the buffer in twos, thus if we are parsing an odd character,
3105                             that serves as the 'tens' digit while the if we are parsing an even one, that
3106                             is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3107                             a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3108                             overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3109                             than all the other bytes.
3110                             */
3111
3112                             if (digIndx % 2 == 1){
3113                                 collateVal += (uint8_t)digVal;
3114
3115                                 // We don't enter the low-order-digit case unless we've already seen
3116                                 // the high order, or for the first digit, which is always non-zero.
3117                                 if (collateVal != 0)
3118                                     trailingZeroIndex = 0;
3119
3120                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3121                                 collateVal = 0;
3122                             }
3123                             else{
3124                                 // We drop the collation value into the buffer so if we need to do
3125                                 // a "front patch" we don't have to check to see if we're hitting the
3126                                 // last element.
3127                                 collateVal = (uint8_t)(digVal * 10);
3128
3129                                 // Check for trailing zeroes.
3130                                 if (collateVal == 0)
3131                                 {
3132                                     if (!trailingZeroIndex)
3133                                         trailingZeroIndex = (digIndx/2) + 2;
3134                                 }
3135                                 else
3136                                     trailingZeroIndex = 0;
3137
3138                                 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3139                             }
3140                             digIndx++;
3141                         }
3142
3143                         // Get next character.
3144                         if (!collIter_eos(source)){
3145                             ch = getNextNormalizedChar(source);
3146                             if (U16_IS_LEAD(ch)){
3147                                 if (!collIter_eos(source)) {
3148                                     backupState(source, &digitState);
3149                                     UChar trail = getNextNormalizedChar(source);
3150                                     if(U16_IS_TRAIL(trail)) {
3151                                         char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3152                                     } else {
3153                                         loadState(source, &digitState, TRUE);
3154                                         char32 = ch;
3155                                     }
3156                                 }
3157                             } else {
3158                                 char32 = ch;
3159                             }
3160
3161                             if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){
3162                                 // Resetting position to point to the next unprocessed char. We
3163                                 // overshot it when doing our test/set for numbers.
3164                                 if (char32 > 0xFFFF) { // For surrogates.
3165                                     loadState(source, &digitState, TRUE);
3166                                     //goBackOne(source);
3167                                 }
3168                                 goBackOne(source);
3169                                 break;
3170                             }
3171                         } else {
3172                             break;
3173                         }
3174                     }
3175
3176                     if (nonZeroValReached == FALSE){
3177                         digIndx = 2;
3178                         numTempBuf[2] = 6;
3179                     }
3180
3181                     endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3182                     if (digIndx % 2 != 0){
3183                         /*
3184                         We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3185                         we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3186                         Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3187                         single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3188                         */
3189
3190                         for(i = 2; i < endIndex; i++){
3191                             numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3192                                 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3193                         }
3194                         --digIndx;
3195                     }
3196
3197                     // Subtract one off of the last byte.
3198                     numTempBuf[endIndex-1] -= 1;
3199
3200                     /*
3201                     We want to skip over the first two slots in the buffer. The first slot
3202                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3203                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3204                     */
3205                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3206                     numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3207
3208                     // Now transfer the collation key to our collIterate struct.
3209                     // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3210                     //size = ((endIndex+1) & ~1)/2;
3211                     CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3212                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3213                         UCOL_BYTE_COMMON; // Tertiary weight.
3214                     i = 2; // Reset the index into the buffer.
3215                     while(i < endIndex)
3216                     {
3217                         uint32_t primWeight = numTempBuf[i++] << 8;
3218                         if ( i < endIndex)
3219                             primWeight |= numTempBuf[i++];
3220                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3221                     }
3222
3223                 } else {
3224                     // no numeric mode, we'll just switch to whatever we stashed and continue
3225                     CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3226                     CE = *CEOffset++;
3227                     break;
3228                 }
3229                 return CE;
3230             }
3231             /* various implicits optimization */
3232         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3233             /* UCA is filled with these. Tailorings are NOT_FOUND */
3234             return getImplicit(cp, source);
3235         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3236             // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3237             return getImplicit(cp, source);
3238         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3239             {
3240                 static const uint32_t
3241                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3242                 //const uint32_t LCount = 19;
3243                 static const uint32_t VCount = 21;
3244                 static const uint32_t TCount = 28;
3245                 //const uint32_t NCount = VCount * TCount;   // 588
3246                 //const uint32_t SCount = LCount * NCount;   // 11172
3247                 uint32_t L = ch - SBase;
3248
3249                 // divide into pieces
3250
3251                 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3252                 L /= TCount;
3253                 uint32_t V = L % VCount;
3254                 L /= VCount;
3255
3256                 // offset them
3257
3258                 L += LBase;
3259                 V += VBase;
3260                 T += TBase;
3261
3262                 // return the first CE, but first put the rest into the expansion buffer
3263                 if (!source->coll->image->jamoSpecial) { // FAST PATH
3264
3265                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
3266                     if (T != TBase) {
3267                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
3268                     }
3269
3270                     return UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
3271
3272                 } else { // Jamo is Special
3273                     // Since Hanguls pass the FCD check, it is
3274                     // guaranteed that we won't be in
3275                     // the normalization buffer if something like this happens
3276
3277                     // However, if we are using a uchar iterator and normalization
3278                     // is ON, the Hangul that lead us here is going to be in that
3279                     // normalization buffer. Here we want to restore the uchar
3280                     // iterator state and pull out of the normalization buffer
3281                     if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3282                         source->flags = source->origFlags; // restore the iterator
3283                         source->pos = NULL;
3284                     }
3285
3286                     // Move Jamos into normalization buffer
3287                     UChar *buffer = source->writableBuffer.getBuffer(4);
3288                     int32_t bufferLength;
3289                     buffer[0] = (UChar)L;
3290                     buffer[1] = (UChar)V;
3291                     if (T != TBase) {
3292                         buffer[2] = (UChar)T;
3293                         bufferLength = 3;
3294                     } else {
3295                         bufferLength = 2;
3296                     }
3297                     source->writableBuffer.releaseBuffer(bufferLength);
3298
3299                     // Indicate where to continue in main input string after exhausting the writableBuffer
3300                     source->fcdPosition       = source->pos;
3301
3302                     source->pos   = source->writableBuffer.getTerminatedBuffer();
3303                     source->origFlags   = source->flags;
3304                     source->flags       |= UCOL_ITER_INNORMBUF;
3305                     source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3306
3307                     return(UCOL_IGNORABLE);
3308                 }
3309             }
3310         case SURROGATE_TAG:
3311             /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
3312             /* two things can happen here: next code point can be a trailing surrogate - we will use it */
3313             /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
3314             /* we treat it like an unassigned code point. */
3315             {
3316                 UChar trail;
3317                 collIterateState state;
3318                 backupState(source, &state);
3319                 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
3320                     // we could have stepped one char forward and it might have turned out that it
3321                     // was not a trail surrogate. In that case, we have to backup.
3322                     loadState(source, &state, TRUE);
3323                     return UCOL_NOT_FOUND;
3324                 } else {
3325                     /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
3326                     CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
3327                     if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
3328                         // We need to backup
3329                         loadState(source, &state, TRUE);
3330                         return CE;
3331                     }
3332                     // calculate the supplementary code point value, if surrogate was not tailored
3333                     cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3334                 }
3335             }
3336             break;
3337         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3338             UChar nextChar;
3339             if( source->flags & UCOL_USE_ITERATOR) {
3340                 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3341                     cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3342                     source->iterator->next(source->iterator);
3343                     return getImplicit(cp, source);
3344                 }
3345             } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3346                       U_IS_TRAIL((nextChar=*source->pos))) {
3347                 cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3348                 source->pos++;
3349                 return getImplicit(cp, source);
3350             }
3351             return UCOL_NOT_FOUND;
3352         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3353             return UCOL_NOT_FOUND; /* broken surrogate sequence */
3354         case CHARSET_TAG:
3355             /* not yet implemented */
3356             /* probably after 1.8 */
3357             return UCOL_NOT_FOUND;
3358         default:
3359             *status = U_INTERNAL_PROGRAM_ERROR;
3360             CE=0;
3361             break;
3362     }
3363     if (CE <= UCOL_NOT_FOUND) break;
3364   }
3365   return CE;
3366 }
3367
3368
3369 /* Implicit CE pair for cp: stores one CE in the CE buffer and returns the other; now uses Mark's getImplicitPrimary code */
3370 static
3371 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3372     const uint32_t implicitPrimary = uprv_uca_getImplicitPrimary(cp);
3373     // CE written to the buffer: the bits selected by UCOL_PRIMARYMASK, with 0x05 in each of the two low bytes.
3374     *collationSource->CEpos = (implicitPrimary & UCOL_PRIMARYMASK) | 0x0505;
3375     ++collationSource->CEpos;
3376     collationSource->toReturn = collationSource->CEpos;
3377     // **** doesn't work if using iterator ****
3378     if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
3379         // Offset is taken from the raw string position; it and the following offset are passed to appendOffset.
3380         int32_t startOffset = (int32_t)(collationSource->pos - collationSource->string);
3381         UErrorCode errorCode = U_ZERO_ERROR; // NOTE(review): errorCode is never inspected after the calls below
3382         collationSource->appendOffset(startOffset, errorCode);
3383         collationSource->appendOffset(startOffset + 1, errorCode);
3384
3385         collationSource->offsetReturn = collationSource->offsetStore - 1;
3386         collationSource->offsetBuffer[0] = startOffset;
3387         if (collationSource->offsetBuffer == collationSource->offsetReturn) {
3388             collationSource->offsetStore = collationSource->offsetBuffer;
3389         }
3390     } else {
3391         collationSource->offsetRepeatCount = 1;
3392     }
3393     // CE handed back to the caller: low 16 bits of the implicit primary moved into the top half, OR'ed with 0xC0.
3394     return ((implicitPrimary & 0x0000FFFF) << 16) | 0x000000C0;
3395 }
3396
3397 /**
3398  * This function handles the special CEs like contractions, expansions,
3399  * surrogates, Thai.
3400  * It is called by getPrevCE.
3401  */
3402 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3403                           collIterate *source,
3404                           UErrorCode *status)
3405 {
3406     const uint32_t *CEOffset    = NULL;
3407           UChar    *UCharOffset = NULL;
3408           UChar    schar;
3409     const UChar    *constart    = NULL;
3410           uint32_t size;
3411           UChar    buffer[UCOL_MAX_BUFFER];
3412           uint32_t *endCEBuffer;
3413           UChar   *strbuffer;
3414           int32_t noChars = 0;
3415           int32_t CECount = 0;
3416
3417     for(;;)
3418     {
3419         /* the only ces that loops are thai and contractions */
3420         switch (getCETag(CE))
3421         {
3422         case NOT_FOUND_TAG:  /* this tag always returns */
3423             return CE;
3424
3425         case SPEC_PROC_TAG:
3426             {
3427                 // Special processing is getting a CE that is preceded by a certain prefix
3428                 // Currently this is only needed for optimizing Japanese length and iteration marks.
3429                 // When we encounter a special processing tag, we go backwards and try to see if
3430                 // we have a match.
3431                 // Contraction tables are used - so the whole process is not unlike contraction.
3432                 // prefix data is stored backwards in the table.
3433                 const UChar *UCharOffset;
3434                 UChar schar, tchar;
3435                 collIterateState prefixState;
3436                 backupState(source, &prefixState);
3437                 for(;;) {
3438                     // This loop will run once per source string character, for as long as we
3439                     //  are matching a potential contraction sequence
3440
3441                     // First we position ourselves at the beginning of contraction sequence
3442                     const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3443
3444                     if (collIter_bos(source)) {
3445                         CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3446                         break;
3447                     }
3448                     schar = getPrevNormalizedChar(source, status);
3449                     goBackOne(source);
3450
3451                     while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3452                         UCharOffset++;
3453                     }
3454
3455                     if (schar == tchar) {
3456                         // Found the source string char in the table.
3457                         //  Pick up the corresponding CE from the table.
3458                         CE = *(coll->contractionCEs +
3459                             (UCharOffset - coll->contractionIndex));
3460                     }
3461                     else
3462                     {
3463                         // if there is a completely ignorable code point in the middle of
3464                         // a prefix, we need to act as if it's not there
3465                         // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3466                         // lone surrogates cannot be set to zero as it would break other processing
3467                         uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
3468                         // it's easy for BMP code points
3469                         if(isZeroCE == 0) {
3470                             continue;
3471                         } else if(U16_IS_SURROGATE(schar)) {
3472                             // for supplementary code points, we have to check the next one
3473                             // situations where we are going to ignore
3474                             // 1. beginning of the string: schar is a lone surrogate
3475                             // 2. schar is a lone surrogate
3476                             // 3. schar is a trail surrogate in a valid surrogate sequence
3477                             //    that is explicitly set to zero.
3478                             if (!collIter_bos(source)) {
3479                                 UChar lead;
3480                                 if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
3481                                     isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
3482                                     if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
3483                                         uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
3484                                         if(finalCE == 0) {
3485                                             // this is a real, assigned completely ignorable code point
3486                                             goBackOne(source);
3487                                             continue;
3488                                         }
3489                                     }
3490                                 } else {
3491                                     // lone surrogate, treat like unassigned
3492                                     return UCOL_NOT_FOUND;
3493                                 }
3494                             } else {
3495                                 // lone surrogate at the beginning, treat like unassigned
3496                                 return UCOL_NOT_FOUND;
3497                             }
3498                         }
3499                         // Source string char was not in the table.
3500                         //   We have not found the prefix.
3501                         CE = *(coll->contractionCEs +
3502                             (ContractionStart - coll->contractionIndex));
3503                     }
3504
3505                     if(!isPrefix(CE)) {
3506                         // The source string char was in the contraction table, and the corresponding
3507                         //   CE is not a prefix CE.  We found the prefix, break
3508                         //   out of loop, this CE will end up being returned.  This is the normal
3509                         //   way out of prefix handling when the source actually contained
3510                         //   the prefix.
3511                         break;
3512                     }
3513                 }
3514                 loadState(source, &prefixState, TRUE);
3515                 break;
3516             }
3517
3518         case CONTRACTION_TAG: {
3519             /* to ensure that the backwards and forwards iteration matches, we
3520             take the current region of most possible match and pass it through
3521             the forward iteration. this will ensure that the obstinate problem of
3522             overlapping contractions will not occur.
3523             */
3524             schar = peekCodeUnit(source, 0);
3525             constart = (UChar *)coll->image + getContractOffset(CE);
3526             if (isAtStartPrevIterate(source)
3527                 /* commented away contraction end checks after adding the checks
3528                 in getPrevCE  */) {
3529                     /* start of string or this is not the end of any contraction */
3530                     CE = *(coll->contractionCEs +
3531                         (constart - coll->contractionIndex));
3532                     break;
3533             }
3534             strbuffer = buffer;
3535             UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3536             *(UCharOffset --) = 0;
3537             noChars = 0;
3538             // have to swap thai characters
3539             while (ucol_unsafeCP(schar, coll)) {
3540                 *(UCharOffset) = schar;
3541                 noChars++;
3542                 UCharOffset --;
3543                 schar = getPrevNormalizedChar(source, status);
3544                 goBackOne(source);
3545                 // TODO: when we exhaust the contraction buffer,
3546                 // it needs to get reallocated. The problem is
3547                 // that the size depends on the string which is
3548                 // not iterated over. However, since we're travelling
3549                 // backwards, we already had to set the iterator at
3550                 // the end - so we might as well know where we are?
3551                 if (UCharOffset + 1 == buffer) {
3552                     /* we have exhausted the buffer */
3553                     int32_t newsize = 0;
3554                     if(source->pos) { // actually dealing with a position
3555                         newsize = (int32_t)(source->pos - source->string + 1);
3556                     } else { // iterator
3557                         newsize = 4 * UCOL_MAX_BUFFER;
3558                     }
3559                     strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3560                         (newsize + UCOL_MAX_BUFFER));
3561                     /* test for NULL */
3562                     if (strbuffer == NULL) {
3563                         *status = U_MEMORY_ALLOCATION_ERROR;
3564                         return UCOL_NO_MORE_CES;
3565                     }
3566                     UCharOffset = strbuffer + newsize;
3567                     uprv_memcpy(UCharOffset, buffer,
3568                         UCOL_MAX_BUFFER * sizeof(UChar));
3569                     UCharOffset --;
3570                 }
3571                 if ((source->pos && (source->pos == source->string ||
3572                     ((source->flags & UCOL_ITER_INNORMBUF) &&
3573                     *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3574                     || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3575                         break;
3576                 }
3577             }
3578             /* adds the initial base character to the string */
3579             *(UCharOffset) = schar;
3580             noChars++;
3581
3582             int32_t offsetBias;
3583
3584             // **** doesn't work if using iterator ****
3585             if (source->flags & UCOL_ITER_INNORMBUF) {
3586                 offsetBias = -1;
3587             } else {
3588                 offsetBias = (int32_t)(source->pos - source->string);
3589             }
3590
3591             /* a new collIterate is used to simplify things, since using the current
3592             collIterate will mean that the forward and backwards iteration will
3593             share and change the same buffers. we don't want to get into that. */
3594             collIterate temp;
3595             int32_t rawOffset;
3596
3597             IInit_collIterate(coll, UCharOffset, noChars, &temp, status);
3598             if(U_FAILURE(*status)) {
3599                 return (uint32_t)UCOL_NULLORDER;
3600             }
3601             temp.flags &= ~UCOL_ITER_NORM;
3602             temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT;
3603
3604             rawOffset = (int32_t)(temp.pos - temp.string); // should always be zero?
3605             CE = ucol_IGetNextCE(coll, &temp, status);
3606
3607             if (source->extendCEs) {
3608                 endCEBuffer = source->extendCEs + source->extendCEsSize;
3609                 CECount = (int32_t)((source->CEpos - source->extendCEs)/sizeof(uint32_t));
3610             } else {
3611                 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3612                 CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
3613             }
3614
3615             while (CE != UCOL_NO_MORE_CES) {
3616                 *(source->CEpos ++) = CE;
3617
3618                 if (offsetBias >= 0) {
3619                     source->appendOffset(rawOffset + offsetBias, *status);
3620                 }
3621
3622                 CECount++;
3623                 if (source->CEpos == endCEBuffer) {
3624                     /* ran out of CE space, reallocate to new buffer.
3625                     If reallocation fails, reset pointers and bail out,
3626                     there's no guarantee of the right character position after
3627                     this bail*/
3628                     if (!increaseCEsCapacity(source)) {
3629                         *status = U_MEMORY_ALLOCATION_ERROR;
3630                         break;
3631                     }
3632
3633                     endCEBuffer = source->extendCEs + source->extendCEsSize;
3634                 }
3635
3636                 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
3637                     rawOffset = (int32_t)(temp.fcdPosition - temp.string);
3638                 } else {
3639                     rawOffset = (int32_t)(temp.pos - temp.string);
3640                 }
3641
3642                 CE = ucol_IGetNextCE(coll, &temp, status);
3643             }
3644
3645             if (strbuffer != buffer) {
3646                 uprv_free(strbuffer);
3647             }
3648             if (U_FAILURE(*status)) {
3649                 return (uint32_t)UCOL_NULLORDER;
3650             }
3651
3652             if (source->offsetRepeatValue != 0) {
3653                 if (CECount > noChars) {
3654                     source->offsetRepeatCount += temp.offsetRepeatCount;
3655                 } else {
3656                     // **** does this really skip the right offsets? ****
3657                     source->offsetReturn -= (noChars - CECount);
3658                 }
3659             }
3660
3661             if (offsetBias >= 0) {
3662                 source->offsetReturn = source->offsetStore - 1;
3663                 if (source->offsetReturn == source->offsetBuffer) {
3664                     source->offsetStore = source->offsetBuffer;
3665                 }
3666             }
3667
3668             source->toReturn = source->CEpos - 1;
3669             if (source->toReturn == source->CEs) {
3670                 source->CEpos = source->CEs;
3671             }
3672
3673             return *(source->toReturn);
3674         }
3675         case LONG_PRIMARY_TAG:
3676             {
3677                 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3678                 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3679                 source->toReturn = source->CEpos - 1;
3680
3681                 if (source->flags & UCOL_ITER_INNORMBUF) {
3682                     source->offsetRepeatCount = 1;
3683                 } else {
3684                     int32_t firstOffset = (int32_t)(source->pos - source->string);
3685
3686                     source->appendOffset(firstOffset, *status);
3687                     source->appendOffset(firstOffset + 1, *status);
3688
3689                     source->offsetReturn = source->offsetStore - 1;
3690                     *(source->offsetBuffer) = firstOffset;
3691                     if (source->offsetReturn == source->offsetBuffer) {
3692                         source->offsetStore = source->offsetBuffer;
3693                     }
3694                 }
3695
3696
3697                 return *(source->toReturn);
3698             }
3699
3700         case EXPANSION_TAG: /* this tag always returns */
3701             {
3702             /*
3703             This should handle expansion.
3704             NOTE: we can encounter both continuations and expansions in an expansion!
3705             I have to decide where continuations are going to be dealt with
3706             */
3707             int32_t firstOffset = (int32_t)(source->pos - source->string);
3708
3709             // **** doesn't work if using iterator ****
3710             if (source->offsetReturn != NULL) {
3711                 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) {
3712                     source->offsetStore = source->offsetBuffer;
3713                 }else {
3714                   firstOffset = -1;
3715                 }
3716             }
3717
3718             /* find the offset to expansion table */
3719             CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3720             size     = getExpansionCount(CE);
3721             if (size != 0) {
3722                 /*
3723                 if there are less than 16 elements in expansion, we don't terminate
3724                 */
3725                 uint32_t count;
3726
3727                 for (count = 0; count < size; count++) {
3728                     *(source->CEpos ++) = *CEOffset++;
3729
3730                     if (firstOffset >= 0) {
3731                         source->appendOffset(firstOffset + 1, *status);
3732                     }
3733                 }
3734             } else {
3735                 /* else, we do */
3736                 while (*CEOffset != 0) {
3737                     *(source->CEpos ++) = *CEOffset ++;
3738
3739                     if (firstOffset >= 0) {
3740                         source->appendOffset(firstOffset + 1, *status);
3741                     }
3742                 }
3743             }
3744
3745             if (firstOffset >= 0) {
3746                 source->offsetReturn = source->offsetStore - 1;
3747                 *(source->offsetBuffer) = firstOffset;
3748                 if (source->offsetReturn == source->offsetBuffer) {
3749                     source->offsetStore = source->offsetBuffer;
3750                 }
3751             } else {
3752                 source->offsetRepeatCount += size - 1;
3753             }
3754
3755             source->toReturn = source->CEpos - 1;
3756             // in case of one element expansion, we
3757             // want to immediately return CEpos
3758             if(source->toReturn == source->CEs) {
3759                 source->CEpos = source->CEs;
3760             }
3761
3762             return *(source->toReturn);
3763             }
3764
3765         case DIGIT_TAG:
3766             {
3767                 /*
3768                 We do a check to see if we want to collate digits as numbers; if so we generate
3769                 a custom collation key. Otherwise we pull out the value stored in the expansion table.
3770                 */
3771                 uint32_t i;    /* general counter */
3772
3773                 if (source->coll->numericCollation == UCOL_ON){
3774                     uint32_t digIndx = 0;
3775                     uint32_t endIndex = 0;
3776                     uint32_t leadingZeroIndex = 0;
3777                     uint32_t trailingZeroCount = 0;
3778
3779                     uint8_t collateVal = 0;
3780
3781                     UBool nonZeroValReached = FALSE;
3782
3783                     uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs.
3784                     /*
3785                     We parse the source string until we hit a char that's NOT a digit.
3786                     Use this u_charDigitValue. This might be slow because we have to
3787                     handle surrogates...
3788                     */
3789                     /*
3790                     We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less,
3791                     with any chunks smaller than that being on the right end of the digit string - i.e. the first collation
3792                     element we process when going backward. To determine how long that chunk might be, we may need to make
3793                     two passes through the loop that collects digits - one to see how long the string is (and how much is
3794                     leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has
3795                     more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation
3796                     element chunk after resetting the state to the initialState at the right side of the digit string.
3797                     */
3798                     uint32_t ceLimit = 0;
3799                     UChar initial_ch = ch;
3800                     collIterateState initialState = {0,0,0,0,0,0,0,0,0};
3801                     backupState(source, &initialState);
3802
3803                     for(;;) {
3804                         collIterateState state = {0,0,0,0,0,0,0,0,0};
3805                         UChar32 char32 = 0;
3806                         int32_t digVal = 0;
3807
3808                         if (U16_IS_TRAIL (ch)) {
3809                             if (!collIter_bos(source)){
3810                                 UChar lead = getPrevNormalizedChar(source, status);
3811                                 if(U16_IS_LEAD(lead)) {
3812                                     char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3813                                     goBackOne(source);
3814                                 } else {
3815                                     char32 = ch;
3816                                 }
3817                             } else {
3818                                 char32 = ch;
3819                             }
3820                         } else {
3821                             char32 = ch;
3822                         }
3823                         digVal = u_charDigitValue(char32);
3824
3825                         for(;;) {
3826                             // Make sure we have enough space. No longer needed;
3827                             // at this point the largest value of digIndx when we need to save data in numTempBuf
3828                             // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure
3829                             // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2).
3830
3831                             // Skip over trailing zeroes, and keep a count of them.
3832                             if (digVal != 0)
3833                                 nonZeroValReached = TRUE;
3834
3835                             if (nonZeroValReached) {
3836                                 /*
3837                                 We parse the digit string into base 100 numbers (this fits into a byte).
3838                                 We only add to the buffer in twos, thus if we are parsing an odd character,
3839                                 that serves as the 'tens' digit while the if we are parsing an even one, that
3840                                 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3841                                 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3842                                 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3843                                 than all the other bytes.
3844
3845                                 Since we're doing in this reverse we want to put the first digit encountered into the
3846                                 ones place and the second digit encountered into the tens place.
3847                                 */
3848
3849                                 if ((digIndx + trailingZeroCount) % 2 == 1) {
3850                                     // High-order digit case (tens place)
3851                                     collateVal += (uint8_t)(digVal * 10);
3852
3853                                     // We cannot set leadingZeroIndex unless it has been set for the
3854                                     // low-order digit. Therefore, all we can do for the high-order
3855                                     // digit is turn it off, never on.
3856                                     // The only time we will have a high digit without a low is for
3857                                     // the very first non-zero digit, so no zero check is necessary.
3858                                     if (collateVal != 0)
3859                                         leadingZeroIndex = 0;
3860
3861                                     // The first pass through, digIndx may exceed the limit, but in that case
3862                                     // we no longer care about numTempBuf contents since they will be discarded
3863                                     if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) {
3864                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3865                                     }
3866                                     collateVal = 0;
3867                                 } else {
3868                                     // Low-order digit case (ones place)
3869                                     collateVal = (uint8_t)digVal;
3870
3871                                     // Check for leading zeroes.
3872                                     if (collateVal == 0) {
3873                                         if (!leadingZeroIndex)
3874                                             leadingZeroIndex = (digIndx/2) + 2;
3875                                     } else
3876                                         leadingZeroIndex = 0;
3877
3878                                     // No need to write to buffer; the case of a last odd digit
3879                                     // is handled below.
3880                                 }
3881                                 ++digIndx;
3882                             } else
3883                                 ++trailingZeroCount;
3884
3885                             if (!collIter_bos(source)) {
3886                                 ch = getPrevNormalizedChar(source, status);
3887                                 //goBackOne(source);
3888                                 if (U16_IS_TRAIL(ch)) {
3889                                     backupState(source, &state);
3890                                     if (!collIter_bos(source)) {
3891                                         goBackOne(source);
3892                                         UChar lead = getPrevNormalizedChar(source, status);
3893
3894                                         if(U16_IS_LEAD(lead)) {
3895                                             char32 = U16_GET_SUPPLEMENTARY(lead,ch);
3896                                         } else {
3897                                             loadState(source, &state, FALSE);
3898                                             char32 = ch;
3899                                         }
3900                                     }
3901                                 } else
3902                                     char32 = ch;
3903
3904                                 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) {
3905                                     if (char32 > 0xFFFF) {// For surrogates.
3906                                         loadState(source, &state, FALSE);
3907                                     }
3908                                     // Don't need to "reverse" the goBackOne call,
3909                                     // as this points to the next position to process..
3910                                     //if (char32 > 0xFFFF) // For surrogates.
3911                                     //getNextNormalizedChar(source);
3912                                     break;
3913                                 }
3914
3915                                 goBackOne(source);
3916                             }else
3917                                 break;
3918                         }
3919
3920                         if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) {
3921                             // our collation element is not too big, go ahead and finish with it
3922                             break;
3923                         }
3924                         // our digit string is too long for a collation element;
3925                         // set the limit for it, reset the state and begin again
3926                         ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER;
3927                         if ( ceLimit == 0 ) {
3928                             ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER;
3929                         }
3930                         ch = initial_ch;
3931                         loadState(source, &initialState, FALSE);
3932                         digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0;
3933                         collateVal = 0;
3934                         nonZeroValReached = FALSE;
3935                     }
3936
3937                     if (! nonZeroValReached) {
3938                         digIndx = 2;
3939                         trailingZeroCount = 0;
3940                         numTempBuf[2] = 6;
3941                     }
3942
3943                     if ((digIndx + trailingZeroCount) % 2 != 0) {
3944                         numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3945                         digIndx += 1;       // The implicit leading zero
3946                     }
3947                     if (trailingZeroCount % 2 != 0) {
3948                         // We had to consume one trailing zero for the low digit
3949                         // of the least significant byte
3950                         digIndx += 1;       // The trailing zero not in the exponent
3951                         trailingZeroCount -= 1;
3952                     }
3953
3954                     endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3955
3956                     // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3957                     numTempBuf[2] -= 1;
3958
3959                     /*
3960                     We want to skip over the first two slots in the buffer. The first slot
3961                     is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3962                     sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3963                     The exponent must be adjusted by the number of leading zeroes, and the number of
3964                     trailing zeroes.
3965                     */
3966                     numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3967                     uint32_t exponent = (digIndx+trailingZeroCount)/2;
3968                     if (leadingZeroIndex)
3969                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3970                     numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
3971
3972                     // Now transfer the collation key to our collIterate struct.
3973                     // The total size for our collation key is half of endIndex, rounded up.
3974                     int32_t size = (endIndex+1)/2;
3975                     if(!ensureCEsCapacity(source, size)) {
3976                         return (uint32_t)UCOL_NULLORDER;
3977                     }
3978                     *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3979                         (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3980                         UCOL_BYTE_COMMON; // Tertiary weight.
3981                     i = endIndex - 1; // Reset the index into the buffer.
3982                     while(i >= 2) {
3983                         uint32_t primWeight = numTempBuf[i--] << 8;
3984                         if ( i >= 2)
3985                             primWeight |= numTempBuf[i--];
3986                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3987                     }
3988
3989                     source->toReturn = source->CEpos -1;
3990                     return *(source->toReturn);
3991                 } else {
3992                     CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3993                     CE = *(CEOffset++);
3994                     break;
3995                 }
3996             }
3997
3998         case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3999             {
4000                 static const uint32_t
4001                     SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4002                 //const uint32_t LCount = 19;
4003                 static const uint32_t VCount = 21;
4004                 static const uint32_t TCount = 28;
4005                 //const uint32_t NCount = VCount * TCount;   /* 588 */
4006                 //const uint32_t SCount = LCount * NCount;   /* 11172 */
4007
4008                 uint32_t L = ch - SBase;
4009                 /*
4010                 divide into pieces.
4011                 we do it in this order since some compilers can do % and / in one
4012                 operation
4013                 */
4014                 uint32_t T = L % TCount;
4015                 L /= TCount;
4016                 uint32_t V = L % VCount;
4017                 L /= VCount;
4018
4019                 /* offset them */
4020                 L += LBase;
4021                 V += VBase;
4022                 T += TBase;
4023
4024                 int32_t firstOffset = (int32_t)(source->pos - source->string);
4025                 source->appendOffset(firstOffset, *status);
4026
4027                 /*
4028                  * return the first CE, but first put the rest into the expansion buffer
4029                  */
4030                 if (!source->coll->image->jamoSpecial) {
4031                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
4032                     *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
4033                     source->appendOffset(firstOffset + 1, *status);
4034
4035                     if (T != TBase) {
4036                         *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
4037                         source->appendOffset(firstOffset + 1, *status);
4038                     }
4039
4040                     source->toReturn = source->CEpos - 1;
4041
4042                     source->offsetReturn = source->offsetStore - 1;
4043                     if (source->offsetReturn == source->offsetBuffer) {
4044                         source->offsetStore = source->offsetBuffer;
4045                     }
4046
4047                     return *(source->toReturn);
4048                 } else {
4049                     // Since Hanguls pass the FCD check, it is
4050                     // guaranteed that we won't be in
4051                     // the normalization buffer if something like this happens
4052
4053                     // Move Jamos into normalization buffer
4054                     UChar *tempbuffer = source->writableBuffer.getBuffer(5);
4055                     int32_t tempbufferLength, jamoOffset;
4056                     tempbuffer[0] = 0;
4057                     tempbuffer[1] = (UChar)L;
4058                     tempbuffer[2] = (UChar)V;
4059                     if (T != TBase) {
4060                         tempbuffer[3] = (UChar)T;
4061                         tempbufferLength = 4;
4062                     } else {
4063                         tempbufferLength = 3;
4064                     }
4065                     source->writableBuffer.releaseBuffer(tempbufferLength);
4066
4067                     // Indicate where to continue in main input string after exhausting the writableBuffer
4068                     if (source->pos  == source->string) {
4069                         jamoOffset = 0;
4070                         source->fcdPosition = NULL;
4071                     } else {
4072                         jamoOffset = source->pos - source->string;
4073                         source->fcdPosition       = source->pos-1;
4074                     }
4075
4076                     // Append offsets for the additional chars
4077                     // (not the 0, and not the L whose offsets match the original Hangul)
4078                     int32_t jamoRemaining = tempbufferLength - 2;
4079                     jamoOffset++; // appended offsets should match end of original Hangul
4080                     while (jamoRemaining-- > 0) {
4081                         source->appendOffset(jamoOffset, *status);
4082                     }
4083
4084                     source->offsetRepeatValue = jamoOffset;
4085
4086                     source->offsetReturn = source->offsetStore - 1;
4087                     if (source->offsetReturn == source->offsetBuffer) {
4088                         source->offsetStore = source->offsetBuffer;
4089                     }
4090
4091                     source->pos               = source->writableBuffer.getTerminatedBuffer() + tempbufferLength;
4092                     source->origFlags         = source->flags;
4093                     source->flags            |= UCOL_ITER_INNORMBUF;
4094                     source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4095
4096                     return(UCOL_IGNORABLE);
4097                 }
4098             }
4099
4100         case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4101             return getPrevImplicit(ch, source);
4102
4103             // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4104         case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4105             return getPrevImplicit(ch, source);
4106
4107         case SURROGATE_TAG:  /* This is a surrogate pair */
4108             /* essentially an engaged lead surrogate. */
4109             /* if you have encountered it here, it means that a */
4110             /* broken sequence was encountered and this is an error */
4111             return UCOL_NOT_FOUND;
4112
4113         case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4114             return UCOL_NOT_FOUND; /* broken surrogate sequence */
4115
4116         case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4117             {
4118                 UChar32 cp = 0;
4119                 UChar  prevChar;
4120                 const UChar *prev;
4121                 if (isAtStartPrevIterate(source)) {
4122                     /* we are at the start of the string, wrong place to be at */
4123                     return UCOL_NOT_FOUND;
4124                 }
4125                 if (source->pos != source->writableBuffer.getBuffer()) {
4126                     prev     = source->pos - 1;
4127                 } else {
4128                     prev     = source->fcdPosition;
4129                 }
4130                 prevChar = *prev;
4131
4132                 /* Handles Han and Supplementary characters here.*/
4133                 if (U16_IS_LEAD(prevChar)) {
4134                     cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4135                     source->pos = prev;
4136                 } else {
4137                     return UCOL_NOT_FOUND; /* like unassigned */
4138                 }
4139
4140                 return getPrevImplicit(cp, source);
4141             }
4142
4143             /* UCA is filled with these. Tailorings are NOT_FOUND */
4144             /* not yet implemented */
4145         case CHARSET_TAG:  /* this tag always returns */
4146             /* probably after 1.8 */
4147             return UCOL_NOT_FOUND;
4148
4149         default:           /* this tag always returns */
4150             *status = U_INTERNAL_PROGRAM_ERROR;
4151             CE=0;
4152             break;
4153         }
4154
4155         if (CE <= UCOL_NOT_FOUND) {
4156             break;
4157         }
4158     }
4159
4160     return CE;
4161 }
4162
/* Reverses parts of a buffer in place. We need this operation when doing continuation */
/* secondaries in French. The function version below is kept only for reference;      */
/* it has been replaced by the macro of the same name that follows it.                 */
4166 /*
4167 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4168   uint8_t temp;
4169   while(start<end) {
4170     temp = *start;
4171     *start++ = *end;
4172     *end-- = temp;
4173   }
4174 }
4175 */
4176
/*
 * Reverses, in place, the elements of type TYPE lying between the pointers
 * start and end (both inclusive).
 *
 * start and end must be modifiable pointer lvalues: the macro walks them
 * toward each other and leaves them where they met or crossed. Both
 * arguments are evaluated several times, so do not pass expressions with
 * side effects. Nothing happens when start >= end on entry.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
    for(; (start)<(end); ++(start), --(end)) { \
        TYPE tempA = *(start); \
        *(start) = *(end); \
        *(end) = tempA; \
    } \
}
4185
4186 /****************************************************************************/
4187 /* Following are the sortkey generation functions                           */
4188 /*                                                                          */
4189 /****************************************************************************/
4190
4191 U_CAPI int32_t U_EXPORT2
4192 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4193                    const uint8_t *src2, int32_t src2Length,
4194                    uint8_t *dest, int32_t destCapacity) {
4195     /* check arguments */
4196     if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4197         src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4198         destCapacity<0 || (destCapacity>0 && dest==NULL)
4199     ) {
4200         /* error, attempt to write a zero byte and return 0 */
4201         if(dest!=NULL && destCapacity>0) {
4202             *dest=0;
4203         }
4204         return 0;
4205     }
4206
4207     /* check lengths and capacity */
4208     if(src1Length<0) {
4209         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4210     }
4211     if(src2Length<0) {
4212         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4213     }
4214
4215     int32_t destLength=src1Length+src2Length;
4216     if(destLength>destCapacity) {
4217         /* the merged sort key does not fit into the destination */
4218         return destLength;
4219     }
4220
4221     /* merge the sort keys with the same number of levels */
4222     uint8_t *p=dest;
4223     for(;;) {
4224         /* copy level from src1 not including 00 or 01 */
4225         uint8_t b;
4226         while((b=*src1)>=2) {
4227             ++src1;
4228             *p++=b;
4229         }
4230
4231         /* add a 02 merge separator */
4232         *p++=2;
4233
4234         /* copy level from src2 not including 00 or 01 */
4235         while((b=*src2)>=2) {
4236             ++src2;
4237             *p++=b;
4238         }
4239
4240         /* if both sort keys have another level, then add a 01 level separator and continue */
4241         if(*src1==1 && *src2==1) {
4242             ++src1;
4243             ++src2;
4244             *p++=1;
4245         } else {
4246             break;
4247         }
4248     }
4249
4250     /*
4251      * here, at least one sort key is finished now, but the other one
4252      * might have some contents left from containing more levels;
4253      * that contents is just appended to the result
4254      */
4255     if(*src1!=0) {
4256         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4257         src2=src1;
4258     }
4259     /* append src2, "the other, unfinished sort key" */
4260     while((*p++=*src2++)!=0) {}
4261
4262     /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */
4263     return (int32_t)(p-dest);
4264 }
4265
4266 U_NAMESPACE_BEGIN
4267
4268 class SortKeyByteSink : public ByteSink {
4269 public:
4270     SortKeyByteSink(char *dest, int32_t destCapacity)
4271             : buffer_(dest), capacity_(destCapacity),
4272               appended_(0) {
4273         if (buffer_ == NULL) {
4274             capacity_ = 0;
4275         } else if(capacity_ < 0) {
4276             buffer_ = NULL;
4277             capacity_ = 0;
4278         }
4279     }
4280     virtual ~SortKeyByteSink();
4281
4282     virtual void Append(const char *bytes, int32_t n);
4283     void Append(uint32_t b) {
4284         if (appended_ < capacity_ || Resize(1, appended_)) {
4285             buffer_[appended_] = (char)b;
4286         }
4287         ++appended_;
4288     }
4289     void Append(uint32_t b1, uint32_t b2) {
4290         int32_t a2 = appended_ + 2;
4291         if (a2 <= capacity_ || Resize(2, appended_)) {
4292             buffer_[appended_] = (char)b1;
4293             buffer_[appended_ + 1] = (char)b2;
4294         } else if(appended_ < capacity_) {
4295             buffer_[appended_] = (char)b1;
4296         }
4297         appended_ = a2;
4298     }
4299     virtual char *GetAppendBuffer(int32_t min_capacity,
4300                                   int32_t desired_capacity_hint,
4301                                   char *scratch, int32_t scratch_capacity,
4302                                   int32_t *result_capacity);
4303     int32_t NumberOfBytesAppended() const { return appended_; }
4304     /** @return FALSE if memory allocation failed */
4305     UBool IsOk() const { return buffer_ != NULL; }
4306
4307 protected:
4308     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) = 0;
4309     virtual UBool Resize(int32_t appendCapacity, int32_t length) = 0;
4310
4311     void SetNotOk() {
4312         buffer_ = NULL;
4313         capacity_ = 0;
4314     }
4315
4316     char *buffer_;
4317     int32_t capacity_;
4318     int32_t appended_;
4319
4320 private:
4321     SortKeyByteSink(const SortKeyByteSink &); // copy constructor not implemented
4322     SortKeyByteSink &operator=(const SortKeyByteSink &); // assignment operator not implemented
4323 };
4324
4325 SortKeyByteSink::~SortKeyByteSink() {}
4326
4327 void
4328 SortKeyByteSink::Append(const char *bytes, int32_t n) {
4329     if (n <= 0 || bytes == NULL) {
4330         return;
4331     }
4332     int32_t length = appended_;
4333     appended_ += n;
4334     if ((buffer_ + length) == bytes) {
4335         return;  // the caller used GetAppendBuffer() and wrote the bytes already
4336     }
4337     int32_t available = capacity_ - length;
4338     if (n <= available) {
4339         uprv_memcpy(buffer_ + length, bytes, n);
4340     } else {
4341         AppendBeyondCapacity(bytes, n, length);
4342     }
4343 }
4344
4345 char *
4346 SortKeyByteSink::GetAppendBuffer(int32_t min_capacity,
4347                                  int32_t desired_capacity_hint,
4348                                  char *scratch,
4349                                  int32_t scratch_capacity,
4350                                  int32_t *result_capacity) {
4351     if (min_capacity < 1 || scratch_capacity < min_capacity) {
4352         *result_capacity = 0;
4353         return NULL;
4354     }
4355     int32_t available = capacity_ - appended_;
4356     if (available >= min_capacity) {
4357         *result_capacity = available;
4358         return buffer_ + appended_;
4359     } else if (Resize(desired_capacity_hint, appended_)) {
4360         *result_capacity = capacity_ - appended_;
4361         return buffer_ + appended_;
4362     } else {
4363         *result_capacity = scratch_capacity;
4364         return scratch;
4365     }
4366 }
4367
4368 class FixedSortKeyByteSink : public SortKeyByteSink {
4369 public:
4370     FixedSortKeyByteSink(char *dest, int32_t destCapacity)
4371             : SortKeyByteSink(dest, destCapacity) {}
4372     virtual ~FixedSortKeyByteSink();
4373
4374 private:
4375     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4376     virtual UBool Resize(int32_t appendCapacity, int32_t length);
4377 };
4378
4379 FixedSortKeyByteSink::~FixedSortKeyByteSink() {}
4380
4381 void
4382 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
4383     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4384     // Fill the buffer completely.
4385     int32_t available = capacity_ - length;
4386     if (available > 0) {
4387         uprv_memcpy(buffer_ + length, bytes, available);
4388     }
4389 }
4390
4391 UBool
4392 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
4393     return FALSE;
4394 }
4395
4396 class CollationKeyByteSink : public SortKeyByteSink {
4397 public:
4398     CollationKeyByteSink(CollationKey &key)
4399             : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
4400               key_(key) {}
4401     virtual ~CollationKeyByteSink();
4402
4403 private:
4404     virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
4405     virtual UBool Resize(int32_t appendCapacity, int32_t length);
4406
4407     CollationKey &key_;
4408 };
4409
4410 CollationKeyByteSink::~CollationKeyByteSink() {}
4411
4412 void
4413 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
4414     // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
4415     if (Resize(n, length)) {
4416         uprv_memcpy(buffer_ + length, bytes, n);
4417     }
4418 }
4419
4420 UBool
4421 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
4422     if (buffer_ == NULL) {
4423         return FALSE;  // allocation failed before already
4424     }
4425     int32_t newCapacity = 2 * capacity_;
4426     int32_t altCapacity = length + 2 * appendCapacity;
4427     if (newCapacity < altCapacity) {
4428         newCapacity = altCapacity;
4429     }
4430     if (newCapacity < 200) {
4431         newCapacity = 200;
4432     }
4433     uint8_t *newBuffer = key_.reallocate(newCapacity, length);
4434     if (newBuffer == NULL) {
4435         SetNotOk();
4436         return FALSE;
4437     }
4438     buffer_ = reinterpret_cast<char *>(newBuffer);
4439     capacity_ = newCapacity;
4440     return TRUE;
4441 }
4442
4443 /**
4444  * uint8_t byte buffer, similar to CharString but simpler.
4445  */
4446 class SortKeyLevel : public UMemory {
4447 public:
4448     SortKeyLevel() : len(0), ok(TRUE) {}
4449     ~SortKeyLevel() {}
4450
4451     /** @return FALSE if memory allocation failed */
4452     UBool isOk() const { return ok; }
4453     UBool isEmpty() const { return len == 0; }
4454     int32_t length() const { return len; }
4455     const uint8_t *data() const { return buffer.getAlias(); }
4456     uint8_t operator[](int32_t index) const { return buffer[index]; }
4457
4458     void appendByte(uint32_t b);
4459
4460     void appendTo(ByteSink &sink) const {
4461         sink.Append(reinterpret_cast<const char *>(buffer.getAlias()), len);
4462     }
4463
4464     uint8_t &lastByte() {
4465         U_ASSERT(len > 0);
4466         return buffer[len - 1];
4467     }
4468
4469     uint8_t *getLastFewBytes(int32_t n) {
4470         if (ok && len >= n) {
4471             return buffer.getAlias() + len - n;
4472         } else {
4473             return NULL;
4474         }
4475     }
4476
4477 private:
4478     MaybeStackArray<uint8_t, 40> buffer;
4479     int32_t len;
4480     UBool ok;
4481
4482     UBool ensureCapacity(int32_t appendCapacity);
4483
4484     SortKeyLevel(const SortKeyLevel &other); // forbid copying of this class
4485     SortKeyLevel &operator=(const SortKeyLevel &other); // forbid copying of this class
4486 };
4487
4488 void SortKeyLevel::appendByte(uint32_t b) {
4489     if(len < buffer.getCapacity() || ensureCapacity(1)) {
4490         buffer[len++] = (uint8_t)b;
4491     }
4492 }
4493
4494 UBool SortKeyLevel::ensureCapacity(int32_t appendCapacity) {
4495     if(!ok) {
4496         return FALSE;
4497     }
4498     int32_t newCapacity = 2 * buffer.getCapacity();
4499     int32_t altCapacity = len + 2 * appendCapacity;
4500     if (newCapacity < altCapacity) {
4501         newCapacity = altCapacity;
4502     }
4503     if (newCapacity < 200) {
4504         newCapacity = 200;
4505     }
4506     if(buffer.resize(newCapacity, len)==NULL) {
4507         return ok = FALSE;
4508     }
4509     return TRUE;
4510 }
4511
4512 U_NAMESPACE_END
4513
4514 /* sortkey API */
4515 U_CAPI int32_t U_EXPORT2
4516 ucol_getSortKey(const    UCollator    *coll,
4517         const    UChar        *source,
4518         int32_t        sourceLength,
4519         uint8_t        *result,
4520         int32_t        resultLength)
4521 {
4522     UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4523     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4524         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source,
4525             ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength));
4526     }
4527
4528     if(coll->delegate != NULL) {
4529       return ((const Collator*)coll->delegate)->getSortKey(source, sourceLength, result, resultLength);
4530     }
4531
4532     UErrorCode status = U_ZERO_ERROR;
4533     int32_t keySize   = 0;
4534
4535     if(source != NULL) {
4536         // source == NULL is actually an error situation, but we would need to
4537         // have an error code to return it. Until we introduce a new
4538         // API, it stays like this
4539
4540         /* this uses the function pointer that is set in updateinternalstate */
4541         /* currently, there are two funcs: */
4542         /*ucol_calcSortKey(...);*/
4543         /*ucol_calcSortKeySimpleTertiary(...);*/
4544
4545         uint8_t noDest[1] = { 0 };
4546         if(result == NULL) {
4547             // Distinguish pure preflighting from an allocation error.
4548             result = noDest;
4549             resultLength = 0;
4550         }
4551         FixedSortKeyByteSink sink(reinterpret_cast<char *>(result), resultLength);
4552         coll->sortKeyGen(coll, source, sourceLength, sink, &status);
4553         if(U_SUCCESS(status)) {
4554             keySize = sink.NumberOfBytesAppended();
4555         }
4556     }
4557     UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4558     UTRACE_EXIT_STATUS(status);
4559     return keySize;
4560 }
4561
4562 U_CFUNC int32_t
4563 ucol_getCollationKey(const UCollator *coll,
4564                      const UChar *source, int32_t sourceLength,
4565                      CollationKey &key,
4566                      UErrorCode &errorCode) {
4567     CollationKeyByteSink sink(key);
4568     coll->sortKeyGen(coll, source, sourceLength, sink, &errorCode);
4569     return sink.NumberOfBytesAppended();
4570 }
4571
4572 // Is this primary weight compressible?
4573 // Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
4574 // TODO: This should use per-lead-byte flags from FractionalUCA.txt.
4575 static inline UBool
4576 isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
4577     return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
4578 }
4579
4580 static
4581 inline void doCaseShift(SortKeyLevel &cases, uint32_t &caseShift) {
4582     if (caseShift  == 0) {
4583         cases.appendByte(UCOL_CASE_BYTE_START);
4584         caseShift = UCOL_CASE_SHIFT_START;
4585     }
4586 }
4587
4588 // Packs the secondary buffer when processing French locale.
4589 static void
4590 packFrench(const uint8_t *secondaries, int32_t secsize, SortKeyByteSink &result) {
4591     secondaries += secsize;  // We read the secondary-level bytes back to front.
4592     uint8_t secondary;
4593     int32_t count2 = 0;
4594     int32_t i = 0;
4595     // we use i here since the key size already accounts for terminators, so we'll discard the increment
4596     for(i = 0; i<secsize; i++) {
4597         secondary = *(secondaries-i-1);
4598         /* This is compression code. */
4599         if (secondary == UCOL_COMMON2) {
4600             ++count2;
4601         } else {
4602             if (count2 > 0) {
4603                 if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4604                     while (count2 > UCOL_TOP_COUNT2) {
4605                         result.Append(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4606                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4607                     }
4608                     result.Append(UCOL_COMMON_TOP2 - (count2-1));
4609                 } else {
4610                     while (count2 > UCOL_BOT_COUNT2) {
4611                         result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4612                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4613                     }
4614                     result.Append(UCOL_COMMON_BOT2 + (count2-1));
4615                 }
4616                 count2 = 0;
4617             }
4618             result.Append(secondary);
4619         }
4620     }
4621     if (count2 > 0) {
4622         while (count2 > UCOL_BOT_COUNT2) {
4623             result.Append(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4624             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4625         }
4626         result.Append(UCOL_COMMON_BOT2 + (count2-1));
4627     }
4628 }
4629
4630 #define DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0
4631
4632 /* This is the sortkey work horse function */
4633 U_CFUNC void U_CALLCONV
4634 ucol_calcSortKey(const    UCollator    *coll,
4635         const    UChar        *source,
4636         int32_t        sourceLength,
4637         SortKeyByteSink &result,
4638         UErrorCode *status)
4639 {
4640     if(U_FAILURE(*status)) {
4641         return;
4642     }
4643
4644     SortKeyByteSink &primaries = result;
4645     SortKeyLevel secondaries;
4646     SortKeyLevel tertiaries;
4647     SortKeyLevel cases;
4648     SortKeyLevel quads;
4649
4650     UnicodeString normSource;
4651
4652     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4653
4654     UColAttributeValue strength = coll->strength;
4655
4656     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4657     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4658     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4659     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4660     UBool  doCase = (coll->caseLevel == UCOL_ON);
4661     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4662     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4663     //UBool  qShifted = shifted && (compareQuad == 0);
4664     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4665
4666     uint32_t variableTopValue = coll->variableTopValue;
4667     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4668     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4669     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4670     uint8_t UCOL_HIRAGANA_QUAD = 0;
4671     if(doHiragana) {
4672         UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4673         /* allocate one more space for hiragana, value for hiragana */
4674     }
4675     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4676
4677     /* support for special features like caselevel and funky secondaries */
4678     int32_t lastSecondaryLength = 0;
4679     uint32_t caseShift = 0;
4680
4681     /* If we need to normalize, we'll do it all at once at the beginning! */
4682     const Normalizer2 *norm2;
4683     if(compareIdent) {
4684         norm2 = Normalizer2Factory::getNFDInstance(*status);
4685     } else if(coll->normalizationMode != UCOL_OFF) {
4686         norm2 = Normalizer2Factory::getFCDInstance(*status);
4687     } else {
4688         norm2 = NULL;
4689     }
4690     if(norm2 != NULL) {
4691         normSource.setTo(FALSE, source, len);
4692         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
4693         if(qcYesLength != len) {
4694             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
4695             normSource.truncate(qcYesLength);
4696             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
4697             source = normSource.getBuffer();
4698             len = normSource.length();
4699         }
4700     }
4701     collIterate s;
4702     IInit_collIterate(coll, source, len, &s, status);
4703     if(U_FAILURE(*status)) {
4704         return;
4705     }
4706     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
4707
4708     uint32_t order = 0;
4709
4710     uint8_t primary1 = 0;
4711     uint8_t primary2 = 0;
4712     uint8_t secondary = 0;
4713     uint8_t tertiary = 0;
4714     uint8_t caseSwitch = coll->caseSwitch;
4715     uint8_t tertiaryMask = coll->tertiaryMask;
4716     int8_t tertiaryAddition = coll->tertiaryAddition;
4717     uint8_t tertiaryTop = coll->tertiaryTop;
4718     uint8_t tertiaryBottom = coll->tertiaryBottom;
4719     uint8_t tertiaryCommon = coll->tertiaryCommon;
4720     uint8_t caseBits = 0;
4721
4722     UBool wasShifted = FALSE;
4723     UBool notIsContinuation = FALSE;
4724
4725     uint32_t count2 = 0, count3 = 0, count4 = 0;
4726     uint8_t leadPrimary = 0;
4727
4728     for(;;) {
4729         order = ucol_IGetNextCE(coll, &s, status);
4730         if(order == UCOL_NO_MORE_CES) {
4731             break;
4732         }
4733
4734         if(order == 0) {
4735             continue;
4736         }
4737
4738         notIsContinuation = !isContinuation(order);
4739
4740         if(notIsContinuation) {
4741             tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4742         } else {
4743             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4744         }
4745
4746         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4747         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4748         primary1 = (uint8_t)(order >> 8);
4749
4750         uint8_t originalPrimary1 = primary1;
4751         if(notIsContinuation && coll->leadBytePermutationTable != NULL) {
4752             primary1 = coll->leadBytePermutationTable[primary1];
4753         }
4754
4755         if((shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4756                         || (!notIsContinuation && wasShifted)))
4757             || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
4758         {
4759             /* and other ignorables should be removed if following a shifted code point */
4760             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4761                 /* we should just completely ignore it */
4762                 continue;
4763             }
4764             if(compareQuad == 0) {
4765                 if(count4 > 0) {
4766                     while (count4 > UCOL_BOT_COUNT4) {
4767                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4768                         count4 -= UCOL_BOT_COUNT4;
4769                     }
4770                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4771                     count4 = 0;
4772                 }
4773                 /* We are dealing with a variable and we're treating them as shifted */
4774                 /* This is a shifted ignorable */
4775                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4776                     quads.appendByte(primary1);
4777                 }
4778                 if(primary2 != 0) {
4779                     quads.appendByte(primary2);
4780                 }
4781             }
4782             wasShifted = TRUE;
4783         } else {
4784             wasShifted = FALSE;
4785             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4786             /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
4787             /* regular and simple sortkey calc */
4788             if(primary1 != UCOL_IGNORABLE) {
4789                 if(notIsContinuation) {
4790                     if(leadPrimary == primary1) {
4791                         primaries.Append(primary2);
4792                     } else {
4793                         if(leadPrimary != 0) {
4794                             primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4795                         }
4796                         if(primary2 == UCOL_IGNORABLE) {
4797                             /* one byter, not compressed */
4798                             primaries.Append(primary1);
4799                             leadPrimary = 0;
4800                         } else if(isCompressible(coll, originalPrimary1)) {
4801                             /* compress */
4802                             primaries.Append(leadPrimary = primary1, primary2);
4803                         } else {
4804                             leadPrimary = 0;
4805                             primaries.Append(primary1, primary2);
4806                         }
4807                     }
4808                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4809                     if(primary2 == UCOL_IGNORABLE) {
4810                         primaries.Append(primary1);
4811                     } else {
4812                         primaries.Append(primary1, primary2);
4813                     }
4814                 }
4815             }
4816
4817             if(secondary > compareSec) {
4818                 if(!isFrenchSec) {
4819                     /* This is compression code. */
4820                     if (secondary == UCOL_COMMON2 && notIsContinuation) {
4821                         ++count2;
4822                     } else {
4823                         if (count2 > 0) {
4824                             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4825                                 while (count2 > UCOL_TOP_COUNT2) {
4826                                     secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4827                                     count2 -= (uint32_t)UCOL_TOP_COUNT2;
4828                                 }
4829                                 secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
4830                             } else {
4831                                 while (count2 > UCOL_BOT_COUNT2) {
4832                                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4833                                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
4834                                 }
4835                                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4836                             }
4837                             count2 = 0;
4838                         }
4839                         secondaries.appendByte(secondary);
4840                     }
4841                 } else {
4842                     /* Do the special handling for French secondaries */
4843                     /* We need to get continuation elements and do intermediate restore */
4844                     /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4845                     if(notIsContinuation) {
4846                         if (lastSecondaryLength > 1) {
4847                             uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4848                             if (frenchStartPtr != NULL) {
4849                                 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4850                                 uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4851                                 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4852                             }
4853                         }
4854                         lastSecondaryLength = 1;
4855                     } else {
4856                         ++lastSecondaryLength;
4857                     }
4858                     secondaries.appendByte(secondary);
4859                 }
4860             }
4861
4862             if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) {
4863                 // do the case level if we need to do it. We don't want to calculate
4864                 // case level for primary ignorables if we have only primary strength and case level
4865                 // otherwise we would break well formedness of CEs
4866                 doCaseShift(cases, caseShift);
4867                 if(notIsContinuation) {
4868                     caseBits = (uint8_t)(tertiary & 0xC0);
4869
4870                     if(tertiary != 0) {
4871                         if(coll->caseFirst == UCOL_UPPER_FIRST) {
4872                             if((caseBits & 0xC0) == 0) {
4873                                 cases.lastByte() |= 1 << (--caseShift);
4874                             } else {
4875                                 cases.lastByte() |= 0 << (--caseShift);
4876                                 /* second bit */
4877                                 doCaseShift(cases, caseShift);
4878                                 cases.lastByte() |= ((caseBits>>6)&1) << (--caseShift);
4879                             }
4880                         } else {
4881                             if((caseBits & 0xC0) == 0) {
4882                                 cases.lastByte() |= 0 << (--caseShift);
4883                             } else {
4884                                 cases.lastByte() |= 1 << (--caseShift);
4885                                 /* second bit */
4886                                 doCaseShift(cases, caseShift);
4887                                 cases.lastByte() |= ((caseBits>>7)&1) << (--caseShift);
4888                             }
4889                         }
4890                     }
4891                 }
4892             } else {
4893                 if(notIsContinuation) {
4894                     tertiary ^= caseSwitch;
4895                 }
4896             }
4897
4898             tertiary &= tertiaryMask;
4899             if(tertiary > compareTer) {
4900                 /* This is compression code. */
4901                 /* sequence size check is included in the if clause */
4902                 if (tertiary == tertiaryCommon && notIsContinuation) {
4903                     ++count3;
4904                 } else {
4905                     if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
4906                         tertiary += tertiaryAddition;
4907                     } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
4908                         tertiary -= tertiaryAddition;
4909                     }
4910                     if (count3 > 0) {
4911                         if ((tertiary > tertiaryCommon)) {
4912                             while (count3 > coll->tertiaryTopCount) {
4913                                 tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4914                                 count3 -= (uint32_t)coll->tertiaryTopCount;
4915                             }
4916                             tertiaries.appendByte(tertiaryTop - (count3-1));
4917                         } else {
4918                             while (count3 > coll->tertiaryBottomCount) {
4919                                 tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4920                                 count3 -= (uint32_t)coll->tertiaryBottomCount;
4921                             }
4922                             tertiaries.appendByte(tertiaryBottom + (count3-1));
4923                         }
4924                         count3 = 0;
4925                     }
4926                     tertiaries.appendByte(tertiary);
4927                 }
4928             }
4929
4930             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4931                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4932                     if(count4>0) { // Close this part
4933                         while (count4 > UCOL_BOT_COUNT4) {
4934                             quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4935                             count4 -= UCOL_BOT_COUNT4;
4936                         }
4937                         quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
4938                         count4 = 0;
4939                     }
4940                     quads.appendByte(UCOL_HIRAGANA_QUAD); // Add the Hiragana
4941                 } else { // This wasn't Hiragana, so we can continue adding stuff
4942                     count4++;
4943                 }
4944             }
4945         }
4946     }
4947
4948     /* Here, we are generally done with processing */
4949     /* bailing out would not be too productive */
4950
4951     UBool ok = TRUE;
4952     if(U_SUCCESS(*status)) {
4953         /* we have done all the CE's, now let's put them together to form a key */
4954         if(compareSec == 0) {
4955             if (count2 > 0) {
4956                 while (count2 > UCOL_BOT_COUNT2) {
4957                     secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4958                     count2 -= (uint32_t)UCOL_BOT_COUNT2;
4959                 }
4960                 secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
4961             }
4962             result.Append(UCOL_LEVELTERMINATOR);
4963             if(!secondaries.isOk()) {
4964                 ok = FALSE;
4965             } else if(!isFrenchSec) {
4966                 secondaries.appendTo(result);
4967             } else {
4968                 // If there are any unresolved continuation secondaries,
4969                 // reverse them here so that we can reverse the whole secondary thing.
4970                 if (lastSecondaryLength > 1) {
4971                     uint8_t *frenchStartPtr = secondaries.getLastFewBytes(lastSecondaryLength);
4972                     if (frenchStartPtr != NULL) {
4973                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4974                         uint8_t *frenchEndPtr = frenchStartPtr + lastSecondaryLength - 1;
4975                         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4976                     }
4977                 }
4978                 packFrench(secondaries.data(), secondaries.length(), result);
4979             }
4980         }
4981
4982         if(doCase) {
4983             ok &= cases.isOk();
4984             result.Append(UCOL_LEVELTERMINATOR);
4985             cases.appendTo(result);
4986         }
4987
4988         if(compareTer == 0) {
4989             if (count3 > 0) {
4990                 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4991                     while (count3 >= coll->tertiaryTopCount) {
4992                         tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
4993                         count3 -= (uint32_t)coll->tertiaryTopCount;
4994                     }
4995                     tertiaries.appendByte(tertiaryTop - count3);
4996                 } else {
4997                     while (count3 > coll->tertiaryBottomCount) {
4998                         tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
4999                         count3 -= (uint32_t)coll->tertiaryBottomCount;
5000                     }
5001                     tertiaries.appendByte(tertiaryBottom + (count3-1));
5002                 }
5003             }
5004             ok &= tertiaries.isOk();
5005             result.Append(UCOL_LEVELTERMINATOR);
5006             tertiaries.appendTo(result);
5007
5008             if(compareQuad == 0/*qShifted == TRUE*/) {
5009                 if(count4 > 0) {
5010                     while (count4 > UCOL_BOT_COUNT4) {
5011                         quads.appendByte(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5012                         count4 -= UCOL_BOT_COUNT4;
5013                     }
5014                     quads.appendByte(UCOL_COMMON_BOT4 + (count4-1));
5015                 }
5016                 ok &= quads.isOk();
5017                 result.Append(UCOL_LEVELTERMINATOR);
5018                 quads.appendTo(result);
5019             }
5020
5021             if(compareIdent) {
5022                 result.Append(UCOL_LEVELTERMINATOR);
5023                 u_writeIdenticalLevelRun(s.string, len, result);
5024             }
5025         }
5026         result.Append(0);
5027     }
5028
5029     /* To avoid memory leak, free the offset buffer if necessary. */
5030     ucol_freeOffsetBuffer(&s);
5031
5032     ok &= result.IsOk();
5033     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5034 }
5035
5036
5037 U_CFUNC void U_CALLCONV
5038 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5039         const    UChar        *source,
5040         int32_t        sourceLength,
5041         SortKeyByteSink &result,
5042         UErrorCode *status)
5043 {
5044     U_ALIGN_CODE(16);
5045
5046     if(U_FAILURE(*status)) {
5047         return;
5048     }
5049
5050     SortKeyByteSink &primaries = result;
5051     SortKeyLevel secondaries;
5052     SortKeyLevel tertiaries;
5053
5054     UnicodeString normSource;
5055
5056     int32_t len =  sourceLength;
5057
5058     /* If we need to normalize, we'll do it all at once at the beginning! */
5059     if(coll->normalizationMode != UCOL_OFF) {
5060         normSource.setTo(len < 0, source, len);
5061         const Normalizer2 *norm2 = Normalizer2Factory::getFCDInstance(*status);
5062         int32_t qcYesLength = norm2->spanQuickCheckYes(normSource, *status);
5063         if(qcYesLength != normSource.length()) {
5064             UnicodeString unnormalized = normSource.tempSubString(qcYesLength);
5065             normSource.truncate(qcYesLength);
5066             norm2->normalizeSecondAndAppend(normSource, unnormalized, *status);
5067             source = normSource.getBuffer();
5068             len = normSource.length();
5069         }
5070     }
5071     collIterate s;
5072     IInit_collIterate(coll, (UChar *)source, len, &s, status);
5073     if(U_FAILURE(*status)) {
5074         return;
5075     }
5076     s.flags &= ~UCOL_ITER_NORM;  // source passed the FCD test or else was normalized.
5077
5078     uint32_t order = 0;
5079
5080     uint8_t primary1 = 0;
5081     uint8_t primary2 = 0;
5082     uint8_t secondary = 0;
5083     uint8_t tertiary = 0;
5084     uint8_t caseSwitch = coll->caseSwitch;
5085     uint8_t tertiaryMask = coll->tertiaryMask;
5086     int8_t tertiaryAddition = coll->tertiaryAddition;
5087     uint8_t tertiaryTop = coll->tertiaryTop;
5088     uint8_t tertiaryBottom = coll->tertiaryBottom;
5089     uint8_t tertiaryCommon = coll->tertiaryCommon;
5090
5091     UBool notIsContinuation = FALSE;
5092
5093     uint32_t count2 = 0, count3 = 0;
5094     uint8_t leadPrimary = 0;
5095
5096     for(;;) {
5097         order = ucol_IGetNextCE(coll, &s, status);
5098
5099         if(order == 0) {
5100             continue;
5101         }
5102
5103         if(order == UCOL_NO_MORE_CES) {
5104             break;
5105         }
5106
5107         notIsContinuation = !isContinuation(order);
5108
5109         if(notIsContinuation) {
5110             tertiary = (uint8_t)((order & tertiaryMask));
5111         } else {
5112             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5113         }
5114
5115         secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5116         primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5117         primary1 = (uint8_t)(order >> 8);
5118
5119         uint8_t originalPrimary1 = primary1;
5120         if (coll->leadBytePermutationTable != NULL && notIsContinuation) {
5121             primary1 = coll->leadBytePermutationTable[primary1];
5122         }
5123
5124         /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5125         /* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will   */
5126         /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5127         /* regular and simple sortkey calc */
5128         if(primary1 != UCOL_IGNORABLE) {
5129             if(notIsContinuation) {
5130                 if(leadPrimary == primary1) {
5131                     primaries.Append(primary2);
5132                 } else {
5133                     if(leadPrimary != 0) {
5134                         primaries.Append((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5135                     }
5136                     if(primary2 == UCOL_IGNORABLE) {
5137                         /* one byter, not compressed */
5138                         primaries.Append(primary1);
5139                         leadPrimary = 0;
5140                     } else if(isCompressible(coll, originalPrimary1)) {
5141                         /* compress */
5142                         primaries.Append(leadPrimary = primary1, primary2);
5143                     } else {
5144                         leadPrimary = 0;
5145                         primaries.Append(primary1, primary2);
5146                     }
5147                 }
5148             } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5149                 if(primary2 == UCOL_IGNORABLE) {
5150                     primaries.Append(primary1);
5151                 } else {
5152                     primaries.Append(primary1, primary2);
5153                 }
5154             }
5155         }
5156
5157         if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5158             /* This is compression code. */
5159             if (secondary == UCOL_COMMON2 && notIsContinuation) {
5160                 ++count2;
5161             } else {
5162                 if (count2 > 0) {
5163                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5164                         while (count2 > UCOL_TOP_COUNT2) {
5165                             secondaries.appendByte(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5166                             count2 -= (uint32_t)UCOL_TOP_COUNT2;
5167                         }
5168                         secondaries.appendByte(UCOL_COMMON_TOP2 - (count2-1));
5169                     } else {
5170                         while (count2 > UCOL_BOT_COUNT2) {
5171                             secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5172                             count2 -= (uint32_t)UCOL_BOT_COUNT2;
5173                         }
5174                         secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5175                     }
5176                     count2 = 0;
5177                 }
5178                 secondaries.appendByte(secondary);
5179             }
5180         }
5181
5182         if(notIsContinuation) {
5183             tertiary ^= caseSwitch;
5184         }
5185
5186         if(tertiary > 0) {
5187             /* This is compression code. */
5188             /* sequence size check is included in the if clause */
5189             if (tertiary == tertiaryCommon && notIsContinuation) {
5190                 ++count3;
5191             } else {
5192                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5193                     tertiary += tertiaryAddition;
5194                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5195                     tertiary -= tertiaryAddition;
5196                 }
5197                 if (count3 > 0) {
5198                     if ((tertiary > tertiaryCommon)) {
5199                         while (count3 > coll->tertiaryTopCount) {
5200                             tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5201                             count3 -= (uint32_t)coll->tertiaryTopCount;
5202                         }
5203                         tertiaries.appendByte(tertiaryTop - (count3-1));
5204                     } else {
5205                         while (count3 > coll->tertiaryBottomCount) {
5206                             tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5207                             count3 -= (uint32_t)coll->tertiaryBottomCount;
5208                         }
5209                         tertiaries.appendByte(tertiaryBottom + (count3-1));
5210                     }
5211                     count3 = 0;
5212                 }
5213                 tertiaries.appendByte(tertiary);
5214             }
5215         }
5216     }
5217
5218     UBool ok = TRUE;
5219     if(U_SUCCESS(*status)) {
5220         /* we have done all the CE's, now let's put them together to form a key */
5221         if (count2 > 0) {
5222             while (count2 > UCOL_BOT_COUNT2) {
5223                 secondaries.appendByte(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5224                 count2 -= (uint32_t)UCOL_BOT_COUNT2;
5225             }
5226             secondaries.appendByte(UCOL_COMMON_BOT2 + (count2-1));
5227         }
5228         ok &= secondaries.isOk();
5229         result.Append(UCOL_LEVELTERMINATOR);
5230         secondaries.appendTo(result);
5231
5232         if (count3 > 0) {
5233             if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5234                 while (count3 >= coll->tertiaryTopCount) {
5235                     tertiaries.appendByte(tertiaryTop - coll->tertiaryTopCount);
5236                     count3 -= (uint32_t)coll->tertiaryTopCount;
5237                 }
5238                 tertiaries.appendByte(tertiaryTop - count3);
5239             } else {
5240                 while (count3 > coll->tertiaryBottomCount) {
5241                     tertiaries.appendByte(tertiaryBottom + coll->tertiaryBottomCount);
5242                     count3 -= (uint32_t)coll->tertiaryBottomCount;
5243                 }
5244                 tertiaries.appendByte(tertiaryBottom + (count3-1));
5245             }
5246         }
5247         ok &= tertiaries.isOk();
5248         result.Append(UCOL_LEVELTERMINATOR);
5249         tertiaries.appendTo(result);
5250
5251         result.Append(0);
5252     }
5253
5254     /* To avoid memory leak, free the offset buffer if necessary. */
5255     ucol_freeOffsetBuffer(&s);
5256
5257     ok &= result.IsOk();
5258     if(!ok && U_SUCCESS(*status)) { *status = U_MEMORY_ALLOCATION_ERROR; }
5259 }
5260
5261 static inline
5262 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5263     UBool notIsContinuation = !isContinuation(CE);
5264     uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5265     if((LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5266                || (!notIsContinuation && *wasShifted)))
5267         || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */
5268     {
5269         // The stuff below should probably be in the sortkey code... maybe not...
5270         if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5271             /* we should just completely ignore it */
5272             *wasShifted = TRUE;
5273             //continue;
5274         }
5275         //*wasShifted = TRUE;
5276         return TRUE;
5277     } else {
5278         *wasShifted = FALSE;
5279         return FALSE;
5280     }
5281 }
5282 static inline
5283 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5284     if(level < maxLevel) {
5285         dest[i++] = UCOL_LEVELTERMINATOR;
5286     } else {
5287         dest[i++] = 0;
5288     }
5289 }
5290
/** Level identifiers used while generating a partial sort key. */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** Extra level, not used - but there are three bits available for the level */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** Level for the end of the sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** One past the last level identifier */
    UCOL_PSK_LIMIT
};
5303
/** Collation state bit fields, kept in state[1].
 *  To read a field, shift the state word right by the *_SHIFT value and
 *  AND the result with the matching *_MASK value.
 */
enum {
    /** Level identifier: one of the UCOL_PSK_* level values. Three bits. */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /** Number of bytes of a primary or quaternary already written.
     *  Can only be 0 or 1, since we get up to two bytes from primary or quaternary.
     *  The same bit also marks that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /** Was the last value shifted? Boolean, 0 or 1. */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /** How many French bytes have already been written (two-bit field).
     *  When we do French we need to reverse secondary values. However, continuations
     *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /** Bocsu bytes field (two bits). */
    UCOL_PSK_BOCSU_BYTES_SHIFT = 7,
    UCOL_PSK_BOCSU_BYTES_MASK  = 0x3,

    /** Consumed CEs counter. */
    UCOL_PSK_CONSUMED_CES_SHIFT = 9,
    UCOL_PSK_CONSUMED_CES_MASK  = 0x7FFFF
};
5329
// Macro calculating the number of expansion CEs available, i.e. the distance
// between the CEpos and toReturn members of the iterator state `s`.
// The expansion is fully parenthesized so that it stays one operand when used
// inside a larger expression (e.g. `!uprv_numAvailableExpCEs(s)` or a product).
#define uprv_numAvailableExpCEs(s) ((s).CEpos - (s).toReturn)
5332
5333
5334 /** main sortkey part procedure. On the first call,
5335  *  you should pass in a collator, an iterator, empty state
5336  *  state[0] == state[1] == 0, a buffer to hold results
5337  *  number of bytes you need and an error code pointer.
5338  *  Make sure your buffer is big enough to hold the wanted
5339  *  number of sortkey bytes. I don't check.
5340  *  The only meaningful status you can get back is
5341  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5342  *  have been dealt a raw deal and that you probably won't
5343  *  be able to use partial sortkey generation for this
5344  *  particular combination of string and collator. This
5345  *  is highly unlikely, but you should still check the error code.
5346  *  Any other status means that you're not in a sane situation
5347  *  anymore. After the first call, preserve state values and
5348  *  use them on subsequent calls to obtain more bytes of a sortkey.
5349  *  Use until the number of bytes written is smaller than the requested
5350  *  number of bytes. Generated sortkey is not compatible with the
5351  *  one generated by ucol_getSortKey, as we don't do any compression.
5352  *  However, levels are still terminated by a 1 (one) and the sortkey
5353  *  is terminated by a 0 (zero). Identical level is the same as in the
5354  *  regular sortkey - internal bocu-1 implementation is used.
5355  *  For curious, although you cannot do much about this, here is
5356  *  the structure of state words.
5357  *  state[0] - iterator state. Depends on the iterator implementation,
5358  *             but allows the iterator to continue where it stopped in
5359  *             the last iteration.
5360  *  state[1] - collation processing state. Here is the distribution
5361  *             of the bits:
5362  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5363  *             quaternary, quin (we don't use this one), identical and
5364  *             null (producing only zeroes - first one to terminate the
5365  *             sortkey and subsequent to fill the buffer).
5366  *   3       - byte count. Number of bytes written on the primary level.
5367  *   4       - was shifted. Whether the previous iteration finished in the
5368  *             shifted state.
5369  *   5, 6    - French continuation bytes written. See the comment in the enum
5370  *   7,8     - Bocsu bytes used. Number of bytes from a bocu sequence on
5371  *             the identical level.
5372  *   9..31   - CEs consumed. Number of getCE or next32 operations performed
 *             since the last successful update of the iterator state.
5374  */
5375 U_CAPI int32_t U_EXPORT2
5376 ucol_nextSortKeyPart(const UCollator *coll,
5377                      UCharIterator *iter,
5378                      uint32_t state[2],
5379                      uint8_t *dest, int32_t count,
5380                      UErrorCode *status)
5381 {
5382     /* error checking */
5383     if(status==NULL || U_FAILURE(*status)) {
5384         return 0;
5385     }
5386     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5387     if( coll==NULL || iter==NULL ||
5388         state==NULL ||
5389         count<0 || (count>0 && dest==NULL)
5390     ) {
5391         *status=U_ILLEGAL_ARGUMENT_ERROR;
5392         UTRACE_EXIT_STATUS(status);
5393         return 0;
5394     }
5395
5396     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5397                   coll, iter, state[0], state[1], dest, count);
5398
5399     if(count==0) {
5400         /* nothing to do */
5401         UTRACE_EXIT_VALUE(0);
5402         return 0;
5403     }
5404     /** Setting up situation according to the state we got from the previous iteration */
5405     // The state of the iterator from the previous invocation
5406     uint32_t iterState = state[0];
5407     // Has the last iteration ended in the shifted state
5408     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5409     // What is the current level of the sortkey?
5410     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5411     // Have we written only one byte from a two byte primary in the previous iteration?
5412     // Also on secondary level - have we finished with the French secondary?
5413     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5414     // number of bytes in the continuation buffer for French
5415     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5416     // Number of bytes already written from a bocsu sequence. Since
5417     // the longes bocsu sequence is 4 long, this can be up to 3.
5418     int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK;
5419     // Number of elements that need to be consumed in this iteration because
5420     // the iterator returned UITER_NO_STATE at the end of the last iteration,
5421     // so we had to save the last valid state.
5422     int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK;
5423
5424     /** values that depend on the collator attributes */
5425     // strength of the collator.
5426     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5427     // maximal level of the partial sortkey. Need to take whether case level is done
5428     int32_t maxLevel = 0;
5429     if(strength < UCOL_TERTIARY) {
5430         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5431             maxLevel = UCOL_PSK_CASE;
5432         } else {
5433             maxLevel = strength;
5434         }
5435     } else {
5436         if(strength == UCOL_TERTIARY) {
5437             maxLevel = UCOL_PSK_TERTIARY;
5438         } else if(strength == UCOL_QUATERNARY) {
5439             maxLevel = UCOL_PSK_QUATERNARY;
5440         } else { // identical
5441             maxLevel = UCOL_IDENTICAL;
5442         }
5443     }
5444     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5445     uint8_t UCOL_HIRAGANA_QUAD =
5446       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5447     // Boundary value that decides whether a CE is shifted or not
5448     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5449     // Are we doing French collation?
5450     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5451
5452     /** initializing the collation state */
5453     UBool notIsContinuation = FALSE;
5454     uint32_t CE = UCOL_NO_MORE_CES;
5455
5456     collIterate s;
5457     IInit_collIterate(coll, NULL, -1, &s, status);
5458     if(U_FAILURE(*status)) {
5459         UTRACE_EXIT_STATUS(*status);
5460         return 0;
5461     }
5462     s.iterator = iter;
5463     s.flags |= UCOL_USE_ITERATOR;
5464     // This variable tells us whether we have produced some other levels in this iteration
5465     // before we moved to the identical level. In that case, we need to switch the
5466     // type of the iterator.
5467     UBool doingIdenticalFromStart = FALSE;
5468     // Normalizing iterator
5469     // The division for the array length may truncate the array size to
5470     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5471     // for all platforms anyway.
5472     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5473     UNormIterator *normIter = NULL;
5474     // If the normalization is turned on for the collator and we are below identical level
5475     // we will use a FCD normalizing iterator
5476     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5477         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5478         s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5479         s.flags &= ~UCOL_ITER_NORM;
5480         if(U_FAILURE(*status)) {
5481             UTRACE_EXIT_STATUS(*status);
5482             return 0;
5483         }
5484     } else if(level == UCOL_PSK_IDENTICAL) {
5485         // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5486         // will be updating the state - and this cannot be done on an ordinary iterator.
5487         normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5488         s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5489         s.flags &= ~UCOL_ITER_NORM;
5490         if(U_FAILURE(*status)) {
5491             UTRACE_EXIT_STATUS(*status);
5492             return 0;
5493         }
5494         doingIdenticalFromStart = TRUE;
5495     }
5496
5497     // This is the tentative new state of the iterator. The problem
5498     // is that the iterator might return an undefined state, in
5499     // which case we should save the last valid state and increase
5500     // the iterator skip value.
5501     uint32_t newState = 0;
5502
5503     // First, we set the iterator to the last valid position
5504     // from the last iteration. This was saved in state[0].
5505     if(iterState == 0) {
5506         /* initial state */
5507         if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5508             s.iterator->move(s.iterator, 0, UITER_LIMIT);
5509         } else {
5510             s.iterator->move(s.iterator, 0, UITER_START);
5511         }
5512     } else {
5513         /* reset to previous state */
5514         s.iterator->setState(s.iterator, iterState, status);
5515         if(U_FAILURE(*status)) {
5516             UTRACE_EXIT_STATUS(*status);
5517             return 0;
5518         }
5519     }
5520
5521
5522
5523     // This variable tells us whether we can attempt to update the state
5524     // of iterator. Situations where we don't want to update iterator state
5525     // are the existence of expansion CEs that are not yet processed, and
5526     // finishing the case level without enough space in the buffer to insert
5527     // a level terminator.
5528     UBool canUpdateState = TRUE;
5529
5530     // Consume all the CEs that were consumed at the end of the previous
5531     // iteration without updating the iterator state. On identical level,
5532     // consume the code points.
5533     int32_t counter = cces;
5534     if(level < UCOL_PSK_IDENTICAL) {
5535         while(counter-->0) {
5536             // If we're doing French and we are on the secondary level,
5537             // we go backwards.
5538             if(level == UCOL_PSK_SECONDARY && doingFrench) {
5539                 CE = ucol_IGetPrevCE(coll, &s, status);
5540             } else {
5541                 CE = ucol_IGetNextCE(coll, &s, status);
5542             }
5543             if(CE==UCOL_NO_MORE_CES) {
5544                 /* should not happen */
5545                 *status=U_INTERNAL_PROGRAM_ERROR;
5546                 UTRACE_EXIT_STATUS(*status);
5547                 return 0;
5548             }
5549             if(uprv_numAvailableExpCEs(s)) {
5550                 canUpdateState = FALSE;
5551             }
5552         }
5553     } else {
5554         while(counter-->0) {
5555             uiter_next32(s.iterator);
5556         }
5557     }
5558
5559     // French secondary needs to know whether the iterator state of zero came from previous level OR
5560     // from a new invocation...
5561     UBool wasDoingPrimary = FALSE;
5562     // destination buffer byte counter. When this guy
5563     // gets to count, we're done with the iteration
5564     int32_t i = 0;
5565     // used to count the zero bytes written after we
5566     // have finished with the sort key
5567     int32_t j = 0;
5568
5569
5570     // Hm.... I think we're ready to plunge in. Basic story is as following:
5571     // we have a fall through case based on level. This is used for initial
5572     // positioning on iteration start. Every level processor contains a
5573     // for(;;) which will be broken when we exhaust all the CEs. Other
5574     // way to exit is a goto saveState, which happens when we have filled
5575     // out our buffer.
5576     switch(level) {
5577     case UCOL_PSK_PRIMARY:
5578         wasDoingPrimary = TRUE;
5579         for(;;) {
5580             if(i==count) {
5581                 goto saveState;
5582             }
5583             // We should save the state only if we
5584             // are sure that we are done with the
5585             // previous iterator state
5586             if(canUpdateState && byteCountOrFrenchDone == 0) {
5587                 newState = s.iterator->getState(s.iterator);
5588                 if(newState != UITER_NO_STATE) {
5589                     iterState = newState;
5590                     cces = 0;
5591                 }
5592             }
5593             CE = ucol_IGetNextCE(coll, &s, status);
5594             cces++;
5595             if(CE==UCOL_NO_MORE_CES) {
5596                 // Add the level separator
5597                 terminatePSKLevel(level, maxLevel, i, dest);
5598                 byteCountOrFrenchDone=0;
5599                 // Restart the iteration an move to the
5600                 // second level
5601                 s.iterator->move(s.iterator, 0, UITER_START);
5602                 cces = 0;
5603                 level = UCOL_PSK_SECONDARY;
5604                 break;
5605             }
5606             if(!isContinuation(CE)){
5607                 if(coll->leadBytePermutationTable != NULL){
5608                     CE = (coll->leadBytePermutationTable[CE>>24] << 24) | (CE & 0x00FFFFFF);
5609                 }
5610             }
5611             if(!isShiftedCE(CE, LVT, &wasShifted)) {
5612                 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5613                 if(CE != 0) {
5614                     if(byteCountOrFrenchDone == 0) {
5615                         // get the second byte of primary
5616                         dest[i++]=(uint8_t)(CE >> 8);
5617                     } else {
5618                         byteCountOrFrenchDone = 0;
5619                     }
5620                     if((CE &=0xff)!=0) {
5621                         if(i==count) {
5622                             /* overflow */
5623                             byteCountOrFrenchDone = 1;
5624                             cces--;
5625                             goto saveState;
5626                         }
5627                         dest[i++]=(uint8_t)CE;
5628                     }
5629                 }
5630             }
5631             if(uprv_numAvailableExpCEs(s)) {
5632                 canUpdateState = FALSE;
5633             } else {
5634                 canUpdateState = TRUE;
5635             }
5636         }
5637         /* fall through to next level */
5638     case UCOL_PSK_SECONDARY:
5639         if(strength >= UCOL_SECONDARY) {
5640             if(!doingFrench) {
5641                 for(;;) {
5642                     if(i == count) {
5643                         goto saveState;
5644                     }
5645                     // We should save the state only if we
5646                     // are sure that we are done with the
5647                     // previous iterator state
5648                     if(canUpdateState) {
5649                         newState = s.iterator->getState(s.iterator);
5650                         if(newState != UITER_NO_STATE) {
5651                             iterState = newState;
5652                             cces = 0;
5653                         }
5654                     }
5655                     CE = ucol_IGetNextCE(coll, &s, status);
5656                     cces++;
5657                     if(CE==UCOL_NO_MORE_CES) {
5658                         // Add the level separator
5659                         terminatePSKLevel(level, maxLevel, i, dest);
5660                         byteCountOrFrenchDone = 0;
5661                         // Restart the iteration an move to the
5662                         // second level
5663                         s.iterator->move(s.iterator, 0, UITER_START);
5664                         cces = 0;
5665                         level = UCOL_PSK_CASE;
5666                         break;
5667                     }
5668                     if(!isShiftedCE(CE, LVT, &wasShifted)) {
5669                         CE >>= 8; /* get secondary */
5670                         if(CE != 0) {
5671                             dest[i++]=(uint8_t)CE;
5672                         }
5673                     }
5674                     if(uprv_numAvailableExpCEs(s)) {
5675                         canUpdateState = FALSE;
5676                     } else {
5677                         canUpdateState = TRUE;
5678                     }
5679                 }
5680             } else { // French secondary processing
5681                 uint8_t frenchBuff[UCOL_MAX_BUFFER];
5682                 int32_t frenchIndex = 0;
5683                 // Here we are going backwards.
5684                 // If the iterator is at the beggining, it should be
5685                 // moved to end.
5686                 if(wasDoingPrimary) {
5687                     s.iterator->move(s.iterator, 0, UITER_LIMIT);
5688                     cces = 0;
5689                 }
5690                 for(;;) {
5691                     if(i == count) {
5692                         goto saveState;
5693                     }
5694                     if(canUpdateState) {
5695                         newState = s.iterator->getState(s.iterator);
5696                         if(newState != UITER_NO_STATE) {
5697                             iterState = newState;
5698                             cces = 0;
5699                         }
5700                     }
5701                     CE = ucol_IGetPrevCE(coll, &s, status);
5702                     cces++;
5703                     if(CE==UCOL_NO_MORE_CES) {
5704                         // Add the level separator
5705                         terminatePSKLevel(level, maxLevel, i, dest);
5706                         byteCountOrFrenchDone = 0;
5707                         // Restart the iteration an move to the next level
5708                         s.iterator->move(s.iterator, 0, UITER_START);
5709                         level = UCOL_PSK_CASE;
5710                         break;
5711                     }
5712                     if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5713                         // reverse when we get a first non-continuation CE.
5714                         CE >>= 8;
5715                         frenchBuff[frenchIndex++] = (uint8_t)CE;
5716                     } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5717                         CE >>= 8; /* get secondary */
5718                         if(!frenchIndex) {
5719                             if(CE != 0) {
5720                                 dest[i++]=(uint8_t)CE;
5721                             }
5722                         } else {
5723                             frenchBuff[frenchIndex++] = (uint8_t)CE;
5724                             frenchIndex -= usedFrench;
5725                             usedFrench = 0;
5726                             while(i < count && frenchIndex) {
5727                                 dest[i++] = frenchBuff[--frenchIndex];
5728                                 usedFrench++;
5729                             }
5730                         }
5731                     }
5732                     if(uprv_numAvailableExpCEs(s)) {
5733                         canUpdateState = FALSE;
5734                     } else {
5735                         canUpdateState = TRUE;
5736                     }
5737                 }
5738             }
5739         } else {
5740             level = UCOL_PSK_CASE;
5741         }
5742         /* fall through to next level */
5743     case UCOL_PSK_CASE:
5744         if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5745             uint32_t caseShift = UCOL_CASE_SHIFT_START;
5746             uint8_t caseByte = UCOL_CASE_BYTE_START;
5747             uint8_t caseBits = 0;
5748
5749             for(;;) {
5750                 U_ASSERT(caseShift <= UCOL_CASE_SHIFT_START);
5751                 if(i == count) {
5752                     goto saveState;
5753                 }
5754                 // We should save the state only if we
5755                 // are sure that we are done with the
5756                 // previous iterator state
5757                 if(canUpdateState) {
5758                     newState = s.iterator->getState(s.iterator);
5759                     if(newState != UITER_NO_STATE) {
5760                         iterState = newState;
5761                         cces = 0;
5762                     }
5763                 }
5764                 CE = ucol_IGetNextCE(coll, &s, status);
5765                 cces++;
5766                 if(CE==UCOL_NO_MORE_CES) {
5767                     // On the case level we might have an unfinished
5768                     // case byte. Add one if it's started.
5769                     if(caseShift != UCOL_CASE_SHIFT_START) {
5770                         dest[i++] = caseByte;
5771                     }
5772                     cces = 0;
5773                     // We have finished processing CEs on this level.
5774                     // However, we don't know if we have enough space
5775                     // to add a case level terminator.
5776                     if(i < count) {
5777                         // Add the level separator
5778                         terminatePSKLevel(level, maxLevel, i, dest);
5779                         // Restart the iteration and move to the
5780                         // next level
5781                         s.iterator->move(s.iterator, 0, UITER_START);
5782                         level = UCOL_PSK_TERTIARY;
5783                     } else {
5784                         canUpdateState = FALSE;
5785                     }
5786                     break;
5787                 }
5788
5789                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5790                     if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) {
5791                         // do the case level if we need to do it. We don't want to calculate
5792                         // case level for primary ignorables if we have only primary strength and case level
5793                         // otherwise we would break well formedness of CEs
5794                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5795                         caseBits = (uint8_t)(CE & 0xC0);
5796                         // this copies the case level logic from the
5797                         // sort key generation code
5798                         if(CE != 0) {
5799                             if (caseShift == 0) {
5800                                 dest[i++] = caseByte;
5801                                 caseShift = UCOL_CASE_SHIFT_START;
5802                                 caseByte = UCOL_CASE_BYTE_START;
5803                             }
5804                             if(coll->caseFirst == UCOL_UPPER_FIRST) {
5805                                 if((caseBits & 0xC0) == 0) {
5806                                     caseByte |= 1 << (--caseShift);
5807                                 } else {
5808                                     caseByte |= 0 << (--caseShift);
5809                                     /* second bit */
5810                                     if(caseShift == 0) {
5811                                         dest[i++] = caseByte;
5812                                         caseShift = UCOL_CASE_SHIFT_START;
5813                                         caseByte = UCOL_CASE_BYTE_START;
5814                                     }
5815                                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
5816                                 }
5817                             } else {
5818                                 if((caseBits & 0xC0) == 0) {
5819                                     caseByte |= 0 << (--caseShift);
5820                                 } else {
5821                                     caseByte |= 1 << (--caseShift);
5822                                     /* second bit */
5823                                     if(caseShift == 0) {
5824                                         dest[i++] = caseByte;
5825                                         caseShift = UCOL_CASE_SHIFT_START;
5826                                         caseByte = UCOL_CASE_BYTE_START;
5827                                     }
5828                                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
5829                                 }
5830                             }
5831                         }
5832
5833                     }
5834                 }
5835                 // Not sure this is correct for the case level - revisit
5836                 if(uprv_numAvailableExpCEs(s)) {
5837                     canUpdateState = FALSE;
5838                 } else {
5839                     canUpdateState = TRUE;
5840                 }
5841             }
5842         } else {
5843             level = UCOL_PSK_TERTIARY;
5844         }
5845         /* fall through to next level */
5846     case UCOL_PSK_TERTIARY:
5847         if(strength >= UCOL_TERTIARY) {
5848             for(;;) {
5849                 if(i == count) {
5850                     goto saveState;
5851                 }
5852                 // We should save the state only if we
5853                 // are sure that we are done with the
5854                 // previous iterator state
5855                 if(canUpdateState) {
5856                     newState = s.iterator->getState(s.iterator);
5857                     if(newState != UITER_NO_STATE) {
5858                         iterState = newState;
5859                         cces = 0;
5860                     }
5861                 }
5862                 CE = ucol_IGetNextCE(coll, &s, status);
5863                 cces++;
5864                 if(CE==UCOL_NO_MORE_CES) {
5865                     // Add the level separator
5866                     terminatePSKLevel(level, maxLevel, i, dest);
5867                     byteCountOrFrenchDone = 0;
5868                     // Restart the iteration an move to the
5869                     // second level
5870                     s.iterator->move(s.iterator, 0, UITER_START);
5871                     cces = 0;
5872                     level = UCOL_PSK_QUATERNARY;
5873                     break;
5874                 }
5875                 if(!isShiftedCE(CE, LVT, &wasShifted)) {
5876                     notIsContinuation = !isContinuation(CE);
5877
5878                     if(notIsContinuation) {
5879                         CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5880                         CE ^= coll->caseSwitch;
5881                         CE &= coll->tertiaryMask;
5882                     } else {
5883                         CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
5884                     }
5885
5886                     if(CE != 0) {
5887                         dest[i++]=(uint8_t)CE;
5888                     }
5889                 }
5890                 if(uprv_numAvailableExpCEs(s)) {
5891                     canUpdateState = FALSE;
5892                 } else {
5893                     canUpdateState = TRUE;
5894                 }
5895             }
5896         } else {
5897             // if we're not doing tertiary
5898             // skip to the end
5899             level = UCOL_PSK_NULL;
5900         }
5901         /* fall through to next level */
5902     case UCOL_PSK_QUATERNARY:
5903         if(strength >= UCOL_QUATERNARY) {
5904             for(;;) {
5905                 if(i == count) {
5906                     goto saveState;
5907                 }
5908                 // We should save the state only if we
5909                 // are sure that we are done with the
5910                 // previous iterator state
5911                 if(canUpdateState) {
5912                     newState = s.iterator->getState(s.iterator);
5913                     if(newState != UITER_NO_STATE) {
5914                         iterState = newState;
5915                         cces = 0;
5916                     }
5917                 }
5918                 CE = ucol_IGetNextCE(coll, &s, status);
5919                 cces++;
5920                 if(CE==UCOL_NO_MORE_CES) {
5921                     // Add the level separator
5922                     terminatePSKLevel(level, maxLevel, i, dest);
5923                     //dest[i++] = UCOL_LEVELTERMINATOR;
5924                     byteCountOrFrenchDone = 0;
5925                     // Restart the iteration an move to the
5926                     // second level
5927                     s.iterator->move(s.iterator, 0, UITER_START);
5928                     cces = 0;
5929                     level = UCOL_PSK_QUIN;
5930                     break;
5931                 }
5932                 if(CE==0)
5933                     continue;
5934                 if(isShiftedCE(CE, LVT, &wasShifted)) {
5935                     CE >>= 16; /* get primary */
5936                     if(CE != 0) {
5937                         if(byteCountOrFrenchDone == 0) {
5938                             dest[i++]=(uint8_t)(CE >> 8);
5939                         } else {
5940                             byteCountOrFrenchDone = 0;
5941                         }
5942                         if((CE &=0xff)!=0) {
5943                             if(i==count) {
5944                                 /* overflow */
5945                                 byteCountOrFrenchDone = 1;
5946                                 goto saveState;
5947                             }
5948                             dest[i++]=(uint8_t)CE;
5949                         }
5950                     }
5951                 } else {
5952                     notIsContinuation = !isContinuation(CE);
5953                     if(notIsContinuation) {
5954                         if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5955                             dest[i++] = UCOL_HIRAGANA_QUAD;
5956                         } else {
5957                             dest[i++] = 0xFF;
5958                         }
5959                     }
5960                 }
5961                 if(uprv_numAvailableExpCEs(s)) {
5962                     canUpdateState = FALSE;
5963                 } else {
5964                     canUpdateState = TRUE;
5965                 }
5966             }
5967         } else {
5968             // if we're not doing quaternary
5969             // skip to the end
5970             level = UCOL_PSK_NULL;
5971         }
5972         /* fall through to next level */
5973     case UCOL_PSK_QUIN:
5974         level = UCOL_PSK_IDENTICAL;
5975         /* fall through to next level */
5976     case UCOL_PSK_IDENTICAL:
5977         if(strength >= UCOL_IDENTICAL) {
5978             UChar32 first, second;
5979             int32_t bocsuBytesWritten = 0;
5980             // We always need to do identical on
5981             // the NFD form of the string.
5982             if(normIter == NULL) {
5983                 // we arrived from the level below and
5984                 // normalization was not turned on.
5985                 // therefore, we need to make a fresh NFD iterator
5986                 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5987                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5988             } else if(!doingIdenticalFromStart) {
5989                 // there is an iterator, but we did some other levels.
5990                 // therefore, we have a FCD iterator - need to make
5991                 // a NFD one.
5992                 // normIter being at the beginning does not guarantee
5993                 // that the underlying iterator is at the beginning
5994                 iter->move(iter, 0, UITER_START);
5995                 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5996             }
5997             // At this point we have a NFD iterator that is positioned
5998             // in the right place
5999             if(U_FAILURE(*status)) {
6000                 UTRACE_EXIT_STATUS(*status);
6001                 return 0;
6002             }
6003             first = uiter_previous32(s.iterator);
6004             // maybe we're at the start of the string
6005             if(first == U_SENTINEL) {
6006                 first = 0;
6007             } else {
6008                 uiter_next32(s.iterator);
6009             }
6010
6011             j = 0;
6012             for(;;) {
6013                 if(i == count) {
6014                     if(j+1 < bocsuBytesWritten) {
6015                         bocsuBytesUsed = j+1;
6016                     }
6017                     goto saveState;
6018                 }
6019
6020                 // On identical level, we will always save
6021                 // the state if we reach this point, since
6022                 // we don't depend on getNextCE for content
6023                 // all the content is in our buffer and we
6024                 // already either stored the full buffer OR
6025                 // otherwise we won't arrive here.
6026                 newState = s.iterator->getState(s.iterator);
6027                 if(newState != UITER_NO_STATE) {
6028                     iterState = newState;
6029                     cces = 0;
6030                 }
6031
6032                 uint8_t buff[4];
6033                 second = uiter_next32(s.iterator);
6034                 cces++;
6035
6036                 // end condition for identical level
6037                 if(second == U_SENTINEL) {
6038                     terminatePSKLevel(level, maxLevel, i, dest);
6039                     level = UCOL_PSK_NULL;
6040                     break;
6041                 }
6042                 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6043                 first = second;
6044
6045                 j = 0;
6046                 if(bocsuBytesUsed != 0) {
6047                     while(bocsuBytesUsed-->0) {
6048                         j++;
6049                     }
6050                 }
6051
6052                 while(i < count && j < bocsuBytesWritten) {
6053                     dest[i++] = buff[j++];
6054                 }
6055             }
6056
6057         } else {
6058             level = UCOL_PSK_NULL;
6059         }
6060         /* fall through to next level */
6061     case UCOL_PSK_NULL:
6062         j = i;
6063         while(j<count) {
6064             dest[j++]=0;
6065         }
6066         break;
6067     default:
6068         *status = U_INTERNAL_PROGRAM_ERROR;
6069         UTRACE_EXIT_STATUS(*status);
6070         return 0;
6071     }
6072
6073 saveState:
6074     // Now we need to return stuff. First we want to see whether we have
6075     // done everything for the current state of iterator.
6076     if(byteCountOrFrenchDone
6077         || canUpdateState == FALSE
6078         || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE)
6079     {
6080         // Any of above mean that the previous transaction
6081         // wasn't finished and that we should store the
6082         // previous iterator state.
6083         state[0] = iterState;
6084     } else {
6085         // The transaction is complete. We will continue in the next iteration.
6086         state[0] = s.iterator->getState(s.iterator);
6087         cces = 0;
6088     }
6089     // Store the number of bocsu bytes written.
6090     if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) {
6091         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6092     }
6093     state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT;
6094
6095     // Next we put in the level of comparison
6096     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6097
6098     // If we are doing French, we need to store whether we have just finished the French level
6099     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6100         state[1] |= (((int32_t)(state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6101     } else {
6102         state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6103     }
6104
6105     // Was the latest CE shifted
6106     if(wasShifted) {
6107         state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6108     }
6109     // Check for cces overflow
6110     if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) {
6111         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6112     }
6113     // Store cces
6114     state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT);
6115
6116     // Check for French overflow
6117     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6118         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6119     }
6120     // Store number of bytes written in the French secondary continuation sequence
6121     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6122
6123
6124     // If we have used normalizing iterator, get rid of it
6125     if(normIter != NULL) {
6126         unorm_closeIter(normIter);
6127     }
6128
6129     /* To avoid memory leak, free the offset buffer if necessary. */
6130     ucol_freeOffsetBuffer(&s);
6131
6132     // Return number of meaningful sortkey bytes.
6133     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6134                   dest,i, state[0], state[1]);
6135     UTRACE_EXIT_VALUE(i);
6136     return i;
6137 }
6138
6139 /**
6140  * Produce a bound for a given sortkey and a number of levels.
6141  */
6142 U_CAPI int32_t U_EXPORT2
6143 ucol_getBound(const uint8_t       *source,
6144         int32_t             sourceLength,
6145         UColBoundMode       boundType,
6146         uint32_t            noOfLevels,
6147         uint8_t             *result,
6148         int32_t             resultLength,
6149         UErrorCode          *status)
6150 {
6151     // consistency checks
6152     if(status == NULL || U_FAILURE(*status)) {
6153         return 0;
6154     }
6155     if(source == NULL) {
6156         *status = U_ILLEGAL_ARGUMENT_ERROR;
6157         return 0;
6158     }
6159
6160     int32_t sourceIndex = 0;
6161     // Scan the string until we skip enough of the key OR reach the end of the key
6162     do {
6163         sourceIndex++;
6164         if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6165             noOfLevels--;
6166         }
6167     } while (noOfLevels > 0
6168         && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6169
6170     if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6171         && noOfLevels > 0) {
6172             *status = U_SORT_KEY_TOO_SHORT_WARNING;
6173     }
6174
6175
6176     // READ ME: this code assumes that the values for boundType
6177     // enum will not changes. They are set so that the enum value
6178     // corresponds to the number of extra bytes each bound type
6179     // needs.
6180     if(result != NULL && resultLength >= sourceIndex+boundType) {
6181         uprv_memcpy(result, source, sourceIndex);
6182         switch(boundType) {
6183             // Lower bound just gets terminated. No extra bytes
6184         case UCOL_BOUND_LOWER: // = 0
6185             break;
6186             // Upper bound needs one extra byte
6187         case UCOL_BOUND_UPPER: // = 1
6188             result[sourceIndex++] = 2;
6189             break;
6190             // Upper long bound needs two extra bytes
6191         case UCOL_BOUND_UPPER_LONG: // = 2
6192             result[sourceIndex++] = 0xFF;
6193             result[sourceIndex++] = 0xFF;
6194             break;
6195         default:
6196             *status = U_ILLEGAL_ARGUMENT_ERROR;
6197             return 0;
6198         }
6199         result[sourceIndex++] = 0;
6200
6201         return sourceIndex;
6202     } else {
6203         return sourceIndex+boundType+1;
6204     }
6205 }
6206
6207 /****************************************************************************/
6208 /* Following are the functions that deal with the properties of a collator  */
6209 /* there are new APIs and some compatibility APIs                           */
6210 /****************************************************************************/
6211
6212 static inline void
6213 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6214                     int32_t *primShift, int32_t *secShift, int32_t *terShift)
6215 {
6216     uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6217     UBool reverseSecondary = FALSE;
6218     UBool continuation = isContinuation(CE);
6219     if(!continuation) {
6220         tertiary = (uint8_t)((CE & coll->tertiaryMask));
6221         tertiary ^= coll->caseSwitch;
6222         reverseSecondary = TRUE;
6223     } else {
6224         tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6225         tertiary &= UCOL_REMOVE_CASE;
6226         reverseSecondary = FALSE;
6227     }
6228
6229     secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6230     primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6231     primary1 = (uint8_t)(CE >> 8);
6232
6233     if(primary1 != 0) {
6234         if (coll->leadBytePermutationTable != NULL && !continuation) {
6235             primary1 = coll->leadBytePermutationTable[primary1];
6236         }
6237
6238         coll->latinOneCEs[ch] |= (primary1 << *primShift);
6239         *primShift -= 8;
6240     }
6241     if(primary2 != 0) {
6242         if(*primShift < 0) {
6243             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6244             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6245             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6246             return;
6247         }
6248         coll->latinOneCEs[ch] |= (primary2 << *primShift);
6249         *primShift -= 8;
6250     }
6251     if(secondary != 0) {
6252         if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6253             coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6254             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6255         } else { // normal case
6256             coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6257         }
6258         *secShift -= 8;
6259     }
6260     if(tertiary != 0) {
6261         coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6262         *terShift -= 8;
6263     }
6264 }
6265
6266 static inline UBool
6267 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6268     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6269     if(newTable == NULL) {
6270       *status = U_MEMORY_ALLOCATION_ERROR;
6271       coll->latinOneFailed = TRUE;
6272       return FALSE;
6273     }
6274     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6275     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6276     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6277     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6278     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6279     coll->latinOneTableLen = size;
6280     uprv_free(coll->latinOneCEs);
6281     coll->latinOneCEs = newTable;
6282     return TRUE;
6283 }
6284
6285 static UBool
6286 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6287     UBool result = TRUE;
6288     if(coll->latinOneCEs == NULL) {
6289         coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6290         if(coll->latinOneCEs == NULL) {
6291             *status = U_MEMORY_ALLOCATION_ERROR;
6292             return FALSE;
6293         }
6294         coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6295     }
6296     UChar ch = 0;
6297     UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6298     // Check for null pointer
6299     if (U_FAILURE(*status)) {
6300         ucol_closeElements(it);
6301         return FALSE;
6302     }
6303     uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6304
6305     int32_t primShift = 24, secShift = 24, terShift = 24;
6306     uint32_t CE = 0;
6307     int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6308
6309     // TODO: make safe if you get more than you wanted...
6310     for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6311         primShift = 24; secShift = 24; terShift = 24;
6312         if(ch < 0x100) {
6313             CE = coll->latinOneMapping[ch];
6314         } else {
6315             CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
6316             if(CE == UCOL_NOT_FOUND && coll->UCA) {
6317                 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
6318             }
6319         }
6320         if(CE < UCOL_NOT_FOUND) {
6321             ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6322         } else {
6323             switch (getCETag(CE)) {
6324             case EXPANSION_TAG:
6325             case DIGIT_TAG:
6326                 ucol_setText(it, &ch, 1, status);
6327                 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6328                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6329                         coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6330                         coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6331                         coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6332                         break;
6333                     }
6334                     ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6335                 }
6336                 break;
6337             case CONTRACTION_TAG:
6338                 // here is the trick
6339                 // F2 is contraction. We do something very similar to contractions
6340                 // but have two indices, one in the real contraction table and the
6341                 // other to where we stuffed things. This hopes that we don't have
6342                 // many contractions (this should work for latin-1 tables).
6343                 {
6344                     if((CE & 0x00FFF000) != 0) {
6345                         *status = U_UNSUPPORTED_ERROR;
6346                         goto cleanup_after_failure;
6347                     }
6348
6349                     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6350
6351                     CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6352
6353                     coll->latinOneCEs[ch] = CE;
6354                     coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6355                     coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6356
6357                     // We're going to jump into contraction table, pick the elements
6358                     // and use them
6359                     do {
6360                         CE = *(coll->contractionCEs +
6361                             (UCharOffset - coll->contractionIndex));
6362                         if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
6363                             uint32_t size;
6364                             uint32_t i;    /* general counter */
6365                             uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6366                             size = getExpansionCount(CE);
6367                             //CE = *CEOffset++;
6368                             if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6369                                 for(i = 0; i<size; i++) {
6370                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6371                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6372                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6373                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6374                                         break;
6375                                     }
6376                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6377                                 }
6378                             } else { /* else, we do */
6379                                 while(*CEOffset != 0) {
6380                                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6381                                         coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6382                                         coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6383                                         coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6384                                         break;
6385                                     }
6386                                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6387                                 }
6388                             }
6389                             contractionOffset++;
6390                         } else if(CE < UCOL_NOT_FOUND) {
6391                             ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6392                         } else {
6393                             coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6394                             coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6395                             coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6396                             contractionOffset++;
6397                         }
6398                         UCharOffset++;
6399                         primShift = 24; secShift = 24; terShift = 24;
6400                         if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6401                             if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6402                                 goto cleanup_after_failure;
6403                             }
6404                         }
6405                     } while(*UCharOffset != 0xFFFF);
6406                 }
6407                 break;;
6408             case SPEC_PROC_TAG:
6409                 {
6410                     // 0xB7 is a precontext character defined in UCA5.1, a special
6411                     // handle is implemeted in order to save LatinOne table for
6412                     // most locales.
6413                     if (ch==0xb7) {
6414                         ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6415                     }
6416                     else {
6417                         goto cleanup_after_failure;
6418                     }
6419                 }
6420                 break;
6421             default:
6422                 goto cleanup_after_failure;
6423             }
6424         }
6425     }
6426     // compact table
6427     if(contractionOffset < coll->latinOneTableLen) {
6428         if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6429             goto cleanup_after_failure;
6430         }
6431     }
6432     ucol_closeElements(it);
6433     return result;
6434
6435 cleanup_after_failure:
6436     // status should already be set before arriving here.
6437     coll->latinOneFailed = TRUE;
6438     ucol_closeElements(it);
6439     return FALSE;
6440 }
6441
6442 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6443     if(U_SUCCESS(*status)) {
6444         if(coll->caseFirst == UCOL_UPPER_FIRST) {
6445             coll->caseSwitch = UCOL_CASE_SWITCH;
6446         } else {
6447             coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6448         }
6449
6450         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6451             coll->tertiaryMask = UCOL_REMOVE_CASE;
6452             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6453             coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */
6454             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6455             coll->tertiaryBottom = UCOL_COMMON_BOT3;
6456         } else {
6457             coll->tertiaryMask = UCOL_KEEP_CASE;
6458             coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6459             if(coll->caseFirst == UCOL_UPPER_FIRST) {
6460                 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6461                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6462                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6463             } else {
6464                 coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6465                 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6466                 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6467             }
6468         }
6469
6470         /* Set the compression values */
6471         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - coll->tertiaryBottom - 1);
6472         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6473         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6474
6475         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6476             && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE)
6477         {
6478             coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6479         } else {
6480             coll->sortKeyGen = ucol_calcSortKey;
6481         }
6482         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
6483             && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed)
6484         {
6485             if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6486                 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6487                     //fprintf(stderr, "F");
6488                     coll->latinOneUse = TRUE;
6489                 } else {
6490                     coll->latinOneUse = FALSE;
6491                 }
6492                 if(*status == U_UNSUPPORTED_ERROR) {
6493                     *status = U_ZERO_ERROR;
6494                 }
6495             } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6496                 coll->latinOneUse = TRUE;
6497             }
6498         } else {
6499             coll->latinOneUse = FALSE;
6500         }
6501     }
6502 }
6503
6504 U_CAPI uint32_t  U_EXPORT2
6505 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6506     if(U_FAILURE(*status) || coll == NULL) {
6507         return 0;
6508     }
6509     if(len == -1) {
6510         len = u_strlen(varTop);
6511     }
6512     if(len == 0) {
6513         *status = U_ILLEGAL_ARGUMENT_ERROR;
6514         return 0;
6515     }
6516
6517     if(coll->delegate!=NULL) {
6518       return ((Collator*)coll->delegate)->setVariableTop(varTop, len, *status);
6519     }
6520
6521
6522     collIterate s;
6523     IInit_collIterate(coll, varTop, len, &s, status);
6524     if(U_FAILURE(*status)) {
6525         return 0;
6526     }
6527
6528     uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6529
6530     /* here we check if we have consumed all characters */
6531     /* you can put in either one character or a contraction */
6532     /* you shouldn't put more... */
6533     if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6534         *status = U_CE_NOT_FOUND_ERROR;
6535         return 0;
6536     }
6537
6538     uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6539
6540     if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6541         *status = U_PRIMARY_TOO_LONG_ERROR;
6542         return 0;
6543     }
6544     if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
6545         coll->variableTopValueisDefault = FALSE;
6546         coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6547     }
6548
6549     /* To avoid memory leak, free the offset buffer if necessary. */
6550     ucol_freeOffsetBuffer(&s);
6551
6552     return CE & UCOL_PRIMARYMASK;
6553 }
6554
6555 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6556     if(U_FAILURE(*status) || coll == NULL) {
6557         return 0;
6558     }
6559     if(coll->delegate!=NULL) {
6560       return ((const Collator*)coll->delegate)->getVariableTop(*status);
6561     }
6562     return coll->variableTopValue<<16;
6563 }
6564
6565 U_CAPI void  U_EXPORT2
6566 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6567     if(U_FAILURE(*status) || coll == NULL) {
6568         return;
6569     }
6570
6571     if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
6572         coll->variableTopValueisDefault = FALSE;
6573         coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6574     }
6575 }
6576 /* Attribute setter API */
6577 U_CAPI void  U_EXPORT2
6578 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6579     if(U_FAILURE(*status) || coll == NULL) {
6580       return;
6581     }
6582
6583     if(coll->delegate != NULL) {
6584       ((Collator*)coll->delegate)->setAttribute(attr,value,*status);
6585       return;
6586     }
6587
6588     UColAttributeValue oldFrench = coll->frenchCollation;
6589     UColAttributeValue oldCaseFirst = coll->caseFirst;
6590     switch(attr) {
6591     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6592         if(value == UCOL_ON) {
6593             coll->numericCollation = UCOL_ON;
6594             coll->numericCollationisDefault = FALSE;
6595         } else if (value == UCOL_OFF) {
6596             coll->numericCollation = UCOL_OFF;
6597             coll->numericCollationisDefault = FALSE;
6598         } else if (value == UCOL_DEFAULT) {
6599             coll->numericCollationisDefault = TRUE;
6600             coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6601         } else {
6602             *status = U_ILLEGAL_ARGUMENT_ERROR;
6603         }
6604         break;
6605     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6606         if(value == UCOL_ON || value == UCOL_OFF || value == UCOL_DEFAULT) {
6607             // This attribute is an implementation detail of the CLDR Japanese tailoring.
6608             // The implementation might change to use a different mechanism
6609             // to achieve the same Japanese sort order.
6610             // Since ICU 50, this attribute is not settable any more via API functions.
6611         } else {
6612             *status = U_ILLEGAL_ARGUMENT_ERROR;
6613         }
6614         break;
6615     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6616         if(value == UCOL_ON) {
6617             coll->frenchCollation = UCOL_ON;
6618             coll->frenchCollationisDefault = FALSE;
6619         } else if (value == UCOL_OFF) {
6620             coll->frenchCollation = UCOL_OFF;
6621             coll->frenchCollationisDefault = FALSE;
6622         } else if (value == UCOL_DEFAULT) {
6623             coll->frenchCollationisDefault = TRUE;
6624             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6625         } else {
6626             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6627         }
6628         break;
6629     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6630         if(value == UCOL_SHIFTED) {
6631             coll->alternateHandling = UCOL_SHIFTED;
6632             coll->alternateHandlingisDefault = FALSE;
6633         } else if (value == UCOL_NON_IGNORABLE) {
6634             coll->alternateHandling = UCOL_NON_IGNORABLE;
6635             coll->alternateHandlingisDefault = FALSE;
6636         } else if (value == UCOL_DEFAULT) {
6637             coll->alternateHandlingisDefault = TRUE;
6638             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6639         } else {
6640             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6641         }
6642         break;
6643     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6644         if(value == UCOL_LOWER_FIRST) {
6645             coll->caseFirst = UCOL_LOWER_FIRST;
6646             coll->caseFirstisDefault = FALSE;
6647         } else if (value == UCOL_UPPER_FIRST) {
6648             coll->caseFirst = UCOL_UPPER_FIRST;
6649             coll->caseFirstisDefault = FALSE;
6650         } else if (value == UCOL_OFF) {
6651             coll->caseFirst = UCOL_OFF;
6652             coll->caseFirstisDefault = FALSE;
6653         } else if (value == UCOL_DEFAULT) {
6654             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6655             coll->caseFirstisDefault = TRUE;
6656         } else {
6657             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6658         }
6659         break;
6660     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6661         if(value == UCOL_ON) {
6662             coll->caseLevel = UCOL_ON;
6663             coll->caseLevelisDefault = FALSE;
6664         } else if (value == UCOL_OFF) {
6665             coll->caseLevel = UCOL_OFF;
6666             coll->caseLevelisDefault = FALSE;
6667         } else if (value == UCOL_DEFAULT) {
6668             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6669             coll->caseLevelisDefault = TRUE;
6670         } else {
6671             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6672         }
6673         break;
6674     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6675         if(value == UCOL_ON) {
6676             coll->normalizationMode = UCOL_ON;
6677             coll->normalizationModeisDefault = FALSE;
6678             initializeFCD(status);
6679         } else if (value == UCOL_OFF) {
6680             coll->normalizationMode = UCOL_OFF;
6681             coll->normalizationModeisDefault = FALSE;
6682         } else if (value == UCOL_DEFAULT) {
6683             coll->normalizationModeisDefault = TRUE;
6684             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6685             if(coll->normalizationMode == UCOL_ON) {
6686                 initializeFCD(status);
6687             }
6688         } else {
6689             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6690         }
6691         break;
6692     case UCOL_STRENGTH:         /* attribute for strength */
6693         if (value == UCOL_DEFAULT) {
6694             coll->strengthisDefault = TRUE;
6695             coll->strength = (UColAttributeValue)coll->options->strength;
6696         } else if (value <= UCOL_IDENTICAL) {
6697             coll->strengthisDefault = FALSE;
6698             coll->strength = value;
6699         } else {
6700             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6701         }
6702         break;
6703     case UCOL_ATTRIBUTE_COUNT:
6704     default:
6705         *status = U_ILLEGAL_ARGUMENT_ERROR;
6706         break;
6707     }
6708     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6709         coll->latinOneRegenTable = TRUE;
6710     } else {
6711         coll->latinOneRegenTable = FALSE;
6712     }
6713     ucol_updateInternalState(coll, status);
6714 }
6715
6716 U_CAPI UColAttributeValue  U_EXPORT2
6717 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6718     if(U_FAILURE(*status) || coll == NULL) {
6719       return UCOL_DEFAULT;
6720     }
6721
6722     if(coll->delegate != NULL) {
6723       return ((Collator*)coll->delegate)->getAttribute(attr,*status);
6724     }
6725
6726     switch(attr) {
6727     case UCOL_NUMERIC_COLLATION:
6728       return coll->numericCollation;
6729     case UCOL_HIRAGANA_QUATERNARY_MODE:
6730       return coll->hiraganaQ;
6731     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6732         return coll->frenchCollation;
6733     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6734         return coll->alternateHandling;
6735     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6736         return coll->caseFirst;
6737     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6738         return coll->caseLevel;
6739     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6740         return coll->normalizationMode;
6741     case UCOL_STRENGTH:         /* attribute for strength */
6742         return coll->strength;
6743     case UCOL_ATTRIBUTE_COUNT:
6744     default:
6745         *status = U_ILLEGAL_ARGUMENT_ERROR;
6746         break;
6747     }
6748     return UCOL_DEFAULT;
6749 }
6750
6751 U_CAPI void U_EXPORT2
6752 ucol_setStrength(    UCollator                *coll,
6753             UCollationStrength        strength)
6754 {
6755     UErrorCode status = U_ZERO_ERROR;
6756     ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6757 }
6758
6759 U_CAPI UCollationStrength U_EXPORT2
6760 ucol_getStrength(const UCollator *coll)
6761 {
6762     UErrorCode status = U_ZERO_ERROR;
6763     return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6764 }
6765
6766 U_CAPI int32_t U_EXPORT2
6767 ucol_getReorderCodes(const UCollator *coll,
6768                     int32_t *dest,
6769                     int32_t destCapacity,
6770                     UErrorCode *status) {
6771     if (U_FAILURE(*status)) {
6772         return 0;
6773     }
6774
6775     if(coll->delegate!=NULL) {
6776       return ((const Collator*)coll->delegate)->getReorderCodes(dest, destCapacity, *status);
6777     }
6778
6779     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6780         *status = U_ILLEGAL_ARGUMENT_ERROR;
6781         return 0;
6782     }
6783
6784 #ifdef UCOL_DEBUG
6785     printf("coll->reorderCodesLength = %d\n", coll->reorderCodesLength);
6786     printf("coll->defaultReorderCodesLength = %d\n", coll->defaultReorderCodesLength);
6787 #endif
6788
6789     if (coll->reorderCodesLength > destCapacity) {
6790         *status = U_BUFFER_OVERFLOW_ERROR;
6791         return coll->reorderCodesLength;
6792     }
6793     for (int32_t i = 0; i < coll->reorderCodesLength; i++) {
6794         dest[i] = coll->reorderCodes[i];
6795     }
6796     return coll->reorderCodesLength;
6797 }
6798
6799 U_CAPI void U_EXPORT2
6800 ucol_setReorderCodes(UCollator* coll,
6801                     const int32_t* reorderCodes,
6802                     int32_t reorderCodesLength,
6803                     UErrorCode *status) {
6804     if (U_FAILURE(*status)) {
6805         return;
6806     }
6807
6808     if (reorderCodesLength < 0 || (reorderCodesLength > 0 && reorderCodes == NULL)) {
6809         *status = U_ILLEGAL_ARGUMENT_ERROR;
6810         return;
6811     }
6812
6813     if(coll->delegate!=NULL) {
6814       ((Collator*)coll->delegate)->setReorderCodes(reorderCodes, reorderCodesLength, *status);
6815       return;
6816     }
6817
6818     if (coll->reorderCodes != NULL && coll->freeReorderCodesOnClose == TRUE) {
6819         uprv_free(coll->reorderCodes);
6820     }
6821     coll->reorderCodes = NULL;
6822     coll->reorderCodesLength = 0;
6823     if (reorderCodesLength == 0) {
6824         if (coll->leadBytePermutationTable != NULL && coll->freeLeadBytePermutationTableOnClose == TRUE) {
6825             uprv_free(coll->leadBytePermutationTable);
6826         }
6827         coll->leadBytePermutationTable = NULL;
6828         return;
6829     }
6830     coll->reorderCodes = (int32_t*) uprv_malloc(reorderCodesLength * sizeof(int32_t));
6831     if (coll->reorderCodes == NULL) {
6832         *status = U_MEMORY_ALLOCATION_ERROR;
6833         return;
6834     }
6835     coll->freeReorderCodesOnClose = TRUE;
6836     for (int32_t i = 0; i < reorderCodesLength; i++) {
6837         coll->reorderCodes[i] = reorderCodes[i];
6838     }
6839     coll->reorderCodesLength = reorderCodesLength;
6840     ucol_buildPermutationTable(coll, status);
6841 }
6842
6843 U_CAPI int32_t U_EXPORT2
6844 ucol_getEquivalentReorderCodes(int32_t reorderCode,
6845                     int32_t* dest,
6846                     int32_t destCapacity,
6847                     UErrorCode *pErrorCode) {
6848     bool equivalentCodesSet[USCRIPT_CODE_LIMIT];
6849     uint16_t leadBytes[256];
6850     int leadBytesCount;
6851     int leadByteIndex;
6852     int16_t reorderCodesForLeadByte[USCRIPT_CODE_LIMIT];
6853     int reorderCodesForLeadByteCount;
6854     int reorderCodeIndex;
6855
6856     int32_t equivalentCodesCount = 0;
6857     int setIndex;
6858
6859     if (U_FAILURE(*pErrorCode)) {
6860         return 0;
6861     }
6862
6863     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
6864         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
6865         return 0;
6866     }
6867
6868     uprv_memset(equivalentCodesSet, 0, USCRIPT_CODE_LIMIT * sizeof(bool));
6869
6870     const UCollator* uca = ucol_initUCA(pErrorCode);
6871     if (U_FAILURE(*pErrorCode)) {
6872         return 0;
6873     }
6874     leadBytesCount = ucol_getLeadBytesForReorderCode(uca, reorderCode, leadBytes, 256);
6875     for (leadByteIndex = 0; leadByteIndex < leadBytesCount; leadByteIndex++) {
6876         reorderCodesForLeadByteCount = ucol_getReorderCodesForLeadByte(
6877             uca, leadBytes[leadByteIndex], reorderCodesForLeadByte, USCRIPT_CODE_LIMIT);
6878         for (reorderCodeIndex = 0; reorderCodeIndex < reorderCodesForLeadByteCount; reorderCodeIndex++) {
6879             equivalentCodesSet[reorderCodesForLeadByte[reorderCodeIndex]] = true;
6880         }
6881     }
6882
6883     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6884         if (equivalentCodesSet[setIndex] == true) {
6885             equivalentCodesCount++;
6886         }
6887     }
6888
6889     if (destCapacity == 0) {
6890         return equivalentCodesCount;
6891     }
6892
6893     equivalentCodesCount = 0;
6894     for (setIndex = 0; setIndex < USCRIPT_CODE_LIMIT; setIndex++) {
6895         if (equivalentCodesSet[setIndex] == true) {
6896             dest[equivalentCodesCount++] = setIndex;
6897             if (equivalentCodesCount >= destCapacity) {
6898                 break;
6899             }
6900         }
6901     }
6902     return equivalentCodesCount;
6903 }
6904
6905
6906 /****************************************************************************/
6907 /* Following are misc functions                                             */
6908 /* there are new APIs and some compatibility APIs                           */
6909 /****************************************************************************/
6910
6911 U_CAPI void U_EXPORT2
6912 ucol_getVersion(const UCollator* coll,
6913                 UVersionInfo versionInfo)
6914 {
6915     if(coll->delegate!=NULL) {
6916       ((const Collator*)coll->delegate)->getVersion(versionInfo);
6917       return;
6918     }
6919     /* RunTime version  */
6920     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
6921     /* Builder version*/
6922     uint8_t bdVersion = coll->image->version[0];
6923
6924     /* Charset Version. Need to get the version from cnv files
6925      * makeconv should populate cnv files with version and
6926      * an api has to be provided in ucnv.h to obtain this version
6927      */
6928     uint8_t csVersion = 0;
6929
6930     /* combine the version info */
6931     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
6932
6933     /* Tailoring rules */
6934     versionInfo[0] = (uint8_t)(cmbVersion>>8);
6935     versionInfo[1] = (uint8_t)cmbVersion;
6936     versionInfo[2] = coll->image->version[1];
6937     if(coll->UCA) {
6938         /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */
6939         versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07);
6940     } else {
6941         versionInfo[3] = 0;
6942     }
6943 }
6944
6945
6946 /* This internal API checks whether a character is tailored or not */
6947 U_CAPI UBool  U_EXPORT2
6948 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
6949     if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) {
6950         return FALSE;
6951     }
6952
6953     uint32_t CE = UCOL_NOT_FOUND;
6954     const UChar *ContractionStart = NULL;
6955     if(u < 0x100) { /* latin-1 */
6956         CE = coll->latinOneMapping[u];
6957         if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
6958             return FALSE;
6959         }
6960     } else { /* regular */
6961         CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u);
6962     }
6963
6964     if(isContraction(CE)) {
6965         ContractionStart = (UChar *)coll->image+getContractOffset(CE);
6966         CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
6967     }
6968
6969     return (UBool)(CE != UCOL_NOT_FOUND);
6970 }
6971
6972
6973 /****************************************************************************/
6974 /* Following are the string compare functions                               */
6975 /*                                                                          */
6976 /****************************************************************************/
6977
6978
6979 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
6980 /*                     Used by strcoll if strength == identical and strings  */
6981 /*                     are otherwise equal.                                  */
6982 /*                                                                           */
6983 /*                     Comparison must be done on NFD normalized strings.    */
6984 /*                     FCD is not good enough.                               */
6985
6986 static
6987 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
6988 {
6989     // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
6990     // of same type, but that doesn't really mean that it will stay that way.
6991     int32_t            comparison;
6992
6993     if (sColl->flags & UCOL_USE_ITERATOR) {
6994         // The division for the array length may truncate the array size to
6995         // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6996         // for all platforms anyway.
6997         UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6998         UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6999         UNormIterator *sNIt = NULL, *tNIt = NULL;
7000         sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7001         tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7002         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7003         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7004         UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7005         UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7006         comparison = u_strCompareIter(sIt, tIt, TRUE);
7007         unorm_closeIter(sNIt);
7008         unorm_closeIter(tNIt);
7009     } else {
7010         int32_t sLen      = (sColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(sColl->endp - sColl->string) : -1;
7011         const UChar *sBuf = sColl->string;
7012         int32_t tLen      = (tColl->flags & UCOL_ITER_HASLEN) ? (int32_t)(tColl->endp - tColl->string) : -1;
7013         const UChar *tBuf = tColl->string;
7014
7015         if (normalize) {
7016             *status = U_ZERO_ERROR;
7017             // Note: We could use Normalizer::compare() or similar, but for short strings
7018             // which may not be in FCD it might be faster to just NFD them.
7019             // Note: spanQuickCheckYes() + normalizeSecondAndAppend() rather than
7020             // NFD'ing immediately might be faster for long strings,
7021             // but string comparison is usually done on relatively short strings.
7022             sColl->nfd->normalize(UnicodeString((sColl->flags & UCOL_ITER_HASLEN) == 0, sBuf, sLen),
7023                                   sColl->writableBuffer,
7024                                   *status);
7025             tColl->nfd->normalize(UnicodeString((tColl->flags & UCOL_ITER_HASLEN) == 0, tBuf, tLen),
7026                                   tColl->writableBuffer,
7027                                   *status);
7028             if(U_FAILURE(*status)) {
7029                 return UCOL_LESS;
7030             }
7031             comparison = sColl->writableBuffer.compareCodePointOrder(tColl->writableBuffer);
7032         } else {
7033             comparison = u_strCompare(sBuf, sLen, tBuf, tLen, TRUE);
7034         }
7035     }
7036
7037     if (comparison < 0) {
7038         return UCOL_LESS;
7039     } else if (comparison == 0) {
7040         return UCOL_EQUAL;
7041     } else /* comparison > 0 */ {
7042         return UCOL_GREATER;
7043     }
7044 }
7045
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */
/*                                                                     */
/*          The buffer starts out in the embedded localArray and is    */
/*          moved to heap memory by ucol_CEBuf_Expand() when it fills. */

/* Number of CEs that fit into the embedded array of a ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    /* Start of the storage currently in use: localArray or a heap block. */
    uint32_t    *buf;
    /* One past the last usable slot of the current storage. */
    uint32_t    *endp;
    /* Slot that receives the next CE. */
    uint32_t    *pos;
    /* Initial storage, used until the first expansion. */
    uint32_t     localArray[UCOL_CEBUF_SIZE];
} ucol_CEBuf;
7056
7057
7058 static
7059 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7060     (b)->buf = (b)->pos = (b)->localArray;
7061     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7062 }
7063
7064 static
7065 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) {
7066     uint32_t  oldSize;
7067     uint32_t  newSize;
7068     uint32_t  *newBuf;
7069
7070     ci->flags |= UCOL_ITER_ALLOCATED;
7071     oldSize = (uint32_t)(b->pos - b->buf);
7072     newSize = oldSize * 2;
7073     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7074     if(newBuf == NULL) {
7075         *status = U_MEMORY_ALLOCATION_ERROR;
7076     }
7077     else {
7078         uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7079         if (b->buf != b->localArray) {
7080             uprv_free(b->buf);
7081         }
7082         b->buf = newBuf;
7083         b->endp = b->buf + newSize;
7084         b->pos  = b->buf + oldSize;
7085     }
7086 }
7087
7088 static
7089 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) {
7090     if (b->pos == b->endp) {
7091         ucol_CEBuf_Expand(b, ci, status);
7092     }
7093     if (U_SUCCESS(*status)) {
7094         *(b)->pos++ = ce;
7095     }
7096 }
7097
7098 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7099 /* It is used when compare gets in trouble and needs to bail out                     */
7100 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7101                                                   collIterate *tColl,
7102                                                   UErrorCode *status)
7103 {
7104     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7105     uint8_t *sourceKeyP = sourceKey;
7106     uint8_t *targetKeyP = targetKey;
7107     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7108     const UCollator *coll = sColl->coll;
7109     const UChar *source = NULL;
7110     const UChar *target = NULL;
7111     int32_t result = UCOL_EQUAL;
7112     UnicodeString sourceString, targetString;
7113     int32_t sourceLength;
7114     int32_t targetLength;
7115
7116     if(sColl->flags & UCOL_USE_ITERATOR) {
7117         sColl->iterator->move(sColl->iterator, 0, UITER_START);
7118         tColl->iterator->move(tColl->iterator, 0, UITER_START);
7119         UChar32 c;
7120         while((c=sColl->iterator->next(sColl->iterator))>=0) {
7121             sourceString.append((UChar)c);
7122         }
7123         while((c=tColl->iterator->next(tColl->iterator))>=0) {
7124             targetString.append((UChar)c);
7125         }
7126         source = sourceString.getBuffer();
7127         sourceLength = sourceString.length();
7128         target = targetString.getBuffer();
7129         targetLength = targetString.length();
7130     } else { // no iterators
7131         sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(int32_t)(sColl->endp-sColl->string):-1;
7132         targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(int32_t)(tColl->endp-tColl->string):-1;
7133         source = sColl->string;
7134         target = tColl->string;
7135     }
7136
7137
7138
7139     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7140     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7141         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7142         if(sourceKeyP == NULL) {
7143             *status = U_MEMORY_ALLOCATION_ERROR;
7144             goto cleanup_and_do_compare;
7145         }
7146         sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7147     }
7148
7149     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7150     if(targetKeyLen > UCOL_MAX_BUFFER) {
7151         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7152         if(targetKeyP == NULL) {
7153             *status = U_MEMORY_ALLOCATION_ERROR;
7154             goto cleanup_and_do_compare;
7155         }
7156         targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7157     }
7158
7159     result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7160
7161 cleanup_and_do_compare:
7162     if(sourceKeyP != NULL && sourceKeyP != sourceKey) {
7163         uprv_free(sourceKeyP);
7164     }
7165
7166     if(targetKeyP != NULL && targetKeyP != targetKey) {
7167         uprv_free(targetKeyP);
7168     }
7169
7170     if(result<0) {
7171         return UCOL_LESS;
7172     } else if(result>0) {
7173         return UCOL_GREATER;
7174     } else {
7175         return UCOL_EQUAL;
7176     }
7177 }
7178
7179
7180 static UCollationResult
7181 ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
7182 {
7183     U_ALIGN_CODE(16);
7184
7185     const UCollator *coll = sColl->coll;
7186
7187
7188     // setting up the collator parameters
7189     UColAttributeValue strength = coll->strength;
7190     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7191
7192     UBool checkSecTer = initialCheckSecTer;
7193     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7194     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7195     UBool checkIdent = (strength == UCOL_IDENTICAL);
7196     UBool checkCase = (coll->caseLevel == UCOL_ON);
7197     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7198     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7199     UBool qShifted = shifted && checkQuad;
7200     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7201
7202     if(doHiragana && shifted) {
7203         return (ucol_compareUsingSortKeys(sColl, tColl, status));
7204     }
7205     uint8_t caseSwitch = coll->caseSwitch;
7206     uint8_t tertiaryMask = coll->tertiaryMask;
7207
7208     // This is the lowest primary value that will not be ignored if shifted
7209     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7210
7211     UCollationResult result = UCOL_EQUAL;
7212     UCollationResult hirResult = UCOL_EQUAL;
7213
7214     // Preparing the CE buffers. They will be filled during the primary phase
7215     ucol_CEBuf   sCEs;
7216     ucol_CEBuf   tCEs;
7217     UCOL_INIT_CEBUF(&sCEs);
7218     UCOL_INIT_CEBUF(&tCEs);
7219
7220     uint32_t secS = 0, secT = 0;
7221     uint32_t sOrder=0, tOrder=0;
7222
7223     // Non shifted primary processing is quite simple
7224     if(!shifted) {
7225         for(;;) {
7226
7227             // We fetch CEs until we hit a non ignorable primary or end.
7228             do {
7229                 // We get the next CE
7230                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7231                 // Stuff it in the buffer
7232                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7233                 // And keep just the primary part.
7234                 sOrder &= UCOL_PRIMARYMASK;
7235             } while(sOrder == 0);
7236
7237             // see the comments on the above block
7238             do {
7239                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7240                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7241                 tOrder &= UCOL_PRIMARYMASK;
7242             } while(tOrder == 0);
7243
7244             // if both primaries are the same
7245             if(sOrder == tOrder) {
7246                 // and there are no more CEs, we advance to the next level
7247                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7248                     break;
7249                 }
7250                 if(doHiragana && hirResult == UCOL_EQUAL) {
7251                     if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7252                         hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7253                             ? UCOL_LESS:UCOL_GREATER;
7254                     }
7255                 }
7256             } else {
7257                 // only need to check one for continuation
7258                 // if one is then the other must be or the preceding CE would be a prefix of the other
7259                 if (coll->leadBytePermutationTable != NULL && !isContinuation(sOrder)) {
7260                     sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7261                     tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7262                 }
7263                 // if two primaries are different, we are done
7264                 result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7265                 goto commonReturn;
7266             }
7267         } // no primary difference... do the rest from the buffers
7268     } else { // shifted - do a slightly more complicated processing :)
7269         for(;;) {
7270             UBool sInShifted = FALSE;
7271             UBool tInShifted = FALSE;
7272             // This version of code can be refactored. However, it seems easier to understand this way.
7273             // Source loop. Sam as the target loop.
7274             for(;;) {
7275                 sOrder = ucol_IGetNextCE(coll, sColl, status);
7276                 if(sOrder == UCOL_NO_MORE_CES) {
7277                     UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7278                     break;
7279                 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7280                     /* UCA amendment - ignore ignorables that follow shifted code points */
7281                     continue;
7282                 } else if(isContinuation(sOrder)) {
7283                     if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7284                         if(sInShifted) {
7285                             sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7286                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7287                             continue;
7288                         } else {
7289                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7290                             break;
7291                         }
7292                     } else { /* Just lower level values */
7293                         if(sInShifted) {
7294                             continue;
7295                         } else {
7296                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7297                             continue;
7298                         }
7299                     }
7300                 } else { /* regular */
7301                     if(coll->leadBytePermutationTable != NULL){
7302                         sOrder = (coll->leadBytePermutationTable[sOrder>>24] << 24) | (sOrder & 0x00FFFFFF);
7303                     }
7304                     if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7305                         UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7306                         break;
7307                     } else {
7308                         if((sOrder & UCOL_PRIMARYMASK) > 0) {
7309                             sInShifted = TRUE;
7310                             sOrder &= UCOL_PRIMARYMASK;
7311                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7312                             continue;
7313                         } else {
7314                             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status);
7315                             sInShifted = FALSE;
7316                             continue;
7317                         }
7318                     }
7319                 }
7320             }
7321             sOrder &= UCOL_PRIMARYMASK;
7322             sInShifted = FALSE;
7323
7324             for(;;) {
7325                 tOrder = ucol_IGetNextCE(coll, tColl, status);
7326                 if(tOrder == UCOL_NO_MORE_CES) {
7327                     UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7328                     break;
7329                 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7330                     /* UCA amendment - ignore ignorables that follow shifted code points */
7331                     continue;
7332                 } else if(isContinuation(tOrder)) {
7333                     if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7334                         if(tInShifted) {
7335                             tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7336                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7337                             continue;
7338                         } else {
7339                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7340                             break;
7341                         }
7342                     } else { /* Just lower level values */
7343                         if(tInShifted) {
7344                             continue;
7345                         } else {
7346                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7347                             continue;
7348                         }
7349                     }
7350                 } else { /* regular */
7351                     if(coll->leadBytePermutationTable != NULL){
7352                         tOrder = (coll->leadBytePermutationTable[tOrder>>24] << 24) | (tOrder & 0x00FFFFFF);
7353                     }
7354                     if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7355                         UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7356                         break;
7357                     } else {
7358                         if((tOrder & UCOL_PRIMARYMASK) > 0) {
7359                             tInShifted = TRUE;
7360                             tOrder &= UCOL_PRIMARYMASK;
7361                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7362                             continue;
7363                         } else {
7364                             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status);
7365                             tInShifted = FALSE;
7366                             continue;
7367                         }
7368                     }
7369                 }
7370             }
7371             tOrder &= UCOL_PRIMARYMASK;
7372             tInShifted = FALSE;
7373
7374             if(sOrder == tOrder) {
7375                 /*
7376                 if(doHiragana && hirResult == UCOL_EQUAL) {
7377                 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7378                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7379                 ? UCOL_LESS:UCOL_GREATER;
7380                 }
7381                 }
7382                 */
7383                 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7384                     break;
7385                 } else {
7386                     sOrder = 0;
7387                     tOrder = 0;
7388                     continue;
7389                 }
7390             } else {
7391                 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7392                 goto commonReturn;
7393             }
7394         } /* no primary difference... do the rest from the buffers */
7395     }
7396
7397     /* now, we're gonna reexamine collected CEs */
7398     uint32_t    *sCE;
7399     uint32_t    *tCE;
7400
7401     /* This is the secondary level of comparison */
7402     if(checkSecTer) {
7403         if(!isFrenchSec) { /* normal */
7404             sCE = sCEs.buf;
7405             tCE = tCEs.buf;
7406             for(;;) {
7407                 while (secS == 0) {
7408                     secS = *(sCE++) & UCOL_SECONDARYMASK;
7409                 }
7410
7411                 while(secT == 0) {
7412                     secT = *(tCE++) & UCOL_SECONDARYMASK;
7413                 }
7414
7415                 if(secS == secT) {
7416                     if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7417                         break;
7418                     } else {
7419                         secS = 0; secT = 0;
7420                         continue;
7421                     }
7422                 } else {
7423                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7424                     goto commonReturn;
7425                 }
7426             }
7427         } else { /* do the French */
7428             uint32_t *sCESave = NULL;
7429             uint32_t *tCESave = NULL;
7430             sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7431             tCE = tCEs.pos-2;
7432             for(;;) {
7433                 while (secS == 0 && sCE >= sCEs.buf) {
7434                     if(sCESave == NULL) {
7435                         secS = *(sCE--);
7436                         if(isContinuation(secS)) {
7437                             while(isContinuation(secS = *(sCE--)))
7438                                 ;
7439                             /* after this, secS has the start of continuation, and sCEs points before that */
7440                             sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7441                             sCE+=2;  /* need to point to the first continuation CP */
7442                             /* However, now you can just continue doing stuff */
7443                         }
7444                     } else {
7445                         secS = *(sCE++);
7446                         if(!isContinuation(secS)) { /* This means we have finished with this cont */
7447                             sCE = sCESave;            /* reset the pointer to before continuation */
7448                             sCESave = NULL;
7449                             secS = 0;  /* Fetch a fresh CE before the continuation sequence. */
7450                             continue;
7451                         }
7452                     }
7453                     secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7454                 }
7455
7456                 while(secT == 0 && tCE >= tCEs.buf) {
7457                     if(tCESave == NULL) {
7458                         secT = *(tCE--);
7459                         if(isContinuation(secT)) {
7460                             while(isContinuation(secT = *(tCE--)))
7461                                 ;
7462                             /* after this, secS has the start of continuation, and sCEs points before that */
7463                             tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7464                             tCE+=2;  /* need to point to the first continuation CP */
7465                             /* However, now you can just continue doing stuff */
7466                         }
7467                     } else {
7468                         secT = *(tCE++);
7469                         if(!isContinuation(secT)) { /* This means we have finished with this cont */
7470                             tCE = tCESave;          /* reset the pointer to before continuation */
7471                             tCESave = NULL;
7472                             secT = 0;  /* Fetch a fresh CE before the continuation sequence. */
7473                             continue;
7474                         }
7475                     }
7476                     secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7477                 }
7478
7479                 if(secS == secT) {
7480                     if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7481                         break;
7482                     } else {
7483                         secS = 0; secT = 0;
7484                         continue;
7485                     }
7486                 } else {
7487                     result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7488                     goto commonReturn;
7489                 }
7490             }
7491         }
7492     }
7493
7494     /* doing the case bit */
7495     if(checkCase) {
7496         sCE = sCEs.buf;
7497         tCE = tCEs.buf;
7498         for(;;) {
7499             while((secS & UCOL_REMOVE_CASE) == 0) {
7500                 if(!isContinuation(*sCE++)) {
7501                     secS =*(sCE-1);
7502                     if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7503                         // primary ignorables should not be considered on the case level when the strength is primary
7504                         // otherwise, the CEs stop being well-formed
7505                         secS &= UCOL_TERT_CASE_MASK;
7506                         secS ^= caseSwitch;
7507                     } else {
7508                         secS = 0;
7509                     }
7510                 } else {
7511                     secS = 0;
7512                 }
7513             }
7514
7515             while((secT & UCOL_REMOVE_CASE) == 0) {
7516                 if(!isContinuation(*tCE++)) {
7517                     secT = *(tCE-1);
7518                     if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) {
7519                         // primary ignorables should not be considered on the case level when the strength is primary
7520                         // otherwise, the CEs stop being well-formed
7521                         secT &= UCOL_TERT_CASE_MASK;
7522                         secT ^= caseSwitch;
7523                     } else {
7524                         secT = 0;
7525                     }
7526                 } else {
7527                     secT = 0;
7528                 }
7529             }
7530
7531             if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7532                 result = UCOL_LESS;
7533                 goto commonReturn;
7534             } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7535                 result = UCOL_GREATER;
7536                 goto commonReturn;
7537             }
7538
7539             if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7540                 break;
7541             } else {
7542                 secS = 0;
7543                 secT = 0;
7544             }
7545         }
7546     }
7547
7548     /* Tertiary level */
7549     if(checkTertiary) {
7550         secS = 0;
7551         secT = 0;
7552         sCE = sCEs.buf;
7553         tCE = tCEs.buf;
7554         for(;;) {
7555             while((secS & UCOL_REMOVE_CASE) == 0) {
7556                 secS = *(sCE++) & tertiaryMask;
7557                 if(!isContinuation(secS)) {
7558                     secS ^= caseSwitch;
7559                 } else {
7560                     secS &= UCOL_REMOVE_CASE;
7561                 }
7562             }
7563
7564             while((secT & UCOL_REMOVE_CASE)  == 0) {
7565                 secT = *(tCE++) & tertiaryMask;
7566                 if(!isContinuation(secT)) {
7567                     secT ^= caseSwitch;
7568                 } else {
7569                     secT &= UCOL_REMOVE_CASE;
7570                 }
7571             }
7572
7573             if(secS == secT) {
7574                 if((secS & UCOL_REMOVE_CASE) == 1) {
7575                     break;
7576                 } else {
7577                     secS = 0; secT = 0;
7578                     continue;
7579                 }
7580             } else {
7581                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7582                 goto commonReturn;
7583             }
7584         }
7585     }
7586
7587
7588     if(qShifted /*checkQuad*/) {
7589         UBool sInShifted = TRUE;
7590         UBool tInShifted = TRUE;
7591         secS = 0;
7592         secT = 0;
7593         sCE = sCEs.buf;
7594         tCE = tCEs.buf;
7595         for(;;) {
7596             while((secS == 0 && secS != UCOL_NO_MORE_CES) || (isContinuation(secS) && !sInShifted)) {
7597                 secS = *(sCE++);
7598                 if(isContinuation(secS)) {
7599                     if(!sInShifted) {
7600                         continue;
7601                     }
7602                 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7603                     secS = UCOL_PRIMARYMASK;
7604                     sInShifted = FALSE;
7605                 } else {
7606                     sInShifted = TRUE;
7607                 }
7608             }
7609             secS &= UCOL_PRIMARYMASK;
7610
7611
7612             while((secT == 0 && secT != UCOL_NO_MORE_CES) || (isContinuation(secT) && !tInShifted)) {
7613                 secT = *(tCE++);
7614                 if(isContinuation(secT)) {
7615                     if(!tInShifted) {
7616                         continue;
7617                     }
7618                 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7619                     secT = UCOL_PRIMARYMASK;
7620                     tInShifted = FALSE;
7621                 } else {
7622                     tInShifted = TRUE;
7623                 }
7624             }
7625             secT &= UCOL_PRIMARYMASK;
7626
7627             if(secS == secT) {
7628                 if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7629                     break;
7630                 } else {
7631                     secS = 0; secT = 0;
7632                     continue;
7633                 }
7634             } else {
7635                 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7636                 goto commonReturn;
7637             }
7638         }
7639     } else if(doHiragana && hirResult != UCOL_EQUAL) {
7640         // If we're fine on quaternaries, we might be different
7641         // on Hiragana. This, however, might fail us in shifted.
7642         result = hirResult;
7643         goto commonReturn;
7644     }
7645
7646     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7647     /*  as a tiebreaker if all else is equal.                                */
7648     /*  Getting here  should be quite rare - strings are not identical -     */
7649     /*     that is checked first, but compared == through all other checks.  */
7650     if(checkIdent)
7651     {
7652         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7653         result = ucol_checkIdent(sColl, tColl, TRUE, status);
7654     }
7655
7656 commonReturn:
7657     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7658         if (sCEs.buf != sCEs.localArray ) {
7659             uprv_free(sCEs.buf);
7660         }
7661         if (tCEs.buf != tCEs.localArray ) {
7662             uprv_free(tCEs.buf);
7663         }
7664     }
7665
7666     return result;
7667 }
7668
7669 static UCollationResult
7670 ucol_strcollRegular(const UCollator *coll,
7671                     const UChar *source, int32_t sourceLength,
7672                     const UChar *target, int32_t targetLength,
7673                     UErrorCode *status) {
7674     collIterate sColl, tColl;
7675     // Preparing the context objects for iterating over strings
7676     IInit_collIterate(coll, source, sourceLength, &sColl, status);
7677     IInit_collIterate(coll, target, targetLength, &tColl, status);
7678     if(U_FAILURE(*status)) {
7679         return UCOL_LESS;
7680     }
7681     return ucol_strcollRegular(&sColl, &tColl, status);
7682 }
7683
7684 static inline uint32_t
7685 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7686                           uint32_t CE, const UChar *s, int32_t *index, int32_t len)
7687 {
7688     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7689     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7690     int32_t offset = 1;
7691     UChar schar = 0, tchar = 0;
7692
7693     for(;;) {
7694         if(len == -1) {
7695             if(s[*index] == 0) { // end of string
7696                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7697             } else {
7698                 schar = s[*index];
7699             }
7700         } else {
7701             if(*index == len) {
7702                 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7703             } else {
7704                 schar = s[*index];
7705             }
7706         }
7707
7708         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7709             offset++;
7710         }
7711
7712         if (schar == tchar) {
7713             (*index)++;
7714             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7715         }
7716         else
7717         {
7718             if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7719                 return UCOL_BAIL_OUT_CE;
7720             }
7721             // skip completely ignorables
7722             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
7723             if(isZeroCE == 0) { // we have to ignore completely ignorables
7724                 (*index)++;
7725                 continue;
7726             }
7727
7728             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7729         }
7730     }
7731 }
7732
7733
7734 /**
7735  * This is a fast strcoll, geared towards text in Latin-1.
7736  * It supports contractions of size two, French secondaries
7737  * and case switching. You can use it with strengths primary
7738  * to tertiary. It does not support shifted and case level.
7739  * It relies on the table built by setupLatin1Table. If it
7740  * doesn't understand something, it will go to the regular
7741  * strcoll.
7742  */
7743 static UCollationResult
7744 ucol_strcollUseLatin1( const UCollator    *coll,
7745               const UChar        *source,
7746               int32_t            sLen,
7747               const UChar        *target,
7748               int32_t            tLen,
7749               UErrorCode *status)
7750 {
7751     U_ALIGN_CODE(16);
7752     int32_t strength = coll->strength;
7753
7754     int32_t sIndex = 0, tIndex = 0;
7755     UChar sChar = 0, tChar = 0;
7756     uint32_t sOrder=0, tOrder=0;
7757
7758     UBool endOfSource = FALSE;
7759
7760     uint32_t *elements = coll->latinOneCEs;
7761
7762     UBool haveContractions = FALSE; // if we have contractions in our string
7763                                     // we cannot do French secondary
7764
7765     // Do the primary level
7766     for(;;) {
7767         while(sOrder==0) { // this loop skips primary ignorables
7768             // sOrder=getNextlatinOneCE(source);
7769             if(sLen==-1) {   // handling zero terminated strings
7770                 sChar=source[sIndex++];
7771                 if(sChar==0) {
7772                     endOfSource = TRUE;
7773                     break;
7774                 }
7775             } else {        // handling strings with known length
7776                 if(sIndex==sLen) {
7777                     endOfSource = TRUE;
7778                     break;
7779                 }
7780                 sChar=source[sIndex++];
7781             }
7782             if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7783                 //fprintf(stderr, "R");
7784                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7785             }
7786             sOrder = elements[sChar];
7787             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7788                 // specials can basically be either contractions or bail-out signs. If we get anything
7789                 // else, we'll bail out anywasy
7790                 if(getCETag(sOrder) == CONTRACTION_TAG) {
7791                     sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7792                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7793                     // However, if there are contractions in the table, but we always use just one char,
7794                     // we might be able to do French. This should be checked out.
7795                 }
7796                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7797                     //fprintf(stderr, "S");
7798                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7799                 }
7800             }
7801         }
7802
7803         while(tOrder==0) {  // this loop skips primary ignorables
7804             // tOrder=getNextlatinOneCE(target);
7805             if(tLen==-1) {    // handling zero terminated strings
7806                 tChar=target[tIndex++];
7807                 if(tChar==0) {
7808                     if(endOfSource) { // this is different than source loop,
7809                         // as we already know that source loop is done here,
7810                         // so we can either finish the primary loop if both
7811                         // strings are done or anounce the result if only
7812                         // target is done. Same below.
7813                         goto endOfPrimLoop;
7814                     } else {
7815                         return UCOL_GREATER;
7816                     }
7817                 }
7818             } else {          // handling strings with known length
7819                 if(tIndex==tLen) {
7820                     if(endOfSource) {
7821                         goto endOfPrimLoop;
7822                     } else {
7823                         return UCOL_GREATER;
7824                     }
7825                 }
7826                 tChar=target[tIndex++];
7827             }
7828             if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7829                 //fprintf(stderr, "R");
7830                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7831             }
7832             tOrder = elements[tChar];
7833             if(tOrder >= UCOL_NOT_FOUND) {
7834                 // Handling specials, see the comments for source
7835                 if(getCETag(tOrder) == CONTRACTION_TAG) {
7836                     tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
7837                     haveContractions = TRUE;
7838                 }
7839                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
7840                     //fprintf(stderr, "S");
7841                     return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7842                 }
7843             }
7844         }
7845         if(endOfSource) { // source is finished, but target is not, say the result.
7846             return UCOL_LESS;
7847         }
7848
7849         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
7850             sOrder = 0; tOrder = 0;
7851             continue;
7852         } else {
7853             // compare current top bytes
7854             if(((sOrder^tOrder)&0xFF000000)!=0) {
7855                 // top bytes differ, return difference
7856                 if(sOrder < tOrder) {
7857                     return UCOL_LESS;
7858                 } else if(sOrder > tOrder) {
7859                     return UCOL_GREATER;
7860                 }
7861                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
7862                 // since we must return enum value
7863             }
7864
7865             // top bytes match, continue with following bytes
7866             sOrder<<=8;
7867             tOrder<<=8;
7868         }
7869     }
7870
7871 endOfPrimLoop:
7872     // after primary loop, we definitely know the sizes of strings,
7873     // so we set it and use simpler loop for secondaries and tertiaries
7874     sLen = sIndex; tLen = tIndex;
7875     if(strength >= UCOL_SECONDARY) {
7876         // adjust the table beggining
7877         elements += coll->latinOneTableLen;
7878         endOfSource = FALSE;
7879
7880         if(coll->frenchCollation == UCOL_OFF) { // non French
7881             // This loop is a simplified copy of primary loop
7882             // at this point we know that whole strings are latin-1, so we don't
7883             // check for that. We also know that we only have contractions as
7884             // specials.
7885             sIndex = 0; tIndex = 0;
7886             for(;;) {
7887                 while(sOrder==0) {
7888                     if(sIndex==sLen) {
7889                         endOfSource = TRUE;
7890                         break;
7891                     }
7892                     sChar=source[sIndex++];
7893                     sOrder = elements[sChar];
7894                     if(sOrder > UCOL_NOT_FOUND) {
7895                         sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
7896                     }
7897                 }
7898
7899                 while(tOrder==0) {
7900                     if(tIndex==tLen) {
7901                         if(endOfSource) {
7902                             goto endOfSecLoop;
7903                         } else {
7904                             return UCOL_GREATER;
7905                         }
7906                     }
7907                     tChar=target[tIndex++];
7908                     tOrder = elements[tChar];
7909                     if(tOrder > UCOL_NOT_FOUND) {
7910                         tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
7911                     }
7912                 }
7913                 if(endOfSource) {
7914                     return UCOL_LESS;
7915                 }
7916
7917                 if(sOrder == tOrder) {
7918                     sOrder = 0; tOrder = 0;
7919                     continue;
7920                 } else {
7921                     // see primary loop for comments on this
7922                     if(((sOrder^tOrder)&0xFF000000)!=0) {
7923                         if(sOrder < tOrder) {
7924                             return UCOL_LESS;
7925                         } else if(sOrder > tOrder) {
7926                             return UCOL_GREATER;
7927                         }
7928                     }
7929                     sOrder<<=8;
7930                     tOrder<<=8;
7931                 }
7932             }
7933         } else { // French
7934             if(haveContractions) { // if we have contractions, we have to bail out
7935                 // since we don't really know how to handle them here
7936                 return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7937             }
7938             // For French, we go backwards
7939             sIndex = sLen; tIndex = tLen;
7940             for(;;) {
7941                 while(sOrder==0) {
7942                     if(sIndex==0) {
7943                         endOfSource = TRUE;
7944                         break;
7945                     }
7946                     sChar=source[--sIndex];
7947                     sOrder = elements[sChar];
7948                     // don't even look for contractions
7949                 }
7950
7951                 while(tOrder==0) {
7952                     if(tIndex==0) {
7953                         if(endOfSource) {
7954                             goto endOfSecLoop;
7955                         } else {
7956                             return UCOL_GREATER;
7957                         }
7958                     }
7959                     tChar=target[--tIndex];
7960                     tOrder = elements[tChar];
7961                     // don't even look for contractions
7962                 }
7963                 if(endOfSource) {
7964                     return UCOL_LESS;
7965                 }
7966
7967                 if(sOrder == tOrder) {
7968                     sOrder = 0; tOrder = 0;
7969                     continue;
7970                 } else {
7971                     // see the primary loop for comments
7972                     if(((sOrder^tOrder)&0xFF000000)!=0) {
7973                         if(sOrder < tOrder) {
7974                             return UCOL_LESS;
7975                         } else if(sOrder > tOrder) {
7976                             return UCOL_GREATER;
7977                         }
7978                     }
7979                     sOrder<<=8;
7980                     tOrder<<=8;
7981                 }
7982             }
7983         }
7984     }
7985
7986 endOfSecLoop:
7987     if(strength >= UCOL_TERTIARY) {
7988         // tertiary loop is the same as secondary (except no French)
7989         elements += coll->latinOneTableLen;
7990         sIndex = 0; tIndex = 0;
7991         endOfSource = FALSE;
7992         for(;;) {
7993             while(sOrder==0) {
7994                 if(sIndex==sLen) {
7995                     endOfSource = TRUE;
7996                     break;
7997                 }
7998                 sChar=source[sIndex++];
7999                 sOrder = elements[sChar];
8000                 if(sOrder > UCOL_NOT_FOUND) {
8001                     sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8002                 }
8003             }
8004             while(tOrder==0) {
8005                 if(tIndex==tLen) {
8006                     if(endOfSource) {
8007                         return UCOL_EQUAL; // if both strings are at the end, they are equal
8008                     } else {
8009                         return UCOL_GREATER;
8010                     }
8011                 }
8012                 tChar=target[tIndex++];
8013                 tOrder = elements[tChar];
8014                 if(tOrder > UCOL_NOT_FOUND) {
8015                     tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8016                 }
8017             }
8018             if(endOfSource) {
8019                 return UCOL_LESS;
8020             }
8021             if(sOrder == tOrder) {
8022                 sOrder = 0; tOrder = 0;
8023                 continue;
8024             } else {
8025                 if(((sOrder^tOrder)&0xff000000)!=0) {
8026                     if(sOrder < tOrder) {
8027                         return UCOL_LESS;
8028                     } else if(sOrder > tOrder) {
8029                         return UCOL_GREATER;
8030                     }
8031                 }
8032                 sOrder<<=8;
8033                 tOrder<<=8;
8034             }
8035         }
8036     }
8037     return UCOL_EQUAL;
8038 }
8039
8040 /*
8041   Note: ucol_strcollUTF8 supports NUL-terminated input. Calculating the length
8042   of a NUL-terminated input string takes extra CPU cycles, so the lengths are
       handed to uiter_setUTF8() exactly as received instead of being computed here.

       ucol_strcollRegularUTF8: general (non fast-path) comparison of two UTF-8
       strings. Both strings are wrapped in UCharIterators, optionally routed
       through FCD-normalizing iterators when the collator's normalization mode is
       on, and then compared with the collIterate flavour of ucol_strcollRegular().

       coll          collator to use
       source        UTF-8 source text
       sourceLength  length of source as accepted by uiter_setUTF8()
       target        UTF-8 target text
       targetLength  length of target as accepted by uiter_setUTF8()
       status        in/out error code; UCOL_EQUAL is returned if it reports a
                     failure before the comparison can start
       returns       the result of ucol_strcollRegular(), or UCOL_EQUAL on failure
8043 */
8044 static UCollationResult
8045 ucol_strcollRegularUTF8(
8046                     const UCollator *coll,
8047                     const char      *source,
8048                     int32_t         sourceLength,
8049                     const char      *target,
8050                     int32_t         targetLength,
8051                     UErrorCode      *status)
8052 {
8053     UCharIterator src;
8054     UCharIterator tgt;
8055
8056     uiter_setUTF8(&src, source, sourceLength);
8057     uiter_setUTF8(&tgt, target, targetLength);
8058
8059     // Preparing the context objects for iterating over strings
8060     collIterate sColl, tColl;
8061     IInit_collIterate(coll, NULL, -1, &sColl, status);
8062     IInit_collIterate(coll, NULL, -1, &tColl, status);
8063     if(U_FAILURE(*status)) {
8064         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8065         return UCOL_EQUAL;
8066     }
8067     // The division for the array length may truncate the array size to
8068     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8069     // for all platforms anyway.
8070     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8071     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8072     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8073
8074     sColl.iterator = &src;
8075     sColl.flags |= UCOL_USE_ITERATOR;
8076     tColl.flags |= UCOL_USE_ITERATOR;
8077     tColl.iterator = &tgt;
8078
8079     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
             // Read both strings through FCD-normalizing iterators; the collation
             // iterators then no longer need to normalize on their own.
8080         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8081         sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
8082         sColl.flags &= ~UCOL_ITER_NORM;
8083
8084         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8085         tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
8086         tColl.flags &= ~UCOL_ITER_NORM;
8087     }
8088
         // Only compare when the iterator setup above succeeded; after a failure
         // the iterators stored in sColl/tColl must not be used.
         UCollationResult result = UCOL_EQUAL;
         if(U_SUCCESS(*status)) {
8089         result = ucol_strcollRegular(&sColl, &tColl, status);
         }

         // Close the normalizing iterators, exactly as ucol_strcollIter() does,
         // so that anything they acquired beyond the stack buffers is released.
         if(sNormIter || tNormIter) {
             unorm_closeIter(sNormIter);
             unorm_closeIter(tNormIter);
         }

         return result;
8090 }
8091
     /*
      * UTF-8 variant of the Latin-1 fast-path contraction lookup.
      *
      * CE is a contraction-tagged entry taken from coll->latinOneCEs. Its low
      * 12 bits locate the contraction's list of 16-bit code units inside
      * coll->image, and bits 12..23 give the index of the corresponding results
      * inside each level of coll->latinOneCEs. The code point at s[*index] is
      * examined without being consumed:
      *  - end of input (*index == len, or a NUL when len < 0): the result for
      *    the starting character alone is returned;
      *  - it equals a listed code unit: it is consumed and the result stored at
      *    the same offset is returned;
      *  - it lies outside Latin-1: UCOL_BAIL_OUT_CE is returned so that the
      *    caller can fall back to the regular comparison;
      *  - its trie value is 0 (completely ignorable): it is skipped and the
      *    following code point is examined instead;
      *  - anything else: the result for the starting character alone.
      *
      * @param coll      collator owning the tables
      * @param strength  level used to select the sub-table of coll->latinOneCEs
      * @param CE        contraction special CE
      * @param s         UTF-8 text
      * @param index     in/out: byte offset of the next unread code point in s
      * @param len       byte length of s, or negative if s is NUL-terminated
      * @return the collation element for the requested level, or UCOL_BAIL_OUT_CE
      */
8092 static inline uint32_t
8093 ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
8094                           uint32_t CE, const char *s, int32_t *index, int32_t len)
8095 {
8096     const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8097     int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8098     int32_t offset = 1;
8099     UChar32 schar = 0, tchar = 0;
8100
8101     for(;;) {
8102         if (*index == len) {
8103             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8104         }
             // Peek at the next code point; *index is only advanced once we know
             // that the code point is really consumed.
8105         U8_GET_OR_FFFD((const uint8_t*)s, 0, *index, len, schar);
8106         if (len < 0 && schar == 0) {
8107             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8108         }
8109
             // The table below holds 16-bit code units, so a supplementary code
             // point is larger than every possible entry: the scan could never
             // stop and would read past the table. Such a code point can neither
             // match nor be Latin-1, so bail out right away.
             if (schar > 0xFFFF) {
                 return UCOL_BAIL_OUT_CE;
             }

8110         while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8111             offset++;
8112         }
8113
8114         if (schar == tchar) {
8115             U8_FWD_1(s, *index, len);
8116             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8117         }
8118         else
8119         {
                 // Same non-Latin-1 test as in ucol_strcollUseLatin1UTF8(): schar is
                 // a 32-bit code point, so every bit above the low byte is checked.
8120             if(schar & 0xFFFFFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8121                 return UCOL_BAIL_OUT_CE;
8122             }
8123             // skip completely ignorables
8124             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar);
8125             if(isZeroCE == 0) { // we have to ignore completely ignorables
8126                 U8_FWD_1(s, *index, len);
8127                 continue;
8128             }
8129
8130             return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8131         }
8132     }
8133 }
8134
     /*
      * Latin-1 fast path for comparing two UTF-8 strings.
      *
      * Every code point is looked up directly in coll->latinOneCEs and the
      * strings are compared one strength level at a time (primary, then
      * secondary, then tertiary, as far as coll->strength asks for). As soon as
      * a code point above U+00FF or a special CE that cannot be resolved here
      * is met, the whole comparison is handed to ucol_strcollRegularUTF8().
      *
      * @param coll    collator; strength, frenchCollation, latinOneCEs and
      *                latinOneTableLen are read from it
      * @param source  UTF-8 source text
      * @param sLen    byte length of source, or negative if it is NUL-terminated
      * @param target  UTF-8 target text
      * @param tLen    byte length of target, or negative if it is NUL-terminated
      * @param status  not examined here; only passed on to
      *                ucol_strcollRegularUTF8()
      * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER, or whatever
      *         ucol_strcollRegularUTF8() returns after a bail-out
      */
8135 static inline UCollationResult
8136 ucol_strcollUseLatin1UTF8(
8137                 const UCollator *coll,
8138                 const char      *source,
8139                 int32_t         sLen,
8140                 const char      *target,
8141                 int32_t         tLen,
8142                 UErrorCode      *status)
8143 {
8144     U_ALIGN_CODE(16);
8145     int32_t strength = coll->strength;
8146
8147     int32_t sIndex = 0, tIndex = 0;     // byte offsets into source / target
8148     UChar32 sChar = 0, tChar = 0;
         // Current CEs. They are compared one byte at a time from the top and
         // shifted left afterwards; 0 means "fetch the next CE".
8149     uint32_t sOrder=0, tOrder=0;
8150
8151     UBool endOfSource = FALSE;
8152
         // Table for the current level; advanced by latinOneTableLen per level.
8153     uint32_t *elements = coll->latinOneCEs;
8154
8155     UBool haveContractions = FALSE; // if we have contractions in our string
8156                                     // we cannot do French secondary
8157
8158     // Do the primary level
8159     for(;;) {
8160         while(sOrder==0) { // this loop skips primary ignorables
8161             // sOrder=getNextlatinOneCE(source);
8162             if (sIndex == sLen) {
8163                 endOfSource = TRUE;
8164                 break;
8165             }
8166             U8_NEXT_OR_FFFD(source, sIndex, sLen ,sChar);
8167             if (sLen < 0 && sChar == 0) {
                     // NUL-terminated input: the terminator has just been read,
                     // so from now on the length is known explicitly.
8168                 endOfSource = TRUE;
8169                 sLen = sIndex;
8170                 break;
8171             }
8172             if(sChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8173                 //fprintf(stderr, "R");
8174                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8175             }
8176             sOrder = elements[sChar];
8177             if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8178                 // specials can basically be either contractions or bail-out signs. If we get anything
8179                 // else, we'll bail out anyway
8180                 if(getCETag(sOrder) == CONTRACTION_TAG) {
8181                     sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8182                     haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8183                     // However, if there are contractions in the table, but we always use just one char,
8184                     // we might be able to do French. This should be checked out.
8185                 }
8186                 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8187                     //fprintf(stderr, "S");
8188                     return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8189                 }
8190             }
8191         }
8192
8193         while(tOrder==0) {  // this loop skips primary ignorables
8194             // tOrder=getNextlatinOneCE(target);
8195             if (tIndex == tLen) {
                     // This is different from the source loop: we already know
                     // whether the source is done, so we either finish the primary
                     // level (both strings are done) or announce the result (only
                     // the target is done). Same below.
8196                 if(endOfSource) {
8197                     goto endOfPrimLoopU8;
8198                 } else {
8199                     return UCOL_GREATER;
8200                 }
8201             }
8202             U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8203             if (tLen < 0 && tChar == 0) {
8204                 if(endOfSource) {
8205                     tLen = tIndex;
8206                     goto endOfPrimLoopU8;
8207                 } else {
8208                     return UCOL_GREATER;
8209                 }
8210             }
8211             if(tChar&0xFFFFFF00) { // if we encounter non-latin-1, we bail out (tChar > 0xFF, but this is faster on win32)
8212                 //fprintf(stderr, "R");
8213                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8214             }
8215             tOrder = elements[tChar];
8216             if(tOrder >= UCOL_NOT_FOUND) {
8217                 // Handling specials, see the comments for source
8218                 if(getCETag(tOrder) == CONTRACTION_TAG) {
8219                     tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8220                     haveContractions = TRUE;
8221                 }
8222                 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8223                     //fprintf(stderr, "S");
8224                     return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8225                 }
8226             }
8227         }
8228         if(endOfSource) { // source is finished, but target is not, say the result.
8229             return UCOL_LESS;
8230         }
8231
8232         if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8233             sOrder = 0; tOrder = 0;
8234             continue;
8235         } else {
8236             // compare current top bytes
8237             if(((sOrder^tOrder)&0xFF000000)!=0) {
8238                 // top bytes differ, return difference
8239                 if(sOrder < tOrder) {
8240                     return UCOL_LESS;
8241                 } else if(sOrder > tOrder) {
8242                     return UCOL_GREATER;
8243                 }
8244                 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8245                 // since we must return enum value
8246             }
8247
8248             // top bytes match, continue with following bytes
8249             sOrder<<=8;
8250             tOrder<<=8;
8251         }
8252     }
8253
8254 endOfPrimLoopU8:
8255     // after primary loop, we definitely know the sizes of strings,
8256     // so we set it and use simpler loop for secondaries and tertiaries
8257     sLen = sIndex; tLen = tIndex;
8258     if(strength >= UCOL_SECONDARY) {
8259         // adjust the table beginning
8260         elements += coll->latinOneTableLen;
8261         endOfSource = FALSE;
8262
8263         if(coll->frenchCollation == UCOL_OFF) { // non French
8264             // This loop is a simplified copy of primary loop
8265             // at this point we know that whole strings are latin-1, so we don't
8266             // check for that. We also know that we only have contractions as
8267             // specials.
8268             sIndex = 0; tIndex = 0;
8269             for(;;) {
8270                 while(sOrder==0) {
8271                     if(sIndex==sLen) {
8272                         endOfSource = TRUE;
8273                         break;
8274                     }
8275                     U_ASSERT(sLen >= 0);
8276                     U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8277                     U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8278                     sOrder = elements[sChar];
8279                     if(sOrder > UCOL_NOT_FOUND) {
8280                         sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8281                     }
8282                 }
8283
8284                 while(tOrder==0) {
8285                     if(tIndex==tLen) {
8286                         if(endOfSource) {
8287                             goto endOfSecLoopU8;
8288                         } else {
8289                             return UCOL_GREATER;
8290                         }
8291                     }
8292                     U_ASSERT(tLen >= 0);
8293                     U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8294                     U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8295                     tOrder = elements[tChar];
8296                     if(tOrder > UCOL_NOT_FOUND) {
8297                         tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8298                     }
8299                 }
8300                 if(endOfSource) {
8301                     return UCOL_LESS;
8302                 }
8303
8304                 if(sOrder == tOrder) {
8305                     sOrder = 0; tOrder = 0;
8306                     continue;
8307                 } else {
8308                     // see primary loop for comments on this
8309                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8310                         if(sOrder < tOrder) {
8311                             return UCOL_LESS;
8312                         } else if(sOrder > tOrder) {
8313                             return UCOL_GREATER;
8314                         }
8315                     }
8316                     sOrder<<=8;
8317                     tOrder<<=8;
8318                 }
8319             }
8320         } else { // French
8321             if(haveContractions) { // if we have contractions, we have to bail out
8322                 // since we don't really know how to handle them here
8323                 return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
8324             }
8325             // For French, we go backwards
8326             sIndex = sLen; tIndex = tLen;
8327             for(;;) {
8328                 while(sOrder==0) {
8329                     if(sIndex==0) {
8330                         endOfSource = TRUE;
8331                         break;
8332                     }
8333                     U8_PREV_OR_FFFD(source, 0, sIndex, sChar);
8334                     U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8335                     sOrder = elements[sChar];
8336                     // don't even look for contractions
8337                 }
8338
8339                 while(tOrder==0) {
8340                     if(tIndex==0) {
8341                         if(endOfSource) {
8342                             goto endOfSecLoopU8;
8343                         } else {
8344                             return UCOL_GREATER;
8345                         }
8346                     }
8347                     U8_PREV_OR_FFFD(target, 0, tIndex, tChar);
8348                     U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8349                     tOrder = elements[tChar];
8350                     // don't even look for contractions
8351                 }
8352                 if(endOfSource) {
8353                     return UCOL_LESS;
8354                 }
8355
8356                 if(sOrder == tOrder) {
8357                     sOrder = 0; tOrder = 0;
8358                     continue;
8359                 } else {
8360                     // see the primary loop for comments
8361                     if(((sOrder^tOrder)&0xFF000000)!=0) {
8362                         if(sOrder < tOrder) {
8363                             return UCOL_LESS;
8364                         } else if(sOrder > tOrder) {
8365                             return UCOL_GREATER;
8366                         }
8367                     }
8368                     sOrder<<=8;
8369                     tOrder<<=8;
8370                 }
8371             }
8372         }
8373     }
8374
8375 endOfSecLoopU8:
8376     if(strength >= UCOL_TERTIARY) {
8377         // tertiary loop is the same as secondary (except no French)
8378         elements += coll->latinOneTableLen;
8379         sIndex = 0; tIndex = 0;
8380         endOfSource = FALSE;
8381         for(;;) {
8382             while(sOrder==0) {
8383                 if(sIndex==sLen) {
8384                     endOfSource = TRUE;
8385                     break;
8386                 }
8387                 U_ASSERT(sLen >= 0);
8388                 U8_NEXT_OR_FFFD(source, sIndex, sLen, sChar);
8389                 U_ASSERT(sChar >= 0 && sChar <= 0xFF);
8390                 sOrder = elements[sChar];
8391                 if(sOrder > UCOL_NOT_FOUND) {
8392                     sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8393                 }
8394             }
8395             while(tOrder==0) {
8396                 if(tIndex==tLen) {
8397                     if(endOfSource) {
8398                         return UCOL_EQUAL; // if both strings are at the end, they are equal
8399                     } else {
8400                         return UCOL_GREATER;
8401                     }
8402                 }
8403                 U_ASSERT(tLen >= 0);
8404                 U8_NEXT_OR_FFFD(target, tIndex, tLen, tChar);
8405                 U_ASSERT(tChar >= 0 && tChar <= 0xFF);
8406                 tOrder = elements[tChar];
8407                 if(tOrder > UCOL_NOT_FOUND) {
8408                     tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8409                 }
8410             }
8411             if(endOfSource) {
8412                 return UCOL_LESS;
8413             }
8414             if(sOrder == tOrder) {
8415                 sOrder = 0; tOrder = 0;
8416                 continue;
8417             } else {
                     // see the primary loop for comments
8418                 if(((sOrder^tOrder)&0xff000000)!=0) {
8419                     if(sOrder < tOrder) {
8420                         return UCOL_LESS;
8421                     } else if(sOrder > tOrder) {
8422                         return UCOL_GREATER;
8423                     }
8424                 }
8425                 sOrder<<=8;
8426                 tOrder<<=8;
8427             }
8428         }
8429     }
         // Reached when the requested strength stops before the tertiary level
         // and no difference was found on the levels that were compared.
8430     return UCOL_EQUAL;
8431 }
8432
     /*
      * Compares the text delivered by two UCharIterators.
      *
      * The common prefix of both texts is skipped first; both iterators are
      * then moved back to a position from which the comparison can safely
      * restart (see ucol_unsafeCP()) and the rest is compared with
      * ucol_strcollRegular(). When the collator's normalization mode is on,
      * both texts are read through FCD-normalizing iterators.
      *
      * @param coll   collator to use; must not be NULL
      * @param sIter  iterator over the source text; must not be NULL
      * @param tIter  iterator over the target text; must not be NULL
      * @param status in/out error code. Nothing is done if it is NULL or already
      *               reports a failure; set to U_ILLEGAL_ARGUMENT_ERROR if coll
      *               or one of the iterators is NULL
      * @return the comparison result; UCOL_EQUAL in every error case and when
      *         sIter and tIter are the same iterator
      */
8433 U_CAPI UCollationResult U_EXPORT2
8434 ucol_strcollIter( const UCollator    *coll,
8435                  UCharIterator *sIter,
8436                  UCharIterator *tIter,
8437                  UErrorCode         *status)
8438 {
8439     if(!status || U_FAILURE(*status)) {
8440         return UCOL_EQUAL;
8441     }
8442
8443     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8444     UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8445
8446     if (sIter == tIter) {
8447         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8448         return UCOL_EQUAL;
8449     }
8450     if(sIter == NULL || tIter == NULL || coll == NULL) {
8451         *status = U_ILLEGAL_ARGUMENT_ERROR;
8452         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8453         return UCOL_EQUAL;
8454     }
8455
8456     UCollationResult result = UCOL_EQUAL;
8457
8458     // Preparing the context objects for iterating over strings
8459     collIterate sColl, tColl;
8460     IInit_collIterate(coll, NULL, -1, &sColl, status);
8461     IInit_collIterate(coll, NULL, -1, &tColl, status);
8462     if(U_FAILURE(*status)) {
8463         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8464         return UCOL_EQUAL;
8465     }
8466     // The division for the array length may truncate the array size to
8467     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8468     // for all platforms anyway.
8469     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8470     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8471     UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8472
8473     sColl.iterator = sIter;
8474     sColl.flags |= UCOL_USE_ITERATOR;
8475     tColl.flags |= UCOL_USE_ITERATOR;
8476     tColl.iterator = tIter;
8477
8478     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
             // Read both texts through FCD-normalizing iterators; the collation
             // iterators then no longer need to normalize on their own.
8479         sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8480         sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8481         sColl.flags &= ~UCOL_ITER_NORM;
8482
8483         tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8484         tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8485         tColl.flags &= ~UCOL_ITER_NORM;
8486     }
8487
8488     UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8489
         // If setting up the normalizing iterators failed, the iterators stored in
         // sColl/tColl must not be used (NOTE(review): unorm_setIter() is expected
         // to return NULL on error - confirm). The result stays UCOL_EQUAL, which
         // is what the status check further down produced as well. This check
         // sits after the declarations above so that the jump does not bypass
         // their initialization.
         if(U_FAILURE(*status)) {
             goto end_compare;
         }

         // Skip the identical prefix of both texts.
8490     while((sChar = sColl.iterator->next(sColl.iterator)) ==
8491         (tChar = tColl.iterator->next(tColl.iterator))) {
8492             if(sChar == U_SENTINEL) {
                     // Both texts ended at the same place without any difference.
8493                 result = UCOL_EQUAL;
8494                 goto end_compare;
8495             }
8496     }
8497
         // If one text ended here, step the other iterator back once before both
         // iterators are stepped back below.
8498     if(sChar == U_SENTINEL) {
8499         tChar = tColl.iterator->previous(tColl.iterator);
8500     }
8501
8502     if(tChar == U_SENTINEL) {
8503         sChar = sColl.iterator->previous(sColl.iterator);
8504     }
8505
8506     sChar = sColl.iterator->previous(sColl.iterator);
8507     tChar = tColl.iterator->previous(tColl.iterator);
8508
8509     if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8510     {
8511         // We are stopped in the middle of a contraction.
8512         // Scan backwards through the == part of the string looking for the start of the contraction.
8513         //   It doesn't matter which string we scan, since they are the same in this region.
8514         do
8515         {
8516             sChar = sColl.iterator->previous(sColl.iterator);
8517             tChar = tColl.iterator->previous(tColl.iterator);
8518         }
8519         while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8520     }
8521
8522
8523     if(U_SUCCESS(*status)) {
8524         result = ucol_strcollRegular(&sColl, &tColl, status);
8525     }
8526
8527 end_compare:
         // Close the normalizing iterators if any were opened above.
8528     if(sNormIter || tNormIter) {
8529         unorm_closeIter(sNormIter);
8530         unorm_closeIter(tNormIter);
8531     }
8532
8533     UTRACE_EXIT_VALUE_STATUS(result, *status)
8534     return result;
8535 }
8536
8537
8538 /*                                                                      */
8539 /* ucol_strcoll     Main public API string comparison function          */
8540 /*                                                                      */
8541 U_CAPI UCollationResult U_EXPORT2
8542 ucol_strcoll( const UCollator    *coll,
8543               const UChar        *source,
8544               int32_t            sourceLength,
8545               const UChar        *target,
8546               int32_t            targetLength)
8547 {
8548     U_ALIGN_CODE(16);
8549
8550     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8551     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8552         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8553         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8554         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8555     }
8556
8557     if(source == NULL || target == NULL) {
8558         // do not crash, but return. Should have
8559         // status argument to return error.
8560         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8561         return UCOL_EQUAL;
8562     }
8563
8564     /* Quick check if source and target are same strings. */
8565     /* They should either both be NULL terminated or the explicit length should be set on both. */
8566     if (source==target && sourceLength==targetLength) {
8567         UTRACE_EXIT_VALUE(UCOL_EQUAL);
8568         return UCOL_EQUAL;
8569     }
8570
8571     if(coll->delegate != NULL) {
8572       UErrorCode status = U_ZERO_ERROR;
8573       return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
8574     }
8575
8576     /* Scan the strings.  Find:                                                             */
8577     /*    The length of any leading portion that is equal                                   */
8578     /*    Whether they are exactly equal.  (in which case we just return)                   */
8579     const UChar    *pSrc    = source;
8580     const UChar    *pTarg   = target;
8581     int32_t        equalLength;
8582
8583     if (sourceLength == -1 && targetLength == -1) {
8584         // Both strings are null terminated.
8585         //    Scan through any leading equal portion.
8586         while (*pSrc == *pTarg && *pSrc != 0) {
8587             pSrc++;
8588             pTarg++;
8589         }
8590         if (*pSrc == 0 && *pTarg == 0) {
8591             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8592             return UCOL_EQUAL;
8593         }
8594         equalLength = (int32_t)(pSrc - source);
8595     }
8596     else
8597     {
8598         // One or both strings has an explicit length.
8599         const UChar    *pSrcEnd = source + sourceLength;
8600         const UChar    *pTargEnd = target + targetLength;
8601
8602         // Scan while the strings are bitwise ==, or until one is exhausted.
8603         for (;;) {
8604             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8605                 break;
8606             }
8607             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8608                 break;
8609             }
8610             if (*pSrc != *pTarg) {
8611                 break;
8612             }
8613             pSrc++;
8614             pTarg++;
8615         }
8616         equalLength = (int32_t)(pSrc - source);
8617
8618         // If we made it all the way through both strings, we are done.  They are ==
8619         if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8620             (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))     /* and also at end of dest string                  */
8621         {
8622             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8623             return UCOL_EQUAL;
8624         }
8625     }
8626     if (equalLength > 0) {
8627         /* There is an identical portion at the beginning of the two strings.        */
8628         /*   If the identical portion ends within a contraction or a comibining      */
8629         /*   character sequence, back up to the start of that sequence.              */
8630
8631         // These values should already be set by the code above.
8632         //pSrc  = source + equalLength;        /* point to the first differing chars   */
8633         //pTarg = target + equalLength;
8634         if ((pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll)) ||
8635             (pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)))
8636         {
8637             // We are stopped in the middle of a contraction.
8638             // Scan backwards through the == part of the string looking for the start of the contraction.
8639             //   It doesn't matter which string we scan, since they are the same in this region.
8640             do
8641             {
8642                 equalLength--;
8643                 pSrc--;
8644             }
8645             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8646         }
8647
8648         source += equalLength;
8649         target += equalLength;
8650         if (sourceLength > 0) {
8651             sourceLength -= equalLength;
8652         }
8653         if (targetLength > 0) {
8654             targetLength -= equalLength;
8655         }
8656     }
8657
8658     UErrorCode status = U_ZERO_ERROR;
8659     UCollationResult returnVal;
8660     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8661         returnVal = ucol_strcollRegular(coll, source, sourceLength, target, targetLength, &status);
8662     } else {
8663         returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8664     }
8665     UTRACE_EXIT_VALUE(returnVal);
8666     return returnVal;
8667 }
8668
8669 U_CAPI UCollationResult U_EXPORT2
8670 ucol_strcollUTF8(
8671         const UCollator *coll,
8672         const char      *source,
8673         int32_t         sourceLength,
8674         const char      *target,
8675         int32_t         targetLength,
8676         UErrorCode      *status)
8677 {
8678     U_ALIGN_CODE(16);
8679
8680     UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
8681     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8682         UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8683         UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
8684         UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
8685     }
8686
8687     if (U_FAILURE(*status)) {
8688         /* do nothing */
8689         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8690         return UCOL_EQUAL;
8691     }
8692
8693     if(source == NULL || target == NULL) {
8694         *status = U_ILLEGAL_ARGUMENT_ERROR;
8695         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8696         return UCOL_EQUAL;
8697     }
8698
8699     /* Quick check if source and target are same strings. */
8700     /* They should either both be NULL terminated or the explicit length should be set on both. */
8701     if (source==target && sourceLength==targetLength) {
8702         UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8703         return UCOL_EQUAL;
8704     }
8705
8706     if(coll->delegate != NULL) {
8707         return ((const Collator*)coll->delegate)->compareUTF8(
8708             StringPiece(source, (sourceLength < 0) ? uprv_strlen(source) : sourceLength),
8709             StringPiece(target, (targetLength < 0) ? uprv_strlen(target) : targetLength),
8710             *status);
8711     }
8712
8713     /* Scan the strings.  Find:                                                             */
8714     /*    The length of any leading portion that is equal                                   */
8715     /*    Whether they are exactly equal.  (in which case we just return)                   */
8716     const char  *pSrc = source;
8717     const char  *pTarg = target;
8718     UBool       bSrcLimit = FALSE;
8719     UBool       bTargLimit = FALSE;
8720
8721     if (sourceLength == -1 && targetLength == -1) {
8722         // Both strings are null terminated.
8723         //    Scan through any leading equal portion.
8724         while (*pSrc == *pTarg && *pSrc != 0) {
8725             pSrc++;
8726             pTarg++;
8727         }
8728         if (*pSrc == 0 && *pTarg == 0) {
8729             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8730             return UCOL_EQUAL;
8731         }
8732         bSrcLimit = (*pSrc == 0);
8733         bTargLimit = (*pTarg == 0);
8734     }
8735     else
8736     {
8737         // One or both strings has an explicit length.
8738         const char *pSrcEnd = source + sourceLength;
8739         const char *pTargEnd = target + targetLength;
8740
8741         // Scan while the strings are bitwise ==, or until one is exhausted.
8742         for (;;) {
8743             if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8744                 break;
8745             }
8746             if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8747                 break;
8748             }
8749             if (*pSrc != *pTarg) {
8750                 break;
8751             }
8752             pSrc++;
8753             pTarg++;
8754         }
8755         bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
8756         bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));
8757
8758         // If we made it all the way through both strings, we are done.  They are ==
8759         if (bSrcLimit &&    /* At end of src string, however it was specified. */
8760             bTargLimit)     /* and also at end of dest string                  */
8761         {
8762             UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
8763             return UCOL_EQUAL;
8764         }
8765     }
8766
8767     U_ASSERT(!(bSrcLimit && bTargLimit));
8768
8769     int32_t    equalLength = pSrc - source;
8770     UBool       bSawNonLatin1 = FALSE;
8771
8772     if (equalLength > 0) {
8773         // Align position to the start of UTF-8 code point.
8774         if (bTargLimit) {
8775             U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
8776         } else {
8777             U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
8778         }
8779         pSrc = source + equalLength;
8780         pTarg = target + equalLength;
8781     }
8782
8783     if (equalLength > 0) {
8784         /* There is an identical portion at the beginning of the two strings.        */
8785         /*   If the identical portion ends within a contraction or a comibining      */
8786         /*   character sequence, back up to the start of that sequence.              */
8787         UBool bUnsafeCP = FALSE;
8788         UChar32 uc32 = -1;
8789
8790         if (!bSrcLimit) {
8791             U8_GET_OR_FFFD((const uint8_t*)source, 0, equalLength, sourceLength, uc32);
8792             if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8793                 bUnsafeCP = TRUE;
8794             }
8795             bSawNonLatin1 |= (uc32 > 0xff);
8796         }
8797         if (!bTargLimit) {
8798             U8_GET_OR_FFFD((const uint8_t*)target, 0, equalLength, targetLength, uc32);
8799             if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
8800                 bUnsafeCP = TRUE;
8801             }
8802             bSawNonLatin1 |= (uc32 > 0xff);
8803         }
8804
8805         if (bUnsafeCP) {
8806             while (equalLength > 0) {
8807                 // We are stopped in the middle of a contraction.
8808                 // Scan backwards through the == part of the string looking for the start of the contraction.
8809                 //   It doesn't matter which string we scan, since they are the same in this region.
8810                 U8_PREV_OR_FFFD((uint8_t*)source, 0, equalLength, uc32);
8811                 bSawNonLatin1 |= (uc32 > 0xff);
8812                 if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
8813                     break;
8814                 }
8815             }
8816         }
8817         source += equalLength;
8818         target += equalLength;
8819         if (sourceLength > 0) {
8820             sourceLength -= equalLength;
8821         }
8822         if (targetLength > 0) {
8823             targetLength -= equalLength;
8824         }
8825     } else {
8826         // Lead byte of Latin 1 character is 0x00 - 0xC3
8827         bSawNonLatin1 = (source && (sourceLength != 0) && (uint8_t)*source > 0xc3);
8828         bSawNonLatin1 |= (target && (targetLength != 0) && (uint8_t)*target > 0xc3);
8829     }
8830
8831     UCollationResult returnVal;
8832
8833     if(!coll->latinOneUse || bSawNonLatin1) {
8834         returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
8835     } else {
8836         returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
8837     }
8838     UTRACE_EXIT_VALUE_STATUS(returnVal, *status);
8839     return returnVal;
8840 }
8841
8842
8843 /* convenience function for comparing strings */
8844 U_CAPI UBool U_EXPORT2
8845 ucol_greater(    const    UCollator        *coll,
8846         const    UChar            *source,
8847         int32_t            sourceLength,
8848         const    UChar            *target,
8849         int32_t            targetLength)
8850 {
8851     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8852         == UCOL_GREATER);
8853 }
8854
8855 /* convenience function for comparing strings */
8856 U_CAPI UBool U_EXPORT2
8857 ucol_greaterOrEqual(    const    UCollator    *coll,
8858             const    UChar        *source,
8859             int32_t        sourceLength,
8860             const    UChar        *target,
8861             int32_t        targetLength)
8862 {
8863     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8864         != UCOL_LESS);
8865 }
8866
8867 /* convenience function for comparing strings */
8868 U_CAPI UBool U_EXPORT2
8869 ucol_equal(        const    UCollator        *coll,
8870             const    UChar            *source,
8871             int32_t            sourceLength,
8872             const    UChar            *target,
8873             int32_t            targetLength)
8874 {
8875     return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8876         == UCOL_EQUAL);
8877 }
8878
8879 U_CAPI void U_EXPORT2
8880 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
8881     if(coll && coll->UCA) {
8882         uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
8883     }
8884 }
8885
8886 #endif /* #if !UCONFIG_NO_COLLATION */