icuSources/i18n/ucol.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 1996-2003, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  ucol.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 * Modification history
  12 * Date        Name      Comments
  13 * 1996-1999   various members of ICU team maintained C API for collation framework
  14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
  15 * 03/01/2001  synwee    Added maxexpansion functionality.
  16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "uassert.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/uloc.h"
  25 #include "unicode/coll.h"
  26 #include "unicode/tblcoll.h"
  27 #include "unicode/coleitr.h"
  28 #include "unicode/unorm.h"
  29 #include "unicode/udata.h"
  30 #include "unicode/uchar.h"
  31 #include "unicode/caniter.h"
  32
  33 #include "ucol_bld.h"
  34 #include "ucol_imp.h"
  35 #include "ucol_tok.h"
  36 #include "ucol_elm.h"
  37 #include "bocsu.h"
  38
  39 #include "unormimp.h"
  40 #include "unorm_it.h"
  41 #include "uresimp.h"
  42 #include "umutex.h"
  43 #include "uhash.h"
  44 #include "ucln_in.h"
  45 #include "cstring.h"
  46
  47 #ifdef UCOL_DEBUG
  48 #include <stdio.h>
  49 #endif
  50
  51 U_NAMESPACE_USE
  52
  53 /* added by synwee for trie manipulation*/
  54 #define STAGE_1_SHIFT_            10
  55 #define STAGE_2_SHIFT_            4
  56 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
  57 #define STAGE_3_MASK_             0xF
  58 #define LAST_BYTE_MASK_           0xFF
  59 #define SECOND_LAST_BYTE_SHIFT_   8
  60
  61 #define ZERO_CC_LIMIT_            0xC0
  62
  63 static UCollator* UCA = NULL;
  64 static UCAConstants *UCAconsts = NULL;
  65 static UDataMemory* UCA_DATA_MEM = NULL;
  66
  67
  68 U_CDECL_BEGIN
  69 static UBool U_CALLCONV
  70 isAcceptableUCA(void * /*context*/,
  71              const char * /*type*/, const char * /*name*/,
  72              const UDataInfo *pInfo){
  73   /* context, type & name are intentionally not used */
  74     if( pInfo->size>=20 &&
  75         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  76         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  77         pInfo->dataFormat[0]==ucaDataInfo.dataFormat[0] &&   /* dataFormat="UCol" */
  78         pInfo->dataFormat[1]==ucaDataInfo.dataFormat[1] &&
  79         pInfo->dataFormat[2]==ucaDataInfo.dataFormat[2] &&
  80         pInfo->dataFormat[3]==ucaDataInfo.dataFormat[3] &&
  81         pInfo->formatVersion[0]==ucaDataInfo.formatVersion[0] &&
  82         pInfo->formatVersion[1]>=ucaDataInfo.formatVersion[1]// &&
  83         //pInfo->formatVersion[1]==ucaDataInfo.formatVersion[1] &&
  84         //pInfo->formatVersion[2]==ucaDataInfo.formatVersion[2] && // Too harsh
  85         //pInfo->formatVersion[3]==ucaDataInfo.formatVersion[3] && // Too harsh
  86         ) {
  87         UVersionInfo UCDVersion;
  88         u_getUnicodeVersion(UCDVersion);
  89         if(pInfo->dataVersion[0]==UCDVersion[0] &&
  90           pInfo->dataVersion[1]==UCDVersion[1]) { // &&
  91         //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
  92         //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
  93           return TRUE;
  94         } else {
  95           return FALSE;
  96         }
  97     } else {
  98         return FALSE;
  99     }
 100 }
 101
 102
 103 static int32_t U_CALLCONV
 104 _getFoldingOffset(uint32_t data) {
 105     return (int32_t)(data&0xFFFFFF);
 106 }
 107
 108 U_CDECL_END
 109
 110 static
 111 inline void  IInit_collIterate(const UCollator *collator, const UChar *sourceString,
 112                               int32_t sourceLen, collIterate *s) {
 113     (s)->string = (s)->pos = (UChar *)(sourceString);
 114     (s)->origFlags = 0;
 115     (s)->flags = 0;
 116     if (sourceLen >= 0) {
 117         s->flags |= UCOL_ITER_HASLEN;
 118         (s)->endp = (UChar *)sourceString+sourceLen;
 119     }
 120     else {
 121         /* change to enable easier checking for end of string for fcdpositon */
 122         (s)->endp = NULL;
 123     }
 124     (s)->CEpos = (s)->toReturn = (s)->CEs;
 125     (s)->writableBuffer = (s)->stackWritableBuffer;
 126     (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
 127     (s)->coll = (collator);
 128     (s)->fcdPosition = 0;
 129     if(collator->normalizationMode == UCOL_ON) {
 130         (s)->flags |= UCOL_ITER_NORM;
 131     }
 132     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
 133       (s)->flags |= UCOL_HIRAGANA_Q;
 134     }
 135     (s)->iterator = NULL;
 136     //(s)->iteratorIndex = 0;
 137 }
 138
 139 U_CAPI void  U_EXPORT2
 140 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
 141                              int32_t sourceLen, collIterate *s){
 142     /* Out-of-line version for use from other files. */
 143     IInit_collIterate(collator, sourceString, sourceLen, s);
 144 }
 145
 146
 147 /**
 148 * Backup the state of the collIterate struct data
 149 * @param data collIterate to backup
 150 * @param backup storage
 151 */
 152 static
 153 inline void backupState(const collIterate *data, collIterateState *backup)
 154 {
 155     backup->fcdPosition = data->fcdPosition;
 156     backup->flags       = data->flags;
 157     backup->origFlags   = data->origFlags;
 158     backup->pos         = data->pos;
 159     backup->bufferaddress = data->writableBuffer;
 160     backup->buffersize    = data->writableBufSize;
 161     if(data->iterator != NULL) {
 162       //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
 163       backup->iteratorIndex = data->iterator->getState(data->iterator);
 164       // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
 165       backup->iteratorMove = 0;
 166       if(backup->iteratorIndex == UITER_NO_STATE) {
 167         while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
 168           backup->iteratorMove++;
 169           data->iterator->move(data->iterator, -1, UITER_CURRENT);
 170         }
 171         data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 172       }
 173     }
 174 }
 175
 176 /**
 177 * Loads the state into the collIterate struct data
 178 * @param data collIterate to backup
 179 * @param backup storage
 180 * @param forwards boolean to indicate if forwards iteration is used,
 181 *        false indicates backwards iteration
 182 */
 183 static
 184 inline void loadState(collIterate *data, const collIterateState *backup,
 185                       UBool        forwards)
 186 {
 187   UErrorCode status = U_ZERO_ERROR;
 188     data->flags       = backup->flags;
 189     data->origFlags   = backup->origFlags;
 190     if(data->iterator != NULL) {
 191       //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
 192       data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
 193       if(backup->iteratorMove != 0) {
 194         data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 195       }
 196     }
 197     data->pos         = backup->pos;
 198     if ((data->flags & UCOL_ITER_INNORMBUF) &&
 199         data->writableBuffer != backup->bufferaddress) {
 200         /*
 201         this is when a new buffer has been reallocated and we'll have to
 202         calculate the new position.
 203         note the new buffer has to contain the contents of the old buffer.
 204         */
 205         if (forwards) {
 206             data->pos = data->writableBuffer +
 207                                          (data->pos - backup->bufferaddress);
 208         }
 209         else {
 210             /* backwards direction */
 211             uint32_t temp = backup->buffersize -
 212                                   (data->pos - backup->bufferaddress);
 213             data->pos = data->writableBuffer + (data->writableBufSize - temp);
 214         }
 215     }
 216     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
 217         /*
 218         this is alittle tricky.
 219         if we are initially not in the normalization buffer, even if we
 220         normalize in the later stage, the data in the buffer will be
 221         ignored, since we skip back up to the data string.
 222         however if we are already in the normalization buffer, any
 223         further normalization will pull data into the normalization
 224         buffer and modify the fcdPosition.
 225         since we are keeping the data in the buffer for use, the
 226         fcdPosition can not be reverted back.
 227         arrgghh....
 228         */
 229         data->fcdPosition = backup->fcdPosition;
 230     }
 231 }
 232
 233
 234 /*
 235 * collIter_eos()
 236 *     Checks for a collIterate being positioned at the end of
 237 *     its source string.
 238 *
 239 */
 240 static
 241 inline UBool collIter_eos(collIterate *s) {
 242     if(s->flags & UCOL_USE_ITERATOR) {
 243       return !(s->iterator->hasNext(s->iterator));
 244     }
 245     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
 246         // Null terminated string, but not at null, so not at end.
 247         //   Whether in main or normalization buffer doesn't matter.
 248         return FALSE;
 249     }
 250
 251     // String with length.  Can't be in normalization buffer, which is always
 252     //  null termintated.
 253     if (s->flags & UCOL_ITER_HASLEN) {
 254         return (s->pos == s->endp);
 255     }
 256
 257     // We are at a null termination, could be either normalization buffer or main string.
 258     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
 259         // At null at end of main string.
 260         return TRUE;
 261     }
 262
 263     // At null at end of normalization buffer.  Need to check whether there there are
 264     //   any characters left in the main buffer.
 265     if(s->origFlags & UCOL_USE_ITERATOR) {
 266       return !(s->iterator->hasNext(s->iterator));
 267     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
 268         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
 269         return (*s->fcdPosition == 0);
 270     }
 271     else {
 272         // Main string with an end pointer.
 273         return s->fcdPosition == s->endp;
 274     }
 275 }
 276
 277 /*
 278 * collIter_bos()
 279 *     Checks for a collIterate being positioned at the start of
 280 *     its source string.
 281 *
 282 */
 283 static
 284 inline UBool collIter_bos(collIterate *source) {
 285   // if we're going backwards, we need to know whether there is more in the
 286   // iterator, even if we are in the side buffer
 287   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 288     return !source->iterator->hasPrevious(source->iterator);
 289   }
 290   if (source->pos <= source->string ||
 291       ((source->flags & UCOL_ITER_INNORMBUF) &&
 292       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
 293     return TRUE;
 294   }
 295   return FALSE;
 296 }
 297
 298 static
 299 inline UBool collIter_SimpleBos(collIterate *source) {
 300   // if we're going backwards, we need to know whether there is more in the
 301   // iterator, even if we are in the side buffer
 302   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 303     return !source->iterator->hasPrevious(source->iterator);
 304   }
 305   if (source->pos == source->string) {
 306     return TRUE;
 307   }
 308   return FALSE;
 309 }
 310     //return (data->pos == data->string) ||
 311
 312
 313 /**
 314 * Checks and free writable buffer if it is not the original stack buffer
 315 * in collIterate. This function does not reassign the writable buffer.
 316 * @param data collIterate struct to determine and free the writable buffer
 317 */
 318 static
 319 inline void freeHeapWritableBuffer(collIterate *data)
 320 {
 321     if (data->writableBuffer != data->stackWritableBuffer) {
 322         uprv_free(data->writableBuffer);
 323     }
 324 }
 325
 326
 327 /****************************************************************************/
 328 /* Following are the open/close functions                                   */
 329 /*                                                                          */
 330 /****************************************************************************/
 331 static UCollator*
 332 tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) {
 333   int32_t rulesLen = 0;
 334   const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status);
 335   return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status);
 336
 337 }
 338
 339
 340 U_CAPI UCollator*
 341 ucol_open(const char *loc,
 342                   UErrorCode *status)
 343 {
 344   UCollator *result = NULL;
 345   if (status && U_SUCCESS(*status)) {
 346           result = Collator::createUCollator(loc, status);
 347         if (result) {
 348           return result;
 349         }
 350   }
 351   return ucol_open_internal(loc, status);
 352 }
 353
 354 // API in ucol_imp.h
 355
 356 U_CFUNC UCollator*
 357 ucol_open_internal(const char *loc,
 358                            UErrorCode *status)
 359 {
 360   ucol_initUCA(status);
 361
 362   /* New version */
 363   if(U_FAILURE(*status)) return 0;
 364
 365   UCollator *result = NULL;
 366   UResourceBundle *b = ures_open(NULL, loc, status);
 367   UResourceBundle *collElem = ures_getByKey(b, "CollationElements", NULL, status);
 368   UResourceBundle *binary = NULL;
 369   UErrorCode binaryStatus = U_ZERO_ERROR;
 370
 371   if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */
 372     *status = U_USING_DEFAULT_WARNING;
 373     result = ucol_initCollator(UCA->image, result, status);
 374     // if we use UCA, real locale is root
 375     result->rb = ures_open(NULL, "", status);
 376     result->elements = ures_open(NULL, "", status);
 377     if(U_FAILURE(*status)) {
 378       goto clean;
 379     }
 380     ures_close(b);
 381     result->hasRealData = FALSE;
 382   } else if(U_SUCCESS(*status)) {
 383     binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus);
 384
 385     if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */
 386       binary = NULL;
 387       result = tryOpeningFromRules(collElem, status);
 388       if(U_FAILURE(*status)) {
 389         goto clean;
 390       }
 391     } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */
 392       int32_t len = 0;
 393       const uint8_t *inData = ures_getBinary(binary, &len, status);
 394       UCATableHeader *colData = (UCATableHeader *)inData;
 395       if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
 396         uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
 397         colData->version[0] != UCOL_BUILDER_VERSION) {
 398         *status = U_DIFFERENT_UCA_VERSION;
 399         result = tryOpeningFromRules(collElem, status);
 400       } else {
 401         if(U_FAILURE(*status)){
 402           goto clean;
 403         }
 404         if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
 405           result = ucol_initCollator((const UCATableHeader *)inData, result, status);
 406           if(U_FAILURE(*status)){
 407             goto clean;
 408           }
 409           result->hasRealData = TRUE;
 410         } else {
 411           result = ucol_initCollator(UCA->image, result, status);
 412           ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status);
 413           if(U_FAILURE(*status)){
 414             goto clean;
 415           }
 416           result->hasRealData = FALSE;
 417         }
 418         result->freeImageOnClose = FALSE;
 419       }
 420     }
 421     result->rb = b;
 422     result->elements = collElem;
 423   } else { /* There is another error, and we're just gonna clean up */
 424 clean:
 425     ures_close(b);
 426     ures_close(collElem);
 427     ures_close(binary);
 428     return NULL;
 429   }
 430
 431   result->validLocale = NULL; // default is to use rb info
 432
 433   if(loc == NULL) {
 434     loc = ures_getLocale(result->rb, status);
 435   }
 436   result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char));
 437   /* test for NULL */
 438   if (result->requestedLocale == NULL) {
 439         *status = U_MEMORY_ALLOCATION_ERROR;
 440         ures_close(b); // ??? appears needed
 441     ures_close(collElem);
 442     ures_close(binary); // ??? appears needed
 443         return NULL;
 444   }
 445   uprv_strcpy(result->requestedLocale, loc);
 446
 447   ures_close(binary);
 448   return result;
 449 }
 450
 451 U_CAPI void U_EXPORT2
 452 ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt)
 453 {
 454   if (coll) {
 455     if (coll->validLocale) {
 456       uprv_free(coll->validLocale);
 457         }
 458     coll->validLocale = validLocaleToAdopt;
 459     if (coll->requestedLocale) { // should always have
 460       uprv_free(coll->requestedLocale);
 461         }
 462     coll->requestedLocale = requestedLocaleToAdopt;
 463   }
 464 }
 465
 466 U_CAPI void U_EXPORT2
 467 ucol_close(UCollator *coll)
 468 {
 469   if(coll != NULL) {
 470         // these are always owned by each UCollator struct,
 471         // so we always free them
 472     if(coll->validLocale != NULL) {
 473       uprv_free(coll->validLocale);
 474     }
 475     if(coll->requestedLocale != NULL) {
 476       uprv_free(coll->requestedLocale);
 477     }
 478
 479     /* Here, it would be advisable to close: */
 480     /* - UData for UCA (unless we stuff it in the root resb */
 481     /* Again, do we need additional housekeeping... HMMM! */
 482     if(coll->freeOnClose == FALSE){
 483       return; /* for safeClone, if freeOnClose is FALSE,
 484               don't free the other instance data */
 485     }
 486     if(coll->freeOptionsOnClose != FALSE) {
 487       if(coll->options != NULL) {
 488         uprv_free(coll->options);
 489       }
 490     }
 491     if(coll->mapping != NULL) {
 492         /*ucmpe32_close(coll->mapping);*/
 493       uprv_free(coll->mapping);
 494     }
 495     if(coll->rules != NULL && coll->freeRulesOnClose) {
 496       uprv_free((UChar *)coll->rules);
 497     }
 498     if(coll->rb != NULL) { /* pointing to read-only memory */
 499       ures_close(coll->rb);
 500     }
 501     if(coll->freeImageOnClose == TRUE) {
 502       uprv_free((UCATableHeader *)coll->image);
 503     }
 504     if(coll->elements != NULL) {
 505       ures_close(coll->elements);
 506     }
 507     if(coll->latinOneCEs != NULL) {
 508       uprv_free(coll->latinOneCEs);
 509     }
 510     uprv_free(coll);
 511   }
 512 }
 513
 514 U_CAPI UCollator* U_EXPORT2
 515 ucol_openRules( const UChar        *rules,
 516                 int32_t            rulesLength,
 517                 UColAttributeValue normalizationMode,
 518                 UCollationStrength strength,
 519                 UParseError        *parseError,
 520                 UErrorCode         *status)
 521 {
 522   uint32_t listLen = 0;
 523   UColTokenParser src;
 524   UColAttributeValue norm;
 525   UParseError tErr;
 526
 527   if(status == NULL || U_FAILURE(*status)){
 528     return 0;
 529   }
 530
 531   if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) {
 532     *status = U_ILLEGAL_ARGUMENT_ERROR;
 533     return 0;
 534   }
 535
 536   if(rulesLength == -1) {
 537     rulesLength = u_strlen(rules);
 538   }
 539
 540   if(parseError == NULL){
 541     parseError = &tErr;
 542   }
 543
 544   switch(normalizationMode) {
 545   case UCOL_OFF:
 546   case UCOL_ON:
 547   case UCOL_DEFAULT:
 548     norm = normalizationMode;
 549     break;
 550   default:
 551     *status = U_ILLEGAL_ARGUMENT_ERROR;
 552     return 0;
 553   }
 554
 555   ucol_initUCA(status);
 556
 557   if(U_FAILURE(*status)){
 558     return NULL;
 559   }
 560
 561   ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status);
 562   listLen = ucol_tok_assembleTokenList(&src,parseError, status);
 563
 564   if(U_FAILURE(*status)) {
 565     /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
 566     /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
 567     /* so something might be done here... or on lower level */
 568 #ifdef UCOL_DEBUG
 569     if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
 570       fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
 571     } else {
 572       fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
 573     }
 574 #endif
 575     ucol_tok_closeTokenList(&src);
 576     return NULL;
 577   }
 578   UCollator *result = NULL;
 579   UCATableHeader *table = NULL;
 580
 581   if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */
 582     /* also, if we wanted to remove some contractions, we should make a tailoring */
 583     table = ucol_assembleTailoringTable(&src, status);
 584     if(U_SUCCESS(*status)) {
 585       // builder version
 586       table->version[0] = UCOL_BUILDER_VERSION;
 587       // no tailoring information on this level
 588       table->version[1] = table->version[2] = table->version[3] = 0;
 589       // set UCD version
 590       u_getUnicodeVersion(table->UCDVersion);
 591       // set UCA version
 592       uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo));
 593       result = ucol_initCollator(table,0,status);
 594       result->hasRealData = TRUE;
 595       result->freeImageOnClose = TRUE;
 596     }
 597   } else { /* no rules, but no error either */
 598     // must be only options
 599     // We will init the collator from UCA
 600     result = ucol_initCollator(UCA->image,0,status);
 601     // And set only the options
 602     UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
 603     /* test for NULL */
 604     if (opts == NULL) {
 605         *status = U_MEMORY_ALLOCATION_ERROR;
 606         goto cleanup;
 607     }
 608     uprv_memcpy(opts, src.opts, sizeof(UColOptionSet));
 609     ucol_setOptionsFromHeader(result, opts, status);
 610     result->freeOptionsOnClose = TRUE;
 611     result->hasRealData = FALSE;
 612     result->freeImageOnClose = FALSE;
 613   }
 614
 615   if(U_SUCCESS(*status)) {
 616     UChar *newRules;
 617     result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
 618     if(rulesLength > 0) {
 619       newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR);
 620       /* test for NULL */
 621       if (newRules == NULL) {
 622           *status = U_MEMORY_ALLOCATION_ERROR;
 623           goto cleanup;
 624       }
 625       uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR);
 626       newRules[rulesLength]=0;
 627       result->rules = newRules;
 628       result->rulesLength = rulesLength;
 629       result->freeRulesOnClose = TRUE;
 630     }
 631     result->rb = NULL;
 632     result->elements = NULL;
 633     result->validLocale = NULL;
 634     result->requestedLocale = NULL;
 635     ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
 636     ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);
 637   } else {
 638 cleanup:
 639     if(result != NULL) {
 640       ucol_close(result);
 641     } else {
 642       if(table != NULL) {
 643         uprv_free(table);
 644       }
 645     }
 646     result = NULL;
 647   }
 648
 649   ucol_tok_closeTokenList(&src);
 650
 651   return result;
 652 }
 653
 654 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
 655 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
 656 U_CAPI uint8_t* U_EXPORT2
 657 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
 658 {
 659   uint8_t *result = NULL;
 660   if(U_FAILURE(*status)) {
 661     return NULL;
 662   }
 663   if(coll->hasRealData == TRUE) {
 664     *length = coll->image->size;
 665     result = (uint8_t *)uprv_malloc(*length);
 666     /* test for NULL */
 667     if (result == NULL) {
 668         *status = U_MEMORY_ALLOCATION_ERROR;
 669         return NULL;
 670     }
 671     uprv_memcpy(result, coll->image, *length);
 672   } else {
 673     *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 674     result = (uint8_t *)uprv_malloc(*length);
 675     /* test for NULL */
 676     if (result == NULL) {
 677         *status = U_MEMORY_ALLOCATION_ERROR;
 678         return NULL;
 679     }
 680     uprv_memcpy(result, UCA->image, sizeof(UCATableHeader));
 681     uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 682   }
 683   return result;
 684 }
 685
 686 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 687   if(U_FAILURE(*status)) {
 688     return;
 689   }
 690     result->caseFirst = (UColAttributeValue)opts->caseFirst;
 691     result->caseLevel = (UColAttributeValue)opts->caseLevel;
 692     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
 693     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
 694     result->strength = (UColAttributeValue)opts->strength;
 695     result->variableTopValue = opts->variableTopValue;
 696     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
 697     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
 698     result->numericCollation = (UColAttributeValue)opts->numericCollation;
 699
 700     result->caseFirstisDefault = TRUE;
 701     result->caseLevelisDefault = TRUE;
 702     result->frenchCollationisDefault = TRUE;
 703     result->normalizationModeisDefault = TRUE;
 704     result->strengthisDefault = TRUE;
 705     result->variableTopValueisDefault = TRUE;
 706     result->hiraganaQisDefault = TRUE;
 707     result->numericCollationisDefault = TRUE;
 708
 709     ucol_updateInternalState(result, status);
 710
 711     result->options = opts;
 712 }
 713
 714 #if 0
 715 // doesn't look like anybody is using this
 716 void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 717   if(U_FAILURE(*status)) {
 718     return;
 719   }
 720     opts->caseFirst = result->caseFirst;
 721     opts->caseLevel = result->caseLevel;
 722     opts->frenchCollation = result->frenchCollation;
 723     opts->normalizationMode = result->normalizationMode;
 724     opts->strength = result->strength;
 725     opts->variableTopValue = result->variableTopValue;
 726     opts->alternateHandling = result->alternateHandling;
 727     opts->hiraganaQ = result->hiraganaQ;
 728     opts->numericCollation = result->numericCollation;
 729 }
 730 #endif
 731
 732 static const uint16_t *fcdTrieIndex=NULL;
 733
 734
 735 /**
 736 * Approximate determination if a character is at a contraction end.
 737 * Guaranteed to be TRUE if a character is at the end of a contraction,
 738 * otherwise it is not deterministic.
 739 * @param c character to be determined
 740 * @param coll collator
 741 */
 742 static
 743 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 744     if (UTF_IS_TRAIL(c)) {
 745       return TRUE;
 746     }
 747
 748     if (c < coll->minContrEndCP) {
 749         return FALSE;
 750     }
 751
 752     int32_t  hash = c;
 753     uint8_t  htbyte;
 754     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 755         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 756     }
 757     htbyte = coll->contrEndCP[hash>>3];
 758     return (((htbyte >> (hash & 7)) & 1) == 1);
 759 }
 760
 761
 762
 763 /*
 764 *   i_getCombiningClass()
 765 *        A fast, at least partly inline version of u_getCombiningClass()
 766 *        This is a candidate for further optimization.  Used heavily
 767 *        in contraction processing.
 768 */
 769 static
 770 inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) {
 771     uint8_t sCC = 0;
 772     if (c >= 0x300 && ucol_unsafeCP(c, coll)) {
 773         sCC = u_getCombiningClass(c);
 774     }
 775     return sCC;
 776 }
 777
 778
 779 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UErrorCode *status) {
 780     UChar c;
 781     UCollator *result = fillIn;
 782     if(U_FAILURE(*status) || image == NULL) {
 783         return NULL;
 784     }
 785
 786     if(result == NULL) {
 787         result = (UCollator *)uprv_malloc(sizeof(UCollator));
 788         if(result == NULL) {
 789             *status = U_MEMORY_ALLOCATION_ERROR;
 790             return result;
 791         }
 792         result->freeOnClose = TRUE;
 793     } else {
 794         result->freeOnClose = FALSE;
 795     }
 796
 797     result->image = image;
 798     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
 799     /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
 800     UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie));
 801     if(newUCAmapping != NULL) {
 802       utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
 803     } else {
 804       *status = U_MEMORY_ALLOCATION_ERROR;
 805       if(result->freeOnClose == TRUE) {
 806           uprv_free(result);
 807           result = NULL;
 808       }
 809       return result;
 810     }
 811     if(U_SUCCESS(*status)) {
 812         result->mapping = newUCAmapping;
 813     } else {
 814         if(result->freeOnClose == TRUE) {
 815             uprv_free(result);
 816             result = NULL;
 817         }
 818         uprv_free(newUCAmapping);
 819         return result;
 820     }
 821
 822     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
 823     result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping);
 824     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
 825     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
 826     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
 827
 828     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
 829     result->freeOptionsOnClose = FALSE;
 830
 831     /* set attributes */
 832     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
 833     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
 834     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
 835     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
 836     result->strength = (UColAttributeValue)result->options->strength;
 837     result->variableTopValue = result->options->variableTopValue;
 838     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
 839     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
 840     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
 841
 842     result->caseFirstisDefault = TRUE;
 843     result->caseLevelisDefault = TRUE;
 844     result->frenchCollationisDefault = TRUE;
 845     result->normalizationModeisDefault = TRUE;
 846     result->strengthisDefault = TRUE;
 847     result->variableTopValueisDefault = TRUE;
 848     result->alternateHandlingisDefault = TRUE;
 849     result->hiraganaQisDefault = TRUE;
 850     result->numericCollationisDefault = TRUE;
 851
 852     result->scriptOrder = NULL;
 853
 854     result->rules = NULL;
 855     result->rulesLength = 0;
 856
 857     /* get the version info from UCATableHeader and populate the Collator struct*/
 858     result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
 859     result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
 860
 861     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
 862     result->minUnsafeCP = 0;
 863     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
 864         if (ucol_unsafeCP(c, result)) break;
 865     }
 866     result->minUnsafeCP = c;
 867
 868     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
 869     result->minContrEndCP = 0;
 870     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
 871         if (ucol_contractionEndCP(c, result)) break;
 872     }
 873     result->minContrEndCP = c;
 874
 875     /* max expansion tables */
 876     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
 877                                          result->image->endExpansionCE);
 878     result->lastEndExpansionCE = result->endExpansionCE +
 879                                  result->image->endExpansionCECount - 1;
 880     result->expansionCESize = (uint8_t*)result->image +
 881                                                result->image->expansionCESize;
 882
 883     if (fcdTrieIndex == NULL) {
 884         fcdTrieIndex = unorm_getFCDTrie(status);
 885     }
 886
 887     //result->errorCode = *status;
 888
 889     result->latinOneCEs = NULL;
 890
 891     result->latinOneRegenTable = FALSE;
 892     result->latinOneFailed = FALSE;
 893
 894     ucol_updateInternalState(result, status);
 895
 896
 897     return result;
 898 }
 899
 900 U_CFUNC UBool
 901 ucol_cleanup(void)
 902 {
 903     if (UCA_DATA_MEM) {
 904         udata_close(UCA_DATA_MEM);
 905         UCA_DATA_MEM = NULL;
 906     }
 907     if (UCA) {
 908         ucol_close(UCA);
 909         UCA = NULL;
 910     }
 911     return TRUE;
 912 }
 913
/* Following is a port of Mark's code for new treatment of implicits.
 * It is positioned here, since ucol_initUCA needs to initialize the
 * variables below according to the data in the fractional UCA.
 */
 918
/**
  * Function used to:
  * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
  * b) bump any non-CJK characters by 0x110000 (NON_CJK_OFFSET).
  * The relevant blocks are:
  * A:  4E00..9FFF; CJK Unified Ideographs
  *             F900..FAFF; CJK Compatibility Ideographs
  * B:  3400..4DBF; CJK Unified Ideographs Extension A
  *             20000..XX;      CJK Unified Ideographs Extension B (and others later on)
  * As long as
  *     no new B characters are allocated between 4E00 and FAFF, and
  *     no new A characters are outside of this range,
  * (very high probability) this simple code will work.
  * The reordered blocks are:
  * Block1 is CJK
  * Block2 is CJK_COMPAT_USED
  * Block3 is CJK_A
  * Any other CJK gets its normal code point
  * Any non-CJK gets +0x110000
  * When we reorder Block1, we make sure that it is at the very start,
  * so that it will use a 3-byte form.
  */
 941
 942 // CONSTANTS
 943 static const uint32_t
 944     NON_CJK_OFFSET = 0x110000,
 945     BYTES_TO_AVOID = 3,
 946     OTHER_COUNT = 256 - BYTES_TO_AVOID,
 947     LAST_COUNT = OTHER_COUNT / 2,
 948     LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes
 949     IMPLICIT_3BYTE_COUNT = 1;
 950
 951 // These depend on initUCA, and are initialized at that time
 952 static uint32_t
 953     IMPLICIT_BASE_BYTE = 0,
 954     IMPLICIT_LIMIT_BYTE = 0, // leave room for 1 3-byte and 2 4-byte forms
 955
 956     IMPLICIT_4BYTE_BOUNDARY = 0,
 957     LAST_MULTIPLIER = 0,
 958     LAST2_MULTIPLIER = 0,
 959     IMPLICIT_BASE_3BYTE = 0,
 960     IMPLICIT_BASE_4BYTE = 0;
 961
 962 static const UChar32
 963     CJK_BASE = 0x4E00,
 964     CJK_LIMIT = 0x9FFF+1,
 965     CJK_COMPAT_USED_BASE = 0xFA0E,
 966     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
 967     CJK_A_BASE = 0x3400,
 968     CJK_A_LIMIT = 0x4DBF+1,
 969     CJK_B_BASE = 0x20000,
 970     CJK_B_LIMIT = 0x2A6DF+1;
 971
 972 static inline UChar32 swapCJK(UChar32 cp) {
 973
 974         if (cp >= CJK_BASE) {
 975                 if (cp < CJK_LIMIT)                             return cp - CJK_BASE;
 976
 977                 if (cp < CJK_COMPAT_USED_BASE)  return cp + NON_CJK_OFFSET;
 978
 979                 if (cp < CJK_COMPAT_USED_LIMIT) return cp - CJK_COMPAT_USED_BASE
 980                                                                                                 + (CJK_LIMIT - CJK_BASE);
 981                 if (cp < CJK_B_BASE)                            return cp + NON_CJK_OFFSET;
 982
 983                 if (cp < CJK_B_LIMIT)                   return cp; // non-BMP-CJK
 984
 985                 return cp + NON_CJK_OFFSET;     // non-CJK
 986         }
 987         if (cp < CJK_A_BASE)                                    return cp + NON_CJK_OFFSET;
 988
 989         if (cp < CJK_A_LIMIT)                           return cp - CJK_A_BASE
 990                                                                                                 + (CJK_LIMIT - CJK_BASE)
 991                                                                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
 992     return cp + NON_CJK_OFFSET; // non-CJK
 993 }
 994
 995
 996 // GET IMPLICIT PRIMARY WEIGHTS
 997 // Return value is left justified primary key
 998
 999 static inline uint32_t getImplicitPrimary(UChar32 cp) {
1000
1001     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1002
1003     cp = swapCJK(cp);
1004
1005     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1006
1007     // we now have a range of numbers from 0 to 21FFFF.
1008
1009     // we must skip all 00, 01, 02 bytes, so most bytes have 253 values
1010     // we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
1011     // we shift so that HAN all has the same first primary, for compression.
1012     // for the 4 byte case, we make the gap as large as we can fit.
1013     // Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
1014     // Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
1015
1016     int32_t last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
1017     if (last0 < 0) {
1018         int32_t last1 = cp / LAST_COUNT;
1019         last0 = cp % LAST_COUNT;
1020
1021         int32_t last2 = last1 / OTHER_COUNT;
1022         last1 %= OTHER_COUNT;
1023         /*
1024         if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
1025             + Utility.hex(last2) + ", "
1026             + Utility.hex(last1) + ", "
1027             + Utility.hex(last0) + ", "
1028         );
1029         */
1030
1031         return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
1032     } else {
1033         int32_t last1 = last0 / LAST_COUNT2;
1034         last0 %= LAST_COUNT2;
1035
1036         int32_t last2 = last1 / OTHER_COUNT;
1037         last1 %= OTHER_COUNT;
1038
1039         int32_t last3 = last2 / OTHER_COUNT;
1040         last2 %= OTHER_COUNT;
1041
1042         /*
1043         if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
1044             + Utility.hex(last3) + ", "
1045             + Utility.hex(last2) + ", "
1046             + Utility.hex(last1) + ", "
1047             + Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
1048         );
1049         */
1050
1051        return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
1052     }
1053 }
1054
1055 /* this function is either called from initUCA or from genUCA before
1056  * doing canonical closure for the UCA.
1057  */
1058 U_CAPI void U_EXPORT2
1059 uprv_uca_initImplicitConstants(uint32_t baseByte)
1060 {
1061   IMPLICIT_BASE_BYTE = baseByte;
1062   IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4; // leave room for 1 3-byte and 2 4-byte forms
1063
1064   IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT;
1065   LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT;
1066   LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;
1067   IMPLICIT_BASE_3BYTE = (IMPLICIT_BASE_BYTE << 24) + 0x030300;
1068   IMPLICIT_BASE_4BYTE = ((IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT) << 24) + 0x030303;
1069 }
1070
1071 /* do not close UCA returned by ucol_initUCA! */
1072 UCollator *
1073 ucol_initUCA(UErrorCode *status) {
1074     if(U_FAILURE(*status)) {
1075         return NULL;
1076     }
1077     umtx_lock(NULL);
1078     UBool f = (UCA == NULL);
1079     umtx_unlock(NULL);
1080
1081     if(f) {
1082         UCollator *newUCA = NULL;
1083         UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1084
1085         if(U_FAILURE(*status)) {
1086             if (result) {
1087                 udata_close(result);
1088             }
1089             uprv_free(newUCA);
1090         }
1091
1092         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1093             newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, status);
1094             if(U_SUCCESS(*status)){
1095                 newUCA->rb = NULL;
1096                                 newUCA->elements = NULL;
1097                                 newUCA->validLocale = NULL;
1098                                 newUCA->requestedLocale = NULL;
1099                                 newUCA->hasRealData = FALSE; // real data lives in .dat file...
1100                 newUCA->freeImageOnClose = FALSE;
1101                 umtx_lock(NULL);
1102                 if(UCA == NULL) {
1103                     UCA = newUCA;
1104                     UCA_DATA_MEM = result;
1105                     result = NULL;
1106                     newUCA = NULL;
1107                 }
1108                 umtx_unlock(NULL);
1109
1110                 if(newUCA != NULL) {
1111                     udata_close(result);
1112                     uprv_free(newUCA);
1113                 }
1114                 else {
1115                     ucln_i18n_registerCleanup();
1116                 }
1117                 // Initalize variables for implicit generation
1118                 UCAconsts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
1119                 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN);
1120                 UCA->mapping->getFoldingOffset = _getFoldingOffset;
1121             }else{
1122                 udata_close(result);
1123                 uprv_free(newUCA);
1124                 UCA= NULL;
1125             }
1126         }
1127     }
1128     return UCA;
1129 }
1130
1131
1132 /*    collIterNormalize     Incremental Normalization happens here.                       */
1133 /*                          pick up the range of chars identifed by FCD,                  */
1134 /*                          normalize it into the collIterate's writable buffer,          */
1135 /*                          switch the collIterate's state to use the writable buffer.    */
1136 /*                                                                                        */
1137 static
1138 void collIterNormalize(collIterate *collationSource)
1139 {
1140     UErrorCode  status = U_ZERO_ERROR;
1141
1142     int32_t    normLen;
1143     UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1144     UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1145
1146     normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1147                               srcP, (int32_t)(endP - srcP),
1148                               FALSE, 0,
1149                               &status);
1150     if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1151         // reallocate and terminate
1152         if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1153                                    &collationSource->writableBuffer,
1154                                    (int32_t *)&collationSource->writableBufSize, normLen + 1,
1155                                    0)
1156         ) {
1157 #ifdef UCOL_DEBUG
1158             fprintf(stderr, "collIterNormalize(), out of memory\n");
1159 #endif
1160             return;
1161         }
1162         status = U_ZERO_ERROR;
1163         normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1164                                   srcP, (int32_t)(endP - srcP),
1165                                   FALSE, 0,
1166                                   &status);
1167     }
1168     if (U_FAILURE(status)) {
1169 #ifdef UCOL_DEBUG
1170         fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1171 #endif
1172         return;
1173     }
1174
1175   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1176       collationSource->flags |= UCOL_ITER_ALLOCATED;
1177   }
1178   collationSource->pos        = collationSource->writableBuffer;
1179   collationSource->origFlags  = collationSource->flags;
1180   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1181   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1182 }
1183
1184
1185 // This function takes the iterator and extracts normalized stuff up to the next boundary
1186 // It is similar in the end results to the collIterNormalize, but for the cases when we
1187 // use an iterator
1188 static
1189 inline void normalizeIterator(collIterate *collationSource) {
1190   UErrorCode status = U_ZERO_ERROR;
1191   UBool wasNormalized = FALSE;
1192   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1193   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1194   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1195     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1196   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1197     // reallocate and terminate
1198     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1199                                &collationSource->writableBuffer,
1200                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1201                                0)
1202     ) {
1203     #ifdef UCOL_DEBUG
1204         fprintf(stderr, "normalizeIterator(), out of memory\n");
1205     #endif
1206         return;
1207     }
1208     status = U_ZERO_ERROR;
1209     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1210     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1211     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1212     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1213   }
1214   // Terminate the buffer - we already checked that it is big enough
1215   collationSource->writableBuffer[normLen] = 0;
1216   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1217       collationSource->flags |= UCOL_ITER_ALLOCATED;
1218   }
1219   collationSource->pos        = collationSource->writableBuffer;
1220   collationSource->origFlags  = collationSource->flags;
1221   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1222   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1223 }
1224
1225
/* Incremental FCD check and normalize                                                    */
/*   Called from getNextCE when normalization state is suspect.                           */
/*   When entering, the state is known to be this:                                        */
/*      o   We are working in the main buffer of the collIterate, not the side            */
/*          writable buffer.  When in the side buffer, normalization mode is always off,  */
/*          so we won't get here.                                                         */
/*      o   The leading combining class from the current character is 0 or                */
/*          the trailing combining class of the previous char was zero.                   */
/*          True because the previous call to this function will have always exited       */
/*          that way, and we get called for every char where cc might be non-zero.        */
/*                                                                                        */
/*   The scan starts at the character before collationSource->pos and runs forward over   */
/*   the following characters for as long as their leading combining class is non-zero.   */
/*   On return, collationSource->fcdPosition points just past the scanned characters.     */
/*                                                                                        */
/*   Returns TRUE if some scanned character has a leading combining class that is         */
/*   smaller than the trailing combining class of the character before it, i.e. the       */
/*   scanned range needs to be normalized; FALSE otherwise.                               */
static
inline UBool collIterFCD(collIterate *collationSource) {
    UChar       c, c2;
    const UChar *srcP, *endP;
    uint8_t     leadingCC;
    uint8_t     prevTrailingCC = 0;
    uint16_t    fcd;
    UBool       needNormalize = FALSE;

    // pos has already been advanced past the current character; step back onto it.
    srcP = collationSource->pos-1;

    // endP == NULL means there is no explicit end pointer to test against.
    if (collationSource->flags & UCOL_ITER_HASLEN) {
        endP = collationSource->endp;
    } else {
        endP = NULL;
    }

    // Get the trailing combining class of the current character.  If it's zero,
    //   we are OK.
    c = *srcP++;
    /* trie access */
    // The 16-bit fcd value carries the leading cc in its upper byte and the
    //   trailing cc in its lower byte (see the shift and mask used below).
    fcd = unorm_getFCD16(fcdTrieIndex, c);
    if (fcd != 0) {
        if (UTF_IS_FIRST_SURROGATE(c)) {
            // Complete the lookup with the trail surrogate if there is one;
            //   an unpaired lead surrogate is treated as having no combining class.
            if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
                ++srcP;
                fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
            } else {
                fcd = 0;
            }
        }

        prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);

        if (prevTrailingCC != 0) {
            // The current char has a non-zero trailing CC.  Scan forward until we find
            //   a char with a leading cc of zero.
            while (endP == NULL || srcP != endP)
            {
                // Remember where this character starts so that we can back up over it.
                const UChar *savedSrcP = srcP;

                c = *srcP++;
                /* trie access */
                fcd = unorm_getFCD16(fcdTrieIndex, c);
                if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
                    if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
                        ++srcP;
                        fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
                    } else {
                        fcd = 0;
                    }
                }
                leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
                if (leadingCC == 0) {
                    srcP = savedSrcP;      // Hit char that is not part of combining sequence.
                                           //   back up over it.  (Could be surrogate pair!)
                    break;
                }

                // Combining classes out of order: the scanned range is not FCD.
                //   Keep scanning so that fcdPosition still ends up after the whole run.
                if (leadingCC < prevTrailingCC) {
                    needNormalize = TRUE;
                }

                prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
            }
        }
    }

    // Everything before srcP has now been checked.
    collationSource->fcdPosition = (UChar *)srcP;

    return needNormalize;
}
1308
1309 /****************************************************************************/
1310 /* Following are the CE retrieval functions                                 */
1311 /*                                                                          */
1312 /****************************************************************************/
1313
/* there should be a macro version of this function in the header file */
/* This is the first function that tries to fetch a collation element  */
/* If it's not successful or it encounters a more difficult situation  */
/* some more sophisticated and slower functions are invoked            */
/*                                                                     */
/* Returns the next collation element (CE) for the text described by   */
/* collationSource:                                                    */
/*  - CEs still pending in collationSource->CEs are handed out first.  */
/*  - Otherwise the next UChar is fetched from the source (string with */
/*    length, null terminated string, normalization side buffer or     */
/*    iterator); when UCOL_ITER_NORM is set an incremental FCD check   */
/*    and, if needed, normalization is done first.                     */
/*  - The CE is looked up in coll (latinOneMapping for ch <= 0xFF, the */
/*    mapping trie otherwise). For ch > 0xFF a result of               */
/*    UCOL_NOT_FOUND falls back to the UCA mapping. Values above       */
/*    UCOL_NOT_FOUND are special CEs and are resolved by               */
/*    ucol_prv_getSpecialCE.                                           */
/* coll            - collator whose tables are consulted first         */
/* collationSource - iteration state; its position is advanced         */
/* status          - error code, passed on to ucol_prv_getSpecialCE    */
/* Returns UCOL_NO_MORE_CES when the end of the source is reached.     */
static
inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
    uint32_t order = 0;
    if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
      order = *(collationSource->toReturn++);                         /* if so, return them */
      if(collationSource->CEpos == collationSource->toReturn) {
        /* The buffered CEs are used up: reset both pointers to the start of the buffer. */
        collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
      }
      return order;
    }

    UChar ch = 0;

    for (;;)                           /* Loop handles case when incremental normalize switches   */
    {                                  /*   to or from the side buffer / original string, and we  */
                                       /*   need to start again to get the next character.        */

        if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
        {
            // The source string is null terminated and we're not working from the side buffer,
            //   and we're not normalizing.  This is the fast path.
            //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
            ch = *collationSource->pos++;
            if (ch != 0) {
                break;
            }
            else {
                return UCOL_NO_MORE_CES;
            }
        }

        if (collationSource->flags & UCOL_ITER_HASLEN) {
            // Normal path for strings when length is specified.
            //   (We can't be in side buffer because it is always null terminated.)
            if (collationSource->pos >= collationSource->endp) {
                // Ran off of the end of the main source string.  We're done.
                return UCOL_NO_MORE_CES;
            }
            ch = *collationSource->pos++;
        }
        else if(collationSource->flags & UCOL_USE_ITERATOR) {
            // The text comes from the iterator; U_SENTINEL marks its end.
            UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
            if(iterCh == U_SENTINEL) {
              return UCOL_NO_MORE_CES;
            }
            ch = (UChar)iterCh;
        }
        else
        {
            // Null terminated string.
            ch = *collationSource->pos++;
            if (ch == 0) {
                // Ran off end of buffer.
                if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
                    // Ran off end of main string. backing up one character.
                    collationSource->pos--;
                    return UCOL_NO_MORE_CES;
                }
                else
                {
                    // Hit null in the normalize side buffer.
                    // Usually this means the end of the normalized data,
                    // except for one odd case: a null followed by combining chars,
                    //   which is the case if we are at the start of the buffer.
                    if (collationSource->pos == collationSource->writableBuffer+1) {
                        break;
                    }

                    //  Null marked end of side buffer.
                    //   Revert to the main string and
                    //   loop back to top to try again to get a character.
                    collationSource->pos   = collationSource->fcdPosition;
                    collationSource->flags = collationSource->origFlags;
                    continue;
                }
            }
        }

        // Keep track of whether the character just read is in the Hiragana
        //   ranges tested below.
        if(collationSource->flags&UCOL_HIRAGANA_Q) {
          if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
            collationSource->flags |= UCOL_WAS_HIRAGANA;
          } else {
            collationSource->flags &= ~UCOL_WAS_HIRAGANA;
          }
        }

        // We've got a character.  See if there's any fcd and/or normalization stuff to do.
        //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
        if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
            break;
        }

        if (collationSource->fcdPosition >= collationSource->pos) {
            // An earlier FCD check has already covered the current character.
            // We can go ahead and process this char.
            break;
        }

        if (ch < ZERO_CC_LIMIT_ ) {
            // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
            break;
        }

        if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
            // We need to peek at the next character in order to tell if we are FCD
            if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
                // We are at the last char of source string.
                //  It is always OK for FCD check.
                break;
            }

            // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
            if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
                break;
            }
        }


        // Need a more complete FCD check and possible normalization.
        if (collIterFCD(collationSource)) {
            collIterNormalize(collationSource);
        }
        if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
            //  No normalization was needed.  Go ahead and process the char we already had.
            break;
        }

        // Some normalization happened.  Next loop iteration will pick up a char
        //   from the normalization buffer.

    }   // end for (;;)


      if (ch <= 0xFF) {
          /*  For latin-1 characters we never need to fall back to the UCA table        */
          /*    because all of the UCA data is replicated in the latinOneMapping array  */
          order = coll->latinOneMapping[ch];
          if (order > UCOL_NOT_FOUND) {
              order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
          }
      }
      else
      {
          order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
          if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
              order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
          }
          if(order == UCOL_NOT_FOUND) {   /* We couldn't find a good CE in the tailoring */
            /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
            order = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);

            if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
              order = ucol_prv_getSpecialCE(UCA, ch, order, collationSource, status);
            }
          }
      }
    return order; /* return the CE */
}
1476
1477 /* ucol_getNextCE, out-of-line version for use from other files.   */
1478 U_CAPI uint32_t  U_EXPORT2
1479 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1480     return ucol_IGetNextCE(coll, collationSource, status);
1481     }
1482
1483
1484 /**
1485 * Incremental previous normalization happens here. Pick up the range of chars
1486 * identifed by FCD, normalize it into the collIterate's writable buffer,
1487 * switch the collIterate's state to use the writable buffer.
1488 * @param data collation iterator data
1489 */
1490 static
1491 void collPrevIterNormalize(collIterate *data)
1492 {
1493     UErrorCode status  = U_ZERO_ERROR;
1494     UChar      *pEnd   = data->pos;         /* End normalize + 1 */
1495     UChar      *pStart;
1496     uint32_t    normLen;
1497     UChar      *pStartNorm;
1498
1499     /* Start normalize */
1500     if (data->fcdPosition == NULL) {
1501         pStart = data->string;
1502     }
1503     else {
1504         pStart = data->fcdPosition + 1;
1505     }
1506
1507     normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1508                               data->writableBuffer, 0, &status);
1509
1510     if (data->writableBufSize <= normLen) {
1511             freeHeapWritableBuffer(data);
1512             data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) *
1513                                                         sizeof(UChar));
1514             if(data->writableBuffer == NULL) { // something is wrong here, return
1515               return;
1516             }
1517             data->flags |= UCOL_ITER_ALLOCATED;
1518             /* to handle the zero termination */
1519             data->writableBufSize = normLen + 1;
1520     }
1521             status = U_ZERO_ERROR;
1522     /*
1523     this puts the null termination infront of the normalized string instead
1524     of the end
1525     */
1526     pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1527     *(pStartNorm - 1) = 0;
1528     unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1529                     normLen, &status);
1530
1531     data->pos        = data->writableBuffer + data->writableBufSize;
1532     data->origFlags  = data->flags;
1533     data->flags     |= UCOL_ITER_INNORMBUF;
1534     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1535 }
1536
1537
1538 /**
1539 * Steps one code point backwards and fetches its FCD data. Helper for
1540 * collPrevIterFCD.
1541 * A trail surrogate that directly follows a lead surrogate is consumed
1542 * together with it; an unpaired surrogate yields 0.
1543 * @param start first code unit of the text, the scan never reads before it
1544 * @param pSrc in/out: just past the code point to read on entry, at its first
1545 *        code unit on return
1546 * @return 16 bit FCD value, leading combining class in the upper byte and
1547 *         trailing combining class in the lower byte
1548 */
1549 static
1550 inline uint16_t collPrevFCD16_(const UChar *start, const UChar **pSrc)
1551 {
1552     const UChar *p     = *pSrc - 1;
1553     const UChar  unit  = *p;
1554     uint16_t     fcd16 = 0;            /* stays 0 for an unpaired surrogate */
1555
1556     if (!UTF_IS_SURROGATE(unit)) {
1557         fcd16 = unorm_getFCD16(fcdTrieIndex, unit);
1558     }
1559     else if (UTF_IS_SECOND_SURROGATE(unit) && start < p) {
1560         const UChar lead = *(p - 1);
1561         if (UTF_IS_FIRST_SURROGATE(lead)) {
1562             --p;
1563             fcd16 = unorm_getFCD16(fcdTrieIndex, lead);
1564             if (fcd16 != 0) {
1565                 fcd16 = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd16,
1566                                                         unit);
1567             }
1568         }
1569     }
1570
1571     *pSrc = p;
1572     return fcd16;
1573 }
1574
1575 /**
1576 * Incremental FCD check for previous iteration. Called from getPrevCE when
1577 * normalization state is suspect.
1578 * When entering, the state is known to be this:
1579 * o  We are working in the main buffer of the collIterate, not the side
1580 *    writable buffer. When in the side buffer, normalization mode is always
1581 *    off, so we won't get here.
1582 * o  The leading combining class from the current character is 0 or the
1583 *    trailing combining class of the previous char was zero.
1584 *    True because the previous call to this function will have always exited
1585 *    that way, and we get called for every char where cc might be non-zero.
1586 * data->fcdPosition is set to where the backwards scan stopped, or to NULL if
1587 * the scan ran into the start of the string.
1588 * @param data collation iterate struct
1589 * @return normalization status, TRUE for normalization to be done, FALSE
1590 *         otherwise
1591 */
1592 static
1593 inline UBool collPrevIterFCD(collIterate *data)
1594 {
1595     const UChar *begin         = data->string;
1596     const UChar *cursor        = data->pos + 1;
1597     UBool        mustNormalize = FALSE;
1598     uint16_t     fcd16;
1599     uint8_t      leadCC;
1600
1601     /* FCD data of the current character, the one data->pos points at. */
1602     fcd16  = collPrevFCD16_(begin, &cursor);
1603     leadCC = (uint8_t)(fcd16 >> SECOND_LAST_BYTE_SHIFT_);
1604
1605     if (leadCC != 0) {
1606         /* Walk backwards until a char with a trailing cc of zero shows up. */
1607         for (;;) {
1608             if (cursor == begin) {
1609                 data->fcdPosition = NULL;
1610                 return mustNormalize;
1611             }
1612             fcd16 = collPrevFCD16_(begin, &cursor);
1613             const uint8_t trailCC = (uint8_t)(fcd16 & LAST_BYTE_MASK_);
1614             if (trailCC == 0) {
1615                 break;
1616             }
1617             /* classes out of canonical order: normalization is needed */
1618             mustNormalize = (UBool)(mustNormalize || leadCC < trailCC);
1619             leadCC = (uint8_t)(fcd16 >> SECOND_LAST_BYTE_SHIFT_);
1620         }
1621     }
1622     data->fcdPosition = (UChar *)cursor;
1623     return mustNormalize;
1624 }
1625
1626 /** Peeks at the code unit at a given offset from the current position,
1627  *  through the pos pointer or, if that is NULL, through the iterator.
1628  *  No error checking - caller beware!
1629  */
1630 inline static
1631 UChar peekCharacter(collIterate *source, int32_t offset) {
1632   if(source->pos != NULL) {
1633     return source->pos[offset];
1634   }
1635   if(source->iterator == NULL) {
1636     return (UChar)U_SENTINEL;
1637   }
1638   if(offset == 0) {
1639     return (UChar)source->iterator->current(source->iterator);
1640   }
1641   /* step to the requested unit, read it, then step back to where we were */
1642   source->iterator->move(source->iterator, offset, UITER_CURRENT);
1643   UChar peeked = (UChar)source->iterator->next(source->iterator);
1644   source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1645   return peeked;
1646 }
1647
1648 /**
1649 * Tells if the backwards collation iterator is at the start of the text.
1650 * @param data collation iterator
1651 * @return TRUE if we are at the start
1652 */
1653 static
1654 inline UBool isAtStartPrevIterate(collIterate *data) {
1655   if(data->pos == NULL && data->iterator != NULL) {
1656     return !data->iterator->hasPrevious(data->iterator);
1657   }
1658   if(data->pos == data->string) {
1659     return TRUE;
1660   }
1661   return (data->flags & UCOL_ITER_INNORMBUF) &&  /* or: in the side buffer, */
1662          *(data->pos - 1) == 0 && data->fcdPosition == NULL; /* at its front null */
1663 }
1664
1665 /**
1666 * Inline function that gets the previous CE.
1667 * So what it does is that it will first check the expansion buffer. If the
1668 * expansion buffer is not empty, ie toReturn is still past the start of the
1669 * CEs array, toReturn is decremented and the CE it then points at is returned.
1670 * Otherwise the previous character is fetched and looked up; for more
1671 * complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
1672 * @param coll collator data
1673 * @param data collation iterator struct
1674 * @param status error status
1675 * @return the CE, UCOL_NO_MORE_CES once the start of the text is reached */
1676 static
1677 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1678                                UErrorCode *status)
1679 {
1680     uint32_t result = UCOL_NULLORDER;
1681     if (data->toReturn > data->CEs) {
1682         data->toReturn --;
1683         result = *(data->toReturn);
1684         if (data->CEs == data->toReturn) {
1685             data->CEpos = data->toReturn; /* buffer drained: CEpos back to its start */
1686         }
1687     }
1688     else {
1689         UChar ch = 0;
1690         /*
1691         Loop handles case when incremental normalize switches to or from the
1692         side buffer / original string, and we need to start again to get the
1693         next character.
1694         */
1695         for (;;) {
1696             if (data->flags & UCOL_ITER_HASLEN) {
1697                 /*
1698                 Normal path for strings when length is specified.
1699                 Not in side buffer because it is always null terminated.
1700                 */
1701                 if (data->pos <= data->string) {
1702                     /* Start of the main source string: nothing left before it */
1703                     return UCOL_NO_MORE_CES;
1704                 }
1705                 data->pos --;
1706                 ch = *data->pos;
1707             }
1708             // we are using an iterator to go back. Pray for us!
1709             else if (data->flags & UCOL_USE_ITERATOR) {
1710               UChar32 iterCh = data->iterator->previous(data->iterator);
1711               if(iterCh == U_SENTINEL) {
1712                 return UCOL_NO_MORE_CES;
1713               } else {
1714                 ch = (UChar)iterCh;
1715               }
1716             }
1717             else {
1718                 data->pos --;
1719                 ch = *data->pos;
1720                 /* we are in the side buffer. */
1721                 if (ch == 0) {
1722                     /*
1723                     At the start of the normalize side buffer.
1724                     Go back to string.
1725                     Because pointer points to the last accessed character,
1726                     hence we have to increment it by one here.
1727                     */
1728                     if (data->fcdPosition == NULL) {
1729                         data->pos = data->string;
1730                         return UCOL_NO_MORE_CES;
1731                     }
1732                     else {
1733                         data->pos   = data->fcdPosition + 1;
1734                     }
1735                     data->flags = data->origFlags; /* flags as they were before the side buffer */
1736                     continue;
1737                 }
1738             }
1739
1740             if(data->flags&UCOL_HIRAGANA_Q) {
1741               if(ch>=0x3040 && ch<=0x309f) {
1742                 data->flags |= UCOL_WAS_HIRAGANA;
1743               } else {
1744                 data->flags &= ~UCOL_WAS_HIRAGANA;
1745               }
1746             }
1747
1748             /*
1749             * got a character to determine if there's fcd and/or normalization
1750             * stuff to do.
1751             * if the current character is not fcd.
1752             * if current character is at the start of the string
1753             * Trailing combining class == 0.
1754             * Note if pos is in the writablebuffer, norm is always 0
1755             */
1756             if (ch < ZERO_CC_LIMIT_ ||
1757               // this should propel us out of the loop in the iterator case
1758                 (data->flags & UCOL_ITER_NORM) == 0 ||
1759                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
1760                 || data->string == data->pos) {
1761                 break;
1762             }
1763
1764             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1765                 /* if next character is FCD */
1766                 if (data->pos == data->string) {
1767                     /* First char of string is always OK for FCD check */
1768                     break;
1769                 }
1770
1771                 /* Not first char of string, do the FCD fast test */
1772                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
1773                     break;
1774                 }
1775             }
1776
1777             /* Need a more complete FCD check and possible normalization. */
1778             if (collPrevIterFCD(data)) {
1779                 collPrevIterNormalize(data);
1780             }
1781
1782             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1783                 /*  No normalization. Go ahead and process the char. */
1784                 break;
1785             }
1786
1787             /*
1788             Some normalization happened.
1789             Next loop picks up a char from the normalization buffer.
1790             */
1791         }
1792
1793         /* attempt to handle contractions, after removal of the backwards
1794         contraction
1795         */
1796         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
1797             result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
1798         }
1799         else {
1800             if (ch <= 0xFF) {
1801               result = coll->latinOneMapping[ch]; /* direct table lookup for U+0000..U+00FF */
1802               if (result > UCOL_NOT_FOUND) {
1803                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1804               }
1805             }
1806             else {
1807               // TODO: fix me for THAI - I reference *(data->pos-1)
1808                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0 &&
1809                     /*UCOL_ISTHAIBASECONSONANT(ch) &&*/   // This is from the old specs - we now rearrange unconditionally
1810                     data->pos > data->string &&
1811                     UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1)))
1812                     //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
1813                 {
1814                     result = UCOL_THAI;
1815                 }
1816                 else {
1817                     /*result = ucmpe32_get(coll->mapping, ch);*/
1818                     result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
1819                 }
1820                 if (result > UCOL_NOT_FOUND) {
1821                     result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
1822                 }
1823                 if (result == UCOL_NOT_FOUND) { /* nothing in coll for ch: try the UCA */
1824                   if (!isAtStartPrevIterate(data) &&
1825                       ucol_contractionEndCP(ch, data->coll)) {
1826                       result = UCOL_CONTRACTION;
1827                   }
1828                   else {
1829                         /*result = ucmpe32_get(UCA->mapping, ch);*/
1830                         result = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
1831                   }
1832
1833                   if (result > UCOL_NOT_FOUND) {
1834                     result = ucol_prv_getSpecialPrevCE(UCA, ch, result, data, status);
1835                   }
1836                 }
1837             }
1838         }
1839     }
1840     return result;
1841 }
1842
1843
1844 /* Exported, out-of-line entry point to the inline ucol_IGetPrevCE. */
1845 U_CAPI uint32_t U_EXPORT2
1846 ucol_getPrevCE(const UCollator *coll, collIterate *data, UErrorCode *status)
1847 {
1848     return ucol_IGetPrevCE(coll, data, status);
1849 }
1850
1851
1852 /* Returns the first CE produced for the single code unit u, read through a
1853    temporary collIterate set up over that one unit.
1854    This should be connected to special Jamo handling. */
1855 U_CAPI uint32_t U_EXPORT2
1856 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status)
1857 {
1858   collIterate singleUnit;
1859   IInit_collIterate(coll, &u, 1, &singleUnit);
1860   return ucol_IGetNextCE(coll, &singleUnit, status);
1861 }
1862
1863 /**
1864 * Inserts the argument character into the end of the buffer pushing back the
1865 * null terminator. When the buffer is full its contents are first moved to a
1866 * larger heap buffer and data->writableBuffer/writableBufSize are updated.
1867 * @param data collIterate struct data
1868 * @param pNull pointer to the null termination
1869 * @param ch character to be appended
1870 * @return the position of the new addition, or NULL if growing the buffer
1871 *         failed (there is no status to report it; the buffer is untouched)
1872 */
1873 static
1874 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
1875 {
1876     const uint32_t  incsize = 5;
1877     const uint32_t  size    = data->writableBufSize;
1878     UChar          *newbuffer;
1879
1880     if ((data->writableBuffer + size) > (pNull + 1)) {
1881         *pNull = ch;
1882         *(pNull + 1) = 0;
1883         return pNull;
1884     }
1885
1886     /*
1887     buffer will always be null terminated at the end.
1888     giving extra space since it is likely that more characters will be added.
1889     */
1890     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + incsize));
1891     if(newbuffer != NULL) {
1892       /* offset of the terminator, taken before the old buffer goes away */
1893       const uint32_t nullOffset = (uint32_t)(pNull - data->writableBuffer);
1894       uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
1895       freeHeapWritableBuffer(data);
1896       data->writableBufSize = size + incsize;
1897       data->writableBuffer  = newbuffer;
1898       newbuffer        = newbuffer + nullOffset;  /* the terminator's old slot */
1899       *newbuffer       = ch;
1900       *(newbuffer + 1) = 0;
1901     }
1902     return newbuffer;
1903 }
1904
1905 /**
1906 * Inserts the argument string into the end of the buffer pushing back the
1907 * null terminator. When the buffer is too small, the text in front of pNull
1908 * is first moved to a new, exactly fitting heap buffer.
1909 * @param data collIterate struct data
1910 * @param pNull pointer to the null termination
1911 * @param str string to be appended
1912 * @param length length of the string to be appended
1913 * @return the position of the new addition, or NULL if growing the buffer
1914 *         failed (the buffer is then left untouched)
1915 */
1916 static
1917 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
1918                                int32_t length)
1919 {
1920     uint32_t  size = pNull - data->writableBuffer;
1921     UChar    *newbuffer;
1922
1923     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
1924         uprv_memcpy(pNull, str, length * sizeof(UChar));
1925         *(pNull + length) = 0;
1926         return pNull;
1927     }
1928
1929     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
1930     if(newbuffer == NULL) {
1931       return NULL;
1932     }
1933     uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
1934     uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
1935     newbuffer[size + length] = 0;   /* fresh memory: terminate explicitly */
1936
1937     freeHeapWritableBuffer(data);
1938     data->writableBufSize = size + length + 1;
1939     data->writableBuffer  = newbuffer;
1940     /* where str now starts, matching what the in-place path returns */
1941     return newbuffer + size;
1942 }
1943
1944 /**
1945 * Special normalization function for contraction in the forwards iterator.
1946 * Normalizes (NFD) the text from the character before pos up to fcdPosition
1947 * and appends it to the text already in the writable buffer, growing the
1948 * buffer on the heap when it is too small.
1949 * pos is changed to point into the buffer, right behind the text that was
1950 * already there. Flags are changed accordingly, the old ones go to origFlags.
1951 * @param data collation iterator data
1952 */
1953 static
1954 inline void normalizeNextContraction(collIterate *data)
1955 {
1956     UChar      *buffer     = data->writableBuffer;
1957     uint32_t    buffersize = data->writableBufSize;
1958     uint32_t    strsize;
1959     UErrorCode  status     = U_ZERO_ERROR;
1960     /* because the pointer points to the next character */
1961     UChar      *pStart     = data->pos - 1;
1962     UChar      *pEnd       = data->fcdPosition;
1963
1964     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
1965         *data->writableBuffer = *(pStart - 1);
1966         strsize               = 1;
1967     }
1968     else {
1969         strsize = u_strlen(data->writableBuffer);
1970     }
1971
1972     uint32_t normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0,
1973                                        buffer, 0, &status);
1974     if (buffersize <= normLen + strsize) {
1975         uint32_t  size = strsize + normLen + 1;
1976         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
1977         if(temp != NULL) {
1978           uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
1979           freeHeapWritableBuffer(data);
1980           data->writableBuffer = temp;
1981           data->writableBufSize = size;
1982           data->flags |= UCOL_ITER_ALLOCATED;
1983           buffer = temp;   /* the old pointer may have just been freed */
1984         }
1985         else if (buffersize > strsize) {
1986           /* out of memory: truncate instead of overrunning the old buffer */
1987           normLen = buffersize - strsize - 1;
1988         }
1989     }
1990
1991     status            = U_ZERO_ERROR;
1992     UChar *pStartNorm = buffer + strsize;
1993     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
1994                     normLen + 1, &status);
1995     pStartNorm[normLen] = 0;   /* a no-op unless the text was truncated */
1996
1997     data->pos        = data->writableBuffer + strsize;
1998     data->origFlags  = data->flags;
1999     data->flags     |= UCOL_ITER_INNORMBUF;
2000     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2001 }
2002
2003 /**
2004 * Contraction character management function that returns the next character
2005 * for the forwards iterator.
2006 * Does nothing if the next character is in buffer and not the first character
2007 * in it.
2008 * Else it checks next character in data string to see if it is normalizable.
2009 * If it is not, the character is simply copied into the buffer, else
2010 * the whole normalized substring is copied into the buffer, including the
2011 * current character.
2012 * @param data collation element iterator data
2013 * @return next character; read through the iterator it may be a truncated U_SENTINEL
2014 */
2015 static
2016 inline UChar getNextNormalizedChar(collIterate *data)
2017 {
2018     UChar  nextch;
2019     UChar  ch;
2020     // Here we need to add the iterator code. One problem is the way
2021     // end of string is handled. If we just return next char, it could
2022     // be the sentinel. Most of the cases already check for this, but we
2023     // need to be sure.
2024     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2025          /* if no normalization and not in buffer. */
2026       if(data->flags & UCOL_USE_ITERATOR) {
2027          return (UChar)data->iterator->next(data->iterator);
2028       } else {
2029          return *(data->pos ++);
2030       }
2031     }
2032
2033     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2034       //normalizeIterator(data);
2035     //}
2036
2037     UChar  *pEndWritableBuffer = NULL; /* terminator of the side buffer, set when leaving it for the string */
2038     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2039     if ((innormbuf && *data->pos != 0) ||
2040         (data->fcdPosition != NULL && !innormbuf &&
2041         data->pos < data->fcdPosition)) {
2042         /*
2043         if next character is in normalized buffer, no further normalization
2044         is required
2045         */
2046         return *(data->pos ++);
2047     }
2048
2049     if (data->flags & UCOL_ITER_HASLEN) {
2050         /* in data string */
2051         if (data->pos + 1 == data->endp) { /* on the last character of the string */
2052             return *(data->pos ++);
2053         }
2054     }
2055     else {
2056         if (innormbuf) {
2057           // inside the normalization buffer, but at the end
2058           // (since we encountered zero). This means, in the
2059           // case we're using char iterator, that we need to
2060           // do another round of normalization.
2061           //if(data->origFlags & UCOL_USE_ITERATOR) {
2062             // we need to restore original flags,
2063             // otherwise, we'll lose them
2064             //data->flags = data->origFlags;
2065             //normalizeIterator(data);
2066             //return *(data->pos++);
2067           //} else {
2068             /*
2069             in writable buffer, at this point fcdPosition can not be
2070             pointing to the end of the data string. see contracting tag.
2071             */
2072           if(data->fcdPosition) {
2073             if (*(data->fcdPosition + 1) == 0 ||
2074                 data->fcdPosition + 1 == data->endp) {
2075                 /* at the end of the string, dump it into the normalizer */
2076                 data->pos = insertBufferEnd(data, data->pos,
2077                                             *(data->fcdPosition)) + 1;
2078                 return *(data->fcdPosition ++);
2079             }
2080             pEndWritableBuffer = data->pos;
2081             data->pos = data->fcdPosition; /* carry on reading from the string */
2082           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2083             // if we are here, we're using a normalizing iterator.
2084             // we should just continue further.
2085             data->flags = data->origFlags;
2086             data->pos = NULL;
2087             return (UChar)data->iterator->next(data->iterator);
2088           }
2089           //}
2090         }
2091         else {
2092             if (*(data->pos + 1) == 0) { /* on the last character before the terminator */
2093                 return *(data->pos ++);
2094             }
2095         }
2096     }
2097
2098     ch = *data->pos ++;
2099     nextch = *data->pos;
2100
2101     /*
2102     * if the current character is not fcd.
2103     * Trailing combining class == 0.
2104     */
2105     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2106         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2107          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2108             /*
2109             Need a more complete FCD check and possible normalization.
2110             normalize substring will be appended to buffer
2111             */
2112         if (collIterFCD(data)) {
2113             normalizeNextContraction(data);
2114             return *(data->pos ++);
2115         }
2116         else if (innormbuf) {
2117             /* fcdposition shifted even when there's no normalization, if we
2118             don't input the rest into this, we'll get the wrong position when
2119             we reach the end of the writableBuffer */
2120             int32_t length = data->fcdPosition - data->pos + 1;
2121             data->pos = insertBufferEnd(data, pEndWritableBuffer,
2122                                         data->pos - 1, length);
2123             return *(data->pos ++);
2124         }
2125     }
2126
2127     if (innormbuf) {
2128         /*
2129         no normalization is to be done hence only one character will be
2130         appended to the buffer.
2131         */
2132         data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2133     }
2134
2135     /* points back to the pos in string */
2136     return ch;
2137 }
2138
2139 /**
2140 * Moves the collation iterator back by one code unit: the character iterator
2141 * steps back only while it is in use (UCOL_USE_ITERATOR), pos whenever it is
2142 * not NULL. A disabled earlier variant stepped back any attached iterator,
2143 * noting that the iterator may have to be kept synced up at all times.
2144 * @param data collation iterator data
2145 */
2146 static
2147 inline void goBackOne(collIterate *data) {
2148   const UBool iteratorInUse = (UBool)(data->iterator != NULL &&
2149                                       (data->flags & UCOL_USE_ITERATOR) != 0);
2150
2151   if(iteratorInUse) {
2152     data->iterator->previous(data->iterator);
2153   }
2154   if(data->pos != NULL) {
2155     --data->pos;
2156   }
2157 }
2158
2159
2160 /**
2161 * Function to copy the buffer into writableBuffer and sets the fcd position to
2162 * the correct position, so that the skipped characters are considered later
2163 * without any changes to getNextCE and ucol_prv_getSpecialCE.
2164 * If pos was in the string, fcdPosition takes over the old pos. If pos was in
2165 * the normalization buffer, the rest of that buffer is first appended at
2166 * tempdb. Either way pos ends up at the start of the normalization buffer.
2167 * If a larger buffer is needed but cannot be allocated, source is left as is.
2168 * @param source data string source
2169 * @param buffer character buffer
2170 * @param tempdb current position in buffer that has been used up
2171 */
2172 static
2173 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2174                                      UChar *tempdb)
2175 {
2176     const UBool  inNormBuf = (UBool)((source->flags & UCOL_ITER_INNORMBUF) != 0);
2177     uint32_t     length;
2178
2179     if (inNormBuf) {
2180         u_strcpy(tempdb, source->pos);
2181     }
2182     /* measured after the append so the size check covers all that is copied */
2183     length = u_strlen(buffer);
2184     if (length >= source->writableBufSize) {
2185         /* allocate first: on failure nothing has been released or flagged */
2186         UChar *grown = (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2187         if(grown == NULL) {
2188           return;
2189         }
2190         freeHeapWritableBuffer(source);
2191         source->writableBuffer  = grown;
2192         source->writableBufSize = length + 1;
2193     }
2194     if (!inNormBuf) {
2195         source->fcdPosition  = source->pos;
2196         source->origFlags    = source->flags;
2197         source->flags       |= UCOL_ITER_INNORMBUF;
2198         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2199     }
2200
2201     u_strcpy(source->writableBuffer, buffer);
2202     source->pos = source->writableBuffer;
2203 }
2204
2205 /**
2206 * Function to get the discontiguous collation element within the source.
2207 * Note this function will set the position to the appropriate places.
2208 * @param coll current collator used
2209 * @param source data string source
2210 * @param constart index to the start character in the contraction table
2211 * @return discontiguous collation element offset
2212 */
2213 static
2214 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2215                                 const UChar *constart)
2216 {
2217     /* source->pos currently points to the second combining character after
2218        the start character */
2219           UChar   *temppos      = source->pos;
2220           UChar    buffer[4*UCOL_MAX_BUFFER];
2221           UChar   *tempdb       = buffer;
2222     const UChar   *tempconstart = constart;
2223           uint8_t  tempflags    = source->flags;
2224           UBool    multicontraction = FALSE;
2225           UChar   *tempbufferpos = 0;
2226           collIterateState discState;
2227
2228           backupState(source, &discState); /* restored below when no match is found */
2229
2230     //*tempdb = *(source->pos - 1);
2231           *tempdb = peekCharacter(source, -1);
2232     tempdb ++;
2233     while (TRUE) {
2234         UChar    *UCharOffset;
2235         UChar     schar,
2236                   tchar;
2237         uint32_t  result;
2238
2239         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2240             || (peekCharacter(source, 0) == 0  &&
2241             //|| (*source->pos == 0  &&
2242                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2243                  source->fcdPosition == NULL ||
2244                  source->fcdPosition == source->endp ||
2245                  *(source->fcdPosition) == 0 ||
2246                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2247                  /* end of string in null terminated string or stopped by a
2248                  null character, note fcd does not always point to a base
2249                  character after the discontiguous change */
2250                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2251                  //u_getCombiningClass(*(source->pos)) == 0) {
2252             //constart = (UChar *)coll->image + getContractOffset(CE);
2253             if (multicontraction) {
2254                 *tempbufferpos = 0;
2255                 source->pos    = temppos - 1;
2256                 setDiscontiguosAttribute(source, buffer, tempdb);
2257                 return *(coll->contractionCEs +
2258                                     (tempconstart - coll->contractionIndex));
2259             }
2260             constart = tempconstart;
2261             break;
2262         }
2263
2264         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2265         schar = getNextNormalizedChar(source);
2266
2267         while (schar > (tchar = *UCharOffset)) { /* find the first entry not below schar */
2268             UCharOffset++;
2269         }
2270
2271         if (schar != tchar) {
2272             /* not the correct codepoint. we stuff the current codepoint into
2273             the discontiguous buffer and try the next character */
2274             *tempdb = schar;
2275             tempdb ++;
2276             continue;
2277         }
2278         else {
2279             if (u_getCombiningClass(schar) ==
2280                 u_getCombiningClass(peekCharacter(source, -2))) {
2281                 //u_getCombiningClass(*(source->pos - 2))) {
2282                 *tempdb = schar; /* same class as the unit at offset -2: skipped as well */
2283                 tempdb ++;
2284                 continue;
2285             }
2286             result = *(coll->contractionCEs +
2287                                       (UCharOffset - coll->contractionIndex));
2288         }
2289         *tempdb = 0;
2290
2291         if (result == UCOL_NOT_FOUND) {
2292           break;
2293         } else if (isContraction(result)) {
2294             /* this is a multi-contraction*/
2295             tempconstart = (UChar *)coll->image + getContractOffset(result);
2296             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2297                 != UCOL_NOT_FOUND) {
2298                 multicontraction = TRUE;
2299                 temppos       = source->pos + 1;
2300                 tempbufferpos = buffer + u_strlen(buffer);
2301             }
2302         } else {
2303             setDiscontiguosAttribute(source, buffer, tempdb);
2304             return result;
2305         }
2306     }
2307
2308     /* no problems simply reverting just like that,
2309     if we are in string before getting into this function, points back to
2310     string hence no problem.
2311     if we are in normalization buffer before getting into this function,
2312     since we'll never use another normalization within this function, we
2313     know that fcdposition points to a base character. the normalization buffer
2314     never change, hence this revert works. */
2315     loadState(source, &discState, TRUE);
2316     goBackOne(source);
2317
2318     //source->pos   = temppos - 1;
2319     source->flags = tempflags;
2320     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2321 }
2322
2323 /* TRUE for surrogates (D800..DFFF), for FDD0..FDEF and for any code point
2324    whose low 16 bits are FFFE or FFFF; FALSE otherwise. */
2325 static
2326 inline UBool isNonChar(UChar32 cp) {
2327   return (UBool)((0xD800 <= cp && cp <= 0xDFFF) || (0xFDD0 <= cp && cp <= 0xFDEF) ||
2328                  (cp & 0xFFFE) == 0xFFFE);
2329 }
2330
2331 /* implicit CEs from Mark's getImplicitPrimary: one to CEpos, one returned */
2332 static
2333 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2334   if(isNonChar(cp)) {
2335     return 0;
2336   }
2337   const uint32_t primary = getImplicitPrimary(cp);
2338   *(collationSource->CEpos++) = ((primary & 0x0000FFFF)<<16) | 0x000000C0;
2339   return (primary & UCOL_PRIMARYMASK) | 0x00000505;
2340 }
2341
2342 /**
2343 * Inserts the argument character into the front of the buffer replacing the
2344 * front null terminator; the terminator moves one slot towards the front.
2345 * When pNull is the first slot of the buffer, the contents are first moved
2346 * to the end of a larger heap buffer.
2347 * @param data collation element iterator data
2348 * @param pNull pointer to the null terminator
2349 * @param ch character to be appended
2350 * @return position of added character, or NULL if growing the buffer failed
2351 */
2352 static
2353 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2354 {
2355           uint32_t  size    = data->writableBufSize;
2356           UChar    *end;
2357           UChar    *newbuffer;
2358     const uint32_t  incsize = 5;
2359
2360     /* one free slot infront of pNull is all the new terminator needs */
2361     if (pNull > data->writableBuffer) {
2362         *pNull       = ch;
2363         *(pNull - 1) = 0;
2364         return pNull;
2365     }
2366
2367     /* buffer will always be null terminated infront.
2368     giving extra space since it is likely that more characters will be added. */
2369     size += incsize;
2370     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2371     if(newbuffer == NULL) {
2372       return NULL;
2373     }
2374     end = newbuffer + incsize;      /* where the old first slot, pNull, lands */
2375     uprv_memcpy(end, data->writableBuffer,
2376                 data->writableBufSize * sizeof(UChar));
2377     *end       = ch;
2378     *(end - 1) = 0;
2379
2380     freeHeapWritableBuffer(data);
2381     data->writableBufSize = size;
2382     data->writableBuffer  = newbuffer;
2383     return end;
2384 }
2385
2386 /**
2387 * Special normalization function for contraction in the previous iterator.
2388 * NFD-normalizes the text from just after the fcd position (or from the string
2389 * start when fcdPosition is NULL) through the character at data->pos into the
2390 * writable buffer, in front of its existing contents and preceded by a null.
2391 * data->pos then points into the buffer just past the normalized text and the
2392 * flags are switched to normalization-buffer mode. If a bigger buffer is needed
2393 * but cannot be allocated, the position and the flags are left unchanged.
2394 * @param data collation iterator data
2395 */
2396 static
2397 inline void normalizePrevContraction(collIterate *data)
2398 {
2399     UChar      *buffer     = data->writableBuffer;
2400     uint32_t    buffersize = data->writableBufSize;
2401     uint32_t    nulltermsize;                       /* index where the kept tail starts */
2402     UErrorCode  status     = U_ZERO_ERROR;
2403     UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2404     UChar      *pStart;
2405     uint32_t    normLen;
2406     UChar      *pStartNorm;
2407
2408     if (data->flags & UCOL_ITER_HASLEN) {
2409         /* buffer not used yet: pull the next character down into its last slot */
2410         *(buffer + (buffersize - 1)) = *(data->pos + 1);
2411         nulltermsize                  = buffersize - 1;
2412     }
2413     else {
2414         /* buffer in use: walk back from the end to just past the front null */
2415         nulltermsize = buffersize;
2416         UChar *temp = buffer + (nulltermsize - 1);
2417         while (*(temp --) != 0) {
2418             nulltermsize --;
2419         }
2420     }
2421
2422     /* Start normalize */
2423     if (data->fcdPosition == NULL) {
2424         pStart = data->string;
2425     }
2426     else {
2427         pStart = data->fcdPosition + 1;
2428     }
2429
2430     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2431                               &status);   /* preflight: capacity 0, length only */
2432
2433     if (nulltermsize <= normLen) {
2434         /* no room in front: new layout is null + normalized text + old tail */
2435         uint32_t  tailLen = buffersize - nulltermsize;
2436         uint32_t  size    = tailLen + normLen + 1;
2437         UChar    *temp    = (UChar *)uprv_malloc(size * sizeof(UChar));
2438         if(temp == NULL) {
2439           return;   /* writing into the too-small old buffer would underflow it */
2440         }
2441         /* copy the tail itself (not the buffer start) to the end of the new buffer */
2442         uprv_memcpy(temp + normLen + 1, buffer + nulltermsize, sizeof(UChar) * tailLen);
2443         nulltermsize = normLen + 1;
2444         freeHeapWritableBuffer(data);
2445         data->writableBuffer = temp;
2446         data->writableBufSize = size;
2447         buffer = temp;   /* the normalized text must go into the new buffer */
2448     }
2449
2450     status = U_ZERO_ERROR;
2451     /* the null termination goes in front of the normalized string, not at its end */
2452     pStartNorm   = buffer + (nulltermsize - normLen);
2453     *(pStartNorm - 1) = 0;
2454     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2455                     &status);
2456
2457     data->pos        = data->writableBuffer + nulltermsize;
2458     data->origFlags  = data->flags;
2459     data->flags     |= UCOL_ITER_INNORMBUF;
2460     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2461 }
2462
2463 /**
2464 * Contraction character management function that returns the previous character
2465 * for the backwards iterator.
2466 * When normalization is off, or the iterator is inside the normalization buffer
2467 * and the slot before data->pos is not the front null, that character is
2468 * returned directly.
2469 * Otherwise the character before the current string position is examined: if
2470 * the FCD test is needed and collPrevIterFCD() returns TRUE, the substring is
2471 * normalized into the buffer by normalizePrevContraction(); else, when already
2472 * in the buffer, only that character is added with insertBufferFront().
2473 * @param data collation element iterator data
2474 * @return previous character
2475 */
2476 static
2477 inline UChar getPrevNormalizedChar(collIterate *data)
2478 {
2479     UChar  prevch;
2480     UChar  ch;
2481     UChar *start;
2482     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2483     UChar *pNull = NULL;
2484     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2485         (innormbuf && *(data->pos - 1) != 0)) {
2486         /*
2487         fast path: either no normalization is in effect, or the previous
2488         character is already in the normalization buffer, so no further
2489         normalization is required
2490         */
2491       if(data->flags & UCOL_USE_ITERATOR) {
2492         data->iterator->move(data->iterator, -1, UITER_CURRENT); /* step back one unit ... */
2493         return (UChar)data->iterator->next(data->iterator); /* ... and read it with next() */
2494       } else {
2495         return *(data->pos - 1);
2496       }
2497     }
2498
2499     start = data->pos;
2500     if (data->flags & UCOL_ITER_HASLEN) {
2501         /* in data string */
2502         if ((start - 1) == data->string) {
2503             return *(start - 1); /* the previous character is the first one of the string */
2504         }
2505         start --;
2506         ch     = *start;
2507         prevch = *(start - 1);
2508     }
2509     else {
2510         /*
2511         in writable buffer, at this point fcdPosition can not be NULL.
2512         see contracting tag.
2513         */
2514         if (data->fcdPosition == data->string) {
2515             /* at the start of the string, just dump it into the normalizer */
2516             insertBufferFront(data, data->pos - 1, *(data->fcdPosition)); /* NOTE(review): result not checked */
2517             data->fcdPosition = NULL;
2518             return *(data->pos - 1);
2519         }
2520         pNull  = data->pos - 1; /* slot before pos; passed to insertBufferFront below */
2521         start  = data->fcdPosition;
2522         ch     = *start;
2523         prevch = *(start - 1);
2524     }
2525     /*
2526     * if ch or prevch is at or above NFC_ZERO_CC_BLOCK_LIMIT_ the text may not be fcd.
2527     * (in the buffer branch start == fcdPosition, so the first test fails there)
2528     */
2529     if (data->fcdPosition > start &&
2530        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2531     {
2532         /*
2533         Need a more complete FCD check and possible normalization.
2534         The normalized substring will be placed in the buffer.
2535         */
2536         UChar *backuppos = data->pos;
2537         data->pos = start; /* collPrevIterFCD/normalizePrevContraction work from data->pos */
2538         if (collPrevIterFCD(data)) {
2539             normalizePrevContraction(data);
2540             return *(data->pos - 1);
2541         }
2542         data->pos = backuppos; /* no normalization done: restore the position */
2543         data->fcdPosition ++;
2544     }
2545
2546     if (innormbuf) {
2547     /*
2548     no normalization is to be done hence only one character will be
2549     added to the front of the buffer.
2550     */
2551         insertBufferFront(data, pNull, ch); /* NOTE(review): a NULL return (allocation failure) is not checked */
2552         data->fcdPosition --;
2553     }
2554
2555     return ch;
2556 }
2557
2558 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2559 /* It is called by getNextCE */
2560
2561 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2562   collIterateState entryState;
2563   backupState(source, &entryState);
2564   UChar32 cp = ch;
2565
2566   for (;;) {
2567     // This loop will repeat only in the case of contractions, and only when a contraction
2568     //   is found and the first CE resulting from that contraction is itself a special
2569     //   (an expansion, for example.)  All other special CE types are fully handled the
2570     //   first time through, and the loop exits.
2571
2572     const uint32_t *CEOffset = NULL;
2573     switch(getCETag(CE)) {
2574     case NOT_FOUND_TAG:
2575       /* This one is not found, and we'll let somebody else bother about it... no more games */
2576       return CE;
2577     case SURROGATE_TAG:
2578       /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2579       /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2580       /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2581       /* we return 0 (completely ignorable - per UCA specification */
2582       {
2583         UChar trail;
2584         collIterateState state;
2585         backupState(source, &state);
2586         if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2587           // we could have stepped one char forward and it might have turned out that it
2588           // was not a trail surrogate. In that case, we have to back up.
2589           loadState(source, &state, TRUE);
2590           return 0;
2591         } else {
2592           /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2593           CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail);
2594           if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2595             // We need to backup
2596             loadState(source, &state, TRUE);
2597             return CE;
2598           }
2599           // calculate the supplementary code point value, if surrogate was not tailored
2600           cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2601         }
2602       }
2603       break;
2604     case THAI_TAG:
2605       /* Thai/Lao reordering */
2606         if  (((source->flags) & UCOL_ITER_INNORMBUF)      /* Already Swapped     ||                 */
2607             || (source->iterator && !source->iterator->hasNext(source->iterator))
2608             || (source->pos && source->endp == source->pos)                /* At end of string.  No swap possible || */
2609             /*|| UCOL_ISTHAIBASECONSONANT(*(source->pos)) == 0*/)  /* next char not Thai base cons.*/ // This is from the old specs - we now rearrange unconditionally
2610         {
2611             // Treat Thai as a length one expansion */
2612             CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2613             CE = *CEOffset++;
2614         }
2615         else
2616         {
2617             // Move the prevowel and the following base Consonant into the normalization buffer
2618             //   with their order swapped
2619
2620             source->writableBuffer[0] = peekCharacter(source, 0);
2621             source->writableBuffer[1] = peekCharacter(source, -1);
2622             source->writableBuffer[2] = 0;
2623
2624             if(source->pos) {
2625               source->fcdPosition       = source->pos+1;   // Indicate where to continue in main input string
2626                                                            //   after exhausting the writableBuffer
2627             } else if(source->iterator) {
2628               source->iterator->next(source->iterator);
2629             }
2630         source->pos   = source->writableBuffer;
2631             source->origFlags         = source->flags;
2632             source->flags            |= UCOL_ITER_INNORMBUF;
2633             source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2634
2635         CE = UCOL_IGNORABLE;
2636       }
2637       break;
2638     case SPEC_PROC_TAG:
2639       {
2640         // Special processing is getting a CE that is preceded by a certain prefix
2641         // Currently this is only needed for optimizing Japanese length and iteration marks.
2642         // When we encounter a special processing tag, we go backwards and try to see if
2643         // we have a match.
2644         // Contraction tables are used - so the whole process is not unlike contraction.
2645         // prefix data is stored backwards in the table.
2646         const UChar *UCharOffset;
2647         UChar schar, tchar;
2648         collIterateState prefixState;
2649         backupState(source, &prefixState);
2650         loadState(source, &entryState, TRUE);
2651         goBackOne(source); // We want to look at the point where we entered - actually one
2652         // before that...
2653
2654         for(;;) {
2655         // This loop will run once per source string character, for as long as we
2656         //  are matching a potential contraction sequence
2657
2658           // First we position ourselves at the begining of contraction sequence
2659           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2660           if (collIter_bos(source)) {
2661             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2662             break;
2663           }
2664           schar = getPrevNormalizedChar(source);
2665           goBackOne(source);
2666
2667           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2668             UCharOffset++;
2669           }
2670
2671           if (schar == tchar) {
2672               // Found the source string char in the table.
2673               //  Pick up the corresponding CE from the table.
2674               CE = *(coll->contractionCEs +
2675                   (UCharOffset - coll->contractionIndex));
2676           }
2677           else
2678           {
2679               // if there is a completely ignorable code point in the middle of
2680               // a prefix, we need to act as if it's not there
2681               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
2682               // lone surrogates cannot be set to zero as it would break other processing
2683               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
2684               // it's easy for BMP code points
2685               if(isZeroCE == 0) {
2686                 continue;
2687               } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
2688                 // for supplementary code points, we have to check the next one
2689                 // situations where we are going to ignore
2690                 // 1. beginning of the string: schar is a lone surrogate
2691                 // 2. schar is a lone surrogate
2692                 // 3. schar is a trail surrogate in a valid surrogate sequence
2693                 //    that is explicitly set to zero.
2694                 if (!collIter_bos(source)) {
2695                   UChar lead;
2696                   if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
2697                     isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
2698                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
2699                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
2700                       if(finalCE == 0) {
2701                         // this is a real, assigned completely ignorable code point
2702                         goBackOne(source);
2703                         continue;
2704                       }
2705                     }
2706                   } else {
2707                     // lone surrogate, completely ignorable
2708                     continue;
2709                   }
2710                 } else {
2711                   // lone surrogate at the beginning, completely ignorable
2712                   continue;
2713                 }
2714               }
2715               // Source string char was not in the table.
2716               //   We have not found the prefix.
2717               CE = *(coll->contractionCEs +
2718                   (ContractionStart - coll->contractionIndex));
2719           }
2720
2721           if(!isPrefix(CE)) {
2722               // The source string char was in the contraction table, and the corresponding
2723               //   CE is not a prefix CE.  We found the prefix, break
2724               //   out of loop, this CE will end up being returned.  This is the normal
2725               //   way out of prefix handling when the source actually contained
2726               //   the prefix.
2727               break;
2728           }
2729         }
2730         if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
2731           loadState(source, &prefixState, TRUE);
2732           if(source->origFlags & UCOL_USE_ITERATOR) {
2733             source->flags = source->origFlags;
2734           }
2735         } else { // prefix search was a failure, we have to backup all the way to the start
2736           loadState(source, &entryState, TRUE);
2737         }
2738       break;
2739       }
2740     case CONTRACTION_TAG:
2741       {
2742       /* This should handle contractions */
2743       collIterateState state;
2744       backupState(source, &state);
2745       uint32_t firstCE = UCOL_NOT_FOUND;
2746       const UChar *UCharOffset;
2747       UChar schar, tchar;
2748
2749       for (;;) {
2750         /* This loop will run once per source string character, for as long as we     */
2751         /*  are matching a potential contraction sequence                  */
2752
2753         /* First we position ourselves at the begining of contraction sequence */
2754         const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
2755
2756         if (collIter_eos(source)) {
2757             // Ran off the end of the source string.
2758             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
2759             // So we'll pick whatever we have at the point...
2760             if (CE == UCOL_NOT_FOUND) {
2761                 // back up the source over all the chars we scanned going into this contraction.
2762                 CE = firstCE;
2763                 loadState(source, &state, TRUE);
2764                 if(source->origFlags & UCOL_USE_ITERATOR) {
2765                   source->flags = source->origFlags;
2766                 }
2767             }
2768             break;
2769         }
2770
2771         uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
2772         uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
2773
2774         schar = getNextNormalizedChar(source);
2775         while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
2776           UCharOffset++;
2777         }
2778
2779         if (schar == tchar) {
2780             // Found the source string char in the contraction table.
2781             //  Pick up the corresponding CE from the table.
2782             CE = *(coll->contractionCEs +
2783                 (UCharOffset - coll->contractionIndex));
2784         }
2785         else
2786         {
2787             // if there is a completely ignorable code point in the middle of
2788             // contraction, we need to act as if it's not there
2789             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
2790             // it's easy for BMP code points
2791             if(isZeroCE == 0) {
2792               continue;
2793             } else if(UTF_IS_LEAD(schar)) {
2794               if(!collIter_eos(source)) {
2795                 backupState(source, &state);
2796                 UChar trail = getNextNormalizedChar(source);
2797                 if(UTF_IS_TRAIL(trail)) { // do stuff with trail
2798                   if(getCETag(isZeroCE) == SURROGATE_TAG) {
2799                     uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
2800                     if(finalCE == 0) {
2801                       continue;
2802                     }
2803                   }
2804                 } else {
2805                   // broken surrogate sequence, thus completely ignorable
2806                   loadState(source, &state, TRUE);
2807                   continue;
2808                 }
2809                 loadState(source, &state, TRUE);
2810               } else { // no  more characters, so broken surrogate pair...
2811                 // this contraction will ultimately fail, but not because of us
2812                 continue;
2813               }
2814             } // else if(UTF_IS_LEAD(schar))
2815
2816             // Source string char was not in contraction table.
2817             //   Unless we have a discontiguous contraction, we have finished
2818             //   with this contraction.
2819             uint8_t sCC;
2820             if (schar < 0x300 ||
2821                 maxCC == 0 ||
2822                 (sCC = i_getCombiningClass(schar, coll)) == 0 ||
2823                 sCC>maxCC ||
2824                 (allSame != 0 && sCC == maxCC) ||
2825                 collIter_eos(source)) {
2826                     //  Contraction can not be discontiguous.
2827                     goBackOne(source);  // back up the source string by one,
2828                                         //  because  the character we just looked at was
2829                                         //  not part of the contraction.   */
2830                     CE = *(coll->contractionCEs +
2831                         (ContractionStart - coll->contractionIndex));
2832             } else {
2833                 //
2834                 // Contraction is possibly discontiguous.
2835                 //   Scan more of source string looking for a match
2836                 //
2837                 UChar tempchar;
2838                 /* find the next character if schar is not a base character
2839                     and we are not yet at the end of the string */
2840                 tempchar = getNextNormalizedChar(source);
2841                 goBackOne(source);
2842                 if (i_getCombiningClass(tempchar, coll) == 0) {
2843                     goBackOne(source);
2844                     /* Spit out the last char of the string, wasn't tasty enough */
2845                     CE = *(coll->contractionCEs +
2846                         (ContractionStart - coll->contractionIndex));
2847                 } else {
2848                     CE = getDiscontiguous(coll, source, ContractionStart);
2849                 }
2850             }
2851         } // else after if(schar == tchar)
2852
2853         if(CE == UCOL_NOT_FOUND) {
2854             /* The Source string did not match the contraction that we were checking.  */
2855             /*  Back up the source position to undo the effects of having partially    */
2856             /*   scanned through what ultimately proved to not be a contraction.       */
2857           loadState(source, &state, TRUE);
2858           CE = firstCE;
2859           if(source->origFlags & UCOL_USE_ITERATOR) {
2860             source->flags = source->origFlags;
2861           }
2862           break;
2863         }
2864
2865         if(!isContraction(CE)) {
2866             // The source string char was in the contraction table, and the corresponding
2867             //   CE is not a contraction CE.  We completed the contraction, break
2868             //   out of loop, this CE will end up being returned.  This is the normal
2869             //   way out of contraction handling when the source actually contained
2870             //   the contraction.
2871             break;
2872         }
2873
2874
2875         // The source string char was in the contraction table, and the corresponding
2876         //   CE IS a contraction CE.  We will continue looping to check the source
2877         //   string for the remaining chars in the contraction.
2878         uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
2879         if(tempCE != UCOL_NOT_FOUND) {
2880             // We have scanned a section of source string for which there is a
2881             //  CE from the contraction table.  Remember the CE and scan position, so
2882             //  that we can return to this point if further scanning fails to
2883             //  match a longer contraction sequence.
2884             firstCE = tempCE;
2885
2886             goBackOne(source);
2887             backupState(source, &state);
2888             getNextNormalizedChar(source);
2889
2890             // Another way to do this is:
2891             //collIterateState tempState;
2892             //backupState(source, &tempState);
2893             //goBackOne(source);
2894             //backupState(source, &state);
2895             //loadState(source, &tempState, TRUE);
2896
2897             // The problem is that for incomplete contractions we have to remember the previous
2898             // position. Before, the only thing I needed to do was state.pos--;
2899             // After iterator introduction and especially after introduction of normalizing
2900             // iterators, it became much more difficult to decrease the saved state.
2901             // I'm not yet sure which of the two methods above is faster.
2902         }
2903       } // for(;;)
2904       break;
2905       } // case CONTRACTION_TAG:
2906     case LONG_PRIMARY_TAG:
2907       {
2908         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
2909         CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
2910         return CE;
2911       }
2912     case EXPANSION_TAG:
2913       {
2914       /* This should handle expansion. */
2915       /* NOTE: we can encounter both continuations and expansions in an expansion! */
2916       /* I have to decide where continuations are going to be dealt with */
2917       uint32_t size;
2918       uint32_t i;    /* general counter */
2919       CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2920       size = getExpansionCount(CE);
2921       CE = *CEOffset++;
2922       if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
2923         for(i = 1; i<size; i++) {
2924           *(source->CEpos++) = *CEOffset++;
2925         }
2926       } else { /* else, we do */
2927         while(*CEOffset != 0) {
2928           *(source->CEpos++) = *CEOffset++;
2929         }
2930       }
2931       return CE;
2932       }
2933     case DIGIT_TAG:
2934       {
2935       /*
2936          We do a check to see if we want to collate digits as numbers; if so we generate
2937          a custom collation key. Otherwise we pull out the value stored in the expansion table.
2938       */
2939       uint32_t size;
2940       uint32_t i;    /* general counter */
2941
2942       if (coll->numericCollation == UCOL_ON){
2943                 UChar32 char32 = 0;
2944
2945                 uint32_t digIndx = 0;
2946                 uint32_t endIndex = 0;
2947                 uint32_t trailingZeroIndex = 0;
2948
2949                 uint32_t primWeight = 0;
2950
2951                 uint32_t digVal = 0;
2952                 uint8_t collateVal = 0;
2953
2954                 UBool nonZeroValReached = false;
2955
2956                 uint8_t *numTempBuf;
2957                 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
2958                 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
2959
2960                 numTempBuf = stackNumTempBuf;
2961                 /*
2962                          We parse the source string until we hit a char that's NOT a digit.
2963                 Use this u_charDigitValue. This might be slow because we have to
2964                 handle surrogates...
2965         */
2966
2967         if (U16_IS_LEAD(ch)){
2968                 if (!collIter_eos(source))
2969                                 char32 = U16_GET_SUPPLEMENTARY(ch, getNextNormalizedChar(source));
2970                         else
2971                                 char32 = ch;
2972         }
2973                 else
2974                         char32 = ch;
2975                 digVal = u_charDigitValue(char32);
2976
2977                 /*
2978                         We  pad a zero in front of the first element anyways. This takes
2979                         care of the (probably) most common case where people are sorting things followed
2980                         by a single digit
2981                 */
2982                 digIndx++;
2983         for(;;){
2984         // Make sure we have enough space.
2985         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
2986         {
2987                 numTempBufSize *= 2;
2988                 if (numTempBuf == stackNumTempBuf){
2989                         numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
2990                         memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
2991                 }else
2992                         realloc(numTempBuf, numTempBufSize);
2993         }
2994
2995                         // Skipping over leading zeroes.
2996                 if (digVal != 0 || nonZeroValReached){
2997                                 if (digVal != 0 && !nonZeroValReached)
2998                                         nonZeroValReached = true;
2999
3000                                 /*
3001                                         We parse the digit string into base 100 numbers (this fits into a byte).
3002                                         We only add to the buffer in twos, thus if we are parsing an odd character,
3003                                         that serves as the 'tens' digit while the if we are parsing an even one, that
3004                                         is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3005                                         a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3006                                         overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3007                                         than all the other bytes.
3008                                  */
3009
3010                                 if (digIndx % 2 == 1){
3011                                         collateVal += (uint8_t)digVal;
3012
3013                                         // We don't enter the low-order-digit case unless we've already seen
3014                                         // the high order, or for the first digit, which is always non-zero.
3015                                         if (collateVal != 0)
3016                                                 trailingZeroIndex = 0;
3017
3018                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3019                                         collateVal = 0;
3020                                 }
3021                                 else{
3022                                         // We drop the collation value into the buffer so if we need to do
3023                                         // a "front patch" we don't have to check to see if we're hitting the
3024                                         // last element.
3025                                         collateVal = (uint8_t)(digVal * 10);
3026
3027                                         // Check for trailing zeroes.
3028                                         if (collateVal == 0)
3029                                         {
3030                                                 if (!trailingZeroIndex)
3031                                                         trailingZeroIndex = (digIndx/2) + 2;
3032                                         }
3033                                         else
3034                                                 trailingZeroIndex = 0;
3035
3036                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3037                                 }
3038                                 digIndx++;
3039                 }
3040
3041                 // Get next character.
3042                 if (!collIter_eos(source)){
3043                                 ch = getNextNormalizedChar(source);
3044                                 if (U16_IS_LEAD(ch)){
3045                                         if (!collIter_eos(source))
3046                                                 char32 = U16_GET_SUPPLEMENTARY(ch, getNextNormalizedChar(source));
3047                                 }
3048                                 else
3049                                         char32 = ch;
3050
3051                                 if ((digVal = u_charDigitValue(char32)) == -1){
3052                                         // Resetting position to point to the next unprocessed char. We
3053                                         // overshot it when doing our test/set for numbers.
3054                                         goBackOne(source);
3055                                         if (char32 > 0xFFFF) // For surrogates.
3056                                                 goBackOne(source);
3057                                         break;
3058                                 }
3059                         }else
3060                                 break;
3061                 }
3062
3063                 if (nonZeroValReached == false){
3064                         digIndx = 2;
3065                         numTempBuf[2] = 6;
3066                 }
3067
3068                 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3069                 if (digIndx % 2 != 0){
3070                         /*
3071                                 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3072                                 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3073                                 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3074                                 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3075                         */
3076
3077                         for(i = 2; i < endIndex; i++){
3078                                 numTempBuf[i] =         (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3079                                                                         (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3080                         }
3081                         --digIndx;
3082                 }
3083
3084                 // Subtract one off of the last byte.
3085                 numTempBuf[endIndex-1] -= 1;
3086
3087                 /*
3088                         We want to skip over the first two slots in the buffer. The first slot
3089                         is reserved for the header byte 0x1B. The second slot is for the
3090                         sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3091                 */
3092                 numTempBuf[0] = 0x1B;
3093                 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3094
3095                 // Now transfer the collation key to our collIterate struct.
3096                 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3097                   size = ((endIndex+1) & ~1)/2;
3098                   CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3099                                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3100                                 UCOL_BYTE_COMMON; // Tertiary weight.
3101                   i = 2; // Reset the index into the buffer.
3102                   while(i < endIndex)
3103                   {
3104                         primWeight = numTempBuf[i++] << 8;
3105                         if ( i < endIndex)
3106                                 primWeight |= numTempBuf[i++];
3107                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3108                   }
3109
3110                   if (numTempBuf != stackNumTempBuf)
3111                         free(numTempBuf);
3112       }
3113       else{
3114                   CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3115                   size = getExpansionCount(CE);
3116                   CE = *CEOffset++;
3117                   if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3118                         for(i = 1; i<size; i++) {
3119                           *(source->CEpos++) = *CEOffset++;
3120                         }
3121                   } else { /* else, we do */
3122                         while(*CEOffset != 0) {
3123                           *(source->CEpos++) = *CEOffset++;
3124                         }
3125                   }
3126           }
3127       return CE;
3128       }
3129     /* various implicits optimization */
3130     // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3131     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3132       //return getImplicit(cp, source, 0x04000000);
3133       return getImplicit(cp, source);
3134     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3135       /* UCA is filled with these. Tailorings are NOT_FOUND */
3136       //return getImplicit(cp, source, 0);
3137       return getImplicit(cp, source);
3138     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3139       return 0; /* broken surrogate sequence */
3140     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3141       UChar nextChar;
3142       if( source->flags & UCOL_USE_ITERATOR) {
3143         if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3144           cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3145           source->iterator->next(source->iterator);
3146           return getImplicit(cp, source);
3147         }  else {
3148           return 0;
3149         }
3150       } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3151         U_IS_TRAIL((nextChar=*source->pos))) {
3152         cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3153         source->pos++;
3154         return getImplicit(cp, source);
3155       } else {
3156         return 0; /* completely ignorable */
3157       }
3158     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3159       {
3160         const uint32_t
3161           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3162         //const uint32_t LCount = 19;
3163         const uint32_t VCount = 21;
3164         const uint32_t TCount = 28;
3165         //const uint32_t NCount = VCount * TCount;   // 588
3166         //const uint32_t SCount = LCount * NCount;   // 11172
3167         uint32_t L = ch - SBase;
3168
3169         // divide into pieces
3170
3171         uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3172         L /= TCount;
3173         uint32_t V = L % VCount;
3174         L /= VCount;
3175
3176         // offset them
3177
3178         L += LBase;
3179         V += VBase;
3180         T += TBase;
3181
3182         // return the first CE, but first put the rest into the expansion buffer
3183         if (!source->coll->image->jamoSpecial) { // FAST PATH
3184
3185           /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3186           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3187           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3188           if (T != TBase) {
3189               /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3190               /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3191               *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3192           }
3193
3194           /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3195           /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3196           return UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3197
3198         } else { // Jamo is Special
3199           // Since Hanguls pass the FCD check, it is
3200           // guaranteed that we won't be in
3201           // the normalization buffer if something like this happens
3202           // However, if we are using a uchar iterator and normalization
3203           // is ON, the Hangul that lead us here is going to be in that
3204           // normalization buffer. Here we want to restore the uchar
3205           // iterator state and pull out of the normalization buffer
3206           if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3207             source->flags = source->origFlags; // restore the iterator
3208             source->pos = NULL;
3209           }
3210           // Move Jamos into normalization buffer
3211           source->writableBuffer[0] = (UChar)L;
3212           source->writableBuffer[1] = (UChar)V;
3213           if (T != TBase) {
3214             source->writableBuffer[2] = (UChar)T;
3215             source->writableBuffer[3] = 0;
3216           } else {
3217             source->writableBuffer[2] = 0;
3218           }
3219
3220           source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3221                                                          //   after exhausting the writableBuffer
3222           source->pos   = source->writableBuffer;
3223           source->origFlags         = source->flags;
3224           source->flags            |= UCOL_ITER_INNORMBUF;
3225           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3226
3227           return(UCOL_IGNORABLE);
3228         }
3229       }
3230     case CHARSET_TAG:
3231     /* not yet implemented */
3232       /* probably after 1.8 */
3233       return UCOL_NOT_FOUND;
3234     default:
3235       *status = U_INTERNAL_PROGRAM_ERROR;
3236       CE=0;
3237       break;
3238     }
3239     if (CE <= UCOL_NOT_FOUND) break;
3240   }
3241   return CE;
3242 }
3243
3244
3245 /* now uses Mark's getImplicitPrimary code */
3246 static
3247 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3248   if(isNonChar(cp)) {
3249     return 0;
3250   }
3251
3252   uint32_t r = getImplicitPrimary(cp);
3253
3254   *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3255   collationSource->toReturn = collationSource->CEpos;
3256   return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3257 }
3258
3259 /**
3260  * This function handles the special CEs like contractions, expansions,
3261  * surrogates, Thai.
3262  * It is called by both getPrevCE
3263  */
3264 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3265                           collIterate *source,
3266                           UErrorCode *status)
3267 {
3268   const uint32_t *CEOffset    = NULL;
3269         UChar    *UCharOffset = NULL;
3270         UChar    schar;
3271   const UChar    *constart    = NULL;
3272         uint32_t size;
3273         UChar    buffer[UCOL_MAX_BUFFER];
3274         uint32_t *endCEBuffer;
3275         UChar   *strbuffer;
3276         int32_t noChars = 0;
3277
3278   for(;;)
3279   {
3280     /* the only ces that loops are thai and contractions */
3281     switch (getCETag(CE))
3282     {
3283     case NOT_FOUND_TAG:  /* this tag always returns */
3284       return CE;
3285     case SURROGATE_TAG:  /* This is a surrogate pair */
3286       /* essentialy an engaged lead surrogate. */
3287       /* if you have encountered it here, it means that a */
3288       /* broken sequence was encountered and this is an error */
3289       return 0;
3290     case THAI_TAG:
3291       if  ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
3292             source->string == source->pos        || /* At start of string.|| */
3293             /* previous char not Thai prevowel */
3294             /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3295             UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE)
3296             //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3297       {
3298           /* Treat Thai as a length one expansion */
3299           /* find the offset to expansion table */
3300           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE);
3301           CE = *CEOffset ++;
3302       }
3303       else
3304       {
3305           /*
3306           Move the prevowel and the following base Consonant into the
3307           normalization buffer with their order swapped
3308           */
3309           UChar *tempbuffer = source->writableBuffer +
3310                               (source->writableBufSize - 1);
3311           *(tempbuffer - 2) = 0;
3312           *(tempbuffer - 1) = peekCharacter(source, 0);
3313           *(tempbuffer)     = peekCharacter(source, -1);
3314
3315           /*
3316           Indicate where to continue in main input string after exhausting
3317           the writableBuffer
3318           */
3319           if (source->pos - 1 == source->string) {
3320               source->fcdPosition = NULL;
3321           } else {
3322             source->fcdPosition       = source->pos-2;
3323           }
3324
3325           source->pos               = tempbuffer;
3326           source->origFlags         = source->flags;
3327           source->flags            |= UCOL_ITER_INNORMBUF;
3328           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3329
3330           //CE = UCOL_IGNORABLE;
3331           return(UCOL_IGNORABLE);
3332       }
3333       break;
3334     case SPEC_PROC_TAG:
3335       {
3336         // Special processing is getting a CE that is preceded by a certain prefix
3337         // Currently this is only needed for optimizing Japanese length and iteration marks.
3338         // When we encouter a special processing tag, we go backwards and try to see if
3339         // we have a match.
3340         // Contraction tables are used - so the whole process is not unlike contraction.
3341         // prefix data is stored backwards in the table.
3342         const UChar *UCharOffset;
3343         UChar schar, tchar;
3344         collIterateState prefixState;
3345         backupState(source, &prefixState);
3346         for(;;) {
3347         // This loop will run once per source string character, for as long as we
3348         //  are matching a potential contraction sequence
3349
3350           // First we position ourselves at the begining of contraction sequence
3351           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3352
3353           if (collIter_bos(source)) {
3354             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3355             break;
3356           }
3357           schar = getPrevNormalizedChar(source);
3358           goBackOne(source);
3359
3360           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3361             UCharOffset++;
3362           }
3363
3364           if (schar == tchar) {
3365               // Found the source string char in the table.
3366               //  Pick up the corresponding CE from the table.
3367               CE = *(coll->contractionCEs +
3368                   (UCharOffset - coll->contractionIndex));
3369           }
3370           else
3371           {
3372               // if there is a completely ignorable code point in the middle of
3373               // a prefix, we need to act as if it's not there
3374               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3375               // lone surrogates cannot be set to zero as it would break other processing
3376               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3377               // it's easy for BMP code points
3378               if(isZeroCE == 0) {
3379                 continue;
3380               } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3381                 // for supplementary code points, we have to check the next one
3382                 // situations where we are going to ignore
3383                 // 1. beginning of the string: schar is a lone surrogate
3384                 // 2. schar is a lone surrogate
3385                 // 3. schar is a trail surrogate in a valid surrogate sequence
3386                 //    that is explicitly set to zero.
3387                 if (!collIter_bos(source)) {
3388                   UChar lead;
3389                   if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3390                     isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3391                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3392                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3393                       if(finalCE == 0) {
3394                         // this is a real, assigned completely ignorable code point
3395                         goBackOne(source);
3396                         continue;
3397                       }
3398                     }
3399                   } else {
3400                     // lone surrogate, completely ignorable
3401                     continue;
3402                   }
3403                 } else {
3404                   // lone surrogate at the beggining, completely ignorable
3405                   continue;
3406                 }
3407               }
3408               // Source string char was not in the table.
3409               //   We have not found the prefix.
3410               CE = *(coll->contractionCEs +
3411                   (ContractionStart - coll->contractionIndex));
3412           }
3413
3414           if(!isPrefix(CE)) {
3415               // The source string char was in the contraction table, and the corresponding
3416               //   CE is not a prefix CE.  We found the prefix, break
3417               //   out of loop, this CE will end up being returned.  This is the normal
3418               //   way out of prefix handling when the source actually contained
3419               //   the prefix.
3420               break;
3421           }
3422         }
3423       loadState(source, &prefixState, TRUE);
3424       break;
3425       }
3426
3427     case CONTRACTION_TAG:
3428         /* to ensure that the backwards and forwards iteration matches, we
3429         take the current region of most possible match and pass it through
3430         the forward iteration. this will ensure that the obstinate problem of
3431         overlapping contractions will not occur.
3432         */
3433         schar = peekCharacter(source, 0);
3434         constart = (UChar *)coll->image + getContractOffset(CE);
3435         if (isAtStartPrevIterate(source)
3436             /* commented away contraction end checks after adding the checks
3437             in getPrevCE  */) {
3438             /* start of string or this is not the end of any contraction */
3439             CE = *(coll->contractionCEs +
3440                      (constart - coll->contractionIndex));
3441             break;
3442         }
3443         strbuffer = buffer;
3444         UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3445         *(UCharOffset --) = 0;
3446         noChars = 0;
3447         // have to swap thai characters
3448         while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIBASECONSONANT(schar)) {
3449             *(UCharOffset) = schar;
3450             noChars++;
3451             UCharOffset --;
3452             schar = getPrevNormalizedChar(source);
3453             goBackOne(source);
3454             // TODO: when we exhaust the contraction buffer,
3455             // it needs to get reallocated. The problem is
3456             // that the size depends on the string which is
3457             // not iterated over. However, since we're travelling
3458             // backwards, we already had to set the iterator at
3459             // the end - so we might as well know where we are?
3460             if (UCharOffset + 1 == buffer) {
3461                 /* we have exhausted the buffer */
3462               int32_t newsize = 0;
3463               if(source->pos) { // actually dealing with a position
3464                 newsize = source->pos - source->string + 1;
3465               } else { // iterator
3466                 newsize = 4 * UCOL_MAX_BUFFER;
3467               }
3468                 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3469                                              (newsize + UCOL_MAX_BUFFER));
3470                                 /* test for NULL */
3471                                 if (strbuffer == NULL) {
3472                                         *status = U_MEMORY_ALLOCATION_ERROR;
3473                                         return UCOL_NO_MORE_CES;
3474                                 }
3475                 UCharOffset = strbuffer + newsize;
3476                 uprv_memcpy(UCharOffset, buffer,
3477                                              UCOL_MAX_BUFFER * sizeof(UChar));
3478                 UCharOffset --;
3479             }
3480             if ((source->pos && (source->pos == source->string ||
3481                 ((source->flags & UCOL_ITER_INNORMBUF) &&
3482                 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3483                 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3484                 break;
3485             }
3486         }
3487         /* adds the initial base character to the string */
3488         *(UCharOffset) = schar;
3489         noChars++;
3490
3491         /* a new collIterate is used to simply things, since using the current
3492         collIterate will mean that the forward and backwards iteration will
3493         share and change the same buffers. we don't want to get into that. */
3494         collIterate temp;
3495         //IInit_collIterate(coll, UCharOffset, -1, &temp);
3496         IInit_collIterate(coll, UCharOffset, noChars, &temp);
3497         temp.flags &= ~UCOL_ITER_NORM;
3498
3499         CE = ucol_IGetNextCE(coll, &temp, status);
3500         endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3501         while (CE != UCOL_NO_MORE_CES) {
3502             *(source->CEpos ++) = CE;
3503             if (source->CEpos == endCEBuffer) {
3504                 /* ran out of CE space, bail.
3505                 there's no guarantee of the right character position after
3506                 this bail*/
3507                 *status = U_BUFFER_OVERFLOW_ERROR;
3508                 source->CEpos = source->CEs;
3509                 freeHeapWritableBuffer(&temp);
3510                 if (strbuffer != buffer) {
3511                     uprv_free(strbuffer);
3512                 }
3513                 return UCOL_NULLORDER;
3514             }
3515             CE = ucol_IGetNextCE(coll, &temp, status);
3516         }
3517         freeHeapWritableBuffer(&temp);
3518         if (strbuffer != buffer) {
3519             uprv_free(strbuffer);
3520         }
3521         source->toReturn = source->CEpos - 1;
3522         if (source->toReturn == source->CEs) {
3523             source->CEpos = source->CEs;
3524         }
3525         return *(source->toReturn);
3526     case LONG_PRIMARY_TAG:
3527       {
3528         *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3529         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3530         source->toReturn = source->CEpos - 1;
3531         return *(source->toReturn);
3532       }
3533     case EXPANSION_TAG: /* this tag always returns */
3534       /*
3535       This should handle expansion.
3536       NOTE: we can encounter both continuations and expansions in an expansion!
3537       I have to decide where continuations are going to be dealt with
3538       */
3539       /* find the offset to expansion table */
3540       CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3541       size     = getExpansionCount(CE);
3542       if (size != 0) {
3543         /*
3544         if there are less than 16 elements in expansion, we don't terminate
3545         */
3546         uint32_t count;
3547         for (count = 0; count < size; count++) {
3548           *(source->CEpos ++) = *CEOffset++;
3549         }
3550       }
3551       else {
3552         /* else, we do */
3553         while (*CEOffset != 0) {
3554           *(source->CEpos ++) = *CEOffset ++;
3555         }
3556       }
3557       source->toReturn = source->CEpos - 1;
3558       // in case of one element expansion, we
3559       // want to immediately return CEpos
3560       if(source->toReturn == source->CEs) {
3561         source->CEpos = source->CEs;
3562       }
3563       return *(source->toReturn);
3564      case DIGIT_TAG:
3565       {
3566       /*
3567          We do a check to see if we want to collate digits as numbers; if so we generate
3568          a custom collation key. Otherwise we pull out the value stored in the expansion table.
3569       */
3570       uint32_t size;
3571       uint32_t i;    /* general counter */
3572
3573       if (coll->numericCollation == UCOL_ON){
3574                 UChar32 char32 = 0;
3575
3576                 uint32_t digIndx = 0;
3577                 uint32_t endIndex = 0;
3578                 uint32_t leadingZeroIndex = 0;
3579                 uint32_t trailingZeroCount = 0;
3580
3581                 uint32_t primWeight = 0;
3582
3583                 uint32_t digVal = 0;
3584                 uint8_t collateVal = 0;
3585
3586                 UBool nonZeroValReached = false;
3587
3588                 uint8_t *numTempBuf;
3589                 uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3590                 uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3591
3592                 numTempBuf = stackNumTempBuf;
3593                 /*
3594                          We parse the source string until we hit a char that's NOT a digit.
3595                 Use this u_charDigitValue. This might be slow because we have to
3596                 handle surrogates...
3597         */
3598
3599         if (U16_IS_TRAIL (ch)){
3600                 if (!collIter_bos(source)){
3601                                 char32 = U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source),ch);
3602                                 goBackOne(source);
3603                         }
3604                         else
3605                                 char32 = ch;
3606         }
3607                 else
3608                         char32 = ch;
3609                 digVal = u_charDigitValue(char32);
3610
3611         for(;;){
3612                         // Make sure we have enough space.
3613                         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3614                         {
3615                                 numTempBufSize *= 2;
3616                                 if (numTempBuf == stackNumTempBuf){
3617                                         numTempBuf = (uint8_t *)malloc(sizeof(uint8_t) * numTempBufSize);
3618                                         memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3619                                 }else
3620                                         realloc(numTempBuf, numTempBufSize);
3621                         }
3622
3623                         // Skip over trailing zeroes, and keep a count of them.
3624                         if (digVal != 0)
3625                                 nonZeroValReached = true;
3626                 if (nonZeroValReached){
3627                                 /*
3628                                         We parse the digit string into base 100 numbers (this fits into a byte).
3629                                         We only add to the buffer in twos, thus if we are parsing an odd character,
3630                                         that serves as the 'tens' digit while the if we are parsing an even one, that
3631                                         is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3632                                         a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3633                                         overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3634                                         than all the other bytes.
3635
3636                                         Since we're doing in this reverse we want to put the first digit encountered into the
3637                                         ones place and the second digit encountered into the tens place.
3638                                  */
3639
3640                                 if ((digIndx + trailingZeroCount) % 2 == 1){
3641                                         // High-order digit case (tens place)
3642                                         collateVal += digVal * 10;
3643
3644                                         // We cannot set leadingZeroIndex unless it has been set for the
3645                                         // low-order digit. Therefore, all we can do for the high-order
3646                                         // digit is turn it off, never on.
3647                                         // The only time we will have a high digit without a low is for
3648                                         // the very first non-zero digit, so no zero check is necessary.
3649                                         if (collateVal != 0)
3650                                                 leadingZeroIndex = 0;
3651
3652                                         numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3653                                         collateVal = 0;
3654                                 }
3655                                 else{
3656                                         // Low-order digit case (ones place)
3657                                         collateVal = digVal;
3658
3659                                         // Check for leading zeroes.
3660                                         if (collateVal == 0)
3661                                         {
3662                                                 if (!leadingZeroIndex)
3663                                                         leadingZeroIndex = (digIndx/2) + 2;
3664                                         }
3665                                         else
3666                                                 leadingZeroIndex = 0;
3667
3668                                         // No need to write to buffer; the case of a last odd digit
3669                                         // is handled below.
3670                                 }
3671                         ++digIndx;
3672                 }
3673                 else
3674                         ++trailingZeroCount;
3675
3676                 if (!collIter_bos(source)){
3677                                 ch = getPrevNormalizedChar(source);
3678                                 goBackOne(source);
3679                                 if (U16_IS_TRAIL(ch)){
3680                                         if (!collIter_bos(source))
3681                                         {
3682                                                 char32 = U16_GET_SUPPLEMENTARY(getPrevNormalizedChar(source),ch);
3683                                                 goBackOne(source);
3684                                         }
3685                                 }
3686                                 else
3687                                         char32 = ch;
3688
3689                                 if ((digVal = u_charDigitValue(char32)) == -1){
3690                                         // Don't need to "reverse" the goBackOne call,
3691                                         // as this points to the next position to process..
3692                                         if (char32 > 0xFFFF) // For surrogates.
3693                                                 getNextNormalizedChar(source);
3694                                         break;
3695                                 }
3696                         }else
3697                                 break;
3698                 }
3699
3700                 if (nonZeroValReached == false){
3701                         digIndx = 2;
3702                         trailingZeroCount = 0;
3703                         numTempBuf[2] = 6;
3704                 }
3705
3706                 if ((digIndx + trailingZeroCount) % 2 != 0){
3707                                 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
3708                                 digIndx += 1;
3709                 }
3710
3711                 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
3712
3713                 // Subtract one off of the last byte. Really the first byte here, but it's reversed...
3714                 numTempBuf[2] -= 1;
3715
3716                 /*
3717                         We want to skip over the first two slots in the buffer. The first slot
3718                         is reserved for the header byte 0x1B. The second slot is for the
3719                         sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3720                         The exponent must be adjusted by the number of leading zeroes, and the number of
3721                         trailing zeroes.
3722                 */
3723                 numTempBuf[0] = 0x1B;
3724                 uint32_t exponent = (digIndx+trailingZeroCount)/2;
3725                 if (leadingZeroIndex)
3726                         exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
3727                 numTempBuf[1] = 0x80 + (exponent & 0x7F);
3728
3729                 // Now transfer the collation key to our collIterate struct.
3730                 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3731                 //size = ((endIndex+1) & ~1)/2;
3732                   *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3733                                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3734                                 UCOL_BYTE_COMMON; // Tertiary weight.
3735                   i = endIndex - 1; // Reset the index into the buffer.
3736                   while(i >= 2)
3737                   {
3738                         primWeight = numTempBuf[i--] << 8;
3739                         if ( i >= 2)
3740                                 primWeight |= numTempBuf[i--];
3741                         *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3742                   }
3743                   if (numTempBuf != stackNumTempBuf)
3744                         free(numTempBuf);
3745
3746                   source->toReturn = source->CEpos -1;
3747                   return *(source->toReturn);
3748       }
3749       else{
3750                 /* find the offset to expansion table */
3751                   CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3752                   size     = getExpansionCount(CE);
3753                   if (size != 0) {
3754                         /*
3755                         if there are less than 16 elements in expansion, we don't terminate
3756                         */
3757                         uint32_t count;
3758                         for (count = 0; count < size; count++) {
3759                           *(source->CEpos ++) = *CEOffset++;
3760                         }
3761                   }
3762                   else {
3763                         /* else, we do */
3764                         while (*CEOffset != 0) {
3765                           *(source->CEpos ++) = *CEOffset ++;
3766                         }
3767                   }
3768                   source->toReturn = source->CEpos - 1;
3769           // in case of one element expansion, we
3770           // want to immediately return CEpos
3771           if(source->toReturn == source->CEs) {
3772             source->CEpos = source->CEs;
3773           }
3774                   return *(source->toReturn);
3775           }
3776       }
3777     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3778       {
3779         const uint32_t
3780           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3781         //const uint32_t LCount = 19;
3782         const uint32_t VCount = 21;
3783         const uint32_t TCount = 28;
3784         //const uint32_t NCount = VCount * TCount;   /* 588 */
3785         //const uint32_t SCount = LCount * NCount;   /* 11172 */
3786
3787         uint32_t L = ch - SBase;
3788         /*
3789         divide into pieces.
3790         we do it in this order since some compilers can do % and / in one
3791         operation
3792         */
3793         uint32_t T = L % TCount;
3794         L /= TCount;
3795         uint32_t V = L % VCount;
3796         L /= VCount;
3797
3798         /* offset them */
3799         L += LBase;
3800         V += VBase;
3801         T += TBase;
3802
3803         /*
3804         return the first CE, but first put the rest into the expansion buffer
3805         */
3806         if (!source->coll->image->jamoSpecial)
3807         {
3808           /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
3809           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3810           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3811           /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
3812           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3813           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3814           if (T != TBase)
3815             /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
3816             /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3817             *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3818
3819           source->toReturn = source->CEpos - 1;
3820           return *(source->toReturn);
3821         } else {
3822           // Since Hanguls pass the FCD check, it is
3823           // guaranteed that we won't be in
3824           // the normalization buffer if something like this happens
3825           // Move Jamos into normalization buffer
3826           /*
3827           Move the Jamos into the
3828           normalization buffer
3829           */
3830           UChar *tempbuffer = source->writableBuffer +
3831                               (source->writableBufSize - 1);
3832           *(tempbuffer) = 0;
3833           if (T != TBase) {
3834             *(tempbuffer - 1) = (UChar)T;
3835             *(tempbuffer - 2) = (UChar)V;
3836             *(tempbuffer - 3) = (UChar)L;
3837             *(tempbuffer - 4) = 0;
3838           } else {
3839             *(tempbuffer - 1) = (UChar)V;
3840             *(tempbuffer - 2) = (UChar)L;
3841             *(tempbuffer - 3) = 0;
3842           }
3843
3844           /*
3845           Indicate where to continue in main input string after exhausting
3846           the writableBuffer
3847           */
3848           if (source->pos  == source->string) {
3849             source->fcdPosition = NULL;
3850           } else {
3851             source->fcdPosition       = source->pos-1;
3852           }
3853
3854           source->pos               = tempbuffer;
3855           source->origFlags         = source->flags;
3856           source->flags            |= UCOL_ITER_INNORMBUF;
3857           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3858
3859           return(UCOL_IGNORABLE);
3860         }
3861       }
3862     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3863       return 0; /* broken surrogate sequence */
3864     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3865     {
3866       UChar32 cp = 0;
3867       UChar  prevChar;
3868       UChar *prev;
3869       if (isAtStartPrevIterate(source)) {
3870           /* we are at the start of the string, wrong place to be at */
3871           return 0;
3872       }
3873       if (source->pos != source->writableBuffer) {
3874           prev     = source->pos - 1;
3875       } else {
3876           prev     = source->fcdPosition;
3877       }
3878       prevChar = *prev;
3879
3880       /* Handles Han and Supplementary characters here.*/
3881       if (UTF_IS_FIRST_SURROGATE(prevChar)) {
3882         cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
3883         source->pos = prev;
3884       } else {
3885         return 0; /* completely ignorable */
3886       }
3887       return getPrevImplicit(cp, source);
3888     }
3889     // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
3890     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3891       return getPrevImplicit(ch, source);
3892     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3893       return getPrevImplicit(ch, source);
3894       /* UCA is filled with these. Tailorings are NOT_FOUND */
3895     /* not yet implemented */
3896     case CHARSET_TAG:  /* this tag always returns */
3897       /* probably after 1.8 */
3898       return UCOL_NOT_FOUND;
3899     default:           /* this tag always returns */
3900       *status = U_INTERNAL_PROGRAM_ERROR;
3901       CE=0;
3902       break;
3903     }
3904     if (CE <= UCOL_NOT_FOUND) {
3905       break;
3906     }
3907   }
3908   return CE;
3909 }
3910
3911 /* This should really be a macro        */
3912 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
3913 /* anyway */
3914 static
3915 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
3916 #ifdef UCOL_DEBUG
3917   fprintf(stderr, ".");
3918 #endif
3919   uint8_t *newStart = NULL;
3920   uint32_t offset = *secondaries-secStart;
3921
3922   if(secStart==second) {
3923     newStart=(uint8_t*)uprv_malloc(newSize);
3924     if(newStart==NULL) {
3925       *status = U_MEMORY_ALLOCATION_ERROR;
3926       return NULL;
3927     }
3928     uprv_memcpy(newStart, secStart, *secondaries-secStart);
3929   } else {
3930     newStart=(uint8_t*)uprv_realloc(secStart, newSize);
3931     if(newStart==NULL) {
3932       *status = U_MEMORY_ALLOCATION_ERROR;
3933       return NULL;
3934     }
3935   }
3936   *secondaries=newStart+offset;
3937   *secSize=newSize;
3938   return newStart;
3939 }
3940
3941
/*
 * uprv_ucol_reverse_buffer(TYPE, start, end)
 *
 * Reverses, in place, the run of TYPE elements that lies between the pointers
 * start and end (both inclusive). We need this operation when doing
 * continuation secondaries in French.
 *
 * start and end must be modifiable pointer lvalues: the macro advances start
 * and retreats end while swapping, so after it has run the two pointers have
 * met or crossed and no longer delimit the run.
 *
 * The body is wrapped in do { } while(0) so that an invocation followed by a
 * semicolon is exactly one statement and can be used safely as the unbraced
 * branch of an if/else.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) do { \
  TYPE tempA; \
  while((start)<(end)) { \
    tempA = *(start); \
    *(start)++ = *(end); \
    *(end)-- = tempA; \
  } \
} while(0)
3964
3965 /****************************************************************************/
3966 /* Following are the sortkey generation functions                           */
3967 /*                                                                          */
3968 /****************************************************************************/
3969
3970 /**
3971  * Merge two sort keys.
3972  * This is useful, for example, to combine sort keys from first and last names
3973  * to sort such pairs.
3974  * Merged sort keys consider on each collation level the first part first entirely,
3975  * then the second one.
3976  * It is possible to merge multiple sort keys by consecutively merging
3977  * another one with the intermediate result.
3978  *
3979  * The length of the merge result is the sum of the lengths of the input sort keys
3980  * minus 1.
3981  *
3982  * @param src1 the first sort key
3983  * @param src1Length the length of the first sort key, including the zero byte at the end;
3984  *        can be -1 if the function is to find the length
3985  * @param src2 the second sort key
3986  * @param src2Length the length of the second sort key, including the zero byte at the end;
3987  *        can be -1 if the function is to find the length
3988  * @param dest the buffer where the merged sort key is written,
3989  *        can be NULL if destCapacity==0
3990  * @param destCapacity the number of bytes in the dest buffer
3991  * @return the length of the merged sort key, src1Length+src2Length-1;
3992  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
3993  *         in which cases the contents of dest is undefined
3994  *
3995  * @draft
3996  */
3997 U_CAPI int32_t U_EXPORT2
3998 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
3999                    const uint8_t *src2, int32_t src2Length,
4000                    uint8_t *dest, int32_t destCapacity) {
4001     int32_t destLength;
4002     uint8_t b;
4003
4004     /* check arguments */
4005     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4006         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4007         destCapacity<0 || (destCapacity>0 && dest==NULL)
4008     ) {
4009         /* error, attempt to write a zero byte and return 0 */
4010         if(dest!=NULL && destCapacity>0) {
4011             *dest=0;
4012         }
4013         return 0;
4014     }
4015
4016     /* check lengths and capacity */
4017     if(src1Length<0) {
4018         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4019     }
4020     if(src2Length<0) {
4021         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4022     }
4023
4024     destLength=src1Length+src2Length-1;
4025     if(destLength>destCapacity) {
4026         /* the merged sort key does not fit into the destination */
4027         return destLength;
4028     }
4029
4030     /* merge the sort keys with the same number of levels */
4031     while(*src1!=0 && *src2!=0) { /* while both have another level */
4032         /* copy level from src1 not including 00 or 01 */
4033         while((b=*src1)>=2) {
4034             ++src1;
4035             *dest++=b;
4036         }
4037
4038         /* add a 02 merge separator */
4039         *dest++=2;
4040
4041         /* copy level from src2 not including 00 or 01 */
4042         while((b=*src2)>=2) {
4043             ++src2;
4044             *dest++=b;
4045         }
4046
4047         /* if both sort keys have another level, then add a 01 level separator and continue */
4048         if(*src1==1 && *src2==1) {
4049             ++src1;
4050             ++src2;
4051             *dest++=1;
4052         }
4053     }
4054
4055     /*
4056      * here, at least one sort key is finished now, but the other one
4057      * might have some contents left from containing more levels;
4058      * that contents is just appended to the result
4059      */
4060     if(*src1!=0) {
4061         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4062         src2=src1;
4063     }
4064     /* append src2, "the other, unfinished sort key" */
4065     uprv_strcpy((char *)dest, (const char *)src2);
4066
4067     /* trust that neither sort key contained illegally embedded zero bytes */
4068     return destLength;
4069 }
4070
4071 /* sortkey API */
4072 U_CAPI int32_t U_EXPORT2
4073 ucol_getSortKey(const    UCollator    *coll,
4074         const    UChar        *source,
4075         int32_t        sourceLength,
4076         uint8_t        *result,
4077         int32_t        resultLength)
4078 {
4079   UErrorCode status = U_ZERO_ERROR;
4080
4081   if(source == NULL) {
4082     // this is actually an error situation, but we would need to
4083     // have an error code to return it. Until we introduce a new
4084     // API, it stays like this
4085     return 0;
4086   }
4087   /* this uses the function pointer that is set in updateinternalstate */
4088   /* currently, there are two funcs: */
4089   /*ucol_calcSortKey(...);*/
4090   /*ucol_calcSortKeySimpleTertiary(...);*/
4091
4092   int32_t keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4093   //((UCollator *)coll)->errorCode = status; /*semantically const */
4094   return keySize;
4095 }
4096
4097 /* this function is called by the C++ API for sortkey generation */
4098 U_CFUNC int32_t
4099 ucol_getSortKeyWithAllocation(const UCollator *coll,
4100                               const UChar *source, int32_t sourceLength,
4101                               uint8_t **pResult,
4102                               UErrorCode *pErrorCode) {
4103     *pResult = 0;
4104     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4105 }
4106
4107 #define UCOL_FSEC_BUF_SIZE 256
4108
4109 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4110 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
4111 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4112     UErrorCode status = U_ZERO_ERROR;
4113     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4114     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4115     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4116     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4117     UBool  doCase = (coll->caseLevel == UCOL_ON);
4118     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4119     //UBool  qShifted = shifted  && (compareQuad == 0);
4120     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4121     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4122     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4123     uint8_t *fSecs = fSecsBuff;
4124     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4125     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4126
4127     uint32_t variableTopValue = coll->variableTopValue;
4128     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4129     if(doHiragana) {
4130       UCOL_COMMON_BOT4++;
4131       /* allocate one more space for hiragana */
4132     }
4133     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4134
4135     uint32_t order = UCOL_NO_MORE_CES;
4136     uint8_t primary1 = 0;
4137     uint8_t primary2 = 0;
4138     uint8_t secondary = 0;
4139     uint8_t tertiary = 0;
4140     int32_t caseShift = 0;
4141     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4142
4143     uint8_t caseSwitch = coll->caseSwitch;
4144     uint8_t tertiaryMask = coll->tertiaryMask;
4145     uint8_t tertiaryCommon = coll->tertiaryCommon;
4146
4147     UBool wasShifted = FALSE;
4148     UBool notIsContinuation = FALSE;
4149     uint8_t leadPrimary = 0;
4150
4151
4152     for(;;) {
4153           order = ucol_IGetNextCE(coll, s, &status);
4154           if(order == UCOL_NO_MORE_CES) {
4155               break;
4156           }
4157
4158           if(order == 0) {
4159             continue;
4160           }
4161
4162           notIsContinuation = !isContinuation(order);
4163
4164
4165           if(notIsContinuation) {
4166             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4167           } else {
4168             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4169           }
4170           secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4171           primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4172           primary1 = (uint8_t)(order >> 8);
4173
4174
4175           if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4176             || (!notIsContinuation && wasShifted))
4177             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4178             /* and other ignorables should be removed if following a shifted code point */
4179             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4180                                 /* we should just completely ignore it */
4181               continue;
4182             }
4183             if(compareQuad == 0) {
4184               if(c4 > 0) {
4185                 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4186                 c4 = 0;
4187               }
4188               currentSize++;
4189               if(primary2 != 0) {
4190                 currentSize++;
4191               }
4192             }
4193             wasShifted = TRUE;
4194           } else {
4195             wasShifted = FALSE;
4196             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4197             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4198             /* calculate sortkey size */
4199             if(primary1 != UCOL_IGNORABLE) {
4200               if(notIsContinuation) {
4201                 if(leadPrimary == primary1) {
4202                   currentSize++;
4203                 } else {
4204                   if(leadPrimary != 0) {
4205                     currentSize++;
4206                   }
4207                   if(primary2 == UCOL_IGNORABLE) {
4208                   /* one byter, not compressed */
4209                       currentSize++;
4210                       leadPrimary = 0;
4211                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4212                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4213                       (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4214                   /* not compressible */
4215                       leadPrimary = 0;
4216                       currentSize+=2;
4217                   } else { /* compress */
4218                       leadPrimary = primary1;
4219                       currentSize+=2;
4220                   }
4221                 }
4222               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4223                 currentSize++;
4224                 if(primary2 != UCOL_IGNORABLE) {
4225                   currentSize++;
4226                 }
4227               }
4228             }
4229
4230             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4231               if(!isFrenchSec){
4232                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4233                   c2++;
4234                 } else {
4235                   if(c2 > 0) {
4236                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4237                       currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4238                     } else {
4239                       currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4240                     }
4241                     c2 = 0;
4242                   }
4243                   currentSize++;
4244                 }
4245               } else {
4246                 fSecs[fSecsLen++] = secondary;
4247                 if(fSecsLen == fSecsMaxLen) {
4248                   if(fSecs == fSecsBuff) {
4249                     fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4250                   } else {
4251                     fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4252                   }
4253                   if(fSecs == NULL) {
4254                     status = U_MEMORY_ALLOCATION_ERROR;
4255                     return -1;
4256                   }
4257                   fSecsMaxLen *= 2;
4258                 }
4259                 if(notIsContinuation) {
4260                   if (frenchStartPtr != NULL) {
4261                       /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4262                     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4263                     frenchStartPtr = NULL;
4264                   }
4265                 } else {
4266                   if (frenchStartPtr == NULL) {
4267                     frenchStartPtr = fSecs+fSecsLen-2;
4268                   }
4269                   frenchEndPtr = fSecs+fSecsLen-1;
4270                 }
4271               }
4272             }
4273
4274             if(doCase) {
4275               if (caseShift  == 0) {
4276                 currentSize++;
4277                 caseShift = UCOL_CASE_SHIFT_START;
4278               }
4279               if((tertiary&0x3F) > 0 && notIsContinuation) {
4280                 caseShift--;
4281                 if((tertiary &0xC0) != 0) {
4282                   if (caseShift  == 0) {
4283                     currentSize++;
4284                     caseShift = UCOL_CASE_SHIFT_START;
4285                   }
4286                   caseShift--;
4287                 }
4288               }
4289             } else {
4290               if(notIsContinuation) {
4291                 tertiary ^= caseSwitch;
4292               }
4293             }
4294
4295             tertiary &= tertiaryMask;
4296             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4297               if (tertiary == tertiaryCommon && notIsContinuation) {
4298                 c3++;
4299               } else {
4300                 if(c3 > 0) {
4301                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4302                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4303                     currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4304                   } else {
4305                     currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4306                   }
4307                   c3 = 0;
4308                 }
4309                 currentSize++;
4310               }
4311             }
4312
4313             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4314               if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4315                 if(c4>0) { // Close this part
4316                   currentSize += (c4/UCOL_BOT_COUNT4)+1;
4317                   c4 = 0;
4318                 }
4319                 currentSize++; // Add the Hiragana
4320               } else { // This wasn't Hiragana, so we can continue adding stuff
4321                 c4++;
4322               }
4323             }
4324
4325           }
4326     }
4327
4328     if(!isFrenchSec){
4329       if(c2 > 0) {
4330         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4331       }
4332     } else {
4333       uint32_t i = 0;
4334       if(frenchStartPtr != NULL) {
4335         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4336       }
4337       for(i = 0; i<fSecsLen; i++) {
4338         secondary = *(fSecs+fSecsLen-i-1);
4339         /* This is compression code. */
4340         if (secondary == UCOL_COMMON2) {
4341           ++c2;
4342         } else {
4343           if(c2 > 0) {
4344             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4345               currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4346             } else {
4347               currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4348             }
4349             c2 = 0;
4350           }
4351           currentSize++;
4352         }
4353       }
4354       if(c2 > 0) {
4355         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4356       }
4357       if(fSecs != fSecsBuff) {
4358         uprv_free(fSecs);
4359       }
4360     }
4361
4362     if(c3 > 0) {
4363       currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4364     }
4365
4366     if(c4 > 0  && compareQuad == 0) {
4367       currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4368     }
4369
4370     if(compareIdent) {
4371       currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4372     }
4373     return currentSize;
4374
4375 }
4376
4377 static
4378 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4379   if (caseShift  == 0) {
4380     *(*cases)++ = UCOL_CASE_BYTE_START;
4381     caseShift = UCOL_CASE_SHIFT_START;
4382   }
4383 }
4384
// Appends value at primaries and moves primaries forward, but only while
// primaries is still below limit. size is incremented in either case, so it
// tells how many values the caller wanted to add, even if not all of them
// were stored.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
  ++size;
  if(primaries >= limit) {
    // no room left: the value is counted, but dropped
    return;
  }
  *primaries = value;
  ++primaries;
}
4394
4395 // Packs the secondary buffer when processing French locale. Adds the terminator.
4396 static
4397 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4398   uint8_t secondary;
4399   int32_t count2 = 0;
4400   uint32_t i = 0, size = 0;
4401   // we use i here since the key size already accounts for terminators, so we'll discard the increment
4402   addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4403   /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4404   if(frenchStartPtr != NULL) {
4405     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4406   }
4407   for(i = 0; i<*secsize; i++) {
4408     secondary = *(secondaries-i-1);
4409     /* This is compression code. */
4410     if (secondary == UCOL_COMMON2) {
4411       ++count2;
4412     } else {
4413       if (count2 > 0) {
4414         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4415           while (count2 > UCOL_TOP_COUNT2) {
4416             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4417             count2 -= (uint32_t)UCOL_TOP_COUNT2;
4418           }
4419           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4420         } else {
4421           while (count2 > UCOL_BOT_COUNT2) {
4422             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4423             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4424           }
4425           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4426         }
4427         count2 = 0;
4428       }
4429       addWithIncrement(primaries, primEnd, size, secondary);
4430     }
4431   }
4432   if (count2 > 0) {
4433     while (count2 > UCOL_BOT_COUNT2) {
4434       addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4435       count2 -= (uint32_t)UCOL_BOT_COUNT2;
4436     }
4437     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4438   }
4439   *secsize = size;
4440   return primaries;
4441 }
4442
4443 /* This is the sortkey work horse function */
4444 U_CFUNC int32_t U_CALLCONV
4445 ucol_calcSortKey(const    UCollator    *coll,
4446         const    UChar        *source,
4447         int32_t        sourceLength,
4448         uint8_t        **result,
4449         uint32_t        resultLength,
4450         UBool allocateSKBuffer,
4451         UErrorCode *status)
4452 {
4453     uint32_t i = 0; /* general purpose counter */
4454
4455     /* Stack allocated buffers for buffers we use */
4456     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4457
4458     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4459
4460     if(U_FAILURE(*status)) {
4461       return 0;
4462     }
4463
4464     if(primaries == NULL && allocateSKBuffer == TRUE) {
4465         primaries = *result = prim;
4466         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4467     }
4468
4469     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4470       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4471
4472     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4473
4474     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4475     UChar *normSource = normBuffer;
4476     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4477
4478     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4479
4480     UColAttributeValue strength = coll->strength;
4481
4482     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4483     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4484     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4485     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4486     UBool  doCase = (coll->caseLevel == UCOL_ON);
4487     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4488     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4489     //UBool  qShifted = shifted && (compareQuad == 0);
4490     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4491     const uint8_t *scriptOrder = coll->scriptOrder;
4492
4493     uint32_t variableTopValue = coll->variableTopValue;
4494     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4495     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4496     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4497     uint8_t UCOL_HIRAGANA_QUAD = 0;
4498     if(doHiragana) {
4499       UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4500       /* allocate one more space for hiragana, value for hiragana */
4501     }
4502     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4503
4504     /* support for special features like caselevel and funky secondaries */
4505     uint8_t *frenchStartPtr = NULL;
4506     uint8_t *frenchEndPtr = NULL;
4507     uint32_t caseShift = 0;
4508
4509     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4510
4511     /* If we need to normalize, we'll do it all at once at the beginning! */
4512     UNormalizationMode normMode;
4513     if(compareIdent) {
4514         normMode = UNORM_NFD;
4515     } else if(coll->normalizationMode != UCOL_OFF) {
4516         normMode = UNORM_FCD;
4517     } else {
4518         normMode = UNORM_NONE;
4519     }
4520
4521     if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4522         len = unorm_internalNormalize(normSource, normSourceLen,
4523                                       source, len,
4524                                       normMode, FALSE,
4525                                       status);
4526         if(*status == U_BUFFER_OVERFLOW_ERROR) {
4527             normSourceLen = len;
4528             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4529             if(normSource == NULL) {
4530                 *status = U_MEMORY_ALLOCATION_ERROR;
4531                 return 0;
4532             }
4533             *status = U_ZERO_ERROR;
4534             len = unorm_internalNormalize(normSource, normSourceLen,
4535                                           source, len,
4536                                           normMode, FALSE,
4537                                           status);
4538         }
4539
4540         if(U_FAILURE(*status)) {
4541             return 0;
4542         }
4543         source = normSource;
4544     }
4545
4546     collIterate s;
4547     IInit_collIterate(coll, (UChar *)source, len, &s);
4548     if(source == normSource) {
4549         s.flags &= ~UCOL_ITER_NORM;
4550     }
4551
4552     if(resultLength == 0 || primaries == NULL) {
4553       int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4554       if(normSource != normBuffer) {
4555           uprv_free(normSource);
4556       }
4557       return keyLen;
4558     }
4559     uint8_t *primarySafeEnd = primaries + resultLength - 2;
4560
4561     uint32_t minBufferSize = UCOL_MAX_BUFFER;
4562
4563     uint8_t *primStart = primaries;
4564     uint8_t *secStart = secondaries;
4565     uint8_t *terStart = tertiaries;
4566     uint8_t *caseStart = cases;
4567     uint8_t *quadStart = quads;
4568
4569     uint32_t order = 0;
4570
4571     uint8_t primary1 = 0;
4572     uint8_t primary2 = 0;
4573     uint8_t secondary = 0;
4574     uint8_t tertiary = 0;
4575     uint8_t caseSwitch = coll->caseSwitch;
4576     uint8_t tertiaryMask = coll->tertiaryMask;
4577     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
4578     uint8_t tertiaryTop = coll->tertiaryTop;
4579     uint8_t tertiaryBottom = coll->tertiaryBottom;
4580     uint8_t tertiaryCommon = coll->tertiaryCommon;
4581     uint8_t caseBits = 0;
4582
4583     UBool finished = FALSE;
4584     UBool wasShifted = FALSE;
4585     UBool notIsContinuation = FALSE;
4586
4587     uint32_t prevBuffSize = 0;
4588
4589     uint32_t count2 = 0, count3 = 0, count4 = 0;
4590     uint8_t leadPrimary = 0;
4591
4592     for(;;) {
4593         for(i=prevBuffSize; i<minBufferSize; ++i) {
4594
4595             order = ucol_IGetNextCE(coll, &s, status);
4596             if(order == UCOL_NO_MORE_CES) {
4597                 finished = TRUE;
4598                 break;
4599             }
4600
4601             if(order == 0) {
4602               continue;
4603             }
4604
4605             notIsContinuation = !isContinuation(order);
4606
4607             if(notIsContinuation) {
4608               tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
4609             } else {
4610               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4611             }
4612
4613             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4614             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4615             primary1 = (uint8_t)(order >> 8);
4616
4617             if(notIsContinuation) {
4618               if(scriptOrder != NULL) {
4619                 primary1 = scriptOrder[primary1];
4620               }
4621             }
4622
4623             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4624               || (!notIsContinuation && wasShifted))
4625               || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4626               /* and other ignorables should be removed if following a shifted code point */
4627               if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4628                                   /* we should just completely ignore it */
4629                 continue;
4630               }
4631               if(compareQuad == 0) {
4632                 if(count4 > 0) {
4633                   while (count4 > UCOL_BOT_COUNT4) {
4634                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4635                     count4 -= UCOL_BOT_COUNT4;
4636                   }
4637                   *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4638                   count4 = 0;
4639                 }
4640                 /* We are dealing with a variable and we're treating them as shifted */
4641                 /* This is a shifted ignorable */
4642                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
4643                   *quads++ = primary1;
4644                 }
4645                 if(primary2 != 0) {
4646                   *quads++ = primary2;
4647                 }
4648               }
4649               wasShifted = TRUE;
4650             } else {
4651               wasShifted = FALSE;
4652               /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4653               /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4654               /* regular and simple sortkey calc */
4655               if(primary1 != UCOL_IGNORABLE) {
4656                 if(notIsContinuation) {
4657                   if(leadPrimary == primary1) {
4658                     *primaries++ = primary2;
4659                   } else {
4660                     if(leadPrimary != 0) {
4661                       *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
4662                     }
4663                     if(primary2 == UCOL_IGNORABLE) {
4664                     /* one byter, not compressed */
4665                         *primaries++ = primary1;
4666                         leadPrimary = 0;
4667                     } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4668                         (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4669                     /* not compressible */
4670                         leadPrimary = 0;
4671                         *primaries++ = primary1;
4672                         *primaries++ = primary2;
4673                     } else { /* compress */
4674                         *primaries++ = leadPrimary = primary1;
4675                         *primaries++ = primary2;
4676                     }
4677                   }
4678                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4679                   *primaries++ = primary1;
4680                   if(primary2 != UCOL_IGNORABLE) {
4681                     *primaries++ = primary2; /* second part */
4682                   }
4683                 }
4684               }
4685
4686             if(secondary > compareSec) {
4687               if(!isFrenchSec) {
4688                 /* This is compression code. */
4689                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4690                   ++count2;
4691                 } else {
4692                   if (count2 > 0) {
4693                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4694                       while (count2 > UCOL_TOP_COUNT2) {
4695                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
4696                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
4697                       }
4698                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
4699                     } else {
4700                       while (count2 > UCOL_BOT_COUNT2) {
4701                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4702                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
4703                       }
4704                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4705                     }
4706                     count2 = 0;
4707                   }
4708                   *secondaries++ = secondary;
4709                 }
4710               } else {
4711                   *secondaries++ = secondary;
4712                   /* Do the special handling for French secondaries */
4713                   /* We need to get continuation elements and do intermediate restore */
4714                   /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
4715                   if(notIsContinuation) {
4716                     if (frenchStartPtr != NULL) {
4717                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4718                       uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4719                       frenchStartPtr = NULL;
4720                     }
4721                   } else {
4722                     if (frenchStartPtr == NULL) {
4723                       frenchStartPtr = secondaries - 2;
4724                     }
4725                     frenchEndPtr = secondaries-1;
4726                   }
4727                 }
4728               }
4729
4730               if(doCase) {
4731                 doCaseShift(&cases, caseShift);
4732                 if(notIsContinuation) {
4733                   caseBits = (uint8_t)(tertiary & 0xC0);
4734
4735                   if(tertiary != 0) {
4736                     if(coll->caseFirst == UCOL_UPPER_FIRST) {
4737                       if((caseBits & 0xC0) == 0) {
4738                         *(cases-1) |= 1 << (--caseShift);
4739                       } else {
4740                         *(cases-1) |= 0 << (--caseShift);
4741                         /* second bit */
4742                         doCaseShift(&cases, caseShift);
4743                         *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
4744                       }
4745                     } else {
4746                       if((caseBits & 0xC0) == 0) {
4747                         *(cases-1) |= 0 << (--caseShift);
4748                       } else {
4749                         *(cases-1) |= 1 << (--caseShift);
4750                         /* second bit */
4751                         doCaseShift(&cases, caseShift);
4752                         *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
4753                       }
4754                     }
4755                   }
4756
4757                 }
4758               } else {
4759                 if(notIsContinuation) {
4760                   tertiary ^= caseSwitch;
4761                 }
4762               }
4763
4764               tertiary &= tertiaryMask;
4765               if(tertiary > compareTer) {
4766                 /* This is compression code. */
4767                 /* sequence size check is included in the if clause */
4768                 if (tertiary == tertiaryCommon && notIsContinuation) {
4769                   ++count3;
4770                 } else {
4771                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4772                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4773                     tertiary += tertiaryAddition;
4774                   }
4775                   if (count3 > 0) {
4776                     if ((tertiary > tertiaryCommon)) {
4777                       while (count3 > coll->tertiaryTopCount) {
4778                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4779                         count3 -= (uint32_t)coll->tertiaryTopCount;
4780                       }
4781                       *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
4782                     } else {
4783                       while (count3 > coll->tertiaryBottomCount) {
4784                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4785                         count3 -= (uint32_t)coll->tertiaryBottomCount;
4786                       }
4787                       *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4788                     }
4789                     count3 = 0;
4790                   }
4791                   *tertiaries++ = tertiary;
4792                 }
4793               }
4794
4795               if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4796                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4797                   if(count4>0) { // Close this part
4798                     while (count4 > UCOL_BOT_COUNT4) {
4799                       *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4800                       count4 -= UCOL_BOT_COUNT4;
4801                     }
4802                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4803                     count4 = 0;
4804                   }
4805                   *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
4806                 } else { // This wasn't Hiragana, so we can continue adding stuff
4807                   count4++;
4808                 }
4809               }
4810             }
4811
4812             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
4813               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
4814                 IInit_collIterate(coll, (UChar *)source, len, &s);
4815                 if(source == normSource) {
4816                     s.flags &= ~UCOL_ITER_NORM;
4817                 }
4818                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4819                 *status = U_BUFFER_OVERFLOW_ERROR;
4820                 finished = TRUE;
4821                 break;
4822               } else { /* It's much nicer if we can actually reallocate */
4823                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
4824                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
4825                 if(U_SUCCESS(*status)) {
4826                   *result = primStart;
4827                   primarySafeEnd = primStart + resultLength - 2;
4828                 } else {
4829                   IInit_collIterate(coll, (UChar *)source, len, &s);
4830                   if(source == normSource) {
4831                       s.flags &= ~UCOL_ITER_NORM;
4832                   }
4833                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4834                   finished = TRUE;
4835                   break;
4836                 }
4837               }
4838             }
4839         }
4840         if(finished) {
4841             break;
4842         } else {
4843           prevBuffSize = minBufferSize;
4844           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
4845           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
4846           caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
4847           quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
4848           minBufferSize *= 2;
4849           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
4850             IInit_collIterate(coll, (UChar *)source, len, &s);
4851             if(source == normSource) {
4852                 s.flags &= ~UCOL_ITER_NORM;
4853             }
4854             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
4855             break;
4856           }
4857         }
4858     }
4859
4860     /* Here, we are generally done with processing */
4861     /* bailing out would not be too productive */
4862
4863     if(U_SUCCESS(*status)) {
4864       sortKeySize += (primaries - primStart);
4865       /* we have done all the CE's, now let's put them together to form a key */
4866       if(compareSec == 0) {
4867         if (count2 > 0) {
4868           while (count2 > UCOL_BOT_COUNT2) {
4869             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
4870             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4871           }
4872           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
4873         }
4874         uint32_t secsize = secondaries-secStart;
4875         if(!isFrenchSec) { // Regular situation, we know the length of secondaries
4876           sortKeySize += secsize;
4877           if(sortKeySize <= resultLength) {
4878             *(primaries++) = UCOL_LEVELTERMINATOR;
4879             uprv_memcpy(primaries, secStart, secsize);
4880             primaries += secsize;
4881           } else {
4882             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4883               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4884               if(U_SUCCESS(*status)) {
4885                 *result = primStart;
4886                 *(primaries++) = UCOL_LEVELTERMINATOR;
4887                 uprv_memcpy(primaries, secStart, secsize);
4888                 primaries += secsize;
4889               }
4890             } else {
4891               *status = U_BUFFER_OVERFLOW_ERROR;
4892             }
4893           }
4894         } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
4895           uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4896           sortKeySize += secsize;
4897           if(sortKeySize <= resultLength) { // if we managed to pack fine
4898             primaries = newPrim; // update the primary pointer
4899           } else { // overflow, need to reallocate and redo
4900             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
4901               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4902               if(U_SUCCESS(*status)) {
4903                 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
4904               }
4905             } else {
4906               *status = U_BUFFER_OVERFLOW_ERROR;
4907             }
4908           }
4909         }
4910       }
4911
4912       if(doCase) {
4913         uint32_t casesize = cases - caseStart;
4914         sortKeySize += casesize;
4915         if(sortKeySize <= resultLength) {
4916           *(primaries++) = UCOL_LEVELTERMINATOR;
4917           uprv_memcpy(primaries, caseStart, casesize);
4918           primaries += casesize;
4919         } else {
4920           if(allocateSKBuffer == TRUE) {
4921             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4922             if(U_SUCCESS(*status)) {
4923               *result = primStart;
4924               *(primaries++) = UCOL_LEVELTERMINATOR;
4925               uprv_memcpy(primaries, caseStart, casesize);
4926             }
4927           } else {
4928             *status = U_BUFFER_OVERFLOW_ERROR;
4929           }
4930         }
4931       }
4932
4933       if(compareTer == 0) {
4934         if (count3 > 0) {
4935           if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
4936             while (count3 >= coll->tertiaryTopCount) {
4937               *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
4938               count3 -= (uint32_t)coll->tertiaryTopCount;
4939             }
4940             *tertiaries++ = (uint8_t)(tertiaryTop - count3);
4941           } else {
4942             while (count3 > coll->tertiaryBottomCount) {
4943               *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
4944               count3 -= (uint32_t)coll->tertiaryBottomCount;
4945             }
4946             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
4947           }
4948         }
4949         uint32_t tersize = tertiaries - terStart;
4950         sortKeySize += tersize;
4951         if(sortKeySize <= resultLength) {
4952           *(primaries++) = UCOL_LEVELTERMINATOR;
4953           uprv_memcpy(primaries, terStart, tersize);
4954           primaries += tersize;
4955         } else {
4956           if(allocateSKBuffer == TRUE) {
4957             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4958             if(U_SUCCESS(*status)) {
4959               *result = primStart;
4960               *(primaries++) = UCOL_LEVELTERMINATOR;
4961               uprv_memcpy(primaries, terStart, tersize);
4962             }
4963           } else {
4964             *status = U_BUFFER_OVERFLOW_ERROR;
4965           }
4966         }
4967
4968         if(compareQuad == 0/*qShifted == TRUE*/) {
4969             if(count4 > 0) {
4970               while (count4 > UCOL_BOT_COUNT4) {
4971                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
4972                 count4 -= UCOL_BOT_COUNT4;
4973               }
4974               *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
4975             }
4976             uint32_t quadsize = quads - quadStart;
4977             sortKeySize += quadsize;
4978             if(sortKeySize <= resultLength) {
4979               *(primaries++) = UCOL_LEVELTERMINATOR;
4980               uprv_memcpy(primaries, quadStart, quadsize);
4981               primaries += quadsize;
4982             } else {
4983               if(allocateSKBuffer == TRUE) {
4984                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
4985                 if(U_SUCCESS(*status)) {
4986                   *result = primStart;
4987                   *(primaries++) = UCOL_LEVELTERMINATOR;
4988                   uprv_memcpy(primaries, quadStart, quadsize);
4989                 }
4990               } else {
4991                 *status = U_BUFFER_OVERFLOW_ERROR;
4992               }
4993             }
4994         }
4995
4996         if(compareIdent) {
4997           sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
4998           if(sortKeySize <= resultLength) {
4999             *(primaries++) = UCOL_LEVELTERMINATOR;
5000             primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5001           } else {
5002             if(allocateSKBuffer == TRUE) {
5003               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5004               if(U_SUCCESS(*status)) {
5005                 *result = primStart;
5006                 *(primaries++) = UCOL_LEVELTERMINATOR;
5007                 u_writeIdenticalLevelRun(s.string, len, primaries);
5008               }
5009             } else {
5010               *status = U_BUFFER_OVERFLOW_ERROR;
5011             }
5012           }
5013         }
5014       }
5015       *(primaries++) = '\0';
5016     }
5017
5018     if(terStart != tert) {
5019         uprv_free(terStart);
5020         uprv_free(secStart);
5021         uprv_free(caseStart);
5022         uprv_free(quadStart);
5023     }
5024
5025     if(normSource != normBuffer) {
5026         uprv_free(normSource);
5027     }
5028
5029     if(allocateSKBuffer == TRUE) {
5030       *result = (uint8_t*)uprv_malloc(sortKeySize);
5031           /* test for NULL */
5032           if (*result == NULL) {
5033                 *status = U_MEMORY_ALLOCATION_ERROR;
5034                 return sortKeySize;
5035           }
5036       uprv_memcpy(*result, primStart, sortKeySize);
5037       if(primStart != prim) {
5038         uprv_free(primStart);
5039       }
5040     }
5041
5042     return sortKeySize;
5043 }
5044
5045
5046 U_CFUNC int32_t U_CALLCONV
5047 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5048         const    UChar        *source,
5049         int32_t        sourceLength,
5050         uint8_t        **result,
5051         uint32_t        resultLength,
5052         UBool allocateSKBuffer,
5053         UErrorCode *status)
5054 {
5055     U_ALIGN_CODE(16);
5056     uint32_t i = 0; /* general purpose counter */
5057
5058     /* Stack allocated buffers for buffers we use */
5059     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5060
5061     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5062
5063     if(U_FAILURE(*status)) {
5064       return 0;
5065     }
5066
5067     if(primaries == NULL && allocateSKBuffer == TRUE) {
5068         primaries = *result = prim;
5069         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5070     }
5071
5072     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5073
5074     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5075
5076     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5077     UChar *normSource = normBuffer;
5078     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5079
5080     int32_t len =  sourceLength;
5081
5082     /* If we need to normalize, we'll do it all at once at the beginning! */
5083     if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5084         len = unorm_internalNormalize(normSource, normSourceLen,
5085                                       source, len,
5086                                       UNORM_FCD, FALSE,
5087                                       status);
5088         if(*status == U_BUFFER_OVERFLOW_ERROR) {
5089             normSourceLen = len;
5090             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5091             if(normSource == NULL) {
5092                 *status = U_MEMORY_ALLOCATION_ERROR;
5093                 return 0;
5094             }
5095             *status = U_ZERO_ERROR;
5096             len = unorm_internalNormalize(normSource, normSourceLen,
5097                                           source, len,
5098                                           UNORM_FCD, FALSE,
5099                                           status);
5100         }
5101
5102         if(U_FAILURE(*status)) {
5103             return 0;
5104         }
5105         source = normSource;
5106     }
5107
5108     collIterate s;
5109     IInit_collIterate(coll, (UChar *)source, len, &s);
5110     if(source == normSource) {
5111         s.flags &= ~UCOL_ITER_NORM;
5112     }
5113
5114     if(resultLength == 0 || primaries == NULL) {
5115         int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5116         if(normSource != normBuffer) {
5117             uprv_free(normSource);
5118         }
5119         return t;
5120     }
5121
5122     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5123
5124     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5125
5126     uint8_t *primStart = primaries;
5127     uint8_t *secStart = secondaries;
5128     uint8_t *terStart = tertiaries;
5129
5130     uint32_t order = 0;
5131
5132     uint8_t primary1 = 0;
5133     uint8_t primary2 = 0;
5134     uint8_t secondary = 0;
5135     uint8_t tertiary = 0;
5136     uint8_t caseSwitch = coll->caseSwitch;
5137     uint8_t tertiaryMask = coll->tertiaryMask;
5138     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5139     uint8_t tertiaryTop = coll->tertiaryTop;
5140     uint8_t tertiaryBottom = coll->tertiaryBottom;
5141     uint8_t tertiaryCommon = coll->tertiaryCommon;
5142
5143     uint32_t prevBuffSize = 0;
5144
5145     UBool finished = FALSE;
5146     UBool notIsContinuation = FALSE;
5147
5148     uint32_t count2 = 0, count3 = 0;
5149     uint8_t leadPrimary = 0;
5150
5151     for(;;) {
5152         for(i=prevBuffSize; i<minBufferSize; ++i) {
5153
5154             order = ucol_IGetNextCE(coll, &s, status);
5155
5156             if(order == 0) {
5157               continue;
5158             }
5159
5160             if(order == UCOL_NO_MORE_CES) {
5161                 finished = TRUE;
5162                 break;
5163             }
5164
5165             notIsContinuation = !isContinuation(order);
5166
5167             if(notIsContinuation) {
5168               tertiary = (uint8_t)((order & tertiaryMask));
5169             } else {
5170               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5171             }
5172             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5173             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5174             primary1 = (uint8_t)(order >> 8);
5175
5176             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5177             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5178             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5179             /* regular and simple sortkey calc */
5180             if(primary1 != UCOL_IGNORABLE) {
5181               if(notIsContinuation) {
5182                 if(leadPrimary == primary1) {
5183                   *primaries++ = primary2;
5184                 } else {
5185                   if(leadPrimary != 0) {
5186                     *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5187                   }
5188                   if(primary2 == UCOL_IGNORABLE) {
5189                   /* one byter, not compressed */
5190                       *primaries++ = primary1;
5191                       leadPrimary = 0;
5192                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5193                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5194                       (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5195                   /* not compressible */
5196                       leadPrimary = 0;
5197                       *primaries++ = primary1;
5198                       *primaries++ = primary2;
5199                   } else { /* compress */
5200                       *primaries++ = leadPrimary = primary1;
5201                       *primaries++ = primary2;
5202                   }
5203                 }
5204               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5205                 *primaries++ = primary1;
5206                 if(primary2 != UCOL_IGNORABLE) {
5207                   *primaries++ = primary2; /* second part */
5208                 }
5209               }
5210             }
5211
5212             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5213               /* This is compression code. */
5214               if (secondary == UCOL_COMMON2 && notIsContinuation) {
5215                 ++count2;
5216               } else {
5217                 if (count2 > 0) {
5218                   if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5219                     while (count2 > UCOL_TOP_COUNT2) {
5220                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5221                       count2 -= (uint32_t)UCOL_TOP_COUNT2;
5222                     }
5223                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5224                   } else {
5225                     while (count2 > UCOL_BOT_COUNT2) {
5226                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5227                       count2 -= (uint32_t)UCOL_BOT_COUNT2;
5228                     }
5229                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5230                   }
5231                   count2 = 0;
5232                 }
5233                 *secondaries++ = secondary;
5234               }
5235             }
5236
5237             if(notIsContinuation) {
5238               tertiary ^= caseSwitch;
5239             }
5240
5241               if(tertiary > 0) {
5242               /* This is compression code. */
5243               /* sequence size check is included in the if clause */
5244               if (tertiary == tertiaryCommon && notIsContinuation) {
5245                 ++count3;
5246               } else {
5247                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5248                   tertiary += tertiaryAddition;
5249                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5250                   tertiary -= tertiaryAddition;
5251                 }
5252                 if (count3 > 0) {
5253                   if ((tertiary > tertiaryCommon)) {
5254                     while (count3 > coll->tertiaryTopCount) {
5255                       *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5256                       count3 -= (uint32_t)coll->tertiaryTopCount;
5257                     }
5258                     *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5259                   } else {
5260                     while (count3 > coll->tertiaryBottomCount) {
5261                       *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5262                       count3 -= (uint32_t)coll->tertiaryBottomCount;
5263                     }
5264                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5265                   }
5266                   count3 = 0;
5267                 }
5268                 *tertiaries++ = tertiary;
5269               }
5270             }
5271
5272             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5273               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5274                 IInit_collIterate(coll, (UChar *)source, len, &s);
5275                 if(source == normSource) {
5276                     s.flags &= ~UCOL_ITER_NORM;
5277                 }
5278                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5279                 *status = U_BUFFER_OVERFLOW_ERROR;
5280                 finished = TRUE;
5281                 break;
5282               } else { /* It's much nicer if we can actually reallocate */
5283                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5284                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5285                 if(U_SUCCESS(*status)) {
5286                   *result = primStart;
5287                   primarySafeEnd = primStart + resultLength - 2;
5288                 } else {
5289                   IInit_collIterate(coll, (UChar *)source, len, &s);
5290                   if(source == normSource) {
5291                       s.flags &= ~UCOL_ITER_NORM;
5292                   }
5293                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5294                   finished = TRUE;
5295                   break;
5296                 }
5297               }
5298             }
5299         }
5300         if(finished) {
5301             break;
5302         } else {
5303           prevBuffSize = minBufferSize;
5304           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5305           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5306           minBufferSize *= 2;
5307           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5308             IInit_collIterate(coll, (UChar *)source, len, &s);
5309             if(source == normSource) {
5310                 s.flags &= ~UCOL_ITER_NORM;
5311             }
5312             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5313             break;
5314           }
5315         }
5316     }
5317
5318     if(U_SUCCESS(*status)) {
5319       sortKeySize += (primaries - primStart);
5320       /* we have done all the CE's, now let's put them together to form a key */
5321       if (count2 > 0) {
5322         while (count2 > UCOL_BOT_COUNT2) {
5323           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5324           count2 -= (uint32_t)UCOL_BOT_COUNT2;
5325         }
5326         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5327       }
5328       uint32_t secsize = secondaries-secStart;
5329       sortKeySize += secsize;
5330       if(sortKeySize <= resultLength) {
5331         *(primaries++) = UCOL_LEVELTERMINATOR;
5332         uprv_memcpy(primaries, secStart, secsize);
5333         primaries += secsize;
5334       } else {
5335         if(allocateSKBuffer == TRUE) {
5336           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5337           if(U_SUCCESS(*status)) {
5338             *(primaries++) = UCOL_LEVELTERMINATOR;
5339             *result = primStart;
5340             uprv_memcpy(primaries, secStart, secsize);
5341           }
5342         } else {
5343           *status = U_BUFFER_OVERFLOW_ERROR;
5344         }
5345       }
5346
5347       if (count3 > 0) {
5348         if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5349           while (count3 >= coll->tertiaryTopCount) {
5350             *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5351             count3 -= (uint32_t)coll->tertiaryTopCount;
5352           }
5353           *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5354         } else {
5355           while (count3 > coll->tertiaryBottomCount) {
5356             *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5357             count3 -= (uint32_t)coll->tertiaryBottomCount;
5358           }
5359           *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5360         }
5361       }
5362       uint32_t tersize = tertiaries - terStart;
5363       sortKeySize += tersize;
5364       if(sortKeySize <= resultLength) {
5365         *(primaries++) = UCOL_LEVELTERMINATOR;
5366         uprv_memcpy(primaries, terStart, tersize);
5367         primaries += tersize;
5368       } else {
5369         if(allocateSKBuffer == TRUE) {
5370           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5371           if(U_SUCCESS(*status)) {
5372             *result = primStart;
5373             *(primaries++) = UCOL_LEVELTERMINATOR;
5374             uprv_memcpy(primaries, terStart, tersize);
5375           }
5376         } else {
5377           *status = U_MEMORY_ALLOCATION_ERROR;
5378         }
5379       }
5380
5381       *(primaries++) = '\0';
5382     }
5383
5384     if(terStart != tert) {
5385         uprv_free(terStart);
5386         uprv_free(secStart);
5387     }
5388
5389     if(normSource != normBuffer) {
5390         uprv_free(normSource);
5391     }
5392
5393     if(allocateSKBuffer == TRUE) {
5394       *result = (uint8_t*)uprv_malloc(sortKeySize);
5395           /* test for NULL */
5396           if (*result == NULL) {
5397                 *status = U_MEMORY_ALLOCATION_ERROR;
5398                 return sortKeySize;
5399           }
5400       uprv_memcpy(*result, primStart, sortKeySize);
5401       if(primStart != prim) {
5402         uprv_free(primStart);
5403       }
5404     }
5405
5406     return sortKeySize;
5407 }
5408
5409 static inline
5410 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5411   UBool notIsContinuation = !isContinuation(CE);
5412   uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5413   if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5414     || (!notIsContinuation && *wasShifted))
5415     || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5416     // The stuff below should probably be in the sortkey code... maybe not...
5417     if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5418                         /* we should just completely ignore it */
5419       *wasShifted = TRUE;
5420       //continue;
5421     }
5422     //*wasShifted = TRUE;
5423     return TRUE;
5424   } else {
5425     *wasShifted = FALSE;
5426     return FALSE;
5427   }
5428 }
5429 static inline
5430 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5431   if(level < maxLevel) {
5432     dest[i++] = UCOL_LEVELTERMINATOR;
5433   } else {
5434     dest[i++] = 0;
5435   }
5436 }
5437
/**
 * Level identifiers used by partial sort key generation.
 * The values fit into the three-bit level field of the collation state.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /* spare level, not used; it exists only because three bits are available */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /* past the end of the sort key; only zero bytes are produced here */
    UCOL_PSK_NULL       = 7,
    UCOL_PSK_LIMIT
};
5450
/**
 * Layout of the collation processing state kept in state[1].
 * To read a field, shift the state word right by the field's *_SHIFT value
 * and AND the result with the matching *_MASK value.
 */
enum {
    /* level identifier: one of the UCOL_PSK_* level values (three bits) */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = 0x7,

    /*
     * Number of bytes of a primary or quaternary already written. It can only
     * be 0 or 1, because at most two bytes come from a primary or quaternary.
     * The same bit also records that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = 0x1,

    /* Boolean: was the last value shifted */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = 0x1,

    /*
     * Number of French bytes already written (two-bit field).
     * French processing reverses secondary values while continuations keep
     * their order: abc1c2c3de has to come out as edc1c2c3ba.
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = 0x3,

    /* used elements counter (ten-bit field) */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
    UCOL_PSK_USED_ELEMENTS_MASK  = 0x3FF,

    /* iterator skip counter (fifteen-bit field) */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,
    UCOL_PSK_ITER_SKIP_MASK  = 0x7FFF
};
5476
5477
5478 /** main sortkey part procedure. On the first call,
5479  *  you should pass in a collator, an iterator, empty state
5480  *  state[0] == state[1] == 0, a buffer to hold results
5481  *  number of bytes you need and an error code pointer.
5482  *  Make sure your buffer is big enough to hold the wanted
5483  *  number of sortkey bytes. I don't check.
5484  *  The only meaningful status you can get back is
5485  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5486  *  have been dealt a raw deal and that you probably won't
5487  *  be able to use partial sortkey generation for this
5488  *  particular combination of string and collator. This
5489  *  is highly unlikely, but you should still check the error code.
5490  *  Any other status means that you're not in a sane situation
5491  *  anymore. After the first call, preserve state values and
5492  *  use them on subsequent calls to obtain more bytes of a sortkey.
5493  *  Use until the number of bytes written is smaller than the requested
5494  *  number of bytes. Generated sortkey is not compatible with the
5495  *  one generated by ucol_getSortKey, as we don't do any compression.
5496  *  However, levels are still terminated by a 1 (one) and the sortkey
5497  *  is terminated by a 0 (zero). Identical level is the same as in the
5498  *  regular sortkey - internal bocu-1 implementation is used.
 *  For the curious (although there is not much you can do with this), here is
 *  the structure of the state words.
5501  *  state[0] - iterator state. Depends on the iterator implementation,
5502  *             but allows the iterator to continue where it stopped in
5503  *             the last iteration.
5504  *  state[1] - collation processing state. Here is the distribution
5505  *             of the bits:
5506  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary
5507  *             quaternary, quin (we don't use this one), identical and
5508  *             null (producing only zeroes - first one to terminate the
5509  *             sortkey and subsequent to fill the buffer).
5510  *   3       - byte count. Number of bytes written on the primary level.
5511  *   4       - was shifted. Whether the previous iteration finished in the
5512  *             shifted state.
5513  *   5, 6    - French continuation bytes written. See the comment in the enum
5514  *   7..16   - Used elements. Number of CEs that were already used from the
5515  *             expansion buffer or number of bytes from a bocu sequence on
5516  *             the identical level.
5517  *  17..31   - iterator skip. Number of move operations iterator needs to
5518  *             skip from the current state in order to continue. This is used
5519  *             only if normalization is turned on, since the normalizing iterator
5520  *             can return undefined state, which means that it's in the middle
5521  *             of normalizing sequence.
5522  */
5523 U_CAPI int32_t U_EXPORT2
5524 ucol_nextSortKeyPart(const UCollator *coll,
5525                      UCharIterator *iter,
5526                      uint32_t state[2],
5527                      uint8_t *dest, int32_t count,
5528                      UErrorCode *status) {
5529     /* error checking */
5530     if(status==NULL || U_FAILURE(*status)) {
5531         return 0;
5532     }
5533     if( coll==NULL || iter==NULL ||
5534         state==NULL ||
5535         count<0 || (count>0 && dest==NULL)
5536     ) {
5537         *status=U_ILLEGAL_ARGUMENT_ERROR;
5538     }
5539
5540
5541     if(count==0) {
5542         /* nothing to do */
5543         return 0;
5544     }
5545
5546     /** Setting up situation according to the state we got from the previous iteration */
5547     // The state of the iterator from the previous invocation
5548     uint32_t iterState = state[0];
5549     // Has the last iteration ended in the shifted state
5550     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
5551     // What is the current level of the sortkey?
5552     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
5553     // Have we written only one byte from a two byte primary in the previous iteration?
5554     // Also on secondary level - have we finished with the French secondary?
5555     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
5556     // number of bytes in the continuation buffer for French
5557     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
5558     // Skip the CEs that we got from an extraction
5559     // and delivered in the previous call
5560     int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK;
5561     // Number of times to skip because the iterator returned
5562     // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
5563     // last valid state.
5564     int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK;
5565
5566     /** values that depend on the collator attributes */
5567     // strength of the collator.
5568     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
5569     // maximal level of the partial sortkey. Need to take whether case level is done
5570     int32_t maxLevel = 0;
5571     if(strength < UCOL_TERTIARY) {
5572       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5573         maxLevel = UCOL_PSK_CASE;
5574       } else {
5575         maxLevel = strength;
5576       }
5577     } else {
5578         if(strength == UCOL_TERTIARY) {
5579           maxLevel = UCOL_PSK_TERTIARY;
5580         } else if(strength == UCOL_QUATERNARY) {
5581           maxLevel = UCOL_PSK_QUATERNARY;
5582         } else { // identical
5583           maxLevel = UCOL_IDENTICAL;
5584         }
5585     }
5586     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
5587     uint8_t UCOL_HIRAGANA_QUAD =
5588       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
5589     // Boundary value that decides whether a CE is shifted or not
5590     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
5591     // Are we doing French collation?
5592     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
5593
5594     /** initializing the collation state */
5595     UBool notIsContinuation = FALSE;
5596     uint32_t CE = UCOL_NO_MORE_CES;
5597
5598     collIterate s;
5599     IInit_collIterate(coll, NULL, -1, &s);
5600     s.iterator = iter;
5601     s.flags |= UCOL_USE_ITERATOR;
5602     // This variable tells us whether we have produced some other levels in this iteration
5603     // before we moved to the identical level. In that case, we need to switch the
5604     // type of the iterator.
5605     UBool doingIdenticalFromStart = FALSE;
5606     // Normalizing iterator
5607     // The division for the array length may truncate the array size to
5608     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
5609     // for all platforms anyway.
5610     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
5611     UNormIterator *normIter = NULL;
5612     // If the normalization is turned on for the collator and we are below identical level
5613     // we will use a FCD normalizing iterator
5614     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
5615       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5616       s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
5617       s.flags &= ~UCOL_ITER_NORM;
5618       if(U_FAILURE(*status)) {
5619         return 0;
5620       }
5621     } else if(level == UCOL_PSK_IDENTICAL) {
5622       // for identical level, we need a NFD iterator. We need to instantiate it here, since we
5623       // will be updating the state - and this cannot be done on an ordinary iterator.
5624       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
5625       s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
5626       s.flags &= ~UCOL_ITER_NORM;
5627       if(U_FAILURE(*status)) {
5628         return 0;
5629       }
5630       doingIdenticalFromStart = TRUE;
5631     }
5632
5633     // This is the tentative new state of the iterator. The problem
5634     // is that the iterator might return an undefined state, in
5635     // which case we should save the last valid state and increase
5636     // the iterator skip value.
5637     uint32_t newState = 0;
5638
5639     // First, we set the iterator to the last valid position
5640     // from the last iteration. This was saved in state[0].
5641     if(iterState == 0) {
5642       /* initial state */
5643       if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
5644         s.iterator->move(s.iterator, 0, UITER_LIMIT);
5645       } else {
5646         s.iterator->move(s.iterator, 0, UITER_START);
5647       }
5648     } else {
5649         /* reset to previous state */
5650       s.iterator->setState(s.iterator, iterState, status);
5651       if(U_FAILURE(*status)) {
5652           return 0;
5653       }
5654     }
5655
5656     // Then, we may have to move more, if the normalizing iterator
5657     // was going through a normalizing sequence.
5658     if(iterSkips) {
5659       // if we are on secondary level AND we do French, we need to go backward instead of forward
5660       if(level == UCOL_PSK_SECONDARY && doingFrench) {
5661         s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT);
5662       } else {
5663         s.iterator->move(s.iterator, iterSkips, UITER_CURRENT);
5664       }
5665     }
5666
5667
5668     // Number of expansion CEs that were already consumed in the
5669     // previous iteration for the last code point processed. We
5670     // want to clean out the expansion buffer, so that we can
5671     // get correct CEs. This value is persistent over iterations,
5672     // since we can have several iterations on the one expansion
5673     // buffer.
5674     int32_t consumedExpansionCEs = usedElements;
5675     // Number of bytes already writted from a bocsu sequence. Since
5676     // the longes bocsu sequence is 4 long, this can be up to 3. It
5677     // shares the state field with consumedExpansionCEs value, since
5678     // they cannot simultanously appear on the same level
5679     int32_t bocsuBytesUsed = 0;
5680     // Clean out the expansion buffer unless we are on
5681     // identical level. In that case we use this field
5682     // to store the number of bytes already written
5683     // from the previous bocsu sequence.
5684     if(level < UCOL_PSK_IDENTICAL && usedElements != 0) {
5685       while(usedElements-->0) {
5686         // If we're doing French and we are on the secondary level,
5687         // we go backwards.
5688         if(level == UCOL_PSK_SECONDARY && doingFrench) {
5689           CE = ucol_IGetPrevCE(coll, &s, status);
5690         } else {
5691           CE = ucol_IGetNextCE(coll, &s, status);
5692         }
5693         if(CE==UCOL_NO_MORE_CES) {
5694           /* should not happen */
5695           *status=U_INTERNAL_PROGRAM_ERROR;
5696           return 0;
5697         }
5698       }
5699     } else {
5700       bocsuBytesUsed = usedElements;
5701     }
5702
5703     // This variable prevents the adjusting of iterator
5704     // skip variable when we are the first time on a
5705     // level. I hope there is a better way to do it, but
5706     // I could not think of it.
5707     UBool firstTimeOnLevel = TRUE;
5708     // French secondary needs to know whether the iterator state of zero came from previous level OR
5709     // from a new invocation...
5710     UBool wasDoingPrimary = FALSE;
5711     // Case level is kind of goofy. This variable tells us that
5712     // we are still not done with the case level.
5713     UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE;
5714     // destination buffer byte counter. When this guy
5715     // gets to count, we're done with the iteration
5716     int32_t i = 0;
5717     // used to count the zero bytes written after we
5718     // have finished with the sort key
5719     int32_t j = 0;
5720
5721
5722     // Hm.... I think we're ready to plunge in. Basic story is as following:
5723     // we have a fall through case based on level. This is used for initial
5724     // positioning on iteration start. Every level processor contains a
5725     // for(;;) which will be broken when we exhaust all the CEs. Other
5726     // way to exit is a goto saveState, which happens when we have filled
5727     // out our buffer.
5728     switch(level) {
5729     case UCOL_PSK_PRIMARY:
5730       wasDoingPrimary = TRUE;
5731       for(;;) {
5732           if(i==count) {
5733               goto saveState;
5734           }
5735           // We should save the state only if we
5736           // are sure that we are done with the
5737           // previous iterator state
5738           if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) {
5739             newState = s.iterator->getState(s.iterator);
5740             if(newState != UITER_NO_STATE) {
5741               iterState = newState;
5742               iterSkips = 0;
5743             } else {
5744               if(!firstTimeOnLevel && !byteCountOrFrenchDone) {
5745                 iterSkips++;
5746               }
5747             }
5748           }
5749           firstTimeOnLevel = FALSE;
5750           CE = ucol_IGetNextCE(coll, &s, status);
5751           if(CE==UCOL_NO_MORE_CES) {
5752               // Add the level separator
5753               terminatePSKLevel(level, maxLevel, i, dest);
5754               byteCountOrFrenchDone=0;
5755               // Restart the iteration an move to the
5756               // second level
5757               s.iterator->move(s.iterator, 0, UITER_START);
5758               level = UCOL_PSK_SECONDARY;
5759               break;
5760           }
5761           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5762             CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
5763             if(CE != 0) {
5764               if(byteCountOrFrenchDone == 0) {
5765                 // get the second byte of primary
5766                 dest[i++]=(uint8_t)(CE >> 8);
5767               } else {
5768                 byteCountOrFrenchDone = 0;
5769               }
5770               if((CE &=0xff)!=0) {
5771                   if(i==count) {
5772                       /* overflow */
5773                       byteCountOrFrenchDone=1;
5774                       goto saveState;
5775                   }
5776                   dest[i++]=(uint8_t)CE;
5777               }
5778             }
5779           }
5780           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5781             // s.pos != NULL means there is a normalization buffer in effect
5782             // in iterative case, this means that we are doing Thai (maybe discontiguos)
5783             consumedExpansionCEs++;
5784           } else {
5785             consumedExpansionCEs = 0;
5786           }
5787           if(s.pos && *s.pos == 0) {
5788             // maybe it is the end of Thai - we have to have
5789             // an extra skip
5790             iterSkips++;
5791           }
5792       }
5793       /* fall through to next level */
5794     case UCOL_PSK_SECONDARY:
5795       if(strength >= UCOL_SECONDARY) {
5796         if(!doingFrench) {
5797           for(;;) {
5798             if(i == count) {
5799               goto saveState;
5800             }
5801             // We should save the state only if we
5802             // are sure that we are done with the
5803             // previous iterator state
5804             if(consumedExpansionCEs == 0) {
5805               newState = s.iterator->getState(s.iterator);
5806               if(newState != UITER_NO_STATE) {
5807                 iterState = newState;
5808                 iterSkips = 0;
5809               } else {
5810                 if(!firstTimeOnLevel) {
5811                   iterSkips++;
5812                 }
5813               }
5814             }
5815             firstTimeOnLevel = FALSE;
5816             CE = ucol_IGetNextCE(coll, &s, status);
5817             if(CE==UCOL_NO_MORE_CES) {
5818                 // Add the level separator
5819                 terminatePSKLevel(level, maxLevel, i, dest);
5820                 byteCountOrFrenchDone=0;
5821                 // Restart the iteration an move to the
5822                 // second level
5823                 s.iterator->move(s.iterator, 0, UITER_START);
5824                 level = UCOL_PSK_CASE;
5825                 break;
5826             }
5827             if(!isShiftedCE(CE, LVT, &wasShifted)) {
5828               CE >>= 8; /* get secondary */
5829               if(CE != 0) {
5830                 dest[i++]=(uint8_t)CE;
5831               }
5832             }
5833             if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5834               consumedExpansionCEs++;
5835             } else {
5836               consumedExpansionCEs = 0;
5837             }
5838             if(s.pos && *s.pos == 0) {
5839               iterSkips++;
5840             }
5841           }
5842         } else { // French secondary processing
5843           uint8_t frenchBuff[UCOL_MAX_BUFFER];
5844           int32_t frenchIndex = 0;
5845           // Here we are going backwards.
5846           // If the iterator is at the beggining, it should be
5847           // moved to end.
5848           if(wasDoingPrimary) {
5849             s.iterator->move(s.iterator, 0, UITER_LIMIT);
5850           }
5851           for(;;) {
5852             if(i == count) {
5853               goto saveState;
5854             }
5855             if(consumedExpansionCEs == 0) {
5856               newState = s.iterator->getState(s.iterator);
5857               if(newState != UITER_NO_STATE) {
5858                 iterState = newState;
5859                 iterSkips = 0;
5860               } else {
5861                 if(!firstTimeOnLevel) {
5862                   iterSkips++;
5863                 }
5864               }
5865             }
5866             firstTimeOnLevel = FALSE;
5867             CE = ucol_IGetPrevCE(coll, &s, status);
5868             if(CE==UCOL_NO_MORE_CES) {
5869                 // Add the level separator
5870                 terminatePSKLevel(level, maxLevel, i, dest);
5871                 byteCountOrFrenchDone=0;
5872                 // Restart the iteration an move to the next level
5873                 s.iterator->move(s.iterator, 0, UITER_START);
5874                 level = UCOL_PSK_CASE;
5875                 break;
5876             }
5877             if(isContinuation(CE)) { // if it's a continuation, we want to save it and
5878               // reverse when we get a first non-continuation CE.
5879               CE >>= 8;
5880               frenchBuff[frenchIndex++] = (uint8_t)CE;
5881             } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
5882               CE >>= 8; /* get secondary */
5883               if(!frenchIndex) {
5884                 if(CE != 0) {
5885                   dest[i++]=(uint8_t)CE;
5886                 }
5887               } else {
5888                 frenchBuff[frenchIndex++] = (uint8_t)CE;
5889                 frenchIndex -= usedFrench;
5890                 usedFrench = 0;
5891                 while(i < count && frenchIndex) {
5892                   dest[i++] = frenchBuff[--frenchIndex];
5893                   usedFrench++;
5894                 }
5895               }
5896             }
5897             if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
5898               consumedExpansionCEs++;
5899             } else {
5900               consumedExpansionCEs = 0;
5901             }
5902             if(s.pos && *s.pos == 0) {
5903               iterSkips++;
5904             }
5905           }
5906         }
5907       } else {
5908         level = UCOL_PSK_CASE;
5909       }
5910         /* fall through to next level */
5911     case UCOL_PSK_CASE:
5912       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
5913         uint32_t caseShift = UCOL_CASE_SHIFT_START;
5914         uint8_t caseByte = UCOL_CASE_BYTE_START;
5915         uint8_t caseBits = 0;
5916
5917         for(;;) {
5918           if(i == count) {
5919             goto saveState;
5920           }
5921           // We should save the state only if we
5922           // are sure that we are done with the
5923           // previous iterator state
5924           if(consumedExpansionCEs == 0) {
5925             newState = s.iterator->getState(s.iterator);
5926             if(newState != UITER_NO_STATE) {
5927               iterState = newState;
5928               iterSkips = 0;
5929             } else {
5930               if(!firstTimeOnLevel) {
5931                 iterSkips++;
5932               }
5933             }
5934           }
5935           firstTimeOnLevel = FALSE;
5936           CE = ucol_IGetNextCE(coll, &s, status);
5937           if(CE==UCOL_NO_MORE_CES) {
5938             // On the case level we might have an unfinished
5939             // case byte. Add one if it's started.
5940             if(caseShift != UCOL_CASE_SHIFT_START) {
5941               dest[i++] = caseByte;
5942             }
5943             // This is kind of tricky - situation where
5944             // we need to keep the iterator in the old
5945             // state, but don't need to bring anything
5946             // to the next invocation
5947             if(i < count) {
5948               // Add the level separator
5949               terminatePSKLevel(level, maxLevel, i, dest);
5950               // Restart the iteration and move to the
5951               // next level
5952               s.iterator->move(s.iterator, 0, UITER_START);
5953               level = UCOL_PSK_TERTIARY;
5954             } else {
5955               dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE;
5956             }
5957             break;
5958           }
5959
5960           if(!isShiftedCE(CE, LVT, &wasShifted)) {
5961             if(!isContinuation(CE)) {
5962               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
5963               caseBits = (uint8_t)(CE & 0xC0);
5964               // this copies the case level logic from the
5965               // sort key generation code
5966               if(CE != 0) {
5967                 if(coll->caseFirst == UCOL_UPPER_FIRST) {
5968                   if((caseBits & 0xC0) == 0) {
5969                     caseByte |= 1 << (--caseShift);
5970                   } else {
5971                     caseByte |= 0 << (--caseShift);
5972                     /* second bit */
5973                     if(caseShift == 0) {
5974                       dest[i++] = caseByte;
5975                       caseShift = UCOL_CASE_SHIFT_START;
5976                       caseByte = UCOL_CASE_BYTE_START;
5977                     }
5978                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
5979                   }
5980                 } else {
5981                   if((caseBits & 0xC0) == 0) {
5982                     caseByte |= 0 << (--caseShift);
5983                   } else {
5984                     caseByte |= 1 << (--caseShift);
5985                     /* second bit */
5986                     if(caseShift == 0) {
5987                       dest[i++] = caseByte;
5988                       caseShift = UCOL_CASE_SHIFT_START;
5989                       caseByte = UCOL_CASE_BYTE_START;
5990                     }
5991                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
5992                   }
5993                 }
5994               }
5995
5996             }
5997           }
5998           // Not sure this is correct for the case level - revisit
5999           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6000             consumedExpansionCEs++;
6001           } else {
6002             consumedExpansionCEs = 0;
6003           }
6004           if(s.pos && *s.pos == 0) {
6005             iterSkips++;
6006           }
6007         }
6008       } else {
6009         level = UCOL_PSK_TERTIARY;
6010       }
6011         /* fall through to next level */
6012     case UCOL_PSK_TERTIARY:
6013       if(strength >= UCOL_TERTIARY) {
6014         for(;;) {
6015           if(i == count) {
6016             goto saveState;
6017           }
6018           // We should save the state only if we
6019           // are sure that we are done with the
6020           // previous iterator state
6021           if(consumedExpansionCEs == 0) {
6022             newState = s.iterator->getState(s.iterator);
6023             if(newState != UITER_NO_STATE) {
6024               iterState = newState;
6025               iterSkips = 0;
6026             } else {
6027               if(!firstTimeOnLevel) {
6028                 iterSkips++;
6029               }
6030             }
6031           }
6032           firstTimeOnLevel = FALSE;
6033           CE = ucol_IGetNextCE(coll, &s, status);
6034           if(CE==UCOL_NO_MORE_CES) {
6035               // Add the level separator
6036               terminatePSKLevel(level, maxLevel, i, dest);
6037               byteCountOrFrenchDone=0;
6038               // Restart the iteration an move to the
6039               // second level
6040               s.iterator->move(s.iterator, 0, UITER_START);
6041               level = UCOL_PSK_QUATERNARY;
6042               break;
6043           }
6044           if(!isShiftedCE(CE, LVT, &wasShifted)) {
6045             notIsContinuation = !isContinuation(CE);
6046
6047             if(notIsContinuation) {
6048               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6049               CE ^= coll->caseSwitch;
6050               CE &= coll->tertiaryMask;
6051             } else {
6052               CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6053             }
6054
6055             if(CE != 0) {
6056               dest[i++]=(uint8_t)CE;
6057             }
6058           }
6059           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6060             consumedExpansionCEs++;
6061           } else {
6062             consumedExpansionCEs = 0;
6063           }
6064           if(s.pos && *s.pos == 0) {
6065             iterSkips++;
6066           }
6067         }
6068       } else {
6069         // if we're not doing tertiary
6070         // skip to the end
6071         level = UCOL_PSK_NULL;
6072       }
6073         /* fall through to next level */
6074     case UCOL_PSK_QUATERNARY:
6075       if(strength >= UCOL_QUATERNARY) {
6076         for(;;) {
6077           if(i == count) {
6078             goto saveState;
6079           }
6080           // We should save the state only if we
6081           // are sure that we are done with the
6082           // previous iterator state
6083           if(consumedExpansionCEs == 0) {
6084             newState = s.iterator->getState(s.iterator);
6085             if(newState != UITER_NO_STATE) {
6086               iterState = newState;
6087               iterSkips = 0;
6088             } else {
6089               if(!firstTimeOnLevel) {
6090                 iterSkips++;
6091               }
6092             }
6093           }
6094           firstTimeOnLevel = FALSE;
6095           CE = ucol_IGetNextCE(coll, &s, status);
6096           if(CE==UCOL_NO_MORE_CES) {
6097               // Add the level separator
6098               terminatePSKLevel(level, maxLevel, i, dest);
6099               //dest[i++] = UCOL_LEVELTERMINATOR;
6100               byteCountOrFrenchDone=0;
6101               // Restart the iteration an move to the
6102               // second level
6103               s.iterator->move(s.iterator, 0, UITER_START);
6104               level = UCOL_PSK_QUIN;
6105               break;
6106           }
6107           if(isShiftedCE(CE, LVT, &wasShifted)) {
6108             CE >>= 16; /* get primary */
6109             if(CE != 0) {
6110               if(byteCountOrFrenchDone == 0) {
6111                 dest[i++]=(uint8_t)(CE >> 8);
6112               } else {
6113                 byteCountOrFrenchDone = 0;
6114               }
6115               if((CE &=0xff)!=0) {
6116                   if(i==count) {
6117                       /* overflow */
6118                       byteCountOrFrenchDone=1;
6119                       goto saveState;
6120                   }
6121                   dest[i++]=(uint8_t)CE;
6122               }
6123             }
6124           } else {
6125             notIsContinuation = !isContinuation(CE);
6126             if(notIsContinuation) {
6127               if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6128                 dest[i++] = UCOL_HIRAGANA_QUAD;
6129               } else {
6130                 dest[i++] = 0xFF;
6131               }
6132             }
6133           }
6134           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6135             consumedExpansionCEs++;
6136           } else {
6137             consumedExpansionCEs = 0;
6138           }
6139           if(s.pos && *s.pos == 0) {
6140             iterSkips++;
6141           }
6142         }
6143       } else {
6144         // if we're not doing quaternary
6145         // skip to the end
6146         level = UCOL_PSK_NULL;
6147       }
6148         /* fall through to next level */
6149     case UCOL_PSK_QUIN:
6150       level = UCOL_PSK_IDENTICAL;
6151         /* fall through to next level */
6152     case UCOL_PSK_IDENTICAL:
6153       if(strength >= UCOL_IDENTICAL) {
6154         UChar32 first, second;
6155         int32_t bocsuBytesWritten = 0;
6156         // We always need to do identical on
6157         // the NFD form of the string.
6158         if(normIter == NULL) {
6159           // we arrived from the level below and
6160           // normalization was not turned on.
6161           // therefore, we need to make a fresh NFD iterator
6162           normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6163           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6164         } else if(!doingIdenticalFromStart) {
6165           // there is an iterator, but we did some other levels.
6166           // therefore, we have a FCD iterator - need to make
6167           // a NFD one.
6168           // normIter being at the beginning does not guarantee
6169           // that the underlying iterator is at the beginning
6170           iter->move(iter, 0, UITER_START);
6171           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6172         }
6173         // At this point we have a NFD iterator that is positioned
6174         // in the right place
6175         if(U_FAILURE(*status)) {
6176           return 0;
6177         }
6178         first = uiter_previous32(s.iterator);
6179         // maybe we're at the start of the string
6180         if(first == U_SENTINEL) {
6181           first = 0;
6182         } else {
6183           uiter_next32(s.iterator);
6184         }
6185
6186         j = 0;
6187         for(;;) {
6188           if(i == count) {
6189             if(j+1 < bocsuBytesWritten) {
6190               bocsuBytesUsed = j+1;
6191             }
6192             goto saveState;
6193           }
6194
6195           // On identical level, we will always save
6196           // the state if we reach this point, since
6197           // we don't depend on getNextCE for content
6198           // all the content is in our buffer and we
6199           // already either stored the full buffer OR
6200           // otherwise we won't arrive here.
6201           newState = s.iterator->getState(s.iterator);
6202           if(newState != UITER_NO_STATE) {
6203             iterState = newState;
6204             iterSkips = 0;
6205           } else {
6206             iterSkips++;
6207           }
6208
6209           uint8_t buff[4];
6210           second = uiter_next32(s.iterator);
6211
6212           // end condition for identical level
6213           if(second == U_SENTINEL) {
6214             terminatePSKLevel(level, maxLevel, i, dest);
6215             level = UCOL_PSK_NULL;
6216             break;
6217           }
6218           bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6219           first = second;
6220
6221           j = 0;
6222           if(bocsuBytesUsed != 0) {
6223             while(bocsuBytesUsed-->0) {
6224               j++;
6225             }
6226           }
6227
6228           while(i < count && j < bocsuBytesWritten) {
6229             dest[i++] = buff[j++];
6230           }
6231         }
6232
6233       } else {
6234         level = UCOL_PSK_NULL;
6235       }
6236         /* fall through to next level */
6237     case UCOL_PSK_NULL:
6238       j = i;
6239       while(j<count) {
6240           dest[j++]=0;
6241       }
6242       break;
6243     default:
6244       *status = U_INTERNAL_PROGRAM_ERROR;
6245       return 0;
6246     }
6247
6248 saveState:
6249     // Now we need to return stuff. First we want to see whether we have
6250     // done everything for the current state of iterator.
6251     if(consumedExpansionCEs || byteCountOrFrenchDone
6252       || dontAdvanceIteratorBecauseWeNeedALevelTerminator) {
6253       // Any of above mean that the previous transaction
6254       // wasn't finished and that we should store the
6255       // previous iterator state.
6256       state[0] = iterState;
6257     } else {
6258       // The transaction is complete. We will continue in
6259       // next iteration.
6260       if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) {
6261         state[0] = s.iterator->getState(s.iterator);
6262         iterSkips = 0;
6263       } else {
6264         state[0] = iterState;
6265         iterSkips++;
6266       }
6267     }
6268     // Store the number of elements processed. On CE levels, this is
6269     // the number of expansion CEs processed. On identical level, this
6270     // is the number of bocsu bytes written.
6271     if(level < UCOL_PSK_IDENTICAL) {
6272       if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) {
6273         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6274       }
6275       state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6276     } else {
6277       if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) {
6278         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6279       }
6280       state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6281     }
6282
6283     // Next we put in the level of comparison
6284     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6285
6286     // If we are doing French, we need to store whether we have just finished the French level
6287     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6288       state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6289     } else {
6290       state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6291     }
6292
6293     // Was the latest CE shifted
6294     if(wasShifted) {
6295       state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6296     }
6297     // Check for iterSkips overflow
6298     if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) {
6299       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6300     }
6301     // Store iterSkips
6302     state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT);
6303
6304     // Check for French overflow
6305     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6306       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6307     }
6308     // Store number of bytes written in the French secondary continuation sequence
6309     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6310
6311
6312     // If we have used normalizing iterator, get rid of it
6313     if(normIter != NULL) {
6314       unorm_closeIter(normIter);
6315     }
6316
6317     // Return number of meaningful sortkey bytes.
6318     return i;
6319 }
6320
6321 /**
6322  * Produce a bound for a given sortkey and a number of levels.
6323  */
6324 U_CAPI int32_t U_EXPORT2
6325 ucol_getBound(const uint8_t       *source,
6326         int32_t             sourceLength,
6327         UColBoundMode       boundType,
6328         uint32_t            noOfLevels,
6329         uint8_t             *result,
6330         int32_t             resultLength,
6331         UErrorCode          *status) {
6332   // consistency checks
6333   if(status == NULL || U_FAILURE(*status)) {
6334     return 0;
6335   }
6336   if(source == NULL) {
6337     *status = U_ILLEGAL_ARGUMENT_ERROR;
6338     return 0;
6339   }
6340
6341   int32_t sourceIndex = 0;
6342   // Scan the string until we skip enough of the key OR reach the end of the key
6343   do {
6344     sourceIndex++;
6345     if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6346       noOfLevels--;
6347     }
6348   } while (noOfLevels > 0
6349     && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6350
6351   if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6352     && noOfLevels > 0) {
6353     *status = U_SORT_KEY_TOO_SHORT_WARNING;
6354   }
6355
6356
6357   // READ ME: this code assumes that the values for boundType
6358   // enum will not changes. They are set so that the enum value
6359   // corresponds to the number of extra bytes each bound type
6360   // needs.
6361   if(result != NULL && resultLength >= sourceIndex+boundType) {
6362     uprv_memcpy(result, source, sourceIndex);
6363     switch(boundType) {
6364     // Lower bound just gets terminated. No extra bytes
6365     case UCOL_BOUND_LOWER: // = 0
6366       break;
6367     // Upper bound needs one extra byte
6368     case UCOL_BOUND_UPPER: // = 1
6369       result[sourceIndex++] = 2;
6370       break;
6371     // Upper long bound needs two extra bytes
6372     case UCOL_BOUND_UPPER_LONG: // = 2
6373       result[sourceIndex++] = 0xFF;
6374       result[sourceIndex++] = 0xFF;
6375       break;
6376     default:
6377       *status = U_ILLEGAL_ARGUMENT_ERROR;
6378       return 0;
6379     }
6380     result[sourceIndex++] = 0;
6381
6382     return sourceIndex;
6383   } else {
6384     return sourceIndex+boundType+1;
6385   }
6386 }
6387
6388 static
6389 inline void uprv_appendByteToHexString(char *dst, uint8_t val) {
6390   uint32_t len = (uint32_t)uprv_strlen(dst);
6391   *(dst+len) = T_CString_itosOffset((val >> 4));
6392   *(dst+len+1) = T_CString_itosOffset((val & 0xF));
6393   *(dst+len+2) = 0;
6394 }
6395
6396 /* this function makes a string with representation of a sortkey */
6397 U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
6398   int32_t strength = UCOL_PRIMARY;
6399   uint32_t res_size = 0;
6400   UBool doneCase = FALSE;
6401
6402   char *current = buffer;
6403   const uint8_t *currentSk = sortkey;
6404
6405   uprv_strcpy(current, "[");
6406
6407   while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
6408     if(strength > UCOL_PRIMARY) {
6409       strcat(current, " . ");
6410     }
6411     while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
6412       uprv_appendByteToHexString(current, *currentSk++);
6413       uprv_strcat(current, " ");
6414     }
6415     if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
6416         doneCase = TRUE;
6417     } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
6418       strength ++;
6419     }
6420     uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
6421     if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6422       break;
6423     }
6424   }
6425
6426   if(coll->strength == UCOL_IDENTICAL) {
6427     uprv_strcat(current, " . ");
6428     while(*currentSk != 0) {
6429       uprv_appendByteToHexString(current, *currentSk++);
6430       uprv_strcat(current, " ");
6431     }
6432
6433     uprv_appendByteToHexString(current, *currentSk++);
6434   }
6435   uprv_strcat(current, "]");
6436
6437   if(res_size > *len) {
6438     return NULL;
6439   }
6440
6441   return buffer;
6442 }
6443
6444
6445 /****************************************************************************/
6446 /* Following are the functions that deal with the properties of a collator  */
6447 /* there are new APIs and some compatibility APIs                           */
6448 /****************************************************************************/
6449
6450 static inline void
6451 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6452                     int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6453   uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6454   UBool reverseSecondary = FALSE;
6455   if(!isContinuation(CE)) {
6456     tertiary = (uint8_t)((CE & coll->tertiaryMask));
6457     tertiary ^= coll->caseSwitch;
6458     reverseSecondary = TRUE;
6459   } else {
6460     tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6461     tertiary &= UCOL_REMOVE_CASE;
6462     reverseSecondary = FALSE;
6463   }
6464
6465   secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6466   primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6467   primary1 = (uint8_t)(CE >> 8);
6468
6469   if(primary1 != 0) {
6470     coll->latinOneCEs[ch] |= (primary1 << *primShift);
6471     *primShift -= 8;
6472   }
6473   if(primary2 != 0) {
6474     if(*primShift < 0) {
6475       coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6476       coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6477       coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6478       return;
6479     }
6480     coll->latinOneCEs[ch] |= (primary2 << *primShift);
6481     *primShift -= 8;
6482   }
6483   if(secondary != 0) {
6484     if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6485       coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6486       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6487     } else { // normal case
6488       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6489     }
6490     *secShift -= 8;
6491   }
6492   if(tertiary != 0) {
6493     coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6494     *terShift -= 8;
6495   }
6496 }
6497
6498 static inline UBool
6499 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6500     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6501     if(newTable == NULL) {
6502       *status = U_MEMORY_ALLOCATION_ERROR;
6503       coll->latinOneFailed = TRUE;
6504       return FALSE;
6505     }
6506     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6507     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6508     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6509     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6510     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6511     coll->latinOneTableLen = size;
6512     uprv_free(coll->latinOneCEs);
6513     coll->latinOneCEs = newTable;
6514     return TRUE;
6515 }
6516
6517 static UBool
6518 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6519   UBool result = TRUE;
6520   if(coll->latinOneCEs == NULL) {
6521     coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6522     if(coll->latinOneCEs == NULL) {
6523       *status = U_MEMORY_ALLOCATION_ERROR;
6524       return FALSE;
6525     }
6526     coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6527   }
6528   UChar ch = 0;
6529   UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6530   uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6531
6532   int32_t primShift = 24, secShift = 24, terShift = 24;
6533   uint32_t CE = 0;
6534   int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6535
6536   // TODO: make safe if you get more than you wanted...
6537   for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
6538     primShift = 24; secShift = 24; terShift = 24;
6539     if(ch < 0x100) {
6540       CE = coll->latinOneMapping[ch];
6541     } else {
6542       CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
6543       if(CE == UCOL_NOT_FOUND) {
6544         CE = UTRIE_GET32_FROM_LEAD(UCA->mapping, ch);
6545       }
6546     }
6547     if(CE < UCOL_NOT_FOUND) {
6548       ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6549     } else {
6550       switch (getCETag(CE)) {
6551       case EXPANSION_TAG:
6552         ucol_setText(it, &ch, 1, status);
6553         while((CE = ucol_next(it, status)) != UCOL_NULLORDER) {
6554           if(primShift < 0 || secShift < 0 || terShift < 0) {
6555             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6556             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6557             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6558             break;
6559           }
6560           ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
6561         }
6562         break;
6563       case CONTRACTION_TAG:
6564         // here is the trick
6565         // F2 is contraction. We do something very similar to contractions
6566         // but have two indices, one in the real contraction table and the
6567         // other to where we stuffed things. This hopes that we don't have
6568         // many contractions (this should work for latin-1 tables).
6569         {
6570           if((CE & 0x00FFF000) != 0) {
6571             *status = U_UNSUPPORTED_ERROR;
6572             return FALSE;
6573           }
6574
6575           const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
6576
6577           CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
6578
6579           coll->latinOneCEs[ch] = CE;
6580           coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
6581           coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
6582
6583           // We're going to jump into contraction table, pick the elements
6584           // and use them
6585           do {
6586               CE = *(coll->contractionCEs +
6587                   (UCharOffset - coll->contractionIndex));
6588               if(getCETag(CE) == EXPANSION_TAG) {
6589                 uint32_t size;
6590                 uint32_t i;    /* general counter */
6591                 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
6592                 size = getExpansionCount(CE);
6593                 //CE = *CEOffset++;
6594                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
6595                   for(i = 0; i<size; i++) {
6596                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6597                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6598                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6599                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6600                       break;
6601                     }
6602                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6603                   }
6604                 } else { /* else, we do */
6605                   while(*CEOffset != 0) {
6606                     if(primShift < 0 || secShift < 0 || terShift < 0) {
6607                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6608                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6609                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6610                       break;
6611                     }
6612                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
6613                   }
6614                 }
6615                 contractionOffset++;
6616               } else if(CE < UCOL_NOT_FOUND) {
6617                 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
6618               } else {
6619                 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6620                 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6621                 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
6622                 contractionOffset++;
6623               }
6624               UCharOffset++;
6625               primShift = 24; secShift = 24; terShift = 24;
6626               if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
6627                 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
6628                   return FALSE;
6629                 }
6630               }
6631           } while(*UCharOffset != 0xFFFF);
6632         }
6633         break;
6634       default:
6635         coll->latinOneFailed = TRUE;
6636         result = FALSE;
6637         break;
6638       }
6639     }
6640   }
6641   ucol_closeElements(it);
6642   // compact table
6643   if(contractionOffset < coll->latinOneTableLen) {
6644     if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
6645       return FALSE;
6646     }
6647   }
6648   return result;
6649 }
6650
6651 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
6652       if(U_SUCCESS(*status)) {
6653         if(coll->caseFirst == UCOL_UPPER_FIRST) {
6654           coll->caseSwitch = UCOL_CASE_SWITCH;
6655         } else {
6656           coll->caseSwitch = UCOL_NO_CASE_SWITCH;
6657         }
6658
6659         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
6660           coll->tertiaryMask = UCOL_REMOVE_CASE;
6661           coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6662           coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
6663           coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
6664           coll->tertiaryBottom = UCOL_COMMON_BOT3;
6665         } else {
6666           coll->tertiaryMask = UCOL_KEEP_CASE;
6667           coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
6668           if(coll->caseFirst == UCOL_UPPER_FIRST) {
6669             coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
6670             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
6671             coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
6672           } else {
6673             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
6674             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
6675             coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
6676           }
6677         }
6678
6679         /* Set the compression values */
6680         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
6681         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
6682         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
6683
6684         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
6685           && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6686           coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
6687         } else {
6688           coll->sortKeyGen = ucol_calcSortKey;
6689         }
6690         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY
6691           && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
6692           if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
6693             if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
6694               //fprintf(stderr, "F");
6695               coll->latinOneUse = TRUE;
6696             } else {
6697               coll->latinOneUse = FALSE;
6698             }
6699           } else { // latin1Table exists and it doesn't need to be regenerated, just use it
6700             coll->latinOneUse = TRUE;
6701           }
6702         } else {
6703           coll->latinOneUse = FALSE;
6704         }
6705       }
6706
6707 }
6708
6709 U_CAPI uint32_t  U_EXPORT2
6710 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
6711   if(U_FAILURE(*status) || coll == NULL) {
6712     return 0;
6713   }
6714   if(len == -1) {
6715     len = u_strlen(varTop);
6716   }
6717   if(len == 0) {
6718     *status = U_ILLEGAL_ARGUMENT_ERROR;
6719     return 0;
6720   }
6721
6722   collIterate s;
6723   IInit_collIterate(coll, varTop, len, &s);
6724
6725   uint32_t CE = ucol_IGetNextCE(coll, &s, status);
6726
6727   /* here we check if we have consumed all characters */
6728   /* you can put in either one character or a contraction */
6729   /* you shouldn't put more... */
6730   if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
6731     *status = U_CE_NOT_FOUND_ERROR;
6732     return 0;
6733   }
6734
6735   uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
6736
6737   if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
6738     *status = U_PRIMARY_TOO_LONG_ERROR;
6739     return 0;
6740   }
6741
6742   coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
6743
6744   return CE & UCOL_PRIMARYMASK;
6745 }
6746
6747 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
6748   if(U_FAILURE(*status) || coll == NULL) {
6749     return 0;
6750   }
6751   return coll->variableTopValue<<16;
6752 }
6753
6754 U_CAPI void  U_EXPORT2
6755 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
6756   if(U_FAILURE(*status) || coll == NULL) {
6757     return;
6758   }
6759   coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
6760 }
6761 /* Attribute setter API */
6762 U_CAPI void  U_EXPORT2
6763 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
6764     if(U_FAILURE(*status) || coll == NULL) {
6765       return;
6766     }
6767     UColAttributeValue oldFrench = coll->frenchCollation;
6768     UColAttributeValue oldCaseFirst = coll->caseFirst;
6769     switch(attr) {
6770     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
6771       if(value == UCOL_ON) {
6772         coll->numericCollation = UCOL_ON;
6773         coll->numericCollationisDefault = FALSE;
6774       } else if (value == UCOL_OFF) {
6775         coll->numericCollation = UCOL_OFF;
6776         coll->numericCollationisDefault = FALSE;
6777       } else if (value == UCOL_DEFAULT) {
6778         coll->numericCollationisDefault = TRUE;
6779         coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
6780       } else {
6781         *status = U_ILLEGAL_ARGUMENT_ERROR;
6782       }
6783       break;
6784     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
6785       if(value == UCOL_ON) {
6786         coll->hiraganaQ = UCOL_ON;
6787         coll->hiraganaQisDefault = FALSE;
6788       } else if (value == UCOL_OFF) {
6789         coll->hiraganaQ = UCOL_OFF;
6790         coll->hiraganaQisDefault = FALSE;
6791       } else if (value == UCOL_DEFAULT) {
6792         coll->hiraganaQisDefault = TRUE;
6793         coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
6794       } else {
6795         *status = U_ILLEGAL_ARGUMENT_ERROR;
6796       }
6797       break;
6798     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6799         if(value == UCOL_ON) {
6800             coll->frenchCollation = UCOL_ON;
6801             coll->frenchCollationisDefault = FALSE;
6802         } else if (value == UCOL_OFF) {
6803             coll->frenchCollation = UCOL_OFF;
6804             coll->frenchCollationisDefault = FALSE;
6805         } else if (value == UCOL_DEFAULT) {
6806             coll->frenchCollationisDefault = TRUE;
6807             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
6808         } else {
6809             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6810         }
6811         break;
6812     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6813         if(value == UCOL_SHIFTED) {
6814             coll->alternateHandling = UCOL_SHIFTED;
6815             coll->alternateHandlingisDefault = FALSE;
6816         } else if (value == UCOL_NON_IGNORABLE) {
6817             coll->alternateHandling = UCOL_NON_IGNORABLE;
6818             coll->alternateHandlingisDefault = FALSE;
6819         } else if (value == UCOL_DEFAULT) {
6820             coll->alternateHandlingisDefault = TRUE;
6821             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
6822         } else {
6823             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6824         }
6825         break;
6826     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6827         if(value == UCOL_LOWER_FIRST) {
6828             coll->caseFirst = UCOL_LOWER_FIRST;
6829             coll->caseFirstisDefault = FALSE;
6830         } else if (value == UCOL_UPPER_FIRST) {
6831             coll->caseFirst = UCOL_UPPER_FIRST;
6832             coll->caseFirstisDefault = FALSE;
6833         } else if (value == UCOL_OFF) {
6834           coll->caseFirst = UCOL_OFF;
6835           coll->caseFirstisDefault = FALSE;
6836         } else if (value == UCOL_DEFAULT) {
6837             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
6838             coll->caseFirstisDefault = TRUE;
6839         } else {
6840             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6841         }
6842         break;
6843     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6844         if(value == UCOL_ON) {
6845             coll->caseLevel = UCOL_ON;
6846             coll->caseLevelisDefault = FALSE;
6847         } else if (value == UCOL_OFF) {
6848             coll->caseLevel = UCOL_OFF;
6849             coll->caseLevelisDefault = FALSE;
6850         } else if (value == UCOL_DEFAULT) {
6851             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
6852             coll->caseLevelisDefault = TRUE;
6853         } else {
6854             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6855         }
6856         break;
6857     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6858         if(value == UCOL_ON) {
6859             coll->normalizationMode = UCOL_ON;
6860             coll->normalizationModeisDefault = FALSE;
6861         } else if (value == UCOL_OFF) {
6862             coll->normalizationMode = UCOL_OFF;
6863             coll->normalizationModeisDefault = FALSE;
6864         } else if (value == UCOL_DEFAULT) {
6865             coll->normalizationModeisDefault = TRUE;
6866             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
6867         } else {
6868             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6869         }
6870         break;
6871     case UCOL_STRENGTH:         /* attribute for strength */
6872         if (value == UCOL_DEFAULT) {
6873             coll->strengthisDefault = TRUE;
6874             coll->strength = (UColAttributeValue)coll->options->strength;
6875         } else if (value <= UCOL_IDENTICAL) {
6876             coll->strengthisDefault = FALSE;
6877             coll->strength = value;
6878         } else {
6879             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
6880         }
6881         break;
6882     case UCOL_ATTRIBUTE_COUNT:
6883     default:
6884         *status = U_ILLEGAL_ARGUMENT_ERROR;
6885         break;
6886     }
6887     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
6888       coll->latinOneRegenTable = TRUE;
6889     } else {
6890       coll->latinOneRegenTable = FALSE;
6891     }
6892     ucol_updateInternalState(coll, status);
6893 }
6894
6895 U_CAPI UColAttributeValue  U_EXPORT2
6896 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
6897     if(U_FAILURE(*status) || coll == NULL) {
6898       return UCOL_DEFAULT;
6899     }
6900     switch(attr) {
6901     case UCOL_NUMERIC_COLLATION:
6902       return coll->numericCollation;
6903     case UCOL_HIRAGANA_QUATERNARY_MODE:
6904       return coll->hiraganaQ;
6905     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
6906         return coll->frenchCollation;
6907     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
6908         return coll->alternateHandling;
6909     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
6910         return coll->caseFirst;
6911     case UCOL_CASE_LEVEL: /* do we have an extra case level */
6912         return coll->caseLevel;
6913     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
6914         return coll->normalizationMode;
6915     case UCOL_STRENGTH:         /* attribute for strength */
6916         return coll->strength;
6917     case UCOL_ATTRIBUTE_COUNT:
6918     default:
6919         *status = U_ILLEGAL_ARGUMENT_ERROR;
6920         break;
6921     }
6922     return UCOL_DEFAULT;
6923 }
6924
6925 U_CAPI void U_EXPORT2
6926 ucol_setStrength(    UCollator                *coll,
6927             UCollationStrength        strength)
6928 {
6929   UErrorCode status = U_ZERO_ERROR;
6930   ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
6931 }
6932
6933 U_CAPI UCollationStrength U_EXPORT2
6934 ucol_getStrength(const UCollator *coll)
6935 {
6936   UErrorCode status = U_ZERO_ERROR;
6937   return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
6938 }
6939
6940 /****************************************************************************/
6941 /* Following are misc functions                                             */
6942 /* there are new APIs and some compatibility APIs                           */
6943 /****************************************************************************/
6944
6945 U_CAPI UCollator* U_EXPORT2
6946 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
6947 {
6948     UCollator * localCollator;
6949     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
6950     char *stackBufferChars = (char *)stackBuffer;
6951
6952     if (status == NULL || U_FAILURE(*status)){
6953         return 0;
6954     }
6955     if ((stackBuffer && !pBufferSize) || !coll){
6956        *status = U_ILLEGAL_ARGUMENT_ERROR;
6957         return 0;
6958     }
6959     /* Pointers on 64-bit platforms need to be aligned
6960      * on a 64-bit boundry in memory.
6961      */
6962     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
6963         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
6964         *pBufferSize -= offsetUp;
6965         stackBufferChars += offsetUp;
6966     }
6967     stackBuffer = (void *)stackBufferChars;
6968
6969     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
6970         *pBufferSize =  bufferSizeNeeded;
6971         return 0;
6972     }
6973     if (!stackBuffer || *pBufferSize < bufferSizeNeeded) {
6974         /* allocate one here...*/
6975         int32_t length;
6976         const UChar * rules = ucol_getRules(coll, &length);
6977
6978         localCollator = ucol_openRules(rules,
6979                                        length,
6980                                        ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status),
6981                                        ucol_getStrength(coll),
6982                                        NULL,
6983                                        status);
6984         if (U_SUCCESS(*status))
6985         {
6986             *status = U_SAFECLONE_ALLOCATED_WARNING;
6987         }
6988     } else {
6989         localCollator = (UCollator *)stackBuffer;
6990         memcpy(localCollator, coll, sizeof(UCollator));
6991         localCollator->freeOnClose = FALSE;
6992                 localCollator->requestedLocale = NULL; // zero copies of pointers
6993                 localCollator->validLocale = NULL;
6994     }
6995     return localCollator;
6996 }
6997
6998 U_CAPI int32_t U_EXPORT2
6999 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
7000   UErrorCode status = U_ZERO_ERROR;
7001   int32_t len = 0;
7002   int32_t UCAlen = 0;
7003   const UChar* ucaRules = 0;
7004   const UChar *rules = ucol_getRules(coll, &len);
7005   if(delta == UCOL_FULL_RULES) {
7006     /* take the UCA rules and append real rules at the end */
7007     /* UCA rules will be probably coming from the root RB */
7008     ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status);
7009   }
7010   if(U_FAILURE(status)) {
7011     return 0;
7012   }
7013   if(buffer!=0 && bufferLen>0){
7014       *buffer=0;
7015       if(UCAlen > 0) {
7016         u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen));
7017       }
7018       if(len > 0 && bufferLen > UCAlen) {
7019         u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen));
7020       }
7021   }
7022   return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status);
7023 }
7024
7025 static const UChar _NUL = 0;
7026
7027 U_CAPI const UChar* U_EXPORT2
7028 ucol_getRules(    const    UCollator       *coll,
7029         int32_t            *length)
7030 {
7031   if(coll->rules != NULL) {
7032     *length = coll->rulesLength;
7033     return coll->rules;
7034   } else {
7035     UErrorCode status = U_ZERO_ERROR;
7036     if(coll->rb != NULL) {
7037       UResourceBundle *collElem = ures_getByKey(coll->rb, "CollationElements", NULL, &status);
7038       if(U_SUCCESS(status)) {
7039         /*Semantic const */
7040         ((UCollator *)coll)->rules = ures_getStringByKey(collElem, "Sequence", length, &status);
7041         ((UCollator *)coll)->rulesLength = *length;
7042         ((UCollator *)coll)->freeRulesOnClose = FALSE;
7043         ures_close(collElem);
7044         return coll->rules;
7045       }
7046     }
7047     *length = 0;
7048     return &_NUL;
7049   }
7050 }
7051
7052 U_CAPI int32_t U_EXPORT2
7053 ucol_getDisplayName(    const    char        *objLoc,
7054             const    char        *dispLoc,
7055             UChar             *result,
7056             int32_t         resultLength,
7057             UErrorCode        *status)
7058 {
7059
7060   if(U_FAILURE(*status)) return -1;
7061   UnicodeString dst;
7062   if(!(result==NULL && resultLength==0)) {
7063     // NULL destination for pure preflighting: empty dummy string
7064     // otherwise, alias the destination buffer
7065     dst.setTo(result, 0, resultLength);
7066   }
7067   Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst);
7068   return dst.extract(result, resultLength, *status);
7069 }
7070
7071 U_CAPI const char* U_EXPORT2
7072 ucol_getAvailable(int32_t index)
7073 {
7074   return uloc_getAvailable(index);
7075 }
7076
7077 U_CAPI int32_t U_EXPORT2
7078 ucol_countAvailable()
7079 {
7080   return uloc_countAvailable();
7081 }
7082
7083 U_CAPI void U_EXPORT2
7084 ucol_getVersion(const UCollator* coll,
7085                 UVersionInfo versionInfo)
7086 {
7087     /* RunTime version  */
7088     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7089     /* Builder version*/
7090     uint8_t bdVersion = coll->image->version[0];
7091
7092     /* Charset Version. Need to get the version from cnv files
7093      * makeconv should populate cnv files with version and
7094      * an api has to be provided in ucnv.h to obtain this version
7095      */
7096     uint8_t csVersion = 0;
7097
7098     /* combine the version info */
7099     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7100
7101     /* Tailoring rules */
7102     versionInfo[0] = (uint8_t)(cmbVersion>>8);
7103     versionInfo[1] = (uint8_t)cmbVersion;
7104     versionInfo[2] = coll->image->version[1];
7105     versionInfo[3] = UCA->image->UCAVersion[0];
7106 }
7107
7108
7109 /* This internal API checks whether a character is tailored or not */
7110 U_CAPI UBool  U_EXPORT2
7111 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7112   uint32_t CE = UCOL_NOT_FOUND;
7113   const UChar *ContractionStart = NULL;
7114   if(U_SUCCESS(*status) && coll != NULL) {
7115     if(coll == UCA) {
7116       return FALSE;
7117     } else if(u < 0x100) { /* latin-1 */
7118       CE = coll->latinOneMapping[u];
7119       if(CE == UCA->latinOneMapping[u]) {
7120         return FALSE;
7121       }
7122     } else { /* regular */
7123       /*CE = ucmpe32_get(coll->mapping, u);*/
7124       CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u);
7125
7126     }
7127
7128     if(isContraction(CE)) {
7129       ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7130       CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7131     }
7132
7133     if(CE == UCOL_NOT_FOUND) {
7134       return FALSE;
7135     } else {
7136       return TRUE;
7137     }
7138   } else {
7139     return FALSE;
7140   }
7141 }
7142
7143
7144 /****************************************************************************/
7145 /* Following are the string compare functions                               */
7146 /*                                                                          */
7147 /****************************************************************************/
7148
7149
7150 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
7151 /*                     Used by strcoll if strength == identical and strings  */
7152 /*                     are otherwise equal.  Moved out-of-line because this  */
7153 /*                     is a rare case.                                       */
7154 /*                                                                           */
7155 /*                     Comparison must be done on NFD normalized strings.    */
7156 /*                     FCD is not good enough.                               */
7157 /*                                                                           */
7158 /*      TODO:  make an incremental NFD Comparison function, which could      */
7159 /*             be of general use                                             */
7160
/*
 * Parameters:
 *   sColl, tColl  collation iterators over the source and target text; both
 *                 are read either through their UCharIterator (when sColl has
 *                 UCOL_USE_ITERATOR set) or through their string pointers.
 *   normalize     only used on the string path: when TRUE, text that fails
 *                 the NFD quick check is decomposed into the iterator's
 *                 writable buffer before comparing.
 *   status        error code; NOTE that the string path resets it to
 *                 U_ZERO_ERROR before each normalization step.
 * Returns UCOL_LESS / UCOL_EQUAL / UCOL_GREATER for the code point order
 * comparison; UCOL_LESS (with U_MEMORY_ALLOCATION_ERROR) if a buffer cannot
 * be grown.
 */
static
UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
{

  // TODO: When we have an UChar iterator, we need to access the whole string. One
  // useful modification would be a UChar iterator extract API, since reset next next...
  // is not optimal.
  // TODO: Handle long strings. Do the same in compareUsingSortKeys.

  // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
  // of same type, but that doesn't really mean that it will stay that way.

    // The division for the array length may truncate the array size to
    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
    // for all platforms anyway.
    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    //UChar sStackBuf[256], tStackBuf[256];
    //int32_t sBufSize = 256, tBufSize = 256;
    int32_t            comparison;
    int32_t          sLen        = 0;
    UChar            *sBuf       = NULL;
    int32_t          tLen        = 0;
    UChar            *tBuf       = NULL;
    // NOTE(review): freeSBuf/freeTBuf are never set to TRUE in this function,
    // so the uprv_free() branches guarded by them below never run.
    UBool freeSBuf = FALSE, freeTBuf = FALSE;

    if (sColl->flags & UCOL_USE_ITERATOR) {
      // Iterator path: rewind both iterators, wrap each in an NFD normalizing
      // iterator and compare the normalized streams. The 'normalize' argument
      // is not consulted here.
      UNormIterator *sNIt = NULL, *tNIt = NULL;
      sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
      tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
      sColl->iterator->move(sColl->iterator, 0, UITER_START);
      tColl->iterator->move(tColl->iterator, 0, UITER_START);
      UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
      UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
      comparison = u_strCompareIter(sIt, tIt, TRUE);
      unorm_closeIter(sNIt);
      unorm_closeIter(tNIt);
    } else {
      // String path: a length of -1 stands for NUL-terminated text.
      sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
      sBuf = sColl->string;
      tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
      tBuf = tColl->string;

      if (normalize) {
          // Source: decompose only if the quick check does not say "already NFD".
          *status = U_ZERO_ERROR;
          if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
              sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
                                     sBuf, sLen,
                                     FALSE, 0,
                                     status);
              if(*status == U_BUFFER_OVERFLOW_ERROR) {
                  // The writable buffer was too small: grow it to the length
                  // reported by the first attempt and decompose again.
                  if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
                                             &sColl->writableBuffer,
                                             (int32_t *)&sColl->writableBufSize, sLen,
                                             0)
                  ) {
                      *status = U_MEMORY_ALLOCATION_ERROR;
                      return UCOL_LESS; /* allocation failed; status was set just above */
                  }
                  *status = U_ZERO_ERROR;
                  sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
                                         sBuf, sLen,
                                         FALSE, 0,
                                         status);
              }
              if(freeSBuf) {
                uprv_free(sBuf);
                freeSBuf = FALSE;
              }
              // From here on the comparison uses the decomposed text.
              sBuf = sColl->writableBuffer;
              if (sBuf != sColl->stackWritableBuffer) {
                  // The buffer is no longer the iterator's stack buffer; flag it
                  // as allocated on the iterator.
                  sColl->flags |= UCOL_ITER_ALLOCATED;
              }
          }

          // Target: same procedure as for the source.
          *status = U_ZERO_ERROR;
          if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
              tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
                                     tBuf, tLen,
                                     FALSE, 0,
                                     status);
              if(*status == U_BUFFER_OVERFLOW_ERROR) {
                  if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
                                             &tColl->writableBuffer,
                                             (int32_t *)&tColl->writableBufSize, tLen,
                                             0)
                  ) {
                      *status = U_MEMORY_ALLOCATION_ERROR;
                      return UCOL_LESS; /* allocation failed; status was set just above */
                  }
                  *status = U_ZERO_ERROR;
                  tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
                                         tBuf, tLen,
                                         FALSE, 0,
                                         status);
              }
              if(freeTBuf) {
                uprv_free(tBuf);
                freeTBuf = FALSE;
              }
              tBuf = tColl->writableBuffer;
              if (tBuf != tColl->stackWritableBuffer) {
                  tColl->flags |= UCOL_ITER_ALLOCATED;
              }
          }
      }

      if (sLen == -1 && tLen == -1) {
          // Both strings are NUL-terminated: compare them directly.
          comparison = u_strcmpCodePointOrder(sBuf, tBuf);
      } else {
          // At least one length is known: resolve the other one, compare the
          // common prefix and let the length difference break ties.
          if (sLen == -1) {
              sLen = u_strlen(sBuf);
          }
          if (tLen == -1) {
              tLen = u_strlen(tBuf);
          }
          comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
          if (comparison == 0) {
              comparison = sLen - tLen;
          }
      }
    }

    // Map the signed comparison value onto the collation result enum.
    if (comparison < 0) {
        return UCOL_LESS;
    } else if (comparison == 0) {
        return UCOL_EQUAL;
    } else /* comparison > 0 */ {
        return UCOL_GREATER;
    }
}
7292
7293 /*  CEBuf - A struct and some inline functions to handle the saving    */
7294 /*          of CEs in a buffer within ucol_strcoll                     */
7295
/* Number of CEs that fit into the in-struct array before the heap is needed. */
#define UCOL_CEBUF_SIZE 512
/* Growable buffer of CEs; starts out in localArray, see ucol_CEBuf_Expand. */
typedef struct ucol_CEBuf {
    uint32_t    *buf;       /* start of the current storage (localArray or a heap block) */
    uint32_t    *endp;      /* one past the last usable slot of buf                      */
    uint32_t    *pos;       /* slot the next UCOL_CEBUF_PUT writes to                    */
    uint32_t     localArray[UCOL_CEBUF_SIZE]; /* initial in-struct storage               */
} ucol_CEBuf;
7303
7304
7305 static
7306 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7307     (b)->buf = (b)->pos = (b)->localArray;
7308     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7309 };
7310
7311 static
7312 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7313     uint32_t  oldSize;
7314     uint32_t  newSize;
7315     uint32_t  *newBuf;
7316
7317     ci->flags |= UCOL_ITER_ALLOCATED;
7318     oldSize = b->pos - b->buf;
7319     newSize = oldSize * 2;
7320     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7321     if(newBuf != NULL) {
7322       uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7323       if (b->buf != b->localArray) {
7324           uprv_free(b->buf);
7325       }
7326       b->buf = newBuf;
7327       b->endp = b->buf + newSize;
7328       b->pos  = b->buf + oldSize;
7329     }
7330 }
7331
7332 static
7333 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7334     if (b->pos == b->endp) {
7335         ucol_CEBuf_Expand(b, ci);
7336 }
7337     *(b)->pos++ = ce;
7338 };
7339
7340 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7341 /* It is used when compare gets in trouble and needs to bail out                     */
7342 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7343                                                   collIterate *tColl)
7344 {
7345     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7346     uint8_t *sourceKeyP = sourceKey;
7347     uint8_t *targetKeyP = targetKey;
7348     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7349     const UCollator *coll = sColl->coll;
7350     UChar *source = NULL;
7351     UChar *target = NULL;
7352     UChar sStackBuf[256], tStackBuf[256];
7353     int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7354     int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7355
7356     // TODO: Handle long strings. Do the same in ucol_checkIdent.
7357     if(sColl->flags & UCOL_USE_ITERATOR) {
7358       sColl->iterator->move(sColl->iterator, 0, UITER_START);
7359       tColl->iterator->move(tColl->iterator, 0, UITER_START);
7360       source = sStackBuf;
7361       UChar *sBufp = source;
7362       target = tStackBuf;
7363       UChar *tBufp = target;
7364       while(sColl->iterator->hasNext(sColl->iterator)) {
7365         *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7366       }
7367       while(tColl->iterator->hasNext(tColl->iterator)) {
7368         *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7369       }
7370       sourceLength = sBufp - source;
7371       targetLength = tBufp - target;
7372     } else { // no iterators
7373       sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7374       targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7375       source = sColl->string;
7376       target = tColl->string;
7377     }
7378
7379
7380
7381     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7382     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7383         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7384         if(sourceKeyP != NULL) {
7385           sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7386         }
7387     }
7388
7389     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7390     if(targetKeyLen > UCOL_MAX_BUFFER) {
7391         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7392         if(targetKeyP != NULL) {
7393           targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7394         }
7395     }
7396
7397     int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7398
7399     if(sourceKeyP != sourceKey) {
7400         uprv_free(sourceKeyP);
7401     }
7402
7403     if(targetKeyP != targetKey) {
7404         uprv_free(targetKeyP);
7405     }
7406
7407     if(result<0) {
7408         return UCOL_LESS;
7409     } else if(result>0) {
7410         return UCOL_GREATER;
7411     } else {
7412         return UCOL_EQUAL;
7413     }
7414 }
7415
7416
7417 static inline UCollationResult
7418 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7419 //              const UCollator    *coll,
7420 //              const UChar        *source,
7421 //              int32_t            sourceLength,
7422 //              const UChar        *target,
7423 //              int32_t            targetLength,
7424               UErrorCode *status)
7425 {
7426     U_ALIGN_CODE(16);
7427
7428     const UCollator *coll = sColl->coll;
7429
7430
7431     // setting up the collator parameters
7432     UColAttributeValue strength = coll->strength;
7433     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7434
7435     UBool checkSecTer = initialCheckSecTer;
7436     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7437     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7438     UBool checkIdent = (strength == UCOL_IDENTICAL);
7439     UBool checkCase = (coll->caseLevel == UCOL_ON);
7440     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7441     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7442     UBool qShifted = shifted && checkQuad;
7443     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7444
7445     if(doHiragana && shifted) {
7446       return (ucol_compareUsingSortKeys(sColl, tColl));
7447     }
7448     uint8_t caseSwitch = coll->caseSwitch;
7449     uint8_t tertiaryMask = coll->tertiaryMask;
7450
7451     // This is the lowest primary value that will not be ignored if shifted
7452     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7453
7454     UCollationResult result = UCOL_EQUAL;
7455     UCollationResult hirResult = UCOL_EQUAL;
7456
7457     // Preparing the CE buffers. They will be filled during the primary phase
7458     ucol_CEBuf   sCEs;
7459     ucol_CEBuf   tCEs;
7460     UCOL_INIT_CEBUF(&sCEs);
7461     UCOL_INIT_CEBUF(&tCEs);
7462
7463     uint32_t secS = 0, secT = 0;
7464     uint32_t sOrder=0, tOrder=0;
7465
7466     // Non shifted primary processing is quite simple
7467     if(!shifted) {
7468       for(;;) {
7469
7470         // We fetch CEs until we hit a non ignorable primary or end.
7471         do {
7472           // We get the next CE
7473           sOrder = ucol_IGetNextCE(coll, sColl, status);
7474           // Stuff it in the buffer
7475           UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7476           // And keep just the primary part.
7477           sOrder &= UCOL_PRIMARYMASK;
7478         } while(sOrder == 0);
7479
7480         // see the comments on the above block
7481         do {
7482           tOrder = ucol_IGetNextCE(coll, tColl, status);
7483           UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7484           tOrder &= UCOL_PRIMARYMASK;
7485         } while(tOrder == 0);
7486
7487         // if both primaries are the same
7488         if(sOrder == tOrder) {
7489             // and there are no more CEs, we advance to the next level
7490             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7491               break;
7492             }
7493             if(doHiragana && hirResult == UCOL_EQUAL) {
7494               if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
7495                 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
7496                   ? UCOL_LESS:UCOL_GREATER;
7497               }
7498             }
7499         } else {
7500             // if two primaries are different, we are done
7501             result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
7502             goto commonReturn;
7503         }
7504       } // no primary difference... do the rest from the buffers
7505     } else { // shifted - do a slightly more complicated processing :)
7506       for(;;) {
7507         UBool sInShifted = FALSE;
7508         UBool tInShifted = FALSE;
7509         // This version of code can be refactored. However, it seems easier to understand this way.
7510         // Source loop. Sam as the target loop.
7511         for(;;) {
7512           sOrder = ucol_IGetNextCE(coll, sColl, status);
7513           if(sOrder == UCOL_NO_MORE_CES) {
7514             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7515             break;
7516           } else if(sOrder == 0
7517             || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
7518             /* UCA amendment - ignore ignorables that follow shifted code points */
7519             continue;
7520           } else if(isContinuation(sOrder)) {
7521             if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7522               if(sInShifted) {
7523                 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7524                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7525                 continue;
7526               } else {
7527                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7528                 break;
7529               }
7530             } else { /* Just lower level values */
7531               if(sInShifted) {
7532                 continue;
7533               } else {
7534                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7535                 continue;
7536               }
7537             }
7538           } else { /* regular */
7539             if((sOrder & UCOL_PRIMARYMASK) > LVT) {
7540               UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7541               break;
7542             } else {
7543               if((sOrder & UCOL_PRIMARYMASK) > 0) {
7544                 sInShifted = TRUE;
7545                 sOrder &= UCOL_PRIMARYMASK;
7546                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7547                 continue;
7548               } else {
7549                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
7550                 sInShifted = FALSE;
7551                 continue;
7552               }
7553             }
7554           }
7555         }
7556         sOrder &= UCOL_PRIMARYMASK;
7557         sInShifted = FALSE;
7558
7559         for(;;) {
7560           tOrder = ucol_IGetNextCE(coll, tColl, status);
7561           if(tOrder == UCOL_NO_MORE_CES) {
7562             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7563             break;
7564           } else if(tOrder == 0
7565             || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
7566             /* UCA amendment - ignore ignorables that follow shifted code points */
7567             continue;
7568           } else if(isContinuation(tOrder)) {
7569             if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
7570               if(tInShifted) {
7571                 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
7572                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7573                 continue;
7574               } else {
7575                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7576                 break;
7577               }
7578             } else { /* Just lower level values */
7579               if(tInShifted) {
7580                 continue;
7581               } else {
7582                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7583                 continue;
7584               }
7585             }
7586           } else { /* regular */
7587             if((tOrder & UCOL_PRIMARYMASK) > LVT) {
7588               UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7589               break;
7590             } else {
7591               if((tOrder & UCOL_PRIMARYMASK) > 0) {
7592                 tInShifted = TRUE;
7593                 tOrder &= UCOL_PRIMARYMASK;
7594                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7595                 continue;
7596               } else {
7597                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
7598                 tInShifted = FALSE;
7599                 continue;
7600               }
7601             }
7602           }
7603         }
7604         tOrder &= UCOL_PRIMARYMASK;
7605         tInShifted = FALSE;
7606
7607         if(sOrder == tOrder) {
7608           /*
7609             if(doHiragana && hirResult == UCOL_EQUAL) {
7610               if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
7611                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
7612                   ? UCOL_LESS:UCOL_GREATER;
7613               }
7614             }
7615           */
7616             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
7617               break;
7618             } else {
7619               sOrder = 0; tOrder = 0;
7620               continue;
7621             }
7622         } else {
7623             result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
7624             goto commonReturn;
7625         }
7626       } /* no primary difference... do the rest from the buffers */
7627     }
7628
7629     /* now, we're gonna reexamine collected CEs */
7630     uint32_t    *sCE;
7631     uint32_t    *tCE;
7632
7633     /* This is the secondary level of comparison */
7634     if(checkSecTer) {
7635       if(!isFrenchSec) { /* normal */
7636         sCE = sCEs.buf;
7637         tCE = tCEs.buf;
7638         for(;;) {
7639           while (secS == 0) {
7640             secS = *(sCE++) & UCOL_SECONDARYMASK;
7641           }
7642
7643           while(secT == 0) {
7644               secT = *(tCE++) & UCOL_SECONDARYMASK;
7645           }
7646
7647           if(secS == secT) {
7648             if(secS == UCOL_NO_MORE_CES_SECONDARY) {
7649               break;
7650             } else {
7651               secS = 0; secT = 0;
7652               continue;
7653             }
7654           } else {
7655                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7656                goto commonReturn;
7657           }
7658         }
7659       } else { /* do the French */
7660         uint32_t *sCESave = NULL;
7661         uint32_t *tCESave = NULL;
7662         sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
7663         tCE = tCEs.pos-2;
7664         for(;;) {
7665           while (secS == 0 && sCE >= sCEs.buf) {
7666             if(sCESave == 0) {
7667               secS = *(sCE--);
7668               if(isContinuation(secS)) {
7669                 while(isContinuation(secS = *(sCE--)));
7670                 /* after this, secS has the start of continuation, and sCEs points before that */
7671                 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
7672                 sCE+=2;  /* need to point to the first continuation CP */
7673                 /* However, now you can just continue doing stuff */
7674               }
7675             } else {
7676               secS = *(sCE++);
7677               if(!isContinuation(secS)) { /* This means we have finished with this cont */
7678                 sCE = sCESave;            /* reset the pointer to before continuation */
7679                 sCESave = 0;
7680                 continue;
7681               }
7682             }
7683             secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7684           }
7685
7686           while(secT == 0 && tCE >= tCEs.buf) {
7687             if(tCESave == 0) {
7688               secT = *(tCE--);
7689               if(isContinuation(secT)) {
7690                 while(isContinuation(secT = *(tCE--)));
7691                 /* after this, secS has the start of continuation, and sCEs points before that */
7692                 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
7693                 tCE+=2;  /* need to point to the first continuation CP */
7694                 /* However, now you can just continue doing stuff */
7695               }
7696             } else {
7697               secT = *(tCE++);
7698               if(!isContinuation(secT)) { /* This means we have finished with this cont */
7699                 tCE = tCESave;          /* reset the pointer to before continuation */
7700                 tCESave = 0;
7701                 continue;
7702               }
7703             }
7704             secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
7705           }
7706
7707           if(secS == secT) {
7708             if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
7709               break;
7710             } else {
7711               secS = 0; secT = 0;
7712               continue;
7713             }
7714           } else {
7715               result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7716               goto commonReturn;
7717           }
7718         }
7719       }
7720     }
7721
7722     /* doing the case bit */
7723     if(checkCase) {
7724       sCE = sCEs.buf;
7725       tCE = tCEs.buf;
7726       for(;;) {
7727         while((secS & UCOL_REMOVE_CASE) == 0) {
7728           if(!isContinuation(*sCE++)) {
7729             secS =*(sCE-1) & UCOL_TERT_CASE_MASK;
7730             secS ^= caseSwitch;
7731           } else {
7732             secS = 0;
7733           }
7734         }
7735
7736         while((secT & UCOL_REMOVE_CASE) == 0) {
7737           if(!isContinuation(*tCE++)) {
7738             secT = *(tCE-1) & UCOL_TERT_CASE_MASK;
7739             secT ^= caseSwitch;
7740           } else {
7741             secT = 0;
7742           }
7743         }
7744
7745         if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
7746           result = UCOL_LESS;
7747           goto commonReturn;
7748         } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
7749           result = UCOL_GREATER;
7750           goto commonReturn;
7751         }
7752
7753         if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
7754           break;
7755         } else {
7756           secS = 0;
7757           secT = 0;
7758         }
7759       }
7760     }
7761
7762     /* Tertiary level */
7763     if(checkTertiary) {
7764       secS = 0;
7765       secT = 0;
7766       sCE = sCEs.buf;
7767       tCE = tCEs.buf;
7768       for(;;) {
7769         while((secS & UCOL_REMOVE_CASE) == 0) {
7770           secS = *(sCE++) & tertiaryMask;
7771           if(!isContinuation(secS)) {
7772             secS ^= caseSwitch;
7773           } else {
7774             secS &= UCOL_REMOVE_CASE;
7775           }
7776         }
7777
7778         while((secT & UCOL_REMOVE_CASE)  == 0) {
7779           secT = *(tCE++) & tertiaryMask;
7780           if(!isContinuation(secT)) {
7781             secT ^= caseSwitch;
7782           } else {
7783             secT &= UCOL_REMOVE_CASE;
7784           }
7785         }
7786
7787         if(secS == secT) {
7788           if((secS & UCOL_REMOVE_CASE) == 1) {
7789             break;
7790           } else {
7791             secS = 0; secT = 0;
7792             continue;
7793           }
7794         } else {
7795             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7796             goto commonReturn;
7797         }
7798       }
7799     }
7800
7801
7802     if(qShifted /*checkQuad*/) {
7803       UBool sInShifted = TRUE;
7804       UBool tInShifted = TRUE;
7805       secS = 0;
7806       secT = 0;
7807       sCE = sCEs.buf;
7808       tCE = tCEs.buf;
7809       for(;;) {
7810         while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
7811           secS = *(sCE++);
7812           if(isContinuation(secS)) {
7813             if(!sInShifted) {
7814               continue;
7815             }
7816           } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
7817             secS = UCOL_PRIMARYMASK;
7818             sInShifted = FALSE;
7819           } else {
7820             sInShifted = TRUE;
7821           }
7822         }
7823         secS &= UCOL_PRIMARYMASK;
7824
7825
7826         while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
7827           secT = *(tCE++);
7828           if(isContinuation(secT)) {
7829             if(!tInShifted) {
7830               continue;
7831             }
7832           } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
7833             secT = UCOL_PRIMARYMASK;
7834             tInShifted = FALSE;
7835           } else {
7836             tInShifted = TRUE;
7837           }
7838         }
7839         secT &= UCOL_PRIMARYMASK;
7840
7841         if(secS == secT) {
7842           if(secS == UCOL_NO_MORE_CES_PRIMARY) {
7843             break;
7844           } else {
7845             secS = 0; secT = 0;
7846             continue;
7847           }
7848         } else {
7849             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
7850             goto commonReturn;
7851         }
7852       }
7853     } else if(doHiragana && hirResult != UCOL_EQUAL) {
7854       // If we're fine on quaternaries, we might be different
7855       // on Hiragana. This, however, might fail us in shifted.
7856       result = hirResult;
7857       goto commonReturn;
7858     }
7859
7860     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
7861     /*  as a tiebreaker if all else is equal.                                */
7862     /*  Getting here  should be quite rare - strings are not identical -     */
7863     /*     that is checked first, but compared == through all other checks.  */
7864     if(checkIdent)
7865     {
7866         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
7867         result = ucol_checkIdent(sColl, tColl, TRUE, status);
7868     }
7869
7870 commonReturn:
7871     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
7872         freeHeapWritableBuffer(sColl);
7873         freeHeapWritableBuffer(tColl);
7874
7875         if (sCEs.buf != sCEs.localArray ) {
7876             uprv_free(sCEs.buf);
7877         }
7878         if (tCEs.buf != tCEs.localArray ) {
7879             uprv_free(tCEs.buf);
7880         }
7881     }
7882
7883     return result;
7884 }
7885
7886
7887 static inline uint32_t
7888 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
7889                           uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
7890   const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
7891   int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
7892   int32_t offset = 1;
7893   UChar schar = 0, tchar = 0;
7894
7895   for(;;) {
7896     if(len == -1) {
7897       if(s[*index] == 0) { // end of string
7898         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7899       } else {
7900         schar = s[*index];
7901       }
7902     } else {
7903       if(*index == len) {
7904         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7905       } else {
7906         schar = s[*index];
7907       }
7908     }
7909
7910     while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
7911       offset++;
7912     }
7913
7914     if (schar == tchar) {
7915       (*index)++;
7916       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
7917     }
7918     else
7919     {
7920       if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
7921         return UCOL_BAIL_OUT_CE;
7922       }
7923       // skip completely ignorables
7924       uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
7925       if(isZeroCE == 0) { // we have to ignore completely ignorables
7926         (*index)++;
7927         continue;
7928       }
7929
7930       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
7931     }
7932   }
7933 }
7934
7935
7936 /**
7937  * This is a fast strcoll, geared towards text in Latin-1.
7938  * It supports contractions of size two, French secondaries
7939  * and case switching. You can use it with strengths primary
7940  * to tertiary. It does not support shifted and case level.
7941  * It relies on the table build by setupLatin1Table. If it
7942  * doesn't understand something, it will go to the regular
7943  * strcoll.
7944  */
7945 static inline UCollationResult
7946 ucol_strcollUseLatin1( const UCollator    *coll,
7947               const UChar        *source,
7948               int32_t            sLen,
7949               const UChar        *target,
7950               int32_t            tLen,
7951               UErrorCode *status)
7952 {
7953     U_ALIGN_CODE(16);
7954     int32_t strength = coll->strength;
7955
7956     int32_t sIndex = 0, tIndex = 0;
7957     UChar sChar = 0, tChar = 0;
7958     uint32_t sOrder=0, tOrder=0;
7959
7960     UBool endOfSource = FALSE, endOfTarget = FALSE;
7961
7962     uint32_t *elements = coll->latinOneCEs;
7963
7964     UBool haveContractions = FALSE; // if we have contractions in our string
7965                                     // we cannot do French secondary
7966
7967     // Do the primary level
7968     for(;;) {
7969       while(sOrder==0) { // this loop skips primary ignorables
7970         // sOrder=getNextlatinOneCE(source);
7971         if(sLen==-1) {   // handling zero terminated strings
7972           sChar=source[sIndex++];
7973           if(sChar==0) {
7974             endOfSource = TRUE;
7975             break;
7976           }
7977         } else {        // handling strings with known length
7978           if(sIndex==sLen) {
7979             endOfSource = TRUE;
7980             break;
7981           }
7982           sChar=source[sIndex++];
7983         }
7984         if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
7985           //fprintf(stderr, "R");
7986           goto returnRegular;
7987           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
7988         }
7989         sOrder = elements[sChar];
7990         if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
7991           // specials can basically be either contractions or bail-out signs. If we get anything
7992           // else, we'll bail out anywasy
7993           if(getCETag(sOrder) == CONTRACTION_TAG) {
7994             sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
7995             haveContractions = TRUE; // if there are contractions, we cannot do French secondary
7996             // However, if there are contractions in the table, but we always use just one char,
7997             // we might be able to do French. This should be checked out.
7998           }
7999           if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8000             //fprintf(stderr, "S");
8001             goto returnRegular;
8002             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8003           }
8004         }
8005       }
8006
8007       while(tOrder==0) {  // this loop skips primary ignorables
8008         // tOrder=getNextlatinOneCE(target);
8009         if(tLen==-1) {    // handling zero terminated strings
8010           tChar=target[tIndex++];
8011           if(tChar==0) {
8012             if(endOfSource) { // this is different than source loop,
8013               // as we already know that source loop is done here,
8014               // so we can either finish the primary loop if both
8015               // strings are done or anounce the result if only
8016               // target is done. Same below.
8017               goto endOfPrimLoop;
8018             } else {
8019               return UCOL_GREATER;
8020             }
8021           }
8022         } else {          // handling strings with known length
8023           if(tIndex==tLen) {
8024             if(endOfSource) {
8025               goto endOfPrimLoop;
8026             } else {
8027               return UCOL_GREATER;
8028             }
8029           }
8030           tChar=target[tIndex++];
8031         }
8032         if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8033           //fprintf(stderr, "R");
8034           goto returnRegular;
8035           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8036         }
8037         tOrder = elements[tChar];
8038         if(tOrder >= UCOL_NOT_FOUND) {
8039           // Handling specials, see the comments for source
8040           if(getCETag(tOrder) == CONTRACTION_TAG) {
8041             tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8042             haveContractions = TRUE;
8043           }
8044           if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8045             //fprintf(stderr, "S");
8046             goto returnRegular;
8047             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8048           }
8049         }
8050       }
8051       if(endOfSource) { // source is finished, but target is not, say the result.
8052           return UCOL_LESS;
8053       }
8054
8055       if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8056         sOrder = 0; tOrder = 0;
8057         continue;
8058       } else {
8059         // compare current top bytes
8060         if(((sOrder^tOrder)&0xFF000000)!=0) {
8061           // top bytes differ, return difference
8062           if(sOrder < tOrder) {
8063             return UCOL_LESS;
8064           } else if(sOrder > tOrder) {
8065             return UCOL_GREATER;
8066           }
8067           // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8068           // since we must return enum value
8069         }
8070
8071         // top bytes match, continue with following bytes
8072         sOrder<<=8;
8073         tOrder<<=8;
8074       }
8075     }
8076
8077 endOfPrimLoop:
8078     // after primary loop, we definitely know the sizes of strings,
8079     // so we set it and use simpler loop for secondaries and tertiaries
8080     sLen = sIndex; tLen = tIndex;
8081     if(strength >= UCOL_SECONDARY) {
8082       // adjust the table beggining
8083       elements += coll->latinOneTableLen;
8084       endOfSource = FALSE; endOfTarget = FALSE;
8085
8086       if(coll->frenchCollation == UCOL_OFF) { // non French
8087         // This loop is a simplified copy of primary loop
8088         // at this point we know that whole strings are latin-1, so we don't
8089         // check for that. We also know that we only have contractions as
8090         // specials.
8091         sIndex = 0; tIndex = 0;
8092         for(;;) {
8093           while(sOrder==0) {
8094             if(sIndex==sLen) {
8095               endOfSource = TRUE;
8096               break;
8097             }
8098             sChar=source[sIndex++];
8099             sOrder = elements[sChar];
8100             if(sOrder > UCOL_NOT_FOUND) {
8101               sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8102             }
8103           }
8104
8105           while(tOrder==0) {
8106             if(tIndex==tLen) {
8107               if(endOfSource) {
8108                 goto endOfSecLoop;
8109               } else {
8110                 return UCOL_GREATER;
8111               }
8112             }
8113             tChar=target[tIndex++];
8114             tOrder = elements[tChar];
8115             if(tOrder > UCOL_NOT_FOUND) {
8116               tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8117             }
8118           }
8119           if(endOfSource) {
8120               return UCOL_LESS;
8121           }
8122
8123           if(sOrder == tOrder) {
8124             sOrder = 0; tOrder = 0;
8125             continue;
8126           } else {
8127             // see primary loop for comments on this
8128             if(((sOrder^tOrder)&0xFF000000)!=0) {
8129               if(sOrder < tOrder) {
8130                 return UCOL_LESS;
8131               } else if(sOrder > tOrder) {
8132                 return UCOL_GREATER;
8133               }
8134             }
8135             sOrder<<=8;
8136             tOrder<<=8;
8137           }
8138         }
8139       } else { // French
8140         if(haveContractions) { // if we have contractions, we have to bail out
8141           // since we don't really know how to handle them here
8142           goto returnRegular;
8143           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8144         }
8145         // For French, we go backwards
8146         sIndex = sLen; tIndex = tLen;
8147         for(;;) {
8148           while(sOrder==0) {
8149             if(sIndex==0) {
8150               endOfSource = TRUE;
8151               break;
8152             }
8153             sChar=source[--sIndex];
8154             sOrder = elements[sChar];
8155             // don't even look for contractions
8156           }
8157
8158           while(tOrder==0) {
8159             if(tIndex==0) {
8160               if(endOfSource) {
8161                 goto endOfSecLoop;
8162               } else {
8163                 return UCOL_GREATER;
8164               }
8165             }
8166             tChar=target[--tIndex];
8167             tOrder = elements[tChar];
8168             // don't even look for contractions
8169           }
8170           if(endOfSource) {
8171               return UCOL_LESS;
8172           }
8173
8174           if(sOrder == tOrder) {
8175             sOrder = 0; tOrder = 0;
8176             continue;
8177           } else {
8178             // see the primary loop for comments
8179             if(((sOrder^tOrder)&0xFF000000)!=0) {
8180               if(sOrder < tOrder) {
8181                 return UCOL_LESS;
8182               } else if(sOrder > tOrder) {
8183                 return UCOL_GREATER;
8184               }
8185             }
8186             sOrder<<=8;
8187             tOrder<<=8;
8188           }
8189         }
8190       }
8191     }
8192
8193 endOfSecLoop:
8194     if(strength >= UCOL_TERTIARY) {
8195       // tertiary loop is the same as secondary (except no French)
8196       elements += coll->latinOneTableLen;
8197       sIndex = 0; tIndex = 0;
8198       endOfSource = FALSE; endOfTarget = FALSE;
8199       for(;;) {
8200         while(sOrder==0) {
8201           if(sIndex==sLen) {
8202             endOfSource = TRUE;
8203             break;
8204           }
8205           sChar=source[sIndex++];
8206           sOrder = elements[sChar];
8207           if(sOrder > UCOL_NOT_FOUND) {
8208             sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8209           }
8210         }
8211         while(tOrder==0) {
8212           if(tIndex==tLen) {
8213             if(endOfSource) {
8214               return UCOL_EQUAL; // if both strings are at the end, they are equal
8215             } else {
8216               return UCOL_GREATER;
8217             }
8218           }
8219           tChar=target[tIndex++];
8220           tOrder = elements[tChar];
8221           if(tOrder > UCOL_NOT_FOUND) {
8222             tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8223           }
8224         }
8225         if(endOfSource) {
8226             return UCOL_LESS;
8227         }
8228         if(sOrder == tOrder) {
8229           sOrder = 0; tOrder = 0;
8230           continue;
8231         } else {
8232           if(((sOrder^tOrder)&0xff000000)!=0) {
8233             if(sOrder < tOrder) {
8234               return UCOL_LESS;
8235             } else if(sOrder > tOrder) {
8236               return UCOL_GREATER;
8237             }
8238           }
8239           sOrder<<=8;
8240           tOrder<<=8;
8241         }
8242       }
8243     }
8244     return UCOL_EQUAL;
8245
8246 returnRegular:
8247     // Preparing the context objects for iterating over strings
8248     collIterate sColl, tColl;
8249
8250     IInit_collIterate(coll, source, sLen, &sColl);
8251     IInit_collIterate(coll, target, tLen, &tColl);
8252     return ucol_strcollRegular(&sColl, &tColl, status);
8253 }
8254
8255
8256 U_CAPI UCollationResult U_EXPORT2
8257 ucol_strcollIter( const UCollator    *coll,
8258                  UCharIterator *sIter,
8259                  UCharIterator *tIter,
8260                  UErrorCode         *status) {
8261   if(!status || U_FAILURE(*status) || sIter == tIter) {
8262     return UCOL_EQUAL;
8263   }
8264   if(sIter == NULL || tIter == NULL || coll == NULL) {
8265     *status = U_ILLEGAL_ARGUMENT_ERROR;
8266     return UCOL_EQUAL;
8267   }
8268
8269   UCollationResult result = UCOL_EQUAL;
8270
8271   // Preparing the context objects for iterating over strings
8272   collIterate sColl, tColl;
8273   // The division for the array length may truncate the array size to
8274   // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8275   // for all platforms anyway.
8276   UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8277   UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8278   UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8279
8280   IInit_collIterate(coll, NULL, -1, &sColl);
8281   sColl.iterator = sIter;
8282   sColl.flags |= UCOL_USE_ITERATOR;
8283   IInit_collIterate(coll, NULL, -1, &tColl);
8284   tColl.flags |= UCOL_USE_ITERATOR;
8285   tColl.iterator = tIter;
8286
8287   if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8288     sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8289     sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8290     sColl.flags &= ~UCOL_ITER_NORM;
8291
8292     tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8293     tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8294     tColl.flags &= ~UCOL_ITER_NORM;
8295   }
8296
8297   UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8298
8299   while((sChar = sColl.iterator->next(sColl.iterator)) ==
8300     (tChar = tColl.iterator->next(tColl.iterator))) {
8301     if(sChar == U_SENTINEL) {
8302       result = UCOL_EQUAL;
8303       goto end_compare;
8304     }
8305   }
8306
8307   if(sChar == U_SENTINEL) {
8308     tChar = tColl.iterator->previous(tColl.iterator);
8309   }
8310
8311   if(tChar == U_SENTINEL) {
8312     sChar = sColl.iterator->previous(sColl.iterator);
8313   }
8314
8315   sChar = sColl.iterator->previous(sColl.iterator);
8316   tChar = tColl.iterator->previous(tColl.iterator);
8317
8318   if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8319   {
8320       // We are stopped in the middle of a contraction.
8321       // Scan backwards through the == part of the string looking for the start of the contraction.
8322       //   It doesn't matter which string we scan, since they are the same in this region.
8323       do
8324       {
8325         sChar = sColl.iterator->previous(sColl.iterator);
8326         tChar = tColl.iterator->previous(tColl.iterator);
8327       }
8328       while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8329   }
8330
8331
8332   if(U_SUCCESS(*status)) {
8333     result = ucol_strcollRegular(&sColl, &tColl, status);
8334   }
8335
8336 end_compare:
8337   if(sNormIter || tNormIter) {
8338     unorm_closeIter(sNormIter);
8339     unorm_closeIter(tNormIter);
8340   }
8341
8342   return result;
8343 }
8344
8345
8346
8347 /*                                                                      */
8348 /* ucol_strcoll     Main public API string comparison function          */
8349 /*                                                                      */
8350 U_CAPI UCollationResult U_EXPORT2
8351 ucol_strcoll( const UCollator    *coll,
8352               const UChar        *source,
8353               int32_t            sourceLength,
8354               const UChar        *target,
8355               int32_t            targetLength) {
8356     U_ALIGN_CODE(16);
8357     UErrorCode status = U_ZERO_ERROR;
8358     if(source == NULL || target == NULL) {
8359       // do not crash, but return. Should have
8360       // status argument to return error.
8361       return UCOL_EQUAL;
8362     }
8363       collIterate sColl, tColl;
8364
8365     /* Scan the strings.  Find:                                                             */
8366     /*    The length of any leading portion that is equal                                   */
8367     /*    Whether they are exactly equal.  (in which case we just return)                   */
8368     const UChar    *pSrc    = source;
8369     const UChar    *pTarg   = target;
8370     int32_t        equalLength;
8371
8372     if (sourceLength == -1 && targetLength == -1) {
8373         // Both strings are null terminated.
8374         //    Check for them being the same string, and scan through
8375         //    any leading equal portion.
8376         if (source==target) {
8377             return UCOL_EQUAL;
8378         }
8379
8380         for (;;) {
8381             if ( *pSrc != *pTarg || *pSrc == 0) {
8382                 break;
8383             }
8384             pSrc++;
8385             pTarg++;
8386         }
8387         if (*pSrc == 0 && *pTarg == 0) {
8388             return UCOL_EQUAL;
8389         }
8390         equalLength = pSrc - source;
8391     }
8392     else
8393     {
8394         // One or both strings has an explicit length.
8395         /* check if source and target are same strings */
8396
8397         if (source==target  && sourceLength==targetLength) {
8398             return UCOL_EQUAL;
8399         }
8400         const UChar    *pSrcEnd = source + sourceLength;
8401         const UChar    *pTargEnd = target + targetLength;
8402
8403
8404         // Scan while the strings are bitwise ==, or until one is exhausted.
8405             for (;;) {
8406                 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8407                     break;
8408                 }
8409                 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8410                     break;
8411                 }
8412                 if (*pSrc != *pTarg) {
8413                     break;
8414                 }
8415                 pSrc++;
8416                 pTarg++;
8417             }
8418             equalLength = pSrc - source;
8419
8420             // If we made it all the way through both strings, we are done.  They are ==
8421             if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8422                 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))  {  /* and also at end of dest string                  */
8423                 return UCOL_EQUAL;
8424             }
8425     }
8426     if (equalLength > 0) {
8427         /* There is an identical portion at the beginning of the two strings.        */
8428         /*   If the identical portion ends within a contraction or a comibining      */
8429         /*   character sequence, back up to the start of that sequence.              */
8430         pSrc  = source + equalLength;        /* point to the first differing chars   */
8431         pTarg = target + equalLength;
8432         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
8433             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
8434         {
8435             // We are stopped in the middle of a contraction.
8436             // Scan backwards through the == part of the string looking for the start of the contraction.
8437             //   It doesn't matter which string we scan, since they are the same in this region.
8438             do
8439             {
8440                 equalLength--;
8441                 pSrc--;
8442             }
8443             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
8444         }
8445
8446         source += equalLength;
8447         target += equalLength;
8448         if (sourceLength > 0) {
8449             sourceLength -= equalLength;
8450         }
8451         if (targetLength > 0) {
8452             targetLength -= equalLength;
8453         }
8454     }
8455
8456     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
8457       // Preparing the context objects for iterating over strings
8458       IInit_collIterate(coll, source, sourceLength, &sColl);
8459       IInit_collIterate(coll, target, targetLength, &tColl);
8460       return ucol_strcollRegular(&sColl, &tColl, &status);
8461     } else {
8462       return ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
8463     }
8464 }
8465
8466 /* convenience function for comparing strings */
8467 U_CAPI UBool U_EXPORT2
8468 ucol_greater(    const    UCollator        *coll,
8469         const    UChar            *source,
8470         int32_t            sourceLength,
8471         const    UChar            *target,
8472         int32_t            targetLength)
8473 {
8474   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8475       == UCOL_GREATER);
8476 }
8477
8478 /* convenience function for comparing strings */
8479 U_CAPI UBool U_EXPORT2
8480 ucol_greaterOrEqual(    const    UCollator    *coll,
8481             const    UChar        *source,
8482             int32_t        sourceLength,
8483             const    UChar        *target,
8484             int32_t        targetLength)
8485 {
8486   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8487       != UCOL_LESS);
8488 }
8489
8490 /* convenience function for comparing strings */
8491 U_CAPI UBool U_EXPORT2
8492 ucol_equal(        const    UCollator        *coll,
8493             const    UChar            *source,
8494             int32_t            sourceLength,
8495             const    UChar            *target,
8496             int32_t            targetLength)
8497 {
8498   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
8499       == UCOL_EQUAL);
8500 }
8501
8502 /* returns the locale name the collation data comes from */
8503 U_CAPI const char * U_EXPORT2
8504 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
8505   const char *result = NULL;
8506   if(status == NULL || U_FAILURE(*status)) {
8507     return NULL;
8508   }
8509   switch(type) {
8510   case ULOC_ACTUAL_LOCALE:
8511     // validLocale is set only if service registration has explicitly set the
8512     // requested and valid locales.  if this is the case, the actual locale
8513     // is considered to be the valid locale.
8514     if (coll->validLocale != NULL) {
8515       result = coll->validLocale;
8516     } else if(coll->elements != NULL) {
8517       result = ures_getLocale(coll->elements, status);
8518     }
8519     break;
8520   case ULOC_VALID_LOCALE:
8521     if (coll->validLocale != NULL) {
8522       result = coll->validLocale;
8523     } else if(coll->rb != NULL) {
8524       result = ures_getLocale(coll->rb, status);
8525     }
8526     break;
8527   case ULOC_REQUESTED_LOCALE:
8528     result = coll->requestedLocale;
8529     break;
8530   default:
8531     *status = U_ILLEGAL_ARGUMENT_ERROR;
8532   }
8533   return result;
8534 }
8535
8536 U_CAPI USet * U_EXPORT2
8537 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
8538 {
8539   if(status == NULL || U_FAILURE(*status)) {
8540     return NULL;
8541   }
8542   if(coll == NULL) {
8543     *status = U_ILLEGAL_ARGUMENT_ERROR;
8544   }
8545   UParseError parseError;
8546   UColTokenParser src;
8547   int32_t rulesLen = 0;
8548   const UChar *rules = ucol_getRules(coll, &rulesLen);
8549   const UChar *current = NULL;
8550   UBool startOfRules = TRUE;
8551   // we internally use the C++ class, for the following reasons:
8552   // 1. we need to utilize canonical iterator, which is a C++ only class
8553   // 2. canonical iterator returns UnicodeStrings - USet cannot take them
8554   // 3. USet is internally really UnicodeSet, C is just a wrapper
8555   UnicodeSet *tailored = new UnicodeSet();
8556   UnicodeString pattern;
8557   CanonicalIterator it("", *status);
8558
8559
8560   // The idea is to tokenize the rule set. For each non-reset token,
8561   // we add all the canonicaly equivalent FCD sequences
8562   ucol_tok_initTokenList(&src, rules, rulesLen, UCA, status);
8563   while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
8564     startOfRules = FALSE;
8565     if(src.parsedToken.strength != UCOL_TOK_RESET) {
8566       const UChar *stuff = src.source+(src.parsedToken.charsOffset);
8567       it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
8568       pattern = it.next();
8569       while(!pattern.isBogus()) {
8570         if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
8571           tailored->add(pattern);
8572         }
8573         pattern = it.next();
8574       }
8575     }
8576   }
8577   ucol_tok_closeTokenList(&src);
8578   return (USet *)tailored;
8579 }
8580
8581 U_CAPI UBool U_EXPORT2
8582 ucol_equals(const UCollator *source, const UCollator *target) {
8583   UErrorCode status = U_ZERO_ERROR;
8584   // if pointers are equal, collators are equal
8585   if(source == target) {
8586     return TRUE;
8587   }
8588   int32_t i = 0, j = 0;
8589   // if any of attributes are different, collators are not equal
8590   for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
8591     if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) {
8592       return FALSE;
8593     }
8594   }
8595
8596   int32_t sourceRulesLen = 0, targetRulesLen = 0;
8597   const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen);
8598   const UChar *targetRules = ucol_getRules(target, &targetRulesLen);
8599
8600   if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) {
8601     // all the attributes are equal and the rules are equal - collators are equal
8602     return(TRUE);
8603   }
8604   // hard part, need to construct tree from rules and see if they yield the same tailoring
8605   UBool result = TRUE;
8606   UParseError parseError;
8607   UColTokenParser sourceParser, targetParser;
8608   int32_t sourceListLen = 0, targetListLen = 0;
8609   ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, UCA, &status);
8610   ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, UCA, &status);
8611   sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status);
8612   targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status);
8613
8614   if(sourceListLen != targetListLen) {
8615     // different number of resets
8616     result = FALSE;
8617   } else {
8618     UColToken *sourceReset = NULL, *targetReset = NULL;
8619     UChar *sourceResetString = NULL, *targetResetString = NULL;
8620     int32_t sourceStringLen = 0, targetStringLen = 0;
8621     for(i = 0; i < sourceListLen; i++) {
8622       sourceReset = sourceParser.lh[i].reset;
8623       sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
8624       sourceStringLen = sourceReset->source >> 24;
8625       for(j = 0; j < sourceListLen; j++) {
8626         targetReset = targetParser.lh[j].reset;
8627         targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
8628         targetStringLen = targetReset->source >> 24;
8629         if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) {
8630           sourceReset = sourceParser.lh[i].first;
8631           targetReset = targetParser.lh[j].first;
8632           while(sourceReset != NULL && targetReset != NULL) {
8633             sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
8634             sourceStringLen = sourceReset->source >> 24;
8635             targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
8636             targetStringLen = targetReset->source >> 24;
8637             if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
8638               result = FALSE;
8639               goto returnResult;
8640             }
8641             // probably also need to check the expansions
8642             if(sourceReset->expansion) {
8643               if(!targetReset->expansion) {
8644                 result = FALSE;
8645                 goto returnResult;
8646               } else {
8647                 // compare expansions
8648                 sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF);
8649                 sourceStringLen = sourceReset->expansion >> 24;
8650                 targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF);
8651                 targetStringLen = targetReset->expansion >> 24;
8652                 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
8653                   result = FALSE;
8654                   goto returnResult;
8655                 }
8656               }
8657             } else {
8658               if(targetReset->expansion) {
8659                 result = FALSE;
8660                 goto returnResult;
8661               }
8662             }
8663             sourceReset = sourceReset->next;
8664             targetReset = targetReset->next;
8665           }
8666           if(sourceReset != targetReset) { // at least one is not NULL
8667             // there are more tailored elements in one list
8668             result = FALSE;
8669             goto returnResult;
8670           }
8671
8672
8673           break;
8674         }
8675       }
8676       // couldn't find the reset anchor, so the collators are not equal
8677       if(j == sourceListLen) {
8678         result = FALSE;
8679         goto returnResult;
8680       }
8681     }
8682   }
8683
8684 returnResult:
8685   ucol_tok_closeTokenList(&sourceParser);
8686   ucol_tok_closeTokenList(&targetParser);
8687   return result;
8688
8689 }
8690 #endif /* #if !UCONFIG_NO_COLLATION */