icuSources/i18n/ucol.cpp

   1 /*
   2 *******************************************************************************
   3 *   Copyright (C) 1996-2004, International Business Machines
   4 *   Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 *   file name:  ucol.cpp
   7 *   encoding:   US-ASCII
   8 *   tab size:   8 (not used)
   9 *   indentation:4
  10 *
  11 * Modification history
  12 * Date        Name      Comments
  13 * 1996-1999   various members of ICU team maintained C API for collation framework
  14 * 02/16/2001  synwee    Added internal method getPrevSpecialCE
  15 * 03/01/2001  synwee    Added maxexpansion functionality.
  16 * 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
  17 */
  18
  19 #include "unicode/utypes.h"
  20 #include "ustrenum.h"
  21 #include "uassert.h"
  22
  23 #if !UCONFIG_NO_COLLATION
  24
  25 #include "unicode/uloc.h"
  26 #include "unicode/coll.h"
  27 #include "unicode/tblcoll.h"
  28 #include "unicode/coleitr.h"
  29 #include "unicode/unorm.h"
  30 #include "unicode/udata.h"
  31 #include "unicode/uchar.h"
  32 #include "unicode/caniter.h"
  33
  34 #include "ucol_bld.h"
  35 #include "ucol_imp.h"
  36 #include "ucol_tok.h"
  37 #include "ucol_elm.h"
  38 #include "bocsu.h"
  39
  40 #include "unormimp.h"
  41 #include "unorm_it.h"
  42 #include "uresimp.h"
  43 #include "umutex.h"
  44 #include "uhash.h"
  45 #include "ucln_in.h"
  46 #include "cstring.h"
  47 #include "utracimp.h"
  48 #include "putilimp.h"
  49
  50 #ifdef UCOL_DEBUG
  51 #include <stdio.h>
  52 #endif
  53
  54 U_NAMESPACE_USE
  55
  56 /* Shift and mask constants added by synwee for trie manipulation. */
  57 #define STAGE_1_SHIFT_            10
  58 #define STAGE_2_SHIFT_            4
  59 #define STAGE_2_MASK_AFTER_SHIFT_ 0x3F
  60 #define STAGE_3_MASK_             0xF
  61 #define LAST_BYTE_MASK_           0xFF
  62 #define SECOND_LAST_BYTE_SHIFT_   8
  63
  64 #define ZERO_CC_LIMIT_            0xC0
  65
  66 // The static UCA collator. There is only one. Collators don't use this pointer.
  67 // It is referenced only in ucol_initUCA and ucol_cleanup.
  68 static UCollator* _staticUCA = NULL;
  69 // Static pointer to the UCA udata memory. Initialized in ucol_initUCA and
  70 // used for cleanup in ucol_cleanup.
  71 static UDataMemory* UCA_DATA_MEM = NULL;
  72
  73 // This is a static pointer to the normalizer fcdTrieIndex.
  74 // It is always the same between calls to u_cleanup,
  75 // and therefore writing to it is not synchronized.
  76 // It is cleaned in ucol_cleanup.
  77 static const uint16_t *fcdTrieIndex=NULL;
  78
  79 U_CDECL_BEGIN
  80 static UBool U_CALLCONV
  81 isAcceptableUCA(void * /*context*/,
  82              const char * /*type*/, const char * /*name*/,
  83              const UDataInfo *pInfo){
  84   /* context, type & name are intentionally not used */
  85     if( pInfo->size>=20 &&
  86         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  87         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  88         pInfo->dataFormat[0]==UCA_DATA_FORMAT_0 &&   /* dataFormat="UCol" */
  89         pInfo->dataFormat[1]==UCA_DATA_FORMAT_1 &&
  90         pInfo->dataFormat[2]==UCA_DATA_FORMAT_2 &&
  91         pInfo->dataFormat[3]==UCA_DATA_FORMAT_3 &&
  92         pInfo->formatVersion[0]==UCA_FORMAT_VERSION_0 &&
  93         pInfo->formatVersion[1]>=UCA_FORMAT_VERSION_1// &&
  94         //pInfo->formatVersion[1]==UCA_FORMAT_VERSION_1 &&
  95         //pInfo->formatVersion[2]==UCA_FORMAT_VERSION_2 && // Too harsh
  96         //pInfo->formatVersion[3]==UCA_FORMAT_VERSION_3 && // Too harsh
  97         ) {
  98         UVersionInfo UCDVersion;
  99         u_getUnicodeVersion(UCDVersion);
 100         if(pInfo->dataVersion[0]==UCDVersion[0] &&
 101           pInfo->dataVersion[1]==UCDVersion[1]) { // &&
 102         //pInfo->dataVersion[2]==ucaDataInfo.dataVersion[2] &&
 103         //pInfo->dataVersion[3]==ucaDataInfo.dataVersion[3]) {
 104           return TRUE;
 105         } else {
 106           return FALSE;
 107         }
 108     } else {
 109         return FALSE;
 110     }
 111 }
 112
 113
 114 static int32_t U_CALLCONV
 115 _getFoldingOffset(uint32_t data) {
 116     return (int32_t)(data&0xFFFFFF);
 117 }
 118
 119 U_CDECL_END
 120
 121 static
 122 inline void  IInit_collIterate(const UCollator *collator, const UChar *sourceString,
 123                               int32_t sourceLen, collIterate *s) {
 124     (s)->string = (s)->pos = (UChar *)(sourceString);
 125     (s)->origFlags = 0;
 126     (s)->flags = 0;
 127     if (sourceLen >= 0) {
 128         s->flags |= UCOL_ITER_HASLEN;
 129         (s)->endp = (UChar *)sourceString+sourceLen;
 130     }
 131     else {
 132         /* change to enable easier checking for end of string for fcdpositon */
 133         (s)->endp = NULL;
 134     }
 135     (s)->CEpos = (s)->toReturn = (s)->CEs;
 136     (s)->writableBuffer = (s)->stackWritableBuffer;
 137     (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
 138     (s)->coll = (collator);
 139     (s)->fcdPosition = 0;
 140     if(collator->normalizationMode == UCOL_ON) {
 141         (s)->flags |= UCOL_ITER_NORM;
 142     }
 143     if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) {
 144       (s)->flags |= UCOL_HIRAGANA_Q;
 145     }
 146     (s)->iterator = NULL;
 147     //(s)->iteratorIndex = 0;
 148 }
 149
 150 U_CAPI void  U_EXPORT2
 151 uprv_init_collIterate(const UCollator *collator, const UChar *sourceString,
 152                              int32_t sourceLen, collIterate *s){
 153     /* Out-of-line wrapper around the inline IInit_collIterate(), exported for use from other files. */
 154     IInit_collIterate(collator, sourceString, sourceLen, s);
 155 }
 156
 157
 158 /**
 159 * Backup the state of the collIterate struct data
 160 * @param data collIterate to backup
 161 * @param backup storage
 162 */
 163 static
 164 inline void backupState(const collIterate *data, collIterateState *backup)
 165 {
 166     backup->fcdPosition = data->fcdPosition;
 167     backup->flags       = data->flags;
 168     backup->origFlags   = data->origFlags;
 169     backup->pos         = data->pos;
 170     backup->bufferaddress = data->writableBuffer;
 171     backup->buffersize    = data->writableBufSize;
 172     if(data->iterator != NULL) {
 173       //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT);
 174       backup->iteratorIndex = data->iterator->getState(data->iterator);
 175       // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE
 176       backup->iteratorMove = 0;
 177       if(backup->iteratorIndex == UITER_NO_STATE) {
 178         while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) {
 179           backup->iteratorMove++;
 180           data->iterator->move(data->iterator, -1, UITER_CURRENT);
 181         }
 182         data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 183       }
 184     }
 185 }
 186
 187 /**
 188 * Loads the state into the collIterate struct data
 189 * @param data collIterate to backup
 190 * @param backup storage
 191 * @param forwards boolean to indicate if forwards iteration is used,
 192 *        false indicates backwards iteration
 193 */
 194 static
 195 inline void loadState(collIterate *data, const collIterateState *backup,
 196                       UBool        forwards)
 197 {
 198   UErrorCode status = U_ZERO_ERROR;
 199     data->flags       = backup->flags;
 200     data->origFlags   = backup->origFlags;
 201     if(data->iterator != NULL) {
 202       //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO);
 203       data->iterator->setState(data->iterator, backup->iteratorIndex, &status);
 204       if(backup->iteratorMove != 0) {
 205         data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT);
 206       }
 207     }
 208     data->pos         = backup->pos;
 209     if ((data->flags & UCOL_ITER_INNORMBUF) &&
 210         data->writableBuffer != backup->bufferaddress) {
 211         /*
 212         this is when a new buffer has been reallocated and we'll have to
 213         calculate the new position.
 214         note the new buffer has to contain the contents of the old buffer.
 215         */
 216         if (forwards) {
 217             data->pos = data->writableBuffer +
 218                                          (data->pos - backup->bufferaddress);
 219         }
 220         else {
 221             /* backwards direction */
 222             uint32_t temp = backup->buffersize -
 223                                   (data->pos - backup->bufferaddress);
 224             data->pos = data->writableBuffer + (data->writableBufSize - temp);
 225         }
 226     }
 227     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
 228         /*
 229         this is alittle tricky.
 230         if we are initially not in the normalization buffer, even if we
 231         normalize in the later stage, the data in the buffer will be
 232         ignored, since we skip back up to the data string.
 233         however if we are already in the normalization buffer, any
 234         further normalization will pull data into the normalization
 235         buffer and modify the fcdPosition.
 236         since we are keeping the data in the buffer for use, the
 237         fcdPosition can not be reverted back.
 238         arrgghh....
 239         */
 240         data->fcdPosition = backup->fcdPosition;
 241     }
 242 }
 243
 244
 245 /*
 246 * collIter_eos()
 247 *     Checks for a collIterate being positioned at the end of
 248 *     its source string.
 249 *
 250 */
 251 static
 252 inline UBool collIter_eos(collIterate *s) {
 253     if(s->flags & UCOL_USE_ITERATOR) {
 254       return !(s->iterator->hasNext(s->iterator));
 255     }
 256     if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) {
 257         // Null terminated string, but not at null, so not at end.
 258         //   Whether in main or normalization buffer doesn't matter.
 259         return FALSE;
 260     }
 261
 262     // String with length.  Can't be in normalization buffer, which is always
 263     //  null termintated.
 264     if (s->flags & UCOL_ITER_HASLEN) {
 265         return (s->pos == s->endp);
 266     }
 267
 268     // We are at a null termination, could be either normalization buffer or main string.
 269     if ((s->flags & UCOL_ITER_INNORMBUF) == 0) {
 270         // At null at end of main string.
 271         return TRUE;
 272     }
 273
 274     // At null at end of normalization buffer.  Need to check whether there there are
 275     //   any characters left in the main buffer.
 276     if(s->origFlags & UCOL_USE_ITERATOR) {
 277       return !(s->iterator->hasNext(s->iterator));
 278     } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) {
 279         // Null terminated main string.  fcdPosition is the 'return' position into main buf.
 280         return (*s->fcdPosition == 0);
 281     }
 282     else {
 283         // Main string with an end pointer.
 284         return s->fcdPosition == s->endp;
 285     }
 286 }
 287
 288 /*
 289 * collIter_bos()
 290 *     Checks for a collIterate being positioned at the start of
 291 *     its source string.
 292 *
 293 */
 294 static
 295 inline UBool collIter_bos(collIterate *source) {
 296   // if we're going backwards, we need to know whether there is more in the
 297   // iterator, even if we are in the side buffer
 298   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 299     return !source->iterator->hasPrevious(source->iterator);
 300   }
 301   if (source->pos <= source->string ||
 302       ((source->flags & UCOL_ITER_INNORMBUF) &&
 303       *(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
 304     return TRUE;
 305   }
 306   return FALSE;
 307 }
 308
 309 static
 310 inline UBool collIter_SimpleBos(collIterate *source) {
 311   // if we're going backwards, we need to know whether there is more in the
 312   // iterator, even if we are in the side buffer
 313   if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) {
 314     return !source->iterator->hasPrevious(source->iterator);
 315   }
 316   if (source->pos == source->string) {
 317     return TRUE;
 318   }
 319   return FALSE;
 320 }
 321     //return (data->pos == data->string) ||
 322
 323
 324 /**
 325 * Checks and free writable buffer if it is not the original stack buffer
 326 * in collIterate. This function does not reassign the writable buffer.
 327 * @param data collIterate struct to determine and free the writable buffer
 328 */
 329 static
 330 inline void freeHeapWritableBuffer(collIterate *data)
 331 {
 332     if (data->writableBuffer != data->stackWritableBuffer) {
 333         uprv_free(data->writableBuffer);
 334     }
 335 }
 336
 337
 338 /****************************************************************************/
 339 /* Following are the open/close functions                                   */
 340 /*                                                                          */
 341 /****************************************************************************/
 342 static UCollator*
 343 tryOpeningFromRules(UResourceBundle *collElem, UErrorCode *status) {
 344   int32_t rulesLen = 0;
 345   const UChar *rules = ures_getStringByKey(collElem, "Sequence", &rulesLen, status);
 346   return ucol_openRules(rules, rulesLen, UCOL_DEFAULT, UCOL_DEFAULT, NULL, status);
 347
 348 }
 349
 350
 351 U_CAPI UCollator*
 352 ucol_open(const char *loc,
 353           UErrorCode *status)
 354 {
 355   UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN);
 356   UTRACE_DATA1(UTRACE_INFO, "locale = \"%s\"", loc);
 357   UCollator *result = NULL;
 358
 359   u_init(status);
 360 #if !UCONFIG_NO_SERVICE
 361   result = Collator::createUCollator(loc, status);
 362   if (result == NULL)
 363 #endif
 364   {
 365     result = ucol_open_internal(loc, status);
 366   }
 367   UTRACE_EXIT_PTR_STATUS(result, *status);
 368   return result;
 369 }
 370
 371 // API in ucol_imp.h
 372
 373 U_CFUNC UCollator*
 374 ucol_open_internal(const char *loc,
 375                    UErrorCode *status)
 376 {
 377   const UCollator* UCA = ucol_initUCA(status);
 378
 379   /* New version */
 380   if(U_FAILURE(*status)) return 0;
 381
 382
 383
 384   UCollator *result = NULL;
 385   UResourceBundle *b = ures_open(U_ICUDATA_COLL, loc, status);
 386
 387   /* we try to find stuff from keyword */
 388   UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status);
 389   UResourceBundle *collElem = NULL;
 390   char keyBuffer[256];
 391   // if there is a keyword, we pick it up and try to get elements
 392   if(!uloc_getKeywordValue(loc, "collation", keyBuffer, 256, status)) {
 393     // no keyword. we try to find the default setting, which will give us the keyword value
 394     UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status);
 395     if(U_SUCCESS(*status)) {
 396       int32_t defaultKeyLen = 0;
 397       const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status);
 398       u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen);
 399       keyBuffer[defaultKeyLen] = 0;
 400     } else {
 401       *status = U_INTERNAL_PROGRAM_ERROR;
 402       return NULL;
 403     }
 404     ures_close(defaultColl);
 405   }
 406   collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status);
 407
 408   UResourceBundle *binary = NULL;
 409   UErrorCode binaryStatus = U_ZERO_ERROR;
 410
 411   if(*status == U_MISSING_RESOURCE_ERROR) { /* We didn't find the tailoring data, we fallback to the UCA */
 412     *status = U_USING_DEFAULT_WARNING;
 413     result = ucol_initCollator(UCA->image, result, UCA, status);
 414     // if we use UCA, real locale is root
 415     result->rb = ures_open(U_ICUDATA_COLL, "", status);
 416     result->elements = ures_open(U_ICUDATA_COLL, "", status);
 417     if(U_FAILURE(*status)) {
 418       goto clean;
 419     }
 420     ures_close(b);
 421     result->hasRealData = FALSE;
 422   } else if(U_SUCCESS(*status)) {
 423     binary = ures_getByKey(collElem, "%%CollationBin", NULL, &binaryStatus);
 424
 425     if(binaryStatus == U_MISSING_RESOURCE_ERROR) { /* we didn't find the binary image, we should use the rules */
 426       binary = NULL;
 427       result = tryOpeningFromRules(collElem, status);
 428       if(U_FAILURE(*status)) {
 429         goto clean;
 430       }
 431     } else if(U_SUCCESS(*status)) { /* otherwise, we'll pick a collation data that exists */
 432       int32_t len = 0;
 433       const uint8_t *inData = ures_getBinary(binary, &len, status);
 434       UCATableHeader *colData = (UCATableHeader *)inData;
 435       if(uprv_memcmp(colData->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
 436         uprv_memcmp(colData->UCDVersion, UCA->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
 437         colData->version[0] != UCOL_BUILDER_VERSION) {
 438         *status = U_DIFFERENT_UCA_VERSION;
 439         result = tryOpeningFromRules(collElem, status);
 440       } else {
 441         if(U_FAILURE(*status)){
 442           goto clean;
 443         }
 444         if((uint32_t)len > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
 445           result = ucol_initCollator((const UCATableHeader *)inData, result, UCA, status);
 446           if(U_FAILURE(*status)){
 447             goto clean;
 448           }
 449           result->hasRealData = TRUE;
 450         } else {
 451           result = ucol_initCollator(UCA->image, result, UCA, status);
 452           ucol_setOptionsFromHeader(result, (UColOptionSet *)(inData+((const UCATableHeader *)inData)->options), status);
 453           if(U_FAILURE(*status)){
 454             goto clean;
 455           }
 456           result->hasRealData = FALSE;
 457         }
 458         result->freeImageOnClose = FALSE;
 459       }
 460     }
 461     result->rb = b;
 462     result->elements = collElem;
 463   } else { /* There is another error, and we're just gonna clean up */
 464 clean:
 465     ures_close(b);
 466     ures_close(collElem);
 467     ures_close(collations);
 468     ures_close(binary);
 469     return NULL;
 470   }
 471
 472   result->validLocale = NULL; // default is to use rb info
 473
 474   if(loc == NULL) {
 475     loc = ures_getLocale(result->rb, status);
 476   }
 477   result->requestedLocale = (char *)uprv_malloc((uprv_strlen(loc)+1)*sizeof(char));
 478   /* test for NULL */
 479   if (result->requestedLocale == NULL) {
 480     *status = U_MEMORY_ALLOCATION_ERROR;
 481     ures_close(b); // ??? appears needed
 482     ures_close(collElem);
 483     ures_close(collations);
 484     ures_close(binary); // ??? appears needed
 485     return NULL;
 486   }
 487   uprv_strcpy(result->requestedLocale, loc);
 488
 489   ures_close(binary);
 490   ures_close(collations); //??? we have to decide on that. Probably affects something :)
 491   return result;
 492 }
 493
 494
 495 U_CAPI void U_EXPORT2
 496 ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *validLocaleToAdopt)
 497 {
 498   if (coll) {
 499     if (coll->validLocale) {
 500       uprv_free(coll->validLocale);
 501     }
 502     coll->validLocale = validLocaleToAdopt;
 503     if (coll->requestedLocale) { // should always have
 504       uprv_free(coll->requestedLocale);
 505     }
 506     coll->requestedLocale = requestedLocaleToAdopt;
 507   }
 508 }
 509
 510 U_CAPI void U_EXPORT2
 511 ucol_close(UCollator *coll)
 512 {
 513   UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE);
 514   UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll);
 515   if(coll != NULL) {
 516       // these are always owned by each UCollator struct,
 517       // so we always free them
 518       if(coll->validLocale != NULL) {
 519           uprv_free(coll->validLocale);
 520       }
 521       if(coll->requestedLocale != NULL) {
 522           uprv_free(coll->requestedLocale);
 523       }
 524
 525       /* Here, it would be advisable to close: */
 526       /* - UData for UCA (unless we stuff it in the root resb */
 527       /* Again, do we need additional housekeeping... HMMM! */
 528       UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose);
 529       if(coll->freeOnClose){
 530       /* for safeClone, if freeOnClose is FALSE,
 531           don't free the other instance data */
 532           if(coll->freeOptionsOnClose != FALSE) {
 533               if(coll->options != NULL) {
 534                   uprv_free(coll->options);
 535               }
 536           }
 537           if(coll->mapping != NULL) {
 538               /*ucmpe32_close(coll->mapping);*/
 539               uprv_free(coll->mapping);
 540           }
 541           if(coll->rules != NULL && coll->freeRulesOnClose) {
 542               uprv_free((UChar *)coll->rules);
 543           }
 544           if(coll->rb != NULL) { /* pointing to read-only memory */
 545               ures_close(coll->rb);
 546           }
 547           if(coll->freeImageOnClose == TRUE) {
 548               uprv_free((UCATableHeader *)coll->image);
 549           }
 550           if(coll->elements != NULL) {
 551               ures_close(coll->elements);
 552           }
 553           if(coll->latinOneCEs != NULL) {
 554               uprv_free(coll->latinOneCEs);
 555           }
 556           uprv_free(coll);
 557       }
 558   }
 559   UTRACE_EXIT();
 560 }
 561
 562 U_CAPI UCollator* U_EXPORT2
 563 ucol_openRules( const UChar        *rules,
 564                 int32_t            rulesLength,
 565                 UColAttributeValue normalizationMode,
 566                 UCollationStrength strength,
 567                 UParseError        *parseError,
 568                 UErrorCode         *status)
 569 {
 570   uint32_t listLen = 0;
 571   UColTokenParser src;
 572   UColAttributeValue norm;
 573   UParseError tErr;
 574
 575   if(status == NULL || U_FAILURE(*status)){
 576     return 0;
 577   }
 578
 579   u_init(status);
 580   if (U_FAILURE(*status)) {
 581       return NULL;
 582   }
 583
 584   if(rulesLength < -1 || (rules == NULL && rulesLength != 0)) {
 585     *status = U_ILLEGAL_ARGUMENT_ERROR;
 586     return 0;
 587   }
 588
 589   if(rulesLength == -1) {
 590     rulesLength = u_strlen(rules);
 591   }
 592
 593   if(parseError == NULL){
 594     parseError = &tErr;
 595   }
 596
 597   switch(normalizationMode) {
 598   case UCOL_OFF:
 599   case UCOL_ON:
 600   case UCOL_DEFAULT:
 601     norm = normalizationMode;
 602     break;
 603   default:
 604     *status = U_ILLEGAL_ARGUMENT_ERROR;
 605     return 0;
 606   }
 607
 608   UCollator *UCA = ucol_initUCA(status);
 609
 610   if(U_FAILURE(*status)){
 611     return NULL;
 612   }
 613
 614   ucol_tok_initTokenList(&src, rules, rulesLength, UCA, status);
 615   listLen = ucol_tok_assembleTokenList(&src,parseError, status);
 616
 617   if(U_FAILURE(*status)) {
 618     /* if status is U_ILLEGAL_ARGUMENT_ERROR, src->current points at the offending option */
 619     /* if status is U_INVALID_FORMAT_ERROR, src->current points after the problematic part of the rules */
 620     /* so something might be done here... or on lower level */
 621 #ifdef UCOL_DEBUG
 622     if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
 623       fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
 624     } else {
 625       fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
 626     }
 627 #endif
 628     ucol_tok_closeTokenList(&src);
 629     return NULL;
 630   }
 631   UCollator *result = NULL;
 632   UCATableHeader *table = NULL;
 633
 634   if(src.resultLen > 0 || src.removeSet != NULL) { /* we have a set of rules, let's make something of it */
 635     /* also, if we wanted to remove some contractions, we should make a tailoring */
 636     table = ucol_assembleTailoringTable(&src, status);
 637     if(U_SUCCESS(*status)) {
 638       // builder version
 639       table->version[0] = UCOL_BUILDER_VERSION;
 640       // no tailoring information on this level
 641       table->version[1] = table->version[2] = table->version[3] = 0;
 642       // set UCD version
 643       u_getUnicodeVersion(table->UCDVersion);
 644       // set UCA version
 645       uprv_memcpy(table->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo));
 646       result = ucol_initCollator(table, 0, UCA, status);
 647       result->hasRealData = TRUE;
 648       result->freeImageOnClose = TRUE;
 649     }
 650   } else { /* no rules, but no error either */
 651     // must be only options
 652     // We will init the collator from UCA
 653     result = ucol_initCollator(UCA->image, 0, UCA, status);
 654     // And set only the options
 655     UColOptionSet *opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
 656     /* test for NULL */
 657     if (opts == NULL) {
 658         *status = U_MEMORY_ALLOCATION_ERROR;
 659         goto cleanup;
 660     }
 661     uprv_memcpy(opts, src.opts, sizeof(UColOptionSet));
 662     ucol_setOptionsFromHeader(result, opts, status);
 663     result->freeOptionsOnClose = TRUE;
 664     result->hasRealData = FALSE;
 665     result->freeImageOnClose = FALSE;
 666   }
 667
 668   if(U_SUCCESS(*status)) {
 669     UChar *newRules;
 670     result->dataInfo.dataVersion[0] = UCOL_BUILDER_VERSION;
 671     if(rulesLength > 0) {
 672       newRules = (UChar *)uprv_malloc((rulesLength+1)*U_SIZEOF_UCHAR);
 673       /* test for NULL */
 674       if (newRules == NULL) {
 675           *status = U_MEMORY_ALLOCATION_ERROR;
 676           goto cleanup;
 677       }
 678       uprv_memcpy(newRules, rules, rulesLength*U_SIZEOF_UCHAR);
 679       newRules[rulesLength]=0;
 680       result->rules = newRules;
 681       result->rulesLength = rulesLength;
 682       result->freeRulesOnClose = TRUE;
 683     }
 684     result->rb = NULL;
 685     result->elements = NULL;
 686     result->validLocale = NULL;
 687     result->requestedLocale = NULL;
 688     ucol_setAttribute(result, UCOL_STRENGTH, strength, status);
 689     ucol_setAttribute(result, UCOL_NORMALIZATION_MODE, norm, status);
 690   } else {
 691 cleanup:
 692     if(result != NULL) {
 693       ucol_close(result);
 694     } else {
 695       if(table != NULL) {
 696         uprv_free(table);
 697       }
 698     }
 699     result = NULL;
 700   }
 701
 702   ucol_tok_closeTokenList(&src);
 703
 704   return result;
 705 }
 706
 707 /* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/
 708 /* you should be able to get the binary chunk to write out...  Doesn't look very full now */
 709 U_CAPI uint8_t* U_EXPORT2
 710 ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status)
 711 {
 712   uint8_t *result = NULL;
 713   if(U_FAILURE(*status)) {
 714     return NULL;
 715   }
 716   if(coll->hasRealData == TRUE) {
 717     *length = coll->image->size;
 718     result = (uint8_t *)uprv_malloc(*length);
 719     /* test for NULL */
 720     if (result == NULL) {
 721         *status = U_MEMORY_ALLOCATION_ERROR;
 722         return NULL;
 723     }
 724     uprv_memcpy(result, coll->image, *length);
 725   } else {
 726     *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
 727     result = (uint8_t *)uprv_malloc(*length);
 728     /* test for NULL */
 729     if (result == NULL) {
 730         *status = U_MEMORY_ALLOCATION_ERROR;
 731         return NULL;
 732     }
 733
 734     /* build the UCATableHeader with minimal entries */
 735     /* do not copy the header from the UCA file because its values are wrong! */
 736     /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
 737
 738     /* reset everything */
 739     uprv_memset(result, 0, *length);
 740
 741     /* set the tailoring-specific values */
 742     UCATableHeader *myData = (UCATableHeader *)result;
 743     myData->size = *length;
 744
 745     /* offset for the options, the only part of the data that is present after the header */
 746     myData->options = sizeof(UCATableHeader);
 747
 748     /* need to always set the expansion value for an upper bound of the options */
 749     myData->expansion = myData->options + sizeof(UColOptionSet);
 750
 751     myData->magic = UCOL_HEADER_MAGIC;
 752     myData->isBigEndian = U_IS_BIG_ENDIAN;
 753     myData->charSetFamily = U_CHARSET_FAMILY;
 754
 755     /* copy UCA's version; genrb will override all but the builder version with tailoring data */
 756     uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
 757
 758     uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
 759     uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
 760     uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
 761     myData->jamoSpecial = coll->image->jamoSpecial;
 762
 763     /* copy the collator options */
 764     uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
 765   }
 766   return result;
 767 }
 768
 769 void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 770   if(U_FAILURE(*status)) {
 771     return;
 772   }
 773     result->caseFirst = (UColAttributeValue)opts->caseFirst;
 774     result->caseLevel = (UColAttributeValue)opts->caseLevel;
 775     result->frenchCollation = (UColAttributeValue)opts->frenchCollation;
 776     result->normalizationMode = (UColAttributeValue)opts->normalizationMode;
 777     result->strength = (UColAttributeValue)opts->strength;
 778     result->variableTopValue = opts->variableTopValue;
 779     result->alternateHandling = (UColAttributeValue)opts->alternateHandling;
 780     result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ;
 781     result->numericCollation = (UColAttributeValue)opts->numericCollation;
 782
 783     result->caseFirstisDefault = TRUE;
 784     result->caseLevelisDefault = TRUE;
 785     result->frenchCollationisDefault = TRUE;
 786     result->normalizationModeisDefault = TRUE;
 787     result->strengthisDefault = TRUE;
 788     result->variableTopValueisDefault = TRUE;
 789     result->hiraganaQisDefault = TRUE;
 790     result->numericCollationisDefault = TRUE;
 791
 792     ucol_updateInternalState(result, status);
 793
 794     result->options = opts;
 795 }
 796
 797 #if 0
 798 // Compiled out: it doesn't look like anybody is using this. Copies the collator's attribute values into opts.
 799 void ucol_putOptionsToHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) {
 800   if(U_FAILURE(*status)) {
 801     return;
 802   }
 803     opts->caseFirst = result->caseFirst;
 804     opts->caseLevel = result->caseLevel;
 805     opts->frenchCollation = result->frenchCollation;
 806     opts->normalizationMode = result->normalizationMode;
 807     opts->strength = result->strength;
 808     opts->variableTopValue = result->variableTopValue;
 809     opts->alternateHandling = result->alternateHandling;
 810     opts->hiraganaQ = result->hiraganaQ;
 811     opts->numericCollation = result->numericCollation;
 812 }
 813 #endif
 814
 815
 816 /**
 817 * Approximate determination if a character is at a contraction end.
 818 * Guaranteed to be TRUE if a character is at the end of a contraction,
 819 * otherwise it is not deterministic.
 820 * @param c character to be determined
 821 * @param coll collator
 822 */
 823 static
 824 inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
 825     if (UTF_IS_TRAIL(c)) {
 826       return TRUE;
 827     }
 828
 829     if (c < coll->minContrEndCP) {
 830         return FALSE;
 831     }
 832
 833     int32_t  hash = c;
 834     uint8_t  htbyte;
 835     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 836         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 837     }
 838     htbyte = coll->contrEndCP[hash>>3];
 839     return (((htbyte >> (hash & 7)) & 1) == 1);
 840 }
 841
 842
 843
 844 /*
 845 *   i_getCombiningClass()
 846 *        A fast, at least partly inline version of u_getCombiningClass()
 847 *        This is a candidate for further optimization.  Used heavily
 848 *        in contraction processing.
 849 */
 850 static
 851 inline uint8_t i_getCombiningClass(UChar c, const UCollator *coll) {
 852     uint8_t sCC = 0;
 853     if (c >= 0x300 && ucol_unsafeCP(c, coll)) {
 854         sCC = u_getCombiningClass(c);
 855     }
 856     return sCC;
 857 }
 858
 859
 860 UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) {
 861     UChar c;
 862     UCollator *result = fillIn;
 863     if(U_FAILURE(*status) || image == NULL) {
 864         return NULL;
 865     }
 866
 867     if(result == NULL) {
 868         result = (UCollator *)uprv_malloc(sizeof(UCollator));
 869         if(result == NULL) {
 870             *status = U_MEMORY_ALLOCATION_ERROR;
 871             return result;
 872         }
 873         result->freeOnClose = TRUE;
 874     } else {
 875         result->freeOnClose = FALSE;
 876     }
 877
 878     result->image = image;
 879     const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
 880     /*CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);*/
 881     UTrie *newUCAmapping = (UTrie *)uprv_malloc(sizeof(UTrie));
 882     if(newUCAmapping != NULL) {
 883       utrie_unserialize(newUCAmapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status);
 884     } else {
 885       *status = U_MEMORY_ALLOCATION_ERROR;
 886       if(result->freeOnClose == TRUE) {
 887           uprv_free(result);
 888           result = NULL;
 889       }
 890       return result;
 891     }
 892     if(U_SUCCESS(*status)) {
 893         result->mapping = newUCAmapping;
 894     } else {
 895         if(result->freeOnClose == TRUE) {
 896             uprv_free(result);
 897             result = NULL;
 898         }
 899         uprv_free(newUCAmapping);
 900         return result;
 901     }
 902
 903     /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/
 904     result->latinOneMapping = UTRIE_GET32_LATIN1(result->mapping);
 905     result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs);
 906     result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex);
 907     result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion);
 908
 909     result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options);
 910     result->freeOptionsOnClose = FALSE;
 911
 912     /* set attributes */
 913     result->caseFirst = (UColAttributeValue)result->options->caseFirst;
 914     result->caseLevel = (UColAttributeValue)result->options->caseLevel;
 915     result->frenchCollation = (UColAttributeValue)result->options->frenchCollation;
 916     result->normalizationMode = (UColAttributeValue)result->options->normalizationMode;
 917     result->strength = (UColAttributeValue)result->options->strength;
 918     result->variableTopValue = result->options->variableTopValue;
 919     result->alternateHandling = (UColAttributeValue)result->options->alternateHandling;
 920     result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ;
 921     result->numericCollation = (UColAttributeValue)result->options->numericCollation;
 922
 923     result->caseFirstisDefault = TRUE;
 924     result->caseLevelisDefault = TRUE;
 925     result->frenchCollationisDefault = TRUE;
 926     result->normalizationModeisDefault = TRUE;
 927     result->strengthisDefault = TRUE;
 928     result->variableTopValueisDefault = TRUE;
 929     result->alternateHandlingisDefault = TRUE;
 930     result->hiraganaQisDefault = TRUE;
 931     result->numericCollationisDefault = TRUE;
 932
 933     result->scriptOrder = NULL;
 934
 935     result->rules = NULL;
 936     result->rulesLength = 0;
 937
 938     /* get the version info from UCATableHeader and populate the Collator struct*/
 939     result->dataInfo.dataVersion[0] = result->image->version[0]; /* UCA Builder version*/
 940     result->dataInfo.dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/
 941
 942     result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP;
 943     result->minUnsafeCP = 0;
 944     for (c=0; c<0x300; c++) {  // Find the smallest unsafe char.
 945         if (ucol_unsafeCP(c, result)) break;
 946     }
 947     result->minUnsafeCP = c;
 948
 949     result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
 950     result->minContrEndCP = 0;
 951     for (c=0; c<0x300; c++) {  // Find the Contraction-ending char.
 952         if (ucol_contractionEndCP(c, result)) break;
 953     }
 954     result->minContrEndCP = c;
 955
 956     /* max expansion tables */
 957     result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
 958                                          result->image->endExpansionCE);
 959     result->lastEndExpansionCE = result->endExpansionCE +
 960                                  result->image->endExpansionCECount - 1;
 961     result->expansionCESize = (uint8_t*)result->image +
 962                                                result->image->expansionCESize;
 963
 964
 965     //result->errorCode = *status;
 966
 967     result->latinOneCEs = NULL;
 968
 969     result->latinOneRegenTable = FALSE;
 970     result->latinOneFailed = FALSE;
 971     result->UCA = UCA;
 972
 973     ucol_updateInternalState(result, status);
 974
 975
 976     return result;
 977 }
 978
 979 /* new Mark's code */
 980
 981 /**
 982  * For generation of Implicit CEs
 983  * @author Davis
 984  *
 985  * Cleaned up so that changes can be made more easily.
 986  * Old values:
 987 # First Implicit: E26A792D
 988 # Last Implicit: E3DC70C0
 989 # First CJK: E0030300
 990 # Last CJK: E0A9DD00
 991 # First CJK_A: E0A9DF00
 992 # Last CJK_A: E0DE3100
 993  */
 994 /* Following is a port of Mark's code for new treatment of implicits.
 995  * It is positioned here, since ucol_initUCA needs to initialize the
 996  * variables below according to the data in the fractional UCA.
 997  */
 998
 999 /**
1000     * Function used to:
1001     * a) collapse the 2 different Han ranges from UCA into one (in the right order), and
1002     * b) bump any non-CJK characters by 10FFFF.
1003     * The relevant blocks are:
1004     * A:    4E00..9FFF; CJK Unified Ideographs
1005     *       F900..FAFF; CJK Compatibility Ideographs
1006     * B:    3400..4DBF; CJK Unified Ideographs Extension A
1007     *       20000..XX;  CJK Unified Ideographs Extension B (and others later on)
1008     * As long as
1009     *   no new B characters are allocated between 4E00 and FAFF, and
1010     *   no new A characters are outside of this range,
1011     * (very high probability) this simple code will work.
1012     * The reordered blocks are:
1013     * Block1 is CJK
1014     * Block2 is CJK_COMPAT_USED
1015     * Block3 is CJK_A
1016     * (all contiguous)
1017     * Any other CJK gets its normal code point
1018     * Any non-CJK gets +10FFFF
1019     * When we reorder Block1, we make sure that it is at the very start,
1020     * so that it will use a 3-byte form.
1021     * Warning: we only pick up the compatibility characters that are
1022     * NOT decomposed, so that block is smaller!
1023     */
1024
1025 // CONSTANTS
1026 static const UChar32
1027     NON_CJK_OFFSET = 0x110000,
1028     UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2
1029
/**
 * Parameters of the implicit weight layout.
 * All of them are zero until initImplicitConstants() fills them in.
 */
static int32_t final3Multiplier = 0;
static int32_t final4Multiplier = 0;
static int32_t final3Count = 0;
static int32_t final4Count = 0;
static int32_t medialCount = 0;
static int32_t min3Primary = 0;
static int32_t min4Primary = 0;
static int32_t max4Primary = 0;
static int32_t minTrail = 0;
static int32_t maxTrail = 0;
static int32_t max3Trail = 0;
static int32_t max4Trail = 0;
static int32_t min4Boundary = 0;
1047
1048 static const UChar32
1049     CJK_BASE = 0x4E00,
1050     CJK_LIMIT = 0x9FFF+1,
1051     CJK_COMPAT_USED_BASE = 0xFA0E,
1052     CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
1053     CJK_A_BASE = 0x3400,
1054     CJK_A_LIMIT = 0x4DBF+1,
1055     CJK_B_BASE = 0x20000,
1056     CJK_B_LIMIT = 0x2A6DF+1;
1057
1058 static UChar32 swapCJK(UChar32 i) {
1059
1060     if (i >= CJK_BASE) {
1061         if (i < CJK_LIMIT)              return i - CJK_BASE;
1062
1063         if (i < CJK_COMPAT_USED_BASE)   return i + NON_CJK_OFFSET;
1064
1065         if (i < CJK_COMPAT_USED_LIMIT)  return i - CJK_COMPAT_USED_BASE
1066                                                 + (CJK_LIMIT - CJK_BASE);
1067         if (i < CJK_B_BASE)             return i + NON_CJK_OFFSET;
1068
1069         if (i < CJK_B_LIMIT)            return i; // non-BMP-CJK
1070
1071         return i + NON_CJK_OFFSET;  // non-CJK
1072     }
1073     if (i < CJK_A_BASE)                 return i + NON_CJK_OFFSET;
1074
1075     if (i < CJK_A_LIMIT)                return i - CJK_A_BASE
1076                                                 + (CJK_LIMIT - CJK_BASE)
1077                                                 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1078     return i + NON_CJK_OFFSET; // non-CJK
1079 }
1080
1081 U_CAPI UChar32 U_EXPORT2
1082 uprv_uca_getRawFromCodePoint(UChar32 i) {
1083     return swapCJK(i)+1;
1084 }
1085
1086 U_CAPI UChar32 U_EXPORT2
1087 uprv_uca_getCodePointFromRaw(UChar32 i) {
1088     i--;
1089     UChar32 result = 0;
1090     if(i >= NON_CJK_OFFSET) {
1091         result = i - NON_CJK_OFFSET;
1092     } else if(i >= CJK_B_BASE) {
1093         result = i;
1094     } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted
1095         if(i < CJK_LIMIT - CJK_BASE) {
1096             result = i + CJK_BASE;
1097         } else if(i < (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) {
1098             result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE);
1099         } else {
1100             result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
1101         }
1102     } else {
1103         result = -1;
1104     }
1105     return result;
1106 }
1107
1108 // GET IMPLICIT PRIMARY WEIGHTS
1109 // Return value is left justified primary key
1110 U_CAPI uint32_t U_EXPORT2
1111 uprv_uca_getImplicitFromRaw(UChar32 cp) {
1112     /*
1113     if (cp < 0 || cp > UCOL_MAX_INPUT) {
1114         throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp));
1115     }
1116     */
1117     int32_t last0 = cp - min4Boundary;
1118     if (last0 < 0) {
1119         int32_t last1 = cp / final3Count;
1120         last0 = cp % final3Count;
1121
1122         int32_t last2 = last1 / medialCount;
1123         last1 %= medialCount;
1124
1125         last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start
1126         last1 = minTrail + last1; // offset
1127         last2 = min3Primary + last2; // offset
1128         /*
1129         if (last2 >= min4Primary) {
1130             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2));
1131         }
1132         */
1133         return (last2 << 24) + (last1 << 16) + (last0 << 8);
1134     } else {
1135         int32_t last1 = last0 / final4Count;
1136         last0 %= final4Count;
1137
1138         int32_t last2 = last1 / medialCount;
1139         last1 %= medialCount;
1140
1141         int32_t last3 = last2 / medialCount;
1142         last2 %= medialCount;
1143
1144         last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start
1145         last1 = minTrail + last1; // offset
1146         last2 = minTrail + last2; // offset
1147         last3 = min4Primary + last3; // offset
1148         /*
1149         if (last3 > max4Primary) {
1150             throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3));
1151         }
1152         */
1153         return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0;
1154     }
1155 }
1156
1157 U_CAPI uint32_t U_EXPORT2
1158 uprv_uca_getImplicitPrimary(UChar32 cp) {
1159     //if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
1160
1161     cp = swapCJK(cp);
1162     cp++;
1163     // we now have a range of numbers from 0 to 21FFFF.
1164
1165     //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
1166
1167     return uprv_uca_getImplicitFromRaw(cp);
1168 }
1169
1170 /**
1171  * Converts implicit CE into raw integer ("code point")
1172  * @param implicit
1173  * @return -1 if illegal format
1174  */
1175 U_CAPI UChar32 U_EXPORT2
1176 uprv_uca_getRawFromImplicit(uint32_t implicit) {
1177     UChar32 result;
1178     UChar32 b3 = implicit & 0xFF;
1179     implicit >>= 8;
1180     UChar32 b2 = implicit & 0xFF;
1181     implicit >>= 8;
1182     UChar32 b1 = implicit & 0xFF;
1183     implicit >>= 8;
1184     UChar32 b0 = implicit & 0xFF;
1185
1186     // simple parameter checks
1187     if (b0 < min3Primary || b0 > max4Primary
1188       || b1 < minTrail || b1 > maxTrail) return -1;
1189     // normal offsets
1190     b1 -= minTrail;
1191
1192     // take care of the final values, and compose
1193     if (b0 < min4Primary) {
1194         if (b2 < minTrail || b2 > max3Trail || b3 != 0) return -1;
1195         b2 -= minTrail;
1196         UChar32 remainder = b2 % final3Multiplier;
1197         if (remainder != 0) return -1;
1198         b0 -= min3Primary;
1199         b2 /= final3Multiplier;
1200         result = ((b0 * medialCount) + b1) * final3Count + b2;
1201     } else {
1202          if (b2 < minTrail || b2 > maxTrail
1203         || b3 < minTrail || b3 > max4Trail) return -1;
1204         b2 -= minTrail;
1205         b3 -= minTrail;
1206         UChar32 remainder = b3 % final4Multiplier;
1207         if (remainder != 0) return -1;
1208         b3 /= final4Multiplier;
1209         b0 -= min4Primary;
1210         result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary;
1211     }
1212     // final check
1213     if (result < 0 || result > UCOL_MAX_INPUT) return -1;
1214     return result;
1215 }
1216
1217
// Quotient of a / b rounded up to the next integer, for positive a and b.
static inline int32_t divideAndRoundUp(int a, int b) {
    const int wholeBelow = (a - 1) / b;
    return wholeBelow + 1;
}
1221
1222 /* this function is either called from initUCA or from genUCA before
1223  * doing canonical closure for the UCA.
1224  */
1225
1226 /**
1227  * Set up to generate implicits.
1228  * @param minPrimary
1229  * @param maxPrimary
1230  * @param minTrail final byte
1231  * @param maxTrail final byte
1232  * @param gap3 the gap we leave for tailoring for 3-byte forms
1233  * @param gap4 the gap we leave for tailoring for 4-byte forms
1234  */
1235 static void initImplicitConstants(int minPrimary, int maxPrimary,
1236                                     int minTrailIn, int maxTrailIn,
1237                                     int gap3, int primaries3count,
1238                                     UErrorCode *status) {
1239     // some simple parameter checks
1240     if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) {
1241         *status = U_ILLEGAL_ARGUMENT_ERROR;
1242         return;
1243     };
1244     if (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) {
1245         *status = U_ILLEGAL_ARGUMENT_ERROR;
1246         return;
1247     };
1248     if (primaries3count < 1) {
1249         *status = U_ILLEGAL_ARGUMENT_ERROR;
1250         return;
1251     };
1252
1253     minTrail = minTrailIn;
1254     maxTrail = maxTrailIn;
1255
1256     min3Primary = minPrimary;
1257     max4Primary = maxPrimary;
1258     // compute constants for use later.
1259     // number of values we can use in trailing bytes
1260     // leave room for empty values between AND above, e.g. if gap = 2
1261     // range 3..7 => +3 -4 -5 -6 -7: so 1 value
1262     // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values
1263     // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values
1264     final3Multiplier = gap3 + 1;
1265     final3Count = (maxTrail - minTrail + 1) / final3Multiplier;
1266     max3Trail = minTrail + (final3Count - 1) * final3Multiplier;
1267
1268     // medials can use full range
1269     medialCount = (maxTrail - minTrail + 1);
1270     // find out how many values fit in each form
1271     int32_t threeByteCount = medialCount * final3Count;
1272     // now determine where the 3/4 boundary is.
1273     // we use 3 bytes below the boundary, and 4 above
1274     int32_t primariesAvailable = maxPrimary - minPrimary + 1;
1275     int32_t primaries4count = primariesAvailable - primaries3count;
1276
1277
1278     int32_t min3ByteCoverage = primaries3count * threeByteCount;
1279     min4Primary = minPrimary + primaries3count;
1280     min4Boundary = min3ByteCoverage;
1281     // Now expand out the multiplier for the 4 bytes, and redo.
1282
1283     int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary;
1284     int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count);
1285     //if (DEBUG) System.out.println("neededPerPrimaryByte: " + neededPerPrimaryByte);
1286     int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount);
1287     //if (DEBUG) System.out.println("neededPerFinalByte: " + neededPerFinalByte);
1288     int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte;
1289     //if (DEBUG) System.out.println("expandedGap: " + gap4);
1290     if (gap4 < 1) {
1291         *status = U_ILLEGAL_ARGUMENT_ERROR;
1292         return;
1293     }
1294     final4Multiplier = gap4 + 1;
1295     final4Count = neededPerFinalByte;
1296     max4Trail = minTrail + (final4Count - 1) * final4Multiplier;
1297     /*
1298     if (DEBUG) {
1299         System.out.println("final4Count: " + final4Count);
1300         for (int counter = 0; counter <= final4Count; ++counter) {
1301             int value = minTrail + (1 + counter)*final4Multiplier;
1302             System.out.println(counter + "\t" + value + "\t" + Utility.hex(value));
1303         }
1304     }
1305     */
1306 }
1307
1308     /**
1309      * Supply parameters for generating implicit CEs
1310      */
1311 U_CAPI void U_EXPORT2
1312 uprv_uca_initImplicitConstants(int32_t minPrimary, int32_t maxPrimary, UErrorCode *status) {
1313     // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
1314     initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status);
1315 }
1316
1317 U_CDECL_BEGIN
1318 static UBool U_CALLCONV
1319 ucol_cleanup(void)
1320 {
1321     if (UCA_DATA_MEM) {
1322         udata_close(UCA_DATA_MEM);
1323         UCA_DATA_MEM = NULL;
1324     }
1325     if (_staticUCA) {
1326         ucol_close(_staticUCA);
1327         _staticUCA = NULL;
1328     }
1329     fcdTrieIndex = NULL;
1330     return TRUE;
1331 }
1332 U_CDECL_END
1333
1334 /* do not close UCA returned by ucol_initUCA! */
1335 UCollator *
1336 ucol_initUCA(UErrorCode *status) {
1337     if(U_FAILURE(*status)) {
1338         return NULL;
1339     }
1340     umtx_lock(NULL);
1341     UBool f = (_staticUCA == NULL);
1342     umtx_unlock(NULL);
1343
1344     if(f) {
1345         UCollator *newUCA = NULL;
1346         UDataMemory *result = udata_openChoice(NULL, UCA_DATA_TYPE, UCA_DATA_NAME, isAcceptableUCA, NULL, status);
1347
1348         if(U_FAILURE(*status)) {
1349             if (result) {
1350                 udata_close(result);
1351             }
1352             uprv_free(newUCA);
1353         }
1354
1355         // init FCD data
1356         if (fcdTrieIndex == NULL) {
1357             fcdTrieIndex = unorm_getFCDTrie(status);
1358             ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1359         }
1360
1361         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1362             newUCA = ucol_initCollator((const UCATableHeader *)udata_getMemory(result), newUCA, newUCA, status);
1363             if(U_SUCCESS(*status)){
1364                 newUCA->rb = NULL;
1365                 newUCA->elements = NULL;
1366                 newUCA->validLocale = NULL;
1367                 newUCA->requestedLocale = NULL;
1368                 newUCA->hasRealData = FALSE; // real data lives in .dat file...
1369                 newUCA->freeImageOnClose = FALSE;
1370                 umtx_lock(NULL);
1371                 if(_staticUCA == NULL) {
1372                     _staticUCA = newUCA;
1373                     UCA_DATA_MEM = result;
1374                     result = NULL;
1375                     newUCA = NULL;
1376                 }
1377                 umtx_unlock(NULL);
1378
1379                 if(newUCA != NULL) {
1380                     udata_close(result);
1381                     uprv_free(newUCA);
1382                 }
1383                 else {
1384                     ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
1385                 }
1386                 // Initalize variables for implicit generation
1387                 const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)_staticUCA->image + _staticUCA->image->UCAConsts);
1388                 uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN, UCAconsts->UCA_PRIMARY_IMPLICIT_MAX, status);
1389                 _staticUCA->mapping->getFoldingOffset = _getFoldingOffset;
1390             }else{
1391                 udata_close(result);
1392                 uprv_free(newUCA);
1393                 _staticUCA= NULL;
1394             }
1395         }
1396     }
1397     return _staticUCA;
1398 }
1399
1400
1401 /*    collIterNormalize     Incremental Normalization happens here.                       */
1402 /*                          pick up the range of chars identifed by FCD,                  */
1403 /*                          normalize it into the collIterate's writable buffer,          */
1404 /*                          switch the collIterate's state to use the writable buffer.    */
1405 /*                                                                                        */
1406 static
1407 void collIterNormalize(collIterate *collationSource)
1408 {
1409     UErrorCode  status = U_ZERO_ERROR;
1410
1411     int32_t    normLen;
1412     UChar      *srcP = collationSource->pos - 1;      /*  Start of chars to normalize    */
1413     UChar      *endP = collationSource->fcdPosition;  /* End of region to normalize+1    */
1414
1415     normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1416                               srcP, (int32_t)(endP - srcP),
1417                               FALSE, 0,
1418                               &status);
1419     if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
1420         // reallocate and terminate
1421         if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1422                                    &collationSource->writableBuffer,
1423                                    (int32_t *)&collationSource->writableBufSize, normLen + 1,
1424                                    0)
1425         ) {
1426 #ifdef UCOL_DEBUG
1427             fprintf(stderr, "collIterNormalize(), out of memory\n");
1428 #endif
1429             return;
1430         }
1431         status = U_ZERO_ERROR;
1432         normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize,
1433                                   srcP, (int32_t)(endP - srcP),
1434                                   FALSE, 0,
1435                                   &status);
1436     }
1437     if (U_FAILURE(status)) {
1438 #ifdef UCOL_DEBUG
1439         fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status));
1440 #endif
1441         return;
1442     }
1443
1444   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1445       collationSource->flags |= UCOL_ITER_ALLOCATED;
1446   }
1447   collationSource->pos        = collationSource->writableBuffer;
1448   collationSource->origFlags  = collationSource->flags;
1449   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1450   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1451 }
1452
1453
1454 // This function takes the iterator and extracts normalized stuff up to the next boundary
1455 // It is similar in the end results to the collIterNormalize, but for the cases when we
1456 // use an iterator
1457 static
1458 inline void normalizeIterator(collIterate *collationSource) {
1459   UErrorCode status = U_ZERO_ERROR;
1460   UBool wasNormalized = FALSE;
1461   //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT);
1462   uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator);
1463   int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1464     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1465   if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) {
1466     // reallocate and terminate
1467     if(!u_growBufferFromStatic(collationSource->stackWritableBuffer,
1468                                &collationSource->writableBuffer,
1469                                (int32_t *)&collationSource->writableBufSize, normLen + 1,
1470                                0)
1471     ) {
1472     #ifdef UCOL_DEBUG
1473         fprintf(stderr, "normalizeIterator(), out of memory\n");
1474     #endif
1475         return;
1476     }
1477     status = U_ZERO_ERROR;
1478     //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO);
1479     collationSource->iterator->setState(collationSource->iterator, iterIndex, &status);
1480     normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer,
1481     (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status);
1482   }
1483   // Terminate the buffer - we already checked that it is big enough
1484   collationSource->writableBuffer[normLen] = 0;
1485   if(collationSource->writableBuffer != collationSource->stackWritableBuffer) {
1486       collationSource->flags |= UCOL_ITER_ALLOCATED;
1487   }
1488   collationSource->pos        = collationSource->writableBuffer;
1489   collationSource->origFlags  = collationSource->flags;
1490   collationSource->flags     |= UCOL_ITER_INNORMBUF;
1491   collationSource->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
1492 }
1493
1494
1495 /* Incremental FCD check and normalize                                                    */
1496 /*   Called from getNextCE when normalization state is suspect.                           */
1497 /*   When entering, the state is known to be this:                                        */
1498 /*      o   We are working in the main buffer of the collIterate, not the side            */
1499 /*          writable buffer.  When in the side buffer, normalization mode is always off,  */
1500 /*          so we won't get here.                                                         */
1501 /*      o   The leading combining class from the current character is 0 or                */
1502 /*          the trailing combining class of the previous char was zero.                   */
1503 /*          True because the previous call to this function will have always exited       */
1504 /*          that way, and we get called for every char where cc might be non-zero.        */
1505 static
1506 inline UBool collIterFCD(collIterate *collationSource) {
1507     UChar       c, c2;
1508     const UChar *srcP, *endP;
1509     uint8_t     leadingCC;
1510     uint8_t     prevTrailingCC = 0;
1511     uint16_t    fcd;
1512     UBool       needNormalize = FALSE;
1513
1514     srcP = collationSource->pos-1;
1515
1516     if (collationSource->flags & UCOL_ITER_HASLEN) {
1517         endP = collationSource->endp;
1518     } else {
1519         endP = NULL;
1520     }
1521
1522     // Get the trailing combining class of the current character.  If it's zero,
1523     //   we are OK.
1524     c = *srcP++;
1525     /* trie access */
1526     fcd = unorm_getFCD16(fcdTrieIndex, c);
1527     if (fcd != 0) {
1528         if (UTF_IS_FIRST_SURROGATE(c)) {
1529             if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1530                 ++srcP;
1531                 fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1532             } else {
1533                 fcd = 0;
1534             }
1535         }
1536
1537         prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1538
1539         if (prevTrailingCC != 0) {
1540             // The current char has a non-zero trailing CC.  Scan forward until we find
1541             //   a char with a leading cc of zero.
1542             while (endP == NULL || srcP != endP)
1543             {
1544                 const UChar *savedSrcP = srcP;
1545
1546                 c = *srcP++;
1547                 /* trie access */
1548                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1549                 if (fcd != 0 && UTF_IS_FIRST_SURROGATE(c)) {
1550                     if ((endP == NULL || srcP != endP) && UTF_IS_SECOND_SURROGATE(c2=*srcP)) {
1551                         ++srcP;
1552                         fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c2);
1553                     } else {
1554                         fcd = 0;
1555                     }
1556                 }
1557                 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1558                 if (leadingCC == 0) {
1559                     srcP = savedSrcP;      // Hit char that is not part of combining sequence.
1560                                            //   back up over it.  (Could be surrogate pair!)
1561                     break;
1562                 }
1563
1564                 if (leadingCC < prevTrailingCC) {
1565                     needNormalize = TRUE;
1566                 }
1567
1568                 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1569             }
1570         }
1571     }
1572
1573     collationSource->fcdPosition = (UChar *)srcP;
1574
1575     return needNormalize;
1576 }
1577
1578 /****************************************************************************/
1579 /* Following are the CE retrieval functions                                 */
1580 /*                                                                          */
1581 /****************************************************************************/
1582
1583 /* there should be a macro version of this function in the header file */
1584 /* This is the first function that tries to fetch a collation element  */
1585 /* If it's not successful or it encounters a more difficult situation  */
1586 /* some more sophisticated and slower functions are invoked            */
1587 static
1588 inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1589     uint32_t order = 0;
1590     if (collationSource->CEpos > collationSource->toReturn) {       /* Are there any CEs from previous expansions? */
1591       order = *(collationSource->toReturn++);                         /* if so, return them */
1592       if(collationSource->CEpos == collationSource->toReturn) {
1593         collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
1594       }
1595       return order;
1596     }
1597
1598     UChar ch = 0;
1599
1600     for (;;)                           /* Loop handles case when incremental normalize switches   */
1601     {                                  /*   to or from the side buffer / original string, and we  */
1602                                        /*   need to start again to get the next character.        */
1603
1604         if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0)
1605         {
1606             // The source string is null terminated and we're not working from the side buffer,
1607             //   and we're not normalizing.  This is the fast path.
1608             //   (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.)
1609             ch = *collationSource->pos++;
1610             if (ch != 0) {
1611                 break;
1612             }
1613             else {
1614                 return UCOL_NO_MORE_CES;
1615             }
1616         }
1617
1618         if (collationSource->flags & UCOL_ITER_HASLEN) {
1619             // Normal path for strings when length is specified.
1620             //   (We can't be in side buffer because it is always null terminated.)
1621             if (collationSource->pos >= collationSource->endp) {
1622                 // Ran off of the end of the main source string.  We're done.
1623                 return UCOL_NO_MORE_CES;
1624             }
1625             ch = *collationSource->pos++;
1626         }
1627         else if(collationSource->flags & UCOL_USE_ITERATOR) {
1628             UChar32 iterCh = collationSource->iterator->next(collationSource->iterator);
1629             if(iterCh == U_SENTINEL) {
1630               return UCOL_NO_MORE_CES;
1631             }
1632             ch = (UChar)iterCh;
1633         }
1634         else
1635         {
1636             // Null terminated string.
1637             ch = *collationSource->pos++;
1638             if (ch == 0) {
1639                 // Ran off end of buffer.
1640                 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1641                     // Ran off end of main string. backing up one character.
1642                     collationSource->pos--;
1643                     return UCOL_NO_MORE_CES;
1644                 }
1645                 else
1646                 {
1647                     // Hit null in the normalize side buffer.
1648                     // Usually this means the end of the normalized data,
1649                     // except for one odd case: a null followed by combining chars,
1650                     //   which is the case if we are at the start of the buffer.
1651                     if (collationSource->pos == collationSource->writableBuffer+1) {
1652                         break;
1653                     }
1654
1655                     //  Null marked end of side buffer.
1656                     //   Revert to the main string and
1657                     //   loop back to top to try again to get a character.
1658                     collationSource->pos   = collationSource->fcdPosition;
1659                     collationSource->flags = collationSource->origFlags;
1660                     continue;
1661                 }
1662             }
1663         }
1664
1665         if(collationSource->flags&UCOL_HIRAGANA_Q) {
1666           if((ch>=0x3040 && ch<=0x3094) || ch == 0x309d || ch == 0x309e) {
1667             collationSource->flags |= UCOL_WAS_HIRAGANA;
1668           } else {
1669             collationSource->flags &= ~UCOL_WAS_HIRAGANA;
1670           }
1671         }
1672
1673         // We've got a character.  See if there's any fcd and/or normalization stuff to do.
1674         //    Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer.
1675         if ((collationSource->flags & UCOL_ITER_NORM) == 0) {
1676             break;
1677         }
1678
1679         if (collationSource->fcdPosition >= collationSource->pos) {
1680             // An earlier FCD check has already covered the current character.
1681             // We can go ahead and process this char.
1682             break;
1683         }
1684
1685         if (ch < ZERO_CC_LIMIT_ ) {
1686             // Fast fcd safe path.  Trailing combining class == 0.  This char is OK.
1687             break;
1688         }
1689
1690         if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
1691             // We need to peek at the next character in order to tell if we are FCD
1692             if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) {
1693                 // We are at the last char of source string.
1694                 //  It is always OK for FCD check.
1695                 break;
1696             }
1697
1698             // Not at last char of source string (or we'll check against terminating null).  Do the FCD fast test
1699             if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) {
1700                 break;
1701             }
1702         }
1703
1704
1705         // Need a more complete FCD check and possible normalization.
1706         if (collIterFCD(collationSource)) {
1707             collIterNormalize(collationSource);
1708         }
1709         if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) {
1710             //  No normalization was needed.  Go ahead and process the char we already had.
1711             break;
1712         }
1713
1714         // Some normalization happened.  Next loop iteration will pick up a char
1715         //   from the normalization buffer.
1716
1717     }   // end for (;;)
1718
1719
1720       if (ch <= 0xFF) {
1721           /*  For latin-1 characters we never need to fall back to the UCA table        */
1722           /*    because all of the UCA data is replicated in the latinOneMapping array  */
1723           order = coll->latinOneMapping[ch];
1724           if (order > UCOL_NOT_FOUND) {
1725               order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);
1726           }
1727       }
1728       else
1729       {
1730           order = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
1731           if(order > UCOL_NOT_FOUND) {                                       /* if a CE is special                */
1732               order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status);    /* and try to get the special CE     */
1733           }
1734           if(order == UCOL_NOT_FOUND && coll->UCA) {   /* We couldn't find a good CE in the tailoring */
1735             /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
1736             order = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
1737
1738             if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
1739               order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status);
1740             }
1741           }
1742       }
1743     return order; /* return the CE */
1744 }
1745
1746 /* ucol_getNextCE, out-of-line version for use from other files.   */
     /* Forwards its arguments unchanged to the inline ucol_IGetNextCE  */
     /* and returns that function's result.                             */
1747 U_CAPI uint32_t  U_EXPORT2
1748 ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) {
1749     return ucol_IGetNextCE(coll, collationSource, status);
1750 }
1751
1752
1753 /**
1754 * Incremental previous normalization happens here. Pick up the range of chars
1755 * identified by FCD, normalize it (NFD) into the collIterate's writable buffer,
1756 * and switch the collIterate's state to use the writable buffer.
     *
     * The range runs from data->fcdPosition + 1 (or data->string when fcdPosition
     * is NULL) through data->pos inclusive. The normalized text is placed at the
     * END of the writable buffer with a 0 stored just in front of it, and
     * data->pos is set to the end of the buffer, so that backwards iteration
     * reads the normalized text first and then meets the 0.
     *
     * The flags in effect before the switch are saved in data->origFlags.
     * If a larger buffer is needed and cannot be allocated, the function returns
     * without modifying the iterator (the current buffer is kept).
1757 * @param data collation iterator data
1758 */
1759 static
1760 void collPrevIterNormalize(collIterate *data)
1761 {
1762     UErrorCode status  = U_ZERO_ERROR;
1763     UChar      *pEnd   = data->pos;         /* last char to normalize (inclusive) */
1764     UChar      *pStart;
1765     uint32_t    normLen;
1766     UChar      *pStartNorm;
1767
1768     /* Start normalize */
1769     if (data->fcdPosition == NULL) {
1770         pStart = data->string;
1771     }
1772     else {
1773         pStart = data->fcdPosition + 1;
1774     }
1775
         /* preflight with capacity 0: only the normalized length is wanted here */
1776     normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0,
1777                               data->writableBuffer, 0, &status);
1778
1779     if (data->writableBufSize <= normLen) {
1780         /* allocate before releasing the old buffer, so that a failed
                allocation leaves writableBuffer/writableBufSize consistent */
1781         UChar *newBuffer = (UChar *)uprv_malloc((normLen + 1) *
1782                                                 sizeof(UChar));
1783         if(newBuffer == NULL) { // something is wrong here, return
1784             return;
1785         }
             freeHeapWritableBuffer(data);
             data->writableBuffer = newBuffer;
1786         data->flags |= UCOL_ITER_ALLOCATED;
1787         /* to handle the zero termination */
1788         data->writableBufSize = normLen + 1;
1789     }
1790     /* the preflight call above is expected to have set an error; clear it */
         status = U_ZERO_ERROR;
1791     /*
1792     this puts the null termination infront of the normalized string instead
1793     of the end
1794     */
1795     pStartNorm = data->writableBuffer + (data->writableBufSize - normLen);
1796     *(pStartNorm - 1) = 0;
1797     unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm,
1798                     normLen, &status);
1799
         /* backwards iteration starts one past the last normalized char */
1800     data->pos        = data->writableBuffer + data->writableBufSize;
1801     data->origFlags  = data->flags;
1802     data->flags     |= UCOL_ITER_INNORMBUF;
1803     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
1804 }
1805
1806
1807 /**
1808 * Incremental FCD check for previous iteration and normalize. Called from
1809 * getPrevCE when normalization state is suspect.
1810 * When entering, the state is known to be this:
1811 * o  We are working in the main buffer of the collIterate, not the side
1812 *    writable buffer. When in the side buffer, normalization mode is always
1813 *    off, so we won't get here.
1814 * o  The leading combining class from the current character is 0 or the
1815 *    trailing combining class of the previous char was zero.
1816 *    True because the previous call to this function will have always exited
1817 *    that way, and we get called for every char where cc might be non-zero.
     *
     * The check starts at the char at data->pos and, if that char has a non-zero
     * leading combining class, scans backwards until a char with trailing
     * combining class 0 is found or the start of the string is reached.
     * On return data->fcdPosition points at the char where the scan stopped
     * (the lead surrogate for a surrogate pair), or is NULL if the scan reached
     * data->string.
1818 * @param data collation iterate struct
1819 * @return normalization status, TRUE for normalization to be done, FALSE
1820 *         otherwise
1821 */
1822 static
1823 inline UBool collPrevIterFCD(collIterate *data)
1824 {
1825     const UChar *src, *start;
1826     UChar       c, c2;
1827     uint8_t     leadingCC;
1828     uint8_t     trailingCC = 0;
         /* fcd value: leading cc in the upper byte, trailing cc in the low byte (see the shift/mask below) */
1829     uint16_t    fcd;
1830     UBool       result = FALSE;
1831
1832     start = data->string;
1833     src = data->pos + 1;
1834
1835     /* Get the trailing combining class of the current character. */
1836     c = *--src;
1837     if (!UTF_IS_SURROGATE(c)) {
1838         fcd = unorm_getFCD16(fcdTrieIndex, c);
1839     } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
             /* well-formed pair: step back onto the lead surrogate */
1840         --src;
1841         fcd = unorm_getFCD16(fcdTrieIndex, c2);
1842         if (fcd != 0) {
1843             fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1844         }
1845     } else /* unpaired surrogate */ {
1846         fcd = 0;
1847     }
1848
1849     leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1850
1851     if (leadingCC != 0) {
1852         /*
1853         The current char has a non-zero leading combining class.
1854         Scan backward until we find a char with a trailing cc of zero.
1855         */
1856         for (;;)
1857         {
1858             if (start == src) {
                     /* reached the start of the string while still inside the combining sequence */
1859                 data->fcdPosition = NULL;
1860                 return result;
1861             }
1862
1863             c = *--src;
1864             if (!UTF_IS_SURROGATE(c)) {
1865                 fcd = unorm_getFCD16(fcdTrieIndex, c);
1866             } else if (UTF_IS_SECOND_SURROGATE(c) && start < src && UTF_IS_FIRST_SURROGATE(c2 = *(src - 1))) {
1867                 --src;
1868                 fcd = unorm_getFCD16(fcdTrieIndex, c2);
1869                 if (fcd != 0) {
1870                     fcd = unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c);
1871                 }
1872             } else /* unpaired surrogate */ {
1873                 fcd = 0;
1874             }
1875
1876             trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_);
1877
1878             if (trailingCC == 0) {
1879                 break;
1880             }
1881
                 /* this char's trailing cc exceeds the leading cc of the char after it: normalization needed */
1882             if (leadingCC < trailingCC) {
1883                 result = TRUE;
1884             }
1885
1886             leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_);
1887         }
1888     }
1889
1890     data->fcdPosition = (UChar *)src;
1891
1892     return result;
1893 }
1894
1895 /** gets a character from the string at a given offset
1896  *  Handles both normal and iterative cases.
1897  *  No error checking - caller beware!
      *
      *  When source->pos is non-NULL the char is read directly from
      *  source->pos + offset. Otherwise, when an iterator is present, the
      *  iterator is moved by offset, one char is read with next(), and the
      *  iterator is moved back by offset + 1; an offset of 0 uses current().
      *  With neither a position nor an iterator, U_SENTINEL cast to UChar is
      *  returned.
      *  @param source collation iterator data
      *  @param offset distance, relative to the current position, of the char to read
      *  @return the char at that offset
1898  */
1899 inline static
1900 UChar peekCharacter(collIterate *source, int32_t offset) {
1901   if(source->pos != NULL) {
1902     return *(source->pos + offset);
1903   } else if(source->iterator != NULL) {
1904     if(offset != 0) {
1905       source->iterator->move(source->iterator, offset, UITER_CURRENT);
1906       UChar toReturn = (UChar)source->iterator->next(source->iterator);
           // undo both the move above and the step taken by next()
1907       source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
1908       return toReturn;
1909     } else {
1910       return (UChar)source->iterator->current(source->iterator);
1911     }
1912   } else {
1913     return (UChar)U_SENTINEL;
1914   }
1915 }
1916
1917 /**
1918 * Determines if we are at the start of the data string in the backwards
1919 * collation iterator
     *
     * Three situations count as "at the start":
     * o  iterating through a character iterator (pos is NULL) that reports no
     *    previous character;
     * o  pos is at the first char of the string;
     * o  pos is in the normalization side buffer, directly after the 0 that
     *    marks the buffer start, and fcdPosition is NULL.
1920 * @param data collation iterator
1921 * @return TRUE if we are at the start
1922 */
1923 static
1924 inline UBool isAtStartPrevIterate(collIterate *data) {
1925   if(data->pos == NULL && data->iterator != NULL) {
1926     return !data->iterator->hasPrevious(data->iterator);
1927   }
1928   //return (collIter_bos(data)) ||
1929   return (data->pos == data->string) ||
1930             ((data->flags & UCOL_ITER_INNORMBUF) &&
1931             *(data->pos - 1) == 0 && data->fcdPosition == NULL);
1932 }
1933
1934 static
1935 inline void goBackOne(collIterate *data) {
       /*
        * Moves the iteration state back by one char: steps the character
        * iterator back when the iterator is in use (UCOL_USE_ITERATOR), and
        * decrements pos when a string position is present.
        * The "#if 0" section is an earlier, disabled variant that stepped the
        * iterator back whenever one was attached.
        */
1936 # if 0
1937   // somehow, it looks like we need to keep iterator synced up
1938   // at all times, as above.
1939   if(data->pos) {
1940     data->pos--;
1941   }
1942   if(data->iterator) {
1943     data->iterator->previous(data->iterator);
1944   }
1945 #endif
1946   if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) {
1947     data->iterator->previous(data->iterator);
1948   }
1949   if(data->pos) {
1950     data->pos --;
1951   }
1952 }
1953
1954 /**
1955 * Inline function that gets the previous CE (backwards iteration).
1956 * It first checks the expansion buffer: if the return pointer is past the
1957 * start of the CE buffer, the collation element just before the return
1958 * pointer is returned and the return pointer is decremented.
1959 * Otherwise the previous char is fetched (from the string, the character
1960 * iterator or the normalization side buffer), an incremental FCD check and
     * normalization are done when UCOL_ITER_NORM is set, and the char is mapped
     * to a CE. Contraction-ending chars, the Thai pre-vowel case and special
     * CEs are resolved through ucol_prv_getSpecialPrevCE.
1961 * @param coll collator data
1962 * @param data collation iterator struct
1963 * @param status error status
     * @return the previous CE, or UCOL_NO_MORE_CES when the start of the text
     *         has been reached
1964 */
1965 static
1966 inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
1967                                UErrorCode *status)
1968 {
1969     uint32_t result = (uint32_t)UCOL_NULLORDER;
1970     if (data->toReturn > data->CEs) {
1971         data->toReturn --;
1972         result = *(data->toReturn);
             /* buffer drained: bring the end pointer back to the start as well */
1973         if (data->CEs == data->toReturn) {
1974             data->CEpos = data->toReturn;
1975         }
1976     }
1977     else {
1978         UChar ch = 0;
1979         /*
1980         Loop handles case when incremental normalize switches to or from the
1981         side buffer / original string, and we need to start again to get the
1982         next character.
1983         */
1984         for (;;) {
1985             if (data->flags & UCOL_ITER_HASLEN) {
1986                 /*
1987                 Normal path for strings when length is specified.
1988                 Not in side buffer because it is always null terminated.
1989                 */
1990                 if (data->pos <= data->string) {
1991                     /* End of the main source string */
1992                     return UCOL_NO_MORE_CES;
1993                 }
1994                 data->pos --;
1995                 ch = *data->pos;
1996             }
1997             // we are using an iterator to go back. Pray for us!
1998             else if (data->flags & UCOL_USE_ITERATOR) {
1999               UChar32 iterCh = data->iterator->previous(data->iterator);
2000               if(iterCh == U_SENTINEL) {
2001                 return UCOL_NO_MORE_CES;
2002               } else {
2003                 ch = (UChar)iterCh;
2004               }
2005             }
2006             else {
2007                 data->pos --;
2008                 ch = *data->pos;
2009                 /* we are in the side buffer. */
2010                 if (ch == 0) {
2011                     /*
2012                     At the start of the normalize side buffer.
2013                     Go back to string.
2014                     Because pointer points to the last accessed character,
2015                     hence we have to increment it by one here.
2016                     */
2017                     if (data->fcdPosition == NULL) {
2018                         data->pos = data->string;
2019                         return UCOL_NO_MORE_CES;
2020                     }
2021                     else {
2022                         data->pos   = data->fcdPosition + 1;
2023                     }
2024                     data->flags = data->origFlags;
2025                     continue;
2026                 }
2027             }
2028
                 // NOTE(review): ucol_IGetNextCE tests a narrower set here
                 // (0x3040..0x3094, 0x309d, 0x309e) -- confirm which range is intended.
2029             if(data->flags&UCOL_HIRAGANA_Q) {
2030               if(ch>=0x3040 && ch<=0x309f) {
2031                 data->flags |= UCOL_WAS_HIRAGANA;
2032               } else {
2033                 data->flags &= ~UCOL_WAS_HIRAGANA;
2034               }
2035             }
2036
2037             /*
2038             * got a character to determine if there's fcd and/or normalization
2039             * stuff to do.
2040             * if the current character is not fcd.
2041             * if current character is at the start of the string
2042             * Trailing combining class == 0.
2043             * Note if pos is in the writablebuffer, norm is always 0
2044             */
2045             if (ch < ZERO_CC_LIMIT_ ||
2046               // this should propel us out of the loop in the iterator case
2047                 (data->flags & UCOL_ITER_NORM) == 0 ||
2048                 (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
2049                 || data->string == data->pos) {
2050                 break;
2051             }
2052
2053             if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
2054                 /* if next character is FCD */
2055                 if (data->pos == data->string) {
2056                     /* First char of string is always OK for FCD check */
2057                     break;
2058                 }
2059
2060                 /* Not first char of string, do the FCD fast test */
2061                 if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
2062                     break;
2063                 }
2064             }
2065
2066             /* Need a more complete FCD check and possible normalization. */
2067             if (collPrevIterFCD(data)) {
2068                 collPrevIterNormalize(data);
2069             }
2070
2071             if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
2072                 /*  No normalization. Go ahead and process the char. */
2073                 break;
2074             }
2075
2076             /*
2077             Some normalization happened.
2078             Next loop picks up a char from the normalization buffer.
2079             */
2080         }
2081
2082         /* attempt to handle contractions, after removal of the backwards
2083         contraction
2084         */
2085         if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
2086             result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
2087         }
2088         else {
2089               // TODO: fix me for THAI - I reference *(data->pos-1)
2090                 if ((data->flags & UCOL_ITER_INNORMBUF) == 0 &&
2091                     /*UCOL_ISTHAIBASECONSONANT(ch) &&*/   // This is from the old specs - we now rearrange unconditionally
2092                     // makes sure that we're not at the beginning of the string
2093                     //data->pos > data->string &&
2094                     !collIter_bos(data) &&
2095                     UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1)))
2096                     //UCOL_ISTHAIPREVOWEL(*(data->pos -1)))
2097                 {
                         // The state is saved here and restored on both paths below,
                         // so the look-behind does not move the iterator.
2098                     collIterateState entryState;
2099                     backupState(data, &entryState);
2100                     // we have to check if the previous character is also Thai
2101                     // if not, we can just set the result
2102                     goBackOne(data);
2103                     if(collIter_bos(data) || !UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
2104                       loadState(data, &entryState, FALSE);
2105                       result = UCOL_THAI;
2106                     } else { // previous is also reordered
2107                       // we need to go back as long as they are being reordered
2108                       // count over the range of reorderable characters and see
2109                       // if there is an even or odd number of them
2110                       // if even, we should not reorder. If odd we should reorder.
2111                       int32_t noReordered = 1; // the one we already detected
2112                       while(!collIter_bos(data) && UCOL_ISTHAIPREVOWEL(peekCharacter(data, -1))) {
2113                         noReordered++;
2114                         goBackOne(data);
2115                       }
2116                       if(noReordered & 1) { // odd number of reorderables
2117                         result = UCOL_THAI;
2118                       } else {
2119                         result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
2120                       }
2121                       loadState(data, &entryState, FALSE);
2122                     }
2123                 }
2124             else if (ch <= 0xFF) {
2125               result = coll->latinOneMapping[ch];
2126               //if (result > UCOL_NOT_FOUND) {
2127                 //result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2128               //}
2129             }
2130                 else {
2131                     /*result = ucmpe32_get(coll->mapping, ch);*/
2132                     result = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
2133                 }
                         /* applies to all three branches above: resolve special CEs */
2134                     if (result > UCOL_NOT_FOUND) {
2135                         result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
2136                     }
2137                 if (result == UCOL_NOT_FOUND) {
                       // NOTE(review): this test uses data->coll while the contraction test
                       // above uses coll -- presumably the same collator; confirm.
2138                   if (!isAtStartPrevIterate(data) &&
2139                       ucol_contractionEndCP(ch, data->coll)) {
2140                       result = UCOL_CONTRACTION;
2141                   }
2142                   else {
2143                         /*result = ucmpe32_get(UCA->mapping, ch);*/
2144                       if(coll->UCA) {
2145                         result = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
2146                       }
2147                   }
2148
2149                   if (result > UCOL_NOT_FOUND && coll->UCA) {
2150                     result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
2151                   }
2152                 }
2153             }
2154         }
2155     return result;
2156 }
2157
2158
2159 /*   ucol_getPrevCE, out-of-line version for use from other files.  */
     /*   Forwards its arguments unchanged to the inline ucol_IGetPrevCE */
     /*   and returns that function's result.                            */
2160 U_CAPI uint32_t  U_EXPORT2
2161 ucol_getPrevCE(const UCollator *coll, collIterate *data,
2162                         UErrorCode *status) {
2163     return ucol_IGetPrevCE(coll, data, status);
2164 }
2165
2166
2167 /* this should be connected to special Jamo handling */
     /*
      * Returns the first collation element produced for the single char u.
      * A temporary collIterate is set up over &u with length 1 and exactly one
      * CE is fetched from it with ucol_IGetNextCE; any further CEs that the
      * char may produce are not retrieved.
      * coll    collator to use
      * u       the char to look up
      * status  handed on to ucol_IGetNextCE
      */
2168 U_CAPI uint32_t  U_EXPORT2
2169 ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) {
2170   collIterate colIt;
2171   uint32_t order;
2172   IInit_collIterate(coll, &u, 1, &colIt);
2173   order = ucol_IGetNextCE(coll, &colIt, status);
2174   /*UCOL_GETNEXTCE(order, coll, colIt, status);*/
2175   return order;
2176 }
2177
2178 /**
2179 * Inserts the argument character into the end of the buffer pushing back the
2180 * null terminator.
     * If the buffer has no room for the character plus the terminator, the
     * contents are moved to a larger heap buffer first (data->writableBuffer and
     * data->writableBufSize are updated) and the character is stored at the same
     * index in the new buffer.
2181 * @param data collIterate struct data
2182 * @param pNull pointer to the null termination; must point into
     *              data->writableBuffer
2183 * @param ch character to be appended
2184 * @return the position of the new addition (the terminator now follows it),
     *         or NULL if the buffer had to grow and the allocation failed
2185 */
2186 static
2187 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch)
2188 {
2189           uint32_t  size    = data->writableBufSize;
2190           UChar    *newbuffer;
2191     const uint32_t  incsize = 5;
               uint32_t  offset;
2192
2193     if ((data->writableBuffer + size) > (pNull + 1)) {
2194         *pNull = ch;
2195         *(pNull + 1) = 0;
2196         return pNull;
2197     }
2198
2199     /*
2200     Not enough room for ch plus the terminator: move to a larger buffer.
2201     giving extra space since it is likely that more characters will be added.
2202     */
         /* index of the terminator, taken before the old buffer is released */
         offset = (uint32_t)(pNull - data->writableBuffer);
2203     size += incsize;
2204     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2205     if(newbuffer != NULL) { // on failure NULL is returned; there is no status to set
2206       uprv_memcpy(newbuffer, data->writableBuffer,
2207                   data->writableBufSize * sizeof(UChar));
2208
2209       freeHeapWritableBuffer(data);
2210       data->writableBufSize = size;
2211       data->writableBuffer  = newbuffer;
2212
           /* write at the old terminator's index; newbuffer + size would be past the end */
2213       newbuffer        = newbuffer + offset;
2214       *newbuffer       = ch;
2215       *(newbuffer + 1) = 0;
2216     }
2217     return newbuffer;
2218 }
2219
2220 /**
2221 * Inserts the argument string into the end of the buffer pushing back the
2222 * null terminator.
     * If the buffer has no room for the string plus the terminator, the contents
     * in front of pNull are moved to a new heap buffer of exactly the required
     * size (data->writableBuffer and data->writableBufSize are updated) and the
     * string is appended there.
2223 * @param data collIterate struct data
2224 * @param pNull pointer to the null termination; must point into
     *              data->writableBuffer
2225 * @param str string to be appended
2226 * @param length length of the string to be appended
2227 * @return the position of the new addition, or NULL if the buffer had to grow
     *         and the allocation failed
2228 */
2229 static
2230 inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str,
2231                                int32_t length)
2232 {
         /* number of chars in the buffer in front of the terminator */
2233     uint32_t  size = pNull - data->writableBuffer;
2234     UChar    *newbuffer;
2235
2236     if (data->writableBuffer + data->writableBufSize > pNull + length + 1) {
2237         uprv_memcpy(pNull, str, length * sizeof(UChar));
2238         *(pNull + length) = 0;
2239         return pNull;
2240     }
2241
2242     /*
2243     buffer will always be null terminated at the end.
2244     The new buffer holds the old text, the appended string and the terminator.
2245     */
2246     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1));
2247     if(newbuffer != NULL) {
2248       uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar));
2249       uprv_memcpy(newbuffer + size, str, length * sizeof(UChar));
           /* the last slot of the new buffer is the terminator; it must be written explicitly */
           newbuffer[size + length] = 0;
2250
2251       freeHeapWritableBuffer(data);
2252       data->writableBufSize = size + length + 1;
2253       data->writableBuffer  = newbuffer;
           /* return the insertion point, as the in-place path does */
           newbuffer += size;
2254     }
2255
2256     return newbuffer;
2257 }
2258
2259 /**
2260 * Special normalization function for contraction in the forwards iterator.
2261 * The text from data->pos - 1 up to (not including) data->fcdPosition is
2262 * normalized to NFD and appended to the writable buffer: after the text the
2263 * buffer already holds when the iterator is already in the buffer, or after
2264 * a single seed char when the iterator is still in the source string.
2265 * data->pos is then pointed at the appended text and the flags are switched
     * to side-buffer mode; the flags in effect before are saved in
     * data->origFlags.
     * If the buffer has to grow and the allocation fails, the function returns
     * without switching to the side buffer.
2266 * @param data collation iterator data
2267 */
2268 static
2269 inline void normalizeNextContraction(collIterate *data)
2270 {
2271     UChar      *buffer     = data->writableBuffer;
2272     uint32_t    buffersize = data->writableBufSize;
2273     uint32_t    strsize;
2274     UErrorCode  status     = U_ZERO_ERROR;
2275     /* because the pointer points to the next character */
2276     UChar      *pStart     = data->pos - 1;
2277     UChar      *pEnd;
2278     uint32_t    normLen;
2279     UChar      *pStartNorm;
2280
2281     if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
             /* not in the side buffer yet: seed it with the char in front of pStart */
2282         *data->writableBuffer = *(pStart - 1);
2283         strsize               = 1;
2284     }
2285     else {
             /* already in the side buffer: keep what it holds */
2286         strsize = u_strlen(data->writableBuffer);
2287     }
2288
2289     pEnd = data->fcdPosition;
2290
         /* preflight with capacity 0: only the normalized length is wanted here */
2291     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2292                               &status);
2293
2294     if (buffersize <= normLen + strsize) {
2295         uint32_t  size = strsize + normLen + 1;
2296         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2297         if(temp == NULL) {
                 /* the current buffer cannot hold the result; writing into it would overflow */
                 return;
             }
2298         uprv_memcpy(temp, buffer, sizeof(UChar) * strsize);
2299         freeHeapWritableBuffer(data);
2300         data->writableBuffer = temp;
2301         data->writableBufSize = size;
2302         data->flags |= UCOL_ITER_ALLOCATED;
2303         /* the old buffer is no longer valid for writing: continue with the new one */
             buffer = temp;
2304     }
2305
2306     status            = U_ZERO_ERROR;
2307     pStartNorm        = buffer + strsize;
2308     /* null-termination will be added here */
2309     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm,
2310                     normLen + 1, &status);
2311
2312     data->pos        = data->writableBuffer + strsize;
2313     data->origFlags  = data->flags;
2314     data->flags     |= UCOL_ITER_INNORMBUF;
2315     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2316 }
2317
2318 /**
2319 * Contraction character management function that returns the next character
2320 * for the forwards iterator.
2321 * Does nothing if the next character is in buffer and not the first character
2322 * in it.
2323 * Else it checks next character in data string to see if it is normalizable.
2324 * If it is not, the character is simply copied into the buffer, else
2325 * the whole normalized substring is copied into the buffer, including the
2326 * current character.
2327 * @param data collation element iterator data
2328 * @return next character
2329 */
2330 static
2331 inline UChar getNextNormalizedChar(collIterate *data)
2332 {
2333     UChar  nextch;
2334     UChar  ch;
2335     // Here we need to add the iterator code. One problem is the way
2336     // end of string is handled. If we just return next char, it could
2337     // be the sentinel. Most of the cases already check for this, but we
2338     // need to be sure.
2339     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) {
2340          /* if no normalization and not in buffer. */
2341       if(data->flags & UCOL_USE_ITERATOR) {
2342          return (UChar)data->iterator->next(data->iterator);
2343       } else {
2344          return *(data->pos ++);
2345       }
2346     }
2347
2348     //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) {
2349       //normalizeIterator(data);
2350     //}
2351
2352     UChar  *pEndWritableBuffer = NULL;
2353     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2354     if ((innormbuf && *data->pos != 0) ||
2355         (data->fcdPosition != NULL && !innormbuf &&
2356         data->pos < data->fcdPosition)) {
2357         /*
2358         if next character is in normalized buffer, no further normalization
2359         is required
2360         */
2361         return *(data->pos ++);
2362     }
2363
2364     if (data->flags & UCOL_ITER_HASLEN) {
2365         /* in data string */
2366         if (data->pos + 1 == data->endp) {
2367             return *(data->pos ++);
2368         }
2369     }
2370     else {
2371         if (innormbuf) {
2372           // inside the normalization buffer, but at the end
2373           // (since we encountered zero). This means, in the
2374           // case we're using char iterator, that we need to
2375           // do another round of normalization.
2376           //if(data->origFlags & UCOL_USE_ITERATOR) {
2377             // we need to restore original flags,
2378             // otherwise, we'll lose them
2379             //data->flags = data->origFlags;
2380             //normalizeIterator(data);
2381             //return *(data->pos++);
2382           //} else {
2383             /*
2384             in writable buffer, at this point fcdPosition can not be
2385             pointing to the end of the data string. see contracting tag.
2386             */
2387           if(data->fcdPosition) {
2388             if (*(data->fcdPosition + 1) == 0 ||
2389                 data->fcdPosition + 1 == data->endp) {
2390                 /* at the end of the string, dump it into the normalizer */
2391                 data->pos = insertBufferEnd(data, data->pos,
2392                                             *(data->fcdPosition)) + 1;
2393                 return *(data->fcdPosition ++);
2394             }
2395             pEndWritableBuffer = data->pos;
2396             data->pos = data->fcdPosition;
2397           } else if(data->origFlags & UCOL_USE_ITERATOR) {
2398             // if we are here, we're using a normalizing iterator.
2399             // we should just continue further.
2400             data->flags = data->origFlags;
2401             data->pos = NULL;
2402             return (UChar)data->iterator->next(data->iterator);
2403           }
2404           //}
2405         }
2406         else {
2407             if (*(data->pos + 1) == 0) {
2408                 return *(data->pos ++);
2409             }
2410         }
2411     }
2412
2413     ch = *data->pos ++;
2414     nextch = *data->pos;
2415
2416     /*
2417     * if the current character is not fcd.
2418     * Trailing combining class == 0.
2419     */
2420     if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) &&
2421         (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ ||
2422          ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) {
2423             /*
2424             Need a more complete FCD check and possible normalization.
2425             normalize substring will be appended to buffer
2426             */
2427         if (collIterFCD(data)) {
2428             normalizeNextContraction(data);
2429             return *(data->pos ++);
2430         }
2431         else if (innormbuf) {
2432             /* fcdposition shifted even when there's no normalization, if we
2433             don't input the rest into this, we'll get the wrong position when
2434             we reach the end of the writableBuffer */
2435             int32_t length = data->fcdPosition - data->pos + 1;
2436             data->pos = insertBufferEnd(data, pEndWritableBuffer,
2437                                         data->pos - 1, length);
2438             return *(data->pos ++);
2439         }
2440     }
2441
2442     if (innormbuf) {
2443         /*
2444         no normalization is to be done hence only one character will be
2445         appended to the buffer.
2446         */
2447         data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1;
2448     }
2449
2450     /* points back to the pos in string */
2451     return ch;
2452 }
2453
2454
2455
2456 /**
2457 * Function to copy the buffer into writableBuffer and sets the fcd position to
2458 * the correct position
2459 * @param source data string source
2460 * @param buffer character buffer
2461 * @param tempdb current position in buffer that has been used up
2462 */
2463 static
2464 inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer,
2465                                      UChar *tempdb)
2466 {
2467     /* okay confusing part here. to ensure that the skipped characters are
2468     considered later, we need to place it in the appropriate position in the
2469     normalization buffer and reassign the pos pointer. simple case if pos
2470     reside in string, simply copy to normalization buffer and
2471     fcdposition = pos, pos = start of normalization buffer. if pos in
2472     normalization buffer, we'll insert the copy infront of pos and point pos
2473     to the start of the normalization buffer. why am i doing these copies?
2474     well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does
2475     not require any changes, which be really painful. */
2476     uint32_t length = u_strlen(buffer);;
2477     if (source->flags & UCOL_ITER_INNORMBUF) {
2478         u_strcpy(tempdb, source->pos);
2479     }
2480     else {
2481         source->fcdPosition  = source->pos;
2482         source->origFlags    = source->flags;
2483         source->flags       |= UCOL_ITER_INNORMBUF;
2484         source->flags       &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2485     }
2486
2487     if (length >= source->writableBufSize) {
2488         freeHeapWritableBuffer(source);
2489         source->writableBuffer =
2490                      (UChar *)uprv_malloc((length + 1) * sizeof(UChar));
2491         if(source->writableBuffer == NULL) {
2492           return;
2493         }
2494         source->writableBufSize = length;
2495     }
2496
2497     u_strcpy(source->writableBuffer, buffer);
2498     source->pos = source->writableBuffer;
2499 }
2500
2501 /**
2502 * Function to get the discontiguos collation element within the source.
2503 * Note this function will set the position to the appropriate places.
2504 * @param coll current collator used
2505 * @param source data string source
2506 * @param constart index to the start character in the contraction table
2507 * @return discontiguos collation element offset
2508 */
2509 static
2510 uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
2511                                 const UChar *constart)
2512 {
2513     /* source->pos currently points to the second combining character after
2514        the start character */
2515           UChar   *temppos      = source->pos;
2516           UChar    buffer[4*UCOL_MAX_BUFFER];
2517           UChar   *tempdb       = buffer;
2518     const UChar   *tempconstart = constart;
2519           uint8_t  tempflags    = source->flags;
2520           UBool    multicontraction = FALSE;
2521           UChar   *tempbufferpos = 0;
2522           collIterateState discState;
2523
2524           backupState(source, &discState);
2525
2526     //*tempdb = *(source->pos - 1);
2527           *tempdb = peekCharacter(source, -1);
2528     tempdb ++;
2529     while (TRUE) {
2530         UChar    *UCharOffset;
2531         UChar     schar,
2532                   tchar;
2533         uint32_t  result;
2534
2535         if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
2536             || (peekCharacter(source, 0) == 0  &&
2537             //|| (*source->pos == 0  &&
2538                 ((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
2539                  source->fcdPosition == NULL ||
2540                  source->fcdPosition == source->endp ||
2541                  *(source->fcdPosition) == 0 ||
2542                  u_getCombiningClass(*(source->fcdPosition)) == 0)) ||
2543                  /* end of string in null terminated string or stopped by a
2544                  null character, note fcd does not always point to a base
2545                  character after the discontiguos change */
2546                  u_getCombiningClass(peekCharacter(source, 0)) == 0) {
2547                  //u_getCombiningClass(*(source->pos)) == 0) {
2548             //constart = (UChar *)coll->image + getContractOffset(CE);
2549             if (multicontraction) {
2550                 *tempbufferpos = 0;
2551                 source->pos    = temppos - 1;
2552                 setDiscontiguosAttribute(source, buffer, tempdb);
2553                 return *(coll->contractionCEs +
2554                                     (tempconstart - coll->contractionIndex));
2555             }
2556             constart = tempconstart;
2557             break;
2558         }
2559
2560         UCharOffset = (UChar *)(tempconstart + 1); /* skip the backward offset*/
2561         schar = getNextNormalizedChar(source);
2562
2563         while (schar > (tchar = *UCharOffset)) {
2564             UCharOffset++;
2565         }
2566
2567         if (schar != tchar) {
2568             /* not the correct codepoint. we stuff the current codepoint into
2569             the discontiguos buffer and try the next character */
2570             *tempdb = schar;
2571             tempdb ++;
2572             continue;
2573         }
2574         else {
2575             if (u_getCombiningClass(schar) ==
2576                 u_getCombiningClass(peekCharacter(source, -2))) {
2577                 //u_getCombiningClass(*(source->pos - 2))) {
2578                 *tempdb = schar;
2579                 tempdb ++;
2580                 continue;
2581             }
2582             result = *(coll->contractionCEs +
2583                                       (UCharOffset - coll->contractionIndex));
2584         }
2585         *tempdb = 0;
2586
2587         if (result == UCOL_NOT_FOUND) {
2588           break;
2589         } else if (isContraction(result)) {
2590             /* this is a multi-contraction*/
2591             tempconstart = (UChar *)coll->image + getContractOffset(result);
2592             if (*(coll->contractionCEs + (constart - coll->contractionIndex))
2593                 != UCOL_NOT_FOUND) {
2594                 multicontraction = TRUE;
2595                 temppos       = source->pos + 1;
2596                 tempbufferpos = buffer + u_strlen(buffer);
2597             }
2598         } else {
2599             setDiscontiguosAttribute(source, buffer, tempdb);
2600             return result;
2601         }
2602     }
2603
2604     /* no problems simply reverting just like that,
2605     if we are in string before getting into this function, points back to
2606     string hence no problem.
2607     if we are in normalization buffer before getting into this function,
2608     since we'll never use another normalization within this function, we
2609     know that fcdposition points to a base character. the normalization buffer
2610     never change, hence this revert works. */
2611     loadState(source, &discState, TRUE);
2612     goBackOne(source);
2613
2614     //source->pos   = temppos - 1;
2615     source->flags = tempflags;
2616     return *(coll->contractionCEs + (constart - coll->contractionIndex));
2617 }
2618
2619 static
2620 inline UBool isNonChar(UChar32 cp) {
2621   if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)) {
2622     return TRUE;
2623   }
2624   return FALSE;
2625 }
2626
2627 /* now uses Mark's getImplicitPrimary code */
2628 static
2629 inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
2630   if(isNonChar(cp)) {
2631     return 0;
2632   }
2633   uint32_t r = uprv_uca_getImplicitPrimary(cp);
2634   *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
2635   return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
2636 }
2637
2638 /**
2639 * Inserts the argument character into the front of the buffer replacing the
2640 * front null terminator.
2641 * @param data collation element iterator data
2642 * @param pNull pointer to the null terminator
2643 * @param ch character to be appended
2644 * @return positon of added character
2645 */
2646 static
2647 inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch)
2648 {
2649           uint32_t  size    = data->writableBufSize;
2650           UChar    *end;
2651           UChar    *newbuffer;
2652     const uint32_t  incsize = 5;
2653
2654     if (pNull > data->writableBuffer + 1) {
2655         *pNull       = ch;
2656         *(pNull - 1) = 0;
2657         return pNull;
2658     }
2659
2660     /*
2661     buffer will always be null terminated infront.
2662     giving extra space since it is likely that more characters will be added.
2663     */
2664     size += incsize;
2665     newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size);
2666     if(newbuffer == NULL) {
2667       return NULL;
2668     }
2669     end = newbuffer + incsize;
2670     uprv_memcpy(end, data->writableBuffer,
2671                 data->writableBufSize * sizeof(UChar));
2672     *end       = ch;
2673     *(end - 1) = 0;
2674
2675     freeHeapWritableBuffer(data);
2676
2677     data->writableBufSize = size;
2678     data->writableBuffer  = newbuffer;
2679     return end;
2680 }
2681
2682 /**
2683 * Special normalization function for contraction in the previous iterator.
2684 * This normalization sequence will place the current character at source->pos
2685 * and its following normalized sequence into the buffer.
2686 * The fcd position, pos will be changed.
2687 * pos will now point to positions in the buffer.
2688 * Flags will be changed accordingly.
2689 * @param data collation iterator data
2690 */
2691 static
2692 inline void normalizePrevContraction(collIterate *data)
2693 {
2694     UChar      *buffer     = data->writableBuffer;
2695     uint32_t    buffersize = data->writableBufSize;
2696     uint32_t    nulltermsize;
2697     UErrorCode  status     = U_ZERO_ERROR;
2698     UChar      *pEnd       = data->pos + 1;         /* End normalize + 1 */
2699     UChar      *pStart;
2700     uint32_t    normLen;
2701     UChar      *pStartNorm;
2702
2703     if (data->flags & UCOL_ITER_HASLEN) {
2704         /*
2705         normalization buffer not used yet, we'll pull down the next
2706         character into the end of the buffer
2707         */
2708         *(buffer + (buffersize - 1)) = *(data->pos + 1);
2709         nulltermsize                  = buffersize - 1;
2710     }
2711     else {
2712         nulltermsize = buffersize;
2713         UChar *temp = buffer + (nulltermsize - 1);
2714         while (*(temp --) != 0) {
2715             nulltermsize --;
2716         }
2717     }
2718
2719     /* Start normalize */
2720     if (data->fcdPosition == NULL) {
2721         pStart = data->string;
2722     }
2723     else {
2724         pStart = data->fcdPosition + 1;
2725     }
2726
2727     normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0,
2728                               &status);
2729
2730     if (nulltermsize <= normLen) {
2731         uint32_t  size = buffersize - nulltermsize + normLen + 1;
2732         UChar    *temp = (UChar *)uprv_malloc(size * sizeof(UChar));
2733         if(temp != NULL) {
2734           nulltermsize   = normLen + 1;
2735           uprv_memcpy(temp + normLen, buffer,
2736                       sizeof(UChar) * (buffersize - nulltermsize));
2737           freeHeapWritableBuffer(data);
2738           data->writableBuffer = temp;
2739           data->writableBufSize = size;
2740         }
2741     }
2742
2743     status = U_ZERO_ERROR;
2744     /*
2745     this puts the null termination infront of the normalized string instead
2746     of the end
2747     */
2748     pStartNorm   = buffer + (nulltermsize - normLen);
2749     *(pStartNorm - 1) = 0;
2750     unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen,
2751                     &status);
2752
2753     data->pos        = data->writableBuffer + nulltermsize;
2754     data->origFlags  = data->flags;
2755     data->flags     |= UCOL_ITER_INNORMBUF;
2756     data->flags     &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
2757 }
2758
2759 /**
2760 * Contraction character management function that returns the previous character
2761 * for the backwards iterator.
2762 * Does nothing if the previous character is in buffer and not the first
2763 * character in it.
2764 * Else it checks previous character in data string to see if it is
2765 * normalizable.
2766 * If it is not, the character is simply copied into the buffer, else
2767 * the whole normalized substring is copied into the buffer, including the
2768 * current character.
2769 * @param data collation element iterator data
2770 * @return previous character
2771 */
2772 static
2773 inline UChar getPrevNormalizedChar(collIterate *data)
2774 {
2775     UChar  prevch;
2776     UChar  ch;
2777     UChar *start;
2778     UBool  innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF);
2779     UChar *pNull = NULL;
2780     if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ||
2781         (innormbuf && *(data->pos - 1) != 0)) {
2782         /*
2783         if no normalization.
2784         if previous character is in normalized buffer, no further normalization
2785         is required
2786         */
2787       if(data->flags & UCOL_USE_ITERATOR) {
2788         data->iterator->move(data->iterator, -1, UITER_CURRENT);
2789         return (UChar)data->iterator->next(data->iterator);
2790       } else {
2791         return *(data->pos - 1);
2792       }
2793     }
2794
2795     start = data->pos;
2796     if (data->flags & UCOL_ITER_HASLEN) {
2797         /* in data string */
2798         if ((start - 1) == data->string) {
2799             return *(start - 1);
2800         }
2801         start --;
2802         ch     = *start;
2803         prevch = *(start - 1);
2804     }
2805     else {
2806         /*
2807         in writable buffer, at this point fcdPosition can not be NULL.
2808         see contracting tag.
2809         */
2810         if (data->fcdPosition == data->string) {
2811             /* at the start of the string, just dump it into the normalizer */
2812             insertBufferFront(data, data->pos - 1, *(data->fcdPosition));
2813             data->fcdPosition = NULL;
2814             return *(data->pos - 1);
2815         }
2816         pNull  = data->pos - 1;
2817         start  = data->fcdPosition;
2818         ch     = *start;
2819         prevch = *(start - 1);
2820     }
2821     /*
2822     * if the current character is not fcd.
2823     * Trailing combining class == 0.
2824     */
2825     if (data->fcdPosition > start &&
2826        (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_))
2827     {
2828         /*
2829         Need a more complete FCD check and possible normalization.
2830         normalize substring will be appended to buffer
2831         */
2832         UChar *backuppos = data->pos;
2833         data->pos = start;
2834         if (collPrevIterFCD(data)) {
2835             normalizePrevContraction(data);
2836             return *(data->pos - 1);
2837         }
2838         data->pos = backuppos;
2839         data->fcdPosition ++;
2840     }
2841
2842     if (innormbuf) {
2843     /*
2844     no normalization is to be done hence only one character will be
2845     appended to the buffer.
2846     */
2847         insertBufferFront(data, pNull, ch);
2848         data->fcdPosition --;
2849     }
2850
2851     return ch;
2852 }
2853
2854 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
2855 /* It is called by getNextCE */
2856
2857 uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) {
2858   collIterateState entryState;
2859   backupState(source, &entryState);
2860   UChar32 cp = ch;
2861
2862   for (;;) {
2863     // This loop will repeat only in the case of contractions, and only when a contraction
2864     //   is found and the first CE resulting from that contraction is itself a special
2865     //   (an expansion, for example.)  All other special CE types are fully handled the
2866     //   first time through, and the loop exits.
2867
2868     const uint32_t *CEOffset = NULL;
2869     switch(getCETag(CE)) {
2870     case NOT_FOUND_TAG:
2871       /* This one is not found, and we'll let somebody else bother about it... no more games */
2872       return CE;
2873     case SURROGATE_TAG:
2874       /* we encountered a leading surrogate. We shall get the CE by using the following code unit */
2875       /* two things can happen here: next code point can be a trailing surrogate - we will use it */
2876       /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
2877       /* we return 0 (completely ignorable - per UCA specification */
2878       {
2879         UChar trail;
2880         collIterateState state;
2881         backupState(source, &state);
2882         if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
2883           // we chould have stepped one char forward and it might have turned that it
2884           // was not a trail surrogate. In that case, we have to backup.
2885           loadState(source, &state, TRUE);
2886           return 0;
2887         } else {
2888           /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
2889           CE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, CE&0xFFFFFF, trail);
2890           if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one.
2891             // We need to backup
2892             loadState(source, &state, TRUE);
2893             return CE;
2894           }
2895           // calculate the supplementary code point value, if surrogate was not tailored
2896           cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
2897         }
2898       }
2899       break;
2900     case THAI_TAG:
2901       /* Thai/Lao reordering */
2902         if  (((source->flags) & UCOL_ITER_INNORMBUF)      /* Already Swapped     ||                 */
2903           || collIter_eos(source))                        /* At end of string.  No swap possible    */
2904         {
2905             // Treat Thai as a length one expansion */
2906             CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
2907             CE = *CEOffset++;
2908         }
2909         else
2910         {
2911           // Move the prevowel and the following base Consonant into the normalization buffer
2912           //   with their order swapped
2913           // Note: this operation might activate the normalization buffer. We have to check for
2914           // that and act accordingly.
2915           UChar thCh = getNextNormalizedChar(source);
2916           UChar32 cp = 0;
2917           if(U16_IS_LEAD(thCh)) {
2918             if(!collIter_eos(source)) {
2919               collIterateState thaiState;
2920               backupState(source, &thaiState);
2921               UChar trailCh = getNextNormalizedChar(source);
2922               if(U16_IS_TRAIL(trailCh)) {
2923                 cp = U16_GET_SUPPLEMENTARY(thCh, trailCh);
2924               } else {
2925                 loadState(source, &thaiState, TRUE);
2926                 cp = (UChar32)thCh;
2927               }
2928             } else {
2929               cp = (UChar32)thCh;
2930             }
2931           } else {
2932               cp = (UChar32)thCh;
2933           }
2934           // Now we have the character that needs to be decomposed
2935           // if the normalizing buffer was not used, we can just use our structure and be happy.
2936           if((source->flags & UCOL_ITER_INNORMBUF) == 0) {
2937             // decompose into writable buffer
2938             int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
2939             if(decompLen < 0) {
2940               decompLen = -decompLen;
2941             }
2942             // reorder Thai and the character after it
2943             if(decompLen >= 2 && U16_IS_LEAD(source->writableBuffer[1]) && U16_IS_TRAIL(source->writableBuffer[2])) {
2944               source->writableBuffer[0] = source->writableBuffer[1];
2945               source->writableBuffer[1] = source->writableBuffer[2];
2946               source->writableBuffer[2] = ch;
2947             } else {
2948               source->writableBuffer[0] = source->writableBuffer[1];
2949               source->writableBuffer[1] = ch;
2950             }
2951             // zero terminate, since normalization buffer is always zero terminated
2952             source->writableBuffer[decompLen+1] = 0; // we added the prevowel
2953             if(source->pos) {
2954               source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
2955                                                            //   after exhausting the writableBuffer
2956             }
2957             source->pos   = source->writableBuffer;
2958             source->origFlags         = source->flags;
2959             source->flags            |= UCOL_ITER_INNORMBUF;
2960             source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR);
2961           }
2962           else {
2963               // stuff is already normalized... what to do here???
2964
2965               // if we are in the normalization buffer, thCh must be in it
2966               // prove by contradiction
2967               // if thCh is not in the normalization buffer,
2968               // that means that trailCh is the normalization buffer
2969               // that means that trailCh is a trail surrogate by the above
2970               // bounding if block, this is a contradiction because there
2971               // are no characters at the moment that decomposes to an
2972               // unmatched surrogate. qed.
2973               if (cp >= 0x10000) {
2974                   source->writableBuffer[0] = source->writableBuffer[1];
2975                   source->writableBuffer[1] = source->writableBuffer[2];
2976                   source->writableBuffer[2] = ch;
2977               }
2978               else {
2979                   source->writableBuffer[0] = source->writableBuffer[1];
2980                   source->writableBuffer[1] = ch;
2981               }
2982               source->pos = source->writableBuffer;
2983           }
2984           CE = UCOL_IGNORABLE;
2985       }
2986       break;
2987     case SPEC_PROC_TAG:
2988       {
2989         // Special processing is getting a CE that is preceded by a certain prefix
2990         // Currently this is only needed for optimizing Japanese length and iteration marks.
2991         // When we encouter a special processing tag, we go backwards and try to see if
2992         // we have a match.
2993         // Contraction tables are used - so the whole process is not unlike contraction.
2994         // prefix data is stored backwards in the table.
2995         const UChar *UCharOffset;
2996         UChar schar, tchar;
2997         collIterateState prefixState;
2998         backupState(source, &prefixState);
2999         loadState(source, &entryState, TRUE);
3000         goBackOne(source); // We want to look at the point where we entered - actually one
3001         // before that...
3002
3003         for(;;) {
3004         // This loop will run once per source string character, for as long as we
3005         //  are matching a potential contraction sequence
3006
3007           // First we position ourselves at the begining of contraction sequence
3008           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3009           if (collIter_bos(source)) {
3010             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3011             break;
3012           }
3013           schar = getPrevNormalizedChar(source);
3014           goBackOne(source);
3015
3016           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3017             UCharOffset++;
3018           }
3019
3020           if (schar == tchar) {
3021               // Found the source string char in the table.
3022               //  Pick up the corresponding CE from the table.
3023               CE = *(coll->contractionCEs +
3024                   (UCharOffset - coll->contractionIndex));
3025           }
3026           else
3027           {
3028               // if there is a completely ignorable code point in the middle of
3029               // a prefix, we need to act as if it's not there
3030               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3031               // lone surrogates cannot be set to zero as it would break other processing
3032               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3033               // it's easy for BMP code points
3034               if(isZeroCE == 0) {
3035                 continue;
3036               } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3037                 // for supplementary code points, we have to check the next one
3038                 // situations where we are going to ignore
3039                 // 1. beginning of the string: schar is a lone surrogate
3040                 // 2. schar is a lone surrogate
3041                 // 3. schar is a trail surrogate in a valid surrogate sequence
3042                 //    that is explicitly set to zero.
3043                 if (!collIter_bos(source)) {
3044                   UChar lead;
3045                   if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3046                     isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3047                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3048                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3049                       if(finalCE == 0) {
3050                         // this is a real, assigned completely ignorable code point
3051                         goBackOne(source);
3052                         continue;
3053                       }
3054                     }
3055                   } else {
3056                     // lone surrogate, completely ignorable
3057                     continue;
3058                   }
3059                 } else {
3060                   // lone surrogate at the beggining, completely ignorable
3061                   continue;
3062                 }
3063               }
3064               // Source string char was not in the table.
3065               //   We have not found the prefix.
3066               CE = *(coll->contractionCEs +
3067                   (ContractionStart - coll->contractionIndex));
3068           }
3069
3070           if(!isPrefix(CE)) {
3071               // The source string char was in the contraction table, and the corresponding
3072               //   CE is not a prefix CE.  We found the prefix, break
3073               //   out of loop, this CE will end up being returned.  This is the normal
3074               //   way out of prefix handling when the source actually contained
3075               //   the prefix.
3076               break;
3077           }
3078         }
3079         if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue
3080           loadState(source, &prefixState, TRUE);
3081           if(source->origFlags & UCOL_USE_ITERATOR) {
3082             source->flags = source->origFlags;
3083           }
3084         } else { // prefix search was a failure, we have to backup all the way to the start
3085           loadState(source, &entryState, TRUE);
3086         }
3087       break;
3088       }
3089     case CONTRACTION_TAG:
3090       {
3091       /* This should handle contractions */
3092       collIterateState state;
3093       backupState(source, &state);
3094       uint32_t firstCE = UCOL_NOT_FOUND;
3095       const UChar *UCharOffset;
3096       UChar schar, tchar;
3097
3098       for (;;) {
3099         /* This loop will run once per source string character, for as long as we     */
3100         /*  are matching a potential contraction sequence                  */
3101
3102         /* First we position ourselves at the begining of contraction sequence */
3103         const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3104
3105         if (collIter_eos(source)) {
3106             // Ran off the end of the source string.
3107             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3108             // So we'll pick whatever we have at the point...
3109             if (CE == UCOL_NOT_FOUND) {
3110                 // back up the source over all the chars we scanned going into this contraction.
3111                 CE = firstCE;
3112                 loadState(source, &state, TRUE);
3113                 if(source->origFlags & UCOL_USE_ITERATOR) {
3114                     source->flags = source->origFlags;
3115                 }
3116             }
3117             break;
3118         }
3119
3120         uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */
3121         uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8);
3122
3123         schar = getNextNormalizedChar(source);
3124         while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3125           UCharOffset++;
3126         }
3127
3128         if (schar == tchar) {
3129             // Found the source string char in the contraction table.
3130             //  Pick up the corresponding CE from the table.
3131             CE = *(coll->contractionCEs +
3132                 (UCharOffset - coll->contractionIndex));
3133         }
3134         else
3135         {
3136             // if there is a completely ignorable code point in the middle of
3137             // contraction, we need to act as if it's not there
3138             uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3139             // it's easy for BMP code points
3140             if(isZeroCE == 0) {
3141                 continue;
3142             } else if(UTF_IS_LEAD(schar)) {
3143               if(!collIter_eos(source)) {
3144                 backupState(source, &state);
3145                 UChar trail = getNextNormalizedChar(source);
3146                 if(UTF_IS_TRAIL(trail)) { // do stuff with trail
3147                   if(getCETag(isZeroCE) == SURROGATE_TAG) {
3148                     uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
3149                     if(finalCE == 0) {
3150                       continue;
3151                     }
3152                   }
3153                 } else {
3154                   // broken surrogate sequence, thus completely ignorable
3155                   loadState(source, &state, TRUE);
3156                   continue;
3157                 }
3158                 loadState(source, &state, TRUE);
3159               } else { // no  more characters, so broken surrogate pair...
3160                 // this contraction will ultimately fail, but not because of us
3161                 continue;
3162               }
3163             } // else if(UTF_IS_LEAD(schar))
3164
3165             // Source string char was not in contraction table.
3166             //   Unless we have a discontiguous contraction, we have finished
3167             //   with this contraction.
3168             uint8_t sCC;
3169             if (schar < 0x300 ||
3170                 maxCC == 0 ||
3171                 (sCC = i_getCombiningClass(schar, coll)) == 0 ||
3172                 sCC>maxCC ||
3173                 (allSame != 0 && sCC == maxCC) ||
3174                 collIter_eos(source)) {
3175                     //  Contraction can not be discontiguous.
3176                     goBackOne(source);  // back up the source string by one,
3177                                         //  because  the character we just looked at was
3178                                         //  not part of the contraction.   */
3179                     CE = *(coll->contractionCEs +
3180                         (ContractionStart - coll->contractionIndex));
3181             } else {
3182                 //
3183                 // Contraction is possibly discontiguous.
3184                 //   Scan more of source string looking for a match
3185                 //
3186                 UChar tempchar;
3187                 /* find the next character if schar is not a base character
3188                     and we are not yet at the end of the string */
3189                 tempchar = getNextNormalizedChar(source);
3190                 goBackOne(source);
3191                 if (i_getCombiningClass(tempchar, coll) == 0) {
3192                     goBackOne(source);
3193                     /* Spit out the last char of the string, wasn't tasty enough */
3194                     CE = *(coll->contractionCEs +
3195                         (ContractionStart - coll->contractionIndex));
3196                 } else {
3197                     CE = getDiscontiguous(coll, source, ContractionStart);
3198                 }
3199             }
3200         } // else after if(schar == tchar)
3201
3202         if(CE == UCOL_NOT_FOUND) {
3203             /* The Source string did not match the contraction that we were checking.  */
3204             /*  Back up the source position to undo the effects of having partially    */
3205             /*   scanned through what ultimately proved to not be a contraction.       */
3206           loadState(source, &state, TRUE);
3207           CE = firstCE;
3208           break;
3209         }
3210
3211         if(!isContraction(CE)) {
3212             // The source string char was in the contraction table, and the corresponding
3213             //   CE is not a contraction CE.  We completed the contraction, break
3214             //   out of loop, this CE will end up being returned.  This is the normal
3215             //   way out of contraction handling when the source actually contained
3216             //   the contraction.
3217             break;
3218         }
3219
3220
3221         // The source string char was in the contraction table, and the corresponding
3222         //   CE is IS  a contraction CE.  We will continue looping to check the source
3223         //   string for the remaining chars in the contraction.
3224         uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));
3225         if(tempCE != UCOL_NOT_FOUND) {
3226             // We have scanned a a section of source string for which there is a
3227             //  CE from the contraction table.  Remember the CE and scan position, so
3228             //  that we can return to this point if further scanning fails to
3229             //  match a longer contraction sequence.
3230             firstCE = tempCE;
3231
3232             goBackOne(source);
3233             backupState(source, &state);
3234             getNextNormalizedChar(source);
3235
3236             // Another way to do this is:
3237             //collIterateState tempState;
3238             //backupState(source, &tempState);
3239             //goBackOne(source);
3240             //backupState(source, &state);
3241             //loadState(source, &tempState, TRUE);
3242
3243             // The problem is that for incomplete contractions we have to remember the previous
3244             // position. Before, the only thing I needed to do was state.pos--;
3245             // After iterator introduction and especially after introduction of normalizing
3246             // iterators, it became much more difficult to decrease the saved state.
3247             // I'm not yet sure which of the two methods above is faster.
3248         }
3249       } // for(;;)
3250       break;
3251       } // case CONTRACTION_TAG:
3252     case LONG_PRIMARY_TAG:
3253       {
3254         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3255         CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3256         return CE;
3257       }
3258     case EXPANSION_TAG:
3259       {
3260       /* This should handle expansion. */
3261       /* NOTE: we can encounter both continuations and expansions in an expansion! */
3262       /* I have to decide where continuations are going to be dealt with */
3263       uint32_t size;
3264       uint32_t i;    /* general counter */
3265       CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3266       size = getExpansionCount(CE);
3267       CE = *CEOffset++;
3268       if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3269         for(i = 1; i<size; i++) {
3270           *(source->CEpos++) = *CEOffset++;
3271         }
3272       } else { /* else, we do */
3273         while(*CEOffset != 0) {
3274           *(source->CEpos++) = *CEOffset++;
3275         }
3276       }
3277       return CE;
3278       }
3279     case DIGIT_TAG:
3280       {
3281       /*
3282          We do a check to see if we want to collate digits as numbers; if so we generate
3283          a custom collation key. Otherwise we pull out the value stored in the expansion table.
3284       */
3285       uint32_t size;
3286       uint32_t i;    /* general counter */
3287       collIterateState digitState;
3288
3289       if (source->coll->numericCollation == UCOL_ON){
3290         UChar32 char32 = 0;
3291
3292         uint32_t digIndx = 0;
3293         uint32_t endIndex = 0;
3294         uint32_t trailingZeroIndex = 0;
3295
3296         uint32_t primWeight = 0;
3297
3298         int32_t digVal = 0;
3299         uint8_t collateVal = 0;
3300
3301         UBool nonZeroValReached = FALSE;
3302
3303         uint8_t *numTempBuf;
3304         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3305         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3306
3307         numTempBuf = stackNumTempBuf;
3308         /*
3309              We parse the source string until we hit a char that's NOT a digit.
3310             Use this u_charDigitValue. This might be slow because we have to
3311             handle surrogates...
3312         */
3313 /*
3314         if (U16_IS_LEAD(ch)){
3315           if (!collIter_eos(source)) {
3316             backupState(source, &digitState);
3317             UChar trail = getNextNormalizedChar(source);
3318             if(U16_IS_TRAIL(trail)) {
3319               char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3320             } else {
3321               loadState(source, &digitState, TRUE);
3322               char32 = ch;
3323             }
3324           } else {
3325             char32 = ch;
3326           }
3327         } else {
3328           char32 = ch;
3329         }
3330         digVal = u_charDigitValue(char32);
3331 */
3332         digVal = u_charDigitValue(cp); // if we have arrived here, we have
3333         // already processed possible supplementaries that trigered the digit tag -
3334         // all supplementaries are marked in the UCA.
3335         /*
3336             We  pad a zero in front of the first element anyways. This takes
3337             care of the (probably) most common case where people are sorting things followed
3338             by a single digit
3339         */
3340         digIndx++;
3341         for(;;){
3342         // Make sure we have enough space.
3343         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
3344         {
3345             numTempBufSize *= 2;
3346             if (numTempBuf == stackNumTempBuf){
3347                 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
3348                 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
3349             }else
3350                 uprv_realloc(numTempBuf, numTempBufSize);
3351         }
3352
3353             // Skipping over leading zeroes.
3354             if (digVal != 0 || nonZeroValReached){
3355                 if (digVal != 0 && !nonZeroValReached)
3356                     nonZeroValReached = TRUE;
3357
3358                 /*
3359                     We parse the digit string into base 100 numbers (this fits into a byte).
3360                     We only add to the buffer in twos, thus if we are parsing an odd character,
3361                     that serves as the 'tens' digit while the if we are parsing an even one, that
3362                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
3363                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
3364                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
3365                     than all the other bytes.
3366                  */
3367
3368                 if (digIndx % 2 == 1){
3369                     collateVal += (uint8_t)digVal;
3370
3371                     // We don't enter the low-order-digit case unless we've already seen
3372                     // the high order, or for the first digit, which is always non-zero.
3373                     if (collateVal != 0)
3374                         trailingZeroIndex = 0;
3375
3376                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3377                     collateVal = 0;
3378                 }
3379                 else{
3380                     // We drop the collation value into the buffer so if we need to do
3381                     // a "front patch" we don't have to check to see if we're hitting the
3382                     // last element.
3383                     collateVal = (uint8_t)(digVal * 10);
3384
3385                     // Check for trailing zeroes.
3386                     if (collateVal == 0)
3387                     {
3388                         if (!trailingZeroIndex)
3389                             trailingZeroIndex = (digIndx/2) + 2;
3390                     }
3391                     else
3392                         trailingZeroIndex = 0;
3393
3394                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
3395                 }
3396                 digIndx++;
3397             }
3398
3399             // Get next character.
3400             if (!collIter_eos(source)){
3401                 ch = getNextNormalizedChar(source);
3402                 if (U16_IS_LEAD(ch)){
3403                   if (!collIter_eos(source)) {
3404                     backupState(source, &digitState);
3405                     UChar trail = getNextNormalizedChar(source);
3406                     if(U16_IS_TRAIL(trail)) {
3407                       char32 = U16_GET_SUPPLEMENTARY(ch, trail);
3408                     } else {
3409                       loadState(source, &digitState, TRUE);
3410                       char32 = ch;
3411                     }
3412                   }
3413                 } else {
3414                   char32 = ch;
3415                 }
3416
3417                 if ((digVal = u_charDigitValue(char32)) == -1){
3418                     // Resetting position to point to the next unprocessed char. We
3419                     // overshot it when doing our test/set for numbers.
3420                   if (char32 > 0xFFFF) { // For surrogates.
3421                     loadState(source, &digitState, TRUE);
3422                     //goBackOne(source);
3423                   }
3424               goBackOne(source);
3425                   break;
3426                 }
3427             } else {
3428               break;
3429             }
3430         }
3431
3432         if (nonZeroValReached == FALSE){
3433             digIndx = 2;
3434             numTempBuf[2] = 6;
3435         }
3436
3437         endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
3438         if (digIndx % 2 != 0){
3439             /*
3440                 We missed a value. Since digIndx isn't even, stuck too many values into the buffer (this is what
3441                 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward.
3442                 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a
3443                 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case.
3444             */
3445
3446             for(i = 2; i < endIndex; i++){
3447                 numTempBuf[i] =     (((((numTempBuf[i] - 6)/2) % 10) * 10) +
3448                                     (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
3449             }
3450             --digIndx;
3451         }
3452
3453         // Subtract one off of the last byte.
3454         numTempBuf[endIndex-1] -= 1;
3455
3456         /*
3457             We want to skip over the first two slots in the buffer. The first slot
3458             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
3459             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
3460         */
3461         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
3462         numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
3463
3464         // Now transfer the collation key to our collIterate struct.
3465         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
3466           size = ((endIndex+1) & ~1)/2;
3467           CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
3468                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
3469                 UCOL_BYTE_COMMON; // Tertiary weight.
3470           i = 2; // Reset the index into the buffer.
3471           while(i < endIndex)
3472           {
3473             primWeight = numTempBuf[i++] << 8;
3474             if ( i < endIndex)
3475                 primWeight |= numTempBuf[i++];
3476             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
3477           }
3478
3479           if (numTempBuf != stackNumTempBuf)
3480             uprv_free(numTempBuf);
3481       } else {
3482         // no numeric mode, we'll just switch to whatever we stashed and continue
3483           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3484           CE = *CEOffset++;
3485           break;
3486 #if 0
3487           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
3488           size = getExpansionCount(CE);
3489           CE = *CEOffset++;
3490           if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
3491             for(i = 1; i<size; i++) {
3492               *(source->CEpos++) = *CEOffset++;
3493             }
3494           } else { /* else, we do */
3495             while(*CEOffset != 0) {
3496               *(source->CEpos++) = *CEOffset++;
3497             }
3498           }
3499 #endif
3500       }
3501       return CE;
3502       }
3503     /* various implicits optimization */
3504     // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
3505     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
3506       //return getImplicit(cp, source, 0x04000000);
3507       return getImplicit(cp, source);
3508     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
3509       /* UCA is filled with these. Tailorings are NOT_FOUND */
3510       //return getImplicit(cp, source, 0);
3511       return getImplicit(cp, source);
3512     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
3513       return 0; /* broken surrogate sequence */
3514     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
3515       UChar nextChar;
3516       if( source->flags & UCOL_USE_ITERATOR) {
3517         if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) {
3518           cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3519           source->iterator->next(source->iterator);
3520           return getImplicit(cp, source);
3521         }  else {
3522           return 0;
3523         }
3524       } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
3525         U_IS_TRAIL((nextChar=*source->pos))) {
3526         cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
3527         source->pos++;
3528         return getImplicit(cp, source);
3529       } else {
3530         return 0; /* completely ignorable */
3531       }
3532     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
3533       {
3534         const uint32_t
3535           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
3536         //const uint32_t LCount = 19;
3537         const uint32_t VCount = 21;
3538         const uint32_t TCount = 28;
3539         //const uint32_t NCount = VCount * TCount;   // 588
3540         //const uint32_t SCount = LCount * NCount;   // 11172
3541         uint32_t L = ch - SBase;
3542
3543         // divide into pieces
3544
3545         uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation
3546         L /= TCount;
3547         uint32_t V = L % VCount;
3548         L /= VCount;
3549
3550         // offset them
3551
3552         L += LBase;
3553         V += VBase;
3554         T += TBase;
3555
3556         // return the first CE, but first put the rest into the expansion buffer
3557         if (!source->coll->image->jamoSpecial) { // FAST PATH
3558
3559           /**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
3560           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
3561           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
3562           if (T != TBase) {
3563               /**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
3564               /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
3565               *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
3566           }
3567
3568           /*return ucmpe32_get(UCA->mapping, L);*/ // return first one
3569           /*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
3570           return UTRIE_GET32_FROM_LEAD(coll->mapping, L);
3571
3572         } else { // Jamo is Special
3573           // Since Hanguls pass the FCD check, it is
3574           // guaranteed that we won't be in
3575           // the normalization buffer if something like this happens
3576           // However, if we are using a uchar iterator and normalization
3577           // is ON, the Hangul that lead us here is going to be in that
3578           // normalization buffer. Here we want to restore the uchar
3579           // iterator state and pull out of the normalization buffer
3580           if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) {
3581             source->flags = source->origFlags; // restore the iterator
3582             source->pos = NULL;
3583           }
3584           // Move Jamos into normalization buffer
3585           source->writableBuffer[0] = (UChar)L;
3586           source->writableBuffer[1] = (UChar)V;
3587           if (T != TBase) {
3588             source->writableBuffer[2] = (UChar)T;
3589             source->writableBuffer[3] = 0;
3590           } else {
3591             source->writableBuffer[2] = 0;
3592           }
3593
3594           source->fcdPosition       = source->pos;   // Indicate where to continue in main input string
3595                                                          //   after exhausting the writableBuffer
3596           source->pos   = source->writableBuffer;
3597           source->origFlags         = source->flags;
3598           source->flags            |= UCOL_ITER_INNORMBUF;
3599           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3600
3601           return(UCOL_IGNORABLE);
3602         }
3603       }
3604     case CHARSET_TAG:
3605     /* not yet implemented */
3606       /* probably after 1.8 */
3607       return UCOL_NOT_FOUND;
3608     default:
3609       *status = U_INTERNAL_PROGRAM_ERROR;
3610       CE=0;
3611       break;
3612     }
3613     if (CE <= UCOL_NOT_FOUND) break;
3614   }
3615   return CE;
3616 }
3617
3618
3619 /* now uses Mark's getImplicitPrimary code */
3620 static
3621 inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
3622   if(isNonChar(cp)) {
3623     return 0;
3624   }
3625
3626   uint32_t r = uprv_uca_getImplicitPrimary(cp);
3627
3628   *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
3629   collationSource->toReturn = collationSource->CEpos;
3630   return ((r & 0x0000FFFF)<<16) | 0x000000C0;
3631 }
3632
/**
 * This function handles the special CEs like contractions, expansions,
 * surrogates, Thai.
 * It is called by getPrevCE.
 */
3638 uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
3639                           collIterate *source,
3640                           UErrorCode *status)
3641 {
3642   const uint32_t *CEOffset    = NULL;
3643         UChar    *UCharOffset = NULL;
3644         UChar    schar;
3645   const UChar    *constart    = NULL;
3646         uint32_t size;
3647         UChar    buffer[UCOL_MAX_BUFFER];
3648         uint32_t *endCEBuffer;
3649         UChar   *strbuffer;
3650         int32_t noChars = 0;
3651
3652   for(;;)
3653   {
3654     /* the only ces that loops are thai and contractions */
3655     switch (getCETag(CE))
3656     {
3657     case NOT_FOUND_TAG:  /* this tag always returns */
3658       return CE;
3659     case SURROGATE_TAG:  /* This is a surrogate pair */
3660       /* essentialy an engaged lead surrogate. */
3661       /* if you have encountered it here, it means that a */
3662       /* broken sequence was encountered and this is an error */
3663       return 0;
3664     case THAI_TAG:
3665       if  ((source->flags & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
3666             source->string == source->pos        || /* At start of string.|| */
3667             /* previous char not Thai prevowel */
3668             /*UCOL_ISTHAIBASECONSONANT(*(source->pos)) == FALSE ||*/ // This is from the old specs - we now rearrange unconditionally
3669             UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1)) == FALSE)
3670             //UCOL_ISTHAIPREVOWEL(*(source->pos - 1)) == FALSE)
3671       {
3672           /* Treat Thai as a length one expansion */
3673           /* find the offset to expansion table */
3674           CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE);
3675           CE = *CEOffset ++;
3676       }
3677       else
3678       {
3679           /*
3680           Move the prevowel and the following base Consonant into the
3681           normalization buffer with their order swapped
3682           */
3683           UChar32 cp = (UChar32)peekCharacter(source, 0);
3684           UBool reorder = TRUE;
3685
3686           int32_t decompLen = unorm_getDecomposition(cp, FALSE, source->writableBuffer, UCOL_WRITABLE_BUFFER_SIZE-1);
3687           if(decompLen < 0) {
3688             decompLen = -decompLen; // there was no decomposition
3689           } else { // we need to check if we will hit a contraction trigger because of decomposition
3690             int32_t i = decompLen;
3691             for(i = 0; i < decompLen; i++) {
3692               if(ucol_contractionEndCP(source->writableBuffer[i], coll)) {
3693                 reorder = FALSE;
3694               }
3695             }
3696           }
3697
3698           UChar *tempbuffer = source->writableBuffer +
3699                               (source->writableBufSize - 1);
3700           uprv_memcpy(tempbuffer-decompLen + 1, source->writableBuffer, sizeof(UChar)*decompLen);
3701           if(reorder) {
3702             *(tempbuffer - decompLen) = *(tempbuffer - decompLen + 1);
3703             *(tempbuffer - decompLen + 1)     = peekCharacter(source, -1);
3704           } else {
3705             *(tempbuffer - decompLen) = peekCharacter(source, -1);
3706           }
3707           *(tempbuffer - decompLen - 1) = 0;
3708
3709
3710 /*
3711           UChar *tempbuffer = source->writableBuffer +
3712                               (source->writableBufSize - 1);
3713           *(tempbuffer - 2) = 0;
3714           *(tempbuffer - 1) = peekCharacter(source, 0);
3715           *(tempbuffer)     = peekCharacter(source, -1);
3716 */
3717           /*
3718           Indicate where to continue in main input string after exhausting
3719           the writableBuffer
3720           */
3721           if (source->pos - 1 == source->string) {
3722               source->fcdPosition = NULL;
3723           } else {
3724             source->fcdPosition       = source->pos-2;
3725           }
3726
3727           source->pos               = tempbuffer+1; // we're doing predecrement, right?
3728           source->origFlags         = source->flags;
3729           source->flags            |= UCOL_ITER_INNORMBUF;
3730           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
3731
3732           //CE = UCOL_IGNORABLE;
3733           return(UCOL_IGNORABLE);
3734       }
3735       break;
3736     case SPEC_PROC_TAG:
3737       {
3738         // Special processing is getting a CE that is preceded by a certain prefix
3739         // Currently this is only needed for optimizing Japanese length and iteration marks.
3740         // When we encouter a special processing tag, we go backwards and try to see if
3741         // we have a match.
3742         // Contraction tables are used - so the whole process is not unlike contraction.
3743         // prefix data is stored backwards in the table.
3744         const UChar *UCharOffset;
3745         UChar schar, tchar;
3746         collIterateState prefixState;
3747         backupState(source, &prefixState);
3748         for(;;) {
3749         // This loop will run once per source string character, for as long as we
3750         //  are matching a potential contraction sequence
3751
3752           // First we position ourselves at the begining of contraction sequence
3753           const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
3754
3755           if (collIter_bos(source)) {
3756             CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
3757             break;
3758           }
3759           schar = getPrevNormalizedChar(source);
3760           goBackOne(source);
3761
3762           while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
3763             UCharOffset++;
3764           }
3765
3766           if (schar == tchar) {
3767               // Found the source string char in the table.
3768               //  Pick up the corresponding CE from the table.
3769               CE = *(coll->contractionCEs +
3770                   (UCharOffset - coll->contractionIndex));
3771           }
3772           else
3773           {
3774               // if there is a completely ignorable code point in the middle of
3775               // a prefix, we need to act as if it's not there
3776               // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
3777               // lone surrogates cannot be set to zero as it would break other processing
3778               uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
3779               // it's easy for BMP code points
3780               if(isZeroCE == 0) {
3781                 continue;
3782               } else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
3783                 // for supplementary code points, we have to check the next one
3784                 // situations where we are going to ignore
3785                 // 1. beginning of the string: schar is a lone surrogate
3786                 // 2. schar is a lone surrogate
3787                 // 3. schar is a trail surrogate in a valid surrogate sequence
3788                 //    that is explicitly set to zero.
3789                 if (!collIter_bos(source)) {
3790                   UChar lead;
3791                   if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
3792                     isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
3793                     if(getCETag(isZeroCE) == SURROGATE_TAG) {
3794                       uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
3795                       if(finalCE == 0) {
3796                         // this is a real, assigned completely ignorable code point
3797                         goBackOne(source);
3798                         continue;
3799                       }
3800                     }
3801                   } else {
3802                     // lone surrogate, completely ignorable
3803                     continue;
3804                   }
3805                 } else {
3806                   // lone surrogate at the beggining, completely ignorable
3807                   continue;
3808                 }
3809               }
3810               // Source string char was not in the table.
3811               //   We have not found the prefix.
3812               CE = *(coll->contractionCEs +
3813                   (ContractionStart - coll->contractionIndex));
3814           }
3815
3816           if(!isPrefix(CE)) {
3817               // The source string char was in the contraction table, and the corresponding
3818               //   CE is not a prefix CE.  We found the prefix, break
3819               //   out of loop, this CE will end up being returned.  This is the normal
3820               //   way out of prefix handling when the source actually contained
3821               //   the prefix.
3822               break;
3823           }
3824         }
3825       loadState(source, &prefixState, TRUE);
3826       break;
3827       }
3828
3829     case CONTRACTION_TAG:
3830         /* to ensure that the backwards and forwards iteration matches, we
3831         take the current region of most possible match and pass it through
3832         the forward iteration. this will ensure that the obstinate problem of
3833         overlapping contractions will not occur.
3834         */
3835         schar = peekCharacter(source, 0);
3836         constart = (UChar *)coll->image + getContractOffset(CE);
3837         if (isAtStartPrevIterate(source)
3838             /* commented away contraction end checks after adding the checks
3839             in getPrevCE  */) {
3840             /* start of string or this is not the end of any contraction */
3841             CE = *(coll->contractionCEs +
3842                      (constart - coll->contractionIndex));
3843             break;
3844         }
3845         strbuffer = buffer;
3846         UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
3847         *(UCharOffset --) = 0;
3848         noChars = 0;
3849         // have to swap thai characters
3850         while (ucol_unsafeCP(schar, coll) || UCOL_ISTHAIPREVOWEL(peekCharacter(source, -1))) {
3851           // we might have ended here after trying to reorder Thai, but seeing that there are unsafe points
3852           // in the backward processing
3853             *(UCharOffset) = schar;
3854             noChars++;
3855             UCharOffset --;
3856             schar = getPrevNormalizedChar(source);
3857             goBackOne(source);
3858             // TODO: when we exhaust the contraction buffer,
3859             // it needs to get reallocated. The problem is
3860             // that the size depends on the string which is
3861             // not iterated over. However, since we're travelling
3862             // backwards, we already had to set the iterator at
3863             // the end - so we might as well know where we are?
3864             if (UCharOffset + 1 == buffer) {
3865                 /* we have exhausted the buffer */
3866               int32_t newsize = 0;
3867               if(source->pos) { // actually dealing with a position
3868                 newsize = source->pos - source->string + 1;
3869               } else { // iterator
3870                 newsize = 4 * UCOL_MAX_BUFFER;
3871               }
3872                 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) *
3873                                              (newsize + UCOL_MAX_BUFFER));
3874                 /* test for NULL */
3875                 if (strbuffer == NULL) {
3876                     *status = U_MEMORY_ALLOCATION_ERROR;
3877                     return UCOL_NO_MORE_CES;
3878                 }
3879                 UCharOffset = strbuffer + newsize;
3880                 uprv_memcpy(UCharOffset, buffer,
3881                                              UCOL_MAX_BUFFER * sizeof(UChar));
3882                 UCharOffset --;
3883             }
3884             if ((source->pos && (source->pos == source->string ||
3885                 ((source->flags & UCOL_ITER_INNORMBUF) &&
3886                 *(source->pos - 1) == 0 && source->fcdPosition == NULL)))
3887                 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) {
3888                 break;
3889             }
3890         }
3891         /* adds the initial base character to the string */
3892         *(UCharOffset) = schar;
3893         noChars++;
3894
3895         /* a new collIterate is used to simplify things, since using the current
3896         collIterate will mean that the forward and backwards iteration will
3897         share and change the same buffers. we don't want to get into that. */
3898         collIterate temp;
3899         //IInit_collIterate(coll, UCharOffset, -1, &temp);
3900         IInit_collIterate(coll, UCharOffset, noChars, &temp);
3901         temp.flags &= ~UCOL_ITER_NORM;
3902
3903         CE = ucol_IGetNextCE(coll, &temp, status);
3904         endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE;
3905         while (CE != UCOL_NO_MORE_CES) {
3906             *(source->CEpos ++) = CE;
3907             if (source->CEpos == endCEBuffer) {
3908                 /* ran out of CE space, bail.
3909                 there's no guarantee of the right character position after
3910                 this bail*/
3911                 *status = U_BUFFER_OVERFLOW_ERROR;
3912                 source->CEpos = source->CEs;
3913                 freeHeapWritableBuffer(&temp);
3914                 if (strbuffer != buffer) {
3915                     uprv_free(strbuffer);
3916                 }
3917                 return (uint32_t)UCOL_NULLORDER;
3918             }
3919             CE = ucol_IGetNextCE(coll, &temp, status);
3920         }
3921         freeHeapWritableBuffer(&temp);
3922         if (strbuffer != buffer) {
3923             uprv_free(strbuffer);
3924         }
3925         source->toReturn = source->CEpos - 1;
3926         if (source->toReturn == source->CEs) {
3927             source->CEpos = source->CEs;
3928         }
3929         return *(source->toReturn);
3930     case LONG_PRIMARY_TAG:
3931       {
3932         *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON;
3933         *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
3934         source->toReturn = source->CEpos - 1;
3935         return *(source->toReturn);
3936       }
3937     case EXPANSION_TAG: /* this tag always returns */
3938       /*
3939       This should handle expansion.
3940       NOTE: we can encounter both continuations and expansions in an expansion!
3941       I have to decide where continuations are going to be dealt with
3942       */
3943       /* find the offset to expansion table */
3944       CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
3945       size     = getExpansionCount(CE);
3946       if (size != 0) {
3947         /*
3948         if there are less than 16 elements in expansion, we don't terminate
3949         */
3950         uint32_t count;
3951         for (count = 0; count < size; count++) {
3952           *(source->CEpos ++) = *CEOffset++;
3953         }
3954       }
3955       else {
3956         /* else, we do */
3957         while (*CEOffset != 0) {
3958           *(source->CEpos ++) = *CEOffset ++;
3959         }
3960       }
3961       source->toReturn = source->CEpos - 1;
3962       // in case of one element expansion, we
3963       // want to immediately return CEpos
3964       if(source->toReturn == source->CEs) {
3965         source->CEpos = source->CEs;
3966       }
3967       return *(source->toReturn);
3968      case DIGIT_TAG:
3969       {
3970       /*
3971          We do a check to see if we want to collate digits as numbers; if so we generate
3972          a custom collation key. Otherwise we pull out the value stored in the expansion table.
3973       */
3974       //uint32_t size;
3975       uint32_t i;    /* general counter */
3976       collIterateState state;
3977
3978       if (source->coll->numericCollation == UCOL_ON){
3979         UChar32 char32 = 0;
3980
3981         uint32_t digIndx = 0;
3982         uint32_t endIndex = 0;
3983         uint32_t leadingZeroIndex = 0;
3984         uint32_t trailingZeroCount = 0;
3985
3986         uint32_t primWeight = 0;
3987
3988         int32_t digVal = 0;
3989         uint8_t collateVal = 0;
3990
3991         UBool nonZeroValReached = FALSE;
3992
3993         uint8_t *numTempBuf;
3994         uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // I just need a temporary place to store my generated CEs.
3995         uint32_t numTempBufSize = UCOL_MAX_BUFFER;
3996
3997         numTempBuf = stackNumTempBuf;
3998         /*
3999              We parse the source string until we hit a char that's NOT a digit.
4000             Use this u_charDigitValue. This might be slow because we have to
4001             handle surrogates...
4002         */
4003
4004         if (U16_IS_TRAIL (ch)){
4005             if (!collIter_bos(source)){
4006               UChar lead = getPrevNormalizedChar(source);
4007               if(U16_IS_LEAD(lead)) {
4008                 char32 = U16_GET_SUPPLEMENTARY(lead,ch);
4009                 goBackOne(source);
4010               } else {
4011                 char32 = ch;
4012               }
4013             } else {
4014                 char32 = ch;
4015             }
4016         } else {
4017             char32 = ch;
4018         }
4019         digVal = u_charDigitValue(char32);
4020
4021         for(;;){
4022         // Make sure we have enough space.
4023         if (digIndx >= ((numTempBufSize - 2) * 2) + 1)
4024         {
4025             numTempBufSize *= 2;
4026             if (numTempBuf == stackNumTempBuf){
4027                 numTempBuf = (uint8_t *)uprv_malloc(sizeof(uint8_t) * numTempBufSize);
4028                 uprv_memcpy(numTempBuf, stackNumTempBuf, UCOL_MAX_BUFFER);
4029             }else
4030                 uprv_realloc(numTempBuf, numTempBufSize);
4031         }
4032
4033             // Skip over trailing zeroes, and keep a count of them.
4034             if (digVal != 0)
4035                     nonZeroValReached = TRUE;
4036             if (nonZeroValReached){
4037                 /*
4038                     We parse the digit string into base 100 numbers (this fits into a byte).
4039                     We only add to the buffer in twos, thus if we are parsing an odd character,
4040                     that serves as the 'tens' digit while the if we are parsing an even one, that
4041                     is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into
4042                     a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid
4043                     overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less
4044                     than all the other bytes.
4045
4046                     Since we're doing in this reverse we want to put the first digit encountered into the
4047                     ones place and the second digit encountered into the tens place.
4048                  */
4049
4050                 if ((digIndx + trailingZeroCount) % 2 == 1){
4051                     // High-order digit case (tens place)
4052                     collateVal += (uint8_t)(digVal * 10);
4053
4054                     // We cannot set leadingZeroIndex unless it has been set for the
4055                     // low-order digit. Therefore, all we can do for the high-order
4056                     // digit is turn it off, never on.
4057                     // The only time we will have a high digit without a low is for
4058                     // the very first non-zero digit, so no zero check is necessary.
4059                     if (collateVal != 0)
4060                         leadingZeroIndex = 0;
4061
4062                     numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6;
4063                     collateVal = 0;
4064                 }
4065                 else{
4066                     // Low-order digit case (ones place)
4067                     collateVal = (uint8_t)digVal;
4068
4069                     // Check for leading zeroes.
4070                     if (collateVal == 0)
4071                     {
4072                         if (!leadingZeroIndex)
4073                             leadingZeroIndex = (digIndx/2) + 2;
4074                     }
4075                     else
4076                         leadingZeroIndex = 0;
4077
4078                     // No need to write to buffer; the case of a last odd digit
4079                     // is handled below.
4080                 }
4081                 ++digIndx;
4082             }
4083             else
4084                 ++trailingZeroCount;
4085
4086             if (!collIter_bos(source)){
4087                 ch = getPrevNormalizedChar(source);
4088                 //goBackOne(source);
4089                 if (U16_IS_TRAIL(ch)){
4090                     backupState(source, &state);
4091                     if (!collIter_bos(source))
4092                     {
4093                         goBackOne(source);
4094                         UChar lead = getPrevNormalizedChar(source);
4095                         if(U16_IS_LEAD(lead)) {
4096                           char32 = U16_GET_SUPPLEMENTARY(lead,ch);
4097                         } else {
4098                           loadState(source, &state, FALSE);
4099                           char32 = ch;
4100                         }
4101                     }
4102                 }
4103                 else
4104                     char32 = ch;
4105
4106                 if ((digVal = u_charDigitValue(char32)) == -1){
4107                   if (char32 > 0xFFFF) {// For surrogates.
4108                     loadState(source, &state, FALSE);
4109                   }
4110                     // Don't need to "reverse" the goBackOne call,
4111                     // as this points to the next position to process..
4112                     //if (char32 > 0xFFFF) // For surrogates.
4113                         //getNextNormalizedChar(source);
4114                     break;
4115                 }
4116                 goBackOne(source);
4117             }else
4118                 break;
4119         }
4120
4121         if (nonZeroValReached == FALSE){
4122             digIndx = 2;
4123             trailingZeroCount = 0;
4124             numTempBuf[2] = 6;
4125         }
4126
4127         if ((digIndx + trailingZeroCount) % 2 != 0){
4128                 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
4129             digIndx += 1;       // The implicit leading zero
4130             }
4131         if (trailingZeroCount % 2 != 0){
4132             // We had to consume one trailing zero for the low digit
4133             // of the least significant byte
4134             digIndx += 1;       // The trailing zero not in the exponent
4135             trailingZeroCount -= 1;
4136         }
4137
4138         endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
4139
4140         // Subtract one off of the last byte. Really the first byte here, but it's reversed...
4141         numTempBuf[2] -= 1;
4142
4143         /*
4144             We want to skip over the first two slots in the buffer. The first slot
4145             is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the
4146             sign/exponent byte: 0x80 + (decimalPos/2) & 7f.
4147             The exponent must be adjusted by the number of leading zeroes, and the number of
4148             trailing zeroes.
4149         */
4150         numTempBuf[0] = UCOL_CODAN_PLACEHOLDER;
4151         uint32_t exponent = (digIndx+trailingZeroCount)/2;
4152         if (leadingZeroIndex)
4153             exponent -= ((digIndx/2) + 2 - leadingZeroIndex);
4154         numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F));
4155
4156         // Now transfer the collation key to our collIterate struct.
4157         // The total size for our collation key is endIndx bumped up to the next largest even value divided by two.
4158         //size = ((endIndex+1) & ~1)/2;
4159           *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
4160                 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
4161                 UCOL_BYTE_COMMON; // Tertiary weight.
4162           i = endIndex - 1; // Reset the index into the buffer.
4163           while(i >= 2)
4164           {
4165             primWeight = numTempBuf[i--] << 8;
4166             if ( i >= 2)
4167                 primWeight |= numTempBuf[i--];
4168             *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
4169           }
4170           if (numTempBuf != stackNumTempBuf)
4171             uprv_free(numTempBuf);
4172
4173           source->toReturn = source->CEpos -1;
4174           return *(source->toReturn);
4175       }
4176       else {
4177           CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4178           CE = *(CEOffset++);
4179           break;
4180 #if 0
4181         /* find the offset to expansion table */
4182           CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
4183           size     = getExpansionCount(CE);
4184           if (size != 0) {
4185             /*
4186             if there are less than 16 elements in expansion, we don't terminate
4187             */
4188             uint32_t count;
4189             for (count = 0; count < size; count++) {
4190               *(source->CEpos ++) = *CEOffset++;
4191             }
4192           }
4193           else {
4194             /* else, we do */
4195             while (*CEOffset != 0) {
4196               *(source->CEpos ++) = *CEOffset ++;
4197             }
4198           }
4199           source->toReturn = source->CEpos - 1;
4200           // in case of one element expansion, we
4201           // want to immediately return CEpos
4202           if(source->toReturn == source->CEs) {
4203             source->CEpos = source->CEs;
4204           }
4205           return *(source->toReturn);
4206 #endif
4207       }
4208       }
4209     case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
4210       {
4211         const uint32_t
4212           SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
4213         //const uint32_t LCount = 19;
4214         const uint32_t VCount = 21;
4215         const uint32_t TCount = 28;
4216         //const uint32_t NCount = VCount * TCount;   /* 588 */
4217         //const uint32_t SCount = LCount * NCount;   /* 11172 */
4218
4219         uint32_t L = ch - SBase;
4220         /*
4221         divide into pieces.
4222         we do it in this order since some compilers can do % and / in one
4223         operation
4224         */
4225         uint32_t T = L % TCount;
4226         L /= TCount;
4227         uint32_t V = L % VCount;
4228         L /= VCount;
4229
4230         /* offset them */
4231         L += LBase;
4232         V += VBase;
4233         T += TBase;
4234
4235         /*
4236         return the first CE, but first put the rest into the expansion buffer
4237         */
4238         if (!source->coll->image->jamoSpecial)
4239         {
4240           /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
4241           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
4242           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L);
4243           /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
4244           /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
4245           *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
4246           if (T != TBase)
4247             /**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
4248             /**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
4249             *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
4250
4251           source->toReturn = source->CEpos - 1;
4252           return *(source->toReturn);
4253         } else {
4254           // Since Hanguls pass the FCD check, it is
4255           // guaranteed that we won't be in
4256           // the normalization buffer if something like this happens
4257           // Move Jamos into normalization buffer
4258           /*
4259           Move the Jamos into the
4260           normalization buffer
4261           */
4262           UChar *tempbuffer = source->writableBuffer +
4263                               (source->writableBufSize - 1);
4264           *(tempbuffer) = 0;
4265           if (T != TBase) {
4266             *(tempbuffer - 1) = (UChar)T;
4267             *(tempbuffer - 2) = (UChar)V;
4268             *(tempbuffer - 3) = (UChar)L;
4269             *(tempbuffer - 4) = 0;
4270           } else {
4271             *(tempbuffer - 1) = (UChar)V;
4272             *(tempbuffer - 2) = (UChar)L;
4273             *(tempbuffer - 3) = 0;
4274           }
4275
4276           /*
4277           Indicate where to continue in main input string after exhausting
4278           the writableBuffer
4279           */
4280           if (source->pos  == source->string) {
4281             source->fcdPosition = NULL;
4282           } else {
4283             source->fcdPosition       = source->pos-1;
4284           }
4285
4286           source->pos               = tempbuffer;
4287           source->origFlags         = source->flags;
4288           source->flags            |= UCOL_ITER_INNORMBUF;
4289           source->flags            &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN);
4290
4291           return(UCOL_IGNORABLE);
4292         }
4293       }
4294     case LEAD_SURROGATE_TAG:  /* D800-DBFF*/
4295       return 0; /* broken surrogate sequence */
4296     case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
4297     {
4298       UChar32 cp = 0;
4299       UChar  prevChar;
4300       UChar *prev;
4301       if (isAtStartPrevIterate(source)) {
4302           /* we are at the start of the string, wrong place to be at */
4303           return 0;
4304       }
4305       if (source->pos != source->writableBuffer) {
4306           prev     = source->pos - 1;
4307       } else {
4308           prev     = source->fcdPosition;
4309       }
4310       prevChar = *prev;
4311
4312       /* Handles Han and Supplementary characters here.*/
4313       if (UTF_IS_FIRST_SURROGATE(prevChar)) {
4314         cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
4315         source->pos = prev;
4316       } else {
4317         return 0; /* completely ignorable */
4318       }
4319       return getPrevImplicit(cp, source);
4320     }
4321     // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
4322     case CJK_IMPLICIT_TAG:    /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
4323       return getPrevImplicit(ch, source);
4324     case IMPLICIT_TAG:        /* everything that is not defined otherwise */
4325       return getPrevImplicit(ch, source);
4326       /* UCA is filled with these. Tailorings are NOT_FOUND */
4327     /* not yet implemented */
4328     case CHARSET_TAG:  /* this tag always returns */
4329       /* probably after 1.8 */
4330       return UCOL_NOT_FOUND;
4331     default:           /* this tag always returns */
4332       *status = U_INTERNAL_PROGRAM_ERROR;
4333       CE=0;
4334       break;
4335     }
4336     if (CE <= UCOL_NOT_FOUND) {
4337       break;
4338     }
4339   }
4340   return CE;
4341 }
4342
4343 /* This should really be a macro        */
4344 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
4345 /* anyway */
4346 static
4347 uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) {
4348 #ifdef UCOL_DEBUG
4349   fprintf(stderr, ".");
4350 #endif
4351   uint8_t *newStart = NULL;
4352   uint32_t offset = *secondaries-secStart;
4353
4354   if(secStart==second) {
4355     newStart=(uint8_t*)uprv_malloc(newSize);
4356     if(newStart==NULL) {
4357       *status = U_MEMORY_ALLOCATION_ERROR;
4358       return NULL;
4359     }
4360     uprv_memcpy(newStart, secStart, *secondaries-secStart);
4361   } else {
4362     newStart=(uint8_t*)uprv_realloc(secStart, newSize);
4363     if(newStart==NULL) {
4364       *status = U_MEMORY_ALLOCATION_ERROR;
4365       return NULL;
4366     }
4367   }
4368   *secondaries=newStart+offset;
4369   *secSize=newSize;
4370   return newStart;
4371 }
4372
4373
4374 /* This should really be a macro                                                                      */
4375 /* This function is used to reverse parts of a buffer. We need this operation when doing continuation */
4376 /* secondaries in French                                                                              */
4377 /*
4378 void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) {
4379   uint8_t temp;
4380   while(start<end) {
4381     temp = *start;
4382     *start++ = *end;
4383     *end-- = temp;
4384   }
4385 }
4386 */
4387
/*
 * Reverses, in place, the elements lying between two cursors (both inclusive).
 * TYPE is the element type. start and end must be modifiable pointer lvalues:
 * the macro itself advances start and retreats end until they meet or cross,
 * so both are changed by the time it finishes.
 * The expansion is a bare brace block, not a do { } while(0) wrapper.
 */
#define uprv_ucol_reverse_buffer(TYPE, start, end) { \
  while((start)<(end)) { \
    TYPE swapTmp = *(start); \
    *(start)++ = *(end); \
    *(end)-- = swapTmp; \
  } \
}
4396
4397 /****************************************************************************/
4398 /* Following are the sortkey generation functions                           */
4399 /*                                                                          */
4400 /****************************************************************************/
4401
4402 /**
4403  * Merge two sort keys.
4404  * This is useful, for example, to combine sort keys from first and last names
4405  * to sort such pairs.
4406  * Merged sort keys consider on each collation level the first part first entirely,
4407  * then the second one.
4408  * It is possible to merge multiple sort keys by consecutively merging
4409  * another one with the intermediate result.
4410  *
4411  * The length of the merge result is the sum of the lengths of the input sort keys
4412  * minus 1.
4413  *
4414  * @param src1 the first sort key
4415  * @param src1Length the length of the first sort key, including the zero byte at the end;
4416  *        can be -1 if the function is to find the length
4417  * @param src2 the second sort key
4418  * @param src2Length the length of the second sort key, including the zero byte at the end;
4419  *        can be -1 if the function is to find the length
4420  * @param dest the buffer where the merged sort key is written,
4421  *        can be NULL if destCapacity==0
4422  * @param destCapacity the number of bytes in the dest buffer
4423  * @return the length of the merged sort key, src1Length+src2Length-1;
4424  *         can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments),
4425  *         in which cases the contents of dest is undefined
4426  *
4427  * @draft
4428  */
4429 U_CAPI int32_t U_EXPORT2
4430 ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length,
4431                    const uint8_t *src2, int32_t src2Length,
4432                    uint8_t *dest, int32_t destCapacity) {
4433     int32_t destLength;
4434     uint8_t b;
4435
4436     /* check arguments */
4437     if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) ||
4438         src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) ||
4439         destCapacity<0 || (destCapacity>0 && dest==NULL)
4440     ) {
4441         /* error, attempt to write a zero byte and return 0 */
4442         if(dest!=NULL && destCapacity>0) {
4443             *dest=0;
4444         }
4445         return 0;
4446     }
4447
4448     /* check lengths and capacity */
4449     if(src1Length<0) {
4450         src1Length=(int32_t)uprv_strlen((const char *)src1)+1;
4451     }
4452     if(src2Length<0) {
4453         src2Length=(int32_t)uprv_strlen((const char *)src2)+1;
4454     }
4455
4456     destLength=src1Length+src2Length-1;
4457     if(destLength>destCapacity) {
4458         /* the merged sort key does not fit into the destination */
4459         return destLength;
4460     }
4461
4462     /* merge the sort keys with the same number of levels */
4463     while(*src1!=0 && *src2!=0) { /* while both have another level */
4464         /* copy level from src1 not including 00 or 01 */
4465         while((b=*src1)>=2) {
4466             ++src1;
4467             *dest++=b;
4468         }
4469
4470         /* add a 02 merge separator */
4471         *dest++=2;
4472
4473         /* copy level from src2 not including 00 or 01 */
4474         while((b=*src2)>=2) {
4475             ++src2;
4476             *dest++=b;
4477         }
4478
4479         /* if both sort keys have another level, then add a 01 level separator and continue */
4480         if(*src1==1 && *src2==1) {
4481             ++src1;
4482             ++src2;
4483             *dest++=1;
4484         }
4485     }
4486
4487     /*
4488      * here, at least one sort key is finished now, but the other one
4489      * might have some contents left from containing more levels;
4490      * that contents is just appended to the result
4491      */
4492     if(*src1!=0) {
4493         /* src1 is not finished, therefore *src2==0, and src1 is appended */
4494         src2=src1;
4495     }
4496     /* append src2, "the other, unfinished sort key" */
4497     uprv_strcpy((char *)dest, (const char *)src2);
4498
4499     /* trust that neither sort key contained illegally embedded zero bytes */
4500     return destLength;
4501 }
4502
4503 /* sortkey API */
4504 U_CAPI int32_t U_EXPORT2
4505 ucol_getSortKey(const    UCollator    *coll,
4506         const    UChar        *source,
4507         int32_t        sourceLength,
4508         uint8_t        *result,
4509         int32_t        resultLength)
4510 {
4511   UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY);
4512   if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
4513       int32_t actualSrcLen = sourceLength;
4514       if (actualSrcLen==-1 && source!=NULL) {
4515           actualSrcLen = u_strlen(source);
4516       }
4517       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, actualSrcLen);
4518   }
4519
4520   UErrorCode status = U_ZERO_ERROR;
4521   int32_t keySize   = 0;
4522
4523   if(source != NULL) {
4524       // source == NULL is actually an error situation, but we would need to
4525       // have an error code to return it. Until we introduce a new
4526       // API, it stays like this
4527
4528       /* this uses the function pointer that is set in updateinternalstate */
4529       /* currently, there are two funcs: */
4530       /*ucol_calcSortKey(...);*/
4531       /*ucol_calcSortKeySimpleTertiary(...);*/
4532
4533       keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status);
4534       //((UCollator *)coll)->errorCode = status; /*semantically const */
4535   }
4536   UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize);
4537   UTRACE_EXIT_STATUS(status);
4538   return keySize;
4539 }
4540
4541 /* this function is called by the C++ API for sortkey generation */
4542 U_CFUNC int32_t
4543 ucol_getSortKeyWithAllocation(const UCollator *coll,
4544                               const UChar *source, int32_t sourceLength,
4545                               uint8_t **pResult,
4546                               UErrorCode *pErrorCode) {
4547     *pResult = 0;
4548     return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode);
4549 }
4550
4551 #define UCOL_FSEC_BUF_SIZE 256
4552
4553 /* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0  */
4554 /* or if we run out of space while making a sortkey and want to return ASAP                                   */
4555 int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
4556     UErrorCode status = U_ZERO_ERROR;
4557     const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4558     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4559     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4560     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4561     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4562     UBool  doCase = (coll->caseLevel == UCOL_ON);
4563     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4564     //UBool  qShifted = shifted  && (compareQuad == 0);
4565     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4566     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4567     uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE];
4568     uint8_t *fSecs = fSecsBuff;
4569     uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE;
4570     uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL;
4571
4572     uint32_t variableTopValue = coll->variableTopValue;
4573     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4574     if(doHiragana) {
4575       UCOL_COMMON_BOT4++;
4576       /* allocate one more space for hiragana */
4577     }
4578     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4579
4580     uint32_t order = UCOL_NO_MORE_CES;
4581     uint8_t primary1 = 0;
4582     uint8_t primary2 = 0;
4583     uint8_t secondary = 0;
4584     uint8_t tertiary = 0;
4585     int32_t caseShift = 0;
4586     uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */
4587
4588     uint8_t caseSwitch = coll->caseSwitch;
4589     uint8_t tertiaryMask = coll->tertiaryMask;
4590     uint8_t tertiaryCommon = coll->tertiaryCommon;
4591
4592     UBool wasShifted = FALSE;
4593     UBool notIsContinuation = FALSE;
4594     uint8_t leadPrimary = 0;
4595
4596
4597     for(;;) {
4598           order = ucol_IGetNextCE(coll, s, &status);
4599           if(order == UCOL_NO_MORE_CES) {
4600               break;
4601           }
4602
4603           if(order == 0) {
4604             continue;
4605           }
4606
4607           notIsContinuation = !isContinuation(order);
4608
4609
4610           if(notIsContinuation) {
4611             tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK));
4612           } else {
4613             tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
4614           }
4615           secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4616           primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
4617           primary1 = (uint8_t)(order >> 8);
4618
4619
4620           if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
4621             || (!notIsContinuation && wasShifted))
4622             || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
4623             /* and other ignorables should be removed if following a shifted code point */
4624             if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
4625                                 /* we should just completely ignore it */
4626               continue;
4627             }
4628             if(compareQuad == 0) {
4629               if(c4 > 0) {
4630                 currentSize += (c2/UCOL_BOT_COUNT4)+1;
4631                 c4 = 0;
4632               }
4633               currentSize++;
4634               if(primary2 != 0) {
4635                 currentSize++;
4636               }
4637             }
4638             wasShifted = TRUE;
4639           } else {
4640             wasShifted = FALSE;
4641             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
4642             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
4643             /* calculate sortkey size */
4644             if(primary1 != UCOL_IGNORABLE) {
4645               if(notIsContinuation) {
4646                 if(leadPrimary == primary1) {
4647                   currentSize++;
4648                 } else {
4649                   if(leadPrimary != 0) {
4650                     currentSize++;
4651                   }
4652                   if(primary2 == UCOL_IGNORABLE) {
4653                   /* one byter, not compressed */
4654                       currentSize++;
4655                       leadPrimary = 0;
4656                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
4657                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
4658                       (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
4659                   /* not compressible */
4660                       leadPrimary = 0;
4661                       currentSize+=2;
4662                   } else { /* compress */
4663                       leadPrimary = primary1;
4664                       currentSize+=2;
4665                   }
4666                 }
4667               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
4668                 currentSize++;
4669                 if(primary2 != UCOL_IGNORABLE) {
4670                   currentSize++;
4671                 }
4672               }
4673             }
4674
4675             if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */
4676               if(!isFrenchSec){
4677                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
4678                   c2++;
4679                 } else {
4680                   if(c2 > 0) {
4681                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4682                       currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1;
4683                     } else {
4684                       currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1;
4685                     }
4686                     c2 = 0;
4687                   }
4688                   currentSize++;
4689                 }
4690               } else {
4691                 fSecs[fSecsLen++] = secondary;
4692                 if(fSecsLen == fSecsMaxLen) {
4693                   if(fSecs == fSecsBuff) {
4694                     fSecs = (uint8_t *)uprv_malloc(2*fSecsLen);
4695                   } else {
4696                     fSecs = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen);
4697                   }
4698                   if(fSecs == NULL) {
4699                     status = U_MEMORY_ALLOCATION_ERROR;
4700                     return -1;
4701                   }
4702                   fSecsMaxLen *= 2;
4703                 }
4704                 if(notIsContinuation) {
4705                   if (frenchStartPtr != NULL) {
4706                       /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
4707                     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4708                     frenchStartPtr = NULL;
4709                   }
4710                 } else {
4711                   if (frenchStartPtr == NULL) {
4712                     frenchStartPtr = fSecs+fSecsLen-2;
4713                   }
4714                   frenchEndPtr = fSecs+fSecsLen-1;
4715                 }
4716               }
4717             }
4718
4719             if(doCase) {
4720               if (caseShift  == 0) {
4721                 currentSize++;
4722                 caseShift = UCOL_CASE_SHIFT_START;
4723               }
4724               if((tertiary&0x3F) > 0 && notIsContinuation) {
4725                 caseShift--;
4726                 if((tertiary &0xC0) != 0) {
4727                   if (caseShift  == 0) {
4728                     currentSize++;
4729                     caseShift = UCOL_CASE_SHIFT_START;
4730                   }
4731                   caseShift--;
4732                 }
4733               }
4734             } else {
4735               if(notIsContinuation) {
4736                 tertiary ^= caseSwitch;
4737               }
4738             }
4739
4740             tertiary &= tertiaryMask;
4741             if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */
4742               if (tertiary == tertiaryCommon && notIsContinuation) {
4743                 c3++;
4744               } else {
4745                 if(c3 > 0) {
4746                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
4747                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
4748                     currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1;
4749                   } else {
4750                     currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1;
4751                   }
4752                   c3 = 0;
4753                 }
4754                 currentSize++;
4755               }
4756             }
4757
4758             if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
4759               if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
4760                 if(c4>0) { // Close this part
4761                   currentSize += (c4/UCOL_BOT_COUNT4)+1;
4762                   c4 = 0;
4763                 }
4764                 currentSize++; // Add the Hiragana
4765               } else { // This wasn't Hiragana, so we can continue adding stuff
4766                 c4++;
4767               }
4768             }
4769
4770           }
4771     }
4772
4773     if(!isFrenchSec){
4774       if(c2 > 0) {
4775         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4776       }
4777     } else {
4778       uint32_t i = 0;
4779       if(frenchStartPtr != NULL) {
4780         uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4781       }
4782       for(i = 0; i<fSecsLen; i++) {
4783         secondary = *(fSecs+fSecsLen-i-1);
4784         /* This is compression code. */
4785         if (secondary == UCOL_COMMON2) {
4786           ++c2;
4787         } else {
4788           if(c2 > 0) {
4789             if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4790               currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0);
4791             } else {
4792               currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4793             }
4794             c2 = 0;
4795           }
4796           currentSize++;
4797         }
4798       }
4799       if(c2 > 0) {
4800         currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0);
4801       }
4802       if(fSecs != fSecsBuff) {
4803         uprv_free(fSecs);
4804       }
4805     }
4806
4807     if(c3 > 0) {
4808       currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0);
4809     }
4810
4811     if(c4 > 0  && compareQuad == 0) {
4812       currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0);
4813     }
4814
4815     if(compareIdent) {
4816       currentSize += u_lengthOfIdenticalLevelRun(s->string, len);
4817     }
4818     return currentSize;
4819
4820 }
4821
4822 static
4823 inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) {
4824   if (caseShift  == 0) {
4825     *(*cases)++ = UCOL_CASE_BYTE_START;
4826     caseShift = UCOL_CASE_SHIFT_START;
4827   }
4828 }
4829
// Counts one more value and stores it only while the write position is still
// below the limit. The counter therefore tells how many values were requested,
// even when some of them did not fit and were dropped.
static
inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) {
  ++size;
  if(primaries >= limit) {
    // no room left: the value is counted but not written
    return;
  }
  *primaries = value;
  ++primaries;
}
4839
4840 // Packs the secondary buffer when processing French locale. Adds the terminator.
4841 static
4842 inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) {
4843   uint8_t secondary;
4844   int32_t count2 = 0;
4845   uint32_t i = 0, size = 0;
4846   // we use i here since the key size already accounts for terminators, so we'll discard the increment
4847   addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR);
4848   /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */
4849   if(frenchStartPtr != NULL) {
4850     uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
4851   }
4852   for(i = 0; i<*secsize; i++) {
4853     secondary = *(secondaries-i-1);
4854     /* This is compression code. */
4855     if (secondary == UCOL_COMMON2) {
4856       ++count2;
4857     } else {
4858       if (count2 > 0) {
4859         if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
4860           while (count2 > UCOL_TOP_COUNT2) {
4861             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2));
4862             count2 -= (uint32_t)UCOL_TOP_COUNT2;
4863           }
4864           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)));
4865         } else {
4866           while (count2 > UCOL_BOT_COUNT2) {
4867             addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4868             count2 -= (uint32_t)UCOL_BOT_COUNT2;
4869           }
4870           addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4871         }
4872         count2 = 0;
4873       }
4874       addWithIncrement(primaries, primEnd, size, secondary);
4875     }
4876   }
4877   if (count2 > 0) {
4878     while (count2 > UCOL_BOT_COUNT2) {
4879       addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2));
4880       count2 -= (uint32_t)UCOL_BOT_COUNT2;
4881     }
4882     addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)));
4883   }
4884   *secsize = size;
4885   return primaries;
4886 }
4887
4888 /* This is the sortkey work horse function */
4889 U_CFUNC int32_t U_CALLCONV
4890 ucol_calcSortKey(const    UCollator    *coll,
4891         const    UChar        *source,
4892         int32_t        sourceLength,
4893         uint8_t        **result,
4894         uint32_t        resultLength,
4895         UBool allocateSKBuffer,
4896         UErrorCode *status)
4897 {
4898     const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
4899
4900     uint32_t i = 0; /* general purpose counter */
4901
4902     /* Stack allocated buffers for buffers we use */
4903     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER];
4904
4905     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad;
4906
4907     if(U_FAILURE(*status)) {
4908       return 0;
4909     }
4910
4911     if(primaries == NULL && allocateSKBuffer == TRUE) {
4912         primaries = *result = prim;
4913         resultLength = UCOL_PRIMARY_MAX_BUFFER;
4914     }
4915
4916     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER,
4917       caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER;
4918
4919     uint32_t sortKeySize = 1; /* it is always \0 terminated */
4920
4921     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
4922     UChar *normSource = normBuffer;
4923     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
4924
4925     int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
4926
4927     UColAttributeValue strength = coll->strength;
4928
4929     uint8_t compareSec   = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF);
4930     uint8_t compareTer   = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF);
4931     uint8_t compareQuad  = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF);
4932     UBool  compareIdent = (strength == UCOL_IDENTICAL);
4933     UBool  doCase = (coll->caseLevel == UCOL_ON);
4934     UBool  isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0);
4935     UBool  shifted = (coll->alternateHandling == UCOL_SHIFTED);
4936     //UBool  qShifted = shifted && (compareQuad == 0);
4937     UBool  doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0);
4938     const uint8_t *scriptOrder = coll->scriptOrder;
4939
4940     uint32_t variableTopValue = coll->variableTopValue;
4941     // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no
4942     // qShifted, we don't need to set UCOL_COMMON_BOT4 so high.
4943     uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1);
4944     uint8_t UCOL_HIRAGANA_QUAD = 0;
4945     if(doHiragana) {
4946       UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++;
4947       /* allocate one more space for hiragana, value for hiragana */
4948     }
4949     uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4);
4950
4951     /* support for special features like caselevel and funky secondaries */
4952     uint8_t *frenchStartPtr = NULL;
4953     uint8_t *frenchEndPtr = NULL;
4954     uint32_t caseShift = 0;
4955
4956     sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0));
4957
4958     /* If we need to normalize, we'll do it all at once at the beginning! */
4959     UNormalizationMode normMode;
4960     if(compareIdent) {
4961         normMode = UNORM_NFD;
4962     } else if(coll->normalizationMode != UCOL_OFF) {
4963         normMode = UNORM_FCD;
4964     } else {
4965         normMode = UNORM_NONE;
4966     }
4967
4968     if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) {
4969         len = unorm_internalNormalize(normSource, normSourceLen,
4970                                       source, len,
4971                                       normMode, FALSE,
4972                                       status);
4973         if(*status == U_BUFFER_OVERFLOW_ERROR) {
4974             normSourceLen = len;
4975             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
4976             if(normSource == NULL) {
4977                 *status = U_MEMORY_ALLOCATION_ERROR;
4978                 return 0;
4979             }
4980             *status = U_ZERO_ERROR;
4981             len = unorm_internalNormalize(normSource, normSourceLen,
4982                                           source, len,
4983                                           normMode, FALSE,
4984                                           status);
4985         }
4986
4987         if(U_FAILURE(*status)) {
4988             return 0;
4989         }
4990         source = normSource;
4991     }
4992
4993     collIterate s;
4994     IInit_collIterate(coll, (UChar *)source, len, &s);
4995     if(source == normSource) {
4996         s.flags &= ~UCOL_ITER_NORM;
4997     }
4998
4999     if(resultLength == 0 || primaries == NULL) {
5000       int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5001       if(normSource != normBuffer) {
5002           uprv_free(normSource);
5003       }
5004       return keyLen;
5005     }
5006     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5007
5008     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5009
5010     uint8_t *primStart = primaries;
5011     uint8_t *secStart = secondaries;
5012     uint8_t *terStart = tertiaries;
5013     uint8_t *caseStart = cases;
5014     uint8_t *quadStart = quads;
5015
5016     uint32_t order = 0;
5017
5018     uint8_t primary1 = 0;
5019     uint8_t primary2 = 0;
5020     uint8_t secondary = 0;
5021     uint8_t tertiary = 0;
5022     uint8_t caseSwitch = coll->caseSwitch;
5023     uint8_t tertiaryMask = coll->tertiaryMask;
5024     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5025     uint8_t tertiaryTop = coll->tertiaryTop;
5026     uint8_t tertiaryBottom = coll->tertiaryBottom;
5027     uint8_t tertiaryCommon = coll->tertiaryCommon;
5028     uint8_t caseBits = 0;
5029
5030     UBool finished = FALSE;
5031     UBool wasShifted = FALSE;
5032     UBool notIsContinuation = FALSE;
5033
5034     uint32_t prevBuffSize = 0;
5035
5036     uint32_t count2 = 0, count3 = 0, count4 = 0;
5037     uint8_t leadPrimary = 0;
5038
5039     for(;;) {
5040         for(i=prevBuffSize; i<minBufferSize; ++i) {
5041
5042             order = ucol_IGetNextCE(coll, &s, status);
5043             if(order == UCOL_NO_MORE_CES) {
5044                 finished = TRUE;
5045                 break;
5046             }
5047
5048             if(order == 0) {
5049               continue;
5050             }
5051
5052             notIsContinuation = !isContinuation(order);
5053
5054             if(notIsContinuation) {
5055               tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK);
5056             } else {
5057               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5058             }
5059
5060             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5061             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5062             primary1 = (uint8_t)(order >> 8);
5063
5064             if(notIsContinuation) {
5065               if(scriptOrder != NULL) {
5066                 primary1 = scriptOrder[primary1];
5067               }
5068             }
5069
5070             if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0)
5071               || (!notIsContinuation && wasShifted))
5072               || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5073               /* and other ignorables should be removed if following a shifted code point */
5074               if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */
5075                                   /* we should just completely ignore it */
5076                 continue;
5077               }
5078               if(compareQuad == 0) {
5079                 if(count4 > 0) {
5080                   while (count4 > UCOL_BOT_COUNT4) {
5081                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5082                     count4 -= UCOL_BOT_COUNT4;
5083                   }
5084                   *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5085                   count4 = 0;
5086                 }
5087                 /* We are dealing with a variable and we're treating them as shifted */
5088                 /* This is a shifted ignorable */
5089                 if(primary1 != 0) { /* we need to check this since we could be in continuation */
5090                   *quads++ = primary1;
5091                 }
5092                 if(primary2 != 0) {
5093                   *quads++ = primary2;
5094                 }
5095               }
5096               wasShifted = TRUE;
5097             } else {
5098               wasShifted = FALSE;
5099               /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5100               /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5101               /* regular and simple sortkey calc */
5102               if(primary1 != UCOL_IGNORABLE) {
5103                 if(notIsContinuation) {
5104                   if(leadPrimary == primary1) {
5105                     *primaries++ = primary2;
5106                   } else {
5107                     if(leadPrimary != 0) {
5108                       *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5109                     }
5110                     if(primary2 == UCOL_IGNORABLE) {
5111                     /* one byter, not compressed */
5112                         *primaries++ = primary1;
5113                         leadPrimary = 0;
5114                     } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5115                         (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5116                     /* not compressible */
5117                         leadPrimary = 0;
5118                         *primaries++ = primary1;
5119                         *primaries++ = primary2;
5120                     } else { /* compress */
5121                         *primaries++ = leadPrimary = primary1;
5122                         *primaries++ = primary2;
5123                     }
5124                   }
5125                 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5126                   *primaries++ = primary1;
5127                   if(primary2 != UCOL_IGNORABLE) {
5128                     *primaries++ = primary2; /* second part */
5129                   }
5130                 }
5131               }
5132
5133             if(secondary > compareSec) {
5134               if(!isFrenchSec) {
5135                 /* This is compression code. */
5136                 if (secondary == UCOL_COMMON2 && notIsContinuation) {
5137                   ++count2;
5138                 } else {
5139                   if (count2 > 0) {
5140                     if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5141                       while (count2 > UCOL_TOP_COUNT2) {
5142                         *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5143                         count2 -= (uint32_t)UCOL_TOP_COUNT2;
5144                       }
5145                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5146                     } else {
5147                       while (count2 > UCOL_BOT_COUNT2) {
5148                         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5149                         count2 -= (uint32_t)UCOL_BOT_COUNT2;
5150                       }
5151                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5152                     }
5153                     count2 = 0;
5154                   }
5155                   *secondaries++ = secondary;
5156                 }
5157               } else {
5158                   *secondaries++ = secondary;
5159                   /* Do the special handling for French secondaries */
5160                   /* We need to get continuation elements and do intermediate restore */
5161                   /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */
5162                   if(notIsContinuation) {
5163                     if (frenchStartPtr != NULL) {
5164                         /* reverse secondaries from frenchStartPtr up to frenchEndPtr */
5165                       uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr);
5166                       frenchStartPtr = NULL;
5167                     }
5168                   } else {
5169                     if (frenchStartPtr == NULL) {
5170                       frenchStartPtr = secondaries - 2;
5171                     }
5172                     frenchEndPtr = secondaries-1;
5173                   }
5174                 }
5175               }
5176
5177               if(doCase) {
5178                 doCaseShift(&cases, caseShift);
5179                 if(notIsContinuation) {
5180                   caseBits = (uint8_t)(tertiary & 0xC0);
5181
5182                   if(tertiary != 0) {
5183                     if(coll->caseFirst == UCOL_UPPER_FIRST) {
5184                       if((caseBits & 0xC0) == 0) {
5185                         *(cases-1) |= 1 << (--caseShift);
5186                       } else {
5187                         *(cases-1) |= 0 << (--caseShift);
5188                         /* second bit */
5189                         doCaseShift(&cases, caseShift);
5190                         *(cases-1) |= ((caseBits>>6)&1) << (--caseShift);
5191                       }
5192                     } else {
5193                       if((caseBits & 0xC0) == 0) {
5194                         *(cases-1) |= 0 << (--caseShift);
5195                       } else {
5196                         *(cases-1) |= 1 << (--caseShift);
5197                         /* second bit */
5198                         doCaseShift(&cases, caseShift);
5199                         *(cases-1) |= ((caseBits>>7)&1) << (--caseShift);
5200                       }
5201                     }
5202                   }
5203
5204                 }
5205               } else {
5206                 if(notIsContinuation) {
5207                   tertiary ^= caseSwitch;
5208                 }
5209               }
5210
5211               tertiary &= tertiaryMask;
5212               if(tertiary > compareTer) {
5213                 /* This is compression code. */
5214                 /* sequence size check is included in the if clause */
5215                 if (tertiary == tertiaryCommon && notIsContinuation) {
5216                   ++count3;
5217                 } else {
5218                   if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL)
5219                     || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) {
5220                     tertiary += tertiaryAddition;
5221                   }
5222                   if (count3 > 0) {
5223                     if ((tertiary > tertiaryCommon)) {
5224                       while (count3 > coll->tertiaryTopCount) {
5225                         *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5226                         count3 -= (uint32_t)coll->tertiaryTopCount;
5227                       }
5228                       *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5229                     } else {
5230                       while (count3 > coll->tertiaryBottomCount) {
5231                         *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5232                         count3 -= (uint32_t)coll->tertiaryBottomCount;
5233                       }
5234                       *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5235                     }
5236                     count3 = 0;
5237                   }
5238                   *tertiaries++ = tertiary;
5239                 }
5240               }
5241
5242               if(/*qShifted*/(compareQuad==0)  && notIsContinuation) {
5243                 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
5244                   if(count4>0) { // Close this part
5245                     while (count4 > UCOL_BOT_COUNT4) {
5246                       *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5247                       count4 -= UCOL_BOT_COUNT4;
5248                     }
5249                     *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5250                     count4 = 0;
5251                   }
5252                   *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana
5253                 } else { // This wasn't Hiragana, so we can continue adding stuff
5254                   count4++;
5255                 }
5256               }
5257             }
5258
5259             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5260               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5261                 IInit_collIterate(coll, (UChar *)source, len, &s);
5262                 if(source == normSource) {
5263                     s.flags &= ~UCOL_ITER_NORM;
5264                 }
5265                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5266                 *status = U_BUFFER_OVERFLOW_ERROR;
5267                 finished = TRUE;
5268                 break;
5269               } else { /* It's much nicer if we can actually reallocate */
5270                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart);
5271                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5272                 if(U_SUCCESS(*status)) {
5273                   *result = primStart;
5274                   primarySafeEnd = primStart + resultLength - 2;
5275                 } else {
5276                   IInit_collIterate(coll, (UChar *)source, len, &s);
5277                   if(source == normSource) {
5278                       s.flags &= ~UCOL_ITER_NORM;
5279                   }
5280                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5281                   finished = TRUE;
5282                   break;
5283                 }
5284               }
5285             }
5286         }
5287         if(finished) {
5288             break;
5289         } else {
5290           prevBuffSize = minBufferSize;
5291           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5292           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5293           caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status);
5294           quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status);
5295           minBufferSize *= 2;
5296           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5297             IInit_collIterate(coll, (UChar *)source, len, &s);
5298             if(source == normSource) {
5299                 s.flags &= ~UCOL_ITER_NORM;
5300             }
5301             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
5302             break;
5303           }
5304         }
5305     }
5306
5307     /* Here, we are generally done with processing */
5308     /* bailing out would not be too productive */
5309
5310     if(U_SUCCESS(*status)) {
5311       sortKeySize += (primaries - primStart);
5312       /* we have done all the CE's, now let's put them together to form a key */
5313       if(compareSec == 0) {
5314         if (count2 > 0) {
5315           while (count2 > UCOL_BOT_COUNT2) {
5316             *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5317             count2 -= (uint32_t)UCOL_BOT_COUNT2;
5318           }
5319           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5320         }
5321         uint32_t secsize = secondaries-secStart;
5322         if(!isFrenchSec) { // Regular situation, we know the length of secondaries
5323           sortKeySize += secsize;
5324           if(sortKeySize <= resultLength) {
5325             *(primaries++) = UCOL_LEVELTERMINATOR;
5326             uprv_memcpy(primaries, secStart, secsize);
5327             primaries += secsize;
5328           } else {
5329             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5330               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5331               if(U_SUCCESS(*status)) {
5332                 *result = primStart;
5333                 *(primaries++) = UCOL_LEVELTERMINATOR;
5334                 uprv_memcpy(primaries, secStart, secsize);
5335                 primaries += secsize;
5336               }
5337             } else {
5338               *status = U_BUFFER_OVERFLOW_ERROR;
5339             }
5340           }
5341         } else { // French secondary is on. We will need to pack French. packFrench will add the level terminator
5342           uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5343           sortKeySize += secsize;
5344           if(sortKeySize <= resultLength) { // if we managed to pack fine
5345             primaries = newPrim; // update the primary pointer
5346           } else { // overflow, need to reallocate and redo
5347             if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */
5348               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5349               if(U_SUCCESS(*status)) {
5350                 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr);
5351               }
5352             } else {
5353               *status = U_BUFFER_OVERFLOW_ERROR;
5354             }
5355           }
5356         }
5357       }
5358
5359       if(doCase) {
5360         uint32_t casesize = cases - caseStart;
5361         sortKeySize += casesize;
5362         if(sortKeySize <= resultLength) {
5363           *(primaries++) = UCOL_LEVELTERMINATOR;
5364           uprv_memcpy(primaries, caseStart, casesize);
5365           primaries += casesize;
5366         } else {
5367           if(allocateSKBuffer == TRUE) {
5368             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5369             if(U_SUCCESS(*status)) {
5370               *result = primStart;
5371               *(primaries++) = UCOL_LEVELTERMINATOR;
5372               uprv_memcpy(primaries, caseStart, casesize);
5373             }
5374           } else {
5375             *status = U_BUFFER_OVERFLOW_ERROR;
5376           }
5377         }
5378       }
5379
5380       if(compareTer == 0) {
5381         if (count3 > 0) {
5382           if (coll->tertiaryCommon != UCOL_COMMON_BOT3) {
5383             while (count3 >= coll->tertiaryTopCount) {
5384               *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5385               count3 -= (uint32_t)coll->tertiaryTopCount;
5386             }
5387             *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5388           } else {
5389             while (count3 > coll->tertiaryBottomCount) {
5390               *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5391               count3 -= (uint32_t)coll->tertiaryBottomCount;
5392             }
5393             *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5394           }
5395         }
5396         uint32_t tersize = tertiaries - terStart;
5397         sortKeySize += tersize;
5398         if(sortKeySize <= resultLength) {
5399           *(primaries++) = UCOL_LEVELTERMINATOR;
5400           uprv_memcpy(primaries, terStart, tersize);
5401           primaries += tersize;
5402         } else {
5403           if(allocateSKBuffer == TRUE) {
5404             primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5405             if(U_SUCCESS(*status)) {
5406               *result = primStart;
5407               *(primaries++) = UCOL_LEVELTERMINATOR;
5408               uprv_memcpy(primaries, terStart, tersize);
5409             }
5410           } else {
5411             *status = U_BUFFER_OVERFLOW_ERROR;
5412           }
5413         }
5414
5415         if(compareQuad == 0/*qShifted == TRUE*/) {
5416             if(count4 > 0) {
5417               while (count4 > UCOL_BOT_COUNT4) {
5418                 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4);
5419                 count4 -= UCOL_BOT_COUNT4;
5420               }
5421               *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1));
5422             }
5423             uint32_t quadsize = quads - quadStart;
5424             sortKeySize += quadsize;
5425             if(sortKeySize <= resultLength) {
5426               *(primaries++) = UCOL_LEVELTERMINATOR;
5427               uprv_memcpy(primaries, quadStart, quadsize);
5428               primaries += quadsize;
5429             } else {
5430               if(allocateSKBuffer == TRUE) {
5431                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5432                 if(U_SUCCESS(*status)) {
5433                   *result = primStart;
5434                   *(primaries++) = UCOL_LEVELTERMINATOR;
5435                   uprv_memcpy(primaries, quadStart, quadsize);
5436                 }
5437               } else {
5438                 *status = U_BUFFER_OVERFLOW_ERROR;
5439               }
5440             }
5441         }
5442
5443         if(compareIdent) {
5444           sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len);
5445           if(sortKeySize <= resultLength) {
5446             *(primaries++) = UCOL_LEVELTERMINATOR;
5447             primaries += u_writeIdenticalLevelRun(s.string, len, primaries);
5448           } else {
5449             if(allocateSKBuffer == TRUE) {
5450               primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status);
5451               if(U_SUCCESS(*status)) {
5452                 *result = primStart;
5453                 *(primaries++) = UCOL_LEVELTERMINATOR;
5454                 u_writeIdenticalLevelRun(s.string, len, primaries);
5455               }
5456             } else {
5457               *status = U_BUFFER_OVERFLOW_ERROR;
5458             }
5459           }
5460         }
5461       }
5462       *(primaries++) = '\0';
5463     }
5464
5465     if(terStart != tert) {
5466         uprv_free(terStart);
5467         uprv_free(secStart);
5468         uprv_free(caseStart);
5469         uprv_free(quadStart);
5470     }
5471
5472     if(normSource != normBuffer) {
5473         uprv_free(normSource);
5474     }
5475
5476     if(allocateSKBuffer == TRUE) {
5477       *result = (uint8_t*)uprv_malloc(sortKeySize);
5478       /* test for NULL */
5479       if (*result == NULL) {
5480         *status = U_MEMORY_ALLOCATION_ERROR;
5481         return sortKeySize;
5482       }
5483       uprv_memcpy(*result, primStart, sortKeySize);
5484       if(primStart != prim) {
5485         uprv_free(primStart);
5486       }
5487     }
5488
5489     return sortKeySize;
5490 }
5491
5492
5493 U_CFUNC int32_t U_CALLCONV
5494 ucol_calcSortKeySimpleTertiary(const    UCollator    *coll,
5495         const    UChar        *source,
5496         int32_t        sourceLength,
5497         uint8_t        **result,
5498         uint32_t        resultLength,
5499         UBool allocateSKBuffer,
5500         UErrorCode *status)
5501 {
5502     U_ALIGN_CODE(16);
5503
5504     const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts);
5505     uint32_t i = 0; /* general purpose counter */
5506
5507     /* Stack allocated buffers for buffers we use */
5508     uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER];
5509
5510     uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert;
5511
5512     if(U_FAILURE(*status)) {
5513       return 0;
5514     }
5515
5516     if(primaries == NULL && allocateSKBuffer == TRUE) {
5517         primaries = *result = prim;
5518         resultLength = UCOL_PRIMARY_MAX_BUFFER;
5519     }
5520
5521     uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER;
5522
5523     uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */
5524
5525     UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER];
5526     UChar *normSource = normBuffer;
5527     int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
5528
5529     int32_t len =  sourceLength;
5530
5531     /* If we need to normalize, we'll do it all at once at the beginning! */
5532     if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) {
5533         len = unorm_internalNormalize(normSource, normSourceLen,
5534                                       source, len,
5535                                       UNORM_FCD, FALSE,
5536                                       status);
5537         if(*status == U_BUFFER_OVERFLOW_ERROR) {
5538             normSourceLen = len;
5539             normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR);
5540             if(normSource == NULL) {
5541                 *status = U_MEMORY_ALLOCATION_ERROR;
5542                 return 0;
5543             }
5544             *status = U_ZERO_ERROR;
5545             len = unorm_internalNormalize(normSource, normSourceLen,
5546                                           source, len,
5547                                           UNORM_FCD, FALSE,
5548                                           status);
5549         }
5550
5551         if(U_FAILURE(*status)) {
5552             return 0;
5553         }
5554         source = normSource;
5555     }
5556
5557     collIterate s;
5558     IInit_collIterate(coll, (UChar *)source, len, &s);
5559     if(source == normSource) {
5560         s.flags &= ~UCOL_ITER_NORM;
5561     }
5562
5563     if(resultLength == 0 || primaries == NULL) {
5564         int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5565         if(normSource != normBuffer) {
5566             uprv_free(normSource);
5567         }
5568         return t;
5569     }
5570
5571     uint8_t *primarySafeEnd = primaries + resultLength - 2;
5572
5573     uint32_t minBufferSize = UCOL_MAX_BUFFER;
5574
5575     uint8_t *primStart = primaries;
5576     uint8_t *secStart = secondaries;
5577     uint8_t *terStart = tertiaries;
5578
5579     uint32_t order = 0;
5580
5581     uint8_t primary1 = 0;
5582     uint8_t primary2 = 0;
5583     uint8_t secondary = 0;
5584     uint8_t tertiary = 0;
5585     uint8_t caseSwitch = coll->caseSwitch;
5586     uint8_t tertiaryMask = coll->tertiaryMask;
5587     int8_t tertiaryAddition = (int8_t)coll->tertiaryAddition;
5588     uint8_t tertiaryTop = coll->tertiaryTop;
5589     uint8_t tertiaryBottom = coll->tertiaryBottom;
5590     uint8_t tertiaryCommon = coll->tertiaryCommon;
5591
5592     uint32_t prevBuffSize = 0;
5593
5594     UBool finished = FALSE;
5595     UBool notIsContinuation = FALSE;
5596
5597     uint32_t count2 = 0, count3 = 0;
5598     uint8_t leadPrimary = 0;
5599
5600     for(;;) {
5601         for(i=prevBuffSize; i<minBufferSize; ++i) {
5602
5603             order = ucol_IGetNextCE(coll, &s, status);
5604
5605             if(order == 0) {
5606               continue;
5607             }
5608
5609             if(order == UCOL_NO_MORE_CES) {
5610                 finished = TRUE;
5611                 break;
5612             }
5613
5614             notIsContinuation = !isContinuation(order);
5615
5616             if(notIsContinuation) {
5617               tertiary = (uint8_t)((order & tertiaryMask));
5618             } else {
5619               tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION));
5620             }
5621             secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5622             primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK);
5623             primary1 = (uint8_t)(order >> 8);
5624
5625             /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
5626             /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will   */
5627             /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above.               */
5628             /* regular and simple sortkey calc */
5629             if(primary1 != UCOL_IGNORABLE) {
5630               if(notIsContinuation) {
5631                 if(leadPrimary == primary1) {
5632                   *primaries++ = primary2;
5633                 } else {
5634                   if(leadPrimary != 0) {
5635                     *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN);
5636                   }
5637                   if(primary2 == UCOL_IGNORABLE) {
5638                   /* one byter, not compressed */
5639                       *primaries++ = primary1;
5640                       leadPrimary = 0;
5641                   } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
5642                       //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
5643                       (primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
5644                   /* not compressible */
5645                       leadPrimary = 0;
5646                       *primaries++ = primary1;
5647                       *primaries++ = primary2;
5648                   } else { /* compress */
5649                       *primaries++ = leadPrimary = primary1;
5650                       *primaries++ = primary2;
5651                   }
5652                 }
5653               } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
5654                 *primaries++ = primary1;
5655                 if(primary2 != UCOL_IGNORABLE) {
5656                   *primaries++ = primary2; /* second part */
5657                 }
5658               }
5659             }
5660
5661             if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
5662               /* This is compression code. */
5663               if (secondary == UCOL_COMMON2 && notIsContinuation) {
5664                 ++count2;
5665               } else {
5666                 if (count2 > 0) {
5667                   if (secondary > UCOL_COMMON2) { // not necessary for 4th level.
5668                     while (count2 > UCOL_TOP_COUNT2) {
5669                       *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2);
5670                       count2 -= (uint32_t)UCOL_TOP_COUNT2;
5671                     }
5672                     *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1));
5673                   } else {
5674                     while (count2 > UCOL_BOT_COUNT2) {
5675                       *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5676                       count2 -= (uint32_t)UCOL_BOT_COUNT2;
5677                     }
5678                     *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5679                   }
5680                   count2 = 0;
5681                 }
5682                 *secondaries++ = secondary;
5683               }
5684             }
5685
5686             if(notIsContinuation) {
5687               tertiary ^= caseSwitch;
5688             }
5689
5690               if(tertiary > 0) {
5691               /* This is compression code. */
5692               /* sequence size check is included in the if clause */
5693               if (tertiary == tertiaryCommon && notIsContinuation) {
5694                 ++count3;
5695               } else {
5696                 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) {
5697                   tertiary += tertiaryAddition;
5698                 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) {
5699                   tertiary -= tertiaryAddition;
5700                 }
5701                 if (count3 > 0) {
5702                   if ((tertiary > tertiaryCommon)) {
5703                     while (count3 > coll->tertiaryTopCount) {
5704                       *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5705                       count3 -= (uint32_t)coll->tertiaryTopCount;
5706                     }
5707                     *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1));
5708                   } else {
5709                     while (count3 > coll->tertiaryBottomCount) {
5710                       *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5711                       count3 -= (uint32_t)coll->tertiaryBottomCount;
5712                     }
5713                     *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5714                   }
5715                   count3 = 0;
5716                 }
5717                 *tertiaries++ = tertiary;
5718               }
5719             }
5720
5721             if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */
5722               if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */
5723                 IInit_collIterate(coll, (UChar *)source, len, &s);
5724                 if(source == normSource) {
5725                     s.flags &= ~UCOL_ITER_NORM;
5726                 }
5727                 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5728                 *status = U_BUFFER_OVERFLOW_ERROR;
5729                 finished = TRUE;
5730                 break;
5731               } else { /* It's much nicer if we can actually reallocate */
5732                 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart);
5733                 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status);
5734                 if(U_SUCCESS(*status)) {
5735                   *result = primStart;
5736                   primarySafeEnd = primStart + resultLength - 2;
5737                 } else {
5738                   IInit_collIterate(coll, (UChar *)source, len, &s);
5739                   if(source == normSource) {
5740                       s.flags &= ~UCOL_ITER_NORM;
5741                   }
5742                   sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5743                   finished = TRUE;
5744                   break;
5745                 }
5746               }
5747             }
5748         }
5749         if(finished) {
5750             break;
5751         } else {
5752           prevBuffSize = minBufferSize;
5753           secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status);
5754           terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status);
5755           minBufferSize *= 2;
5756           if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size
5757             IInit_collIterate(coll, (UChar *)source, len, &s);
5758             if(source == normSource) {
5759                 s.flags &= ~UCOL_ITER_NORM;
5760             }
5761             sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
5762             break;
5763           }
5764         }
5765     }
5766
5767     if(U_SUCCESS(*status)) {
5768       sortKeySize += (primaries - primStart);
5769       /* we have done all the CE's, now let's put them together to form a key */
5770       if (count2 > 0) {
5771         while (count2 > UCOL_BOT_COUNT2) {
5772           *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2);
5773           count2 -= (uint32_t)UCOL_BOT_COUNT2;
5774         }
5775         *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1));
5776       }
5777       uint32_t secsize = secondaries-secStart;
5778       sortKeySize += secsize;
5779       if(sortKeySize <= resultLength) {
5780         *(primaries++) = UCOL_LEVELTERMINATOR;
5781         uprv_memcpy(primaries, secStart, secsize);
5782         primaries += secsize;
5783       } else {
5784         if(allocateSKBuffer == TRUE) {
5785           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5786           if(U_SUCCESS(*status)) {
5787             *(primaries++) = UCOL_LEVELTERMINATOR;
5788             *result = primStart;
5789             uprv_memcpy(primaries, secStart, secsize);
5790           }
5791         } else {
5792           *status = U_BUFFER_OVERFLOW_ERROR;
5793         }
5794       }
5795
5796       if (count3 > 0) {
5797         if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) {
5798           while (count3 >= coll->tertiaryTopCount) {
5799             *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount);
5800             count3 -= (uint32_t)coll->tertiaryTopCount;
5801           }
5802           *tertiaries++ = (uint8_t)(tertiaryTop - count3);
5803         } else {
5804           while (count3 > coll->tertiaryBottomCount) {
5805             *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount);
5806             count3 -= (uint32_t)coll->tertiaryBottomCount;
5807           }
5808           *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1));
5809         }
5810       }
5811       uint32_t tersize = tertiaries - terStart;
5812       sortKeySize += tersize;
5813       if(sortKeySize <= resultLength) {
5814         *(primaries++) = UCOL_LEVELTERMINATOR;
5815         uprv_memcpy(primaries, terStart, tersize);
5816         primaries += tersize;
5817       } else {
5818         if(allocateSKBuffer == TRUE) {
5819           primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status);
5820           if(U_SUCCESS(*status)) {
5821             *result = primStart;
5822             *(primaries++) = UCOL_LEVELTERMINATOR;
5823             uprv_memcpy(primaries, terStart, tersize);
5824           }
5825         } else {
5826           *status = U_MEMORY_ALLOCATION_ERROR;
5827         }
5828       }
5829
5830       *(primaries++) = '\0';
5831     }
5832
5833     if(terStart != tert) {
5834         uprv_free(terStart);
5835         uprv_free(secStart);
5836     }
5837
5838     if(normSource != normBuffer) {
5839         uprv_free(normSource);
5840     }
5841
5842     if(allocateSKBuffer == TRUE) {
5843       *result = (uint8_t*)uprv_malloc(sortKeySize);
5844       /* test for NULL */
5845       if (*result == NULL) {
5846         *status = U_MEMORY_ALLOCATION_ERROR;
5847         return sortKeySize;
5848       }
5849       uprv_memcpy(*result, primStart, sortKeySize);
5850       if(primStart != prim) {
5851         uprv_free(primStart);
5852       }
5853     }
5854
5855     return sortKeySize;
5856 }
5857
5858 static inline
5859 UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) {
5860   UBool notIsContinuation = !isContinuation(CE);
5861   uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF);
5862   if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0)
5863     || (!notIsContinuation && *wasShifted))
5864     || (*wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */
5865     // The stuff below should probably be in the sortkey code... maybe not...
5866     if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */
5867                         /* we should just completely ignore it */
5868       *wasShifted = TRUE;
5869       //continue;
5870     }
5871     //*wasShifted = TRUE;
5872     return TRUE;
5873   } else {
5874     *wasShifted = FALSE;
5875     return FALSE;
5876   }
5877 }
5878 static inline
5879 void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) {
5880   if(level < maxLevel) {
5881     dest[i++] = UCOL_LEVELTERMINATOR;
5882   } else {
5883     dest[i++] = 0;
5884   }
5885 }
5886
/**
 * Level identifiers for partial sort key generation.
 * Every real identifier is in the range 0..7; UCOL_PSK_LIMIT is one past the last.
 */
enum {
    UCOL_PSK_PRIMARY    = 0,
    UCOL_PSK_SECONDARY  = 1,
    UCOL_PSK_CASE       = 2,
    UCOL_PSK_TERTIARY   = 3,
    UCOL_PSK_QUATERNARY = 4,
    /** This is an extra level, not used - but we have three bits to blow */
    UCOL_PSK_QUIN       = 5,
    UCOL_PSK_IDENTICAL  = 6,
    /** level for the end of sort key. Will just produce zeros */
    UCOL_PSK_NULL       = 7,
    /** number of level identifiers */
    UCOL_PSK_LIMIT      = 8
};
5899
/**
 * Layout of the collation state word kept in state[1].
 * For each field, shift the state right by its *_SHIFT value and then AND
 * the result with its *_MASK value to extract the field.
 */
enum {
    /** level identifier: one of the UCOL_PSK_* level values, three bits */
    UCOL_PSK_LEVEL_SHIFT = 0,
    UCOL_PSK_LEVEL_MASK  = (1 << 3) - 1,

    /** Number of bytes of primary or quaternary already written.
     *  Can be only 0 or 1, since we get up to two bytes from primary or quaternary.
     *  This field is also used to denote that the French secondary level is finished.
     */
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3,
    UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK  = (1 << 1) - 1,

    /** was the last value shifted - can be 0 or 1 (Boolean) */
    UCOL_PSK_WAS_SHIFTED_SHIFT = 4,
    UCOL_PSK_WAS_SHIFTED_MASK  = (1 << 1) - 1,

    /** How many French bytes have we already written (two-bit field).
     *  When we do French we need to reverse secondary values. However, continuations
     *  need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba
     */
    UCOL_PSK_USED_FRENCH_SHIFT = 5,
    UCOL_PSK_USED_FRENCH_MASK  = (1 << 2) - 1,

    /** ten-bit "used elements" field */
    UCOL_PSK_USED_ELEMENTS_SHIFT = 7,
    UCOL_PSK_USED_ELEMENTS_MASK  = (1 << 10) - 1,

    /** fifteen-bit "iterator skip" field */
    UCOL_PSK_ITER_SKIP_SHIFT = 17,
    UCOL_PSK_ITER_SKIP_MASK  = (1 << 15) - 1
};
5925
5926
5927 /** main sortkey part procedure. On the first call,
5928  *  you should pass in a collator, an iterator, empty state
5929  *  state[0] == state[1] == 0, a buffer to hold results
5930  *  number of bytes you need and an error code pointer.
5931  *  Make sure your buffer is big enough to hold the wanted
5932  *  number of sortkey bytes. I don't check.
5933  *  The only meaningful status you can get back is
5934  *  U_BUFFER_OVERFLOW_ERROR, which basically means that you
5935  *  have been dealt a raw deal and that you probably won't
5936  *  be able to use partial sortkey generation for this
5937  *  particular combination of string and collator. This
5938  *  is highly unlikely, but you should still check the error code.
5939  *  Any other status means that you're not in a sane situation
5940  *  anymore. After the first call, preserve state values and
5941  *  use them on subsequent calls to obtain more bytes of a sortkey.
5942  *  Use until the number of bytes written is smaller than the requested
5943  *  number of bytes. Generated sortkey is not compatible with the
5944  *  one generated by ucol_getSortKey, as we don't do any compression.
5945  *  However, levels are still terminated by a 1 (one) and the sortkey
5946  *  is terminated by a 0 (zero). Identical level is the same as in the
5947  *  regular sortkey - internal bocu-1 implementation is used.
5948  *  For the curious, although you cannot do much about this, here is
5949  *  the structure of state words.
5950  *  state[0] - iterator state. Depends on the iterator implementation,
5951  *             but allows the iterator to continue where it stopped in
5952  *             the last iteration.
5953  *  state[1] - collation processing state. Here is the distribution
5954  *             of the bits:
5955  *   0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary,
5956  *             quaternary, quin (we don't use this one), identical and
5957  *             null (producing only zeroes - first one to terminate the
5958  *             sortkey and subsequent to fill the buffer).
5959  *   3       - byte count. Number of bytes written on the primary level.
5960  *   4       - was shifted. Whether the previous iteration finished in the
5961  *             shifted state.
5962  *   5, 6    - French continuation bytes written. See the comment in the enum
5963  *   7..16   - Used elements. Number of CEs that were already used from the
5964  *             expansion buffer or number of bytes from a bocu sequence on
5965  *             the identical level.
5966  *  17..31   - iterator skip. Number of move operations iterator needs to
5967  *             skip from the current state in order to continue. This is used
5968  *             only if normalization is turned on, since the normalizing iterator
5969  *             can return an undefined state, which means that it's in the middle
5970  *             of a normalizing sequence.
5971  */
5972 U_CAPI int32_t U_EXPORT2
5973 ucol_nextSortKeyPart(const UCollator *coll,
5974                      UCharIterator *iter,
5975                      uint32_t state[2],
5976                      uint8_t *dest, int32_t count,
5977                      UErrorCode *status) {
5978     /* error checking */
5979     if(status==NULL || U_FAILURE(*status)) {
5980         return 0;
5981     }
5982     UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART);
5983     if( coll==NULL || iter==NULL ||
5984         state==NULL ||
5985         count<0 || (count>0 && dest==NULL)
5986     ) {
5987         *status=U_ILLEGAL_ARGUMENT_ERROR;
5988     }
5989
5990     UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d",
5991                   coll, iter, state[0], state[1], dest, count);
5992
5993     if(count==0) {
5994         /* nothing to do */
5995         UTRACE_EXIT_VALUE(0);
5996         return 0;
5997     }
5998
5999     /** Setting up situation according to the state we got from the previous iteration */
6000     // The state of the iterator from the previous invocation
6001     uint32_t iterState = state[0];
6002     // Has the last iteration ended in the shifted state
6003     UBool wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE;
6004     // What is the current level of the sortkey?
6005     int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK;
6006     // Have we written only one byte from a two byte primary in the previous iteration?
6007     // Also on secondary level - have we finished with the French secondary?
6008     int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK;
6009     // number of bytes in the continuation buffer for French
6010     int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK;
6011     // Skip the CEs that we got from an extraction
6012     // and delivered in the previous call
6013     int32_t usedElements = (state[1] >> UCOL_PSK_USED_ELEMENTS_SHIFT) & UCOL_PSK_USED_ELEMENTS_MASK;
6014     // Number of times to skip because the iterator returned
6015     // UITER_NO_STATE when it was stopped in the last iteration, so we had to save the
6016     // last valid state.
6017     int32_t iterSkips = (state[1] >> UCOL_PSK_ITER_SKIP_SHIFT) & UCOL_PSK_ITER_SKIP_MASK;
6018
6019     /** values that depend on the collator attributes */
6020     // strength of the collator.
6021     int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status);
6022     // maximal level of the partial sortkey. Need to take whether case level is done
6023     int32_t maxLevel = 0;
6024     if(strength < UCOL_TERTIARY) {
6025       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6026         maxLevel = UCOL_PSK_CASE;
6027       } else {
6028         maxLevel = strength;
6029       }
6030     } else {
6031         if(strength == UCOL_TERTIARY) {
6032           maxLevel = UCOL_PSK_TERTIARY;
6033         } else if(strength == UCOL_QUATERNARY) {
6034           maxLevel = UCOL_PSK_QUATERNARY;
6035         } else { // identical
6036           maxLevel = UCOL_IDENTICAL;
6037         }
6038     }
6039     // value for the quaternary level if Hiragana is encountered. Used for JIS X 4061 collation
6040     uint8_t UCOL_HIRAGANA_QUAD =
6041       (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF;
6042     // Boundary value that decides whether a CE is shifted or not
6043     uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0;
6044     // Are we doing French collation?
6045     UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON);
6046
6047     /** initializing the collation state */
6048     UBool notIsContinuation = FALSE;
6049     uint32_t CE = UCOL_NO_MORE_CES;
6050
6051     collIterate s;
6052     IInit_collIterate(coll, NULL, -1, &s);
6053     s.iterator = iter;
6054     s.flags |= UCOL_USE_ITERATOR;
6055     // This variable tells us whether we have produced some other levels in this iteration
6056     // before we moved to the identical level. In that case, we need to switch the
6057     // type of the iterator.
6058     UBool doingIdenticalFromStart = FALSE;
6059     // Normalizing iterator
6060     // The division for the array length may truncate the array size to
6061     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
6062     // for all platforms anyway.
6063     UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
6064     UNormIterator *normIter = NULL;
6065     // If the normalization is turned on for the collator and we are below identical level
6066     // we will use a FCD normalizing iterator
6067     if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) {
6068       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6069       s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status);
6070       s.flags &= ~UCOL_ITER_NORM;
6071       if(U_FAILURE(*status)) {
6072         UTRACE_EXIT_STATUS(*status);
6073         return 0;
6074       }
6075     } else if(level == UCOL_PSK_IDENTICAL) {
6076       // for identical level, we need a NFD iterator. We need to instantiate it here, since we
6077       // will be updating the state - and this cannot be done on an ordinary iterator.
6078       normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6079       s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6080       s.flags &= ~UCOL_ITER_NORM;
6081       if(U_FAILURE(*status)) {
6082         UTRACE_EXIT_STATUS(*status);
6083         return 0;
6084       }
6085       doingIdenticalFromStart = TRUE;
6086     }
6087
6088     // This is the tentative new state of the iterator. The problem
6089     // is that the iterator might return an undefined state, in
6090     // which case we should save the last valid state and increase
6091     // the iterator skip value.
6092     uint32_t newState = 0;
6093
6094     // First, we set the iterator to the last valid position
6095     // from the last iteration. This was saved in state[0].
6096     if(iterState == 0) {
6097       /* initial state */
6098       if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) {
6099         s.iterator->move(s.iterator, 0, UITER_LIMIT);
6100       } else {
6101         s.iterator->move(s.iterator, 0, UITER_START);
6102       }
6103     } else {
6104         /* reset to previous state */
6105       s.iterator->setState(s.iterator, iterState, status);
6106       if(U_FAILURE(*status)) {
6107           UTRACE_EXIT_STATUS(*status);
6108           return 0;
6109       }
6110     }
6111
6112     // Then, we may have to move more, if the normalizing iterator
6113     // was going through a normalizing sequence.
6114     if(iterSkips) {
6115       // if we are on secondary level AND we do French, we need to go backward instead of forward
6116       if(level == UCOL_PSK_SECONDARY && doingFrench) {
6117         s.iterator->move(s.iterator, -iterSkips, UITER_CURRENT);
6118       } else {
6119         s.iterator->move(s.iterator, iterSkips, UITER_CURRENT);
6120       }
6121     }
6122
6123
6124     // Number of expansion CEs that were already consumed in the
6125     // previous iteration for the last code point processed. We
6126     // want to clean out the expansion buffer, so that we can
6127     // get correct CEs. This value is persistent over iterations,
6128     // since we can have several iterations on the one expansion
6129     // buffer.
6130     int32_t consumedExpansionCEs = usedElements;
6131     // Number of bytes already writted from a bocsu sequence. Since
6132     // the longes bocsu sequence is 4 long, this can be up to 3. It
6133     // shares the state field with consumedExpansionCEs value, since
6134     // they cannot simultanously appear on the same level
6135     int32_t bocsuBytesUsed = 0;
6136     // Clean out the expansion buffer unless we are on
6137     // identical level. In that case we use this field
6138     // to store the number of bytes already written
6139     // from the previous bocsu sequence.
6140     if(level < UCOL_PSK_IDENTICAL && usedElements != 0) {
6141       while(usedElements-->0) {
6142         // If we're doing French and we are on the secondary level,
6143         // we go backwards.
6144         if(level == UCOL_PSK_SECONDARY && doingFrench) {
6145           CE = ucol_IGetPrevCE(coll, &s, status);
6146         } else {
6147           CE = ucol_IGetNextCE(coll, &s, status);
6148         }
6149         if(CE==UCOL_NO_MORE_CES) {
6150           /* should not happen */
6151           *status=U_INTERNAL_PROGRAM_ERROR;
6152           UTRACE_EXIT_STATUS(*status);
6153           return 0;
6154         }
6155       }
6156     } else {
6157       bocsuBytesUsed = usedElements;
6158     }
6159
6160     // This variable prevents the adjusting of iterator
6161     // skip variable when we are the first time on a
6162     // level. I hope there is a better way to do it, but
6163     // I could not think of it.
6164     UBool firstTimeOnLevel = TRUE;
6165     // French secondary needs to know whether the iterator state of zero came from previous level OR
6166     // from a new invocation...
6167     UBool wasDoingPrimary = FALSE;
6168     // Case level is kind of goofy. This variable tells us that
6169     // we are still not done with the case level.
6170     UBool dontAdvanceIteratorBecauseWeNeedALevelTerminator = FALSE;
6171     // destination buffer byte counter. When this guy
6172     // gets to count, we're done with the iteration
6173     int32_t i = 0;
6174     // used to count the zero bytes written after we
6175     // have finished with the sort key
6176     int32_t j = 0;
6177
6178
6179     // Hm.... I think we're ready to plunge in. Basic story is as following:
6180     // we have a fall through case based on level. This is used for initial
6181     // positioning on iteration start. Every level processor contains a
6182     // for(;;) which will be broken when we exhaust all the CEs. Other
6183     // way to exit is a goto saveState, which happens when we have filled
6184     // out our buffer.
6185     switch(level) {
6186     case UCOL_PSK_PRIMARY:
6187       wasDoingPrimary = TRUE;
6188       for(;;) {
6189           if(i==count) {
6190               goto saveState;
6191           }
6192           // We should save the state only if we
6193           // are sure that we are done with the
6194           // previous iterator state
6195           if(consumedExpansionCEs == 0 && byteCountOrFrenchDone == 0) {
6196             newState = s.iterator->getState(s.iterator);
6197             if(newState != UITER_NO_STATE) {
6198               iterState = newState;
6199               iterSkips = 0;
6200             } else {
6201               if(!firstTimeOnLevel && !byteCountOrFrenchDone) {
6202                 iterSkips++;
6203               }
6204             }
6205           }
6206           firstTimeOnLevel = FALSE;
6207           CE = ucol_IGetNextCE(coll, &s, status);
6208           if(CE==UCOL_NO_MORE_CES) {
6209               // Add the level separator
6210               terminatePSKLevel(level, maxLevel, i, dest);
6211               byteCountOrFrenchDone=0;
6212               // Restart the iteration an move to the
6213               // second level
6214               s.iterator->move(s.iterator, 0, UITER_START);
6215               level = UCOL_PSK_SECONDARY;
6216               break;
6217           }
6218           if(!isShiftedCE(CE, LVT, &wasShifted)) {
6219             CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */
6220             if(CE != 0) {
6221               if(byteCountOrFrenchDone == 0) {
6222                 // get the second byte of primary
6223                 dest[i++]=(uint8_t)(CE >> 8);
6224               } else {
6225                 byteCountOrFrenchDone = 0;
6226               }
6227               if((CE &=0xff)!=0) {
6228                   if(i==count) {
6229                       /* overflow */
6230                       byteCountOrFrenchDone=1;
6231                       goto saveState;
6232                   }
6233                   dest[i++]=(uint8_t)CE;
6234               }
6235             }
6236           }
6237           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6238             // s.pos != NULL means there is a normalization buffer in effect
6239             // in iterative case, this means that we are doing Thai (maybe discontiguos)
6240             consumedExpansionCEs++;
6241           } else {
6242             consumedExpansionCEs = 0;
6243           }
6244           if(s.pos && *s.pos == 0) {
6245             // maybe it is the end of Thai - we have to have
6246             // an extra skip
6247             iterSkips++;
6248           }
6249       }
6250       /* fall through to next level */
6251     case UCOL_PSK_SECONDARY:
6252       if(strength >= UCOL_SECONDARY) {
6253         if(!doingFrench) {
6254           for(;;) {
6255             if(i == count) {
6256               goto saveState;
6257             }
6258             // We should save the state only if we
6259             // are sure that we are done with the
6260             // previous iterator state
6261             if(consumedExpansionCEs == 0) {
6262               newState = s.iterator->getState(s.iterator);
6263               if(newState != UITER_NO_STATE) {
6264                 iterState = newState;
6265                 iterSkips = 0;
6266               } else {
6267                 if(!firstTimeOnLevel) {
6268                   iterSkips++;
6269                 }
6270               }
6271             }
6272             firstTimeOnLevel = FALSE;
6273             CE = ucol_IGetNextCE(coll, &s, status);
6274             if(CE==UCOL_NO_MORE_CES) {
6275                 // Add the level separator
6276                 terminatePSKLevel(level, maxLevel, i, dest);
6277                 byteCountOrFrenchDone=0;
6278                 // Restart the iteration an move to the
6279                 // second level
6280                 s.iterator->move(s.iterator, 0, UITER_START);
6281                 level = UCOL_PSK_CASE;
6282                 break;
6283             }
6284             if(!isShiftedCE(CE, LVT, &wasShifted)) {
6285               CE >>= 8; /* get secondary */
6286               if(CE != 0) {
6287                 dest[i++]=(uint8_t)CE;
6288               }
6289             }
6290             if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6291               consumedExpansionCEs++;
6292             } else {
6293               consumedExpansionCEs = 0;
6294             }
6295             if(s.pos && *s.pos == 0) {
6296               iterSkips++;
6297             }
6298           }
6299         } else { // French secondary processing
6300           uint8_t frenchBuff[UCOL_MAX_BUFFER];
6301           int32_t frenchIndex = 0;
6302           // Here we are going backwards.
6303           // If the iterator is at the beggining, it should be
6304           // moved to end.
6305           if(wasDoingPrimary) {
6306             s.iterator->move(s.iterator, 0, UITER_LIMIT);
6307           }
6308           for(;;) {
6309             if(i == count) {
6310               goto saveState;
6311             }
6312             if(consumedExpansionCEs == 0) {
6313               newState = s.iterator->getState(s.iterator);
6314               if(newState != UITER_NO_STATE) {
6315                 iterState = newState;
6316                 iterSkips = 0;
6317               } else {
6318                 if(!firstTimeOnLevel) {
6319                   iterSkips++;
6320                 }
6321               }
6322             }
6323             firstTimeOnLevel = FALSE;
6324             CE = ucol_IGetPrevCE(coll, &s, status);
6325             if(CE==UCOL_NO_MORE_CES) {
6326                 // Add the level separator
6327                 terminatePSKLevel(level, maxLevel, i, dest);
6328                 byteCountOrFrenchDone=0;
6329                 // Restart the iteration an move to the next level
6330                 s.iterator->move(s.iterator, 0, UITER_START);
6331                 level = UCOL_PSK_CASE;
6332                 break;
6333             }
6334             if(isContinuation(CE)) { // if it's a continuation, we want to save it and
6335               // reverse when we get a first non-continuation CE.
6336               CE >>= 8;
6337               frenchBuff[frenchIndex++] = (uint8_t)CE;
6338             } else if(!isShiftedCE(CE, LVT, &wasShifted)) {
6339               CE >>= 8; /* get secondary */
6340               if(!frenchIndex) {
6341                 if(CE != 0) {
6342                   dest[i++]=(uint8_t)CE;
6343                 }
6344               } else {
6345                 frenchBuff[frenchIndex++] = (uint8_t)CE;
6346                 frenchIndex -= usedFrench;
6347                 usedFrench = 0;
6348                 while(i < count && frenchIndex) {
6349                   dest[i++] = frenchBuff[--frenchIndex];
6350                   usedFrench++;
6351                 }
6352               }
6353             }
6354             if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6355               consumedExpansionCEs++;
6356             } else {
6357               consumedExpansionCEs = 0;
6358             }
6359             if(s.pos && *s.pos == 0) {
6360               iterSkips++;
6361             }
6362           }
6363         }
6364       } else {
6365         level = UCOL_PSK_CASE;
6366       }
6367         /* fall through to next level */
6368     case UCOL_PSK_CASE:
6369       if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) {
6370         uint32_t caseShift = UCOL_CASE_SHIFT_START;
6371         uint8_t caseByte = UCOL_CASE_BYTE_START;
6372         uint8_t caseBits = 0;
6373
6374         for(;;) {
6375           if(i == count) {
6376             goto saveState;
6377           }
6378           // We should save the state only if we
6379           // are sure that we are done with the
6380           // previous iterator state
6381           if(consumedExpansionCEs == 0) {
6382             newState = s.iterator->getState(s.iterator);
6383             if(newState != UITER_NO_STATE) {
6384               iterState = newState;
6385               iterSkips = 0;
6386             } else {
6387               if(!firstTimeOnLevel) {
6388                 iterSkips++;
6389               }
6390             }
6391           }
6392           firstTimeOnLevel = FALSE;
6393           CE = ucol_IGetNextCE(coll, &s, status);
6394           if(CE==UCOL_NO_MORE_CES) {
6395             // On the case level we might have an unfinished
6396             // case byte. Add one if it's started.
6397             if(caseShift != UCOL_CASE_SHIFT_START) {
6398               dest[i++] = caseByte;
6399             }
6400             // This is kind of tricky - situation where
6401             // we need to keep the iterator in the old
6402             // state, but don't need to bring anything
6403             // to the next invocation
6404             if(i < count) {
6405               // Add the level separator
6406               terminatePSKLevel(level, maxLevel, i, dest);
6407               // Restart the iteration and move to the
6408               // next level
6409               s.iterator->move(s.iterator, 0, UITER_START);
6410               level = UCOL_PSK_TERTIARY;
6411             } else {
6412               dontAdvanceIteratorBecauseWeNeedALevelTerminator = TRUE;
6413             }
6414             break;
6415           }
6416
6417           if(!isShiftedCE(CE, LVT, &wasShifted)) {
6418             if(!isContinuation(CE)) {
6419               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6420               caseBits = (uint8_t)(CE & 0xC0);
6421               // this copies the case level logic from the
6422               // sort key generation code
6423               if(CE != 0) {
6424                 if(coll->caseFirst == UCOL_UPPER_FIRST) {
6425                   if((caseBits & 0xC0) == 0) {
6426                     caseByte |= 1 << (--caseShift);
6427                   } else {
6428                     caseByte |= 0 << (--caseShift);
6429                     /* second bit */
6430                     if(caseShift == 0) {
6431                       dest[i++] = caseByte;
6432                       caseShift = UCOL_CASE_SHIFT_START;
6433                       caseByte = UCOL_CASE_BYTE_START;
6434                     }
6435                     caseByte |= ((caseBits>>6)&1) << (--caseShift);
6436                   }
6437                 } else {
6438                   if((caseBits & 0xC0) == 0) {
6439                     caseByte |= 0 << (--caseShift);
6440                   } else {
6441                     caseByte |= 1 << (--caseShift);
6442                     /* second bit */
6443                     if(caseShift == 0) {
6444                       dest[i++] = caseByte;
6445                       caseShift = UCOL_CASE_SHIFT_START;
6446                       caseByte = UCOL_CASE_BYTE_START;
6447                     }
6448                     caseByte |= ((caseBits>>7)&1) << (--caseShift);
6449                   }
6450                 }
6451               }
6452
6453             }
6454           }
6455           // Not sure this is correct for the case level - revisit
6456           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6457             consumedExpansionCEs++;
6458           } else {
6459             consumedExpansionCEs = 0;
6460           }
6461           if(s.pos && *s.pos == 0) {
6462             iterSkips++;
6463           }
6464         }
6465       } else {
6466         level = UCOL_PSK_TERTIARY;
6467       }
6468         /* fall through to next level */
6469     case UCOL_PSK_TERTIARY:
6470       if(strength >= UCOL_TERTIARY) {
6471         for(;;) {
6472           if(i == count) {
6473             goto saveState;
6474           }
6475           // We should save the state only if we
6476           // are sure that we are done with the
6477           // previous iterator state
6478           if(consumedExpansionCEs == 0) {
6479             newState = s.iterator->getState(s.iterator);
6480             if(newState != UITER_NO_STATE) {
6481               iterState = newState;
6482               iterSkips = 0;
6483             } else {
6484               if(!firstTimeOnLevel) {
6485                 iterSkips++;
6486               }
6487             }
6488           }
6489           firstTimeOnLevel = FALSE;
6490           CE = ucol_IGetNextCE(coll, &s, status);
6491           if(CE==UCOL_NO_MORE_CES) {
6492               // Add the level separator
6493               terminatePSKLevel(level, maxLevel, i, dest);
6494               byteCountOrFrenchDone=0;
6495               // Restart the iteration an move to the
6496               // second level
6497               s.iterator->move(s.iterator, 0, UITER_START);
6498               level = UCOL_PSK_QUATERNARY;
6499               break;
6500           }
6501           if(!isShiftedCE(CE, LVT, &wasShifted)) {
6502             notIsContinuation = !isContinuation(CE);
6503
6504             if(notIsContinuation) {
6505               CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK);
6506               CE ^= coll->caseSwitch;
6507               CE &= coll->tertiaryMask;
6508             } else {
6509               CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6510             }
6511
6512             if(CE != 0) {
6513               dest[i++]=(uint8_t)CE;
6514             }
6515           }
6516           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6517             consumedExpansionCEs++;
6518           } else {
6519             consumedExpansionCEs = 0;
6520           }
6521           if(s.pos && *s.pos == 0) {
6522             iterSkips++;
6523           }
6524         }
6525       } else {
6526         // if we're not doing tertiary
6527         // skip to the end
6528         level = UCOL_PSK_NULL;
6529       }
6530         /* fall through to next level */
6531     case UCOL_PSK_QUATERNARY:
6532       if(strength >= UCOL_QUATERNARY) {
6533         for(;;) {
6534           if(i == count) {
6535             goto saveState;
6536           }
6537           // We should save the state only if we
6538           // are sure that we are done with the
6539           // previous iterator state
6540           if(consumedExpansionCEs == 0) {
6541             newState = s.iterator->getState(s.iterator);
6542             if(newState != UITER_NO_STATE) {
6543               iterState = newState;
6544               iterSkips = 0;
6545             } else {
6546               if(!firstTimeOnLevel) {
6547                 iterSkips++;
6548               }
6549             }
6550           }
6551           firstTimeOnLevel = FALSE;
6552           CE = ucol_IGetNextCE(coll, &s, status);
6553           if(CE==UCOL_NO_MORE_CES) {
6554               // Add the level separator
6555               terminatePSKLevel(level, maxLevel, i, dest);
6556               //dest[i++] = UCOL_LEVELTERMINATOR;
6557               byteCountOrFrenchDone=0;
6558               // Restart the iteration an move to the
6559               // second level
6560               s.iterator->move(s.iterator, 0, UITER_START);
6561               level = UCOL_PSK_QUIN;
6562               break;
6563           }
6564           if(isShiftedCE(CE, LVT, &wasShifted)) {
6565             CE >>= 16; /* get primary */
6566             if(CE != 0) {
6567               if(byteCountOrFrenchDone == 0) {
6568                 dest[i++]=(uint8_t)(CE >> 8);
6569               } else {
6570                 byteCountOrFrenchDone = 0;
6571               }
6572               if((CE &=0xff)!=0) {
6573                   if(i==count) {
6574                       /* overflow */
6575                       byteCountOrFrenchDone=1;
6576                       goto saveState;
6577                   }
6578                   dest[i++]=(uint8_t)CE;
6579               }
6580             }
6581           } else {
6582             notIsContinuation = !isContinuation(CE);
6583             if(notIsContinuation) {
6584               if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it
6585                 dest[i++] = UCOL_HIRAGANA_QUAD;
6586               } else {
6587                 dest[i++] = 0xFF;
6588               }
6589             }
6590           }
6591           if(s.CEpos - s.toReturn || (s.pos && *s.pos != 0)) {
6592             consumedExpansionCEs++;
6593           } else {
6594             consumedExpansionCEs = 0;
6595           }
6596           if(s.pos && *s.pos == 0) {
6597             iterSkips++;
6598           }
6599         }
6600       } else {
6601         // if we're not doing quaternary
6602         // skip to the end
6603         level = UCOL_PSK_NULL;
6604       }
6605         /* fall through to next level */
6606     case UCOL_PSK_QUIN:
6607       level = UCOL_PSK_IDENTICAL;
6608         /* fall through to next level */
6609     case UCOL_PSK_IDENTICAL:
6610       if(strength >= UCOL_IDENTICAL) {
6611         UChar32 first, second;
6612         int32_t bocsuBytesWritten = 0;
6613         // We always need to do identical on
6614         // the NFD form of the string.
6615         if(normIter == NULL) {
6616           // we arrived from the level below and
6617           // normalization was not turned on.
6618           // therefore, we need to make a fresh NFD iterator
6619           normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status);
6620           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6621         } else if(!doingIdenticalFromStart) {
6622           // there is an iterator, but we did some other levels.
6623           // therefore, we have a FCD iterator - need to make
6624           // a NFD one.
6625           // normIter being at the beginning does not guarantee
6626           // that the underlying iterator is at the beginning
6627           iter->move(iter, 0, UITER_START);
6628           s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status);
6629         }
6630         // At this point we have a NFD iterator that is positioned
6631         // in the right place
6632         if(U_FAILURE(*status)) {
6633           UTRACE_EXIT_STATUS(*status);
6634           return 0;
6635         }
6636         first = uiter_previous32(s.iterator);
6637         // maybe we're at the start of the string
6638         if(first == U_SENTINEL) {
6639           first = 0;
6640         } else {
6641           uiter_next32(s.iterator);
6642         }
6643
6644         j = 0;
6645         for(;;) {
6646           if(i == count) {
6647             if(j+1 < bocsuBytesWritten) {
6648               bocsuBytesUsed = j+1;
6649             }
6650             goto saveState;
6651           }
6652
6653           // On identical level, we will always save
6654           // the state if we reach this point, since
6655           // we don't depend on getNextCE for content
6656           // all the content is in our buffer and we
6657           // already either stored the full buffer OR
6658           // otherwise we won't arrive here.
6659           newState = s.iterator->getState(s.iterator);
6660           if(newState != UITER_NO_STATE) {
6661             iterState = newState;
6662             iterSkips = 0;
6663           } else {
6664             iterSkips++;
6665           }
6666
6667           uint8_t buff[4];
6668           second = uiter_next32(s.iterator);
6669
6670           // end condition for identical level
6671           if(second == U_SENTINEL) {
6672             terminatePSKLevel(level, maxLevel, i, dest);
6673             level = UCOL_PSK_NULL;
6674             break;
6675           }
6676           bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff);
6677           first = second;
6678
6679           j = 0;
6680           if(bocsuBytesUsed != 0) {
6681             while(bocsuBytesUsed-->0) {
6682               j++;
6683             }
6684           }
6685
6686           while(i < count && j < bocsuBytesWritten) {
6687             dest[i++] = buff[j++];
6688           }
6689         }
6690
6691       } else {
6692         level = UCOL_PSK_NULL;
6693       }
6694         /* fall through to next level */
6695     case UCOL_PSK_NULL:
6696       j = i;
6697       while(j<count) {
6698           dest[j++]=0;
6699       }
6700       break;
6701     default:
6702       *status = U_INTERNAL_PROGRAM_ERROR;
6703       UTRACE_EXIT_STATUS(*status);
6704       return 0;
6705     }
6706
6707 saveState:
6708     // Now we need to return stuff. First we want to see whether we have
6709     // done everything for the current state of iterator.
6710     if(consumedExpansionCEs || byteCountOrFrenchDone
6711       || dontAdvanceIteratorBecauseWeNeedALevelTerminator) {
6712       // Any of above mean that the previous transaction
6713       // wasn't finished and that we should store the
6714       // previous iterator state.
6715       state[0] = iterState;
6716     } else {
6717       // The transaction is complete. We will continue in
6718       // next iteration.
6719       if((newState = s.iterator->getState(s.iterator))!= UITER_NO_STATE) {
6720         state[0] = s.iterator->getState(s.iterator);
6721         iterSkips = 0;
6722       } else {
6723         state[0] = iterState;
6724         iterSkips++;
6725       }
6726     }
6727     // Store the number of elements processed. On CE levels, this is
6728     // the number of expansion CEs processed. On identical level, this
6729     // is the number of bocsu bytes written.
6730     if(level < UCOL_PSK_IDENTICAL) {
6731       if((consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) != consumedExpansionCEs) {
6732         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6733       }
6734       state[1] = (consumedExpansionCEs & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6735     } else {
6736       if((bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) != bocsuBytesUsed) {
6737         *status = U_INDEX_OUTOFBOUNDS_ERROR;
6738       }
6739       state[1] = (bocsuBytesUsed & UCOL_PSK_USED_ELEMENTS_MASK) << UCOL_PSK_USED_ELEMENTS_SHIFT;
6740     }
6741
6742     // Next we put in the level of comparison
6743     state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT);
6744
6745     // If we are doing French, we need to store whether we have just finished the French level
6746     if(level == UCOL_PSK_SECONDARY && doingFrench) {
6747       state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6748     } else {
6749       state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT);
6750     }
6751
6752     // Was the latest CE shifted
6753     if(wasShifted) {
6754       state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT;
6755     }
6756     // Check for iterSkips overflow
6757     if((iterSkips & UCOL_PSK_ITER_SKIP_MASK) != iterSkips) {
6758       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6759     }
6760     // Store iterSkips
6761     state[1] |= ((iterSkips & UCOL_PSK_ITER_SKIP_MASK) << UCOL_PSK_ITER_SKIP_SHIFT);
6762
6763     // Check for French overflow
6764     if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) {
6765       *status = U_INDEX_OUTOFBOUNDS_ERROR;
6766     }
6767     // Store number of bytes written in the French secondary continuation sequence
6768     state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT);
6769
6770
6771     // If we have used normalizing iterator, get rid of it
6772     if(normIter != NULL) {
6773       unorm_closeIter(normIter);
6774     }
6775
6776     // Return number of meaningful sortkey bytes.
6777     UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d",
6778                   dest,i, state[0], state[1]);
6779     UTRACE_EXIT_VALUE(i);
6780     return i;
6781 }
6782
6783 /**
6784  * Produce a bound for a given sortkey and a number of levels.
6785  */
6786 U_CAPI int32_t U_EXPORT2
6787 ucol_getBound(const uint8_t       *source,
6788         int32_t             sourceLength,
6789         UColBoundMode       boundType,
6790         uint32_t            noOfLevels,
6791         uint8_t             *result,
6792         int32_t             resultLength,
6793         UErrorCode          *status) {
6794   // consistency checks
6795   if(status == NULL || U_FAILURE(*status)) {
6796     return 0;
6797   }
6798   if(source == NULL) {
6799     *status = U_ILLEGAL_ARGUMENT_ERROR;
6800     return 0;
6801   }
6802
6803   int32_t sourceIndex = 0;
6804   // Scan the string until we skip enough of the key OR reach the end of the key
6805   do {
6806     sourceIndex++;
6807     if(source[sourceIndex] == UCOL_LEVELTERMINATOR) {
6808       noOfLevels--;
6809     }
6810   } while (noOfLevels > 0
6811     && (source[sourceIndex] != 0 || sourceIndex < sourceLength));
6812
6813   if((source[sourceIndex] == 0 || sourceIndex == sourceLength)
6814     && noOfLevels > 0) {
6815     *status = U_SORT_KEY_TOO_SHORT_WARNING;
6816   }
6817
6818
6819   // READ ME: this code assumes that the values for boundType
6820   // enum will not changes. They are set so that the enum value
6821   // corresponds to the number of extra bytes each bound type
6822   // needs.
6823   if(result != NULL && resultLength >= sourceIndex+boundType) {
6824     uprv_memcpy(result, source, sourceIndex);
6825     switch(boundType) {
6826     // Lower bound just gets terminated. No extra bytes
6827     case UCOL_BOUND_LOWER: // = 0
6828       break;
6829     // Upper bound needs one extra byte
6830     case UCOL_BOUND_UPPER: // = 1
6831       result[sourceIndex++] = 2;
6832       break;
6833     // Upper long bound needs two extra bytes
6834     case UCOL_BOUND_UPPER_LONG: // = 2
6835       result[sourceIndex++] = 0xFF;
6836       result[sourceIndex++] = 0xFF;
6837       break;
6838     default:
6839       *status = U_ILLEGAL_ARGUMENT_ERROR;
6840       return 0;
6841     }
6842     result[sourceIndex++] = 0;
6843
6844     return sourceIndex;
6845   } else {
6846     return sourceIndex+boundType+1;
6847   }
6848 }
6849
6850 static
6851 inline void uprv_appendByteToHexString(char *dst, uint8_t val) {
6852   uint32_t len = (uint32_t)uprv_strlen(dst);
6853   *(dst+len) = T_CString_itosOffset((val >> 4));
6854   *(dst+len+1) = T_CString_itosOffset((val & 0xF));
6855   *(dst+len+2) = 0;
6856 }
6857
6858 /* this function makes a string with representation of a sortkey */
6859 U_CAPI char* U_EXPORT2 ucol_sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
6860   int32_t strength = UCOL_PRIMARY;
6861   uint32_t res_size = 0;
6862   UBool doneCase = FALSE;
6863
6864   char *current = buffer;
6865   const uint8_t *currentSk = sortkey;
6866
6867   uprv_strcpy(current, "[");
6868
6869   while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
6870     if(strength > UCOL_PRIMARY) {
6871       uprv_strcat(current, " . ");
6872     }
6873     while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
6874       uprv_appendByteToHexString(current, *currentSk++);
6875       uprv_strcat(current, " ");
6876     }
6877     if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
6878         doneCase = TRUE;
6879     } else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
6880       strength ++;
6881     }
6882     uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
6883     if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
6884       break;
6885     }
6886   }
6887
6888   if(coll->strength == UCOL_IDENTICAL) {
6889     uprv_strcat(current, " . ");
6890     while(*currentSk != 0) {
6891       uprv_appendByteToHexString(current, *currentSk++);
6892       uprv_strcat(current, " ");
6893     }
6894
6895     uprv_appendByteToHexString(current, *currentSk++);
6896   }
6897   uprv_strcat(current, "]");
6898
6899   if(res_size > *len) {
6900     return NULL;
6901   }
6902
6903   return buffer;
6904 }
6905
6906
6907 /****************************************************************************/
6908 /* Following are the functions that deal with the properties of a collator  */
6909 /* there are new APIs and some compatibility APIs                           */
6910 /****************************************************************************/
6911
6912 static inline void
6913 ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE,
6914                     int32_t *primShift, int32_t *secShift, int32_t *terShift) {
6915   uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0;
6916   UBool reverseSecondary = FALSE;
6917   if(!isContinuation(CE)) {
6918     tertiary = (uint8_t)((CE & coll->tertiaryMask));
6919     tertiary ^= coll->caseSwitch;
6920     reverseSecondary = TRUE;
6921   } else {
6922     tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION));
6923     tertiary &= UCOL_REMOVE_CASE;
6924     reverseSecondary = FALSE;
6925   }
6926
6927   secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6928   primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK);
6929   primary1 = (uint8_t)(CE >> 8);
6930
6931   if(primary1 != 0) {
6932     coll->latinOneCEs[ch] |= (primary1 << *primShift);
6933     *primShift -= 8;
6934   }
6935   if(primary2 != 0) {
6936     if(*primShift < 0) {
6937       coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
6938       coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6939       coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
6940       return;
6941     }
6942     coll->latinOneCEs[ch] |= (primary2 << *primShift);
6943     *primShift -= 8;
6944   }
6945   if(secondary != 0) {
6946     if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary
6947       coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary
6948       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24);
6949     } else { // normal case
6950       coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift);
6951     }
6952     *secShift -= 8;
6953   }
6954   if(tertiary != 0) {
6955     coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift);
6956     *terShift -= 8;
6957   }
6958 }
6959
6960 static inline UBool
6961 ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) {
6962     uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3);
6963     if(newTable == NULL) {
6964       *status = U_MEMORY_ALLOCATION_ERROR;
6965       coll->latinOneFailed = TRUE;
6966       return FALSE;
6967     }
6968     int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t);
6969     uprv_memset(newTable, 0, size*sizeof(uint32_t)*3);
6970     uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy);
6971     uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy);
6972     uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy);
6973     coll->latinOneTableLen = size;
6974     uprv_free(coll->latinOneCEs);
6975     coll->latinOneCEs = newTable;
6976     return TRUE;
6977 }
6978
6979 static UBool
6980 ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) {
6981   UBool result = TRUE;
6982   if(coll->latinOneCEs == NULL) {
6983     coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3);
6984     if(coll->latinOneCEs == NULL) {
6985       *status = U_MEMORY_ALLOCATION_ERROR;
6986       return FALSE;
6987     }
6988     coll->latinOneTableLen = UCOL_LATINONETABLELEN;
6989   }
6990   UChar ch = 0;
6991   UCollationElements *it = ucol_openElements(coll, &ch, 1, status);
6992   uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3);
6993
6994   int32_t primShift = 24, secShift = 24, terShift = 24;
6995   uint32_t CE = 0;
6996   int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1;
6997
6998   // TODO: make safe if you get more than you wanted...
6999   for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) {
7000     primShift = 24; secShift = 24; terShift = 24;
7001     if(ch < 0x100) {
7002       CE = coll->latinOneMapping[ch];
7003     } else {
7004       CE = UTRIE_GET32_FROM_LEAD(coll->mapping, ch);
7005       if(CE == UCOL_NOT_FOUND && coll->UCA) {
7006         CE = UTRIE_GET32_FROM_LEAD(coll->UCA->mapping, ch);
7007       }
7008     }
7009     if(CE < UCOL_NOT_FOUND) {
7010       ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
7011     } else {
7012       switch (getCETag(CE)) {
7013       case EXPANSION_TAG:
7014       case DIGIT_TAG:
7015         ucol_setText(it, &ch, 1, status);
7016         while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) {
7017           if(primShift < 0 || secShift < 0 || terShift < 0) {
7018             coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE;
7019             coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
7020             coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE;
7021             break;
7022           }
7023           ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift);
7024         }
7025         break;
7026       case CONTRACTION_TAG:
7027         // here is the trick
7028         // F2 is contraction. We do something very similar to contractions
7029         // but have two indices, one in the real contraction table and the
7030         // other to where we stuffed things. This hopes that we don't have
7031         // many contractions (this should work for latin-1 tables).
7032         {
7033           if((CE & 0x00FFF000) != 0) {
7034             *status = U_UNSUPPORTED_ERROR;
7035             coll->latinOneFailed = TRUE;
7036             return FALSE;
7037           }
7038
7039           const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE);
7040
7041           CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table
7042
7043           coll->latinOneCEs[ch] = CE;
7044           coll->latinOneCEs[coll->latinOneTableLen+ch] = CE;
7045           coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE;
7046
7047           // We're going to jump into contraction table, pick the elements
7048           // and use them
7049           do {
7050               CE = *(coll->contractionCEs +
7051                   (UCharOffset - coll->contractionIndex));
7052               if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) {
7053                 uint32_t size;
7054                 uint32_t i;    /* general counter */
7055                 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
7056                 size = getExpansionCount(CE);
7057                 //CE = *CEOffset++;
7058                 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
7059                   for(i = 0; i<size; i++) {
7060                     if(primShift < 0 || secShift < 0 || terShift < 0) {
7061                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7062                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7063                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7064                       break;
7065                     }
7066                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
7067                   }
7068                 } else { /* else, we do */
7069                   while(*CEOffset != 0) {
7070                     if(primShift < 0 || secShift < 0 || terShift < 0) {
7071                       coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7072                       coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7073                       coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7074                       break;
7075                     }
7076                     ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift);
7077                   }
7078                 }
7079                 contractionOffset++;
7080               } else if(CE < UCOL_NOT_FOUND) {
7081                 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift);
7082               } else {
7083                 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7084                 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7085                 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE;
7086                 contractionOffset++;
7087               }
7088               UCharOffset++;
7089               primShift = 24; secShift = 24; terShift = 24;
7090               if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate
7091                 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) {
7092                   coll->latinOneFailed = TRUE;
7093                   return FALSE;
7094                 }
7095               }
7096           } while(*UCharOffset != 0xFFFF);
7097         }
7098         break;
7099       default:
7100         coll->latinOneFailed = TRUE;
7101         result = FALSE;
7102         break;
7103       }
7104     }
7105   }
7106   ucol_closeElements(it);
7107   // compact table
7108   if(contractionOffset < coll->latinOneTableLen) {
7109     if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) {
7110         coll->latinOneFailed = TRUE;
7111         return FALSE;
7112     }
7113   }
7114   return result;
7115 }
7116
7117 void ucol_updateInternalState(UCollator *coll, UErrorCode *status) {
7118       if(U_SUCCESS(*status)) {
7119         if(coll->caseFirst == UCOL_UPPER_FIRST) {
7120           coll->caseSwitch = UCOL_CASE_SWITCH;
7121         } else {
7122           coll->caseSwitch = UCOL_NO_CASE_SWITCH;
7123         }
7124
7125         if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) {
7126           coll->tertiaryMask = UCOL_REMOVE_CASE;
7127           coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7128           coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_OFF;
7129           coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF;
7130           coll->tertiaryBottom = UCOL_COMMON_BOT3;
7131         } else {
7132           coll->tertiaryMask = UCOL_KEEP_CASE;
7133           coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON;
7134           if(coll->caseFirst == UCOL_UPPER_FIRST) {
7135             coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST;
7136             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER;
7137             coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER;
7138           } else {
7139             coll->tertiaryCommon = UCOL_COMMON3_NORMAL;
7140             coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER;
7141             coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER;
7142           }
7143         }
7144
7145         /* Set the compression values */
7146         uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1);
7147         coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */
7148         coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount);
7149
7150         if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY
7151           && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) {
7152           coll->sortKeyGen = ucol_calcSortKeySimpleTertiary;
7153         } else {
7154           coll->sortKeyGen = ucol_calcSortKey;
7155         }
7156         if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF
7157           && coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) {
7158           if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) {
7159             if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it
7160               //fprintf(stderr, "F");
7161               coll->latinOneUse = TRUE;
7162             } else {
7163               coll->latinOneUse = FALSE;
7164             }
7165             if(*status == U_UNSUPPORTED_ERROR) {
7166               *status = U_ZERO_ERROR;
7167             }
7168           } else { // latin1Table exists and it doesn't need to be regenerated, just use it
7169             coll->latinOneUse = TRUE;
7170           }
7171         } else {
7172           coll->latinOneUse = FALSE;
7173         }
7174       }
7175
7176 }
7177
7178 U_CAPI uint32_t  U_EXPORT2
7179 ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) {
7180   if(U_FAILURE(*status) || coll == NULL) {
7181     return 0;
7182   }
7183   if(len == -1) {
7184     len = u_strlen(varTop);
7185   }
7186   if(len == 0) {
7187     *status = U_ILLEGAL_ARGUMENT_ERROR;
7188     return 0;
7189   }
7190
7191   collIterate s;
7192   IInit_collIterate(coll, varTop, len, &s);
7193
7194   uint32_t CE = ucol_IGetNextCE(coll, &s, status);
7195
7196   /* here we check if we have consumed all characters */
7197   /* you can put in either one character or a contraction */
7198   /* you shouldn't put more... */
7199   if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
7200     *status = U_CE_NOT_FOUND_ERROR;
7201     return 0;
7202   }
7203
7204   uint32_t nextCE = ucol_IGetNextCE(coll, &s, status);
7205
7206   if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) {
7207     *status = U_PRIMARY_TOO_LONG_ERROR;
7208     return 0;
7209   }
7210   if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) {
7211     coll->variableTopValueisDefault = FALSE;
7212     coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16;
7213   }
7214
7215   return CE & UCOL_PRIMARYMASK;
7216 }
7217
7218 U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) {
7219   if(U_FAILURE(*status) || coll == NULL) {
7220     return 0;
7221   }
7222   return coll->variableTopValue<<16;
7223 }
7224
7225 U_CAPI void  U_EXPORT2
7226 ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) {
7227   if(U_FAILURE(*status) || coll == NULL) {
7228     return;
7229   }
7230
7231   if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) {
7232       coll->variableTopValueisDefault = FALSE;
7233       coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16;
7234   }
7235 }
7236 /* Attribute setter API */
7237 U_CAPI void  U_EXPORT2
7238 ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) {
7239     if(U_FAILURE(*status) || coll == NULL) {
7240       return;
7241     }
7242     UColAttributeValue oldFrench = coll->frenchCollation;
7243     UColAttributeValue oldCaseFirst = coll->caseFirst;
7244     switch(attr) {
7245     case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
7246       if(value == UCOL_ON) {
7247         coll->numericCollation = UCOL_ON;
7248         coll->numericCollationisDefault = FALSE;
7249       } else if (value == UCOL_OFF) {
7250         coll->numericCollation = UCOL_OFF;
7251         coll->numericCollationisDefault = FALSE;
7252       } else if (value == UCOL_DEFAULT) {
7253         coll->numericCollationisDefault = TRUE;
7254         coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
7255       } else {
7256         *status = U_ILLEGAL_ARGUMENT_ERROR;
7257       }
7258       break;
7259     case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
7260       if(value == UCOL_ON) {
7261         coll->hiraganaQ = UCOL_ON;
7262         coll->hiraganaQisDefault = FALSE;
7263       } else if (value == UCOL_OFF) {
7264         coll->hiraganaQ = UCOL_OFF;
7265         coll->hiraganaQisDefault = FALSE;
7266       } else if (value == UCOL_DEFAULT) {
7267         coll->hiraganaQisDefault = TRUE;
7268         coll->hiraganaQ = (UColAttributeValue)coll->options->hiraganaQ;
7269       } else {
7270         *status = U_ILLEGAL_ARGUMENT_ERROR;
7271       }
7272       break;
7273     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7274         if(value == UCOL_ON) {
7275             coll->frenchCollation = UCOL_ON;
7276             coll->frenchCollationisDefault = FALSE;
7277         } else if (value == UCOL_OFF) {
7278             coll->frenchCollation = UCOL_OFF;
7279             coll->frenchCollationisDefault = FALSE;
7280         } else if (value == UCOL_DEFAULT) {
7281             coll->frenchCollationisDefault = TRUE;
7282             coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation;
7283         } else {
7284             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7285         }
7286         break;
7287     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7288         if(value == UCOL_SHIFTED) {
7289             coll->alternateHandling = UCOL_SHIFTED;
7290             coll->alternateHandlingisDefault = FALSE;
7291         } else if (value == UCOL_NON_IGNORABLE) {
7292             coll->alternateHandling = UCOL_NON_IGNORABLE;
7293             coll->alternateHandlingisDefault = FALSE;
7294         } else if (value == UCOL_DEFAULT) {
7295             coll->alternateHandlingisDefault = TRUE;
7296             coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ;
7297         } else {
7298             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7299         }
7300         break;
7301     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7302         if(value == UCOL_LOWER_FIRST) {
7303             coll->caseFirst = UCOL_LOWER_FIRST;
7304             coll->caseFirstisDefault = FALSE;
7305         } else if (value == UCOL_UPPER_FIRST) {
7306             coll->caseFirst = UCOL_UPPER_FIRST;
7307             coll->caseFirstisDefault = FALSE;
7308         } else if (value == UCOL_OFF) {
7309           coll->caseFirst = UCOL_OFF;
7310           coll->caseFirstisDefault = FALSE;
7311         } else if (value == UCOL_DEFAULT) {
7312             coll->caseFirst = (UColAttributeValue)coll->options->caseFirst;
7313             coll->caseFirstisDefault = TRUE;
7314         } else {
7315             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7316         }
7317         break;
7318     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7319         if(value == UCOL_ON) {
7320             coll->caseLevel = UCOL_ON;
7321             coll->caseLevelisDefault = FALSE;
7322         } else if (value == UCOL_OFF) {
7323             coll->caseLevel = UCOL_OFF;
7324             coll->caseLevelisDefault = FALSE;
7325         } else if (value == UCOL_DEFAULT) {
7326             coll->caseLevel = (UColAttributeValue)coll->options->caseLevel;
7327             coll->caseLevelisDefault = TRUE;
7328         } else {
7329             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7330         }
7331         break;
7332     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7333         if(value == UCOL_ON) {
7334             coll->normalizationMode = UCOL_ON;
7335             coll->normalizationModeisDefault = FALSE;
7336         } else if (value == UCOL_OFF) {
7337             coll->normalizationMode = UCOL_OFF;
7338             coll->normalizationModeisDefault = FALSE;
7339         } else if (value == UCOL_DEFAULT) {
7340             coll->normalizationModeisDefault = TRUE;
7341             coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode;
7342         } else {
7343             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7344         }
7345         break;
7346     case UCOL_STRENGTH:         /* attribute for strength */
7347         if (value == UCOL_DEFAULT) {
7348             coll->strengthisDefault = TRUE;
7349             coll->strength = (UColAttributeValue)coll->options->strength;
7350         } else if (value <= UCOL_IDENTICAL) {
7351             coll->strengthisDefault = FALSE;
7352             coll->strength = value;
7353         } else {
7354             *status = U_ILLEGAL_ARGUMENT_ERROR  ;
7355         }
7356         break;
7357     case UCOL_ATTRIBUTE_COUNT:
7358     default:
7359         *status = U_ILLEGAL_ARGUMENT_ERROR;
7360         break;
7361     }
7362     if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) {
7363       coll->latinOneRegenTable = TRUE;
7364     } else {
7365       coll->latinOneRegenTable = FALSE;
7366     }
7367     ucol_updateInternalState(coll, status);
7368 }
7369
7370 U_CAPI UColAttributeValue  U_EXPORT2
7371 ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) {
7372     if(U_FAILURE(*status) || coll == NULL) {
7373       return UCOL_DEFAULT;
7374     }
7375     switch(attr) {
7376     case UCOL_NUMERIC_COLLATION:
7377       return coll->numericCollation;
7378     case UCOL_HIRAGANA_QUATERNARY_MODE:
7379       return coll->hiraganaQ;
7380     case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
7381         return coll->frenchCollation;
7382     case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/
7383         return coll->alternateHandling;
7384     case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */
7385         return coll->caseFirst;
7386     case UCOL_CASE_LEVEL: /* do we have an extra case level */
7387         return coll->caseLevel;
7388     case UCOL_NORMALIZATION_MODE: /* attribute for normalization */
7389         return coll->normalizationMode;
7390     case UCOL_STRENGTH:         /* attribute for strength */
7391         return coll->strength;
7392     case UCOL_ATTRIBUTE_COUNT:
7393     default:
7394         *status = U_ILLEGAL_ARGUMENT_ERROR;
7395         break;
7396     }
7397     return UCOL_DEFAULT;
7398 }
7399
7400 U_CAPI void U_EXPORT2
7401 ucol_setStrength(    UCollator                *coll,
7402             UCollationStrength        strength)
7403 {
7404   UErrorCode status = U_ZERO_ERROR;
7405   ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status);
7406 }
7407
7408 U_CAPI UCollationStrength U_EXPORT2
7409 ucol_getStrength(const UCollator *coll)
7410 {
7411   UErrorCode status = U_ZERO_ERROR;
7412   return ucol_getAttribute(coll, UCOL_STRENGTH, &status);
7413 }
7414
7415 /****************************************************************************/
7416 /* Following are misc functions                                             */
7417 /* there are new APIs and some compatibility APIs                           */
7418 /****************************************************************************/
7419
7420 U_CAPI UCollator* U_EXPORT2
7421 ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status)
7422 {
7423     UCollator * localCollator;
7424     int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator);
7425     char *stackBufferChars = (char *)stackBuffer;
7426
7427     if (status == NULL || U_FAILURE(*status)){
7428         return 0;
7429     }
7430     if ((stackBuffer && !pBufferSize) || !coll){
7431        *status = U_ILLEGAL_ARGUMENT_ERROR;
7432         return 0;
7433     }
7434     /* Pointers on 64-bit platforms need to be aligned
7435      * on a 64-bit boundry in memory.
7436      */
7437     if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
7438         int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars);
7439         *pBufferSize -= offsetUp;
7440         stackBufferChars += offsetUp;
7441     }
7442     stackBuffer = (void *)stackBufferChars;
7443
7444     if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */
7445         *pBufferSize =  bufferSizeNeeded;
7446         return 0;
7447     }
7448     if (!stackBuffer || *pBufferSize < bufferSizeNeeded) {
7449         /* allocate one here...*/
7450         int32_t length;
7451         const UChar * rules = ucol_getRules(coll, &length);
7452
7453         localCollator = ucol_openRules(rules,
7454                                        length,
7455                                        ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status),
7456                                        ucol_getStrength(coll),
7457                                        NULL,
7458                                        status);
7459         if (U_SUCCESS(*status))
7460         {
7461             *status = U_SAFECLONE_ALLOCATED_WARNING;
7462         }
7463     } else {
7464         localCollator = (UCollator *)stackBuffer;
7465         uprv_memcpy(localCollator, coll, sizeof(UCollator));
7466         localCollator->freeOnClose = FALSE;
7467         localCollator->requestedLocale = NULL; // zero copies of pointers
7468         localCollator->validLocale = NULL;
7469     }
7470     return localCollator;
7471 }
7472
7473 U_CAPI int32_t U_EXPORT2
7474 ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) {
7475   UErrorCode status = U_ZERO_ERROR;
7476   int32_t len = 0;
7477   int32_t UCAlen = 0;
7478   const UChar* ucaRules = 0;
7479   const UChar *rules = ucol_getRules(coll, &len);
7480   if(delta == UCOL_FULL_RULES) {
7481     /* take the UCA rules and append real rules at the end */
7482     /* UCA rules will be probably coming from the root RB */
7483     ucaRules = ures_getStringByKey(coll->rb,"%%UCARULES",&UCAlen,&status);
7484     /*
7485     UResourceBundle* cresb = ures_getByKeyWithFallback(coll->rb, "collations", NULL, &status);
7486     UResourceBundle*  uca = ures_getByKeyWithFallback(cresb, "UCA", NULL, &status);
7487     ucaRules = ures_getStringByKey(uca,"Sequence",&UCAlen,&status);
7488     ures_close(uca);
7489     ures_close(cresb);
7490     */
7491   }
7492   if(U_FAILURE(status)) {
7493     return 0;
7494   }
7495   if(buffer!=0 && bufferLen>0){
7496       *buffer=0;
7497       if(UCAlen > 0) {
7498         u_memcpy(buffer, ucaRules, uprv_min(UCAlen, bufferLen));
7499       }
7500       if(len > 0 && bufferLen > UCAlen) {
7501         u_memcpy(buffer+UCAlen, rules, uprv_min(len, bufferLen-UCAlen));
7502       }
7503   }
7504   return u_terminateUChars(buffer, bufferLen, len+UCAlen, &status);
7505 }
7506
/* A single zero UChar, i.e. an empty string; ucol_getRules() returns its address */
/* (with length 0) when it has no rule string to return.                          */
static const UChar _NUL = 0;
7508
7509 U_CAPI const UChar* U_EXPORT2
7510 ucol_getRules(    const    UCollator       *coll,
7511         int32_t            *length)
7512 {
7513   if(coll->rules != NULL) {
7514     *length = coll->rulesLength;
7515     return coll->rules;
7516   } else {
7517     UErrorCode status = U_ZERO_ERROR;
7518     if(coll->elements != NULL) {
7519       if(U_SUCCESS(status)) {
7520         /*Semantic const */
7521         ((UCollator *)coll)->rules = ures_getStringByKey(coll->elements, "Sequence", length, &status);
7522         ((UCollator *)coll)->rulesLength = *length;
7523         ((UCollator *)coll)->freeRulesOnClose = FALSE;
7524         return coll->rules;
7525       }
7526     }
7527     *length = 0;
7528     return &_NUL;
7529   }
7530 }
7531
7532 U_CAPI int32_t U_EXPORT2
7533 ucol_getDisplayName(    const    char        *objLoc,
7534             const    char        *dispLoc,
7535             UChar             *result,
7536             int32_t         resultLength,
7537             UErrorCode        *status)
7538 {
7539
7540   if(U_FAILURE(*status)) return -1;
7541   UnicodeString dst;
7542   if(!(result==NULL && resultLength==0)) {
7543     // NULL destination for pure preflighting: empty dummy string
7544     // otherwise, alias the destination buffer
7545     dst.setTo(result, 0, resultLength);
7546   }
7547   Collator::getDisplayName(Locale(objLoc), Locale(dispLoc), dst);
7548   return dst.extract(result, resultLength, *status);
7549 }
7550
7551 U_CAPI const char* U_EXPORT2
7552 ucol_getAvailable(int32_t index)
7553 {
7554   return uloc_getAvailable(index);
7555 }
7556
7557 U_CAPI int32_t U_EXPORT2
7558 ucol_countAvailable()
7559 {
7560   return uloc_countAvailable();
7561 }
7562
7563 #if !UCONFIG_NO_SERVICE
7564 U_CAPI UEnumeration* U_EXPORT2
7565 ucol_openAvailableLocales(UErrorCode *status) {
7566     // This is a wrapper over Collator::getAvailableLocales()
7567     if (U_FAILURE(*status)) {
7568         return NULL;
7569     }
7570     StringEnumeration *s = Collator::getAvailableLocales();
7571     if (s == NULL) {
7572         *status = U_MEMORY_ALLOCATION_ERROR;
7573         return NULL;
7574     }
7575     return uenum_openStringEnumeration(s, status);
7576 }
7577 #endif
7578
// Note: KEYWORDS[0] != RESOURCE_NAME - alan

// Resource name that ucol_getKeywordValues() passes to ures_getKeywordValues().
static const char* RESOURCE_NAME = "collations";

// Keywords enumerated by ucol_getKeywords(); ucol_getKeywordValues() accepts only KEYWORDS[0].
static const char* KEYWORDS[] = { "collation" };

// Number of entries in KEYWORDS.
#define KEYWORD_COUNT (sizeof(KEYWORDS)/sizeof(KEYWORDS[0]))
7586
7587 U_CAPI UEnumeration* U_EXPORT2
7588 ucol_getKeywords(UErrorCode *status) {
7589     UEnumeration *result = NULL;
7590     if (U_SUCCESS(*status)) {
7591         return uenum_openCharStringsEnumeration(KEYWORDS, KEYWORD_COUNT, status);
7592     }
7593     return result;
7594 }
7595
7596 U_CAPI UEnumeration* U_EXPORT2
7597 ucol_getKeywordValues(const char *keyword, UErrorCode *status) {
7598     // hard-coded to accept exactly one collation keyword
7599     // modify if additional collation keyword is added later
7600     if (U_SUCCESS(*status) &&
7601         keyword==NULL || uprv_strcmp(keyword, KEYWORDS[0])!=0) {
7602         *status = U_ILLEGAL_ARGUMENT_ERROR;
7603         return NULL;
7604     }
7605     return ures_getKeywordValues(U_ICUDATA_COLL, RESOURCE_NAME, status);
7606 }
7607
7608 U_CAPI int32_t U_EXPORT2
7609 ucol_getFunctionalEquivalent(char* result, int32_t resultCapacity,
7610                              const char* keyword, const char* locale,
7611                              UBool* isAvailable, UErrorCode* status) {
7612     // N.B.: Resource name is "collations" but keyword is "collation"
7613     return ures_getFunctionalEquivalent(result, resultCapacity, U_ICUDATA_COLL,
7614                                         "collations", keyword, locale,
7615                                         isAvailable, TRUE, status);
7616 }
7617
7618 U_CAPI void U_EXPORT2
7619 ucol_getVersion(const UCollator* coll,
7620                 UVersionInfo versionInfo)
7621 {
7622     /* RunTime version  */
7623     uint8_t rtVersion = UCOL_RUNTIME_VERSION;
7624     /* Builder version*/
7625     uint8_t bdVersion = coll->image->version[0];
7626
7627     /* Charset Version. Need to get the version from cnv files
7628      * makeconv should populate cnv files with version and
7629      * an api has to be provided in ucnv.h to obtain this version
7630      */
7631     uint8_t csVersion = 0;
7632
7633     /* combine the version info */
7634     uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion));
7635
7636     /* Tailoring rules */
7637     versionInfo[0] = (uint8_t)(cmbVersion>>8);
7638     versionInfo[1] = (uint8_t)cmbVersion;
7639     versionInfo[2] = coll->image->version[1];
7640     if(coll->UCA) {
7641         versionInfo[3] = coll->UCA->image->UCAVersion[0];
7642     } else {
7643         versionInfo[3] = 0;
7644     }
7645 }
7646
7647
7648 /* This internal API checks whether a character is tailored or not */
7649 U_CAPI UBool  U_EXPORT2
7650 ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) {
7651   uint32_t CE = UCOL_NOT_FOUND;
7652   const UChar *ContractionStart = NULL;
7653   if(U_SUCCESS(*status) && coll != NULL) {
7654     if(coll == coll->UCA) {
7655       return FALSE;
7656     } else if(u < 0x100) { /* latin-1 */
7657       CE = coll->latinOneMapping[u];
7658       if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) {
7659         return FALSE;
7660       }
7661     } else { /* regular */
7662       /*CE = ucmpe32_get(coll->mapping, u);*/
7663       CE = UTRIE_GET32_FROM_LEAD(coll->mapping, u);
7664
7665     }
7666
7667     if(isContraction(CE)) {
7668       ContractionStart = (UChar *)coll->image+getContractOffset(CE);
7669       CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex));
7670     }
7671
7672     if(CE == UCOL_NOT_FOUND) {
7673       return FALSE;
7674     } else {
7675       return TRUE;
7676     }
7677   } else {
7678     return FALSE;
7679   }
7680 }
7681
7682
7683 /****************************************************************************/
7684 /* Following are the string compare functions                               */
7685 /*                                                                          */
7686 /****************************************************************************/
7687
7688
7689 /*  ucol_checkIdent    internal function.  Does byte level string compare.   */
7690 /*                     Used by strcoll if strength == identical and strings  */
7691 /*                     are otherwise equal.  Moved out-of-line because this  */
7692 /*                     is a rare case.                                       */
7693 /*                                                                           */
7694 /*                     Comparison must be done on NFD normalized strings.    */
7695 /*                     FCD is not good enough.                               */
7696 /*                                                                           */
7697 /*      TODO:  make an incremental NFD Comparison function, which could      */
7698 /*             be of general use                                             */
7699
7700 static
7701 UCollationResult    ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status)
7702 {
7703
7704   // TODO: When we have an UChar iterator, we need to access the whole string. One
7705   // useful modification would be a UChar iterator extract API, since reset next next...
7706   // is not optimal.
7707   // TODO: Handle long strings. Do the same in compareUsingSortKeys.
7708
7709   // When we arrive here, we can have normal strings or UCharIterators. Currently they are both
7710   // of same type, but that doesn't really mean that it will stay that way.
7711
7712     // The division for the array length may truncate the array size to
7713     // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
7714     // for all platforms anyway.
7715     UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7716     UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
7717     //UChar sStackBuf[256], tStackBuf[256];
7718     //int32_t sBufSize = 256, tBufSize = 256;
7719     int32_t            comparison;
7720     int32_t          sLen        = 0;
7721     UChar            *sBuf       = NULL;
7722     int32_t          tLen        = 0;
7723     UChar            *tBuf       = NULL;
7724     UBool freeSBuf = FALSE, freeTBuf = FALSE;
7725
7726     if (sColl->flags & UCOL_USE_ITERATOR) {
7727       UNormIterator *sNIt = NULL, *tNIt = NULL;
7728       sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
7729       tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
7730       sColl->iterator->move(sColl->iterator, 0, UITER_START);
7731       tColl->iterator->move(tColl->iterator, 0, UITER_START);
7732       UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status);
7733       UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status);
7734       comparison = u_strCompareIter(sIt, tIt, TRUE);
7735       unorm_closeIter(sNIt);
7736       unorm_closeIter(tNIt);
7737     } else {
7738       sLen        = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1;
7739       sBuf = sColl->string;
7740       tLen        = (tColl->flags & UCOL_ITER_HASLEN) ? tColl->endp - tColl->string : -1;
7741       tBuf = tColl->string;
7742
7743       if (normalize) {
7744           *status = U_ZERO_ERROR;
7745           if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) {
7746               sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7747                                      sBuf, sLen,
7748                                      FALSE, 0,
7749                                      status);
7750               if(*status == U_BUFFER_OVERFLOW_ERROR) {
7751                   if(!u_growBufferFromStatic(sColl->stackWritableBuffer,
7752                                              &sColl->writableBuffer,
7753                                              (int32_t *)&sColl->writableBufSize, sLen,
7754                                              0)
7755                   ) {
7756                       *status = U_MEMORY_ALLOCATION_ERROR;
7757                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7758                   }
7759                   *status = U_ZERO_ERROR;
7760                   sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize,
7761                                          sBuf, sLen,
7762                                          FALSE, 0,
7763                                          status);
7764               }
7765               if(freeSBuf) {
7766                 uprv_free(sBuf);
7767                 freeSBuf = FALSE;
7768               }
7769               sBuf = sColl->writableBuffer;
7770               if (sBuf != sColl->stackWritableBuffer) {
7771                   sColl->flags |= UCOL_ITER_ALLOCATED;
7772               }
7773           }
7774
7775           *status = U_ZERO_ERROR;
7776           if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) {
7777               tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7778                                      tBuf, tLen,
7779                                      FALSE, 0,
7780                                      status);
7781               if(*status == U_BUFFER_OVERFLOW_ERROR) {
7782                   if(!u_growBufferFromStatic(tColl->stackWritableBuffer,
7783                                              &tColl->writableBuffer,
7784                                              (int32_t *)&tColl->writableBufSize, tLen,
7785                                              0)
7786                   ) {
7787                       *status = U_MEMORY_ALLOCATION_ERROR;
7788                       return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */
7789                   }
7790                   *status = U_ZERO_ERROR;
7791                   tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize,
7792                                          tBuf, tLen,
7793                                          FALSE, 0,
7794                                          status);
7795               }
7796               if(freeTBuf) {
7797                 uprv_free(tBuf);
7798                 freeTBuf = FALSE;
7799               }
7800               tBuf = tColl->writableBuffer;
7801               if (tBuf != tColl->stackWritableBuffer) {
7802                   tColl->flags |= UCOL_ITER_ALLOCATED;
7803               }
7804           }
7805       }
7806
7807       if (sLen == -1 && tLen == -1) {
7808           comparison = u_strcmpCodePointOrder(sBuf, tBuf);
7809       } else {
7810           if (sLen == -1) {
7811               sLen = u_strlen(sBuf);
7812           }
7813           if (tLen == -1) {
7814               tLen = u_strlen(tBuf);
7815           }
7816           comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen));
7817           if (comparison == 0) {
7818               comparison = sLen - tLen;
7819           }
7820       }
7821     }
7822
7823     if (comparison < 0) {
7824         return UCOL_LESS;
7825     } else if (comparison == 0) {
7826         return UCOL_EQUAL;
7827     } else /* comparison > 0 */ {
7828         return UCOL_GREATER;
7829     }
7830 }
7831
/*  CEBuf - A struct and some inline functions to handle the saving    */
/*          of CEs in a buffer within ucol_strcoll                     */

/* Number of CEs that fit into the array embedded in a ucol_CEBuf. */
#define UCOL_CEBUF_SIZE 512
typedef struct ucol_CEBuf {
    uint32_t    *buf;     /* start of the storage in use: localArray, or a heap block set by ucol_CEBuf_Expand */
    uint32_t    *endp;    /* one past the last slot of that storage */
    uint32_t    *pos;     /* slot that the next UCOL_CEBUF_PUT writes */
    uint32_t     localArray[UCOL_CEBUF_SIZE];   /* storage used after UCOL_INIT_CEBUF */
} ucol_CEBuf;
7842
7843
7844 static
7845 inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) {
7846     (b)->buf = (b)->pos = (b)->localArray;
7847     (b)->endp = (b)->buf + UCOL_CEBUF_SIZE;
7848 };
7849
7850 static
7851 void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci) {
7852     uint32_t  oldSize;
7853     uint32_t  newSize;
7854     uint32_t  *newBuf;
7855
7856     ci->flags |= UCOL_ITER_ALLOCATED;
7857     oldSize = b->pos - b->buf;
7858     newSize = oldSize * 2;
7859     newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t));
7860     if(newBuf != NULL) {
7861       uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t));
7862       if (b->buf != b->localArray) {
7863           uprv_free(b->buf);
7864       }
7865       b->buf = newBuf;
7866       b->endp = b->buf + newSize;
7867       b->pos  = b->buf + oldSize;
7868     }
7869 }
7870
7871 static
7872 inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci) {
7873     if (b->pos == b->endp) {
7874         ucol_CEBuf_Expand(b, ci);
7875 }
7876     *(b)->pos++ = ce;
7877 };
7878
7879 /* This is a trick string compare function that goes in and uses sortkeys to compare */
7880 /* It is used when compare gets in trouble and needs to bail out                     */
7881 static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl,
7882                                                   collIterate *tColl)
7883 {
7884     uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER];
7885     uint8_t *sourceKeyP = sourceKey;
7886     uint8_t *targetKeyP = targetKey;
7887     int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER;
7888     const UCollator *coll = sColl->coll;
7889     UChar *source = NULL;
7890     UChar *target = NULL;
7891     UChar sStackBuf[256], tStackBuf[256];
7892     int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7893     int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7894
7895     // TODO: Handle long strings. Do the same in ucol_checkIdent.
7896     if(sColl->flags & UCOL_USE_ITERATOR) {
7897       sColl->iterator->move(sColl->iterator, 0, UITER_START);
7898       tColl->iterator->move(tColl->iterator, 0, UITER_START);
7899       source = sStackBuf;
7900       UChar *sBufp = source;
7901       target = tStackBuf;
7902       UChar *tBufp = target;
7903       while(sColl->iterator->hasNext(sColl->iterator)) {
7904         *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator);
7905       }
7906       while(tColl->iterator->hasNext(tColl->iterator)) {
7907         *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator);
7908       }
7909       sourceLength = sBufp - source;
7910       targetLength = tBufp - target;
7911     } else { // no iterators
7912       sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1;
7913       targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1;
7914       source = sColl->string;
7915       target = tColl->string;
7916     }
7917
7918
7919
7920     sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7921     if(sourceKeyLen > UCOL_MAX_BUFFER) {
7922         sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t));
7923         if(sourceKeyP != NULL) {
7924           sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen);
7925         }
7926     }
7927
7928     targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7929     if(targetKeyLen > UCOL_MAX_BUFFER) {
7930         targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t));
7931         if(targetKeyP != NULL) {
7932           targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen);
7933         }
7934     }
7935
7936     int32_t result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP);
7937
7938     if(sourceKeyP != sourceKey) {
7939         uprv_free(sourceKeyP);
7940     }
7941
7942     if(targetKeyP != targetKey) {
7943         uprv_free(targetKeyP);
7944     }
7945
7946     if(result<0) {
7947         return UCOL_LESS;
7948     } else if(result>0) {
7949         return UCOL_GREATER;
7950     } else {
7951         return UCOL_EQUAL;
7952     }
7953 }
7954
7955
7956 static inline UCollationResult
7957 ucol_strcollRegular( collIterate *sColl, collIterate *tColl,
7958 //              const UCollator    *coll,
7959 //              const UChar        *source,
7960 //              int32_t            sourceLength,
7961 //              const UChar        *target,
7962 //              int32_t            targetLength,
7963               UErrorCode *status)
7964 {
7965     U_ALIGN_CODE(16);
7966
7967     const UCollator *coll = sColl->coll;
7968
7969
7970     // setting up the collator parameters
7971     UColAttributeValue strength = coll->strength;
7972     UBool initialCheckSecTer = (strength  >= UCOL_SECONDARY);
7973
7974     UBool checkSecTer = initialCheckSecTer;
7975     UBool checkTertiary = (strength  >= UCOL_TERTIARY);
7976     UBool checkQuad = (strength  >= UCOL_QUATERNARY);
7977     UBool checkIdent = (strength == UCOL_IDENTICAL);
7978     UBool checkCase = (coll->caseLevel == UCOL_ON);
7979     UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer;
7980     UBool shifted = (coll->alternateHandling == UCOL_SHIFTED);
7981     UBool qShifted = shifted && checkQuad;
7982     UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad;
7983
7984     if(doHiragana && shifted) {
7985       return (ucol_compareUsingSortKeys(sColl, tColl));
7986     }
7987     uint8_t caseSwitch = coll->caseSwitch;
7988     uint8_t tertiaryMask = coll->tertiaryMask;
7989
7990     // This is the lowest primary value that will not be ignored if shifted
7991     uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0;
7992
7993     UCollationResult result = UCOL_EQUAL;
7994     UCollationResult hirResult = UCOL_EQUAL;
7995
7996     // Preparing the CE buffers. They will be filled during the primary phase
7997     ucol_CEBuf   sCEs;
7998     ucol_CEBuf   tCEs;
7999     UCOL_INIT_CEBUF(&sCEs);
8000     UCOL_INIT_CEBUF(&tCEs);
8001
8002     uint32_t secS = 0, secT = 0;
8003     uint32_t sOrder=0, tOrder=0;
8004
8005     // Non shifted primary processing is quite simple
8006     if(!shifted) {
8007       for(;;) {
8008
8009         // We fetch CEs until we hit a non ignorable primary or end.
8010         do {
8011           // We get the next CE
8012           sOrder = ucol_IGetNextCE(coll, sColl, status);
8013           // Stuff it in the buffer
8014           UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8015           // And keep just the primary part.
8016           sOrder &= UCOL_PRIMARYMASK;
8017         } while(sOrder == 0);
8018
8019         // see the comments on the above block
8020         do {
8021           tOrder = ucol_IGetNextCE(coll, tColl, status);
8022           UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8023           tOrder &= UCOL_PRIMARYMASK;
8024         } while(tOrder == 0);
8025
8026         // if both primaries are the same
8027         if(sOrder == tOrder) {
8028             // and there are no more CEs, we advance to the next level
8029             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
8030               break;
8031             }
8032             if(doHiragana && hirResult == UCOL_EQUAL) {
8033               if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) {
8034                 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA))
8035                   ? UCOL_LESS:UCOL_GREATER;
8036               }
8037             }
8038         } else {
8039             // if two primaries are different, we are done
8040             result = (sOrder < tOrder) ?  UCOL_LESS: UCOL_GREATER;
8041             goto commonReturn;
8042         }
8043       } // no primary difference... do the rest from the buffers
8044     } else { // shifted - do a slightly more complicated processing :)
8045       for(;;) {
8046         UBool sInShifted = FALSE;
8047         UBool tInShifted = FALSE;
8048         // This version of code can be refactored. However, it seems easier to understand this way.
8049         // Source loop. Sam as the target loop.
8050         for(;;) {
8051           sOrder = ucol_IGetNextCE(coll, sColl, status);
8052           if(sOrder == UCOL_NO_MORE_CES) {
8053             UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8054             break;
8055           } else if(sOrder == 0
8056             || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) {
8057             /* UCA amendment - ignore ignorables that follow shifted code points */
8058             continue;
8059           } else if(isContinuation(sOrder)) {
8060             if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
8061               if(sInShifted) {
8062                 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
8063                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8064                 continue;
8065               } else {
8066                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8067                 break;
8068               }
8069             } else { /* Just lower level values */
8070               if(sInShifted) {
8071                 continue;
8072               } else {
8073                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8074                 continue;
8075               }
8076             }
8077           } else { /* regular */
8078             if((sOrder & UCOL_PRIMARYMASK) > LVT) {
8079               UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8080               break;
8081             } else {
8082               if((sOrder & UCOL_PRIMARYMASK) > 0) {
8083                 sInShifted = TRUE;
8084                 sOrder &= UCOL_PRIMARYMASK;
8085                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8086                 continue;
8087               } else {
8088                 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl);
8089                 sInShifted = FALSE;
8090                 continue;
8091               }
8092             }
8093           }
8094         }
8095         sOrder &= UCOL_PRIMARYMASK;
8096         sInShifted = FALSE;
8097
8098         for(;;) {
8099           tOrder = ucol_IGetNextCE(coll, tColl, status);
8100           if(tOrder == UCOL_NO_MORE_CES) {
8101             UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8102             break;
8103           } else if(tOrder == 0
8104             || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) {
8105             /* UCA amendment - ignore ignorables that follow shifted code points */
8106             continue;
8107           } else if(isContinuation(tOrder)) {
8108             if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */
8109               if(tInShifted) {
8110                 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */
8111                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8112                 continue;
8113               } else {
8114                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8115                 break;
8116               }
8117             } else { /* Just lower level values */
8118               if(tInShifted) {
8119                 continue;
8120               } else {
8121                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8122                 continue;
8123               }
8124             }
8125           } else { /* regular */
8126             if((tOrder & UCOL_PRIMARYMASK) > LVT) {
8127               UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8128               break;
8129             } else {
8130               if((tOrder & UCOL_PRIMARYMASK) > 0) {
8131                 tInShifted = TRUE;
8132                 tOrder &= UCOL_PRIMARYMASK;
8133                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8134                 continue;
8135               } else {
8136                 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl);
8137                 tInShifted = FALSE;
8138                 continue;
8139               }
8140             }
8141           }
8142         }
8143         tOrder &= UCOL_PRIMARYMASK;
8144         tInShifted = FALSE;
8145
8146         if(sOrder == tOrder) {
8147           /*
8148             if(doHiragana && hirResult == UCOL_EQUAL) {
8149               if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) {
8150                 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA))
8151                   ? UCOL_LESS:UCOL_GREATER;
8152               }
8153             }
8154           */
8155             if(sOrder == UCOL_NO_MORE_CES_PRIMARY) {
8156               break;
8157             } else {
8158               sOrder = 0; tOrder = 0;
8159               continue;
8160             }
8161         } else {
8162             result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
8163             goto commonReturn;
8164         }
8165       } /* no primary difference... do the rest from the buffers */
8166     }
8167
8168     /* now, we're gonna reexamine collected CEs */
8169     uint32_t    *sCE;
8170     uint32_t    *tCE;
8171
8172     /* This is the secondary level of comparison */
8173     if(checkSecTer) {
8174       if(!isFrenchSec) { /* normal */
8175         sCE = sCEs.buf;
8176         tCE = tCEs.buf;
8177         for(;;) {
8178           while (secS == 0) {
8179             secS = *(sCE++) & UCOL_SECONDARYMASK;
8180           }
8181
8182           while(secT == 0) {
8183               secT = *(tCE++) & UCOL_SECONDARYMASK;
8184           }
8185
8186           if(secS == secT) {
8187             if(secS == UCOL_NO_MORE_CES_SECONDARY) {
8188               break;
8189             } else {
8190               secS = 0; secT = 0;
8191               continue;
8192             }
8193           } else {
8194                result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8195                goto commonReturn;
8196           }
8197         }
8198       } else { /* do the French */
8199         uint32_t *sCESave = NULL;
8200         uint32_t *tCESave = NULL;
8201         sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */
8202         tCE = tCEs.pos-2;
8203         for(;;) {
8204           while (secS == 0 && sCE >= sCEs.buf) {
8205             if(sCESave == 0) {
8206               secS = *(sCE--);
8207               if(isContinuation(secS)) {
8208                 while(isContinuation(secS = *(sCE--)));
8209                 /* after this, secS has the start of continuation, and sCEs points before that */
8210                 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */
8211                 sCE+=2;  /* need to point to the first continuation CP */
8212                 /* However, now you can just continue doing stuff */
8213               }
8214             } else {
8215               secS = *(sCE++);
8216               if(!isContinuation(secS)) { /* This means we have finished with this cont */
8217                 sCE = sCESave;            /* reset the pointer to before continuation */
8218                 sCESave = 0;
8219                 continue;
8220               }
8221             }
8222             secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */
8223           }
8224
8225           while(secT == 0 && tCE >= tCEs.buf) {
8226             if(tCESave == 0) {
8227               secT = *(tCE--);
8228               if(isContinuation(secT)) {
8229                 while(isContinuation(secT = *(tCE--)));
8230                 /* after this, secS has the start of continuation, and sCEs points before that */
8231                 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */
8232                 tCE+=2;  /* need to point to the first continuation CP */
8233                 /* However, now you can just continue doing stuff */
8234               }
8235             } else {
8236               secT = *(tCE++);
8237               if(!isContinuation(secT)) { /* This means we have finished with this cont */
8238                 tCE = tCESave;          /* reset the pointer to before continuation */
8239                 tCESave = 0;
8240                 continue;
8241               }
8242             }
8243             secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */
8244           }
8245
8246           if(secS == secT) {
8247             if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) {
8248               break;
8249             } else {
8250               secS = 0; secT = 0;
8251               continue;
8252             }
8253           } else {
8254               result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8255               goto commonReturn;
8256           }
8257         }
8258       }
8259     }
8260
8261     /* doing the case bit */
8262     if(checkCase) {
8263       sCE = sCEs.buf;
8264       tCE = tCEs.buf;
8265       for(;;) {
8266         while((secS & UCOL_REMOVE_CASE) == 0) {
8267           if(!isContinuation(*sCE++)) {
8268             secS =*(sCE-1) & UCOL_TERT_CASE_MASK;
8269             secS ^= caseSwitch;
8270           } else {
8271             secS = 0;
8272           }
8273         }
8274
8275         while((secT & UCOL_REMOVE_CASE) == 0) {
8276           if(!isContinuation(*tCE++)) {
8277             secT = *(tCE-1) & UCOL_TERT_CASE_MASK;
8278             secT ^= caseSwitch;
8279           } else {
8280             secT = 0;
8281           }
8282         }
8283
8284         if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) {
8285           result = UCOL_LESS;
8286           goto commonReturn;
8287         } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) {
8288           result = UCOL_GREATER;
8289           goto commonReturn;
8290         }
8291
8292         if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) {
8293           break;
8294         } else {
8295           secS = 0;
8296           secT = 0;
8297         }
8298       }
8299     }
8300
8301     /* Tertiary level */
8302     if(checkTertiary) {
8303       secS = 0;
8304       secT = 0;
8305       sCE = sCEs.buf;
8306       tCE = tCEs.buf;
8307       for(;;) {
8308         while((secS & UCOL_REMOVE_CASE) == 0) {
8309           secS = *(sCE++) & tertiaryMask;
8310           if(!isContinuation(secS)) {
8311             secS ^= caseSwitch;
8312           } else {
8313             secS &= UCOL_REMOVE_CASE;
8314           }
8315         }
8316
8317         while((secT & UCOL_REMOVE_CASE)  == 0) {
8318           secT = *(tCE++) & tertiaryMask;
8319           if(!isContinuation(secT)) {
8320             secT ^= caseSwitch;
8321           } else {
8322             secT &= UCOL_REMOVE_CASE;
8323           }
8324         }
8325
8326         if(secS == secT) {
8327           if((secS & UCOL_REMOVE_CASE) == 1) {
8328             break;
8329           } else {
8330             secS = 0; secT = 0;
8331             continue;
8332           }
8333         } else {
8334             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8335             goto commonReturn;
8336         }
8337       }
8338     }
8339
8340
8341     if(qShifted /*checkQuad*/) {
8342       UBool sInShifted = TRUE;
8343       UBool tInShifted = TRUE;
8344       secS = 0;
8345       secT = 0;
8346       sCE = sCEs.buf;
8347       tCE = tCEs.buf;
8348       for(;;) {
8349         while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) {
8350           secS = *(sCE++);
8351           if(isContinuation(secS)) {
8352             if(!sInShifted) {
8353               continue;
8354             }
8355           } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */
8356             secS = UCOL_PRIMARYMASK;
8357             sInShifted = FALSE;
8358           } else {
8359             sInShifted = TRUE;
8360           }
8361         }
8362         secS &= UCOL_PRIMARYMASK;
8363
8364
8365         while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) {
8366           secT = *(tCE++);
8367           if(isContinuation(secT)) {
8368             if(!tInShifted) {
8369               continue;
8370             }
8371           } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) {
8372             secT = UCOL_PRIMARYMASK;
8373             tInShifted = FALSE;
8374           } else {
8375             tInShifted = TRUE;
8376           }
8377         }
8378         secT &= UCOL_PRIMARYMASK;
8379
8380         if(secS == secT) {
8381           if(secS == UCOL_NO_MORE_CES_PRIMARY) {
8382             break;
8383           } else {
8384             secS = 0; secT = 0;
8385             continue;
8386           }
8387         } else {
8388             result = (secS < secT) ? UCOL_LESS : UCOL_GREATER;
8389             goto commonReturn;
8390         }
8391       }
8392     } else if(doHiragana && hirResult != UCOL_EQUAL) {
8393       // If we're fine on quaternaries, we might be different
8394       // on Hiragana. This, however, might fail us in shifted.
8395       result = hirResult;
8396       goto commonReturn;
8397     }
8398
8399     /*  For IDENTICAL comparisons, we use a bitwise character comparison */
8400     /*  as a tiebreaker if all else is equal.                                */
8401     /*  Getting here  should be quite rare - strings are not identical -     */
8402     /*     that is checked first, but compared == through all other checks.  */
8403     if(checkIdent)
8404     {
8405         //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON);
8406         result = ucol_checkIdent(sColl, tColl, TRUE, status);
8407     }
8408
8409 commonReturn:
8410     if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) {
8411         freeHeapWritableBuffer(sColl);
8412         freeHeapWritableBuffer(tColl);
8413
8414         if (sCEs.buf != sCEs.localArray ) {
8415             uprv_free(sCEs.buf);
8416         }
8417         if (tCEs.buf != tCEs.localArray ) {
8418             uprv_free(tCEs.buf);
8419         }
8420     }
8421
8422     return result;
8423 }
8424
8425
8426 static inline uint32_t
8427 ucol_getLatinOneContraction(const UCollator *coll, int32_t strength,
8428                           uint32_t CE, const UChar *s, int32_t *index, int32_t len) {
8429   const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF);
8430   int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
8431   int32_t offset = 1;
8432   UChar schar = 0, tchar = 0;
8433
8434   for(;;) {
8435     if(len == -1) {
8436       if(s[*index] == 0) { // end of string
8437         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8438       } else {
8439         schar = s[*index];
8440       }
8441     } else {
8442       if(*index == len) {
8443         return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8444       } else {
8445         schar = s[*index];
8446       }
8447     }
8448
8449     while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
8450       offset++;
8451     }
8452
8453     if (schar == tchar) {
8454       (*index)++;
8455       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]);
8456     }
8457     else
8458     {
8459       if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
8460         return UCOL_BAIL_OUT_CE;
8461       }
8462       // skip completely ignorables
8463       uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
8464       if(isZeroCE == 0) { // we have to ignore completely ignorables
8465         (*index)++;
8466         continue;
8467       }
8468
8469       return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]);
8470     }
8471   }
8472 }
8473
8474
8475 /**
8476  * This is a fast strcoll, geared towards text in Latin-1.
8477  * It supports contractions of size two, French secondaries
8478  * and case switching. You can use it with strengths primary
8479  * to tertiary. It does not support shifted and case level.
8480  * It relies on the table build by setupLatin1Table. If it
8481  * doesn't understand something, it will go to the regular
8482  * strcoll.
8483  */
8484 static inline UCollationResult
8485 ucol_strcollUseLatin1( const UCollator    *coll,
8486               const UChar        *source,
8487               int32_t            sLen,
8488               const UChar        *target,
8489               int32_t            tLen,
8490               UErrorCode *status)
8491 {
8492     U_ALIGN_CODE(16);
8493     int32_t strength = coll->strength;
8494
8495     int32_t sIndex = 0, tIndex = 0;
8496     UChar sChar = 0, tChar = 0;
8497     uint32_t sOrder=0, tOrder=0;
8498
8499     UBool endOfSource = FALSE, endOfTarget = FALSE;
8500
8501     uint32_t *elements = coll->latinOneCEs;
8502
8503     UBool haveContractions = FALSE; // if we have contractions in our string
8504                                     // we cannot do French secondary
8505
8506     // Do the primary level
8507     for(;;) {
8508       while(sOrder==0) { // this loop skips primary ignorables
8509         // sOrder=getNextlatinOneCE(source);
8510         if(sLen==-1) {   // handling zero terminated strings
8511           sChar=source[sIndex++];
8512           if(sChar==0) {
8513             endOfSource = TRUE;
8514             break;
8515           }
8516         } else {        // handling strings with known length
8517           if(sIndex==sLen) {
8518             endOfSource = TRUE;
8519             break;
8520           }
8521           sChar=source[sIndex++];
8522         }
8523         if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8524           //fprintf(stderr, "R");
8525           goto returnRegular;
8526           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8527         }
8528         sOrder = elements[sChar];
8529         if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
8530           // specials can basically be either contractions or bail-out signs. If we get anything
8531           // else, we'll bail out anywasy
8532           if(getCETag(sOrder) == CONTRACTION_TAG) {
8533             sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
8534             haveContractions = TRUE; // if there are contractions, we cannot do French secondary
8535             // However, if there are contractions in the table, but we always use just one char,
8536             // we might be able to do French. This should be checked out.
8537           }
8538           if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8539             //fprintf(stderr, "S");
8540             goto returnRegular;
8541             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8542           }
8543         }
8544       }
8545
8546       while(tOrder==0) {  // this loop skips primary ignorables
8547         // tOrder=getNextlatinOneCE(target);
8548         if(tLen==-1) {    // handling zero terminated strings
8549           tChar=target[tIndex++];
8550           if(tChar==0) {
8551             if(endOfSource) { // this is different than source loop,
8552               // as we already know that source loop is done here,
8553               // so we can either finish the primary loop if both
8554               // strings are done or anounce the result if only
8555               // target is done. Same below.
8556               goto endOfPrimLoop;
8557             } else {
8558               return UCOL_GREATER;
8559             }
8560           }
8561         } else {          // handling strings with known length
8562           if(tIndex==tLen) {
8563             if(endOfSource) {
8564               goto endOfPrimLoop;
8565             } else {
8566               return UCOL_GREATER;
8567             }
8568           }
8569           tChar=target[tIndex++];
8570         }
8571         if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32)
8572           //fprintf(stderr, "R");
8573           goto returnRegular;
8574           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8575         }
8576         tOrder = elements[tChar];
8577         if(tOrder >= UCOL_NOT_FOUND) {
8578           // Handling specials, see the comments for source
8579           if(getCETag(tOrder) == CONTRACTION_TAG) {
8580             tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
8581             haveContractions = TRUE;
8582           }
8583           if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
8584             //fprintf(stderr, "S");
8585             goto returnRegular;
8586             //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8587           }
8588         }
8589       }
8590       if(endOfSource) { // source is finished, but target is not, say the result.
8591           return UCOL_LESS;
8592       }
8593
8594       if(sOrder == tOrder) { // if we have same CEs, we continue the loop
8595         sOrder = 0; tOrder = 0;
8596         continue;
8597       } else {
8598         // compare current top bytes
8599         if(((sOrder^tOrder)&0xFF000000)!=0) {
8600           // top bytes differ, return difference
8601           if(sOrder < tOrder) {
8602             return UCOL_LESS;
8603           } else if(sOrder > tOrder) {
8604             return UCOL_GREATER;
8605           }
8606           // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24);
8607           // since we must return enum value
8608         }
8609
8610         // top bytes match, continue with following bytes
8611         sOrder<<=8;
8612         tOrder<<=8;
8613       }
8614     }
8615
8616 endOfPrimLoop:
8617     // after primary loop, we definitely know the sizes of strings,
8618     // so we set it and use simpler loop for secondaries and tertiaries
8619     sLen = sIndex; tLen = tIndex;
8620     if(strength >= UCOL_SECONDARY) {
8621       // adjust the table beggining
8622       elements += coll->latinOneTableLen;
8623       endOfSource = FALSE; endOfTarget = FALSE;
8624
8625       if(coll->frenchCollation == UCOL_OFF) { // non French
8626         // This loop is a simplified copy of primary loop
8627         // at this point we know that whole strings are latin-1, so we don't
8628         // check for that. We also know that we only have contractions as
8629         // specials.
8630         sIndex = 0; tIndex = 0;
8631         for(;;) {
8632           while(sOrder==0) {
8633             if(sIndex==sLen) {
8634               endOfSource = TRUE;
8635               break;
8636             }
8637             sChar=source[sIndex++];
8638             sOrder = elements[sChar];
8639             if(sOrder > UCOL_NOT_FOUND) {
8640               sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
8641             }
8642           }
8643
8644           while(tOrder==0) {
8645             if(tIndex==tLen) {
8646               if(endOfSource) {
8647                 goto endOfSecLoop;
8648               } else {
8649                 return UCOL_GREATER;
8650               }
8651             }
8652             tChar=target[tIndex++];
8653             tOrder = elements[tChar];
8654             if(tOrder > UCOL_NOT_FOUND) {
8655               tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
8656             }
8657           }
8658           if(endOfSource) {
8659               return UCOL_LESS;
8660           }
8661
8662           if(sOrder == tOrder) {
8663             sOrder = 0; tOrder = 0;
8664             continue;
8665           } else {
8666             // see primary loop for comments on this
8667             if(((sOrder^tOrder)&0xFF000000)!=0) {
8668               if(sOrder < tOrder) {
8669                 return UCOL_LESS;
8670               } else if(sOrder > tOrder) {
8671                 return UCOL_GREATER;
8672               }
8673             }
8674             sOrder<<=8;
8675             tOrder<<=8;
8676           }
8677         }
8678       } else { // French
8679         if(haveContractions) { // if we have contractions, we have to bail out
8680           // since we don't really know how to handle them here
8681           goto returnRegular;
8682           //return ucol_strcollRegular(coll, source, sLen, target, tLen, status);
8683         }
8684         // For French, we go backwards
8685         sIndex = sLen; tIndex = tLen;
8686         for(;;) {
8687           while(sOrder==0) {
8688             if(sIndex==0) {
8689               endOfSource = TRUE;
8690               break;
8691             }
8692             sChar=source[--sIndex];
8693             sOrder = elements[sChar];
8694             // don't even look for contractions
8695           }
8696
8697           while(tOrder==0) {
8698             if(tIndex==0) {
8699               if(endOfSource) {
8700                 goto endOfSecLoop;
8701               } else {
8702                 return UCOL_GREATER;
8703               }
8704             }
8705             tChar=target[--tIndex];
8706             tOrder = elements[tChar];
8707             // don't even look for contractions
8708           }
8709           if(endOfSource) {
8710               return UCOL_LESS;
8711           }
8712
8713           if(sOrder == tOrder) {
8714             sOrder = 0; tOrder = 0;
8715             continue;
8716           } else {
8717             // see the primary loop for comments
8718             if(((sOrder^tOrder)&0xFF000000)!=0) {
8719               if(sOrder < tOrder) {
8720                 return UCOL_LESS;
8721               } else if(sOrder > tOrder) {
8722                 return UCOL_GREATER;
8723               }
8724             }
8725             sOrder<<=8;
8726             tOrder<<=8;
8727           }
8728         }
8729       }
8730     }
8731
8732 endOfSecLoop:
8733     if(strength >= UCOL_TERTIARY) {
8734       // tertiary loop is the same as secondary (except no French)
8735       elements += coll->latinOneTableLen;
8736       sIndex = 0; tIndex = 0;
8737       endOfSource = FALSE; endOfTarget = FALSE;
8738       for(;;) {
8739         while(sOrder==0) {
8740           if(sIndex==sLen) {
8741             endOfSource = TRUE;
8742             break;
8743           }
8744           sChar=source[sIndex++];
8745           sOrder = elements[sChar];
8746           if(sOrder > UCOL_NOT_FOUND) {
8747             sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
8748           }
8749         }
8750         while(tOrder==0) {
8751           if(tIndex==tLen) {
8752             if(endOfSource) {
8753               return UCOL_EQUAL; // if both strings are at the end, they are equal
8754             } else {
8755               return UCOL_GREATER;
8756             }
8757           }
8758           tChar=target[tIndex++];
8759           tOrder = elements[tChar];
8760           if(tOrder > UCOL_NOT_FOUND) {
8761             tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
8762           }
8763         }
8764         if(endOfSource) {
8765             return UCOL_LESS;
8766         }
8767         if(sOrder == tOrder) {
8768           sOrder = 0; tOrder = 0;
8769           continue;
8770         } else {
8771           if(((sOrder^tOrder)&0xff000000)!=0) {
8772             if(sOrder < tOrder) {
8773               return UCOL_LESS;
8774             } else if(sOrder > tOrder) {
8775               return UCOL_GREATER;
8776             }
8777           }
8778           sOrder<<=8;
8779           tOrder<<=8;
8780         }
8781       }
8782     }
8783     return UCOL_EQUAL;
8784
8785 returnRegular:
8786     // Preparing the context objects for iterating over strings
8787     collIterate sColl, tColl;
8788
8789     IInit_collIterate(coll, source, sLen, &sColl);
8790     IInit_collIterate(coll, target, tLen, &tColl);
8791     return ucol_strcollRegular(&sColl, &tColl, status);
8792 }
8793
8794
8795 U_CAPI UCollationResult U_EXPORT2
8796 ucol_strcollIter( const UCollator    *coll,
8797                  UCharIterator *sIter,
8798                  UCharIterator *tIter,
8799                  UErrorCode         *status) {
8800   if(!status || U_FAILURE(*status)) {
8801     return UCOL_EQUAL;
8802   }
8803
8804   UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER);
8805   UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter);
8806
8807   if (sIter == tIter) {
8808     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8809     return UCOL_EQUAL;
8810   }
8811   if(sIter == NULL || tIter == NULL || coll == NULL) {
8812     *status = U_ILLEGAL_ARGUMENT_ERROR;
8813     UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status)
8814     return UCOL_EQUAL;
8815   }
8816
8817   UCollationResult result = UCOL_EQUAL;
8818
8819   // Preparing the context objects for iterating over strings
8820   collIterate sColl, tColl;
8821   // The division for the array length may truncate the array size to
8822   // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
8823   // for all platforms anyway.
8824   UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8825   UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
8826   UNormIterator *sNormIter = NULL, *tNormIter = NULL;
8827
8828   IInit_collIterate(coll, NULL, -1, &sColl);
8829   sColl.iterator = sIter;
8830   sColl.flags |= UCOL_USE_ITERATOR;
8831   IInit_collIterate(coll, NULL, -1, &tColl);
8832   tColl.flags |= UCOL_USE_ITERATOR;
8833   tColl.iterator = tIter;
8834
8835   if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
8836     sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
8837     sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status);
8838     sColl.flags &= ~UCOL_ITER_NORM;
8839
8840     tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
8841     tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status);
8842     tColl.flags &= ~UCOL_ITER_NORM;
8843   }
8844
8845   UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL;
8846
8847   while((sChar = sColl.iterator->next(sColl.iterator)) ==
8848     (tChar = tColl.iterator->next(tColl.iterator))) {
8849     if(UCOL_ISTHAIPREVOWEL(sChar)) {
8850       break;
8851     }
8852     if(sChar == U_SENTINEL) {
8853       result = UCOL_EQUAL;
8854       goto end_compare;
8855     }
8856   }
8857
8858   if(sChar == U_SENTINEL) {
8859     tChar = tColl.iterator->previous(tColl.iterator);
8860   }
8861
8862   if(tChar == U_SENTINEL) {
8863     sChar = sColl.iterator->previous(sColl.iterator);
8864   }
8865
8866   sChar = sColl.iterator->previous(sColl.iterator);
8867   tChar = tColl.iterator->previous(tColl.iterator);
8868
8869   if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll))
8870   {
8871       // We are stopped in the middle of a contraction.
8872       // Scan backwards through the == part of the string looking for the start of the contraction.
8873       //   It doesn't matter which string we scan, since they are the same in this region.
8874       do
8875       {
8876         sChar = sColl.iterator->previous(sColl.iterator);
8877         tChar = tColl.iterator->previous(tColl.iterator);
8878       }
8879       while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll));
8880   }
8881
8882
8883   if(U_SUCCESS(*status)) {
8884     result = ucol_strcollRegular(&sColl, &tColl, status);
8885   }
8886
8887 end_compare:
8888   if(sNormIter || tNormIter) {
8889     unorm_closeIter(sNormIter);
8890     unorm_closeIter(tNormIter);
8891   }
8892
8893   UTRACE_EXIT_VALUE_STATUS(result, *status)
8894   return result;
8895 }
8896
8897
8898
8899 /*                                                                      */
8900 /* ucol_strcoll     Main public API string comparison function          */
8901 /*                                                                      */
8902 U_CAPI UCollationResult U_EXPORT2
8903 ucol_strcoll( const UCollator    *coll,
8904               const UChar        *source,
8905               int32_t            sourceLength,
8906               const UChar        *target,
8907               int32_t            targetLength) {
8908     U_ALIGN_CODE(16);
8909
8910     UTRACE_ENTRY(UTRACE_UCOL_STRCOLL);
8911     if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
8912       UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
8913       UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength);
8914       UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength);
8915     }
8916
8917     UErrorCode status = U_ZERO_ERROR;
8918     if(source == NULL || target == NULL) {
8919       // do not crash, but return. Should have
8920       // status argument to return error.
8921       UTRACE_EXIT_VALUE(UTRACE_UCOL_STRCOLL);
8922       return UCOL_EQUAL;
8923     }
8924       collIterate sColl, tColl;
8925
8926     /* Scan the strings.  Find:                                                             */
8927     /*    The length of any leading portion that is equal                                   */
8928     /*    Whether they are exactly equal.  (in which case we just return)                   */
8929     const UChar    *pSrc    = source;
8930     const UChar    *pTarg   = target;
8931     int32_t        equalLength;
8932
8933     if (sourceLength == -1 && targetLength == -1) {
8934         // Both strings are null terminated.
8935         //    Check for them being the same string, and scan through
8936         //    any leading equal portion.
8937         if (source==target) {
8938             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8939             return UCOL_EQUAL;
8940         }
8941
8942         for (;;) {
8943             if ( *pSrc != *pTarg || *pSrc == 0) {
8944                 break;
8945             }
8946             if(UCOL_ISTHAIPREVOWEL(*pSrc)) {
8947               break;
8948             }
8949             pSrc++;
8950             pTarg++;
8951         }
8952         if (*pSrc == 0 && *pTarg == 0) {
8953             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8954             return UCOL_EQUAL;
8955         }
8956         equalLength = pSrc - source;
8957     }
8958     else
8959     {
8960         // One or both strings has an explicit length.
8961         /* check if source and target are same strings */
8962
8963         if (source==target  && sourceLength==targetLength) {
8964             UTRACE_EXIT_VALUE(UCOL_EQUAL);
8965             return UCOL_EQUAL;
8966         }
8967         const UChar    *pSrcEnd = source + sourceLength;
8968         const UChar    *pTargEnd = target + targetLength;
8969
8970
8971         // Scan while the strings are bitwise ==, or until one is exhausted.
8972             for (;;) {
8973                 if (pSrc == pSrcEnd || pTarg == pTargEnd) {
8974                     break;
8975                 }
8976                 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
8977                     break;
8978                 }
8979                 if (*pSrc != *pTarg) {
8980                     break;
8981                 }
8982                 if(UCOL_ISTHAIPREVOWEL(*pSrc)) { // they are the same here, so any will do
8983                     break;
8984                 }
8985                 pSrc++;
8986                 pTarg++;
8987             }
8988             equalLength = pSrc - source;
8989
8990             // If we made it all the way through both strings, we are done.  They are ==
8991             if ((pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0))  &&   /* At end of src string, however it was specified. */
8992                 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0)))  {  /* and also at end of dest string                  */
8993                 UTRACE_EXIT_VALUE(UCOL_EQUAL);
8994                 return UCOL_EQUAL;
8995             }
8996     }
8997     if (equalLength > 0) {
8998         /* There is an identical portion at the beginning of the two strings.        */
8999         /*   If the identical portion ends within a contraction or a comibining      */
9000         /*   character sequence, back up to the start of that sequence.              */
9001         pSrc  = source + equalLength;        /* point to the first differing chars   */
9002         pTarg = target + equalLength;
9003         if (pSrc  != source+sourceLength && ucol_unsafeCP(*pSrc, coll) ||
9004             pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll))
9005         {
9006             // We are stopped in the middle of a contraction.
9007             // Scan backwards through the == part of the string looking for the start of the contraction.
9008             //   It doesn't matter which string we scan, since they are the same in this region.
9009             do
9010             {
9011                 equalLength--;
9012                 pSrc--;
9013             }
9014             while (equalLength>0 && ucol_unsafeCP(*pSrc, coll));
9015         }
9016
9017         source += equalLength;
9018         target += equalLength;
9019         if (sourceLength > 0) {
9020             sourceLength -= equalLength;
9021         }
9022         if (targetLength > 0) {
9023             targetLength -= equalLength;
9024         }
9025     }
9026
9027     UCollationResult  returnVal;
9028     if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) {
9029       // Preparing the context objects for iterating over strings
9030       IInit_collIterate(coll, source, sourceLength, &sColl);
9031       IInit_collIterate(coll, target, targetLength, &tColl);
9032       returnVal = ucol_strcollRegular(&sColl, &tColl, &status);
9033     } else {
9034       returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status);
9035     }
9036     UTRACE_EXIT_VALUE(returnVal);
9037     return returnVal;
9038 }
9039
9040 /* convenience function for comparing strings */
9041 U_CAPI UBool U_EXPORT2
9042 ucol_greater(    const    UCollator        *coll,
9043         const    UChar            *source,
9044         int32_t            sourceLength,
9045         const    UChar            *target,
9046         int32_t            targetLength)
9047 {
9048   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9049       == UCOL_GREATER);
9050 }
9051
9052 /* convenience function for comparing strings */
9053 U_CAPI UBool U_EXPORT2
9054 ucol_greaterOrEqual(    const    UCollator    *coll,
9055             const    UChar        *source,
9056             int32_t        sourceLength,
9057             const    UChar        *target,
9058             int32_t        targetLength)
9059 {
9060   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9061       != UCOL_LESS);
9062 }
9063
9064 /* convenience function for comparing strings */
9065 U_CAPI UBool U_EXPORT2
9066 ucol_equal(        const    UCollator        *coll,
9067             const    UChar            *source,
9068             int32_t            sourceLength,
9069             const    UChar            *target,
9070             int32_t            targetLength)
9071 {
9072   return (ucol_strcoll(coll, source, sourceLength, target, targetLength)
9073       == UCOL_EQUAL);
9074 }
9075
9076 /* returns the locale name the collation data comes from */
9077 U_CAPI const char * U_EXPORT2
9078 ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
9079   return ucol_getLocaleByType(coll, type, status);
9080 }
9081
9082 U_CAPI const char * U_EXPORT2
9083 ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) {
9084   const char *result = NULL;
9085   if(status == NULL || U_FAILURE(*status)) {
9086     return NULL;
9087   }
9088   UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE);
9089   UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll);
9090
9091   switch(type) {
9092   case ULOC_ACTUAL_LOCALE:
9093     // validLocale is set only if service registration has explicitly set the
9094     // requested and valid locales.  if this is the case, the actual locale
9095     // is considered to be the valid locale.
9096     if (coll->validLocale != NULL) {
9097       result = coll->validLocale;
9098     } else if(coll->elements != NULL) {
9099       result = ures_getLocale(coll->elements, status);
9100     }
9101     break;
9102   case ULOC_VALID_LOCALE:
9103     if (coll->validLocale != NULL) {
9104       result = coll->validLocale;
9105     } else if(coll->rb != NULL) {
9106       result = ures_getLocale(coll->rb, status);
9107     }
9108     break;
9109   case ULOC_REQUESTED_LOCALE:
9110     result = coll->requestedLocale;
9111     break;
9112   default:
9113     *status = U_ILLEGAL_ARGUMENT_ERROR;
9114   }
9115   UTRACE_DATA1(UTRACE_INFO, "result = %s", result);
9116   UTRACE_EXIT_STATUS(*status);
9117   return result;
9118 }
9119
9120 U_CAPI USet * U_EXPORT2
9121 ucol_getTailoredSet(const UCollator *coll, UErrorCode *status)
9122 {
9123   if(status == NULL || U_FAILURE(*status)) {
9124     return NULL;
9125   }
9126   if(coll == NULL || coll->UCA == NULL) {
9127     *status = U_ILLEGAL_ARGUMENT_ERROR;
9128   }
9129   UParseError parseError;
9130   UColTokenParser src;
9131   int32_t rulesLen = 0;
9132   const UChar *rules = ucol_getRules(coll, &rulesLen);
9133   const UChar *current = NULL;
9134   UBool startOfRules = TRUE;
9135   // we internally use the C++ class, for the following reasons:
9136   // 1. we need to utilize canonical iterator, which is a C++ only class
9137   // 2. canonical iterator returns UnicodeStrings - USet cannot take them
9138   // 3. USet is internally really UnicodeSet, C is just a wrapper
9139   UnicodeSet *tailored = new UnicodeSet();
9140   UnicodeString pattern;
9141   UnicodeString empty;
9142   CanonicalIterator it(empty, *status);
9143
9144
9145   // The idea is to tokenize the rule set. For each non-reset token,
9146   // we add all the canonicaly equivalent FCD sequences
9147   ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status);
9148   while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError, status)) != NULL) {
9149     startOfRules = FALSE;
9150     if(src.parsedToken.strength != UCOL_TOK_RESET) {
9151       const UChar *stuff = src.source+(src.parsedToken.charsOffset);
9152       it.setSource(UnicodeString(stuff, src.parsedToken.charsLen), *status);
9153       pattern = it.next();
9154       while(!pattern.isBogus()) {
9155         if(Normalizer::quickCheck(pattern, UNORM_FCD, *status) != UNORM_NO) {
9156           tailored->add(pattern);
9157         }
9158         pattern = it.next();
9159       }
9160     }
9161   }
9162   ucol_tok_closeTokenList(&src);
9163   return (USet *)tailored;
9164 }
9165
9166 U_CAPI UBool U_EXPORT2
9167 ucol_equals(const UCollator *source, const UCollator *target) {
9168   UErrorCode status = U_ZERO_ERROR;
9169   // if pointers are equal, collators are equal
9170   if(source == target) {
9171     return TRUE;
9172   }
9173   int32_t i = 0, j = 0;
9174   // if any of attributes are different, collators are not equal
9175   for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) {
9176     if(ucol_getAttribute(source, (UColAttribute)i, &status) != ucol_getAttribute(target, (UColAttribute)i, &status) || U_FAILURE(status)) {
9177       return FALSE;
9178     }
9179   }
9180
9181   int32_t sourceRulesLen = 0, targetRulesLen = 0;
9182   const UChar *sourceRules = ucol_getRules(source, &sourceRulesLen);
9183   const UChar *targetRules = ucol_getRules(target, &targetRulesLen);
9184
9185   if(sourceRulesLen == targetRulesLen && u_strncmp(sourceRules, targetRules, sourceRulesLen) == 0) {
9186     // all the attributes are equal and the rules are equal - collators are equal
9187     return(TRUE);
9188   }
9189   // hard part, need to construct tree from rules and see if they yield the same tailoring
9190   UBool result = TRUE;
9191   UParseError parseError;
9192   UColTokenParser sourceParser, targetParser;
9193   int32_t sourceListLen = 0, targetListLen = 0;
9194   ucol_tok_initTokenList(&sourceParser, sourceRules, sourceRulesLen, source->UCA, &status);
9195   ucol_tok_initTokenList(&targetParser, targetRules, targetRulesLen, target->UCA, &status);
9196   sourceListLen = ucol_tok_assembleTokenList(&sourceParser, &parseError, &status);
9197   targetListLen = ucol_tok_assembleTokenList(&targetParser, &parseError, &status);
9198
9199   if(sourceListLen != targetListLen) {
9200     // different number of resets
9201     result = FALSE;
9202   } else {
9203     UColToken *sourceReset = NULL, *targetReset = NULL;
9204     UChar *sourceResetString = NULL, *targetResetString = NULL;
9205     int32_t sourceStringLen = 0, targetStringLen = 0;
9206     for(i = 0; i < sourceListLen; i++) {
9207       sourceReset = sourceParser.lh[i].reset;
9208       sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
9209       sourceStringLen = sourceReset->source >> 24;
9210       for(j = 0; j < sourceListLen; j++) {
9211         targetReset = targetParser.lh[j].reset;
9212         targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
9213         targetStringLen = targetReset->source >> 24;
9214         if(sourceStringLen == targetStringLen && (u_strncmp(sourceResetString, targetResetString, sourceStringLen) == 0)) {
9215           sourceReset = sourceParser.lh[i].first;
9216           targetReset = targetParser.lh[j].first;
9217           while(sourceReset != NULL && targetReset != NULL) {
9218             sourceResetString = sourceParser.source+(sourceReset->source & 0xFFFFFF);
9219             sourceStringLen = sourceReset->source >> 24;
9220             targetResetString = targetParser.source+(targetReset->source & 0xFFFFFF);
9221             targetStringLen = targetReset->source >> 24;
9222             if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
9223               result = FALSE;
9224               goto returnResult;
9225             }
9226             // probably also need to check the expansions
9227             if(sourceReset->expansion) {
9228               if(!targetReset->expansion) {
9229                 result = FALSE;
9230                 goto returnResult;
9231               } else {
9232                 // compare expansions
9233                 sourceResetString = sourceParser.source+(sourceReset->expansion& 0xFFFFFF);
9234                 sourceStringLen = sourceReset->expansion >> 24;
9235                 targetResetString = targetParser.source+(targetReset->expansion & 0xFFFFFF);
9236                 targetStringLen = targetReset->expansion >> 24;
9237                 if(sourceStringLen != targetStringLen || (u_strncmp(sourceResetString, targetResetString, sourceStringLen) != 0)) {
9238                   result = FALSE;
9239                   goto returnResult;
9240                 }
9241               }
9242             } else {
9243               if(targetReset->expansion) {
9244                 result = FALSE;
9245                 goto returnResult;
9246               }
9247             }
9248             sourceReset = sourceReset->next;
9249             targetReset = targetReset->next;
9250           }
9251           if(sourceReset != targetReset) { // at least one is not NULL
9252             // there are more tailored elements in one list
9253             result = FALSE;
9254             goto returnResult;
9255           }
9256
9257
9258           break;
9259         }
9260       }
9261       // couldn't find the reset anchor, so the collators are not equal
9262       if(j == sourceListLen) {
9263         result = FALSE;
9264         goto returnResult;
9265       }
9266     }
9267   }
9268
9269 returnResult:
9270   ucol_tok_closeTokenList(&sourceParser);
9271   ucol_tok_closeTokenList(&targetParser);
9272   return result;
9273
9274 }
9275
9276 U_CAPI void U_EXPORT2
9277 ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) {
9278   if(coll && coll->UCA) {
9279     uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo));
9280   }
9281 }
9282
9283 U_CAPI int32_t U_EXPORT2
9284 ucol_cloneBinary(const UCollator *coll,
9285                  uint8_t *buffer, int32_t capacity,
9286                  UErrorCode *status)
9287 {
9288     int32_t length = 0;
9289     if(U_FAILURE(*status)) {
9290         return length;
9291     }
9292     if(coll->hasRealData == TRUE) {
9293         length = coll->image->size;
9294         if(length <= capacity) {
9295             uprv_memcpy(buffer, coll->image, length);
9296         }
9297     } else {
9298         length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)));
9299         if(length <= capacity) {
9300             /* build the UCATableHeader with minimal entries */
9301             /* do not copy the header from the UCA file because its values are wrong! */
9302             /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */
9303
9304             /* reset everything */
9305             uprv_memset(buffer, 0, length);
9306
9307             /* set the tailoring-specific values */
9308             UCATableHeader *myData = (UCATableHeader *)buffer;
9309             myData->size = length;
9310
9311             /* offset for the options, the only part of the data that is present after the header */
9312             myData->options = sizeof(UCATableHeader);
9313
9314             /* need to always set the expansion value for an upper bound of the options */
9315             myData->expansion = myData->options + sizeof(UColOptionSet);
9316
9317             myData->magic = UCOL_HEADER_MAGIC;
9318             myData->isBigEndian = U_IS_BIG_ENDIAN;
9319             myData->charSetFamily = U_CHARSET_FAMILY;
9320
9321             /* copy UCA's version; genrb will override all but the builder version with tailoring data */
9322             uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo));
9323
9324             uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo));
9325             uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo));
9326             uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo));
9327             myData->jamoSpecial = coll->image->jamoSpecial;
9328
9329             /* copy the collator options */
9330             uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet));
9331         }
9332     }
9333     return length;
9334 }
9335
9336 U_CAPI UCollator* U_EXPORT2
9337 ucol_openBinary(const uint8_t *bin, int32_t length,
9338                 const UCollator *base,
9339                 UErrorCode *status)
9340 {
9341     UCollator *result = NULL;
9342     if(U_FAILURE(*status)){
9343         return NULL;
9344     }
9345     if(base == NULL) {
9346         // we don't support null base yet
9347         *status = U_ILLEGAL_ARGUMENT_ERROR;
9348         return NULL;
9349     }
9350     UCATableHeader *colData = (UCATableHeader *)bin;
9351     // do we want version check here? We're trying to figure out whether collators are compatible
9352     if(uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 ||
9353         uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0 ||
9354         colData->version[0] != UCOL_BUILDER_VERSION) {
9355             *status = U_COLLATOR_VERSION_MISMATCH;
9356             return NULL;
9357         } else {
9358             if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) {
9359                 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status);
9360                 if(U_FAILURE(*status)){
9361                     return NULL;
9362                 }
9363                 result->hasRealData = TRUE;
9364             } else {
9365                 if(base) {
9366                     result = ucol_initCollator(base->image, result, base, status);
9367                     ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status);
9368                     if(U_FAILURE(*status)){
9369                         return NULL;
9370                     }
9371                     result->hasRealData = FALSE;
9372                 } else {
9373                     *status = U_USELESS_COLLATOR_ERROR;
9374                     return NULL;
9375                 }
9376             }
9377             result->freeImageOnClose = FALSE;
9378         }
9379         result->validLocale = NULL;
9380         result->requestedLocale = NULL;
9381         result->rules = NULL;
9382         result->rulesLength = 0;
9383         result->freeRulesOnClose = FALSE;
9384         result->rb = NULL;
9385         result->elements = NULL;
9386         return result;
9387 }
9388
9389 #endif /* #if !UCONFIG_NO_COLLATION */
9390