icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2012, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/udata.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28 #include "unicode/uscript.h"
  29 #include "unicode/ustring.h"
  30 #include "unicode/utf16.h"
  31 #include "normalizer2impl.h"
  32 #include "ucol_bld.h"
  33 #include "ucol_elm.h"
  34 #include "ucol_cnt.h"
  35 #include "ucln_in.h"
  36 #include "umutex.h"
  37 #include "cmemory.h"
  38 #include "cstring.h"
  39
/* Number of elements in a true array (not a pointer), converted to int32_t. */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

/*
 * File-scope pointers, both initially NULL. Nothing in this part of the file
 * assigns them; presumably they cache the loaded inverse UCA table header and
 * the UDataMemory that backs it -- TODO confirm against the loader code.
 */
static const InverseUCATableHeader* _staticInvUCA = NULL;
static UDataMemory* invUCA_DATA_MEM = NULL;
  44
  45 U_CDECL_BEGIN
  46 static UBool U_CALLCONV
  47 isAcceptableInvUCA(void * /*context*/,
  48                    const char * /*type*/, const char * /*name*/,
  49                    const UDataInfo *pInfo)
  50 {
  51     /* context, type & name are intentionally not used */
  52     if( pInfo->size>=20 &&
  53         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  54         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  55         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  56         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  57         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  58         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  59         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  60         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  61         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  62         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  63         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  64         )
  65     {
  66         UVersionInfo UCDVersion;
  67         u_getUnicodeVersion(UCDVersion);
  68         return (pInfo->dataVersion[0]==UCDVersion[0] &&
  69             pInfo->dataVersion[1]==UCDVersion[1]);
  70             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  71             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  72             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  73     } else {
  74         return FALSE;
  75     }
  76 }
  77 U_CDECL_END
  78
  79 /*
  80 * Takes two CEs (lead and continuation) and
  81 * compares them as CEs should be compared:
  82 * primary vs. primary, secondary vs. secondary
  83 * tertiary vs. tertiary
  84 */
  85 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
  86     uint32_t s1 = source0, s2, t1 = target0, t2;
  87     if(isContinuation(source1)) {
  88         s2 = source1;
  89     } else {
  90         s2 = 0;
  91     }
  92     if(isContinuation(target1)) {
  93         t2 = target1;
  94     } else {
  95         t2 = 0;
  96     }
  97
  98     uint32_t s = 0, t = 0;
  99     if(s1 == t1 && s2 == t2) {
 100         return 0;
 101     }
 102     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
 103     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
 104     if(s < t) {
 105         return -1;
 106     } else if(s > t) {
 107         return 1;
 108     } else {
 109         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
 110         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
 111         if(s < t) {
 112             return -1;
 113         } else if(s > t) {
 114             return 1;
 115         } else {
 116             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
 117             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
 118             if(s < t) {
 119                 return -1;
 120             } else {
 121                 return 1;
 122             }
 123         }
 124     }
 125 }
 126
 127 static
 128 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
 129     uint32_t bottom = 0, top = src->invUCA->tableSize;
 130     uint32_t i = 0;
 131     uint32_t first = 0, second = 0;
 132     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 133     int32_t res = 0;
 134
 135     while(bottom < top-1) {
 136         i = (top+bottom)/2;
 137         first = *(CETable+3*i);
 138         second = *(CETable+3*i+1);
 139         res = compareCEs(first, second, CE, SecondCE);
 140         if(res > 0) {
 141             top = i;
 142         } else if(res < 0) {
 143             bottom = i;
 144         } else {
 145             break;
 146         }
 147     }
 148
 149     /* weiv:                                                  */
 150     /* in searching for elements, I have removed the failure  */
 151     /* The reason for this is that the builder does not rely  */
 152     /* on search mechanism telling it that it didn't find an  */
 153     /* element. However, indirect positioning relies on being */
 154     /* able to find the elements around any CE, even if it is */
 155     /* not defined in the UCA. */
 156     return i;
 157     /*
 158     if((first == CE && second == SecondCE)) {
 159     return i;
 160     } else {
 161     return -1;
 162     }
 163     */
 164 }
 165
/*
 * Bit masks applied to a 32-bit CE word before comparing it at a given
 * strength; indexed by strength value (strengthMask[strength],
 * strengthMask[UCOL_PRIMARY], strengthMask[UCOL_SECONDARY] in this file).
 * Presumably the three entries correspond to primary, secondary and tertiary
 * in that order -- the enum values are not visible here, TODO confirm.
 */
static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    0xFFFF0000, /* keeps the upper 16 bits */
    0xFFFFFF00, /* keeps the upper 24 bits */
    0xFFFFFFFF  /* keeps all 32 bits */
};
 171
 172 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 173                                             uint32_t CE, uint32_t contCE,
 174                                             uint32_t *nextCE, uint32_t *nextContCE,
 175                                             uint32_t strength)
 176 {
 177     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 178     int32_t iCE;
 179
 180     iCE = ucol_inv_findCE(src, CE, contCE);
 181
 182     if(iCE<0) {
 183         *nextCE = UCOL_NOT_FOUND;
 184         return -1;
 185     }
 186
 187     CE &= strengthMask[strength];
 188     contCE &= strengthMask[strength];
 189
 190     *nextCE = CE;
 191     *nextContCE = contCE;
 192
 193     while((*nextCE  & strengthMask[strength]) == CE
 194         && (*nextContCE  & strengthMask[strength]) == contCE)
 195     {
 196         *nextCE = (*(CETable+3*(++iCE)));
 197         *nextContCE = (*(CETable+3*(iCE)+1));
 198     }
 199
 200     return iCE;
 201 }
 202
 203 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 204                                             uint32_t CE, uint32_t contCE,
 205                                             uint32_t *prevCE, uint32_t *prevContCE,
 206                                             uint32_t strength)
 207 {
 208     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 209     int32_t iCE;
 210
 211     iCE = ucol_inv_findCE(src, CE, contCE);
 212
 213     if(iCE<0) {
 214         *prevCE = UCOL_NOT_FOUND;
 215         return -1;
 216     }
 217
 218     CE &= strengthMask[strength];
 219     contCE &= strengthMask[strength];
 220
 221     *prevCE = CE;
 222     *prevContCE = contCE;
 223
 224     while((*prevCE  & strengthMask[strength]) == CE
 225         && (*prevContCE  & strengthMask[strength])== contCE
 226         && iCE > 0) /* this condition should prevent falling off the edge of the world */
 227     {
 228         /* here, we end up in a singularity - zero */
 229         *prevCE = (*(CETable+3*(--iCE)));
 230         *prevContCE = (*(CETable+3*(iCE)+1));
 231     }
 232
 233     return iCE;
 234 }
 235
 236 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 237                                                        uint32_t prevCE, uint32_t prevContCE)
 238 {
 239     if(prevCE == CE && prevContCE == contCE) {
 240         return UCOL_IDENTICAL;
 241     }
 242     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
 243         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
 244     {
 245         return UCOL_PRIMARY;
 246     }
 247     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
 248         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
 249     {
 250         return UCOL_SECONDARY;
 251     }
 252     return UCOL_TERTIARY;
 253 }
 254
 255
 256 /*static
 257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 258
 259     uint32_t CE = lh->baseCE;
 260     uint32_t SecondCE = lh->baseContCE;
 261
 262     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 263     uint32_t previousCE, previousContCE;
 264     int32_t iCE;
 265
 266     iCE = ucol_inv_findCE(src, CE, SecondCE);
 267
 268     if(iCE<0) {
 269         return -1;
 270     }
 271
 272     CE &= strengthMask[strength];
 273     SecondCE &= strengthMask[strength];
 274
 275     previousCE = CE;
 276     previousContCE = SecondCE;
 277
 278     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 279         previousCE = (*(CETable+3*(--iCE)));
 280         previousContCE = (*(CETable+3*(iCE)+1));
 281     }
 282     lh->previousCE = previousCE;
 283     lh->previousContCE = previousContCE;
 284
 285     return iCE;
 286 }*/
 287
 288 static
 289 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 290     uint32_t CE = lh->baseCE;
 291     uint32_t SecondCE = lh->baseContCE;
 292
 293     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 294     uint32_t nextCE, nextContCE;
 295     int32_t iCE;
 296
 297     iCE = ucol_inv_findCE(src, CE, SecondCE);
 298
 299     if(iCE<0) {
 300         return -1;
 301     }
 302
 303     CE &= strengthMask[strength];
 304     SecondCE &= strengthMask[strength];
 305
 306     nextCE = CE;
 307     nextContCE = SecondCE;
 308
 309     while((nextCE  & strengthMask[strength]) == CE
 310         && (nextContCE  & strengthMask[strength]) == SecondCE)
 311     {
 312         nextCE = (*(CETable+3*(++iCE)));
 313         nextContCE = (*(CETable+3*(iCE)+1));
 314     }
 315
 316     lh->nextCE = nextCE;
 317     lh->nextContCE = nextContCE;
 318
 319     return iCE;
 320 }
 321
/*
 * Computes, for token list lh, the low and high bounds ("gaps") between which
 * new weights can be allocated, and stores them in lh->gapsLo / lh->gapsHi.
 * Both arrays are used as 3x3 tables: entry [3*st + level] holds the bound
 * for weight level `level`, recorded for strength slot `st`. Each bound is a
 * left-justified 32-bit value built from the lead CE and its continuation.
 *
 * Three cases are distinguished:
 *  1. the top byte of lh->baseCE lies in the implicit primary range taken
 *     from the UCA constants: low bound is the base CE, high bound is derived
 *     from the next implicit primary;
 *  2. lh->indirect is set and lh->nextCE is non-zero: low bound is the base
 *     CE, high bound is lh->nextCE / lh->nextContCE;
 *  3. otherwise: the token list is walked, ucol_inv_getNext() supplies the
 *     inverse-table position for each strength that occurs, and the high
 *     bounds are read from the inverse table at those positions.
 *
 * In case 3, *status is set to U_INTERNAL_PROGRAM_ERROR if ucol_inv_getNext()
 * returns a negative position; processing continues regardless.
 */
static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;

    UColToken *tok = lh->first;
    uint32_t tokStrength = tok->strength;

    for(i = 0; i<3; i++) {
        lh->gapsHi[3*i] = 0;
        lh->gapsHi[3*i+1] = 0;
        lh->gapsHi[3*i+2] = 0;
        lh->gapsLo[3*i] = 0;
        lh->gapsLo[3*i+1] = 0;
        lh->gapsLo[3*i+2] = 0;
        lh->numStr[i] = 0;
        lh->fStrToken[i] = NULL;
        lh->lStrToken[i] = NULL;
        lh->pos[i] = -1; /* -1 marks "no position recorded for this strength" */
    }

    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
        lh->pos[0] = 0;
        /* low bound: the base CE itself, split into its three levels */
        t1 = lh->baseCE;
        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        /* high bound: the implicit primary whose raw value is one higher */
        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);

        /* lead word gets 0x05 as both secondary and tertiary byte */
        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;

        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
        lh->pos[0] = 0;
        /* low bound from the base CE, high bound from the stored next CE */
        t1 = lh->baseCE;
        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        t1 = lh->nextCE;
        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else {
        /* walk the token list once; each pass of the loop handles the run */
        /* of tokens that starts at tok and is not stronger than tok       */
        for(;;) {
            /* strengths at or above UCOL_CE_STRENGTH_LIMIT have no slot in pos[] */
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
                    lh->fStrToken[tokStrength] = tok;
                } else { /* The CE must be implicit, since it's not in the table */
                    /* Error */
                    *status = U_INTERNAL_PROGRAM_ERROR;
                }
            }

            /* skip tokens whose strength value is >= tokStrength, */
            /* remembering the last one seen                       */
            while(tok != NULL && tok->strength >= tokStrength) {
                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                    lh->lStrToken[tokStrength] = tok;
                }
                tok = tok->next;
            }
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
                /* check if previous interval is the same and merge the intervals if it is so */
                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
                    lh->fStrToken[tokStrength+1] = NULL;
                    lh->lStrToken[tokStrength+1] = NULL;
                    lh->pos[tokStrength+1] = -1;
                }
            }
            if(tok != NULL) {
                tokStrength = tok->strength;
            } else {
                break;
            }
        }
        /* turn every recorded table position into a pair of bounds */
        for(st = 0; st < 3; st++) {
            if((pos = lh->pos[st]) >= 0) {
                /* high bound: the inverse-table row at the recorded position */
                t1 = *(CETable+3*(pos));
                t2 = *(CETable+3*(pos)+1);
                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
                /* only the low 6 bits of each word are kept for the tertiary level */
                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
                //pos--;
                //t1 = *(CETable+3*(pos));
                //t2 = *(CETable+3*(pos)+1);
                /* low bound: always the base CE of the list */
                t1 = lh->baseCE;
                t2 = lh->baseContCE;
                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
            }
        }
    }
}
 430
 431
 432 #define ucol_countBytes(value, noOfBytes)   \
 433 {                               \
 434     uint32_t mask = 0xFFFFFFFF;   \
 435     (noOfBytes) = 0;              \
 436     while(mask != 0) {            \
 437     if(((value) & mask) != 0) { \
 438     (noOfBytes)++;            \
 439     }                           \
 440     mask >>= 8;                 \
 441     }                             \
 442 }
 443
 444 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 445     if(U_SUCCESS(*status)) {
 446         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 447     }
 448     return g->current;
 449 }
 450
 451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 452     /* TODO: rename to enum names */
 453     uint32_t high, low, count=1;
 454     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 455
 456     if(strength == UCOL_SECONDARY) {
 457         low = UCOL_COMMON_TOP2<<24;
 458         high = 0xFFFFFFFF;
 459         count = 0xFF - UCOL_COMMON_TOP2;
 460     } else {
 461         low = UCOL_BYTE_COMMON << 24; //0x05000000;
 462         high = 0x40000000;
 463         count = 0x40 - UCOL_BYTE_COMMON;
 464     }
 465
 466     if(tok->next != NULL && tok->next->strength == strength) {
 467         count = tok->next->toInsert;
 468     }
 469
 470     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 471     g->current = UCOL_BYTE_COMMON<<24;
 472
 473     if(g->noOfRanges == 0) {
 474         *status = U_INTERNAL_PROGRAM_ERROR;
 475     }
 476     return g->current;
 477 }
 478
 479 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 480     uint32_t strength = tok->strength;
 481     uint32_t low = lows[fStrength*3+strength];
 482     uint32_t high = highs[fStrength*3+strength];
 483     uint32_t maxByte = 0;
 484     if(strength == UCOL_TERTIARY) {
 485         maxByte = 0x3F;
 486     } else if(strength == UCOL_PRIMARY) {
 487         maxByte = 0xFE;
 488     } else {
 489         maxByte = 0xFF;
 490     }
 491
 492     uint32_t count = tok->toInsert;
 493
 494     if(low >= high && strength > UCOL_PRIMARY) {
 495         int32_t s = strength;
 496         for(;;) {
 497             s--;
 498             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 499                 if(strength == UCOL_SECONDARY) {
 500                     if (low < UCOL_COMMON_TOP2<<24 ) {
 501                        // Override if low range is less than UCOL_COMMON_TOP2.
 502                         low = UCOL_COMMON_TOP2<<24;
 503                     }
 504                     high = 0xFFFFFFFF;
 505                 } else {
 506                     // Override if low range is less than UCOL_COMMON_BOT3.
 507                     if ( low < UCOL_COMMON_BOT3<<24 ) {
 508                         low = UCOL_COMMON_BOT3<<24;
 509                     }
 510                     high = 0x40000000;
 511                 }
 512                 break;
 513             }
 514             if(s<0) {
 515                 *status = U_INTERNAL_PROGRAM_ERROR;
 516                 return 0;
 517             }
 518         }
 519     }
 520
 521     if(low < 0x02000000) {
 522         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
 523         // See http://site.icu-project.org/design/collation/bytes
 524         low = 0x02000000;
 525     }
 526
 527     if(strength == UCOL_SECONDARY) { /* similar as simple */
 528         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 529             low = UCOL_COMMON_TOP2<<24;
 530         }
 531         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 532             high = UCOL_COMMON_TOP2<<24;
 533         }
 534         if(low < (UCOL_COMMON_BOT2<<24)) {
 535             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 536             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 537             //g->current = UCOL_COMMON_BOT2<<24;
 538             return g->current;
 539         }
 540     }
 541
 542     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 543     if(g->noOfRanges == 0) {
 544         *status = U_INTERNAL_PROGRAM_ERROR;
 545     }
 546     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 547     return g->current;
 548 }
 549
 550 static
 551 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 552     uint32_t i = 0;
 553     UChar c;
 554
 555     if(U_FAILURE(*status)) {
 556         return 0;
 557     }
 558
 559     if(sourceLen > resLen) {
 560         *status = U_MEMORY_ALLOCATION_ERROR;
 561         return 0;
 562     }
 563
 564     for(i = 0; i < sourceLen; i++) {
 565         c = source[i];
 566         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 567             switch(c - 0x3000) {
 568             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 569             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 570                 c++;
 571                 break;
 572             case 0xF5:
 573                 c = 0x30AB;
 574                 break;
 575             case 0xF6:
 576                 c = 0x30B1;
 577                 break;
 578             }
 579         }
 580         resBuf[i] = c;
 581     }
 582     return sourceLen;
 583 }
 584
 585 static
 586 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 587     uint32_t i = 0;
 588     UChar c;
 589
 590     if(U_FAILURE(*status)) {
 591         return 0;
 592     }
 593
 594     if(sourceLen > resLen) {
 595         *status = U_MEMORY_ALLOCATION_ERROR;
 596         return 0;
 597     }
 598
 599     for(i = 0; i < sourceLen; i++) {
 600         c = source[i];
 601         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 602             switch(c - 0x3000) {
 603             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 604             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 605                 c--;
 606                 break;
 607             case 0xAB:
 608                 c = 0x30F5;
 609                 break;
 610             case 0xB1:
 611                 c = 0x30F6;
 612                 break;
 613             }
 614         }
 615         resBuf[i] = c;
 616     }
 617     return sourceLen;
 618 }
 619
 620 U_NAMESPACE_BEGIN
 621
 622 static
 623 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 624     uint32_t i = 0;
 625     UChar n[128];
 626     uint32_t nLen = 0;
 627     uint32_t uCount = 0, lCount = 0;
 628
 629     collIterate s;
 630     uint32_t order = 0;
 631
 632     if(U_FAILURE(*status)) {
 633         return UCOL_LOWER_CASE;
 634     }
 635
 636     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 637     if(U_SUCCESS(*status)) {
 638         for(i = 0; i < nLen; i++) {
 639             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
 640             order = ucol_getNextCE(UCA, &s, status);
 641             if(isContinuation(order)) {
 642                 *status = U_INTERNAL_PROGRAM_ERROR;
 643                 return UCOL_LOWER_CASE;
 644             }
 645             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 646                 uCount++;
 647             } else {
 648                 if(u_islower(n[i])) {
 649                     lCount++;
 650                 } else if(U_SUCCESS(*status)) {
 651                     UChar sk[1], lk[1];
 652                     u_toSmallKana(&n[i], 1, sk, 1, status);
 653                     u_toLargeKana(&n[i], 1, lk, 1, status);
 654                     if(sk[0] == n[i] && lk[0] != n[i]) {
 655                         lCount++;
 656                     }
 657                 }
 658             }
 659         }
 660     }
 661
 662     if(uCount != 0 && lCount != 0) {
 663         return UCOL_MIXED_CASE;
 664     } else if(uCount != 0) {
 665         return UCOL_UPPER_CASE;
 666     } else {
 667         return UCOL_LOWER_CASE;
 668     }
 669 }
 670
 671
 672 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
 673     /* this one makes the table and stuff */
 674     uint32_t noOfBytes[3];
 675     uint32_t i;
 676
 677     for(i = 0; i<3; i++) {
 678         ucol_countBytes(CEparts[i], noOfBytes[i]);
 679     }
 680
 681     /* Here we have to pack CEs from parts */
 682
 683     uint32_t CEi = 0;
 684     uint32_t value = 0;
 685
 686     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
 687         if(CEi > 0) {
 688             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
 689         } else {
 690             value = 0;
 691         }
 692
 693         if(2*CEi<noOfBytes[0]) {
 694             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
 695         }
 696         if(CEi<noOfBytes[1]) {
 697             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
 698         }
 699         if(CEi<noOfBytes[2]) {
 700             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
 701         }
 702         tok->CEs[CEi] = value;
 703         CEi++;
 704     }
 705     if(CEi == 0) { /* totally ignorable */
 706         tok->noOfCEs = 1;
 707         tok->CEs[0] = 0;
 708     } else { /* there is at least something */
 709         tok->noOfCEs = CEi;
 710     }
 711
 712
 713     // we want to set case bits here and now, not later.
 714     // Case bits handling
 715     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
 716         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 717         int32_t cSize = (tok->source & 0xFF000000) >> 24;
 718         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
 719
 720         if(cSize > 1) {
 721             // Do it manually
 722             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
 723         } else {
 724             // Copy it from the UCA
 725             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
 726             tok->CEs[0] |= (caseCE & 0xC0);
 727         }
 728     }
 729
 730 #if UCOL_DEBUG==2
 731     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
 732     for(i = 0; i<tok->noOfCEs; i++) {
 733         fprintf(stderr, "%08X ", tok->CEs[i]);
 734     }
 735     fprintf(stderr, "\n");
 736 #endif
 737 }
 738
/*
 * Assigns collation elements to every token of the list lh.
 *
 * The work is done in three steps:
 *  1. the list is walked backwards from lh->last to fill in each token's
 *     toInsert count, using per-strength counters kept in t[];
 *  2. ucol_inv_getGapPositions() computes the weight gaps for the list;
 *  3. the list is walked forwards from lh->first: for every token the three
 *     weight parts are produced by the per-level generators and handed to
 *     ucol_doCE(), which stores the resulting CEs in the token.
 *
 * The forward walk stops as soon as *status holds a failure. *status is set
 * to U_INTERNAL_PROGRAM_ERROR if no gap position is recorded for a token's
 * strength or any stronger one.
 */
U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

    UColToken *tok = lh->last;
    uint32_t t[UCOL_STRENGTH_LIMIT]; /* running counters, indexed by strength */

    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

    /* must initialize ranges to avoid memory check warnings */
    for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
        uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
    }

    tok->toInsert = 1;
    t[tok->strength] = 1;

    /* step 1: backward pass; each token's toInsert is taken from the */
    /* counter of its own strength at the moment the token is reached */
    while(tok->previous != NULL) {
        if(tok->previous->strength < tok->strength) { /* going up */
            t[tok->strength] = 0;
            t[tok->previous->strength]++;
        } else if(tok->previous->strength > tok->strength) { /* going down */
            t[tok->previous->strength] = 1;
        } else {
            t[tok->strength]++;
        }
        tok=tok->previous;
        tok->toInsert = t[tok->strength];
    }

    tok->toInsert = t[tok->strength];
    /* step 2: find the gaps that the new weights have to fit into */
    ucol_inv_getGapPositions(src, lh, status);

#if UCOL_DEBUG
    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    int32_t j = 2;
    for(j = 2; j >= 0; j--) {
        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    }
    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->strength);
        tok = tok->next;
    } while(tok != NULL);
    fprintf(stderr, "\n");

    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->toInsert);
        tok = tok->next;
    } while(tok != NULL);
#endif

    /* step 3: forward pass that generates the weights */
    tok = lh->first;
    uint32_t fStrength = UCOL_IDENTICAL;
    uint32_t initStrength = UCOL_IDENTICAL; /* smallest strength value seen so far */


    /* start from the base CE, split into left-justified level weights */
    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;

    while (tok != NULL && U_SUCCESS(*status)) {
        fStrength = tok->strength;
        if(fStrength < initStrength) {
            /* first token with a strength value this small: its generator */
            /* has to be set up from the gaps                               */
            initStrength = fStrength;
            if(lh->pos[fStrength] == -1) {
                /* no gap recorded for this strength: fall back to the */
                /* nearest smaller strength value that has one         */
                while(lh->pos[fStrength] == -1 && fStrength > 0) {
                    fStrength--;
                }
                if(lh->pos[fStrength] == -1) {
                    *status = U_INTERNAL_PROGRAM_ERROR;
                    return;
                }
            }
            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else { /* primaries */
                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        } else {
            /* generator for this level already exists: take its next weight */
            /* and restart the generators of the levels below it             */
            if(tok->strength == UCOL_TERTIARY) {
                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
            } else if(tok->strength == UCOL_SECONDARY) {
                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else if(tok->strength == UCOL_PRIMARY) {
                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        }
        /* pack the three weights into the token's CEs */
        ucol_doCE(src, CEparts, tok, status);
        tok = tok->next;
    }
}
 849
 850 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
         /*
          * Walks the token list that starts at lh->first and adds one UCAElements
          * entry per token to the temporary table t.
          *
          * For every token this function
          *  - resolves the token's expansion (when tok->expansion != 0) into
          *    tok->expCEs: the longest subsequence found in src->tailored is used
          *    first, otherwise the CEs for a single code unit are taken from src->UCA;
          *  - concatenates tok->CEs and tok->expCEs into el.CEs;
          *  - copies the prefix and source UChars out of src->source;
          *  - sets t->image->jamoSpecial / src->buildCCTabFlag when the checks below hit;
          *  - hands the element to uprv_uca_addAnElement().
          *
          * The walk stops as soon as *status reports a failure.
          *
          * NOTE(review): nothing here checks tok->noOfExpCEs or el.noOfCEs against the
          * capacity of tok->expCEs / el.CEs - presumably the parser bounds them; confirm.
          */
 851     UCAElements el;
 852     UColToken *tok = lh->first;
 853     UColToken *expt = NULL;
 854     uint32_t i = 0, j = 0;
 855     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
 856
 857     while(tok != NULL && U_SUCCESS(*status)) {
 858         /* first, check if there are any expansions */
 859         /* if there are expansions, we need to do a little bit more processing */
 860         /* since parts of expansion can be tailored, while others are not */
 861         if(tok->expansion != 0) {
                 /* tok->expansion packs the length in the top 8 bits and the offset */
                 /* into src->source in the low 24 bits */
                 /* assumes tok->noOfExpCEs is 0 on entry to this branch - TODO confirm */
 862             uint32_t len = tok->expansion >> 24;
 863             uint32_t currentSequenceLen = len;
 864             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 865             //uint32_t exp = currentSequenceLen | expOffset;
 866             UColToken exp;
                 // NOTE(review): this first value lacks the << 24 on the length; it is
                 // overwritten inside the inner loop before exp is ever looked up.
 867             exp.source = currentSequenceLen | expOffset;
 868             exp.rulesToParseHdl = &(src->source);
 869
                 /* consume the expansion left to right; at each position try the */
                 /* longest remaining subsequence first, then shorter ones */
 870             while(len > 0) {
 871                 currentSequenceLen = len;
 872                 while(currentSequenceLen > 0) {
 873                     exp.source = (currentSequenceLen << 24) | expOffset;
 874                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 875                         uint32_t noOfCEsToCopy = expt->noOfCEs;
 876                         for(j = 0; j<noOfCEsToCopy; j++) {
 877                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 878                         }
 879                         tok->noOfExpCEs += noOfCEsToCopy;
 880                         // Smart people never try to add codepoints and CEs.
 881                         // For some odd reason, it won't work.
 882                         expOffset += currentSequenceLen; //noOfCEsToCopy;
 883                         len -= currentSequenceLen; //noOfCEsToCopy;
 884                         break;
 885                     } else {
 886                         currentSequenceLen--;
 887                     }
 888                 }
 889                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 890                     /* will have to get one from UCA */
 891                     /* first, get the UChars from the rules */
 892                     /* then pick CEs out until there is no more and stuff them into expansion */
 893                     collIterate s;
 894                     uint32_t order = 0;
                         /* iterate over exactly one code unit at the current offset */
 895                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
 896
 897                     for(;;) {
 898                         order = ucol_getNextCE(src->UCA, &s, status);
 899                         if(order == UCOL_NO_MORE_CES) {
 900                             break;
 901                         }
 902                         tok->expCEs[tok->noOfExpCEs++] = order;
 903                     }
 904                     expOffset++;
 905                     len--;
 906                 }
 907             }
 908         } else {
 909             tok->noOfExpCEs = 0;
 910         }
 911
 912         /* set the ucaelement with obtained values */
 913         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 914         /* copy CEs */
 915         for(i = 0; i<tok->noOfCEs; i++) {
 916             el.CEs[i] = tok->CEs[i];
 917         }
 918         for(i = 0; i<tok->noOfExpCEs; i++) {
 919             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 920         }
 921
 922         /* copy UChars */
 923         // We kept prefix and source kind of together, as it is a kind of a contraction.
 924         // However, now we have to slice the prefix off the main thing -
 925         el.prefix = el.prefixChars;
 926         el.cPoints = el.uchars;
             // tok->prefix and tok->source use the same packing as tok->expansion:
             // length in the top 8 bits, offset into src->source in the low 24 bits.
 927         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 928             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 929             // decomposed elements to the unsaf table.
 930             el.prefixSize = tok->prefix>>24;
 931             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 932
                 // the source length includes the prefix, so the prefix is skipped here
 933             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 934             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 935         } else {
 936             el.prefixSize = 0;
 937             *el.prefix = 0;
 938
 939             el.cSize = (tok->source >> 24);
 940             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 941         }
 942         if(src->UCA != NULL) {
                 // any Jamo code unit in the source marks the whole image as jamoSpecial
 943             for(i = 0; i<el.cSize; i++) {
 944                 if(UCOL_ISJAMO(el.cPoints[i])) {
 945                     t->image->jamoSpecial = TRUE;
 946                 }
 947             }
 948             if (!src->buildCCTabFlag && el.cSize > 0) {
 949                 // Check the trailing canonical combining class (tccc) of the last character.
 950                 const UChar *s = el.cPoints + el.cSize;
 951                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
 952                 if ((fcd & 0xff) != 0) {
 953                     src->buildCCTabFlag = TRUE;
 954                 }
 955             }
 956         }
 957
 958         /* and then, add it */
 959 #if UCOL_DEBUG==2
 960         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 961 #endif
 962         uprv_uca_addAnElement(t, &el, status);
 963
 964 #if UCOL_DEBUG_DUPLICATES
 965         if(*status != U_ZERO_ERROR) {
 966             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 967             *status = U_ZERO_ERROR;
 968         }
 969 #endif
 970
 971         tok = tok->next;
 972     }
 973 }
 974
 975 U_CDECL_BEGIN
 976 static UBool U_CALLCONV
 977 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
         /*
          * Range callback passed to utrie_enum() to add completely ignorable
          * code points to the table being built.
          *
          * context is the tempUCATable under construction. Only ranges whose
          * enumerated value is 0 are processed: every code point in [start, limit)
          * that still maps to UCOL_NOT_FOUND in t->mapping is added as a one-code-point
          * element with a single CE of 0.
          *
          * Returns FALSE when adding an element left the local status in a failure
          * state, TRUE otherwise.
          */
 978     UErrorCode status = U_ZERO_ERROR;
 979     tempUCATable *t = (tempUCATable *)context;
 980     if(value == 0) {
 981         while(start < limit) {
 982             uint32_t CE = utrie_get32(t->mapping, start, NULL);
 983             if(CE == UCOL_NOT_FOUND) {
 984                 UCAElements el;
 985                 el.isThai = FALSE;
 986                 el.prefixSize = 0;
 987                 el.prefixChars[0] = 0;
 988                 el.prefix = el.prefixChars;
 989                 el.cPoints = el.uchars;
 990
                     /* encode the code point as UTF-16; cSize becomes 1 or 2 */
 991                 el.cSize = 0;
 992                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
 993
 994                 el.noOfCEs = 1;
 995                 el.CEs[0] = 0;
 996                 uprv_uca_addAnElement(t, &el, &status);
 997
 998             }
 999             start++;
1000         }
1001     }
1002     if(U_FAILURE(status)) {
1003         return FALSE;
1004     } else {
1005         return TRUE;
1006     }
1007 }
1008 U_CDECL_END
1009
1010 static void
1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1012                                UChar32 start, UChar32 end,
1013                                UErrorCode *status)
1014 {
         /*
          * Copies collation elements from src->UCA into t for the code points in
          * [start, end] (both inclusive).
          *
          * A code point is copied only when t->mapping has no entry for it
          * (UCOL_NOT_FOUND), or when its entry is a contraction-table element whose
          * CE at index 0 is UCOL_NOT_FOUND. For each such code point all CEs that
          * the UCA iterator produces are collected and added as one element.
          *
          * Does nothing when *status already reports a failure on entry.
          *
          * NOTE(review): el.noOfCEs is not checked against the capacity of el.CEs -
          * presumably the UCA never yields that many CEs for one code point; confirm.
          */
1015     //UChar decomp[256];
1016     uint32_t CE = UCOL_NOT_FOUND;
1017     UChar32 u = 0;
1018     UCAElements el;
1019     el.isThai = FALSE;
1020     el.prefixSize = 0;
1021     el.prefixChars[0] = 0;
1022     collIterate colIt;
1023
1024     if(U_SUCCESS(*status)) {
1025         for(u = start; u<=end; u++) {
1026             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1027                 /* this test is for contractions that are missing the starting element. */
1028                 || ((isCntTableElement(CE)) &&
1029                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1030                 )
1031             {
                     /* encode u as UTF-16; cSize becomes 1 or 2 */
1032                 el.cSize = 0;
1033                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1034                 //decomp[0] = (UChar)u;
1035                 //el.uchars[0] = (UChar)u;
1036                 el.cPoints = el.uchars;
1037                 //el.cSize = 1;
1038                 el.noOfCEs = 0;
1039                 el.prefix = el.prefixChars;
1040                 el.prefixSize = 0;
1041                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042                 // We actually want to check whether this element is a special
1043                 // If it is an implicit element (hangul, CJK - we want to copy the
1044                 // special, not the resolved CEs) - for hangul, copying resolved
1045                 // would just make things the same (there is an expansion and it
1046                 // takes approximately the same amount of time to resolve as
1047                 // falling back to the UCA).
1048                 /*
1049                 UTRIE_GET32(src->UCA->mapping, u, CE);
1050                 tag = getCETag(CE);
1051                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053                 || tag == LEAD_SURROGATE_TAG) {
1054                 el.CEs[el.noOfCEs++] = CE;
1055                 } else {
1056                 */
1057                 // It turns out that it does not make sense to keep implicits
1058                 // unresolved. The cost of resolving them is big enough so that
1059                 // it doesn't make any difference whether we have to go to the UCA
1060                 // or not.
1061                 {
1062                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
                         // CE still holds the lookup result from the test above when the
                         // loop is entered; presumably that is never UCOL_NO_MORE_CES - confirm.
1063                     while(CE != UCOL_NO_MORE_CES) {
1064                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1065                         if(CE != UCOL_NO_MORE_CES) {
1066                             el.CEs[el.noOfCEs++] = CE;
1067                         }
1068                     }
1069                 }
1070                 uprv_uca_addAnElement(t, &el, status);
1071             }
1072         }
1073     }
1074 }
1075
1076 U_NAMESPACE_END
1077
1078 U_CFUNC UCATableHeader *
1079 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
         /*
          * Builds the tailored collation table for the parsed rules in src.
          *
          * Steps, in order:
          *  1. copy the UCATableHeader of src->UCA into a freshly allocated image;
          *  2. run ucol_initBuffers() on every list header that still has tokens;
          *  3. if src->varTop is set, store the upper 16 bits of its first CE as
          *     src->opts->variableTopValue and unlink that token from its list;
          *  4. create the temporary table and add every token (ucol_createElements());
          *  5. copy U+0000..U+00FF and every range of src->copySet from the UCA;
          *  6. copy UCA contractions / pre-context entries the tailoring did not override;
          *  7. add completely ignorable code points and the canonical closure;
          *  8. assemble the final table with uprv_uca_assembleTable().
          *
          * The temporary table and the header copy are released before returning,
          * including on the early error exits.
          *
          * Returns the result of uprv_uca_assembleTable(), or NULL when the function
          * bails out early (in which case *status holds a failure code).
          */
1080     U_NAMESPACE_USE
1081
1082     uint32_t i = 0;
1083     if(U_FAILURE(*status)) {
1084         return NULL;
1085     }
1086     /*
1087     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1088     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089     create new ListHeader X
1090     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1091     first item you add, based on the stronger strength levels of the two lists.
1092     */
1093     /*
1094     3.  For each ListHeader with a non-null positive list:
1095     */
1096     /*
1097     o   Find all character strings with CEs between the baseCE and the
1098     next/previous CE, at the strength of the first token. Add these to the
1099     tailoring.
1100     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1101     tailoring has & x < z...
1102     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1103     */
1104     /* It is possible that this part should be done even while constructing list */
1105     /* The problem is that it is unknown what is going to be the strongest weight */
1106     /* So we might as well do it here */
1107
1108     /*
1109     o   Allocate CEs for each token in the list, based on the total number N of the
1110     largest level difference, and the gap G between baseCE and nextCE at that
1111     level. The relation * between the last item and nextCE is the same as the
1112     strongest strength.
1113     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115     Then fit b and c into the secondary gap between a and d, then fit q
1116     into the tertiary gap between b and c.
1117
1118     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1119     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120     Then fit q into the tertiary gap between b and c.
1121     o   When incrementing primary values, we will not cross high byte
1122     boundaries except where there is only a single-byte primary. That is to
1123     ensure that the script reordering will continue to work.
1124     */
1125     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1126     /* test for NULL */
1127     if (image == NULL) {
1128         *status = U_MEMORY_ALLOCATION_ERROR;
1129         return NULL;
1130     }
1131     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1132
1133     for(i = 0; i<src->resultLen; i++) {
1134         /* now we need to generate the CEs */
1135         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136         /* According to strength                                                          */
1137         if(U_SUCCESS(*status)) {
1138             if(src->lh[i].first) { // if there are any elements
1139                 // due to the way parser works, subsequent tailorings
1140                 // may remove all the elements from a sequence, therefore
1141                 // leaving an empty tailoring sequence.
1142                 ucol_initBuffers(src, &src->lh[i], status);
1143             }
1144         }
1145         if(U_FAILURE(*status)) {
1146             uprv_free(image);
1147             return NULL;
1148         }
1149     }
1150
1151     if(src->varTop != NULL) { /* stuff the variable top value */
1152         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1153         /* remove it from the list */
1154         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1155             src->varTop->listHeader->first = src->varTop->next;
1156         }
1157         if(src->varTop->listHeader->last == src->varTop) { /* last in list */
1158             src->varTop->listHeader->last = src->varTop->previous;
1159         }
1160         if(src->varTop->next != NULL) {
1161             src->varTop->next->previous = src->varTop->previous;
1162         }
1163         if(src->varTop->previous != NULL) {
1164             src->varTop->previous->next = src->varTop->next;
1165         }
1166     }
1167
1168
1169     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1170     if(U_FAILURE(*status)) {
1171         uprv_free(image);
1172         return NULL;
1173     }
1174
1175
1176     /* After this, we have assigned CE values to all regular CEs      */
1177     /* now we will go through list once more and resolve expansions,  */
1178     /* make UCAElements structs and add them to table                 */
1179     for(i = 0; i<src->resultLen; i++) {
1180         /* now we need to generate the CEs */
1181         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182         /* According to strength                                                          */
1183         if(U_SUCCESS(*status)) {
1184             ucol_createElements(src, t, &src->lh[i], status);
1185         }
1186     }
1187
1188     UCAElements el;
1189     el.isThai = FALSE;
1190     el.prefixSize = 0;
1191     el.prefixChars[0] = 0;
1192
1193     /* add latin-1 stuff */
1194     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1195
1196     /* add stuff for copying */
1197     if(src->copySet != NULL) {
1198         int32_t i = 0;
1199         UnicodeSet *set = (UnicodeSet *)src->copySet;
1200         for(i = 0; i < set->getRangeCount(); i++) {
1201             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1202         }
1203     }
1204
1205     if(U_SUCCESS(*status)) {
1206         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1207
1208         uint32_t tailoredCE = UCOL_NOT_FOUND;
1209         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1210         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1211         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1212         // Check for null pointer
1213         if (ucaEl == NULL) {
1214             *status = U_MEMORY_ALLOCATION_ERROR;
                 // Release what this function owns before bailing out; the normal
                 // exit path below does the same.
                 uprv_uca_closeTempTable(t);
                 uprv_free(image);
1215             return NULL;
1216         }
             // conts is a table of fixed-width records (maxUCAContractionLength UChars
             // each); a record whose first UChar is 0 ends the table.
1217         while(*conts != 0) {
1218             // A continuation is NUL-terminated and NUL-padded
1219             // except if it has the maximum length.
1220             int32_t contractionLength = maxUCAContractionLength;
1221             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1222                 --contractionLength;
1223             }
1224             UChar32 first;
1225             int32_t firstLength = 0;
1226             U16_NEXT(conts, firstLength, contractionLength, first);
1227             tailoredCE = utrie_get32(t->mapping, first, NULL);
1228             if(tailoredCE != UCOL_NOT_FOUND) {
1229                 UBool needToAdd = TRUE;
1230                 if(isCntTableElement(tailoredCE)) {
1231                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1232                         needToAdd = FALSE;
1233                     }
1234                 }
1235                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
                         // Pre-context record: conts[0] is the character, conts[2] the
                         // prefix. Build a lookup key for it.
1236                     UCAElements elm;
                         // The key must reference its own buffer, which is filled in just
                         // below; el.uchars holds unrelated or uninitialized data here.
1237                     elm.cPoints = elm.uchars;
1238                     elm.noOfCEs = 0;
1239                     elm.uchars[0] = *conts;
1240                     elm.uchars[1] = 0;
1241                     elm.cSize = 1;
1242                     elm.prefixChars[0] = *(conts+2);
1243                     elm.isThai = FALSE;
1244                     elm.prefix = elm.prefixChars;
1245                     elm.prefixSize = 1;
1246                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1247                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1248                         needToAdd = TRUE;
1249                     }
1250                 }
1251                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1252                     needToAdd = FALSE;
1253                 }
1254
1255                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1256                     if (*(conts+1) != 0) {  // contractions
1257                         el.prefix = el.prefixChars;
1258                         el.prefixSize = 0;
1259                         el.cPoints = el.uchars;
1260                         el.noOfCEs = 0;
1261                         u_memcpy(el.uchars, conts, contractionLength);
1262                         el.cSize = contractionLength;
1263                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1264                     }
1265                     else { // pre-context character
1266                         UChar str[4] = { 0 };
1267                         int32_t len=0;
1268                         int32_t preKeyLen=0;
1269
1270                         el.cPoints = el.uchars;
1271                         el.noOfCEs = 0;
1272                         el.uchars[0] = *conts;
1273                         el.uchars[1] = 0;
1274                         el.cSize = 1;
1275                         el.prefixChars[0] = *(conts+2);
1276                         el.prefix = el.prefixChars;
1277                         el.prefixSize = 1;
1278                         if (el.prefixChars[0]!=0) {
1279                             // get CE of prefix character first
1280                             str[0]=el.prefixChars[0];
1281                             str[1]=0;
1282                             ucol_setText(ucaEl, str, 1, status);
                                 // el.noOfCEs stays 0 here: el.CEs[0] is only scratch space
                                 // while the prefix CEs are being counted.
1283                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1284                                     != UCOL_NULLORDER) {
1285                                 preKeyLen++;  // count number of keys for prefix character
1286                             }
1287                             str[len++] = el.prefixChars[0];
1288                         }
1289
1290                         str[len++] = el.uchars[0];
1291                         str[len]=0;
1292                         ucol_setText(ucaEl, str, len, status);
1293                         // Skip the keys for prefix character, then copy the rest to el.
1294                         while ((preKeyLen-->0) &&
1295                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1296                             continue;
1297                         }
1298
1299                     }
                         // collect the remaining CEs of the text set on ucaEl into el
1300                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1301                         el.noOfCEs++;
1302                     }
1303                     uprv_uca_addAnElement(t, &el, status);
1304                 }
1305
1306             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1307                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1308             }
1309             conts+=maxUCAContractionLength;
1310         }
1311         ucol_closeElements(ucaEl);
1312     }
1313
1314     // Add completely ignorable elements
1315     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1316
1317     // add tailoring characters related canonical closures
1318     uprv_uca_canonicalClosure(t, src, NULL, status);
1319
1320     /* still need to produce compatibility closure */
1321
1322     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1323
1324     uprv_uca_closeTempTable(t);
1325     uprv_free(image);
1326
1327     return myData;
1328 }
1329
1330 U_CDECL_BEGIN
1331 static UBool U_CALLCONV
1332 ucol_bld_cleanup(void)
1333 {
         /*
          * Cleanup callback that ucol_initInverseUCA() registers through
          * ucln_i18n_registerCleanup(). Closes the inverse-UCA data memory and
          * resets both file-level cache pointers. Always returns TRUE.
          */
1334     udata_close(invUCA_DATA_MEM);
1335     invUCA_DATA_MEM = NULL;
1336     _staticInvUCA = NULL;
1337     return TRUE;
1338 }
1339 U_CDECL_END
1340
1341 U_CAPI const InverseUCATableHeader * U_EXPORT2
1342 ucol_initInverseUCA(UErrorCode *status)
1343 {
         /*
          * Returns the process-wide inverse UCA table, loading it on first use.
          *
          * On the first call the data item is opened with udata_openChoice() and
          * its UCAVersion is compared with that of the UCA collator; a mismatch
          * yields U_INVALID_FORMAT_ERROR. The loaded data is then published in
          * _staticInvUCA under the global mutex. If _staticInvUCA was already set
          * by the time the mutex is taken, the data opened here is closed again and
          * the existing table is kept.
          *
          * Returns NULL when *status is a failure on entry, when the UCA collator
          * cannot be obtained, or when the versions do not match; otherwise returns
          * the current value of _staticInvUCA (which is still NULL if the data
          * could not be opened).
          */
1344     if(U_FAILURE(*status)) return NULL;
1345
1346     UBool needsInit;
1347     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1348
1349     if(needsInit) {
1350         InverseUCATableHeader *newInvUCA = NULL;
1351         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1352
1353         if(U_FAILURE(*status)) {
1354             if (result) {
1355                 udata_close(result);
                     // make sure the closed handle is not used by the code below
                     result = NULL;
1356             }
1357             // This is not needed, as we are talking about
1358             // memory we got from UData
1359             //uprv_free(newInvUCA);
1360         }
1361
1362         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1363             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1364             UCollator *UCA = ucol_initUCA(status);
                 // Without a usable UCA collator there is nothing to compare against;
                 // bail out instead of dereferencing it.
                 if(U_FAILURE(*status) || UCA == NULL) {
                     if(U_SUCCESS(*status)) {
                         *status = U_INTERNAL_PROGRAM_ERROR;
                     }
                     udata_close(result);
                     return NULL;
                 }
1365             // UCA versions of UCA and inverse UCA should match
1366             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1367                 *status = U_INVALID_FORMAT_ERROR;
1368                 udata_close(result);
1369                 return NULL;
1370             }
1371
                 // publish under the global mutex; only the first caller to get here wins
1372             umtx_lock(NULL);
1373             if(_staticInvUCA == NULL) {
1374                 invUCA_DATA_MEM = result;
1375                 _staticInvUCA = newInvUCA;
1376                 result = NULL;
1377                 newInvUCA = NULL;
1378             }
1379             umtx_unlock(NULL);
1380
                 // newInvUCA is still set only if the table had already been published
1381             if(newInvUCA != NULL) {
1382                 udata_close(result);
1383                 // This is not needed, as we are talking about
1384                 // memory we got from UData
1385                 //uprv_free(newInvUCA);
1386             }
1387             else {
1388                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1389             }
1390         }
1391     }
1392     return _staticInvUCA;
1393 }
1394
1395 /* Names of the non-script reordering codes. These _must_ be kept in the order in
1396  * which they are to be applied as defaults and in sync with the UColReorderCode enum.
      * ucol_findReorderingEntry() returns (array index + UCOL_REORDER_CODE_FIRST) for a match.
1397  */
1398 static const char * const ReorderingTokenNames[] = {
1399     "SPACE",
1400     "PUNCT",
1401     "SYMBOL",
1402     "CURRENCY",
1403     "DIGIT"
1404 };
1405
1406 static void toUpper(const char* src, char* dst, uint32_t length) {
        /*
         * Copies src into dst, converting each char with uprv_toupper().
         * length is the capacity of dst in chars, including the terminating NUL:
         * at most length - 1 chars are copied and dst is always NUL-terminated.
         * A zero length leaves dst untouched.
         */
        if (length == 0) {
            /* length - 1 below would wrap around for an unsigned 0 */
            return;
        }
1407    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1408        *dst = uprv_toupper(*src);
1409    }
1410    *dst = '\0';
1411 }
1412
1413 U_INTERNAL int32_t U_EXPORT2
1414 ucol_findReorderingEntry(const char* name) {
         /*
          * Maps the name of a non-script reordering group to its reorder code.
          *
          * name is first upper-cased into a 32-char local buffer via toUpper(), so
          * the comparison ignores case as far as uprv_toupper() does, and only the
          * first 31 chars of name take part in it.
          *
          * Returns UCOL_REORDER_CODE_FIRST plus the index of the matching entry in
          * ReorderingTokenNames, or USCRIPT_INVALID_CODE when nothing matches.
          */
1415     char buffer[32];
1416     toUpper(name, buffer, 32);
1417     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
1418         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1419             return entry + UCOL_REORDER_CODE_FIRST;
1420         }
1421     }
1422     return USCRIPT_INVALID_CODE;
1423 }
1424
1425 #endif /* #if !UCONFIG_NO_COLLATION */