icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2011, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/udata.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28 #include "unicode/uscript.h"
  29 #include "unicode/ustring.h"
  30 #include "normalizer2impl.h"
  31 #include "ucol_bld.h"
  32 #include "ucol_elm.h"
  33 #include "ucol_cnt.h"
  34 #include "ucln_in.h"
  35 #include "umutex.h"
  36 #include "cmemory.h"
  37 #include "cstring.h"
  38
  39 static const InverseUCATableHeader* _staticInvUCA = NULL;
  40 static UDataMemory* invUCA_DATA_MEM = NULL;
  41
  42 U_CDECL_BEGIN
  43 static UBool U_CALLCONV
  44 isAcceptableInvUCA(void * /*context*/,
  45                    const char * /*type*/, const char * /*name*/,
  46                    const UDataInfo *pInfo)
  47 {
  48     /* context, type & name are intentionally not used */
  49     if( pInfo->size>=20 &&
  50         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  51         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  52         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  53         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  54         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  55         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  56         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  57         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  58         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  59         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  60         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  61         )
  62     {
  63         UVersionInfo UCDVersion;
  64         u_getUnicodeVersion(UCDVersion);
  65         return (pInfo->dataVersion[0]==UCDVersion[0] &&
  66             pInfo->dataVersion[1]==UCDVersion[1]);
  67             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  68             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  69             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  70     } else {
  71         return FALSE;
  72     }
  73 }
  74 U_CDECL_END
  75
  76 /*
  77 * Takes two CEs (lead and continuation) and
  78 * compares them as CEs should be compared:
  79 * primary vs. primary, secondary vs. secondary
  80 * tertiary vs. tertiary
  81 */
  82 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
  83     uint32_t s1 = source0, s2, t1 = target0, t2;
  84     if(isContinuation(source1)) {
  85         s2 = source1;
  86     } else {
  87         s2 = 0;
  88     }
  89     if(isContinuation(target1)) {
  90         t2 = target1;
  91     } else {
  92         t2 = 0;
  93     }
  94
  95     uint32_t s = 0, t = 0;
  96     if(s1 == t1 && s2 == t2) {
  97         return 0;
  98     }
  99     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
 100     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
 101     if(s < t) {
 102         return -1;
 103     } else if(s > t) {
 104         return 1;
 105     } else {
 106         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
 107         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
 108         if(s < t) {
 109             return -1;
 110         } else if(s > t) {
 111             return 1;
 112         } else {
 113             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
 114             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
 115             if(s < t) {
 116                 return -1;
 117             } else {
 118                 return 1;
 119             }
 120         }
 121     }
 122 }
 123
 124 static
 125 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
 126     uint32_t bottom = 0, top = src->invUCA->tableSize;
 127     uint32_t i = 0;
 128     uint32_t first = 0, second = 0;
 129     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 130     int32_t res = 0;
 131
 132     while(bottom < top-1) {
 133         i = (top+bottom)/2;
 134         first = *(CETable+3*i);
 135         second = *(CETable+3*i+1);
 136         res = compareCEs(first, second, CE, SecondCE);
 137         if(res > 0) {
 138             top = i;
 139         } else if(res < 0) {
 140             bottom = i;
 141         } else {
 142             break;
 143         }
 144     }
 145
 146     /* weiv:                                                  */
 147     /* in searching for elements, I have removed the failure  */
 148     /* The reason for this is that the builder does not rely  */
 149     /* on search mechanism telling it that it didn't find an  */
 150     /* element. However, indirect positioning relies on being */
 151     /* able to find the elements around any CE, even if it is */
 152     /* not defined in the UCA. */
 153     return i;
 154     /*
 155     if((first == CE && second == SecondCE)) {
 156     return i;
 157     } else {
 158     return -1;
 159     }
 160     */
 161 }
 162
 163 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
 164     0xFFFF0000,
 165     0xFFFFFF00,
 166     0xFFFFFFFF
 167 };
 168
 169 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 170                                             uint32_t CE, uint32_t contCE,
 171                                             uint32_t *nextCE, uint32_t *nextContCE,
 172                                             uint32_t strength)
 173 {
 174     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 175     int32_t iCE;
 176
 177     iCE = ucol_inv_findCE(src, CE, contCE);
 178
 179     if(iCE<0) {
 180         *nextCE = UCOL_NOT_FOUND;
 181         return -1;
 182     }
 183
 184     CE &= strengthMask[strength];
 185     contCE &= strengthMask[strength];
 186
 187     *nextCE = CE;
 188     *nextContCE = contCE;
 189
 190     while((*nextCE  & strengthMask[strength]) == CE
 191         && (*nextContCE  & strengthMask[strength]) == contCE)
 192     {
 193         *nextCE = (*(CETable+3*(++iCE)));
 194         *nextContCE = (*(CETable+3*(iCE)+1));
 195     }
 196
 197     return iCE;
 198 }
 199
 200 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 201                                             uint32_t CE, uint32_t contCE,
 202                                             uint32_t *prevCE, uint32_t *prevContCE,
 203                                             uint32_t strength)
 204 {
 205     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 206     int32_t iCE;
 207
 208     iCE = ucol_inv_findCE(src, CE, contCE);
 209
 210     if(iCE<0) {
 211         *prevCE = UCOL_NOT_FOUND;
 212         return -1;
 213     }
 214
 215     CE &= strengthMask[strength];
 216     contCE &= strengthMask[strength];
 217
 218     *prevCE = CE;
 219     *prevContCE = contCE;
 220
 221     while((*prevCE  & strengthMask[strength]) == CE
 222         && (*prevContCE  & strengthMask[strength])== contCE
 223         && iCE > 0) /* this condition should prevent falling off the edge of the world */
 224     {
 225         /* here, we end up in a singularity - zero */
 226         *prevCE = (*(CETable+3*(--iCE)));
 227         *prevContCE = (*(CETable+3*(iCE)+1));
 228     }
 229
 230     return iCE;
 231 }
 232
 233 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 234                                                        uint32_t prevCE, uint32_t prevContCE)
 235 {
 236     if(prevCE == CE && prevContCE == contCE) {
 237         return UCOL_IDENTICAL;
 238     }
 239     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
 240         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
 241     {
 242         return UCOL_PRIMARY;
 243     }
 244     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
 245         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
 246     {
 247         return UCOL_SECONDARY;
 248     }
 249     return UCOL_TERTIARY;
 250 }
 251
 252
 253 /*static
 254 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 255
 256     uint32_t CE = lh->baseCE;
 257     uint32_t SecondCE = lh->baseContCE;
 258
 259     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 260     uint32_t previousCE, previousContCE;
 261     int32_t iCE;
 262
 263     iCE = ucol_inv_findCE(src, CE, SecondCE);
 264
 265     if(iCE<0) {
 266         return -1;
 267     }
 268
 269     CE &= strengthMask[strength];
 270     SecondCE &= strengthMask[strength];
 271
 272     previousCE = CE;
 273     previousContCE = SecondCE;
 274
 275     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 276         previousCE = (*(CETable+3*(--iCE)));
 277         previousContCE = (*(CETable+3*(iCE)+1));
 278     }
 279     lh->previousCE = previousCE;
 280     lh->previousContCE = previousContCE;
 281
 282     return iCE;
 283 }*/
 284
 285 static
 286 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 287     uint32_t CE = lh->baseCE;
 288     uint32_t SecondCE = lh->baseContCE;
 289
 290     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 291     uint32_t nextCE, nextContCE;
 292     int32_t iCE;
 293
 294     iCE = ucol_inv_findCE(src, CE, SecondCE);
 295
 296     if(iCE<0) {
 297         return -1;
 298     }
 299
 300     CE &= strengthMask[strength];
 301     SecondCE &= strengthMask[strength];
 302
 303     nextCE = CE;
 304     nextContCE = SecondCE;
 305
 306     while((nextCE  & strengthMask[strength]) == CE
 307         && (nextContCE  & strengthMask[strength]) == SecondCE)
 308     {
 309         nextCE = (*(CETable+3*(++iCE)));
 310         nextContCE = (*(CETable+3*(iCE)+1));
 311     }
 312
 313     lh->nextCE = nextCE;
 314     lh->nextContCE = nextContCE;
 315
 316     return iCE;
 317 }
 318
 319 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 320     /* reset all the gaps */
 321     int32_t i = 0;
 322     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 323     uint32_t st = 0;
 324     uint32_t t1, t2;
 325     int32_t pos;
 326
 327     UColToken *tok = lh->first;
 328     uint32_t tokStrength = tok->strength;
 329
 330     for(i = 0; i<3; i++) {
 331         lh->gapsHi[3*i] = 0;
 332         lh->gapsHi[3*i+1] = 0;
 333         lh->gapsHi[3*i+2] = 0;
 334         lh->gapsLo[3*i] = 0;
 335         lh->gapsLo[3*i+1] = 0;
 336         lh->gapsLo[3*i+2] = 0;
 337         lh->numStr[i] = 0;
 338         lh->fStrToken[i] = NULL;
 339         lh->lStrToken[i] = NULL;
 340         lh->pos[i] = -1;
 341     }
 342
 343     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
 344
 345     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
 346         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
 347         lh->pos[0] = 0;
 348         t1 = lh->baseCE;
 349         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
 350         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 351         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 352         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 353         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
 354         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
 355
 356         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
 357         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
 358
 359         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 360         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 361         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 362     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
 363         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
 364         lh->pos[0] = 0;
 365         t1 = lh->baseCE;
 366         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
 367         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 368         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 369         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 370         t1 = lh->nextCE;
 371         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
 372         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 373         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 374         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 375     } else {
 376         for(;;) {
 377             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 378                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
 379                     lh->fStrToken[tokStrength] = tok;
 380                 } else { /* The CE must be implicit, since it's not in the table */
 381                     /* Error */
 382                     *status = U_INTERNAL_PROGRAM_ERROR;
 383                 }
 384             }
 385
 386             while(tok != NULL && tok->strength >= tokStrength) {
 387                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 388                     lh->lStrToken[tokStrength] = tok;
 389                 }
 390                 tok = tok->next;
 391             }
 392             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
 393                 /* check if previous interval is the same and merge the intervals if it is so */
 394                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
 395                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
 396                     lh->fStrToken[tokStrength+1] = NULL;
 397                     lh->lStrToken[tokStrength+1] = NULL;
 398                     lh->pos[tokStrength+1] = -1;
 399                 }
 400             }
 401             if(tok != NULL) {
 402                 tokStrength = tok->strength;
 403             } else {
 404                 break;
 405             }
 406         }
 407         for(st = 0; st < 3; st++) {
 408             if((pos = lh->pos[st]) >= 0) {
 409                 t1 = *(CETable+3*(pos));
 410                 t2 = *(CETable+3*(pos)+1);
 411                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 412                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 413                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 414                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 415                 //pos--;
 416                 //t1 = *(CETable+3*(pos));
 417                 //t2 = *(CETable+3*(pos)+1);
 418                 t1 = lh->baseCE;
 419                 t2 = lh->baseContCE;
 420                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 421                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 422                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 423             }
 424         }
 425     }
 426 }
 427
 428
 429 #define ucol_countBytes(value, noOfBytes)   \
 430 {                               \
 431     uint32_t mask = 0xFFFFFFFF;   \
 432     (noOfBytes) = 0;              \
 433     while(mask != 0) {            \
 434     if(((value) & mask) != 0) { \
 435     (noOfBytes)++;            \
 436     }                           \
 437     mask >>= 8;                 \
 438     }                             \
 439 }
 440
 441 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 442     if(U_SUCCESS(*status)) {
 443         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 444     }
 445     return g->current;
 446 }
 447
 448 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 449     /* TODO: rename to enum names */
 450     uint32_t high, low, count=1;
 451     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 452
 453     if(strength == UCOL_SECONDARY) {
 454         low = UCOL_COMMON_TOP2<<24;
 455         high = 0xFFFFFFFF;
 456         count = 0xFF - UCOL_COMMON_TOP2;
 457     } else {
 458         low = UCOL_BYTE_COMMON << 24; //0x05000000;
 459         high = 0x40000000;
 460         count = 0x40 - UCOL_BYTE_COMMON;
 461     }
 462
 463     if(tok->next != NULL && tok->next->strength == strength) {
 464         count = tok->next->toInsert;
 465     }
 466
 467     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 468     g->current = UCOL_BYTE_COMMON<<24;
 469
 470     if(g->noOfRanges == 0) {
 471         *status = U_INTERNAL_PROGRAM_ERROR;
 472     }
 473     return g->current;
 474 }
 475
 476 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 477     uint32_t strength = tok->strength;
 478     uint32_t low = lows[fStrength*3+strength];
 479     uint32_t high = highs[fStrength*3+strength];
 480     uint32_t maxByte = 0;
 481     if(strength == UCOL_TERTIARY) {
 482         maxByte = 0x3F;
 483     } else if(strength == UCOL_PRIMARY) {
 484         maxByte = 0xFE;
 485     } else {
 486         maxByte = 0xFF;
 487     }
 488
 489     uint32_t count = tok->toInsert;
 490
 491     if(low >= high && strength > UCOL_PRIMARY) {
 492         int32_t s = strength;
 493         for(;;) {
 494             s--;
 495             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 496                 if(strength == UCOL_SECONDARY) {
 497                     if (low < UCOL_COMMON_TOP2<<24 ) {
 498                        // Override if low range is less than UCOL_COMMON_TOP2.
 499                         low = UCOL_COMMON_TOP2<<24;
 500                     }
 501                     high = 0xFFFFFFFF;
 502                 } else {
 503                     // Override if low range is less than UCOL_COMMON_BOT3.
 504                     if ( low < UCOL_COMMON_BOT3<<24 ) {
 505                         low = UCOL_COMMON_BOT3<<24;
 506                     }
 507                     high = 0x40000000;
 508                 }
 509                 break;
 510             }
 511             if(s<0) {
 512                 *status = U_INTERNAL_PROGRAM_ERROR;
 513                 return 0;
 514             }
 515         }
 516     }
 517
 518     if(low < 0x02000000) {
 519         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
 520         // See http://site.icu-project.org/design/collation/bytes
 521         low = 0x02000000;
 522     }
 523
 524     if(strength == UCOL_SECONDARY) { /* similar as simple */
 525         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 526             low = UCOL_COMMON_TOP2<<24;
 527         }
 528         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 529             high = UCOL_COMMON_TOP2<<24;
 530         }
 531         if(low < (UCOL_COMMON_BOT2<<24)) {
 532             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 533             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 534             //g->current = UCOL_COMMON_BOT2<<24;
 535             return g->current;
 536         }
 537     }
 538
 539     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 540     if(g->noOfRanges == 0) {
 541         *status = U_INTERNAL_PROGRAM_ERROR;
 542     }
 543     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 544     return g->current;
 545 }
 546
 547 static
 548 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 549     uint32_t i = 0;
 550     UChar c;
 551
 552     if(U_FAILURE(*status)) {
 553         return 0;
 554     }
 555
 556     if(sourceLen > resLen) {
 557         *status = U_MEMORY_ALLOCATION_ERROR;
 558         return 0;
 559     }
 560
 561     for(i = 0; i < sourceLen; i++) {
 562         c = source[i];
 563         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 564             switch(c - 0x3000) {
 565             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 566             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 567                 c++;
 568                 break;
 569             case 0xF5:
 570                 c = 0x30AB;
 571                 break;
 572             case 0xF6:
 573                 c = 0x30B1;
 574                 break;
 575             }
 576         }
 577         resBuf[i] = c;
 578     }
 579     return sourceLen;
 580 }
 581
 582 static
 583 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 584     uint32_t i = 0;
 585     UChar c;
 586
 587     if(U_FAILURE(*status)) {
 588         return 0;
 589     }
 590
 591     if(sourceLen > resLen) {
 592         *status = U_MEMORY_ALLOCATION_ERROR;
 593         return 0;
 594     }
 595
 596     for(i = 0; i < sourceLen; i++) {
 597         c = source[i];
 598         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 599             switch(c - 0x3000) {
 600             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 601             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 602                 c--;
 603                 break;
 604             case 0xAB:
 605                 c = 0x30F5;
 606                 break;
 607             case 0xB1:
 608                 c = 0x30F6;
 609                 break;
 610             }
 611         }
 612         resBuf[i] = c;
 613     }
 614     return sourceLen;
 615 }
 616
 617 U_NAMESPACE_BEGIN
 618
 619 static
 620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 621     uint32_t i = 0;
 622     UChar n[128];
 623     uint32_t nLen = 0;
 624     uint32_t uCount = 0, lCount = 0;
 625
 626     collIterate s;
 627     uint32_t order = 0;
 628
 629     if(U_FAILURE(*status)) {
 630         return UCOL_LOWER_CASE;
 631     }
 632
 633     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 634     if(U_SUCCESS(*status)) {
 635         for(i = 0; i < nLen; i++) {
 636             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
 637             order = ucol_getNextCE(UCA, &s, status);
 638             if(isContinuation(order)) {
 639                 *status = U_INTERNAL_PROGRAM_ERROR;
 640                 return UCOL_LOWER_CASE;
 641             }
 642             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 643                 uCount++;
 644             } else {
 645                 if(u_islower(n[i])) {
 646                     lCount++;
 647                 } else if(U_SUCCESS(*status)) {
 648                     UChar sk[1], lk[1];
 649                     u_toSmallKana(&n[i], 1, sk, 1, status);
 650                     u_toLargeKana(&n[i], 1, lk, 1, status);
 651                     if(sk[0] == n[i] && lk[0] != n[i]) {
 652                         lCount++;
 653                     }
 654                 }
 655             }
 656         }
 657     }
 658
 659     if(uCount != 0 && lCount != 0) {
 660         return UCOL_MIXED_CASE;
 661     } else if(uCount != 0) {
 662         return UCOL_UPPER_CASE;
 663     } else {
 664         return UCOL_LOWER_CASE;
 665     }
 666 }
 667
 668
 669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
 670     /* this one makes the table and stuff */
 671     uint32_t noOfBytes[3];
 672     uint32_t i;
 673
 674     for(i = 0; i<3; i++) {
 675         ucol_countBytes(CEparts[i], noOfBytes[i]);
 676     }
 677
 678     /* Here we have to pack CEs from parts */
 679
 680     uint32_t CEi = 0;
 681     uint32_t value = 0;
 682
 683     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
 684         if(CEi > 0) {
 685             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
 686         } else {
 687             value = 0;
 688         }
 689
 690         if(2*CEi<noOfBytes[0]) {
 691             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
 692         }
 693         if(CEi<noOfBytes[1]) {
 694             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
 695         }
 696         if(CEi<noOfBytes[2]) {
 697             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
 698         }
 699         tok->CEs[CEi] = value;
 700         CEi++;
 701     }
 702     if(CEi == 0) { /* totally ignorable */
 703         tok->noOfCEs = 1;
 704         tok->CEs[0] = 0;
 705     } else { /* there is at least something */
 706         tok->noOfCEs = CEi;
 707     }
 708
 709
 710     // we want to set case bits here and now, not later.
 711     // Case bits handling
 712     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
 713         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 714         int32_t cSize = (tok->source & 0xFF000000) >> 24;
 715         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
 716
 717         if(cSize > 1) {
 718             // Do it manually
 719             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
 720         } else {
 721             // Copy it from the UCA
 722             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
 723             tok->CEs[0] |= (caseCE & 0xC0);
 724         }
 725     }
 726
 727 #if UCOL_DEBUG==2
 728     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
 729     for(i = 0; i<tok->noOfCEs; i++) {
 730         fprintf(stderr, "%08X ", tok->CEs[i]);
 731     }
 732     fprintf(stderr, "\n");
 733 #endif
 734 }
 735
 736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 737     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
 738     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
 739
 740     UColToken *tok = lh->last;
 741     uint32_t t[UCOL_STRENGTH_LIMIT];
 742
 743     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
 744
 745     tok->toInsert = 1;
 746     t[tok->strength] = 1;
 747
 748     while(tok->previous != NULL) {
 749         if(tok->previous->strength < tok->strength) { /* going up */
 750             t[tok->strength] = 0;
 751             t[tok->previous->strength]++;
 752         } else if(tok->previous->strength > tok->strength) { /* going down */
 753             t[tok->previous->strength] = 1;
 754         } else {
 755             t[tok->strength]++;
 756         }
 757         tok=tok->previous;
 758         tok->toInsert = t[tok->strength];
 759     }
 760
 761     tok->toInsert = t[tok->strength];
 762     ucol_inv_getGapPositions(src, lh, status);
 763
 764 #if UCOL_DEBUG
 765     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
 766     int32_t j = 2;
 767     for(j = 2; j >= 0; j--) {
 768         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
 769         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
 770     }
 771     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
 772
 773     do {
 774         fprintf(stderr,"%i", tok->strength);
 775         tok = tok->next;
 776     } while(tok != NULL);
 777     fprintf(stderr, "\n");
 778
 779     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
 780
 781     do {
 782         fprintf(stderr,"%i", tok->toInsert);
 783         tok = tok->next;
 784     } while(tok != NULL);
 785 #endif
 786
 787     tok = lh->first;
 788     uint32_t fStrength = UCOL_IDENTICAL;
 789     uint32_t initStrength = UCOL_IDENTICAL;
 790
 791
 792     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
 793     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
 794     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
 795
 796     while (tok != NULL && U_SUCCESS(*status)) {
 797         fStrength = tok->strength;
 798         if(fStrength < initStrength) {
 799             initStrength = fStrength;
 800             if(lh->pos[fStrength] == -1) {
 801                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
 802                     fStrength--;
 803                 }
 804                 if(lh->pos[fStrength] == -1) {
 805                     *status = U_INTERNAL_PROGRAM_ERROR;
 806                     return;
 807                 }
 808             }
 809             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
 810                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 811                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
 812                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
 813                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
 814             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
 815                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 816                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
 817                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 818                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 819             } else { /* primaries */
 820                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
 821                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 822                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 824             }
 825         } else {
 826             if(tok->strength == UCOL_TERTIARY) {
 827                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
 828             } else if(tok->strength == UCOL_SECONDARY) {
 829                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
 830                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 831             } else if(tok->strength == UCOL_PRIMARY) {
 832                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
 833                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 834                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 835             }
 836         }
 837         ucol_doCE(src, CEparts, tok, status);
 838         tok = tok->next;
 839     }
 840 }
 841
 842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
 843     UCAElements el;
 844     UColToken *tok = lh->first;
 845     UColToken *expt = NULL;
 846     uint32_t i = 0, j = 0;
 847     UChar32 fcdHighStart;
 848     const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
 849
 850     while(tok != NULL && U_SUCCESS(*status)) {
 851         /* first, check if there are any expansions */
 852         /* if there are expansions, we need to do a little bit more processing */
 853         /* since parts of expansion can be tailored, while others are not */
 854         if(tok->expansion != 0) {
 855             uint32_t len = tok->expansion >> 24;
 856             uint32_t currentSequenceLen = len;
 857             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 858             //uint32_t exp = currentSequenceLen | expOffset;
 859             UColToken exp;
 860             exp.source = currentSequenceLen | expOffset;
 861             exp.rulesToParseHdl = &(src->source);
 862
 863             while(len > 0) {
 864                 currentSequenceLen = len;
 865                 while(currentSequenceLen > 0) {
 866                     exp.source = (currentSequenceLen << 24) | expOffset;
 867                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 868                         uint32_t noOfCEsToCopy = expt->noOfCEs;
 869                         for(j = 0; j<noOfCEsToCopy; j++) {
 870                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 871                         }
 872                         tok->noOfExpCEs += noOfCEsToCopy;
 873                         // Smart people never try to add codepoints and CEs.
 874                         // For some odd reason, it won't work.
 875                         expOffset += currentSequenceLen; //noOfCEsToCopy;
 876                         len -= currentSequenceLen; //noOfCEsToCopy;
 877                         break;
 878                     } else {
 879                         currentSequenceLen--;
 880                     }
 881                 }
 882                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 883                     /* will have to get one from UCA */
 884                     /* first, get the UChars from the rules */
 885                     /* then pick CEs out until there is no more and stuff them into expansion */
 886                     collIterate s;
 887                     uint32_t order = 0;
 888                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
 889
 890                     for(;;) {
 891                         order = ucol_getNextCE(src->UCA, &s, status);
 892                         if(order == UCOL_NO_MORE_CES) {
 893                             break;
 894                         }
 895                         tok->expCEs[tok->noOfExpCEs++] = order;
 896                     }
 897                     expOffset++;
 898                     len--;
 899                 }
 900             }
 901         } else {
 902             tok->noOfExpCEs = 0;
 903         }
 904
 905         /* set the ucaelement with obtained values */
 906         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 907         /* copy CEs */
 908         for(i = 0; i<tok->noOfCEs; i++) {
 909             el.CEs[i] = tok->CEs[i];
 910         }
 911         for(i = 0; i<tok->noOfExpCEs; i++) {
 912             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 913         }
 914
 915         /* copy UChars */
 916         // We kept prefix and source kind of together, as it is a kind of a contraction.
 917         // However, now we have to slice the prefix off the main thing -
 918         el.prefix = el.prefixChars;
 919         el.cPoints = el.uchars;
 920         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 921             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 922             // decomposed elements to the unsaf table.
 923             el.prefixSize = tok->prefix>>24;
 924             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 925
 926             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 927             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 928         } else {
 929             el.prefixSize = 0;
 930             *el.prefix = 0;
 931
 932             el.cSize = (tok->source >> 24);
 933             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 934         }
 935         if(src->UCA != NULL) {
 936             for(i = 0; i<el.cSize; i++) {
 937                 if(UCOL_ISJAMO(el.cPoints[i])) {
 938                     t->image->jamoSpecial = TRUE;
 939                 }
 940             }
 941             if (!src->buildCCTabFlag && el.cSize > 0) {
 942                 // Check the trailing canonical combining class (tccc) of the last character.
 943                 const UChar *s = el.cPoints + el.cSize;
 944                 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
 945                 if ((fcd & 0xff) != 0) {
 946                     src->buildCCTabFlag = TRUE;
 947                 }
 948             }
 949         }
 950
 951         /* and then, add it */
 952 #if UCOL_DEBUG==2
 953         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 954 #endif
 955         uprv_uca_addAnElement(t, &el, status);
 956
 957 #if UCOL_DEBUG_DUPLICATES
 958         if(*status != U_ZERO_ERROR) {
 959             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 960             *status = U_ZERO_ERROR;
 961         }
 962 #endif
 963
 964         tok = tok->next;
 965     }
 966 }
 967
 968 U_CDECL_BEGIN
 969 static UBool U_CALLCONV
 970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 971     UErrorCode status = U_ZERO_ERROR;
 972     tempUCATable *t = (tempUCATable *)context;
 973     if(value == 0) {
 974         while(start < limit) {
 975             uint32_t CE = utrie_get32(t->mapping, start, NULL);
 976             if(CE == UCOL_NOT_FOUND) {
 977                 UCAElements el;
 978                 el.isThai = FALSE;
 979                 el.prefixSize = 0;
 980                 el.prefixChars[0] = 0;
 981                 el.prefix = el.prefixChars;
 982                 el.cPoints = el.uchars;
 983
 984                 el.cSize = 0;
 985                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
 986
 987                 el.noOfCEs = 1;
 988                 el.CEs[0] = 0;
 989                 uprv_uca_addAnElement(t, &el, &status);
 990
 991             }
 992             start++;
 993         }
 994     }
 995     if(U_FAILURE(status)) {
 996         return FALSE;
 997     } else {
 998         return TRUE;
 999     }
1000 }
1001 U_CDECL_END
1002
1003 static void
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005                                UChar32 start, UChar32 end,
1006                                UErrorCode *status)
1007 {
1008     //UChar decomp[256];
1009     uint32_t CE = UCOL_NOT_FOUND;
1010     UChar32 u = 0;
1011     UCAElements el;
1012     el.isThai = FALSE;
1013     el.prefixSize = 0;
1014     el.prefixChars[0] = 0;
1015     collIterate colIt;
1016
1017     if(U_SUCCESS(*status)) {
1018         for(u = start; u<=end; u++) {
1019             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020                 /* this test is for contractions that are missing the starting element. */
1021                 || ((isCntTableElement(CE)) &&
1022                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023                 )
1024             {
1025                 el.cSize = 0;
1026                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027                 //decomp[0] = (UChar)u;
1028                 //el.uchars[0] = (UChar)u;
1029                 el.cPoints = el.uchars;
1030                 //el.cSize = 1;
1031                 el.noOfCEs = 0;
1032                 el.prefix = el.prefixChars;
1033                 el.prefixSize = 0;
1034                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035                 // We actually want to check whether this element is a special
1036                 // If it is an implicit element (hangul, CJK - we want to copy the
1037                 // special, not the resolved CEs) - for hangul, copying resolved
1038                 // would just make things the same (there is an expansion and it
1039                 // takes approximately the same amount of time to resolve as
1040                 // falling back to the UCA).
1041                 /*
1042                 UTRIE_GET32(src->UCA->mapping, u, CE);
1043                 tag = getCETag(CE);
1044                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046                 || tag == LEAD_SURROGATE_TAG) {
1047                 el.CEs[el.noOfCEs++] = CE;
1048                 } else {
1049                 */
1050                 // It turns out that it does not make sense to keep implicits
1051                 // unresolved. The cost of resolving them is big enough so that
1052                 // it doesn't make any difference whether we have to go to the UCA
1053                 // or not.
1054                 {
1055                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056                     while(CE != UCOL_NO_MORE_CES) {
1057                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1058                         if(CE != UCOL_NO_MORE_CES) {
1059                             el.CEs[el.noOfCEs++] = CE;
1060                         }
1061                     }
1062                 }
1063                 uprv_uca_addAnElement(t, &el, status);
1064             }
1065         }
1066     }
1067 }
1068
1069 U_NAMESPACE_END
1070
1071 U_CFUNC UCATableHeader *
1072 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1073     U_NAMESPACE_USE
1074
1075     uint32_t i = 0;
1076     if(U_FAILURE(*status)) {
1077         return NULL;
1078     }
1079     /*
1080     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1081     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082     create new ListHeader X
1083     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1084     first item you add, based on the stronger strength levels of the two lists.
1085     */
1086     /*
1087     3.  For each ListHeader with a non-null positive list:
1088     */
1089     /*
1090     o   Find all character strings with CEs between the baseCE and the
1091     next/previous CE, at the strength of the first token. Add these to the
1092     tailoring.
1093     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1094     tailoring has & x < z...
1095     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1096     */
1097     /* It is possible that this part should be done even while constructing list */
1098     /* The problem is that it is unknown what is going to be the strongest weight */
1099     /* So we might as well do it here */
1100
1101     /*
1102     o   Allocate CEs for each token in the list, based on the total number N of the
1103     largest level difference, and the gap G between baseCE and nextCE at that
1104     level. The relation * between the last item and nextCE is the same as the
1105     strongest strength.
1106     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108     Then fit b and c into the secondary gap between a and d, then fit q
1109     into the tertiary gap between b and c.
1110
1111     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1112     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113     Then fit q into the tertiary gap between b and c.
1114     o   When incrementing primary values, we will not cross high byte
1115     boundaries except where there is only a single-byte primary. That is to
1116     ensure that the script reordering will continue to work.
1117     */
1118     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119     /* test for NULL */
1120     if (image == NULL) {
1121         *status = U_MEMORY_ALLOCATION_ERROR;
1122         return NULL;
1123     }
1124     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126     for(i = 0; i<src->resultLen; i++) {
1127         /* now we need to generate the CEs */
1128         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129         /* According to strength                                                          */
1130         if(U_SUCCESS(*status)) {
1131             if(src->lh[i].first) { // if there are any elements
1132                 // due to the way parser works, subsequent tailorings
1133                 // may remove all the elements from a sequence, therefore
1134                 // leaving an empty tailoring sequence.
1135                 ucol_initBuffers(src, &src->lh[i], status);
1136             }
1137         }
1138         if(U_FAILURE(*status)) {
1139             uprv_free(image);
1140             return NULL;
1141         }
1142     }
1143
1144     if(src->varTop != NULL) { /* stuff the variable top value */
1145         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146         /* remove it from the list */
1147         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148             src->varTop->listHeader->first = src->varTop->next;
1149         }
1150         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151             src->varTop->listHeader->last = src->varTop->previous;
1152         }
1153         if(src->varTop->next != NULL) {
1154             src->varTop->next->previous = src->varTop->previous;
1155         }
1156         if(src->varTop->previous != NULL) {
1157             src->varTop->previous->next = src->varTop->next;
1158         }
1159     }
1160
1161
1162     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163     if(U_FAILURE(*status)) {
1164         uprv_free(image);
1165         return NULL;
1166     }
1167
1168
1169     /* After this, we have assigned CE values to all regular CEs      */
1170     /* now we will go through list once more and resolve expansions,  */
1171     /* make UCAElements structs and add them to table                 */
1172     for(i = 0; i<src->resultLen; i++) {
1173         /* now we need to generate the CEs */
1174         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175         /* According to strength                                                          */
1176         if(U_SUCCESS(*status)) {
1177             ucol_createElements(src, t, &src->lh[i], status);
1178         }
1179     }
1180
1181     UCAElements el;
1182     el.isThai = FALSE;
1183     el.prefixSize = 0;
1184     el.prefixChars[0] = 0;
1185
1186     /* add latin-1 stuff */
1187     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1188
1189     /* add stuff for copying */
1190     if(src->copySet != NULL) {
1191         int32_t i = 0;
1192         UnicodeSet *set = (UnicodeSet *)src->copySet;
1193         for(i = 0; i < set->getRangeCount(); i++) {
1194             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1195         }
1196     }
1197
1198     if(U_SUCCESS(*status)) {
1199         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201         uint32_t tailoredCE = UCOL_NOT_FOUND;
1202         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1203         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1204         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205         // Check for null pointer
1206         if (ucaEl == NULL) {
1207                 *status = U_MEMORY_ALLOCATION_ERROR;
1208                 return NULL;
1209         }
1210         while(*conts != 0) {
1211             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1212             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1213             if(tailoredCE != UCOL_NOT_FOUND) {
1214                 UBool needToAdd = TRUE;
1215                 if(isCntTableElement(tailoredCE)) {
1216                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1217                         needToAdd = FALSE;
1218                     }
1219                 }
1220                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1221                     UCAElements elm;
1222                     elm.cPoints = el.uchars;
1223                     elm.noOfCEs = 0;
1224                     elm.uchars[0] = *conts;
1225                     elm.uchars[1] = 0;
1226                     elm.cSize = 1;
1227                     elm.prefixChars[0] = *(conts+2);
1228                     elm.isThai = FALSE;
1229                     elm.prefix = elm.prefixChars;
1230                     elm.prefixSize = 1;
1231                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1232                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1233                         needToAdd = TRUE;
1234                     }
1235                 }
1236                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1237                     needToAdd = FALSE;
1238                 }
1239
1240                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1241                     if (*(conts+1) != 0) {  // contractions
1242                         el.prefix = el.prefixChars;
1243                         el.prefixSize = 0;
1244                         el.cPoints = el.uchars;
1245                         el.noOfCEs = 0;
1246                         el.uchars[0] = *conts;
1247                         el.uchars[1] = *(conts+1);
1248                         if(*(conts+2)!=0) {
1249                             el.uchars[2] = *(conts+2);
1250                             el.cSize = 3;
1251                         } else {
1252                             el.cSize = 2;
1253                         }
1254                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1255                     }
1256                     else { // pre-context character
1257                         UChar str[4] = { 0 };
1258                         int32_t len=0;
1259                         int32_t preKeyLen=0;
1260
1261                         el.cPoints = el.uchars;
1262                         el.noOfCEs = 0;
1263                         el.uchars[0] = *conts;
1264                         el.uchars[1] = 0;
1265                         el.cSize = 1;
1266                         el.prefixChars[0] = *(conts+2);
1267                         el.prefix = el.prefixChars;
1268                         el.prefixSize = 1;
1269                         if (el.prefixChars[0]!=0) {
1270                             // get CE of prefix character first
1271                             str[0]=el.prefixChars[0];
1272                             str[1]=0;
1273                             ucol_setText(ucaEl, str, 1, status);
1274                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1275                                     != UCOL_NULLORDER) {
1276                                 preKeyLen++;  // count number of keys for prefix character
1277                             }
1278                             str[len++] = el.prefixChars[0];
1279                         }
1280
1281                         str[len++] = el.uchars[0];
1282                         str[len]=0;
1283                         ucol_setText(ucaEl, str, len, status);
1284                         // Skip the keys for prefix character, then copy the rest to el.
1285                         while ((preKeyLen-->0) &&
1286                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1287                             continue;
1288                         }
1289
1290                     }
1291                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1292                         el.noOfCEs++;
1293                     }
1294                     uprv_uca_addAnElement(t, &el, status);
1295                 }
1296
1297             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1298                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1299             }
1300             conts+=3;
1301         }
1302         ucol_closeElements(ucaEl);
1303     }
1304
1305     // Add completely ignorable elements
1306     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1307
1308     // add tailoring characters related canonical closures
1309     uprv_uca_canonicalClosure(t, src, NULL, status);
1310
1311     /* still need to produce compatibility closure */
1312
1313     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1314
1315     uprv_uca_closeTempTable(t);
1316     uprv_free(image);
1317
1318     return myData;
1319 }
1320
1321 U_CDECL_BEGIN
1322 static UBool U_CALLCONV
1323 ucol_bld_cleanup(void)
1324 {
1325     udata_close(invUCA_DATA_MEM);
1326     invUCA_DATA_MEM = NULL;
1327     _staticInvUCA = NULL;
1328     return TRUE;
1329 }
1330 U_CDECL_END
1331
1332 U_CAPI const InverseUCATableHeader * U_EXPORT2
1333 ucol_initInverseUCA(UErrorCode *status)
1334 {
1335     if(U_FAILURE(*status)) return NULL;
1336
1337     UBool needsInit;
1338     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1339
1340     if(needsInit) {
1341         InverseUCATableHeader *newInvUCA = NULL;
1342         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1343
1344         if(U_FAILURE(*status)) {
1345             if (result) {
1346                 udata_close(result);
1347             }
1348             // This is not needed, as we are talking about
1349             // memory we got from UData
1350             //uprv_free(newInvUCA);
1351         }
1352
1353         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1354             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1355             UCollator *UCA = ucol_initUCA(status);
1356             // UCA versions of UCA and inverse UCA should match
1357             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1358                 *status = U_INVALID_FORMAT_ERROR;
1359                 udata_close(result);
1360                 return NULL;
1361             }
1362
1363             umtx_lock(NULL);
1364             if(_staticInvUCA == NULL) {
1365                 invUCA_DATA_MEM = result;
1366                 _staticInvUCA = newInvUCA;
1367                 result = NULL;
1368                 newInvUCA = NULL;
1369             }
1370             umtx_unlock(NULL);
1371
1372             if(newInvUCA != NULL) {
1373                 udata_close(result);
1374                 // This is not needed, as we are talking about
1375                 // memory we got from UData
1376                 //uprv_free(newInvUCA);
1377             }
1378             else {
1379                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1380             }
1381         }
1382     }
1383     return _staticInvUCA;
1384 }
1385
1386 /* This is the data that is used for non-script reordering codes. These _must_ be kept
1387  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
1388  */
1389 static const char* ReorderingTokenNames[] = {
1390     "SPACE",
1391     "PUNCT",
1392     "SYMBOL",
1393     "CURRENCY",
1394     "DIGIT",
1395     NULL
1396 };
1397
1398 static void toUpper(const char* src, char* dst, uint32_t length) {
1399    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1400        *dst = toupper(*src);
1401    }
1402    *dst = '\0';
1403 }
1404
1405 U_INTERNAL int32_t U_EXPORT2
1406 ucol_findReorderingEntry(const char* name) {
1407     char buffer[32];
1408     toUpper(name, buffer, 32);
1409     for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1410         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1411             return entry + UCOL_REORDER_CODE_FIRST;
1412         }
1413     }
1414     return USCRIPT_INVALID_CODE;
1415 }
1416
1417 #endif /* #if !UCONFIG_NO_COLLATION */