icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2011, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/udata.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28 #include "unicode/uscript.h"
  29 #include "unicode/ustring.h"
  30 #include "unicode/utf16.h"
  31 #include "normalizer2impl.h"
  32 #include "ucol_bld.h"
  33 #include "ucol_elm.h"
  34 #include "ucol_cnt.h"
  35 #include "ucln_in.h"
  36 #include "umutex.h"
  37 #include "cmemory.h"
  38 #include "cstring.h"
  39
/* File-scope pointer to an inverse UCA table header; starts out NULL.
 * NOTE(review): presumably a cache filled when the inverse UCA data is loaded -
 * the code that assigns it is not in this part of the file; confirm. */
static const InverseUCATableHeader* _staticInvUCA = NULL;
/* File-scope UDataMemory handle; starts out NULL.
 * NOTE(review): presumably the data handle backing _staticInvUCA - confirm. */
static UDataMemory* invUCA_DATA_MEM = NULL;
  42
  43 U_CDECL_BEGIN
  44 static UBool U_CALLCONV
  45 isAcceptableInvUCA(void * /*context*/,
  46                    const char * /*type*/, const char * /*name*/,
  47                    const UDataInfo *pInfo)
  48 {
  49     /* context, type & name are intentionally not used */
  50     if( pInfo->size>=20 &&
  51         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  52         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  53         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  54         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  55         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  56         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  57         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  58         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  59         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  60         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  61         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  62         )
  63     {
  64         UVersionInfo UCDVersion;
  65         u_getUnicodeVersion(UCDVersion);
  66         return (pInfo->dataVersion[0]==UCDVersion[0] &&
  67             pInfo->dataVersion[1]==UCDVersion[1]);
  68             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  69             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  70             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  71     } else {
  72         return FALSE;
  73     }
  74 }
  75 U_CDECL_END
  76
  77 /*
  78 * Takes two CEs (lead and continuation) and
  79 * compares them as CEs should be compared:
  80 * primary vs. primary, secondary vs. secondary
  81 * tertiary vs. tertiary
  82 */
  83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
  84     uint32_t s1 = source0, s2, t1 = target0, t2;
  85     if(isContinuation(source1)) {
  86         s2 = source1;
  87     } else {
  88         s2 = 0;
  89     }
  90     if(isContinuation(target1)) {
  91         t2 = target1;
  92     } else {
  93         t2 = 0;
  94     }
  95
  96     uint32_t s = 0, t = 0;
  97     if(s1 == t1 && s2 == t2) {
  98         return 0;
  99     }
 100     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
 101     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
 102     if(s < t) {
 103         return -1;
 104     } else if(s > t) {
 105         return 1;
 106     } else {
 107         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
 108         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
 109         if(s < t) {
 110             return -1;
 111         } else if(s > t) {
 112             return 1;
 113         } else {
 114             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
 115             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
 116             if(s < t) {
 117                 return -1;
 118             } else {
 119                 return 1;
 120             }
 121         }
 122     }
 123 }
 124
 125 static
 126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
 127     uint32_t bottom = 0, top = src->invUCA->tableSize;
 128     uint32_t i = 0;
 129     uint32_t first = 0, second = 0;
 130     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 131     int32_t res = 0;
 132
 133     while(bottom < top-1) {
 134         i = (top+bottom)/2;
 135         first = *(CETable+3*i);
 136         second = *(CETable+3*i+1);
 137         res = compareCEs(first, second, CE, SecondCE);
 138         if(res > 0) {
 139             top = i;
 140         } else if(res < 0) {
 141             bottom = i;
 142         } else {
 143             break;
 144         }
 145     }
 146
 147     /* weiv:                                                  */
 148     /* in searching for elements, I have removed the failure  */
 149     /* The reason for this is that the builder does not rely  */
 150     /* on search mechanism telling it that it didn't find an  */
 151     /* element. However, indirect positioning relies on being */
 152     /* able to find the elements around any CE, even if it is */
 153     /* not defined in the UCA. */
 154     return i;
 155     /*
 156     if((first == CE && second == SecondCE)) {
 157     return i;
 158     } else {
 159     return -1;
 160     }
 161     */
 162 }
 163
/*
 * Bit masks applied to a 32-bit CE, indexed by strength; the table has
 * UCOL_CE_STRENGTH_LIMIT entries. Each mask keeps the bits of that level
 * and of every stronger level.
 */
static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    0xFFFF0000, /* upper 16 bits only */
    0xFFFFFF00, /* upper 24 bits */
    0xFFFFFFFF  /* all 32 bits */
};
 169
 170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 171                                             uint32_t CE, uint32_t contCE,
 172                                             uint32_t *nextCE, uint32_t *nextContCE,
 173                                             uint32_t strength)
 174 {
 175     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 176     int32_t iCE;
 177
 178     iCE = ucol_inv_findCE(src, CE, contCE);
 179
 180     if(iCE<0) {
 181         *nextCE = UCOL_NOT_FOUND;
 182         return -1;
 183     }
 184
 185     CE &= strengthMask[strength];
 186     contCE &= strengthMask[strength];
 187
 188     *nextCE = CE;
 189     *nextContCE = contCE;
 190
 191     while((*nextCE  & strengthMask[strength]) == CE
 192         && (*nextContCE  & strengthMask[strength]) == contCE)
 193     {
 194         *nextCE = (*(CETable+3*(++iCE)));
 195         *nextContCE = (*(CETable+3*(iCE)+1));
 196     }
 197
 198     return iCE;
 199 }
 200
 201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 202                                             uint32_t CE, uint32_t contCE,
 203                                             uint32_t *prevCE, uint32_t *prevContCE,
 204                                             uint32_t strength)
 205 {
 206     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 207     int32_t iCE;
 208
 209     iCE = ucol_inv_findCE(src, CE, contCE);
 210
 211     if(iCE<0) {
 212         *prevCE = UCOL_NOT_FOUND;
 213         return -1;
 214     }
 215
 216     CE &= strengthMask[strength];
 217     contCE &= strengthMask[strength];
 218
 219     *prevCE = CE;
 220     *prevContCE = contCE;
 221
 222     while((*prevCE  & strengthMask[strength]) == CE
 223         && (*prevContCE  & strengthMask[strength])== contCE
 224         && iCE > 0) /* this condition should prevent falling off the edge of the world */
 225     {
 226         /* here, we end up in a singularity - zero */
 227         *prevCE = (*(CETable+3*(--iCE)));
 228         *prevContCE = (*(CETable+3*(iCE)+1));
 229     }
 230
 231     return iCE;
 232 }
 233
 234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 235                                                        uint32_t prevCE, uint32_t prevContCE)
 236 {
 237     if(prevCE == CE && prevContCE == contCE) {
 238         return UCOL_IDENTICAL;
 239     }
 240     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
 241         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
 242     {
 243         return UCOL_PRIMARY;
 244     }
 245     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
 246         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
 247     {
 248         return UCOL_SECONDARY;
 249     }
 250     return UCOL_TERTIARY;
 251 }
 252
 253
 254 /*static
 255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 256
 257     uint32_t CE = lh->baseCE;
 258     uint32_t SecondCE = lh->baseContCE;
 259
 260     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 261     uint32_t previousCE, previousContCE;
 262     int32_t iCE;
 263
 264     iCE = ucol_inv_findCE(src, CE, SecondCE);
 265
 266     if(iCE<0) {
 267         return -1;
 268     }
 269
 270     CE &= strengthMask[strength];
 271     SecondCE &= strengthMask[strength];
 272
 273     previousCE = CE;
 274     previousContCE = SecondCE;
 275
 276     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 277         previousCE = (*(CETable+3*(--iCE)));
 278         previousContCE = (*(CETable+3*(iCE)+1));
 279     }
 280     lh->previousCE = previousCE;
 281     lh->previousContCE = previousContCE;
 282
 283     return iCE;
 284 }*/
 285
 286 static
 287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 288     uint32_t CE = lh->baseCE;
 289     uint32_t SecondCE = lh->baseContCE;
 290
 291     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 292     uint32_t nextCE, nextContCE;
 293     int32_t iCE;
 294
 295     iCE = ucol_inv_findCE(src, CE, SecondCE);
 296
 297     if(iCE<0) {
 298         return -1;
 299     }
 300
 301     CE &= strengthMask[strength];
 302     SecondCE &= strengthMask[strength];
 303
 304     nextCE = CE;
 305     nextContCE = SecondCE;
 306
 307     while((nextCE  & strengthMask[strength]) == CE
 308         && (nextContCE  & strengthMask[strength]) == SecondCE)
 309     {
 310         nextCE = (*(CETable+3*(++iCE)));
 311         nextContCE = (*(CETable+3*(iCE)+1));
 312     }
 313
 314     lh->nextCE = nextCE;
 315     lh->nextContCE = nextContCE;
 316
 317     return iCE;
 318 }
 319
/*
 * Computes, for the token list lh, the low and high boundaries (gapsLo /
 * gapsHi) between which new weights are generated, together with the
 * per-strength bookkeeping arrays pos[], fStrToken[] and lStrToken[].
 *
 * gapsLo and gapsHi are treated as 3x3 arrays: row = strength of the gap,
 * column = primary / secondary / tertiary part, each part left-aligned in
 * a 32-bit word.
 *
 * Three cases are handled:
 *  - the base CE has a lead byte in the implicit range taken from the UCA
 *    constants: the gap runs from the base to the next implicit primary;
 *  - the list is indirect and lh->nextCE is non-zero: the gap runs from
 *    the base CE to lh->nextCE;
 *  - otherwise: the inverse UCA table is consulted per strength through
 *    ucol_inv_getNext().
 *
 * Sets *status to U_INTERNAL_PROGRAM_ERROR if ucol_inv_getNext() returns a
 * negative position.
 */
static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    /* reset all the gaps */
    int32_t i = 0;
    uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    uint32_t st = 0;
    uint32_t t1, t2;
    int32_t pos;

    UColToken *tok = lh->first;
    uint32_t tokStrength = tok->strength;

    for(i = 0; i<3; i++) {
        lh->gapsHi[3*i] = 0;
        lh->gapsHi[3*i+1] = 0;
        lh->gapsHi[3*i+2] = 0;
        lh->gapsLo[3*i] = 0;
        lh->gapsLo[3*i+1] = 0;
        lh->gapsLo[3*i+2] = 0;
        lh->numStr[i] = 0;
        lh->fStrToken[i] = NULL;
        lh->lStrToken[i] = NULL;
        lh->pos[i] = -1;
    }

    /* the UCA constants live inside the UCA image, at offset UCAConsts */
    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
        //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
        lh->pos[0] = 0;
        /* low boundary: the base CE itself, split into its three parts */
        t1 = lh->baseCE;
        t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        /* high boundary: the implicit primary whose raw value is one higher */
        uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
        primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);

        /* rebuild a lead/continuation pair from the new primary; the lead
         * gets 0x05 in both its secondary and tertiary byte */
        t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;

        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else if(lh->indirect == TRUE && lh->nextCE != 0) {
        //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
        /* indirect positioning: the gap is [baseCE, nextCE], both already in lh */
        lh->pos[0] = 0;
        t1 = lh->baseCE;
        t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        t1 = lh->nextCE;
        t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
        lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    } else {
        /* Walk the token list once; at each token record where the gap for
         * its strength ends in the inverse table and which tokens fall
         * into that gap. */
        for(;;) {
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
                    lh->fStrToken[tokStrength] = tok;
                } else { /* The CE must be implicit, since it's not in the table */
                    /* Error */
                    *status = U_INTERNAL_PROGRAM_ERROR;
                }
            }

            /* skip tokens of the same or weaker strength, remembering the
             * last token seen for this strength */
            while(tok != NULL && tok->strength >= tokStrength) {
                if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
                    lh->lStrToken[tokStrength] = tok;
                }
                tok = tok->next;
            }
            if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
                /* check if previous interval is the same and merge the intervals if it is so */
                if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
                    lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
                    lh->fStrToken[tokStrength+1] = NULL;
                    lh->lStrToken[tokStrength+1] = NULL;
                    lh->pos[tokStrength+1] = -1;
                }
            }
            if(tok != NULL) {
                tokStrength = tok->strength;
            } else {
                break;
            }
        }
        /* For every strength that got a position: the high boundary comes
         * from the table entry at that position, the low boundary from the
         * base CE. */
        for(st = 0; st < 3; st++) {
            if((pos = lh->pos[st]) >= 0) {
                t1 = *(CETable+3*(pos));
                t2 = *(CETable+3*(pos)+1);
                lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
                /* only the low 6 bits are kept for the tertiary part */
                lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
                //pos--;
                //t1 = *(CETable+3*(pos));
                //t2 = *(CETable+3*(pos)+1);
                t1 = lh->baseCE;
                t2 = lh->baseContCE;
                lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
                lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
                lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
            }
        }
    }
}
 428
 429
 430 #define ucol_countBytes(value, noOfBytes)   \
 431 {                               \
 432     uint32_t mask = 0xFFFFFFFF;   \
 433     (noOfBytes) = 0;              \
 434     while(mask != 0) {            \
 435     if(((value) & mask) != 0) { \
 436     (noOfBytes)++;            \
 437     }                           \
 438     mask >>= 8;                 \
 439     }                             \
 440 }
 441
 442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 443     if(U_SUCCESS(*status)) {
 444         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 445     }
 446     return g->current;
 447 }
 448
 449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 450     /* TODO: rename to enum names */
 451     uint32_t high, low, count=1;
 452     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 453
 454     if(strength == UCOL_SECONDARY) {
 455         low = UCOL_COMMON_TOP2<<24;
 456         high = 0xFFFFFFFF;
 457         count = 0xFF - UCOL_COMMON_TOP2;
 458     } else {
 459         low = UCOL_BYTE_COMMON << 24; //0x05000000;
 460         high = 0x40000000;
 461         count = 0x40 - UCOL_BYTE_COMMON;
 462     }
 463
 464     if(tok->next != NULL && tok->next->strength == strength) {
 465         count = tok->next->toInsert;
 466     }
 467
 468     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 469     g->current = UCOL_BYTE_COMMON<<24;
 470
 471     if(g->noOfRanges == 0) {
 472         *status = U_INTERNAL_PROGRAM_ERROR;
 473     }
 474     return g->current;
 475 }
 476
 477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 478     uint32_t strength = tok->strength;
 479     uint32_t low = lows[fStrength*3+strength];
 480     uint32_t high = highs[fStrength*3+strength];
 481     uint32_t maxByte = 0;
 482     if(strength == UCOL_TERTIARY) {
 483         maxByte = 0x3F;
 484     } else if(strength == UCOL_PRIMARY) {
 485         maxByte = 0xFE;
 486     } else {
 487         maxByte = 0xFF;
 488     }
 489
 490     uint32_t count = tok->toInsert;
 491
 492     if(low >= high && strength > UCOL_PRIMARY) {
 493         int32_t s = strength;
 494         for(;;) {
 495             s--;
 496             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 497                 if(strength == UCOL_SECONDARY) {
 498                     if (low < UCOL_COMMON_TOP2<<24 ) {
 499                        // Override if low range is less than UCOL_COMMON_TOP2.
 500                         low = UCOL_COMMON_TOP2<<24;
 501                     }
 502                     high = 0xFFFFFFFF;
 503                 } else {
 504                     // Override if low range is less than UCOL_COMMON_BOT3.
 505                     if ( low < UCOL_COMMON_BOT3<<24 ) {
 506                         low = UCOL_COMMON_BOT3<<24;
 507                     }
 508                     high = 0x40000000;
 509                 }
 510                 break;
 511             }
 512             if(s<0) {
 513                 *status = U_INTERNAL_PROGRAM_ERROR;
 514                 return 0;
 515             }
 516         }
 517     }
 518
 519     if(low < 0x02000000) {
 520         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
 521         // See http://site.icu-project.org/design/collation/bytes
 522         low = 0x02000000;
 523     }
 524
 525     if(strength == UCOL_SECONDARY) { /* similar as simple */
 526         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 527             low = UCOL_COMMON_TOP2<<24;
 528         }
 529         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 530             high = UCOL_COMMON_TOP2<<24;
 531         }
 532         if(low < (UCOL_COMMON_BOT2<<24)) {
 533             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 534             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 535             //g->current = UCOL_COMMON_BOT2<<24;
 536             return g->current;
 537         }
 538     }
 539
 540     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 541     if(g->noOfRanges == 0) {
 542         *status = U_INTERNAL_PROGRAM_ERROR;
 543     }
 544     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 545     return g->current;
 546 }
 547
 548 static
 549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 550     uint32_t i = 0;
 551     UChar c;
 552
 553     if(U_FAILURE(*status)) {
 554         return 0;
 555     }
 556
 557     if(sourceLen > resLen) {
 558         *status = U_MEMORY_ALLOCATION_ERROR;
 559         return 0;
 560     }
 561
 562     for(i = 0; i < sourceLen; i++) {
 563         c = source[i];
 564         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 565             switch(c - 0x3000) {
 566             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 567             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 568                 c++;
 569                 break;
 570             case 0xF5:
 571                 c = 0x30AB;
 572                 break;
 573             case 0xF6:
 574                 c = 0x30B1;
 575                 break;
 576             }
 577         }
 578         resBuf[i] = c;
 579     }
 580     return sourceLen;
 581 }
 582
 583 static
 584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 585     uint32_t i = 0;
 586     UChar c;
 587
 588     if(U_FAILURE(*status)) {
 589         return 0;
 590     }
 591
 592     if(sourceLen > resLen) {
 593         *status = U_MEMORY_ALLOCATION_ERROR;
 594         return 0;
 595     }
 596
 597     for(i = 0; i < sourceLen; i++) {
 598         c = source[i];
 599         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 600             switch(c - 0x3000) {
 601             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 602             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 603                 c--;
 604                 break;
 605             case 0xAB:
 606                 c = 0x30F5;
 607                 break;
 608             case 0xB1:
 609                 c = 0x30F6;
 610                 break;
 611             }
 612         }
 613         resBuf[i] = c;
 614     }
 615     return sourceLen;
 616 }
 617
 618 U_NAMESPACE_BEGIN
 619
 620 static
 621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 622     uint32_t i = 0;
 623     UChar n[128];
 624     uint32_t nLen = 0;
 625     uint32_t uCount = 0, lCount = 0;
 626
 627     collIterate s;
 628     uint32_t order = 0;
 629
 630     if(U_FAILURE(*status)) {
 631         return UCOL_LOWER_CASE;
 632     }
 633
 634     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 635     if(U_SUCCESS(*status)) {
 636         for(i = 0; i < nLen; i++) {
 637             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
 638             order = ucol_getNextCE(UCA, &s, status);
 639             if(isContinuation(order)) {
 640                 *status = U_INTERNAL_PROGRAM_ERROR;
 641                 return UCOL_LOWER_CASE;
 642             }
 643             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 644                 uCount++;
 645             } else {
 646                 if(u_islower(n[i])) {
 647                     lCount++;
 648                 } else if(U_SUCCESS(*status)) {
 649                     UChar sk[1], lk[1];
 650                     u_toSmallKana(&n[i], 1, sk, 1, status);
 651                     u_toLargeKana(&n[i], 1, lk, 1, status);
 652                     if(sk[0] == n[i] && lk[0] != n[i]) {
 653                         lCount++;
 654                     }
 655                 }
 656             }
 657         }
 658     }
 659
 660     if(uCount != 0 && lCount != 0) {
 661         return UCOL_MIXED_CASE;
 662     } else if(uCount != 0) {
 663         return UCOL_UPPER_CASE;
 664     } else {
 665         return UCOL_LOWER_CASE;
 666     }
 667 }
 668
 669
/*
 * Packs the three weight parts in CEparts (primary, secondary, tertiary,
 * each left-aligned in a 32-bit word) into the CE array of tok, then sets
 * the case bits of the first CE.
 *
 * src      parser; supplies the rule text (src->source) and the UCA collator
 * CEparts  array of three weight words
 * tok      token whose CEs[] and noOfCEs are written
 * status   passed on to the case-bit helpers
 */
U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    /* this one makes the table and stuff */
    uint32_t noOfBytes[3];
    uint32_t i;

    /* byte count of each weight part */
    for(i = 0; i<3; i++) {
        ucol_countBytes(CEparts[i], noOfBytes[i]);
    }

    /* Here we have to pack CEs from parts */

    uint32_t CEi = 0;
    uint32_t value = 0;

    /* Each output CE takes two primary bytes, one secondary byte and one
     * tertiary byte; keep emitting CEs until every part is used up. */
    while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
        if(CEi > 0) {
            value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
        } else {
            value = 0;
        }

        if(2*CEi<noOfBytes[0]) {
            value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
        }
        if(CEi<noOfBytes[1]) {
            value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
        }
        if(CEi<noOfBytes[2]) {
            /* only the low 6 bits are used for the tertiary weight */
            value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
        }
        tok->CEs[CEi] = value;
        CEi++;
    }
    if(CEi == 0) { /* totally ignorable */
        tok->noOfCEs = 1;
        tok->CEs[0] = 0;
    } else { /* there is at least something */
        tok->noOfCEs = CEi;
    }


    // we want to set case bits here and now, not later.
    // Case bits handling
    if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
        tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
        /* tok->source packs the length in its top byte and the offset into
         * src->source in its low 24 bits */
        int32_t cSize = (tok->source & 0xFF000000) >> 24;
        UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;

        if(cSize > 1) {
            // Do it manually
            tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
        } else {
            // Copy it from the UCA
            uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
            tok->CEs[0] |= (caseCE & 0xC0);
        }
    }

#if UCOL_DEBUG==2
    fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    for(i = 0; i<tok->noOfCEs; i++) {
        fprintf(stderr, "%08X ", tok->CEs[i]);
    }
    fprintf(stderr, "\n");
#endif
}
 736
/*
 * Assigns collation elements to every token of the list lh.
 *
 * Works in three steps:
 *  1. walks the list backwards from lh->last and stores a count in each
 *     token's toInsert field;
 *  2. calls ucol_inv_getGapPositions() to compute the gaps for the list;
 *  3. walks the list forwards from lh->first, producing the primary,
 *     secondary and tertiary parts for each token with the weight
 *     generators and packing them with ucol_doCE().
 *
 * The forward walk stops as soon as *status signals a failure. *status is
 * set to U_INTERNAL_PROGRAM_ERROR here if no gap position can be found
 * for a token's strength or any stronger one.
 */
U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

    UColToken *tok = lh->last;
    /* running per-strength counters used to fill in toInsert */
    uint32_t t[UCOL_STRENGTH_LIMIT];

    uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

    tok->toInsert = 1;
    t[tok->strength] = 1;

    /* step 1: backward pass over the token list */
    while(tok->previous != NULL) {
        if(tok->previous->strength < tok->strength) { /* going up */
            t[tok->strength] = 0;
            t[tok->previous->strength]++;
        } else if(tok->previous->strength > tok->strength) { /* going down */
            t[tok->previous->strength] = 1;
        } else {
            t[tok->strength]++;
        }
        tok=tok->previous;
        tok->toInsert = t[tok->strength];
    }

    tok->toInsert = t[tok->strength];
    /* step 2: compute the gaps */
    ucol_inv_getGapPositions(src, lh, status);

#if UCOL_DEBUG
    fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    int32_t j = 2;
    for(j = 2; j >= 0; j--) {
        fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
        fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    }
    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->strength);
        tok = tok->next;
    } while(tok != NULL);
    fprintf(stderr, "\n");

    tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];

    do {
        fprintf(stderr,"%i", tok->toInsert);
        tok = tok->next;
    } while(tok != NULL);
#endif

    /* step 3: forward pass, generating the CEs */
    tok = lh->first;
    uint32_t fStrength = UCOL_IDENTICAL;
    /* strongest strength seen so far in the forward pass */
    uint32_t initStrength = UCOL_IDENTICAL;


    /* start from the parts of the base CE */
    CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;

    while (tok != NULL && U_SUCCESS(*status)) {
        fStrength = tok->strength;
        if(fStrength < initStrength) {
            /* first token of a strength stronger than anything seen so far:
             * set up the generators from the gap */
            initStrength = fStrength;
            if(lh->pos[fStrength] == -1) {
                /* no gap recorded for this strength: fall back to the
                 * nearest stronger strength that has one */
                while(lh->pos[fStrength] == -1 && fStrength > 0) {
                    fStrength--;
                }
                if(lh->pos[fStrength] == -1) {
                    *status = U_INTERNAL_PROGRAM_ERROR;
                    return;
                }
            }
            if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
                /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
                CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
            } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
                CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
                /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
                CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else { /* primaries */
                /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
                CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        } else {
            /* otherwise: take the next weight at the token's own strength
             * and restart the generators of all weaker levels */
            if(tok->strength == UCOL_TERTIARY) {
                CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
            } else if(tok->strength == UCOL_SECONDARY) {
                CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            } else if(tok->strength == UCOL_PRIMARY) {
                CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
                CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
                CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
            }
        }
        ucol_doCE(src, CEparts, tok, status);
        tok = tok->next;
    }
}
 842
 843 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
 844     UCAElements el;
 845     UColToken *tok = lh->first;
 846     UColToken *expt = NULL;
 847     uint32_t i = 0, j = 0;
 848     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
 849
 850     while(tok != NULL && U_SUCCESS(*status)) {
 851         /* first, check if there are any expansions */
 852         /* if there are expansions, we need to do a little bit more processing */
 853         /* since parts of expansion can be tailored, while others are not */
 854         if(tok->expansion != 0) {
 855             uint32_t len = tok->expansion >> 24;
 856             uint32_t currentSequenceLen = len;
 857             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 858             //uint32_t exp = currentSequenceLen | expOffset;
 859             UColToken exp;
 860             exp.source = currentSequenceLen | expOffset;
 861             exp.rulesToParseHdl = &(src->source);
 862
 863             while(len > 0) {
 864                 currentSequenceLen = len;
 865                 while(currentSequenceLen > 0) {
 866                     exp.source = (currentSequenceLen << 24) | expOffset;
 867                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 868                         uint32_t noOfCEsToCopy = expt->noOfCEs;
 869                         for(j = 0; j<noOfCEsToCopy; j++) {
 870                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 871                         }
 872                         tok->noOfExpCEs += noOfCEsToCopy;
 873                         // Smart people never try to add codepoints and CEs.
 874                         // For some odd reason, it won't work.
 875                         expOffset += currentSequenceLen; //noOfCEsToCopy;
 876                         len -= currentSequenceLen; //noOfCEsToCopy;
 877                         break;
 878                     } else {
 879                         currentSequenceLen--;
 880                     }
 881                 }
 882                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 883                     /* will have to get one from UCA */
 884                     /* first, get the UChars from the rules */
 885                     /* then pick CEs out until there is no more and stuff them into expansion */
 886                     collIterate s;
 887                     uint32_t order = 0;
 888                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
 889
 890                     for(;;) {
 891                         order = ucol_getNextCE(src->UCA, &s, status);
 892                         if(order == UCOL_NO_MORE_CES) {
 893                             break;
 894                         }
 895                         tok->expCEs[tok->noOfExpCEs++] = order;
 896                     }
 897                     expOffset++;
 898                     len--;
 899                 }
 900             }
 901         } else {
 902             tok->noOfExpCEs = 0;
 903         }
 904
 905         /* set the ucaelement with obtained values */
 906         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 907         /* copy CEs */
 908         for(i = 0; i<tok->noOfCEs; i++) {
 909             el.CEs[i] = tok->CEs[i];
 910         }
 911         for(i = 0; i<tok->noOfExpCEs; i++) {
 912             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 913         }
 914
 915         /* copy UChars */
 916         // We kept prefix and source kind of together, as it is a kind of a contraction.
 917         // However, now we have to slice the prefix off the main thing -
 918         el.prefix = el.prefixChars;
 919         el.cPoints = el.uchars;
 920         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 921             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 922             // decomposed elements to the unsaf table.
 923             el.prefixSize = tok->prefix>>24;
 924             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 925
 926             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 927             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 928         } else {
 929             el.prefixSize = 0;
 930             *el.prefix = 0;
 931
 932             el.cSize = (tok->source >> 24);
 933             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 934         }
 935         if(src->UCA != NULL) {
 936             for(i = 0; i<el.cSize; i++) {
 937                 if(UCOL_ISJAMO(el.cPoints[i])) {
 938                     t->image->jamoSpecial = TRUE;
 939                 }
 940             }
 941             if (!src->buildCCTabFlag && el.cSize > 0) {
 942                 // Check the trailing canonical combining class (tccc) of the last character.
 943                 const UChar *s = el.cPoints + el.cSize;
 944                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
 945                 if ((fcd & 0xff) != 0) {
 946                     src->buildCCTabFlag = TRUE;
 947                 }
 948             }
 949         }
 950
 951         /* and then, add it */
 952 #if UCOL_DEBUG==2
 953         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 954 #endif
 955         uprv_uca_addAnElement(t, &el, status);
 956
 957 #if UCOL_DEBUG_DUPLICATES
 958         if(*status != U_ZERO_ERROR) {
 959             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 960             *status = U_ZERO_ERROR;
 961         }
 962 #endif
 963
 964         tok = tok->next;
 965     }
 966 }
 967
 968 U_CDECL_BEGIN
 969 static UBool U_CALLCONV
 970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 971     UErrorCode status = U_ZERO_ERROR;
 972     tempUCATable *t = (tempUCATable *)context;
 973     if(value == 0) {
 974         while(start < limit) {
 975             uint32_t CE = utrie_get32(t->mapping, start, NULL);
 976             if(CE == UCOL_NOT_FOUND) {
 977                 UCAElements el;
 978                 el.isThai = FALSE;
 979                 el.prefixSize = 0;
 980                 el.prefixChars[0] = 0;
 981                 el.prefix = el.prefixChars;
 982                 el.cPoints = el.uchars;
 983
 984                 el.cSize = 0;
 985                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
 986
 987                 el.noOfCEs = 1;
 988                 el.CEs[0] = 0;
 989                 uprv_uca_addAnElement(t, &el, &status);
 990
 991             }
 992             start++;
 993         }
 994     }
 995     if(U_FAILURE(status)) {
 996         return FALSE;
 997     } else {
 998         return TRUE;
 999     }
1000 }
1001 U_CDECL_END
1002
1003 static void
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1005                                UChar32 start, UChar32 end,
1006                                UErrorCode *status)
1007 {
1008     //UChar decomp[256];
1009     uint32_t CE = UCOL_NOT_FOUND;
1010     UChar32 u = 0;
1011     UCAElements el;
1012     el.isThai = FALSE;
1013     el.prefixSize = 0;
1014     el.prefixChars[0] = 0;
1015     collIterate colIt;
1016
1017     if(U_SUCCESS(*status)) {
1018         for(u = start; u<=end; u++) {
1019             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1020                 /* this test is for contractions that are missing the starting element. */
1021                 || ((isCntTableElement(CE)) &&
1022                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1023                 )
1024             {
1025                 el.cSize = 0;
1026                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1027                 //decomp[0] = (UChar)u;
1028                 //el.uchars[0] = (UChar)u;
1029                 el.cPoints = el.uchars;
1030                 //el.cSize = 1;
1031                 el.noOfCEs = 0;
1032                 el.prefix = el.prefixChars;
1033                 el.prefixSize = 0;
1034                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035                 // We actually want to check whether this element is a special
1036                 // If it is an implicit element (hangul, CJK - we want to copy the
1037                 // special, not the resolved CEs) - for hangul, copying resolved
1038                 // would just make things the same (there is an expansion and it
1039                 // takes approximately the same amount of time to resolve as
1040                 // falling back to the UCA).
1041                 /*
1042                 UTRIE_GET32(src->UCA->mapping, u, CE);
1043                 tag = getCETag(CE);
1044                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046                 || tag == LEAD_SURROGATE_TAG) {
1047                 el.CEs[el.noOfCEs++] = CE;
1048                 } else {
1049                 */
1050                 // It turns out that it does not make sense to keep implicits
1051                 // unresolved. The cost of resolving them is big enough so that
1052                 // it doesn't make any difference whether we have to go to the UCA
1053                 // or not.
1054                 {
1055                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
1056                     while(CE != UCOL_NO_MORE_CES) {
1057                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1058                         if(CE != UCOL_NO_MORE_CES) {
1059                             el.CEs[el.noOfCEs++] = CE;
1060                         }
1061                     }
1062                 }
1063                 uprv_uca_addAnElement(t, &el, status);
1064             }
1065         }
1066     }
1067 }
1068
1069 U_NAMESPACE_END
1070
1071 U_CFUNC UCATableHeader *
1072 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1073     U_NAMESPACE_USE
1074
1075     uint32_t i = 0;
1076     if(U_FAILURE(*status)) {
1077         return NULL;
1078     }
1079     /*
1080     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1081     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082     create new ListHeader X
1083     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1084     first item you add, based on the stronger strength levels of the two lists.
1085     */
1086     /*
1087     3.  For each ListHeader with a non-null positive list:
1088     */
1089     /*
1090     o   Find all character strings with CEs between the baseCE and the
1091     next/previous CE, at the strength of the first token. Add these to the
1092     tailoring.
1093     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1094     tailoring has & x < z...
1095     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1096     */
1097     /* It is possible that this part should be done even while constructing list */
1098     /* The problem is that it is unknown what is going to be the strongest weight */
1099     /* So we might as well do it here */
1100
1101     /*
1102     o   Allocate CEs for each token in the list, based on the total number N of the
1103     largest level difference, and the gap G between baseCE and nextCE at that
1104     level. The relation * between the last item and nextCE is the same as the
1105     strongest strength.
1106     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108     Then fit b and c into the secondary gap between a and d, then fit q
1109     into the tertiary gap between b and c.
1110
1111     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1112     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113     Then fit q into the tertiary gap between b and c.
1114     o   When incrementing primary values, we will not cross high byte
1115     boundaries except where there is only a single-byte primary. That is to
1116     ensure that the script reordering will continue to work.
1117     */
1118     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1119     /* test for NULL */
1120     if (image == NULL) {
1121         *status = U_MEMORY_ALLOCATION_ERROR;
1122         return NULL;
1123     }
1124     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1125
1126     for(i = 0; i<src->resultLen; i++) {
1127         /* now we need to generate the CEs */
1128         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129         /* According to strength                                                          */
1130         if(U_SUCCESS(*status)) {
1131             if(src->lh[i].first) { // if there are any elements
1132                 // due to the way parser works, subsequent tailorings
1133                 // may remove all the elements from a sequence, therefore
1134                 // leaving an empty tailoring sequence.
1135                 ucol_initBuffers(src, &src->lh[i], status);
1136             }
1137         }
1138         if(U_FAILURE(*status)) {
1139             uprv_free(image);
1140             return NULL;
1141         }
1142     }
1143
1144     if(src->varTop != NULL) { /* stuff the variable top value */
1145         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1146         /* remove it from the list */
1147         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1148             src->varTop->listHeader->first = src->varTop->next;
1149         }
1150         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1151             src->varTop->listHeader->last = src->varTop->previous;
1152         }
1153         if(src->varTop->next != NULL) {
1154             src->varTop->next->previous = src->varTop->previous;
1155         }
1156         if(src->varTop->previous != NULL) {
1157             src->varTop->previous->next = src->varTop->next;
1158         }
1159     }
1160
1161
1162     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1163     if(U_FAILURE(*status)) {
1164         uprv_free(image);
1165         return NULL;
1166     }
1167
1168
1169     /* After this, we have assigned CE values to all regular CEs      */
1170     /* now we will go through list once more and resolve expansions,  */
1171     /* make UCAElements structs and add them to table                 */
1172     for(i = 0; i<src->resultLen; i++) {
1173         /* now we need to generate the CEs */
1174         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175         /* According to strength                                                          */
1176         if(U_SUCCESS(*status)) {
1177             ucol_createElements(src, t, &src->lh[i], status);
1178         }
1179     }
1180
1181     UCAElements el;
1182     el.isThai = FALSE;
1183     el.prefixSize = 0;
1184     el.prefixChars[0] = 0;
1185
1186     /* add latin-1 stuff */
1187     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1188
1189     /* add stuff for copying */
1190     if(src->copySet != NULL) {
1191         int32_t i = 0;
1192         UnicodeSet *set = (UnicodeSet *)src->copySet;
1193         for(i = 0; i < set->getRangeCount(); i++) {
1194             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1195         }
1196     }
1197
1198     if(U_SUCCESS(*status)) {
1199         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1200
1201         uint32_t tailoredCE = UCOL_NOT_FOUND;
1202         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1203         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
1204         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1205         // Check for null pointer
1206         if (ucaEl == NULL) {
1207             *status = U_MEMORY_ALLOCATION_ERROR;
1208             return NULL;
1209         }
1210         while(*conts != 0) {
1211             // A continuation is NUL-terminated and NUL-padded
1212             // except if it has the maximum length.
1213             int32_t contractionLength = maxUCAContractionLength;
1214             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
1215                 --contractionLength;
1216             }
1217             UChar32 first;
1218             int32_t firstLength = 0;
1219             U16_NEXT(conts, firstLength, contractionLength, first);
1220             tailoredCE = utrie_get32(t->mapping, first, NULL);
1221             if(tailoredCE != UCOL_NOT_FOUND) {
1222                 UBool needToAdd = TRUE;
1223                 if(isCntTableElement(tailoredCE)) {
1224                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
1225                         needToAdd = FALSE;
1226                     }
1227                 }
1228                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1229                     UCAElements elm;
1230                     elm.cPoints = el.uchars;
1231                     elm.noOfCEs = 0;
1232                     elm.uchars[0] = *conts;
1233                     elm.uchars[1] = 0;
1234                     elm.cSize = 1;
1235                     elm.prefixChars[0] = *(conts+2);
1236                     elm.isThai = FALSE;
1237                     elm.prefix = elm.prefixChars;
1238                     elm.prefixSize = 1;
1239                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1240                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1241                         needToAdd = TRUE;
1242                     }
1243                 }
1244                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1245                     needToAdd = FALSE;
1246                 }
1247
1248                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1249                     if (*(conts+1) != 0) {  // contractions
1250                         el.prefix = el.prefixChars;
1251                         el.prefixSize = 0;
1252                         el.cPoints = el.uchars;
1253                         el.noOfCEs = 0;
1254                         u_memcpy(el.uchars, conts, contractionLength);
1255                         el.cSize = contractionLength;
1256                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1257                     }
1258                     else { // pre-context character
1259                         UChar str[4] = { 0 };
1260                         int32_t len=0;
1261                         int32_t preKeyLen=0;
1262
1263                         el.cPoints = el.uchars;
1264                         el.noOfCEs = 0;
1265                         el.uchars[0] = *conts;
1266                         el.uchars[1] = 0;
1267                         el.cSize = 1;
1268                         el.prefixChars[0] = *(conts+2);
1269                         el.prefix = el.prefixChars;
1270                         el.prefixSize = 1;
1271                         if (el.prefixChars[0]!=0) {
1272                             // get CE of prefix character first
1273                             str[0]=el.prefixChars[0];
1274                             str[1]=0;
1275                             ucol_setText(ucaEl, str, 1, status);
1276                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1277                                     != UCOL_NULLORDER) {
1278                                 preKeyLen++;  // count number of keys for prefix character
1279                             }
1280                             str[len++] = el.prefixChars[0];
1281                         }
1282
1283                         str[len++] = el.uchars[0];
1284                         str[len]=0;
1285                         ucol_setText(ucaEl, str, len, status);
1286                         // Skip the keys for prefix character, then copy the rest to el.
1287                         while ((preKeyLen-->0) &&
1288                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1289                             continue;
1290                         }
1291
1292                     }
1293                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1294                         el.noOfCEs++;
1295                     }
1296                     uprv_uca_addAnElement(t, &el, status);
1297                 }
1298
1299             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
1300                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
1301             }
1302             conts+=maxUCAContractionLength;
1303         }
1304         ucol_closeElements(ucaEl);
1305     }
1306
1307     // Add completely ignorable elements
1308     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1309
1310     // add tailoring characters related canonical closures
1311     uprv_uca_canonicalClosure(t, src, NULL, status);
1312
1313     /* still need to produce compatibility closure */
1314
1315     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1316
1317     uprv_uca_closeTempTable(t);
1318     uprv_free(image);
1319
1320     return myData;
1321 }
1322
1323 U_CDECL_BEGIN
1324 static UBool U_CALLCONV
1325 ucol_bld_cleanup(void)
1326 {
1327     udata_close(invUCA_DATA_MEM);
1328     invUCA_DATA_MEM = NULL;
1329     _staticInvUCA = NULL;
1330     return TRUE;
1331 }
1332 U_CDECL_END
1333
1334 U_CAPI const InverseUCATableHeader * U_EXPORT2
1335 ucol_initInverseUCA(UErrorCode *status)
1336 {
1337     if(U_FAILURE(*status)) return NULL;
1338
1339     UBool needsInit;
1340     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1341
1342     if(needsInit) {
1343         InverseUCATableHeader *newInvUCA = NULL;
1344         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1345
1346         if(U_FAILURE(*status)) {
1347             if (result) {
1348                 udata_close(result);
1349             }
1350             // This is not needed, as we are talking about
1351             // memory we got from UData
1352             //uprv_free(newInvUCA);
1353         }
1354
1355         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1356             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1357             UCollator *UCA = ucol_initUCA(status);
1358             // UCA versions of UCA and inverse UCA should match
1359             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1360                 *status = U_INVALID_FORMAT_ERROR;
1361                 udata_close(result);
1362                 return NULL;
1363             }
1364
1365             umtx_lock(NULL);
1366             if(_staticInvUCA == NULL) {
1367                 invUCA_DATA_MEM = result;
1368                 _staticInvUCA = newInvUCA;
1369                 result = NULL;
1370                 newInvUCA = NULL;
1371             }
1372             umtx_unlock(NULL);
1373
1374             if(newInvUCA != NULL) {
1375                 udata_close(result);
1376                 // This is not needed, as we are talking about
1377                 // memory we got from UData
1378                 //uprv_free(newInvUCA);
1379             }
1380             else {
1381                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1382             }
1383         }
1384     }
1385     return _staticInvUCA;
1386 }
1387
/* Names of the non-script reordering codes, terminated by NULL. The entries
 * _must_ stay in the order in which they are applied as defaults and in sync
 * with the UColReorderCode enum: ucol_findReorderingEntry() maps index n to
 * UCOL_REORDER_CODE_FIRST + n.
 */
static const char* const ReorderingTokenNames[] = {
    "SPACE",
    "PUNCT",
    "SYMBOL",
    "CURRENCY",
    "DIGIT",
    NULL
};
1399
1400 static void toUpper(const char* src, char* dst, uint32_t length) {
1401    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
1402        *dst = uprv_toupper(*src);
1403    }
1404    *dst = '\0';
1405 }
1406
1407 U_INTERNAL int32_t U_EXPORT2
1408 ucol_findReorderingEntry(const char* name) {
1409     char buffer[32];
1410     toUpper(name, buffer, 32);
1411     for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
1412         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
1413             return entry + UCOL_REORDER_CODE_FIRST;
1414         }
1415     }
1416     return USCRIPT_INVALID_CODE;
1417 }
1418
1419 #endif /* #if !UCONFIG_NO_COLLATION */