icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/udata.h"
  26 #include "unicode/uchar.h"
  27 #include "unicode/uniset.h"
  28 #include "ucol_bld.h"
  29 #include "ucol_elm.h"
  30 #include "ucol_cnt.h"
  31 #include "ucln_in.h"
  32 #include "umutex.h"
  33 #include "unormimp.h"
  34 #include "cmemory.h"
  35
  36 static const InverseUCATableHeader* _staticInvUCA = NULL;
  37 static UDataMemory* invUCA_DATA_MEM = NULL;
  38
  39 U_CDECL_BEGIN
  40 static UBool U_CALLCONV
  41 isAcceptableInvUCA(void * /*context*/,
  42                    const char * /*type*/, const char * /*name*/,
  43                    const UDataInfo *pInfo)
  44 {
  45     /* context, type & name are intentionally not used */
  46     if( pInfo->size>=20 &&
  47         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  48         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  49         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  50         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  51         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  52         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  53         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  54         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  55         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  56         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  57         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  58         )
  59     {
  60         UVersionInfo UCDVersion;
  61         u_getUnicodeVersion(UCDVersion);
  62         return (pInfo->dataVersion[0]==UCDVersion[0] &&
  63             pInfo->dataVersion[1]==UCDVersion[1]);
  64             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  65             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  66             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  67     } else {
  68         return FALSE;
  69     }
  70 }
  71 U_CDECL_END
  72
  73 /*
  74 * Takes two CEs (lead and continuation) and
  75 * compares them as CEs should be compared:
  76 * primary vs. primary, secondary vs. secondary
  77 * tertiary vs. tertiary
  78 */
  79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
  80     uint32_t s1 = source0, s2, t1 = target0, t2;
  81     if(isContinuation(source1)) {
  82         s2 = source1;
  83     } else {
  84         s2 = 0;
  85     }
  86     if(isContinuation(target1)) {
  87         t2 = target1;
  88     } else {
  89         t2 = 0;
  90     }
  91
  92     uint32_t s = 0, t = 0;
  93     if(s1 == t1 && s2 == t2) {
  94         return 0;
  95     }
  96     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
  97     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
  98     if(s < t) {
  99         return -1;
 100     } else if(s > t) {
 101         return 1;
 102     } else {
 103         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
 104         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
 105         if(s < t) {
 106             return -1;
 107         } else if(s > t) {
 108             return 1;
 109         } else {
 110             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
 111             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
 112             if(s < t) {
 113                 return -1;
 114             } else {
 115                 return 1;
 116             }
 117         }
 118     }
 119 }
 120
 121 static
 122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
 123     uint32_t bottom = 0, top = src->invUCA->tableSize;
 124     uint32_t i = 0;
 125     uint32_t first = 0, second = 0;
 126     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 127     int32_t res = 0;
 128
 129     while(bottom < top-1) {
 130         i = (top+bottom)/2;
 131         first = *(CETable+3*i);
 132         second = *(CETable+3*i+1);
 133         res = compareCEs(first, second, CE, SecondCE);
 134         if(res > 0) {
 135             top = i;
 136         } else if(res < 0) {
 137             bottom = i;
 138         } else {
 139             break;
 140         }
 141     }
 142
 143     /* weiv:                                                  */
 144     /* in searching for elements, I have removed the failure  */
 145     /* The reason for this is that the builder does not rely  */
 146     /* on search mechanism telling it that it didn't find an  */
 147     /* element. However, indirect positioning relies on being */
 148     /* able to find the elements around any CE, even if it is */
 149     /* not defined in the UCA. */
 150     return i;
 151     /*
 152     if((first == CE && second == SecondCE)) {
 153     return i;
 154     } else {
 155     return -1;
 156     }
 157     */
 158 }
 159
 160 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
 161     0xFFFF0000,
 162     0xFFFFFF00,
 163     0xFFFFFFFF
 164 };
 165
 166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 167                                             uint32_t CE, uint32_t contCE,
 168                                             uint32_t *nextCE, uint32_t *nextContCE,
 169                                             uint32_t strength)
 170 {
 171     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 172     int32_t iCE;
 173
 174     iCE = ucol_inv_findCE(src, CE, contCE);
 175
 176     if(iCE<0) {
 177         *nextCE = UCOL_NOT_FOUND;
 178         return -1;
 179     }
 180
 181     CE &= strengthMask[strength];
 182     contCE &= strengthMask[strength];
 183
 184     *nextCE = CE;
 185     *nextContCE = contCE;
 186
 187     while((*nextCE  & strengthMask[strength]) == CE
 188         && (*nextContCE  & strengthMask[strength]) == contCE)
 189     {
 190         *nextCE = (*(CETable+3*(++iCE)));
 191         *nextContCE = (*(CETable+3*(iCE)+1));
 192     }
 193
 194     return iCE;
 195 }
 196
 197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 198                                             uint32_t CE, uint32_t contCE,
 199                                             uint32_t *prevCE, uint32_t *prevContCE,
 200                                             uint32_t strength)
 201 {
 202     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 203     int32_t iCE;
 204
 205     iCE = ucol_inv_findCE(src, CE, contCE);
 206
 207     if(iCE<0) {
 208         *prevCE = UCOL_NOT_FOUND;
 209         return -1;
 210     }
 211
 212     CE &= strengthMask[strength];
 213     contCE &= strengthMask[strength];
 214
 215     *prevCE = CE;
 216     *prevContCE = contCE;
 217
 218     while((*prevCE  & strengthMask[strength]) == CE
 219         && (*prevContCE  & strengthMask[strength])== contCE
 220         && iCE > 0) /* this condition should prevent falling off the edge of the world */
 221     {
 222         /* here, we end up in a singularity - zero */
 223         *prevCE = (*(CETable+3*(--iCE)));
 224         *prevContCE = (*(CETable+3*(iCE)+1));
 225     }
 226
 227     return iCE;
 228 }
 229
 230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 231                                                        uint32_t prevCE, uint32_t prevContCE)
 232 {
 233     if(prevCE == CE && prevContCE == contCE) {
 234         return UCOL_IDENTICAL;
 235     }
 236     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
 237         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
 238     {
 239         return UCOL_PRIMARY;
 240     }
 241     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
 242         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
 243     {
 244         return UCOL_SECONDARY;
 245     }
 246     return UCOL_TERTIARY;
 247 }
 248
 249
 250 /*static
 251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 252
 253     uint32_t CE = lh->baseCE;
 254     uint32_t SecondCE = lh->baseContCE;
 255
 256     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 257     uint32_t previousCE, previousContCE;
 258     int32_t iCE;
 259
 260     iCE = ucol_inv_findCE(src, CE, SecondCE);
 261
 262     if(iCE<0) {
 263         return -1;
 264     }
 265
 266     CE &= strengthMask[strength];
 267     SecondCE &= strengthMask[strength];
 268
 269     previousCE = CE;
 270     previousContCE = SecondCE;
 271
 272     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 273         previousCE = (*(CETable+3*(--iCE)));
 274         previousContCE = (*(CETable+3*(iCE)+1));
 275     }
 276     lh->previousCE = previousCE;
 277     lh->previousContCE = previousContCE;
 278
 279     return iCE;
 280 }*/
 281
 282 static
 283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 284     uint32_t CE = lh->baseCE;
 285     uint32_t SecondCE = lh->baseContCE;
 286
 287     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 288     uint32_t nextCE, nextContCE;
 289     int32_t iCE;
 290
 291     iCE = ucol_inv_findCE(src, CE, SecondCE);
 292
 293     if(iCE<0) {
 294         return -1;
 295     }
 296
 297     CE &= strengthMask[strength];
 298     SecondCE &= strengthMask[strength];
 299
 300     nextCE = CE;
 301     nextContCE = SecondCE;
 302
 303     while((nextCE  & strengthMask[strength]) == CE
 304         && (nextContCE  & strengthMask[strength]) == SecondCE)
 305     {
 306         nextCE = (*(CETable+3*(++iCE)));
 307         nextContCE = (*(CETable+3*(iCE)+1));
 308     }
 309
 310     lh->nextCE = nextCE;
 311     lh->nextContCE = nextContCE;
 312
 313     return iCE;
 314 }
 315
 316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 317     /* reset all the gaps */
 318     int32_t i = 0;
 319     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 320     uint32_t st = 0;
 321     uint32_t t1, t2;
 322     int32_t pos;
 323
 324     UColToken *tok = lh->first;
 325     uint32_t tokStrength = tok->strength;
 326
 327     for(i = 0; i<3; i++) {
 328         lh->gapsHi[3*i] = 0;
 329         lh->gapsHi[3*i+1] = 0;
 330         lh->gapsHi[3*i+2] = 0;
 331         lh->gapsLo[3*i] = 0;
 332         lh->gapsLo[3*i+1] = 0;
 333         lh->gapsLo[3*i+2] = 0;
 334         lh->numStr[i] = 0;
 335         lh->fStrToken[i] = NULL;
 336         lh->lStrToken[i] = NULL;
 337         lh->pos[i] = -1;
 338     }
 339
 340     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
 341
 342     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
 343         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
 344         lh->pos[0] = 0;
 345         t1 = lh->baseCE;
 346         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
 347         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 348         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 349         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 350         uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
 351         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
 352
 353         t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
 354         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
 355
 356         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 357         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 358         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 359     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
 360         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
 361         lh->pos[0] = 0;
 362         t1 = lh->baseCE;
 363         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
 364         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 365         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 366         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 367         t1 = lh->nextCE;
 368         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
 369         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 370         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 371         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 372     } else {
 373         for(;;) {
 374             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 375                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
 376                     lh->fStrToken[tokStrength] = tok;
 377                 } else { /* The CE must be implicit, since it's not in the table */
 378                     /* Error */
 379                     *status = U_INTERNAL_PROGRAM_ERROR;
 380                 }
 381             }
 382
 383             while(tok != NULL && tok->strength >= tokStrength) {
 384                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 385                     lh->lStrToken[tokStrength] = tok;
 386                 }
 387                 tok = tok->next;
 388             }
 389             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
 390                 /* check if previous interval is the same and merge the intervals if it is so */
 391                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
 392                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
 393                     lh->fStrToken[tokStrength+1] = NULL;
 394                     lh->lStrToken[tokStrength+1] = NULL;
 395                     lh->pos[tokStrength+1] = -1;
 396                 }
 397             }
 398             if(tok != NULL) {
 399                 tokStrength = tok->strength;
 400             } else {
 401                 break;
 402             }
 403         }
 404         for(st = 0; st < 3; st++) {
 405             if((pos = lh->pos[st]) >= 0) {
 406                 t1 = *(CETable+3*(pos));
 407                 t2 = *(CETable+3*(pos)+1);
 408                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 409                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 410                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 411                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 412                 //pos--;
 413                 //t1 = *(CETable+3*(pos));
 414                 //t2 = *(CETable+3*(pos)+1);
 415                 t1 = lh->baseCE;
 416                 t2 = lh->baseContCE;
 417                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 418                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 419                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 420             }
 421         }
 422     }
 423 }
 424
 425
 426 #define ucol_countBytes(value, noOfBytes)   \
 427 {                               \
 428     uint32_t mask = 0xFFFFFFFF;   \
 429     (noOfBytes) = 0;              \
 430     while(mask != 0) {            \
 431     if(((value) & mask) != 0) { \
 432     (noOfBytes)++;            \
 433     }                           \
 434     mask >>= 8;                 \
 435     }                             \
 436 }
 437
 438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 439     if(U_SUCCESS(*status)) {
 440         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 441     }
 442     return g->current;
 443 }
 444
 445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 446     /* TODO: rename to enum names */
 447     uint32_t high, low, count=1;
 448     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 449
 450     if(strength == UCOL_SECONDARY) {
 451         low = UCOL_COMMON_TOP2<<24;
 452         high = 0xFFFFFFFF;
 453         count = 0xFF - UCOL_COMMON_TOP2;
 454     } else {
 455         low = UCOL_BYTE_COMMON << 24; //0x05000000;
 456         high = 0x40000000;
 457         count = 0x40 - UCOL_BYTE_COMMON;
 458     }
 459
 460     if(tok->next != NULL && tok->next->strength == strength) {
 461         count = tok->next->toInsert;
 462     }
 463
 464     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 465     g->current = UCOL_BYTE_COMMON<<24;
 466
 467     if(g->noOfRanges == 0) {
 468         *status = U_INTERNAL_PROGRAM_ERROR;
 469     }
 470     return g->current;
 471 }
 472
 473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 474     uint32_t strength = tok->strength;
 475     uint32_t low = lows[fStrength*3+strength];
 476     uint32_t high = highs[fStrength*3+strength];
 477     uint32_t maxByte = 0;
 478     if(strength == UCOL_TERTIARY) {
 479         maxByte = 0x3F;
 480     } else if(strength == UCOL_PRIMARY) {
 481         maxByte = 0xFE;
 482     } else {
 483         maxByte = 0xFF;
 484     }
 485
 486     uint32_t count = tok->toInsert;
 487
 488     if(low >= high && strength > UCOL_PRIMARY) {
 489         int32_t s = strength;
 490         for(;;) {
 491             s--;
 492             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 493                 if(strength == UCOL_SECONDARY) {
 494                     if (low < UCOL_COMMON_TOP2<<24 ) {
 495                        // Override if low range is less than UCOL_COMMON_TOP2.
 496                         low = UCOL_COMMON_TOP2<<24;
 497                     }
 498                     high = 0xFFFFFFFF;
 499                 } else {
 500                     // Override if low range is less than UCOL_COMMON_BOT3.
 501                     if ( low < UCOL_COMMON_BOT3<<24 ) {
 502                         low = UCOL_COMMON_BOT3<<24;
 503                     }
 504                     high = 0x40000000;
 505                 }
 506                 break;
 507             }
 508             if(s<0) {
 509                 *status = U_INTERNAL_PROGRAM_ERROR;
 510                 return 0;
 511             }
 512         }
 513     }
 514
 515     if(low == 0) {
 516         low = 0x01000000;
 517     }
 518
 519     if(strength == UCOL_SECONDARY) { /* similar as simple */
 520         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 521             low = UCOL_COMMON_TOP2<<24;
 522         }
 523         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 524             high = UCOL_COMMON_TOP2<<24;
 525         }
 526         if(low < (UCOL_COMMON_BOT2<<24)) {
 527             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 528             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 529             //g->current = UCOL_COMMON_BOT2<<24;
 530             return g->current;
 531         }
 532     }
 533
 534     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 535     if(g->noOfRanges == 0) {
 536         *status = U_INTERNAL_PROGRAM_ERROR;
 537     }
 538     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 539     return g->current;
 540 }
 541
 542 static
 543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 544     uint32_t i = 0;
 545     UChar c;
 546
 547     if(U_FAILURE(*status)) {
 548         return 0;
 549     }
 550
 551     if(sourceLen > resLen) {
 552         *status = U_MEMORY_ALLOCATION_ERROR;
 553         return 0;
 554     }
 555
 556     for(i = 0; i < sourceLen; i++) {
 557         c = source[i];
 558         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 559             switch(c - 0x3000) {
 560             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 561             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 562                 c++;
 563                 break;
 564             case 0xF5:
 565                 c = 0x30AB;
 566                 break;
 567             case 0xF6:
 568                 c = 0x30B1;
 569                 break;
 570             }
 571         }
 572         resBuf[i] = c;
 573     }
 574     return sourceLen;
 575 }
 576
 577 static
 578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 579     uint32_t i = 0;
 580     UChar c;
 581
 582     if(U_FAILURE(*status)) {
 583         return 0;
 584     }
 585
 586     if(sourceLen > resLen) {
 587         *status = U_MEMORY_ALLOCATION_ERROR;
 588         return 0;
 589     }
 590
 591     for(i = 0; i < sourceLen; i++) {
 592         c = source[i];
 593         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
 594             switch(c - 0x3000) {
 595             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 596             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 597                 c--;
 598                 break;
 599             case 0xAB:
 600                 c = 0x30F5;
 601                 break;
 602             case 0xB1:
 603                 c = 0x30F6;
 604                 break;
 605             }
 606         }
 607         resBuf[i] = c;
 608     }
 609     return sourceLen;
 610 }
 611
 612 static
 613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 614     uint32_t i = 0;
 615     UChar n[128];
 616     uint32_t nLen = 0;
 617     uint32_t uCount = 0, lCount = 0;
 618
 619     collIterate s;
 620     uint32_t order = 0;
 621
 622     if(U_FAILURE(*status)) {
 623         return UCOL_LOWER_CASE;
 624     }
 625
 626     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 627     if(U_SUCCESS(*status)) {
 628         for(i = 0; i < nLen; i++) {
 629             uprv_init_collIterate(UCA, &n[i], 1, &s);
 630             order = ucol_getNextCE(UCA, &s, status);
 631             if(isContinuation(order)) {
 632                 *status = U_INTERNAL_PROGRAM_ERROR;
 633                 return UCOL_LOWER_CASE;
 634             }
 635             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 636                 uCount++;
 637             } else {
 638                 if(u_islower(n[i])) {
 639                     lCount++;
 640                 } else {
 641                     UChar sk[1], lk[1];
 642                     u_toSmallKana(&n[i], 1, sk, 1, status);
 643                     u_toLargeKana(&n[i], 1, lk, 1, status);
 644                     if(sk[0] == n[i] && lk[0] != n[i]) {
 645                         lCount++;
 646                     }
 647                 }
 648             }
 649         }
 650     }
 651
 652     if(uCount != 0 && lCount != 0) {
 653         return UCOL_MIXED_CASE;
 654     } else if(uCount != 0) {
 655         return UCOL_UPPER_CASE;
 656     } else {
 657         return UCOL_LOWER_CASE;
 658     }
 659 }
 660
 661
 662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
 663     /* this one makes the table and stuff */
 664     uint32_t noOfBytes[3];
 665     uint32_t i;
 666
 667     for(i = 0; i<3; i++) {
 668         ucol_countBytes(CEparts[i], noOfBytes[i]);
 669     }
 670
 671     /* Here we have to pack CEs from parts */
 672
 673     uint32_t CEi = 0;
 674     uint32_t value = 0;
 675
 676     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
 677         if(CEi > 0) {
 678             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
 679         } else {
 680             value = 0;
 681         }
 682
 683         if(2*CEi<noOfBytes[0]) {
 684             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
 685         }
 686         if(CEi<noOfBytes[1]) {
 687             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
 688         }
 689         if(CEi<noOfBytes[2]) {
 690             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
 691         }
 692         tok->CEs[CEi] = value;
 693         CEi++;
 694     }
 695     if(CEi == 0) { /* totally ignorable */
 696         tok->noOfCEs = 1;
 697         tok->CEs[0] = 0;
 698     } else { /* there is at least something */
 699         tok->noOfCEs = CEi;
 700     }
 701
 702
 703     // we want to set case bits here and now, not later.
 704     // Case bits handling
 705     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
 706         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 707         int32_t cSize = (tok->source & 0xFF000000) >> 24;
 708         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
 709
 710         if(cSize > 1) {
 711             // Do it manually
 712             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
 713         } else {
 714             // Copy it from the UCA
 715             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
 716             tok->CEs[0] |= (caseCE & 0xC0);
 717         }
 718     }
 719
 720 #if UCOL_DEBUG==2
 721     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
 722     for(i = 0; i<tok->noOfCEs; i++) {
 723         fprintf(stderr, "%08X ", tok->CEs[i]);
 724     }
 725     fprintf(stderr, "\n");
 726 #endif
 727 }
 728
 729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 730     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
 731     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
 732
 733     UColToken *tok = lh->last;
 734     uint32_t t[UCOL_STRENGTH_LIMIT];
 735
 736     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
 737
 738     tok->toInsert = 1;
 739     t[tok->strength] = 1;
 740
 741     while(tok->previous != NULL) {
 742         if(tok->previous->strength < tok->strength) { /* going up */
 743             t[tok->strength] = 0;
 744             t[tok->previous->strength]++;
 745         } else if(tok->previous->strength > tok->strength) { /* going down */
 746             t[tok->previous->strength] = 1;
 747         } else {
 748             t[tok->strength]++;
 749         }
 750         tok=tok->previous;
 751         tok->toInsert = t[tok->strength];
 752     }
 753
 754     tok->toInsert = t[tok->strength];
 755     ucol_inv_getGapPositions(src, lh, status);
 756
 757 #if UCOL_DEBUG
 758     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
 759     int32_t j = 2;
 760     for(j = 2; j >= 0; j--) {
 761         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
 762         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
 763     }
 764     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
 765
 766     do {
 767         fprintf(stderr,"%i", tok->strength);
 768         tok = tok->next;
 769     } while(tok != NULL);
 770     fprintf(stderr, "\n");
 771
 772     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
 773
 774     do {
 775         fprintf(stderr,"%i", tok->toInsert);
 776         tok = tok->next;
 777     } while(tok != NULL);
 778 #endif
 779
 780     tok = lh->first;
 781     uint32_t fStrength = UCOL_IDENTICAL;
 782     uint32_t initStrength = UCOL_IDENTICAL;
 783
 784
 785     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
 786     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
 787     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
 788
 789     while (tok != NULL && U_SUCCESS(*status)) {
 790         fStrength = tok->strength;
 791         if(fStrength < initStrength) {
 792             initStrength = fStrength;
 793             if(lh->pos[fStrength] == -1) {
 794                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
 795                     fStrength--;
 796                 }
 797                 if(lh->pos[fStrength] == -1) {
 798                     *status = U_INTERNAL_PROGRAM_ERROR;
 799                     return;
 800                 }
 801             }
 802             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
 803                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 804                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
 805                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
 806                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
 807             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
 808                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 809                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
 810                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 811                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 812             } else { /* primaries */
 813                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
 814                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 815                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 816                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 817             }
 818         } else {
 819             if(tok->strength == UCOL_TERTIARY) {
 820                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
 821             } else if(tok->strength == UCOL_SECONDARY) {
 822                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
 823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 824             } else if(tok->strength == UCOL_PRIMARY) {
 825                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
 826                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 827                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 828             }
 829         }
 830         ucol_doCE(src, CEparts, tok, status);
 831         tok = tok->next;
 832     }
 833 }
 834
 835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
 836     UCAElements el;
 837     UColToken *tok = lh->first;
 838     UColToken *expt = NULL;
 839     uint32_t i = 0, j = 0;
 840     const uint16_t  *fcdTrieData = unorm_getFCDTrie(status);
 841
 842     while(tok != NULL && U_SUCCESS(*status)) {
 843         /* first, check if there are any expansions */
 844         /* if there are expansions, we need to do a little bit more processing */
 845         /* since parts of expansion can be tailored, while others are not */
 846         if(tok->expansion != 0) {
 847             uint32_t len = tok->expansion >> 24;
 848             uint32_t currentSequenceLen = len;
 849             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 850             //uint32_t exp = currentSequenceLen | expOffset;
 851             UColToken exp;
 852             exp.source = currentSequenceLen | expOffset;
 853             exp.rulesToParse = src->source;
 854
 855             while(len > 0) {
 856                 currentSequenceLen = len;
 857                 while(currentSequenceLen > 0) {
 858                     exp.source = (currentSequenceLen << 24) | expOffset;
 859                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 860                         uint32_t noOfCEsToCopy = expt->noOfCEs;
 861                         for(j = 0; j<noOfCEsToCopy; j++) {
 862                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 863                         }
 864                         tok->noOfExpCEs += noOfCEsToCopy;
 865                         // Smart people never try to add codepoints and CEs.
 866                         // For some odd reason, it won't work.
 867                         expOffset += currentSequenceLen; //noOfCEsToCopy;
 868                         len -= currentSequenceLen; //noOfCEsToCopy;
 869                         break;
 870                     } else {
 871                         currentSequenceLen--;
 872                     }
 873                 }
 874                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 875                     /* will have to get one from UCA */
 876                     /* first, get the UChars from the rules */
 877                     /* then pick CEs out until there is no more and stuff them into expansion */
 878                     collIterate s;
 879                     uint32_t order = 0;
 880                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
 881
 882                     for(;;) {
 883                         order = ucol_getNextCE(src->UCA, &s, status);
 884                         if(order == UCOL_NO_MORE_CES) {
 885                             break;
 886                         }
 887                         tok->expCEs[tok->noOfExpCEs++] = order;
 888                     }
 889                     expOffset++;
 890                     len--;
 891                 }
 892             }
 893         } else {
 894             tok->noOfExpCEs = 0;
 895         }
 896
 897         /* set the ucaelement with obtained values */
 898         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 899         /* copy CEs */
 900         for(i = 0; i<tok->noOfCEs; i++) {
 901             el.CEs[i] = tok->CEs[i];
 902         }
 903         for(i = 0; i<tok->noOfExpCEs; i++) {
 904             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 905         }
 906
 907         /* copy UChars */
 908         // We kept prefix and source kind of together, as it is a kind of a contraction.
 909         // However, now we have to slice the prefix off the main thing -
 910         el.prefix = el.prefixChars;
 911         el.cPoints = el.uchars;
 912         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 913             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 914             // decomposed elements to the unsaf table.
 915             el.prefixSize = tok->prefix>>24;
 916             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 917
 918             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 919             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 920         } else {
 921             el.prefixSize = 0;
 922             *el.prefix = 0;
 923
 924             el.cSize = (tok->source >> 24);
 925             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 926         }
 927         if(src->UCA != NULL) {
 928             UBool containCombinMarks = FALSE;
 929             for(i = 0; i<el.cSize; i++) {
 930                 if(UCOL_ISJAMO(el.cPoints[i])) {
 931                     t->image->jamoSpecial = TRUE;
 932                 }
 933                 if ( !src->buildCCTabFlag ) {
 934                     // check combining class
 935                     int16_t fcd = unorm_getFCD16(fcdTrieData, el.cPoints[i]);
 936                     if ( (fcd && 0xff) == 0 ) {
 937                         // reset flag when current char is not combining mark.
 938                         containCombinMarks = FALSE;
 939                     }
 940                     else {
 941                         containCombinMarks = TRUE;
 942                     }
 943                 }
 944             }
 945             if ( !src->buildCCTabFlag && containCombinMarks ) {
 946                 src->buildCCTabFlag = TRUE;
 947             }
 948         }
 949
 950         /* and then, add it */
 951 #if UCOL_DEBUG==2
 952         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 953 #endif
 954         uprv_uca_addAnElement(t, &el, status);
 955
 956 #if UCOL_DEBUG_DUPLICATES
 957         if(*status != U_ZERO_ERROR) {
 958             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 959             *status = U_ZERO_ERROR;
 960         }
 961 #endif
 962
 963         tok = tok->next;
 964     }
 965 }
 966
 967 U_CDECL_BEGIN
 968 static UBool U_CALLCONV
 969 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 970     UErrorCode status = U_ZERO_ERROR;
 971     tempUCATable *t = (tempUCATable *)context;
 972     if(value == 0) {
 973         while(start < limit) {
 974             uint32_t CE = utrie_get32(t->mapping, start, NULL);
 975             if(CE == UCOL_NOT_FOUND) {
 976                 UCAElements el;
 977                 el.isThai = FALSE;
 978                 el.prefixSize = 0;
 979                 el.prefixChars[0] = 0;
 980                 el.prefix = el.prefixChars;
 981                 el.cPoints = el.uchars;
 982
 983                 el.cSize = 0;
 984                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
 985
 986                 el.noOfCEs = 1;
 987                 el.CEs[0] = 0;
 988                 uprv_uca_addAnElement(t, &el, &status);
 989
 990             }
 991             start++;
 992         }
 993     }
 994     if(U_FAILURE(status)) {
 995         return FALSE;
 996     } else {
 997         return TRUE;
 998     }
 999 }
1000 U_CDECL_END
1001
1002 static void
1003 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1004                                UChar32 start, UChar32 end,
1005                                UErrorCode *status)
1006 {
1007     //UChar decomp[256];
1008     uint32_t CE = UCOL_NOT_FOUND;
1009     UChar32 u = 0;
1010     UCAElements el;
1011     el.isThai = FALSE;
1012     el.prefixSize = 0;
1013     el.prefixChars[0] = 0;
1014     collIterate colIt;
1015
1016     if(U_SUCCESS(*status)) {
1017         for(u = start; u<=end; u++) {
1018             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1019                 /* this test is for contractions that are missing the starting element. */
1020                 || ((isCntTableElement(CE)) &&
1021                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1022                 )
1023             {
1024                 el.cSize = 0;
1025                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1026                 //decomp[0] = (UChar)u;
1027                 //el.uchars[0] = (UChar)u;
1028                 el.cPoints = el.uchars;
1029                 //el.cSize = 1;
1030                 el.noOfCEs = 0;
1031                 el.prefix = el.prefixChars;
1032                 el.prefixSize = 0;
1033                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1034                 // We actually want to check whether this element is a special
1035                 // If it is an implicit element (hangul, CJK - we want to copy the
1036                 // special, not the resolved CEs) - for hangul, copying resolved
1037                 // would just make things the same (there is an expansion and it
1038                 // takes approximately the same amount of time to resolve as
1039                 // falling back to the UCA).
1040                 /*
1041                 UTRIE_GET32(src->UCA->mapping, u, CE);
1042                 tag = getCETag(CE);
1043                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1044                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1045                 || tag == LEAD_SURROGATE_TAG) {
1046                 el.CEs[el.noOfCEs++] = CE;
1047                 } else {
1048                 */
1049                 // It turns out that it does not make sense to keep implicits
1050                 // unresolved. The cost of resolving them is big enough so that
1051                 // it doesn't make any difference whether we have to go to the UCA
1052                 // or not.
1053                 {
1054                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1055                     while(CE != UCOL_NO_MORE_CES) {
1056                         CE = ucol_getNextCE(src->UCA, &colIt, status);
1057                         if(CE != UCOL_NO_MORE_CES) {
1058                             el.CEs[el.noOfCEs++] = CE;
1059                         }
1060                     }
1061                 }
1062                 uprv_uca_addAnElement(t, &el, status);
1063             }
1064         }
1065     }
1066 }
1067
1068 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1069     U_NAMESPACE_USE
1070
1071     uint32_t i = 0;
1072     if(U_FAILURE(*status)) {
1073         return NULL;
1074     }
1075     /*
1076     2.  Eliminate the negative lists by doing the following for each non-null negative list:
1077     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1078     create new ListHeader X
1079     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1080     first item you add, based on the stronger strength levels of the two lists.
1081     */
1082     /*
1083     3.  For each ListHeader with a non-null positive list:
1084     */
1085     /*
1086     o   Find all character strings with CEs between the baseCE and the
1087     next/previous CE, at the strength of the first token. Add these to the
1088     tailoring.
1089     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1090     tailoring has & x < z...
1091     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1092     */
1093     /* It is possible that this part should be done even while constructing list */
1094     /* The problem is that it is unknown what is going to be the strongest weight */
1095     /* So we might as well do it here */
1096
1097     /*
1098     o   Allocate CEs for each token in the list, based on the total number N of the
1099     largest level difference, and the gap G between baseCE and nextCE at that
1100     level. The relation * between the last item and nextCE is the same as the
1101     strongest strength.
1102     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1103     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1104     Then fit b and c into the secondary gap between a and d, then fit q
1105     into the tertiary gap between b and c.
1106
1107     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1108     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1109     Then fit q into the tertiary gap between b and c.
1110     o   When incrementing primary values, we will not cross high byte
1111     boundaries except where there is only a single-byte primary. That is to
1112     ensure that the script reordering will continue to work.
1113     */
1114     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1115     /* test for NULL */
1116     if (image == NULL) {
1117         *status = U_MEMORY_ALLOCATION_ERROR;
1118         return NULL;
1119     }
1120     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1121
1122     for(i = 0; i<src->resultLen; i++) {
1123         /* now we need to generate the CEs */
1124         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1125         /* According to strength                                                          */
1126         if(U_SUCCESS(*status)) {
1127             if(src->lh[i].first) { // if there are any elements
1128                 // due to the way parser works, subsequent tailorings
1129                 // may remove all the elements from a sequence, therefore
1130                 // leaving an empty tailoring sequence.
1131                 ucol_initBuffers(src, &src->lh[i], status);
1132             }
1133         }
1134         if(U_FAILURE(*status)) {
1135             uprv_free(image);
1136             return NULL;
1137         }
1138     }
1139
1140     if(src->varTop != NULL) { /* stuff the variable top value */
1141         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1142         /* remove it from the list */
1143         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1144             src->varTop->listHeader->first = src->varTop->next;
1145         }
1146         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1147             src->varTop->listHeader->last = src->varTop->previous;
1148         }
1149         if(src->varTop->next != NULL) {
1150             src->varTop->next->previous = src->varTop->previous;
1151         }
1152         if(src->varTop->previous != NULL) {
1153             src->varTop->previous->next = src->varTop->next;
1154         }
1155     }
1156
1157
1158     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1159     if(U_FAILURE(*status)) {
1160         uprv_free(image);
1161         return NULL;
1162     }
1163
1164
1165     /* After this, we have assigned CE values to all regular CEs      */
1166     /* now we will go through list once more and resolve expansions,  */
1167     /* make UCAElements structs and add them to table                 */
1168     for(i = 0; i<src->resultLen; i++) {
1169         /* now we need to generate the CEs */
1170         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1171         /* According to strength                                                          */
1172         if(U_SUCCESS(*status)) {
1173             ucol_createElements(src, t, &src->lh[i], status);
1174         }
1175     }
1176
1177     UCAElements el;
1178     el.isThai = FALSE;
1179     el.prefixSize = 0;
1180     el.prefixChars[0] = 0;
1181
1182     /* add latin-1 stuff */
1183     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1184
1185     /* add stuff for copying */
1186     if(src->copySet != NULL) {
1187         int32_t i = 0;
1188         UnicodeSet *set = (UnicodeSet *)src->copySet;
1189         for(i = 0; i < set->getRangeCount(); i++) {
1190             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1191         }
1192     }
1193
1194     if(U_SUCCESS(*status)) {
1195         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1196
1197         uint32_t tailoredCE = UCOL_NOT_FOUND;
1198         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1199         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1200         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1201         // Check for null pointer
1202         if (ucaEl == NULL) {
1203                 *status = U_MEMORY_ALLOCATION_ERROR;
1204                 return NULL;
1205         }
1206         while(*conts != 0) {
1207             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1208             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1209             if(tailoredCE != UCOL_NOT_FOUND) {
1210                 UBool needToAdd = TRUE;
1211                 if(isCntTableElement(tailoredCE)) {
1212                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1213                         needToAdd = FALSE;
1214                     }
1215                 }
1216                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1217                     UCAElements elm;
1218                     elm.cPoints = el.uchars;
1219                     elm.noOfCEs = 0;
1220                     elm.uchars[0] = *conts;
1221                     elm.uchars[1] = 0;
1222                     elm.cSize = 1;
1223                     elm.prefixChars[0] = *(conts+2);
1224                     elm.isThai = FALSE;
1225                     elm.prefix = elm.prefixChars;
1226                     elm.prefixSize = 1;
1227                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1228                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1229                         needToAdd = TRUE;
1230                     }
1231                 }
1232                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1233                     needToAdd = FALSE;
1234                 }
1235
1236                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1237                     if (*(conts+1) != 0) {  // contractions
1238                         el.prefix = el.prefixChars;
1239                         el.prefixSize = 0;
1240                         el.cPoints = el.uchars;
1241                         el.noOfCEs = 0;
1242                         el.uchars[0] = *conts;
1243                         el.uchars[1] = *(conts+1);
1244                         if(*(conts+2)!=0) {
1245                             el.uchars[2] = *(conts+2);
1246                             el.cSize = 3;
1247                         } else {
1248                             el.cSize = 2;
1249                         }
1250                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
1251                     }
1252                     else { // pre-context character
1253                         UChar str[4] = { 0 };
1254                         int32_t len=0;
1255                         int32_t preKeyLen=0;
1256
1257                         el.cPoints = el.uchars;
1258                         el.noOfCEs = 0;
1259                         el.uchars[0] = *conts;
1260                         el.uchars[1] = 0;
1261                         el.cSize = 1;
1262                         el.prefixChars[0] = *(conts+2);
1263                         el.prefix = el.prefixChars;
1264                         el.prefixSize = 1;
1265                         if (el.prefixChars[0]!=0) {
1266                             // get CE of prefix character first
1267                             str[0]=el.prefixChars[0];
1268                             str[1]=0;
1269                             ucol_setText(ucaEl, str, 1, status);
1270                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1271                                     != UCOL_NULLORDER) {
1272                                 preKeyLen++;  // count number of keys for prefix character
1273                             }
1274                             str[len++] = el.prefixChars[0];
1275                         }
1276
1277                         str[len++] = el.uchars[0];
1278                         str[len]=0;
1279                         ucol_setText(ucaEl, str, len, status);
1280                         // Skip the keys for prefix character, then copy the rest to el.
1281                         while ((preKeyLen-->0) &&
1282                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1283                             continue;
1284                         }
1285
1286                     }
1287                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1288                         el.noOfCEs++;
1289                     }
1290                     uprv_uca_addAnElement(t, &el, status);
1291                 }
1292
1293             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1294                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1295             }
1296             conts+=3;
1297         }
1298         ucol_closeElements(ucaEl);
1299     }
1300
1301     // Add completely ignorable elements
1302     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1303
1304     // add tailoring characters related canonical closures
1305     uprv_uca_canonicalClosure(t, src, status);
1306
1307     /* still need to produce compatibility closure */
1308
1309     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1310
1311     uprv_uca_closeTempTable(t);
1312     uprv_free(image);
1313
1314     return myData;
1315 }
1316
1317 U_CDECL_BEGIN
1318 static UBool U_CALLCONV
1319 ucol_bld_cleanup(void)
1320 {
1321     udata_close(invUCA_DATA_MEM);
1322     invUCA_DATA_MEM = NULL;
1323     _staticInvUCA = NULL;
1324     return TRUE;
1325 }
1326 U_CDECL_END
1327
1328 U_CAPI const InverseUCATableHeader * U_EXPORT2
1329 ucol_initInverseUCA(UErrorCode *status)
1330 {
1331     if(U_FAILURE(*status)) return NULL;
1332
1333     UBool needsInit;
1334     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1335
1336     if(needsInit) {
1337         InverseUCATableHeader *newInvUCA = NULL;
1338         UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1339
1340         if(U_FAILURE(*status)) {
1341             if (result) {
1342                 udata_close(result);
1343             }
1344             // This is not needed, as we are talking about
1345             // memory we got from UData
1346             //uprv_free(newInvUCA);
1347         }
1348
1349         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1350             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1351             UCollator *UCA = ucol_initUCA(status);
1352             // UCA versions of UCA and inverse UCA should match
1353             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1354                 *status = U_INVALID_FORMAT_ERROR;
1355                 udata_close(result);
1356                 return NULL;
1357             }
1358
1359             umtx_lock(NULL);
1360             if(_staticInvUCA == NULL) {
1361                 _staticInvUCA = newInvUCA;
1362                 invUCA_DATA_MEM = result;
1363                 result = NULL;
1364                 newInvUCA = NULL;
1365             }
1366             umtx_unlock(NULL);
1367
1368             if(newInvUCA != NULL) {
1369                 udata_close(result);
1370                 // This is not needed, as we are talking about
1371                 // memory we got from UData
1372                 //uprv_free(newInvUCA);
1373             }
1374             else {
1375                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1376             }
1377         }
1378     }
1379     return _staticInvUCA;
1380 }
1381
1382 #endif /* #if !UCONFIG_NO_COLLATION */