icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2004, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/uchar.h"
  26 #include "ucol_bld.h"
  27 #include "ucln_in.h"
  28 #include "umutex.h"
  29 #include "unicode/uniset.h"
  30
  31 static const InverseUCATableHeader* _staticInvUCA = NULL;
  32 static UDataMemory* invUCA_DATA_MEM = NULL;
  33
  34 U_CDECL_BEGIN
  35 static UBool U_CALLCONV
  36 isAcceptableInvUCA(void * /*context*/,
  37              const char * /*type*/, const char * /*name*/,
  38              const UDataInfo *pInfo){
  39   /* context, type & name are intentionally not used */
  40     if( pInfo->size>=20 &&
  41         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  42         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  43         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  44         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  45         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  46         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  47         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  48         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  49         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  50         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  51         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  52         ) {
  53         UVersionInfo UCDVersion;
  54         u_getUnicodeVersion(UCDVersion);
  55         if(pInfo->dataVersion[0]==UCDVersion[0] &&
  56         pInfo->dataVersion[1]==UCDVersion[1]) {
  57         //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  58         //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  59         //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  60           return TRUE;
  61         } else {
  62           return FALSE;
  63         }
  64     } else {
  65         return FALSE;
  66     }
  67 }
  68 U_CDECL_END
  69
  70 static
  71 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
  72   uint32_t bottom = 0, top = src->invUCA->tableSize;
  73   uint32_t i = 0;
  74   uint32_t first = 0, second = 0;
  75   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
  76
  77   while(bottom < top-1) {
  78     i = (top+bottom)/2;
  79     first = *(CETable+3*i);
  80     second = *(CETable+3*i+1);
  81     if(first > CE) {
  82       top = i;
  83     } else if(first < CE) {
  84       bottom = i;
  85     } else {
  86         if(second > SecondCE) {
  87           top = i;
  88         } else if(second < SecondCE) {
  89           bottom = i;
  90         } else {
  91           break;
  92         }
  93     }
  94   }
  95
  96   /* weiv:                                                  */
  97   /* in searching for elements, I have removed the failure  */
  98   /* The reason for this is that the builder does not rely  */
  99   /* on search mechanism telling it that it didn't find an  */
 100   /* element. However, indirect positioning relies on being */
 101   /* able to find the elements around any CE, even if it is */
 102   /* not defined in the UCA. */
 103   return i;
 104 /*
 105   if((first == CE && second == SecondCE)) {
 106     return i;
 107   } else {
 108     return -1;
 109   }
 110 */
 111 }
 112
/*
 * Bit masks indexed by strength. The functions in this file AND a CE (and
 * its continuation) with strengthMask[strength] before comparing, so that
 * only the bits selected for that strength take part in the comparison.
 * Presumably index 0/1/2 correspond to primary/secondary/tertiary - confirm
 * against the UCOL_* strength constants.
 */
static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
  0xFFFF0000,   /* keep the upper 16 bits only */
  0xFFFFFF00,   /* keep the upper 24 bits */
  0xFFFFFFFF    /* keep all 32 bits */
};
 118
 119 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 120                                             uint32_t CE, uint32_t contCE,
 121                                             uint32_t *nextCE, uint32_t *nextContCE,
 122                                             uint32_t strength) {
 123   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 124   int32_t iCE;
 125
 126   iCE = ucol_inv_findCE(src, CE, contCE);
 127
 128   if(iCE<0) {
 129     *nextCE = UCOL_NOT_FOUND;
 130     return -1;
 131   }
 132
 133   CE &= strengthMask[strength];
 134   contCE &= strengthMask[strength];
 135
 136   *nextCE = CE;
 137   *nextContCE = contCE;
 138
 139   while((*nextCE  & strengthMask[strength]) == CE
 140     && (*nextContCE  & strengthMask[strength]) == contCE) {
 141     *nextCE = (*(CETable+3*(++iCE)));
 142     *nextContCE = (*(CETable+3*(iCE)+1));
 143   }
 144
 145   return iCE;
 146 }
 147
 148 U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 149                                             uint32_t CE, uint32_t contCE,
 150                                             uint32_t *prevCE, uint32_t *prevContCE,
 151                                             uint32_t strength) {
 152   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 153   int32_t iCE;
 154
 155   iCE = ucol_inv_findCE(src, CE, contCE);
 156
 157   if(iCE<0) {
 158     *prevCE = UCOL_NOT_FOUND;
 159     return -1;
 160   }
 161
 162   CE &= strengthMask[strength];
 163   contCE &= strengthMask[strength];
 164
 165   *prevCE = CE;
 166   *prevContCE = contCE;
 167
 168   while((*prevCE  & strengthMask[strength]) == CE
 169     && (*prevContCE  & strengthMask[strength])== contCE
 170     && iCE > 0) { /* this condition should prevent falling off the edge of the world */
 171     /* here, we end up in a singularity - zero */
 172     *prevCE = (*(CETable+3*(--iCE)));
 173     *prevContCE = (*(CETable+3*(iCE)+1));
 174   }
 175
 176   return iCE;
 177 }
 178
 179 U_CAPI uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 180                                             uint32_t prevCE, uint32_t prevContCE) {
 181     uint32_t strength = UCOL_TERTIARY;
 182     while(((prevCE & strengthMask[strength]) != (CE & strengthMask[strength])
 183         || (prevContCE & strengthMask[strength]) != (contCE & strengthMask[strength]))
 184         && strength) {
 185         strength--;
 186     }
 187     return strength;
 188
 189 }
 190
 191
 192 static
 193 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 194
 195   uint32_t CE = lh->baseCE;
 196   uint32_t SecondCE = lh->baseContCE;
 197
 198   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 199   uint32_t previousCE, previousContCE;
 200   int32_t iCE;
 201
 202   iCE = ucol_inv_findCE(src, CE, SecondCE);
 203
 204   if(iCE<0) {
 205     return -1;
 206   }
 207
 208   CE &= strengthMask[strength];
 209   SecondCE &= strengthMask[strength];
 210
 211   previousCE = CE;
 212   previousContCE = SecondCE;
 213
 214   while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 215     previousCE = (*(CETable+3*(--iCE)));
 216     previousContCE = (*(CETable+3*(iCE)+1));
 217   }
 218   lh->previousCE = previousCE;
 219   lh->previousContCE = previousContCE;
 220
 221   return iCE;
 222 }
 223
 224 static
 225 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 226   uint32_t CE = lh->baseCE;
 227   uint32_t SecondCE = lh->baseContCE;
 228
 229   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 230   uint32_t nextCE, nextContCE;
 231   int32_t iCE;
 232
 233   iCE = ucol_inv_findCE(src, CE, SecondCE);
 234
 235   if(iCE<0) {
 236     return -1;
 237   }
 238
 239   CE &= strengthMask[strength];
 240   SecondCE &= strengthMask[strength];
 241
 242   nextCE = CE;
 243   nextContCE = SecondCE;
 244
 245   while((nextCE  & strengthMask[strength]) == CE
 246     && (nextContCE  & strengthMask[strength]) == SecondCE) {
 247     nextCE = (*(CETable+3*(++iCE)));
 248     nextContCE = (*(CETable+3*(iCE)+1));
 249   }
 250
 251   lh->nextCE = nextCE;
 252   lh->nextContCE = nextContCE;
 253
 254   return iCE;
 255 }
 256
/*
 * Fills in the gap boundaries of a token list: lh->gapsLo / lh->gapsHi
 * (three values per strength slot), lh->pos, lh->fStrToken and
 * lh->lStrToken. All of these, plus lh->numStr, are reset first.
 *
 * Three cases are distinguished:
 *  - the lead byte of lh->baseCE lies in the implicit range taken from the
 *    UCA constants: slot 0 gets the base CE as the low boundary and the
 *    next implicit primary as the high boundary;
 *  - the list is indirect and lh->nextCE is non-zero: slot 0 gets the base
 *    CE as the low boundary and lh->nextCE/lh->nextContCE as the high one;
 *  - otherwise the tokens are walked and, for every strength encountered,
 *    ucol_inv_getNext() locates the high boundary in the inverse UCA table.
 *
 * @param src    parser holding the UCA and inverse UCA data
 * @param lh     token list header to fill in
 * @param status set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext()
 *               returns a negative position; processing continues anyway
 */
U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
  /* reset all the gaps */
  int32_t i = 0;
  uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
  uint32_t st = 0;
  uint32_t t1, t2;
  int32_t pos;

  UColToken *tok = lh->first;
  uint32_t tokStrength = tok->strength;

  for(i = 0; i<3; i++) {
    lh->gapsHi[3*i] = 0;
    lh->gapsHi[3*i+1] = 0;
    lh->gapsHi[3*i+2] = 0;
    lh->gapsLo[3*i] = 0;
    lh->gapsLo[3*i+1] = 0;
    lh->gapsLo[3*i+2] = 0;
    lh->numStr[i] = 0;
    lh->fStrToken[i] = NULL;
    lh->lStrToken[i] = NULL;
    lh->pos[i] = -1;
  }

  /* the UCA constants live inside the UCA image, at offset UCAConsts */
  UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

  if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
  //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    lh->pos[0] = 0;
    t1 = lh->baseCE;
    t2 = lh->baseContCE;
    /* low boundary: the base CE, split into one word per weight level */
    lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    /* high boundary: the implicit primary that follows the base one */
    uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
    primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);

    /* rebuild a CE / continuation pair from the new primary */
    t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
    t2 = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;

    lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
  } else if(lh->indirect == TRUE && lh->nextCE != 0) {
  //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    /* indirect positioning: the boundaries are already known from the header */
    lh->pos[0] = 0;
    t1 = lh->baseCE;
    t2 = lh->baseContCE;
    lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    t1 = lh->nextCE;
    t2 = lh->nextContCE;
    lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
  } else {
    /* walk the token list one strength run at a time */
    for(;;) {
      if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
        /* remember where the high boundary for this strength sits in the table */
        if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
          lh->fStrToken[tokStrength] = tok;
        } else { /* The CE must be implicit, since it's not in the table */
          /* Error */
          *status = U_INTERNAL_PROGRAM_ERROR;
        }
      }

      /* skip over all tokens of this or a weaker strength, tracking the last one */
      while(tok != NULL && tok->strength >= tokStrength) {
        if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
          lh->lStrToken[tokStrength] = tok;
        }
        tok = tok->next;
      }
      if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
        /* check if previous interval is the same and merge the intervals if it is so */
        if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
          lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
          lh->fStrToken[tokStrength+1] = NULL;
          lh->lStrToken[tokStrength+1] = NULL;
          lh->pos[tokStrength+1] = -1;
        }
      }
      if(tok != NULL) {
        tokStrength = tok->strength;
      } else {
        break;
      }
    }
    /* turn the recorded table positions into boundaries */
    for(st = 0; st < 3; st++) {
      if((pos = lh->pos[st]) >= 0) {
        /* high boundary: the table row found by ucol_inv_getNext() */
        t1 = *(CETable+3*(pos));
        t2 = *(CETable+3*(pos)+1);
        lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
        lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
        //pos--;
        //t1 = *(CETable+3*(pos));
        //t2 = *(CETable+3*(pos)+1);
        /* low boundary: always the base CE of the list */
        t1 = lh->baseCE;
        t2 = lh->baseContCE;
        lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
        lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
        lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
      }
    }
  }
}
 365
 366
 367 #define ucol_countBytes(value, noOfBytes)   \
 368 {                               \
 369   uint32_t mask = 0xFFFFFFFF;   \
 370   (noOfBytes) = 0;              \
 371   while(mask != 0) {            \
 372     if(((value) & mask) != 0) { \
 373       (noOfBytes)++;            \
 374     }                           \
 375     mask >>= 8;                 \
 376   }                             \
 377 }
 378
 379 U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 380   if(U_SUCCESS(*status)) {
 381   g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 382   }
 383   return g->current;
 384 }
 385
 386 U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 387 /* TODO: rename to enum names */
 388   uint32_t high, low, count=1;
 389   uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 390
 391   if(strength == UCOL_SECONDARY) {
 392     low = UCOL_COMMON_TOP2<<24;
 393     high = 0xFFFFFFFF;
 394     count = 0xFF - UCOL_COMMON_TOP2;
 395   } else {
 396     low = UCOL_BYTE_COMMON << 24; //0x05000000;
 397     high = 0x40000000;
 398     count = 0x40 - UCOL_BYTE_COMMON;
 399   }
 400
 401   if(tok->next != NULL && tok->next->strength == strength) {
 402     count = tok->next->toInsert;
 403   }
 404
 405   g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 406   g->current = UCOL_BYTE_COMMON<<24;
 407
 408   if(g->noOfRanges == 0) {
 409     *status = U_INTERNAL_PROGRAM_ERROR;
 410   }
 411   return g->current;
 412 }
 413
 414 U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 415   uint32_t strength = tok->strength;
 416   uint32_t low = lows[fStrength*3+strength];
 417   uint32_t high = highs[fStrength*3+strength];
 418   uint32_t maxByte = 0;
 419   if(strength == UCOL_TERTIARY) {
 420       maxByte = 0x3F;
 421   } else if(strength == UCOL_PRIMARY) {
 422       maxByte = 0xFE;
 423   } else {
 424       maxByte = 0xFF;
 425   }
 426
 427   uint32_t count = tok->toInsert;
 428
 429   if(low >= high && strength > UCOL_PRIMARY) {
 430     int32_t s = strength;
 431     for(;;) {
 432       s--;
 433       if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 434         if(strength == UCOL_SECONDARY) {
 435           low = UCOL_COMMON_TOP2<<24;
 436           high = 0xFFFFFFFF;
 437         } else {
 438           //low = 0x02000000; // This needs to be checked - what if low is
 439           // not good...
 440           high = 0x40000000;
 441         }
 442         break;
 443       }
 444       if(s<0) {
 445         *status = U_INTERNAL_PROGRAM_ERROR;
 446         return 0;
 447       }
 448     }
 449   }
 450
 451   if(low == 0) {
 452     low = 0x01000000;
 453   }
 454
 455   if(strength == UCOL_SECONDARY) { /* similar as simple */
 456     if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 457       low = UCOL_COMMON_TOP2<<24;
 458     }
 459     if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 460       high = UCOL_COMMON_TOP2<<24;
 461     }
 462     if(low < (UCOL_COMMON_BOT2<<24)) {
 463       g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 464       g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 465       //g->current = UCOL_COMMON_BOT2<<24;
 466       return g->current;
 467     }
 468   }
 469
 470   g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 471   if(g->noOfRanges == 0) {
 472     *status = U_INTERNAL_PROGRAM_ERROR;
 473   }
 474   g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 475   return g->current;
 476 }
 477
 478 static
 479 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 480   uint32_t i = 0;
 481   UChar c;
 482
 483   if(U_FAILURE(*status)) {
 484     return 0;
 485   }
 486
 487   if(sourceLen > resLen) {
 488     *status = U_MEMORY_ALLOCATION_ERROR;
 489     return 0;
 490   }
 491
 492   for(i = 0; i < sourceLen; i++) {
 493     c = source[i];
 494     if(0x3042 < c && c < 0x30ef) { /* Kana range */
 495       switch(c - 0x3000) {
 496       case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 497       case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 498         c++;
 499         break;
 500       case 0xF5:
 501         c = 0x30AB;
 502         break;
 503       case 0xF6:
 504         c = 0x30B1;
 505         break;
 506       }
 507     }
 508     resBuf[i] = c;
 509   }
 510   return sourceLen;
 511 }
 512
 513 static
 514 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 515   uint32_t i = 0;
 516   UChar c;
 517
 518   if(U_FAILURE(*status)) {
 519     return 0;
 520   }
 521
 522   if(sourceLen > resLen) {
 523     *status = U_MEMORY_ALLOCATION_ERROR;
 524     return 0;
 525   }
 526
 527   for(i = 0; i < sourceLen; i++) {
 528     c = source[i];
 529     if(0x3042 < c && c < 0x30ef) { /* Kana range */
 530       switch(c - 0x3000) {
 531       case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 532       case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 533         c--;
 534         break;
 535       case 0xAB:
 536         c = 0x30F5;
 537         break;
 538       case 0xB1:
 539         c = 0x30F6;
 540         break;
 541       }
 542     }
 543     resBuf[i] = c;
 544   }
 545   return sourceLen;
 546 }
 547
 548 static
 549 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 550   uint32_t i = 0;
 551   UChar n[128];
 552   uint32_t nLen = 0;
 553   uint32_t uCount = 0, lCount = 0;
 554
 555   collIterate s;
 556   uint32_t order = 0;
 557
 558   if(U_FAILURE(*status)) {
 559     return UCOL_LOWER_CASE;
 560   }
 561
 562   nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 563   if(U_SUCCESS(*status)) {
 564     for(i = 0; i < nLen; i++) {
 565       uprv_init_collIterate(UCA, &n[i], 1, &s);
 566       order = ucol_getNextCE(UCA, &s, status);
 567       if(isContinuation(order)) {
 568         *status = U_INTERNAL_PROGRAM_ERROR;
 569         return UCOL_LOWER_CASE;
 570       }
 571       if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 572         uCount++;
 573       } else {
 574         if(u_islower(n[i])) {
 575           lCount++;
 576         } else {
 577           UChar sk[1], lk[1];
 578           u_toSmallKana(&n[i], 1, sk, 1, status);
 579           u_toLargeKana(&n[i], 1, lk, 1, status);
 580           if(sk[0] == n[i] && lk[0] != n[i]) {
 581             lCount++;
 582           }
 583         }
 584       }
 585     }
 586   }
 587
 588   if(uCount != 0 && lCount != 0) {
 589     return UCOL_MIXED_CASE;
 590   } else if(uCount != 0) {
 591     return UCOL_UPPER_CASE;
 592   } else {
 593     return UCOL_LOWER_CASE;
 594   }
 595 }
 596
 597
 598 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
 599   /* this one makes the table and stuff */
 600   uint32_t noOfBytes[3];
 601   uint32_t i;
 602
 603   for(i = 0; i<3; i++) {
 604     ucol_countBytes(CEparts[i], noOfBytes[i]);
 605   }
 606
 607   /* Here we have to pack CEs from parts */
 608
 609   uint32_t CEi = 0;
 610   uint32_t value = 0;
 611
 612   while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
 613     if(CEi > 0) {
 614       value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
 615     } else {
 616       value = 0;
 617     }
 618
 619     if(2*CEi<noOfBytes[0]) {
 620       value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
 621     }
 622     if(CEi<noOfBytes[1]) {
 623       value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
 624     }
 625     if(CEi<noOfBytes[2]) {
 626       value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
 627     }
 628     tok->CEs[CEi] = value;
 629     CEi++;
 630   }
 631   if(CEi == 0) { /* totally ignorable */
 632     tok->noOfCEs = 1;
 633     tok->CEs[0] = 0;
 634   } else { /* there is at least something */
 635     tok->noOfCEs = CEi;
 636   }
 637
 638
 639   // we want to set case bits here and now, not later.
 640   // Case bits handling
 641   tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 642   int32_t cSize = (tok->source & 0xFF000000) >> 24;
 643   UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
 644
 645   if(cSize > 1) {
 646     // Do it manually
 647     tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
 648   } else {
 649     // Copy it from the UCA
 650     uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
 651     tok->CEs[0] |= (caseCE & 0xC0);
 652   }
 653
 654 #if UCOL_DEBUG==2
 655   fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
 656   for(i = 0; i<tok->noOfCEs; i++) {
 657     fprintf(stderr, "%08X ", tok->CEs[i]);
 658   }
 659   fprintf(stderr, "\n");
 660 #endif
 661 }
 662
/*
 * Generates the CEs for every token of one token list.
 *
 * The work is done in three steps:
 *  1. walk the list backwards from lh->last and store in each token's
 *     toInsert field a per-strength counter that is maintained along the way;
 *  2. let ucol_inv_getGapPositions() compute the gap boundaries for the list;
 *  3. walk the list forwards from lh->first, produce the three weight words
 *     (CEparts) for each token with the CE generators and hand them to
 *     ucol_doCE(), which stores the packed CEs in the token.
 *
 * @param src    parser holding the rules and the UCA data
 * @param lh     token list header to process
 * @param status the forward walk stops as soon as this indicates a failure;
 *               set to U_INTERNAL_PROGRAM_ERROR when no gap position is
 *               available for a token's strength or any stronger one
 */
U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
  ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
  uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];

  UColToken *tok = lh->last;
  uint32_t t[UCOL_STRENGTH_LIMIT];   /* running counters, one per strength */

  uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));

  tok->toInsert = 1;
  t[tok->strength] = 1;

  /* step 1: backwards pass over the list */
  while(tok->previous != NULL) {
    if(tok->previous->strength < tok->strength) { /* going up */
      t[tok->strength] = 0;
      t[tok->previous->strength]++;
    } else if(tok->previous->strength > tok->strength) { /* going down */
      t[tok->previous->strength] = 1;
    } else {
      t[tok->strength]++;
    }
    tok=tok->previous;
    tok->toInsert = t[tok->strength];
  }

  tok->toInsert = t[tok->strength];
  /* step 2: find the boundaries between which weights may be allocated */
  ucol_inv_getGapPositions(src, lh, status);

#if UCOL_DEBUG
  /* NOTE(review): this debug code indexes lh->first as an array, while the */
  /* regular code below uses lh->first as a plain pointer - it looks stale. */
  fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
  int32_t j = 2;
  for(j = 2; j >= 0; j--) {
    fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
  }
  tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];

  do {
    fprintf(stderr,"%i", tok->strength);
    tok = tok->next;
  } while(tok != NULL);
  fprintf(stderr, "\n");

  tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];

  do {
    fprintf(stderr,"%i", tok->toInsert);
    tok = tok->next;
  } while(tok != NULL);
#endif

  /* step 3: forward pass, generating the CEs */
  tok = lh->first;
  uint32_t fStrength = UCOL_IDENTICAL;
  uint32_t initStrength = UCOL_IDENTICAL;   /* strongest strength seen so far */


  /* start out from the base CE, split into one word per weight level */
  CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
  CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
  CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;

  while (tok != NULL && U_SUCCESS(*status)) {
    fStrength = tok->strength;
    if(fStrength < initStrength) {
      /* first token of a strength stronger than anything seen so far: */
      /* start new generators inside the gap                           */
      initStrength = fStrength;
      if(lh->pos[fStrength] == -1) {
        /* no gap recorded for this strength - fall back to a stronger one */
        while(lh->pos[fStrength] == -1 && fStrength > 0) {
          fStrength--;
        }
        if(lh->pos[fStrength] == -1) {
          *status = U_INTERNAL_PROGRAM_ERROR;
          return;
        }
      }
      if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
        CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
        CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
        /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
        CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
      } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
        CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
        /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
        CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
        CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
      } else { /* primaries */
        /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
        CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
        CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
        CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
      }
    } else {
      /* continue with the running generator of the token's strength and */
      /* restart the generators of all weaker levels                     */
      if(tok->strength == UCOL_TERTIARY) {
        CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
      } else if(tok->strength == UCOL_SECONDARY) {
        CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
        CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
      } else if(tok->strength == UCOL_PRIMARY) {
        CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
        CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
        CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
      }
    }
    /* pack the weight words into the token's CEs */
    ucol_doCE(src, CEparts, tok, status);
    tok = tok->next;
  }
}
 768
 769 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
 770   UCAElements el;
 771   UColToken *tok = lh->first;
 772   UColToken *expt = NULL;
 773   uint32_t i = 0, j = 0;
 774
 775   while(tok != NULL && U_SUCCESS(*status)) {
 776     /* first, check if there are any expansions */
 777     /* if there are expansions, we need to do a little bit more processing */
 778     /* since parts of expansion can be tailored, while others are not */
 779     if(tok->expansion != 0) {
 780       uint32_t len = tok->expansion >> 24;
 781       uint32_t currentSequenceLen = len;
 782       uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 783       //uint32_t exp = currentSequenceLen | expOffset;
 784       UColToken exp;
 785       exp.source = currentSequenceLen | expOffset;
 786       exp.rulesToParse = src->source;
 787
 788       while(len > 0) {
 789         currentSequenceLen = len;
 790         while(currentSequenceLen > 0) {
 791           exp.source = (currentSequenceLen << 24) | expOffset;
 792           if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 793             uint32_t noOfCEsToCopy = expt->noOfCEs;
 794             for(j = 0; j<noOfCEsToCopy; j++) {
 795               tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 796             }
 797             tok->noOfExpCEs += noOfCEsToCopy;
 798             // Smart people never try to add codepoints and CEs.
 799             // For some odd reason, it won't work.
 800             expOffset += currentSequenceLen; //noOfCEsToCopy;
 801             len -= currentSequenceLen; //noOfCEsToCopy;
 802             break;
 803           } else {
 804             currentSequenceLen--;
 805           }
 806         }
 807         if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 808           /* will have to get one from UCA */
 809           /* first, get the UChars from the rules */
 810           /* then pick CEs out until there is no more and stuff them into expansion */
 811           collIterate s;
 812           uint32_t order = 0;
 813           uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
 814
 815           for(;;) {
 816             order = ucol_getNextCE(src->UCA, &s, status);
 817             if(order == UCOL_NO_MORE_CES) {
 818                 break;
 819             }
 820             tok->expCEs[tok->noOfExpCEs++] = order;
 821           }
 822           expOffset++;
 823           len--;
 824         }
 825       }
 826     } else {
 827       tok->noOfExpCEs = 0;
 828     }
 829
 830     /* set the ucaelement with obtained values */
 831     el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 832     /* copy CEs */
 833     for(i = 0; i<tok->noOfCEs; i++) {
 834       el.CEs[i] = tok->CEs[i];
 835     }
 836     for(i = 0; i<tok->noOfExpCEs; i++) {
 837       el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 838     }
 839
 840     /* copy UChars */
 841     // We kept prefix and source kind of together, as it is a kind of a contraction.
 842     // However, now we have to slice the prefix off the main thing -
 843     el.prefix = el.prefixChars;
 844     el.cPoints = el.uchars;
 845     if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 846       // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 847       // decomposed elements to the unsaf table.
 848       el.prefixSize = tok->prefix>>24;
 849       uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 850
 851       el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 852       uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 853     } else {
 854       el.prefixSize = 0;
 855       *el.prefix = 0;
 856
 857       el.cSize = (tok->source >> 24);
 858       uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 859     }
 860
 861     if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
 862       el.isThai = TRUE;
 863     } else {
 864       el.isThai = FALSE;
 865     }
 866
 867     if(src->UCA != NULL) {
 868       for(i = 0; i<el.cSize; i++) {
 869         if(UCOL_ISJAMO(el.cPoints[i])) {
 870           t->image->jamoSpecial = TRUE;
 871         }
 872       }
 873     }
 874
 875 #if 0
 876     // we do case bits in doCE now, since we will mess up expansions otherwise.
 877     // Case bits handling
 878     el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 879     if(el.cSize > 1) {
 880       // Do it manually
 881       el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
 882     } else {
 883       // Copy it from the UCA
 884       uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
 885       el.CEs[0] |= (caseCE & 0xC0);
 886     }
 887 #endif
 888
 889     /* and then, add it */
 890 #if UCOL_DEBUG==2
 891     fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 892 #endif
 893     uprv_uca_addAnElement(t, &el, status);
 894
 895 #if 0
 896     if(el.cSize > 1) { // this is a contraction, we should check whether a composed form should also be included
 897       UChar composed[256];
 898       uint32_t compLen = unorm_normalize(el.cPoints, el.cSize, UNORM_NFC, 0, composed, 256, status);;
 899
 900       if(compLen != el.cSize || uprv_memcmp(composed, el.cPoints, el.cSize*sizeof(UChar))) {
 901         // composed form of a contraction is different than the decomposed form!
 902         // do it!
 903 #ifdef UCOL_DEBUG
 904         fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed);
 905 #endif
 906         el.cSize = compLen;
 907         uprv_memcpy(el.cPoints, composed, el.cSize*sizeof(UChar));
 908         uprv_uca_addAnElement(t, &el, status);
 909       }
 910     }
 911 #endif
 912
 913 #if UCOL_DEBUG_DUPLICATES
 914     if(*status != U_ZERO_ERROR) {
 915       fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 916       *status = U_ZERO_ERROR;
 917     }
 918 #endif
 919
 920     tok = tok->next;
 921   }
 922 }
 923
 924 U_CDECL_BEGIN
 925 static UBool U_CALLCONV
 926 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 927   UErrorCode status = U_ZERO_ERROR;
 928   tempUCATable *t = (tempUCATable *)context;
 929   if(value == 0) {
 930     while(start < limit) {
 931       uint32_t CE = utrie_get32(t->mapping, start, NULL);
 932       if(CE == UCOL_NOT_FOUND) {
 933         UCAElements el;
 934         el.isThai = FALSE;
 935         el.prefixSize = 0;
 936         el.prefixChars[0] = 0;
 937         el.prefix = el.prefixChars;
 938         el.cPoints = el.uchars;
 939
 940         el.cSize = 0;
 941         UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
 942
 943         el.noOfCEs = 1;
 944         el.CEs[0] = 0;
 945         uprv_uca_addAnElement(t, &el, &status);
 946
 947       }
 948       start++;
 949     }
 950   }
 951   if(U_FAILURE(status)) {
 952     return FALSE;
 953   } else {
 954     return TRUE;
 955   }
 956 }
 957 U_CDECL_END
 958
 959 static void
 960 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
 961                                UChar32 start, UChar32 end,
 962                                UErrorCode *status) {
 963   //UChar decomp[256];
 964   uint32_t CE = UCOL_NOT_FOUND;
 965   UChar32 u = 0;
 966   UCAElements el;
 967   el.isThai = FALSE;
 968   el.prefixSize = 0;
 969   el.prefixChars[0] = 0;
 970   collIterate colIt;
 971
 972   if(U_SUCCESS(*status)) {
 973     for(u = start; u<=end; u++) {
 974       if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
 975         /* this test is for contractions that are missing the starting element. */
 976          || ((isCntTableElement(CE)) &&
 977         (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
 978         ) {
 979         el.cSize = 0;
 980         U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
 981         //decomp[0] = (UChar)u;
 982         //el.uchars[0] = (UChar)u;
 983         el.cPoints = el.uchars;
 984         //el.cSize = 1;
 985         el.noOfCEs = 0;
 986         el.prefix = el.prefixChars;
 987         el.prefixSize = 0;
 988         //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
 989         // We actually want to check whether this element is a special
 990         // If it is an implicit element (hangul, CJK - we want to copy the
 991         // special, not the resolved CEs) - for hangul, copying resolved
 992         // would just make things the same (there is an expansion and it
 993         // takes approximately the same amount of time to resolve as
 994         // falling back to the UCA).
 995         /*
 996         UTRIE_GET32(src->UCA->mapping, u, CE);
 997         tag = getCETag(CE);
 998         if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
 999           || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1000           || tag == LEAD_SURROGATE_TAG) {
1001           el.CEs[el.noOfCEs++] = CE;
1002         } else {
1003         */
1004         // It turns out that it does not make sense to keep implicits
1005         // unresolved. The cost of resolving them is big enough so that
1006         // it doesn't make any difference whether we have to go to the UCA
1007         // or not.
1008         {
1009           uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1010           while(CE != UCOL_NO_MORE_CES) {
1011             CE = ucol_getNextCE(src->UCA, &colIt, status);
1012             if(CE != UCOL_NO_MORE_CES) {
1013               el.CEs[el.noOfCEs++] = CE;
1014             }
1015           }
1016         }
1017         uprv_uca_addAnElement(t, &el, status);
1018       }
1019     }
1020   }
1021 }
1022
1023 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1024   uint32_t i = 0;
1025   if(U_FAILURE(*status)) {
1026     return NULL;
1027   }
1028 /*
1029 2.  Eliminate the negative lists by doing the following for each non-null negative list:
1030     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1031     create new ListHeader X
1032     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1033     first item you add, based on the stronger strength levels of the two lists.
1034 */
1035 /*
1036 3.  For each ListHeader with a non-null positive list:
1037 */
1038 /*
1039     o   Find all character strings with CEs between the baseCE and the
1040     next/previous CE, at the strength of the first token. Add these to the
1041     tailoring.
1042       ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1043       tailoring has & x < z...
1044       ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1045 */
1046   /* It is possible that this part should be done even while constructing list */
1047   /* The problem is that it is unknown what is going to be the strongest weight */
1048   /* So we might as well do it here */
1049
1050 /*
1051     o   Allocate CEs for each token in the list, based on the total number N of the
1052     largest level difference, and the gap G between baseCE and nextCE at that
1053     level. The relation * between the last item and nextCE is the same as the
1054     strongest strength.
1055     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1056       ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1057       Then fit b and c into the secondary gap between a and d, then fit q
1058       into the tertiary gap between b and c.
1059
1060     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1061       ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1062       Then fit q into the tertiary gap between b and c.
1063     o   When incrementing primary values, we will not cross high byte
1064     boundaries except where there is only a single-byte primary. That is to
1065     ensure that the script reordering will continue to work.
1066 */
1067   UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1068   /* test for NULL */
1069   if (image == NULL) {
1070     *status = U_MEMORY_ALLOCATION_ERROR;
1071     return NULL;
1072   }
1073   uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1074
1075   for(i = 0; i<src->resultLen; i++) {
1076     /* now we need to generate the CEs */
1077     /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1078     /* According to strength                                                          */
1079     if(U_SUCCESS(*status)) {
1080       if(src->lh[i].first) { // if there are any elements
1081         // due to the way parser works, subsequent tailorings
1082         // may remove all the elements from a sequence, therefore
1083         // leaving an empty tailoring sequence.
1084         ucol_initBuffers(src, &src->lh[i], status);
1085       }
1086     }
1087     if(U_FAILURE(*status)) {
1088       return NULL;
1089     }
1090
1091   }
1092
1093   if(src->varTop != NULL) { /* stuff the variable top value */
1094     src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1095     /* remove it from the list */
1096     if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1097       src->varTop->listHeader->first = src->varTop->next;
1098     }
1099     if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1100       src->varTop->listHeader->last = src->varTop->previous;
1101     }
1102     if(src->varTop->next != NULL) {
1103       src->varTop->next->previous = src->varTop->previous;
1104     }
1105     if(src->varTop->previous != NULL) {
1106       src->varTop->previous->next = src->varTop->next;
1107     }
1108   }
1109
1110
1111   tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1112
1113
1114   /* After this, we have assigned CE values to all regular CEs      */
1115   /* now we will go through list once more and resolve expansions,  */
1116   /* make UCAElements structs and add them to table                 */
1117   for(i = 0; i<src->resultLen; i++) {
1118     /* now we need to generate the CEs */
1119     /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1120     /* According to strength                                                          */
1121     if(U_SUCCESS(*status)) {
1122       ucol_createElements(src, t, &src->lh[i], status);
1123     }
1124   }
1125
1126   UCAElements el;
1127   el.isThai = FALSE;
1128   el.prefixSize = 0;
1129   el.prefixChars[0] = 0;
1130
1131   /* add latin-1 stuff */
1132   ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1133
1134   /* add stuff for copying */
1135   if(src->copySet != NULL) {
1136     int32_t i = 0;
1137     UnicodeSet *set = (UnicodeSet *)src->copySet;
1138     for(i = 0; i < set->getRangeCount(); i++) {
1139       ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1140     }
1141   }
1142
1143   if(U_SUCCESS(*status)) {
1144     /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1145
1146     uint32_t tailoredCE = UCOL_NOT_FOUND;
1147     //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1148     UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1149     UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1150     while(*conts != 0) {
1151       /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1152       tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1153       if(tailoredCE != UCOL_NOT_FOUND) {
1154         UBool needToAdd = TRUE;
1155         if(isCntTableElement(tailoredCE)) {
1156           if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1157             needToAdd = FALSE;
1158           }
1159         }
1160         if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1161           needToAdd = FALSE;
1162         }
1163
1164         if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1165           el.prefix = el.prefixChars;
1166           el.prefixSize = 0;
1167           el.cPoints = el.uchars;
1168           el.noOfCEs = 0;
1169           el.uchars[0] = *conts;
1170           el.uchars[1] = *(conts+1);
1171           if(*(conts+2)!=0) {
1172             el.uchars[2] = *(conts+2);
1173             el.cSize = 3;
1174           } else {
1175             el.cSize = 2;
1176           }
1177           ucol_setText(ucaEl, el.uchars, el.cSize, status);
1178           while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1179             el.noOfCEs++;
1180           }
1181           uprv_uca_addAnElement(t, &el, status);
1182         }
1183
1184       } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1185         ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1186       }
1187       conts+=3;
1188     }
1189     ucol_closeElements(ucaEl);
1190   }
1191
1192   // Add completely ignorable elements
1193   utrie_enum(t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1194
1195
1196   // canonical closure
1197   uprv_uca_canonicalClosure(t, status);
1198
1199
1200     /* still need to produce compatibility closure */
1201
1202   UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1203
1204   uprv_uca_closeTempTable(t);
1205   uprv_free(image);
1206
1207   return myData;
1208 }
1209
1210 U_CDECL_BEGIN
1211 static UBool U_CALLCONV
1212 ucol_bld_cleanup(void)
1213 {
1214     udata_close(invUCA_DATA_MEM);
1215     invUCA_DATA_MEM = NULL;
1216     _staticInvUCA = NULL;
1217     return TRUE;
1218 }
1219 U_CDECL_END
1220
1221 U_CAPI const InverseUCATableHeader * U_EXPORT2
1222 ucol_initInverseUCA(UErrorCode *status)
1223 {
1224     if(U_FAILURE(*status)) return NULL;
1225
1226     umtx_lock(NULL);
1227     UBool f = (_staticInvUCA == NULL);
1228     umtx_unlock(NULL);
1229
1230     if(f) {
1231         InverseUCATableHeader *newInvUCA = NULL;
1232         UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1233
1234         if(U_FAILURE(*status)) {
1235             if (result) {
1236                 udata_close(result);
1237             }
1238             // This is not needed, as we are talking about
1239             // memory we got from UData
1240             //uprv_free(newInvUCA);
1241         }
1242
1243         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1244             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1245             UCollator *UCA = ucol_initUCA(status);
1246             // UCA versions of UCA and inverse UCA should match
1247             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1248               *status = U_INVALID_FORMAT_ERROR;
1249               udata_close(result);
1250               return NULL;
1251             }
1252
1253             umtx_lock(NULL);
1254             if(_staticInvUCA == NULL) {
1255                 _staticInvUCA = newInvUCA;
1256                 invUCA_DATA_MEM = result;
1257                 result = NULL;
1258                 newInvUCA = NULL;
1259             }
1260             umtx_unlock(NULL);
1261
1262             if(newInvUCA != NULL) {
1263                 udata_close(result);
1264                 // This is not needed, as we are talking about
1265                 // memory we got from UData
1266                 //uprv_free(newInvUCA);
1267             }
1268             else {
1269                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1270             }
1271         }
1272     }
1273     return _staticInvUCA;
1274 }
1275
1276 #endif /* #if !UCONFIG_NO_COLLATION */