icuSources/i18n/ucol_bld.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2006, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucol_bld.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 * This module builds a collator based on the rule set.
  17 *
  18 */
  19
  20 #include "unicode/utypes.h"
  21
  22 #if !UCONFIG_NO_COLLATION
  23
  24 #include "unicode/ucoleitr.h"
  25 #include "unicode/uchar.h"
  26 #include "ucol_bld.h"
  27 #include "ucln_in.h"
  28 #include "umutex.h"
  29 #include "unicode/uniset.h"
  30
  31 static const InverseUCATableHeader* _staticInvUCA = NULL;
  32 static UDataMemory* invUCA_DATA_MEM = NULL;
  33
  34 U_CDECL_BEGIN
  35 static UBool U_CALLCONV
  36 isAcceptableInvUCA(void * /*context*/,
  37              const char * /*type*/, const char * /*name*/,
  38              const UDataInfo *pInfo){
  39   /* context, type & name are intentionally not used */
  40     if( pInfo->size>=20 &&
  41         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
  42         pInfo->charsetFamily==U_CHARSET_FAMILY &&
  43         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
  44         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
  45         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
  46         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
  47         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
  48         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
  49         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
  50         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
  51         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
  52         ) {
  53         UVersionInfo UCDVersion;
  54         u_getUnicodeVersion(UCDVersion);
  55         if(pInfo->dataVersion[0]==UCDVersion[0] &&
  56         pInfo->dataVersion[1]==UCDVersion[1]) {
  57         //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
  58         //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
  59         //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
  60           return TRUE;
  61         } else {
  62           return FALSE;
  63         }
  64     } else {
  65         return FALSE;
  66     }
  67 }
  68 U_CDECL_END
  69
  70 /*
  71  * Takes two CEs (lead and continuation) and
  72  * compares them as CEs should be compared:
  73  * primary vs. primary, secondary vs. secondary
  74  * tertiary vs. tertiary
  75  */
  76 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
  77   uint32_t s1 = source0, s2, t1 = target0, t2;
  78   if(isContinuation(source1)) {
  79     s2 = source1;
  80   } else {
  81     s2 = 0;
  82   }
  83   if(isContinuation(target1)) {
  84     t2 = target1;
  85   } else {
  86     t2 = 0;
  87   }
  88
  89   uint32_t s = 0, t = 0;
  90   if(s1 == t1 && s2 == t2) {
  91     return 0;
  92   }
  93   s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
  94   t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
  95   if(s < t) {
  96     return -1;
  97   } else if(s > t) {
  98     return 1;
  99   } else {
 100     s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
 101     t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
 102     if(s < t) {
 103       return -1;
 104     } else if(s > t) {
 105       return 1;
 106     } else {
 107       s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
 108       t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
 109       if(s < t) {
 110         return -1;
 111       } else {
 112         return 1;
 113       }
 114     }
 115   }
 116 }
 117
 118 static
 119 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
 120   uint32_t bottom = 0, top = src->invUCA->tableSize;
 121   uint32_t i = 0;
 122   uint32_t first = 0, second = 0;
 123   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 124   int32_t res = 0;
 125
 126   while(bottom < top-1) {
 127     i = (top+bottom)/2;
 128     first = *(CETable+3*i);
 129     second = *(CETable+3*i+1);
 130     res = compareCEs(first, second, CE, SecondCE);
 131     if(res > 0) {
 132       top = i;
 133     } else if(res < 0) {
 134       bottom = i;
 135     } else {
 136       break;
 137     }
 138   }
 139
 140   /* weiv:                                                  */
 141   /* in searching for elements, I have removed the failure  */
 142   /* The reason for this is that the builder does not rely  */
 143   /* on search mechanism telling it that it didn't find an  */
 144   /* element. However, indirect positioning relies on being */
 145   /* able to find the elements around any CE, even if it is */
 146   /* not defined in the UCA. */
 147   return i;
 148 /*
 149   if((first == CE && second == SecondCE)) {
 150     return i;
 151   } else {
 152     return -1;
 153   }
 154 */
 155 }
 156
 157 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
 158   0xFFFF0000,
 159   0xFFFFFF00,
 160   0xFFFFFFFF
 161 };
 162
 163 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
 164                                             uint32_t CE, uint32_t contCE,
 165                                             uint32_t *nextCE, uint32_t *nextContCE,
 166                                             uint32_t strength) {
 167   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 168   int32_t iCE;
 169
 170   iCE = ucol_inv_findCE(src, CE, contCE);
 171
 172   if(iCE<0) {
 173     *nextCE = UCOL_NOT_FOUND;
 174     return -1;
 175   }
 176
 177   CE &= strengthMask[strength];
 178   contCE &= strengthMask[strength];
 179
 180   *nextCE = CE;
 181   *nextContCE = contCE;
 182
 183   while((*nextCE  & strengthMask[strength]) == CE
 184     && (*nextContCE  & strengthMask[strength]) == contCE) {
 185     *nextCE = (*(CETable+3*(++iCE)));
 186     *nextContCE = (*(CETable+3*(iCE)+1));
 187   }
 188
 189   return iCE;
 190 }
 191
 192 U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
 193                                             uint32_t CE, uint32_t contCE,
 194                                             uint32_t *prevCE, uint32_t *prevContCE,
 195                                             uint32_t strength) {
 196   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 197   int32_t iCE;
 198
 199   iCE = ucol_inv_findCE(src, CE, contCE);
 200
 201   if(iCE<0) {
 202     *prevCE = UCOL_NOT_FOUND;
 203     return -1;
 204   }
 205
 206   CE &= strengthMask[strength];
 207   contCE &= strengthMask[strength];
 208
 209   *prevCE = CE;
 210   *prevContCE = contCE;
 211
 212   while((*prevCE  & strengthMask[strength]) == CE
 213     && (*prevContCE  & strengthMask[strength])== contCE
 214     && iCE > 0) { /* this condition should prevent falling off the edge of the world */
 215     /* here, we end up in a singularity - zero */
 216     *prevCE = (*(CETable+3*(--iCE)));
 217     *prevContCE = (*(CETable+3*(iCE)+1));
 218   }
 219
 220   return iCE;
 221 }
 222
 223 U_CAPI uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
 224                                             uint32_t prevCE, uint32_t prevContCE)
 225 {
 226     if(prevCE == CE && prevContCE == contCE) {
 227       return UCOL_IDENTICAL;
 228     }
 229     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
 230       || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) {
 231       return UCOL_PRIMARY;
 232     }
 233     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
 234       || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) {
 235       return UCOL_SECONDARY;
 236     }
 237     return UCOL_TERTIARY;
 238 }
 239
 240
 241 static
 242 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 243
 244   uint32_t CE = lh->baseCE;
 245   uint32_t SecondCE = lh->baseContCE;
 246
 247   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 248   uint32_t previousCE, previousContCE;
 249   int32_t iCE;
 250
 251   iCE = ucol_inv_findCE(src, CE, SecondCE);
 252
 253   if(iCE<0) {
 254     return -1;
 255   }
 256
 257   CE &= strengthMask[strength];
 258   SecondCE &= strengthMask[strength];
 259
 260   previousCE = CE;
 261   previousContCE = SecondCE;
 262
 263   while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
 264     previousCE = (*(CETable+3*(--iCE)));
 265     previousContCE = (*(CETable+3*(iCE)+1));
 266   }
 267   lh->previousCE = previousCE;
 268   lh->previousContCE = previousContCE;
 269
 270   return iCE;
 271 }
 272
 273 static
 274 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
 275   uint32_t CE = lh->baseCE;
 276   uint32_t SecondCE = lh->baseContCE;
 277
 278   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 279   uint32_t nextCE, nextContCE;
 280   int32_t iCE;
 281
 282   iCE = ucol_inv_findCE(src, CE, SecondCE);
 283
 284   if(iCE<0) {
 285     return -1;
 286   }
 287
 288   CE &= strengthMask[strength];
 289   SecondCE &= strengthMask[strength];
 290
 291   nextCE = CE;
 292   nextContCE = SecondCE;
 293
 294   while((nextCE  & strengthMask[strength]) == CE
 295     && (nextContCE  & strengthMask[strength]) == SecondCE) {
 296     nextCE = (*(CETable+3*(++iCE)));
 297     nextContCE = (*(CETable+3*(iCE)+1));
 298   }
 299
 300   lh->nextCE = nextCE;
 301   lh->nextContCE = nextContCE;
 302
 303   return iCE;
 304 }
 305
 306 U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 307   /* reset all the gaps */
 308   int32_t i = 0;
 309   uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
 310   uint32_t st = 0;
 311   uint32_t t1, t2;
 312   int32_t pos;
 313
 314   UColToken *tok = lh->first;
 315   uint32_t tokStrength = tok->strength;
 316
 317   for(i = 0; i<3; i++) {
 318     lh->gapsHi[3*i] = 0;
 319     lh->gapsHi[3*i+1] = 0;
 320     lh->gapsHi[3*i+2] = 0;
 321     lh->gapsLo[3*i] = 0;
 322     lh->gapsLo[3*i+1] = 0;
 323     lh->gapsLo[3*i+2] = 0;
 324     lh->numStr[i] = 0;
 325     lh->fStrToken[i] = NULL;
 326     lh->lStrToken[i] = NULL;
 327     lh->pos[i] = -1;
 328   }
 329
 330   UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
 331
 332   if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
 333   //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
 334     lh->pos[0] = 0;
 335     t1 = lh->baseCE;
 336     t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
 337     lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 338     lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 339     lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 340     uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
 341     primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
 342
 343     t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
 344     t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
 345
 346     lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 347     lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 348     lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 349   } else if(lh->indirect == TRUE && lh->nextCE != 0) {
 350   //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
 351     lh->pos[0] = 0;
 352     t1 = lh->baseCE;
 353     t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
 354     lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 355     lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 356     lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 357     t1 = lh->nextCE;
 358     t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
 359     lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 360     lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 361     lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 362   } else {
 363     for(;;) {
 364       if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 365         if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
 366           lh->fStrToken[tokStrength] = tok;
 367         } else { /* The CE must be implicit, since it's not in the table */
 368           /* Error */
 369           *status = U_INTERNAL_PROGRAM_ERROR;
 370         }
 371       }
 372
 373       while(tok != NULL && tok->strength >= tokStrength) {
 374         if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
 375           lh->lStrToken[tokStrength] = tok;
 376         }
 377         tok = tok->next;
 378       }
 379       if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
 380         /* check if previous interval is the same and merge the intervals if it is so */
 381         if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
 382           lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
 383           lh->fStrToken[tokStrength+1] = NULL;
 384           lh->lStrToken[tokStrength+1] = NULL;
 385           lh->pos[tokStrength+1] = -1;
 386         }
 387       }
 388       if(tok != NULL) {
 389         tokStrength = tok->strength;
 390       } else {
 391         break;
 392       }
 393     }
 394     for(st = 0; st < 3; st++) {
 395       if((pos = lh->pos[st]) >= 0) {
 396         t1 = *(CETable+3*(pos));
 397         t2 = *(CETable+3*(pos)+1);
 398         lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 399         lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 400         //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
 401         lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 402         //pos--;
 403         //t1 = *(CETable+3*(pos));
 404         //t2 = *(CETable+3*(pos)+1);
 405         t1 = lh->baseCE;
 406         t2 = lh->baseContCE;
 407         lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
 408         lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
 409         lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
 410       }
 411     }
 412   }
 413 }
 414
 415
 416 #define ucol_countBytes(value, noOfBytes)   \
 417 {                               \
 418   uint32_t mask = 0xFFFFFFFF;   \
 419   (noOfBytes) = 0;              \
 420   while(mask != 0) {            \
 421     if(((value) & mask) != 0) { \
 422       (noOfBytes)++;            \
 423     }                           \
 424     mask >>= 8;                 \
 425   }                             \
 426 }
 427
 428 U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
 429   if(U_SUCCESS(*status)) {
 430   g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 431   }
 432   return g->current;
 433 }
 434
 435 U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
 436 /* TODO: rename to enum names */
 437   uint32_t high, low, count=1;
 438   uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
 439
 440   if(strength == UCOL_SECONDARY) {
 441     low = UCOL_COMMON_TOP2<<24;
 442     high = 0xFFFFFFFF;
 443     count = 0xFF - UCOL_COMMON_TOP2;
 444   } else {
 445     low = UCOL_BYTE_COMMON << 24; //0x05000000;
 446     high = 0x40000000;
 447     count = 0x40 - UCOL_BYTE_COMMON;
 448   }
 449
 450   if(tok->next != NULL && tok->next->strength == strength) {
 451     count = tok->next->toInsert;
 452   }
 453
 454   g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 455   g->current = UCOL_BYTE_COMMON<<24;
 456
 457   if(g->noOfRanges == 0) {
 458     *status = U_INTERNAL_PROGRAM_ERROR;
 459   }
 460   return g->current;
 461 }
 462
 463 U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
 464   uint32_t strength = tok->strength;
 465   uint32_t low = lows[fStrength*3+strength];
 466   uint32_t high = highs[fStrength*3+strength];
 467   uint32_t maxByte = 0;
 468   if(strength == UCOL_TERTIARY) {
 469       maxByte = 0x3F;
 470   } else if(strength == UCOL_PRIMARY) {
 471       maxByte = 0xFE;
 472   } else {
 473       maxByte = 0xFF;
 474   }
 475
 476   uint32_t count = tok->toInsert;
 477
 478   if(low >= high && strength > UCOL_PRIMARY) {
 479     int32_t s = strength;
 480     for(;;) {
 481       s--;
 482       if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
 483         if(strength == UCOL_SECONDARY) {
 484           low = UCOL_COMMON_TOP2<<24;
 485           high = 0xFFFFFFFF;
 486         } else {
 487           //low = 0x02000000; // This needs to be checked - what if low is
 488           // not good...
 489           high = 0x40000000;
 490         }
 491         break;
 492       }
 493       if(s<0) {
 494         *status = U_INTERNAL_PROGRAM_ERROR;
 495         return 0;
 496       }
 497     }
 498   }
 499
 500   if(low == 0) {
 501     low = 0x01000000;
 502   }
 503
 504   if(strength == UCOL_SECONDARY) { /* similar as simple */
 505     if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 506       low = UCOL_COMMON_TOP2<<24;
 507     }
 508     if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
 509       high = UCOL_COMMON_TOP2<<24;
 510     }
 511     if(low < (UCOL_COMMON_BOT2<<24)) {
 512       g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
 513       g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 514       //g->current = UCOL_COMMON_BOT2<<24;
 515       return g->current;
 516     }
 517   }
 518
 519   g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
 520   if(g->noOfRanges == 0) {
 521     *status = U_INTERNAL_PROGRAM_ERROR;
 522   }
 523   g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
 524   return g->current;
 525 }
 526
 527 static
 528 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 529   uint32_t i = 0;
 530   UChar c;
 531
 532   if(U_FAILURE(*status)) {
 533     return 0;
 534   }
 535
 536   if(sourceLen > resLen) {
 537     *status = U_MEMORY_ALLOCATION_ERROR;
 538     return 0;
 539   }
 540
 541   for(i = 0; i < sourceLen; i++) {
 542     c = source[i];
 543     if(0x3042 < c && c < 0x30ef) { /* Kana range */
 544       switch(c - 0x3000) {
 545       case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
 546       case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
 547         c++;
 548         break;
 549       case 0xF5:
 550         c = 0x30AB;
 551         break;
 552       case 0xF6:
 553         c = 0x30B1;
 554         break;
 555       }
 556     }
 557     resBuf[i] = c;
 558   }
 559   return sourceLen;
 560 }
 561
 562 static
 563 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
 564   uint32_t i = 0;
 565   UChar c;
 566
 567   if(U_FAILURE(*status)) {
 568     return 0;
 569   }
 570
 571   if(sourceLen > resLen) {
 572     *status = U_MEMORY_ALLOCATION_ERROR;
 573     return 0;
 574   }
 575
 576   for(i = 0; i < sourceLen; i++) {
 577     c = source[i];
 578     if(0x3042 < c && c < 0x30ef) { /* Kana range */
 579       switch(c - 0x3000) {
 580       case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
 581       case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
 582         c--;
 583         break;
 584       case 0xAB:
 585         c = 0x30F5;
 586         break;
 587       case 0xB1:
 588         c = 0x30F6;
 589         break;
 590       }
 591     }
 592     resBuf[i] = c;
 593   }
 594   return sourceLen;
 595 }
 596
 597 static
 598 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
 599   uint32_t i = 0;
 600   UChar n[128];
 601   uint32_t nLen = 0;
 602   uint32_t uCount = 0, lCount = 0;
 603
 604   collIterate s;
 605   uint32_t order = 0;
 606
 607   if(U_FAILURE(*status)) {
 608     return UCOL_LOWER_CASE;
 609   }
 610
 611   nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
 612   if(U_SUCCESS(*status)) {
 613     for(i = 0; i < nLen; i++) {
 614       uprv_init_collIterate(UCA, &n[i], 1, &s);
 615       order = ucol_getNextCE(UCA, &s, status);
 616       if(isContinuation(order)) {
 617         *status = U_INTERNAL_PROGRAM_ERROR;
 618         return UCOL_LOWER_CASE;
 619       }
 620       if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
 621         uCount++;
 622       } else {
 623         if(u_islower(n[i])) {
 624           lCount++;
 625         } else {
 626           UChar sk[1], lk[1];
 627           u_toSmallKana(&n[i], 1, sk, 1, status);
 628           u_toLargeKana(&n[i], 1, lk, 1, status);
 629           if(sk[0] == n[i] && lk[0] != n[i]) {
 630             lCount++;
 631           }
 632         }
 633       }
 634     }
 635   }
 636
 637   if(uCount != 0 && lCount != 0) {
 638     return UCOL_MIXED_CASE;
 639   } else if(uCount != 0) {
 640     return UCOL_UPPER_CASE;
 641   } else {
 642     return UCOL_LOWER_CASE;
 643   }
 644 }
 645
 646
 647 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
 648   /* this one makes the table and stuff */
 649   uint32_t noOfBytes[3];
 650   uint32_t i;
 651
 652   for(i = 0; i<3; i++) {
 653     ucol_countBytes(CEparts[i], noOfBytes[i]);
 654   }
 655
 656   /* Here we have to pack CEs from parts */
 657
 658   uint32_t CEi = 0;
 659   uint32_t value = 0;
 660
 661   while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
 662     if(CEi > 0) {
 663       value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
 664     } else {
 665       value = 0;
 666     }
 667
 668     if(2*CEi<noOfBytes[0]) {
 669       value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
 670     }
 671     if(CEi<noOfBytes[1]) {
 672       value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
 673     }
 674     if(CEi<noOfBytes[2]) {
 675       value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
 676     }
 677     tok->CEs[CEi] = value;
 678     CEi++;
 679   }
 680   if(CEi == 0) { /* totally ignorable */
 681     tok->noOfCEs = 1;
 682     tok->CEs[0] = 0;
 683   } else { /* there is at least something */
 684     tok->noOfCEs = CEi;
 685   }
 686
 687
 688   // we want to set case bits here and now, not later.
 689   // Case bits handling
 690   if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
 691     tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
 692     int32_t cSize = (tok->source & 0xFF000000) >> 24;
 693     UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
 694
 695     if(cSize > 1) {
 696       // Do it manually
 697       tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
 698     } else {
 699       // Copy it from the UCA
 700       uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
 701       tok->CEs[0] |= (caseCE & 0xC0);
 702     }
 703   }
 704
 705 #if UCOL_DEBUG==2
 706   fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
 707   for(i = 0; i<tok->noOfCEs; i++) {
 708     fprintf(stderr, "%08X ", tok->CEs[i]);
 709   }
 710   fprintf(stderr, "\n");
 711 #endif
 712 }
 713
 714 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
 715   ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
 716   uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
 717
 718   UColToken *tok = lh->last;
 719   uint32_t t[UCOL_STRENGTH_LIMIT];
 720
 721   uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
 722
 723   tok->toInsert = 1;
 724   t[tok->strength] = 1;
 725
 726   while(tok->previous != NULL) {
 727     if(tok->previous->strength < tok->strength) { /* going up */
 728       t[tok->strength] = 0;
 729       t[tok->previous->strength]++;
 730     } else if(tok->previous->strength > tok->strength) { /* going down */
 731       t[tok->previous->strength] = 1;
 732     } else {
 733       t[tok->strength]++;
 734     }
 735     tok=tok->previous;
 736     tok->toInsert = t[tok->strength];
 737   }
 738
 739   tok->toInsert = t[tok->strength];
 740   ucol_inv_getGapPositions(src, lh, status);
 741
 742 #if UCOL_DEBUG
 743   fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
 744   int32_t j = 2;
 745   for(j = 2; j >= 0; j--) {
 746     fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
 747     fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
 748   }
 749   tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
 750
 751   do {
 752     fprintf(stderr,"%i", tok->strength);
 753     tok = tok->next;
 754   } while(tok != NULL);
 755   fprintf(stderr, "\n");
 756
 757   tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
 758
 759   do {
 760     fprintf(stderr,"%i", tok->toInsert);
 761     tok = tok->next;
 762   } while(tok != NULL);
 763 #endif
 764
 765   tok = lh->first;
 766   uint32_t fStrength = UCOL_IDENTICAL;
 767   uint32_t initStrength = UCOL_IDENTICAL;
 768
 769
 770   CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
 771   CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
 772   CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
 773
 774   while (tok != NULL && U_SUCCESS(*status)) {
 775     fStrength = tok->strength;
 776     if(fStrength < initStrength) {
 777       initStrength = fStrength;
 778       if(lh->pos[fStrength] == -1) {
 779         while(lh->pos[fStrength] == -1 && fStrength > 0) {
 780           fStrength--;
 781         }
 782         if(lh->pos[fStrength] == -1) {
 783           *status = U_INTERNAL_PROGRAM_ERROR;
 784           return;
 785         }
 786       }
 787       if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
 788         CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 789         CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
 790         /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
 791         CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
 792       } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
 793         CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
 794         /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
 795         CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 796         CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 797       } else { /* primaries */
 798         /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
 799         CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
 800         CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 801         CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 802       }
 803     } else {
 804       if(tok->strength == UCOL_TERTIARY) {
 805         CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
 806       } else if(tok->strength == UCOL_SECONDARY) {
 807         CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
 808         CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 809       } else if(tok->strength == UCOL_PRIMARY) {
 810         CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
 811         CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
 812         CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
 813       }
 814     }
 815     ucol_doCE(src, CEparts, tok, status);
 816     tok = tok->next;
 817   }
 818 }
 819
 820 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
 821   UCAElements el;
 822   UColToken *tok = lh->first;
 823   UColToken *expt = NULL;
 824   uint32_t i = 0, j = 0;
 825
 826   while(tok != NULL && U_SUCCESS(*status)) {
 827     /* first, check if there are any expansions */
 828     /* if there are expansions, we need to do a little bit more processing */
 829     /* since parts of expansion can be tailored, while others are not */
 830     if(tok->expansion != 0) {
 831       uint32_t len = tok->expansion >> 24;
 832       uint32_t currentSequenceLen = len;
 833       uint32_t expOffset = tok->expansion & 0x00FFFFFF;
 834       //uint32_t exp = currentSequenceLen | expOffset;
 835       UColToken exp;
 836       exp.source = currentSequenceLen | expOffset;
 837       exp.rulesToParse = src->source;
 838
 839       while(len > 0) {
 840         currentSequenceLen = len;
 841         while(currentSequenceLen > 0) {
 842           exp.source = (currentSequenceLen << 24) | expOffset;
 843           if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
 844             uint32_t noOfCEsToCopy = expt->noOfCEs;
 845             for(j = 0; j<noOfCEsToCopy; j++) {
 846               tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
 847             }
 848             tok->noOfExpCEs += noOfCEsToCopy;
 849             // Smart people never try to add codepoints and CEs.
 850             // For some odd reason, it won't work.
 851             expOffset += currentSequenceLen; //noOfCEsToCopy;
 852             len -= currentSequenceLen; //noOfCEsToCopy;
 853             break;
 854           } else {
 855             currentSequenceLen--;
 856           }
 857         }
 858         if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
 859           /* will have to get one from UCA */
 860           /* first, get the UChars from the rules */
 861           /* then pick CEs out until there is no more and stuff them into expansion */
 862           collIterate s;
 863           uint32_t order = 0;
 864           uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
 865
 866           for(;;) {
 867             order = ucol_getNextCE(src->UCA, &s, status);
 868             if(order == UCOL_NO_MORE_CES) {
 869                 break;
 870             }
 871             tok->expCEs[tok->noOfExpCEs++] = order;
 872           }
 873           expOffset++;
 874           len--;
 875         }
 876       }
 877     } else {
 878       tok->noOfExpCEs = 0;
 879     }
 880
 881     /* set the ucaelement with obtained values */
 882     el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
 883     /* copy CEs */
 884     for(i = 0; i<tok->noOfCEs; i++) {
 885       el.CEs[i] = tok->CEs[i];
 886     }
 887     for(i = 0; i<tok->noOfExpCEs; i++) {
 888       el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
 889     }
 890
 891     /* copy UChars */
 892     // We kept prefix and source kind of together, as it is a kind of a contraction.
 893     // However, now we have to slice the prefix off the main thing -
 894     el.prefix = el.prefixChars;
 895     el.cPoints = el.uchars;
 896     if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
 897       // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
 898       // decomposed elements to the unsaf table.
 899       el.prefixSize = tok->prefix>>24;
 900       uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
 901
 902       el.cSize = (tok->source >> 24)-(tok->prefix>>24);
 903       uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
 904     } else {
 905       el.prefixSize = 0;
 906       *el.prefix = 0;
 907
 908       el.cSize = (tok->source >> 24);
 909       uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
 910     }
 911     if(src->UCA != NULL) {
 912       for(i = 0; i<el.cSize; i++) {
 913         if(UCOL_ISJAMO(el.cPoints[i])) {
 914           t->image->jamoSpecial = TRUE;
 915         }
 916       }
 917     }
 918
 919     /* and then, add it */
 920 #if UCOL_DEBUG==2
 921     fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
 922 #endif
 923     uprv_uca_addAnElement(t, &el, status);
 924
 925 #if UCOL_DEBUG_DUPLICATES
 926     if(*status != U_ZERO_ERROR) {
 927       fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
 928       *status = U_ZERO_ERROR;
 929     }
 930 #endif
 931
 932     tok = tok->next;
 933   }
 934 }
 935
 936 U_CDECL_BEGIN
 937 static UBool U_CALLCONV
 938 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
 939   UErrorCode status = U_ZERO_ERROR;
 940   tempUCATable *t = (tempUCATable *)context;
 941   if(value == 0) {
 942     while(start < limit) {
 943       uint32_t CE = utrie_get32(t->mapping, start, NULL);
 944       if(CE == UCOL_NOT_FOUND) {
 945         UCAElements el;
 946         el.isThai = FALSE;
 947         el.prefixSize = 0;
 948         el.prefixChars[0] = 0;
 949         el.prefix = el.prefixChars;
 950         el.cPoints = el.uchars;
 951
 952         el.cSize = 0;
 953         UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
 954
 955         el.noOfCEs = 1;
 956         el.CEs[0] = 0;
 957         uprv_uca_addAnElement(t, &el, &status);
 958
 959       }
 960       start++;
 961     }
 962   }
 963   if(U_FAILURE(status)) {
 964     return FALSE;
 965   } else {
 966     return TRUE;
 967   }
 968 }
 969 U_CDECL_END
 970
 971 static void
 972 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
 973                                UChar32 start, UChar32 end,
 974                                UErrorCode *status) {
 975   //UChar decomp[256];
 976   uint32_t CE = UCOL_NOT_FOUND;
 977   UChar32 u = 0;
 978   UCAElements el;
 979   el.isThai = FALSE;
 980   el.prefixSize = 0;
 981   el.prefixChars[0] = 0;
 982   collIterate colIt;
 983
 984   if(U_SUCCESS(*status)) {
 985     for(u = start; u<=end; u++) {
 986       if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
 987         /* this test is for contractions that are missing the starting element. */
 988          || ((isCntTableElement(CE)) &&
 989         (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
 990         ) {
 991         el.cSize = 0;
 992         U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
 993         //decomp[0] = (UChar)u;
 994         //el.uchars[0] = (UChar)u;
 995         el.cPoints = el.uchars;
 996         //el.cSize = 1;
 997         el.noOfCEs = 0;
 998         el.prefix = el.prefixChars;
 999         el.prefixSize = 0;
1000         //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1001         // We actually want to check whether this element is a special
1002         // If it is an implicit element (hangul, CJK - we want to copy the
1003         // special, not the resolved CEs) - for hangul, copying resolved
1004         // would just make things the same (there is an expansion and it
1005         // takes approximately the same amount of time to resolve as
1006         // falling back to the UCA).
1007         /*
1008         UTRIE_GET32(src->UCA->mapping, u, CE);
1009         tag = getCETag(CE);
1010         if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1011           || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1012           || tag == LEAD_SURROGATE_TAG) {
1013           el.CEs[el.noOfCEs++] = CE;
1014         } else {
1015         */
1016         // It turns out that it does not make sense to keep implicits
1017         // unresolved. The cost of resolving them is big enough so that
1018         // it doesn't make any difference whether we have to go to the UCA
1019         // or not.
1020         {
1021           uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1022           while(CE != UCOL_NO_MORE_CES) {
1023             CE = ucol_getNextCE(src->UCA, &colIt, status);
1024             if(CE != UCOL_NO_MORE_CES) {
1025               el.CEs[el.noOfCEs++] = CE;
1026             }
1027           }
1028         }
1029         uprv_uca_addAnElement(t, &el, status);
1030       }
1031     }
1032   }
1033 }
1034
1035 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1036   uint32_t i = 0;
1037   if(U_FAILURE(*status)) {
1038     return NULL;
1039   }
1040 /*
1041 2.  Eliminate the negative lists by doing the following for each non-null negative list:
1042     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1043     create new ListHeader X
1044     o   reverse the list, add to the end of X's positive list. Reset the strength of the
1045     first item you add, based on the stronger strength levels of the two lists.
1046 */
1047 /*
1048 3.  For each ListHeader with a non-null positive list:
1049 */
1050 /*
1051     o   Find all character strings with CEs between the baseCE and the
1052     next/previous CE, at the strength of the first token. Add these to the
1053     tailoring.
1054       ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
1055       tailoring has & x < z...
1056       ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
1057 */
1058   /* It is possible that this part should be done even while constructing list */
1059   /* The problem is that it is unknown what is going to be the strongest weight */
1060   /* So we might as well do it here */
1061
1062 /*
1063     o   Allocate CEs for each token in the list, based on the total number N of the
1064     largest level difference, and the gap G between baseCE and nextCE at that
1065     level. The relation * between the last item and nextCE is the same as the
1066     strongest strength.
1067     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1068       ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1069       Then fit b and c into the secondary gap between a and d, then fit q
1070       into the tertiary gap between b and c.
1071
1072     o   Example: baseCE << b <<< q << c * nextCE(X,2)
1073       ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1074       Then fit q into the tertiary gap between b and c.
1075     o   When incrementing primary values, we will not cross high byte
1076     boundaries except where there is only a single-byte primary. That is to
1077     ensure that the script reordering will continue to work.
1078 */
1079   UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1080   /* test for NULL */
1081   if (image == NULL) {
1082     *status = U_MEMORY_ALLOCATION_ERROR;
1083     return NULL;
1084   }
1085   uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1086
1087   for(i = 0; i<src->resultLen; i++) {
1088     /* now we need to generate the CEs */
1089     /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1090     /* According to strength                                                          */
1091     if(U_SUCCESS(*status)) {
1092       if(src->lh[i].first) { // if there are any elements
1093         // due to the way parser works, subsequent tailorings
1094         // may remove all the elements from a sequence, therefore
1095         // leaving an empty tailoring sequence.
1096         ucol_initBuffers(src, &src->lh[i], status);
1097       }
1098     }
1099     if(U_FAILURE(*status)) {
1100       return NULL;
1101     }
1102
1103   }
1104
1105   if(src->varTop != NULL) { /* stuff the variable top value */
1106     src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1107     /* remove it from the list */
1108     if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1109       src->varTop->listHeader->first = src->varTop->next;
1110     }
1111     if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1112       src->varTop->listHeader->last = src->varTop->previous;
1113     }
1114     if(src->varTop->next != NULL) {
1115       src->varTop->next->previous = src->varTop->previous;
1116     }
1117     if(src->varTop->previous != NULL) {
1118       src->varTop->previous->next = src->varTop->next;
1119     }
1120   }
1121
1122
1123   tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1124
1125
1126   /* After this, we have assigned CE values to all regular CEs      */
1127   /* now we will go through list once more and resolve expansions,  */
1128   /* make UCAElements structs and add them to table                 */
1129   for(i = 0; i<src->resultLen; i++) {
1130     /* now we need to generate the CEs */
1131     /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1132     /* According to strength                                                          */
1133     if(U_SUCCESS(*status)) {
1134       ucol_createElements(src, t, &src->lh[i], status);
1135     }
1136   }
1137
1138   UCAElements el;
1139   el.isThai = FALSE;
1140   el.prefixSize = 0;
1141   el.prefixChars[0] = 0;
1142
1143   /* add latin-1 stuff */
1144   ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1145
1146   /* add stuff for copying */
1147   if(src->copySet != NULL) {
1148     int32_t i = 0;
1149     UnicodeSet *set = (UnicodeSet *)src->copySet;
1150     for(i = 0; i < set->getRangeCount(); i++) {
1151       ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1152     }
1153   }
1154
1155   if(U_SUCCESS(*status)) {
1156     /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1157
1158     uint32_t tailoredCE = UCOL_NOT_FOUND;
1159     //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1160     UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1161     UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1162     while(*conts != 0) {
1163       /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1164       tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1165       if(tailoredCE != UCOL_NOT_FOUND) {
1166         UBool needToAdd = TRUE;
1167         if(isCntTableElement(tailoredCE)) {
1168           if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1169             needToAdd = FALSE;
1170           }
1171         }
1172         if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1173           needToAdd = FALSE;
1174         }
1175
1176         if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1177           el.prefix = el.prefixChars;
1178           el.prefixSize = 0;
1179           el.cPoints = el.uchars;
1180           el.noOfCEs = 0;
1181           el.uchars[0] = *conts;
1182           el.uchars[1] = *(conts+1);
1183           if(*(conts+2)!=0) {
1184             el.uchars[2] = *(conts+2);
1185             el.cSize = 3;
1186           } else {
1187             el.cSize = 2;
1188           }
1189           ucol_setText(ucaEl, el.uchars, el.cSize, status);
1190           while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1191             el.noOfCEs++;
1192           }
1193           uprv_uca_addAnElement(t, &el, status);
1194         }
1195
1196       } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1197         ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1198       }
1199       conts+=3;
1200     }
1201     ucol_closeElements(ucaEl);
1202   }
1203
1204   // Add completely ignorable elements
1205   utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1206
1207
1208   // canonical closure
1209   uprv_uca_canonicalClosure(t, status);
1210
1211
1212     /* still need to produce compatibility closure */
1213
1214   UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1215
1216   uprv_uca_closeTempTable(t);
1217   uprv_free(image);
1218
1219   return myData;
1220 }
1221
1222 U_CDECL_BEGIN
1223 static UBool U_CALLCONV
1224 ucol_bld_cleanup(void)
1225 {
1226     udata_close(invUCA_DATA_MEM);
1227     invUCA_DATA_MEM = NULL;
1228     _staticInvUCA = NULL;
1229     return TRUE;
1230 }
1231 U_CDECL_END
1232
1233 U_CAPI const InverseUCATableHeader * U_EXPORT2
1234 ucol_initInverseUCA(UErrorCode *status)
1235 {
1236     if(U_FAILURE(*status)) return NULL;
1237
1238     umtx_lock(NULL);
1239     UBool f = (_staticInvUCA == NULL);
1240     umtx_unlock(NULL);
1241
1242     if(f) {
1243         InverseUCATableHeader *newInvUCA = NULL;
1244         UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1245
1246         if(U_FAILURE(*status)) {
1247             if (result) {
1248                 udata_close(result);
1249             }
1250             // This is not needed, as we are talking about
1251             // memory we got from UData
1252             //uprv_free(newInvUCA);
1253         }
1254
1255         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1256             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1257             UCollator *UCA = ucol_initUCA(status);
1258             // UCA versions of UCA and inverse UCA should match
1259             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1260               *status = U_INVALID_FORMAT_ERROR;
1261               udata_close(result);
1262               return NULL;
1263             }
1264
1265             umtx_lock(NULL);
1266             if(_staticInvUCA == NULL) {
1267                 _staticInvUCA = newInvUCA;
1268                 invUCA_DATA_MEM = result;
1269                 result = NULL;
1270                 newInvUCA = NULL;
1271             }
1272             umtx_unlock(NULL);
1273
1274             if(newInvUCA != NULL) {
1275                 udata_close(result);
1276                 // This is not needed, as we are talking about
1277                 // memory we got from UData
1278                 //uprv_free(newInvUCA);
1279             }
1280             else {
1281                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1282             }
1283         }
1284     }
1285     return _staticInvUCA;
1286 }
1287
1288 #endif /* #if !UCONFIG_NO_COLLATION */