icuSources/i18n/ucol_elm.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2001-2008, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  ucaelems.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created 02/22/2001
  14 *   created by: Vladimir Weinstein
  15 *
  16 *   This program reads the Fractional UCA table and generates
  17 *   internal format for UCA table as well as inverse UCA table.
  18 *   It then writes binary files containing the data: ucadata.dat
  19 *   & invuca.dat
  20 *
  21 *   date        name       comments
  22 *   03/02/2001  synwee     added setMaxExpansion
  23 *   03/07/2001  synwee     merged UCA's maxexpansion and tailoring's
  24 */
  25
  26 #include "unicode/utypes.h"
  27
  28 #if !UCONFIG_NO_COLLATION
  29
  30 #include "unicode/uchar.h"
  31 #include "unicode/unistr.h"
  32 #include "unicode/ucoleitr.h"
  33 #include "unicode/normlzr.h"
  34 #include "ucol_elm.h"
  35 #include "ucol_tok.h"
  36 #include "ucol_cnt.h"
  37 #include "unormimp.h"
  38 #include "unicode/caniter.h"
  39 #include "cmemory.h"
  40
  41 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status);
  42
  43 U_CDECL_BEGIN
  44 static int32_t U_CALLCONV
  45 prefixLookupHash(const UHashTok e) {
  46     UCAElements *element = (UCAElements *)e.pointer;
  47     UChar buf[256];
  48     UHashTok key;
  49     key.pointer = buf;
  50     uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar));
  51     buf[element->cSize] = 0;
  52     //key.pointer = element->cPoints;
  53     //element->cPoints[element->cSize] = 0;
  54     return uhash_hashUChars(key);
  55 }
  56
  57 static int8_t U_CALLCONV
  58 prefixLookupComp(const UHashTok e1, const UHashTok e2) {
  59     UCAElements *element1 = (UCAElements *)e1.pointer;
  60     UCAElements *element2 = (UCAElements *)e2.pointer;
  61
  62     UChar buf1[256];
  63     UHashTok key1;
  64     key1.pointer = buf1;
  65     uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar));
  66     buf1[element1->cSize] = 0;
  67
  68     UChar buf2[256];
  69     UHashTok key2;
  70     key2.pointer = buf2;
  71     uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar));
  72     buf2[element2->cSize] = 0;
  73
  74     return uhash_compareUChars(key1, key2);
  75 }
  76 U_CDECL_END
  77
  78 static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) {
  79     if(U_FAILURE(*status)) {
  80         return 0;
  81     }
  82     if(expansions->CEs == NULL) {
  83         expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
  84         /* test for NULL */
  85         if (expansions->CEs == NULL) {
  86             *status = U_MEMORY_ALLOCATION_ERROR;
  87             return 0;
  88         }
  89         expansions->size = INIT_EXP_TABLE_SIZE;
  90         expansions->position = 0;
  91     }
  92
  93     if(expansions->position == expansions->size) {
  94         uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t));
  95         if(newData == NULL) {
  96 #ifdef UCOL_DEBUG
  97             fprintf(stderr, "out of memory for expansions\n");
  98 #endif
  99             *status = U_MEMORY_ALLOCATION_ERROR;
 100             return -1;
 101         }
 102         expansions->CEs = newData;
 103         expansions->size *= 2;
 104     }
 105
 106     expansions->CEs[expansions->position] = value;
 107     return(expansions->position++);
 108 }
 109
 110 U_CAPI tempUCATable*  U_EXPORT2
 111 uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UColCETags supplementaryInitTag, UErrorCode *status) {
 112     MaxJamoExpansionTable *maxjet;
 113     MaxExpansionTable *maxet;
 114     tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
 115     /* test for NULL */
 116     if (t == NULL) {
 117         *status = U_MEMORY_ALLOCATION_ERROR;
 118         return NULL;
 119     }
 120     uprv_memset(t, 0, sizeof(tempUCATable));
 121
 122     maxet  = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable));
 123     if (maxet == NULL) {
 124         goto allocation_failure;
 125     }
 126     uprv_memset(maxet, 0, sizeof(MaxExpansionTable));
 127     t->maxExpansions       = maxet;
 128
 129     maxjet = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable));
 130     if (maxjet == NULL) {
 131         goto allocation_failure;
 132     }
 133     uprv_memset(maxjet, 0, sizeof(MaxJamoExpansionTable));
 134     t->maxJamoExpansions = maxjet;
 135
 136     t->image = image;
 137     t->options = opts;
 138
 139     t->UCA = UCA;
 140     t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
 141     /* test for NULL */
 142     if (t->expansions == NULL) {
 143         goto allocation_failure;
 144     }
 145     uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
 146
 147     t->mapping = utrie_open(NULL, NULL, UCOL_ELM_TRIE_CAPACITY,
 148         UCOL_SPECIAL_FLAG | (initTag<<24),
 149         UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24),
 150         TRUE); // Do your own mallocs for the structure, array and have linear Latin 1
 151     if (U_FAILURE(*status)) {
 152         goto allocation_failure;
 153     }
 154     t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, NULL, status);
 155     if (U_FAILURE(*status)) {
 156         goto allocation_failure;
 157     }
 158     uhash_setValueDeleter(t->prefixLookup, uhash_freeBlock);
 159
 160     t->contractions = uprv_cnttab_open(t->mapping, status);
 161     if (U_FAILURE(*status)) {
 162         goto cleanup;
 163     }
 164
 165     /* copy UCA's maxexpansion and merge as we go along */
 166     if (UCA != NULL) {
 167         /* adding an extra initial value for easier manipulation */
 168         maxet->size            = (UCA->lastEndExpansionCE - UCA->endExpansionCE)
 169             + 2;
 170         maxet->position        = maxet->size - 1;
 171         maxet->endExpansionCE  =
 172             (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size);
 173         /* test for NULL */
 174         if (maxet->endExpansionCE == NULL) {
 175             goto allocation_failure;
 176         }
 177         maxet->expansionCESize =
 178             (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size);
 179         /* test for NULL */
 180         if (maxet->expansionCESize == NULL) {
 181             goto allocation_failure;
 182         }
 183         /* initialized value */
 184         *(maxet->endExpansionCE)  = 0;
 185         *(maxet->expansionCESize) = 0;
 186         uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE,
 187             sizeof(uint32_t) * (maxet->size - 1));
 188         uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize,
 189             sizeof(uint8_t) * (maxet->size - 1));
 190     }
 191     else {
 192         maxet->size     = 0;
 193     }
 194     maxjet->endExpansionCE = NULL;
 195     maxjet->isV = NULL;
 196     maxjet->size = 0;
 197     maxjet->position = 0;
 198     maxjet->maxLSize = 1;
 199     maxjet->maxVSize = 1;
 200     maxjet->maxTSize = 1;
 201
 202     t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
 203     /* test for NULL */
 204     if (t->unsafeCP == NULL) {
 205         goto allocation_failure;
 206     }
 207     t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
 208     /* test for NULL */
 209     if (t->contrEndCP == NULL) {
 210         goto allocation_failure;
 211     }
 212     uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
 213     uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
 214     t->cmLookup = NULL;
 215     return t;
 216
 217 allocation_failure:
 218     *status = U_MEMORY_ALLOCATION_ERROR;
 219 cleanup:
 220     uprv_uca_closeTempTable(t);
 221     return NULL;
 222 }
 223
 224 static tempUCATable* U_EXPORT2
 225 uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
 226     if(U_FAILURE(*status)) {
 227         return NULL;
 228     }
 229
 230     tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
 231     /* test for NULL */
 232     if (r == NULL) {
 233         *status = U_MEMORY_ALLOCATION_ERROR;
 234         return NULL;
 235     }
 236     uprv_memset(r, 0, sizeof(tempUCATable));
 237
 238     /* mapping */
 239     if(t->mapping != NULL) {
 240         /*r->mapping = ucmpe32_clone(t->mapping, status);*/
 241         r->mapping = utrie_clone(NULL, t->mapping, NULL, 0);
 242     }
 243
 244     // a hashing clone function would be very nice. We have none currently...
 245     // However, we should be good, as closing should not produce any prefixed elements.
 246     r->prefixLookup = NULL; // prefixes are not used in closing
 247
 248     /* expansions */
 249     if(t->expansions != NULL) {
 250         r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
 251         /* test for NULL */
 252         if (r->expansions == NULL) {
 253             *status = U_MEMORY_ALLOCATION_ERROR;
 254             goto cleanup;
 255         }
 256         r->expansions->position = t->expansions->position;
 257         r->expansions->size = t->expansions->size;
 258         if(t->expansions->CEs != NULL) {
 259             r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size);
 260             /* test for NULL */
 261             if (r->expansions->CEs == NULL) {
 262                 *status = U_MEMORY_ALLOCATION_ERROR;
 263                 goto cleanup;
 264             }
 265             uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->position);
 266         } else {
 267             r->expansions->CEs = NULL;
 268         }
 269     }
 270
 271     if(t->contractions != NULL) {
 272         r->contractions = uprv_cnttab_clone(t->contractions, status);
 273         // Check for cloning failure.
 274         if (r->contractions == NULL) {
 275             *status = U_MEMORY_ALLOCATION_ERROR;
 276             goto cleanup;
 277         }
 278         r->contractions->mapping = r->mapping;
 279     }
 280
 281     if(t->maxExpansions != NULL) {
 282         r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable));
 283         /* test for NULL */
 284         if (r->maxExpansions == NULL) {
 285             *status = U_MEMORY_ALLOCATION_ERROR;
 286             goto cleanup;
 287         }
 288         r->maxExpansions->size = t->maxExpansions->size;
 289         r->maxExpansions->position = t->maxExpansions->position;
 290         if(t->maxExpansions->endExpansionCE != NULL) {
 291             r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size);
 292             /* test for NULL */
 293             if (r->maxExpansions->endExpansionCE == NULL) {
 294                 *status = U_MEMORY_ALLOCATION_ERROR;
 295                 goto cleanup;
 296             }
 297             uprv_memset(r->maxExpansions->endExpansionCE, 0xDB, sizeof(uint32_t)*t->maxExpansions->size);
 298             uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->position*sizeof(uint32_t));
 299         } else {
 300             r->maxExpansions->endExpansionCE = NULL;
 301         }
 302         if(t->maxExpansions->expansionCESize != NULL) {
 303             r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size);
 304             /* test for NULL */
 305             if (r->maxExpansions->expansionCESize == NULL) {
 306                 *status = U_MEMORY_ALLOCATION_ERROR;
 307                 goto cleanup;
 308             }
 309             uprv_memset(r->maxExpansions->expansionCESize, 0xDB, sizeof(uint8_t)*t->maxExpansions->size);
 310             uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->position*sizeof(uint8_t));
 311         } else {
 312             r->maxExpansions->expansionCESize = NULL;
 313         }
 314     }
 315
 316     if(t->maxJamoExpansions != NULL) {
 317         r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable));
 318         /* test for NULL */
 319         if (r->maxJamoExpansions == NULL) {
 320             *status = U_MEMORY_ALLOCATION_ERROR;
 321             goto cleanup;
 322         }
 323         r->maxJamoExpansions->size = t->maxJamoExpansions->size;
 324         r->maxJamoExpansions->position = t->maxJamoExpansions->position;
 325         r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize;
 326         r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize;
 327         r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize;
 328         if(t->maxJamoExpansions->size != 0) {
 329             r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size);
 330             /* test for NULL */
 331             if (r->maxJamoExpansions->endExpansionCE == NULL) {
 332                 *status = U_MEMORY_ALLOCATION_ERROR;
 333                 goto cleanup;
 334             }
 335             uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->position*sizeof(uint32_t));
 336             r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size);
 337             /* test for NULL */
 338             if (r->maxJamoExpansions->isV == NULL) {
 339                 *status = U_MEMORY_ALLOCATION_ERROR;
 340                 goto cleanup;
 341             }
 342             uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->position*sizeof(UBool));
 343         } else {
 344             r->maxJamoExpansions->endExpansionCE = NULL;
 345             r->maxJamoExpansions->isV = NULL;
 346         }
 347     }
 348
 349     if(t->unsafeCP != NULL) {
 350         r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
 351         /* test for NULL */
 352         if (r->unsafeCP == NULL) {
 353             *status = U_MEMORY_ALLOCATION_ERROR;
 354             goto cleanup;
 355         }
 356         uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
 357     }
 358
 359     if(t->contrEndCP != NULL) {
 360         r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
 361         /* test for NULL */
 362         if (r->contrEndCP == NULL) {
 363             *status = U_MEMORY_ALLOCATION_ERROR;
 364             goto cleanup;
 365         }
 366         uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
 367     }
 368
 369     r->UCA = t->UCA;
 370     r->image = t->image;
 371     r->options = t->options;
 372
 373     return r;
 374 cleanup:
 375     uprv_uca_closeTempTable(t);
 376     return NULL;
 377 }
 378
 379
 380 U_CAPI void  U_EXPORT2
 381 uprv_uca_closeTempTable(tempUCATable *t) {
 382     if(t != NULL) {
 383         if (t->expansions != NULL) {
 384             uprv_free(t->expansions->CEs);
 385             uprv_free(t->expansions);
 386         }
 387         if(t->contractions != NULL) {
 388             uprv_cnttab_close(t->contractions);
 389         }
 390         if (t->mapping != NULL) {
 391             utrie_close(t->mapping);
 392         }
 393
 394         if(t->prefixLookup != NULL) {
 395             uhash_close(t->prefixLookup);
 396         }
 397
 398         if (t->maxExpansions != NULL) {
 399             uprv_free(t->maxExpansions->endExpansionCE);
 400             uprv_free(t->maxExpansions->expansionCESize);
 401             uprv_free(t->maxExpansions);
 402         }
 403
 404         if (t->maxJamoExpansions->size > 0) {
 405             uprv_free(t->maxJamoExpansions->endExpansionCE);
 406             uprv_free(t->maxJamoExpansions->isV);
 407         }
 408         uprv_free(t->maxJamoExpansions);
 409
 410         uprv_free(t->unsafeCP);
 411         uprv_free(t->contrEndCP);
 412
 413         if (t->cmLookup != NULL) {
 414             uprv_free(t->cmLookup->cPoints);
 415             uprv_free(t->cmLookup);
 416         }
 417
 418         uprv_free(t);
 419     }
 420 }
 421
 422 /**
 423 * Looks for the maximum length of all expansion sequences ending with the same
 424 * collation element. The size required for maxexpansion and maxsize is
 425 * returned if the arrays are too small.
 426 * @param endexpansion the last expansion collation element to be added
 427 * @param expansionsize size of the expansion
 428 * @param maxexpansion data structure to store the maximum expansion data.
 429 * @param status error status
 430 * @returns size of the maxexpansion and maxsize used.
 431 */
 432 static int uprv_uca_setMaxExpansion(uint32_t           endexpansion,
 433                                     uint8_t            expansionsize,
 434                                     MaxExpansionTable *maxexpansion,
 435                                     UErrorCode        *status)
 436 {
 437     if (maxexpansion->size == 0) {
 438         /* we'll always make the first element 0, for easier manipulation */
 439         maxexpansion->endExpansionCE =
 440             (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t));
 441         /* test for NULL */
 442         if (maxexpansion->endExpansionCE == NULL) {
 443             *status = U_MEMORY_ALLOCATION_ERROR;
 444             return 0;
 445         }
 446         *(maxexpansion->endExpansionCE) = 0;
 447         maxexpansion->expansionCESize =
 448             (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t));
 449         /* test for NULL */;
 450         if (maxexpansion->expansionCESize == NULL) {
 451             *status = U_MEMORY_ALLOCATION_ERROR;
 452             return 0;
 453         }
 454         *(maxexpansion->expansionCESize) = 0;
 455         maxexpansion->size     = INIT_EXP_TABLE_SIZE;
 456         maxexpansion->position = 0;
 457     }
 458
 459     if (maxexpansion->position + 1 == maxexpansion->size) {
 460         uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE,
 461             2 * maxexpansion->size * sizeof(uint32_t));
 462         if (neweece == NULL) {
 463             *status = U_MEMORY_ALLOCATION_ERROR;
 464             return 0;
 465         }
 466         maxexpansion->endExpansionCE  = neweece;
 467
 468         uint8_t  *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize,
 469             2 * maxexpansion->size * sizeof(uint8_t));
 470         if (neweces == NULL) {
 471             *status = U_MEMORY_ALLOCATION_ERROR;
 472             return 0;
 473         }
 474         maxexpansion->expansionCESize = neweces;
 475         maxexpansion->size *= 2;
 476     }
 477
 478     uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
 479     uint8_t  *pexpansionsize  = maxexpansion->expansionCESize;
 480     int      pos              = maxexpansion->position;
 481
 482     uint32_t *start = pendexpansionce;
 483     uint32_t *limit = pendexpansionce + pos;
 484
 485     /* using binary search to determine if last expansion element is
 486     already in the array */
 487     uint32_t *mid;
 488     int       result = -1;
 489     while (start < limit - 1) {
 490         mid = start + ((limit - start) >> 1);
 491         if (endexpansion <= *mid) {
 492             limit = mid;
 493         }
 494         else {
 495             start = mid;
 496         }
 497     }
 498
 499     if (*start == endexpansion) {
 500         result = start - pendexpansionce;
 501     }
 502     else if (*limit == endexpansion) {
 503         result = limit - pendexpansionce;
 504     }
 505
 506     if (result > -1) {
 507         /* found the ce in expansion, we'll just modify the size if it is
 508         smaller */
 509         uint8_t *currentsize = pexpansionsize + result;
 510         if (*currentsize < expansionsize) {
 511             *currentsize = expansionsize;
 512         }
 513     }
 514     else {
 515         /* we'll need to squeeze the value into the array.
 516         initial implementation. */
 517         /* shifting the subarray down by 1 */
 518         int      shiftsize     = (pendexpansionce + pos) - start;
 519         uint32_t *shiftpos     = start + 1;
 520         uint8_t  *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
 521
 522         /* okay need to rearrange the array into sorted order */
 523         if (shiftsize == 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
 524             *(pendexpansionce + pos + 1) = endexpansion;
 525             *(pexpansionsize + pos + 1)  = expansionsize;
 526         }
 527         else {
 528             uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t));
 529             uprv_memmove(sizeshiftpos + 1, sizeshiftpos,
 530                 shiftsize * sizeof(uint8_t));
 531             *shiftpos     = endexpansion;
 532             *sizeshiftpos = expansionsize;
 533         }
 534         maxexpansion->position ++;
 535
 536 #ifdef UCOL_DEBUG
 537         int   temp;
 538         UBool found = FALSE;
 539         for (temp = 0; temp < maxexpansion->position; temp ++) {
 540             if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) {
 541                 fprintf(stderr, "expansions %d\n", temp);
 542             }
 543             if (pendexpansionce[temp] == endexpansion) {
 544                 found =TRUE;
 545                 if (pexpansionsize[temp] < expansionsize) {
 546                     fprintf(stderr, "expansions size %d\n", temp);
 547                 }
 548             }
 549         }
 550         if (pendexpansionce[temp] == endexpansion) {
 551             found =TRUE;
 552             if (pexpansionsize[temp] < expansionsize) {
 553                 fprintf(stderr, "expansions size %d\n", temp);
 554             }
 555         }
 556         if (!found)
 557             fprintf(stderr, "expansion not found %d\n", temp);
 558 #endif
 559     }
 560
 561     return maxexpansion->position;
 562 }
 563
 564 /**
 565 * Sets the maximum length of all jamo expansion sequences ending with the same
 566 * collation element. The size required for maxexpansion and maxsize is
 567 * returned if the arrays are too small.
 568 * @param ch the jamo codepoint
 569 * @param endexpansion the last expansion collation element to be added
 570 * @param expansionsize size of the expansion
 571 * @param maxexpansion data structure to store the maximum expansion data.
 572 * @param status error status
 573 * @returns size of the maxexpansion and maxsize used.
 574 */
 575 static int uprv_uca_setMaxJamoExpansion(UChar                  ch,
 576                                         uint32_t               endexpansion,
 577                                         uint8_t                expansionsize,
 578                                         MaxJamoExpansionTable *maxexpansion,
 579                                         UErrorCode            *status)
 580 {
 581     UBool isV = TRUE;
 582     if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) {
 583         /* determines L for Jamo, doesn't need to store this since it is never
 584         at the end of a expansion */
 585         if (maxexpansion->maxLSize < expansionsize) {
 586             maxexpansion->maxLSize = expansionsize;
 587         }
 588         return maxexpansion->position;
 589     }
 590
 591     if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) {
 592         /* determines V for Jamo */
 593         if (maxexpansion->maxVSize < expansionsize) {
 594             maxexpansion->maxVSize = expansionsize;
 595         }
 596     }
 597
 598     if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) {
 599         isV = FALSE;
 600         /* determines T for Jamo */
 601         if (maxexpansion->maxTSize < expansionsize) {
 602             maxexpansion->maxTSize = expansionsize;
 603         }
 604     }
 605
 606     if (maxexpansion->size == 0) {
 607         /* we'll always make the first element 0, for easier manipulation */
 608         maxexpansion->endExpansionCE =
 609             (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t));
 610         /* test for NULL */;
 611         if (maxexpansion->endExpansionCE == NULL) {
 612             *status = U_MEMORY_ALLOCATION_ERROR;
 613             return 0;
 614         }
 615         *(maxexpansion->endExpansionCE) = 0;
 616         maxexpansion->isV =
 617             (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool));
 618         /* test for NULL */;
 619         if (maxexpansion->isV == NULL) {
 620             *status = U_MEMORY_ALLOCATION_ERROR;
 621             uprv_free(maxexpansion->endExpansionCE);
 622             maxexpansion->endExpansionCE = NULL;
 623             return 0;
 624         }
 625         *(maxexpansion->isV) = 0;
 626         maxexpansion->size     = INIT_EXP_TABLE_SIZE;
 627         maxexpansion->position = 0;
 628     }
 629
 630     if (maxexpansion->position + 1 == maxexpansion->size) {
 631         maxexpansion->size *= 2;
 632         maxexpansion->endExpansionCE = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE,
 633             maxexpansion->size * sizeof(uint32_t));
 634         if (maxexpansion->endExpansionCE == NULL) {
 635 #ifdef UCOL_DEBUG
 636             fprintf(stderr, "out of memory for maxExpansions\n");
 637 #endif
 638             *status = U_MEMORY_ALLOCATION_ERROR;
 639             return 0;
 640         }
 641         maxexpansion->isV  = (UBool *)uprv_realloc(maxexpansion->isV,
 642             maxexpansion->size * sizeof(UBool));
 643         if (maxexpansion->isV == NULL) {
 644 #ifdef UCOL_DEBUG
 645             fprintf(stderr, "out of memory for maxExpansions\n");
 646 #endif
 647             *status = U_MEMORY_ALLOCATION_ERROR;
 648             uprv_free(maxexpansion->endExpansionCE);
 649             maxexpansion->endExpansionCE = NULL;
 650             return 0;
 651         }
 652     }
 653
 654     uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
 655     int       pos             = maxexpansion->position;
 656
 657     while (pos > 0) {
 658         pos --;
 659         if (*(pendexpansionce + pos) == endexpansion) {
 660             return maxexpansion->position;
 661         }
 662     }
 663
 664     *(pendexpansionce + maxexpansion->position) = endexpansion;
 665     *(maxexpansion->isV + maxexpansion->position) = isV;
 666     maxexpansion->position ++;
 667
 668     return maxexpansion->position;
 669 }
 670
 671
 672 static void ContrEndCPSet(uint8_t *table, UChar c) {
 673     uint32_t    hash;
 674     uint8_t     *htByte;
 675
 676     hash = c;
 677     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 678         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 679     }
 680     htByte = &table[hash>>3];
 681     *htByte |= (1 << (hash & 7));
 682 }
 683
 684
 685 static void unsafeCPSet(uint8_t *table, UChar c) {
 686     uint32_t    hash;
 687     uint8_t     *htByte;
 688
 689     hash = c;
 690     if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
 691         if (hash >= 0xd800 && hash <= 0xf8ff) {
 692             /*  Part of a surrogate, or in private use area.            */
 693             /*   These don't go in the table                            */
 694             return;
 695         }
 696         hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
 697     }
 698     htByte = &table[hash>>3];
 699     *htByte |= (1 << (hash & 7));
 700 }
 701
 702 static void
 703 uprv_uca_createCMTable(tempUCATable *t, int32_t noOfCM, UErrorCode *status) {
 704     t->cmLookup = (CombinClassTable *)uprv_malloc(sizeof(CombinClassTable));
 705     if (t->cmLookup==NULL) {
 706         *status = U_MEMORY_ALLOCATION_ERROR;
 707         return;
 708     }
 709     t->cmLookup->cPoints=(UChar *)uprv_malloc(noOfCM*sizeof(UChar));
 710     if (t->cmLookup->cPoints ==NULL) {
 711         uprv_free(t->cmLookup);
 712         t->cmLookup = NULL;
 713         *status = U_MEMORY_ALLOCATION_ERROR;
 714         return;
 715     }
 716
 717     t->cmLookup->size=noOfCM;
 718     uprv_memset(t->cmLookup->index, 0, sizeof(t->cmLookup->index));
 719
 720     return;
 721 }
 722
 723 static void
 724 uprv_uca_copyCMTable(tempUCATable *t, UChar *cm, uint16_t *index) {
 725     int32_t count=0;
 726
 727     for (int32_t i=0; i<256; ++i) {
 728         if (index[i]>0) {
 729             // cPoints is ordered by combining class value.
 730             uprv_memcpy(t->cmLookup->cPoints+count, cm+(i<<8), index[i]*sizeof(UChar));
 731             count += index[i];
 732         }
 733         t->cmLookup->index[i]=count;
 734     }
 735     return;
 736 }
 737
 738 /* 1. to the UnsafeCP hash table, add all chars with combining class != 0     */
 739 /* 2. build combining marks table for all chars with combining class != 0     */
 740 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
 741
 742     UChar              c;
 743     uint16_t           fcd;     // Hi byte is lead combining class.
 744     // lo byte is trailing combing class.
 745     const uint16_t    *fcdTrieData;
 746     UBool buildCMTable = (t->cmLookup==NULL); // flag for building combining class table
 747     UChar *cm=NULL;
 748     uint16_t index[256];
 749     int32_t count=0;
 750     fcdTrieData = unorm_getFCDTrie(status);
 751     if (U_FAILURE(*status)) {
 752         return;
 753     }
 754
 755     if (buildCMTable) {
 756         if (cm==NULL) {
 757             cm = (UChar *)uprv_malloc(sizeof(UChar)*UCOL_MAX_CM_TAB);
 758             if (cm==NULL) {
 759                 *status = U_MEMORY_ALLOCATION_ERROR;
 760                 return;
 761             }
 762         }
 763         uprv_memset(index, 0, sizeof(index));
 764     }
 765     for (c=0; c<0xffff; c++) {
 766         fcd = unorm_getFCD16(fcdTrieData, c);
 767         if (fcd >= 0x100 ||               // if the leading combining class(c) > 0 ||
 768             (UTF_IS_LEAD(c) && fcd != 0)) {//    c is a leading surrogate with some FCD data
 769             if (buildCMTable) {
 770                 uint32_t cClass = fcd & 0xff;
 771                 //uint32_t temp=(cClass<<8)+index[cClass];
 772                 cm[(cClass<<8)+index[cClass]] = c; //
 773                 index[cClass]++;
 774                 count++;
 775             }
 776             unsafeCPSet(t->unsafeCP, c);
 777         }
 778     }
 779
 780     // copy to cm table
 781     if (buildCMTable) {
 782         uprv_uca_createCMTable(t, count, status);
 783         if(U_FAILURE(*status)) {
 784             if (cm!=NULL) {
 785                 uprv_free(cm);
 786             }
 787             return;
 788         }
 789         uprv_uca_copyCMTable(t, cm, index);
 790     }
 791
 792     if(t->prefixLookup != NULL) {
 793         int32_t i = -1;
 794         const UHashElement *e = NULL;
 795         UCAElements *element = NULL;
 796         UChar NFCbuf[256];
 797         uint32_t NFCbufLen = 0;
 798         while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) {
 799             element = (UCAElements *)e->value.pointer;
 800             // codepoints here are in the NFD form. We need to add the
 801             // first code point of the NFC form to unsafe, because
 802             // strcoll needs to backup over them.
 803             NFCbufLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0,
 804                 NFCbuf, 256, status);
 805             unsafeCPSet(t->unsafeCP, NFCbuf[0]);
 806         }
 807     }
 808
 809     if (cm!=NULL) {
 810         uprv_free(cm);
 811     }
 812 }
 813
 814 static uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE,
 815                                    UCAElements *element, UErrorCode *status)
 816 {
 817     // currently the longest prefix we're supporting in Japanese is two characters
 818     // long. Although this table could quite easily mimic complete contraction stuff
 819     // there is no good reason to make a general solution, as it would require some
 820     // error prone messing.
 821     CntTable *contractions = t->contractions;
 822     UChar32 cp;
 823     uint32_t cpsize = 0;
 824     UChar *oldCP = element->cPoints;
 825     uint32_t oldCPSize = element->cSize;
 826
 827
 828     contractions->currentTag = SPEC_PROC_TAG;
 829
 830     // here, we will normalize & add prefix to the table.
 831     uint32_t j = 0;
 832 #ifdef UCOL_DEBUG
 833     for(j=0; j<element->cSize; j++) {
 834         fprintf(stdout, "CP: %04X ", element->cPoints[j]);
 835     }
 836     fprintf(stdout, "El: %08X Pref: ", CE);
 837     for(j=0; j<element->prefixSize; j++) {
 838         fprintf(stdout, "%04X ", element->prefix[j]);
 839     }
 840     fprintf(stdout, "%08X ", element->mapCE);
 841 #endif
 842
 843     for (j = 1; j<element->prefixSize; j++) {   /* First add NFD prefix chars to unsafe CP hash table */
 844         // Unless it is a trail surrogate, which is handled algoritmically and
 845         // shouldn't take up space in the table.
 846         if(!(UTF_IS_TRAIL(element->prefix[j]))) {
 847             unsafeCPSet(t->unsafeCP, element->prefix[j]);
 848         }
 849     }
 850
 851     UChar tempPrefix = 0;
 852
 853     for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards
 854         // therefore, we will promptly reverse the prefix buffer...
 855         tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1);
 856         *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j];
 857         element->prefix[j] = tempPrefix;
 858     }
 859
 860 #ifdef UCOL_DEBUG
 861     fprintf(stdout, "Reversed: ");
 862     for(j=0; j<element->prefixSize; j++) {
 863         fprintf(stdout, "%04X ", element->prefix[j]);
 864     }
 865     fprintf(stdout, "%08X\n", element->mapCE);
 866 #endif
 867
 868     // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix
 869     if(!(UTF_IS_TRAIL(element->cPoints[0]))) {
 870         unsafeCPSet(t->unsafeCP, element->cPoints[0]);
 871     }
 872
 873     // Maybe we need this... To handle prefixes completely in the forward direction...
 874     //if(element->cSize == 1) {
 875     //  if(!(UTF_IS_TRAIL(element->cPoints[0]))) {
 876     //    ContrEndCPSet(t->contrEndCP, element->cPoints[0]);
 877     //  }
 878     //}
 879
 880     element->cPoints = element->prefix;
 881     element->cSize = element->prefixSize;
 882
 883     // Add the last char of the contraction to the contraction-end hash table.
 884     // unless it is a trail surrogate, which is handled algorithmically and
 885     // shouldn't be in the table
 886     if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
 887         ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
 888     }
 889
 890     // First we need to check if contractions starts with a surrogate
 891     UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
 892
 893     // If there are any Jamos in the contraction, we should turn on special
 894     // processing for Jamos
 895     if(UCOL_ISJAMO(element->prefix[0])) {
 896         t->image->jamoSpecial = TRUE;
 897     }
 898     /* then we need to deal with it */
 899     /* we could aready have something in table - or we might not */
 900
 901     if(!isPrefix(CE)) {
 902         /* if it wasn't contraction, we wouldn't end up here*/
 903         int32_t firstContractionOffset = 0;
 904         firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
 905         uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
 906         uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status);
 907         uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
 908         CE =  constructContractCE(SPEC_PROC_TAG, firstContractionOffset);
 909     } else { /* we are adding to existing contraction */
 910         /* there were already some elements in the table, so we need to add a new contraction */
 911         /* Two things can happen here: either the codepoint is already in the table, or it is not */
 912         int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status);
 913         if(position > 0) {       /* if it is we just continue down the chain */
 914             uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
 915             uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
 916             uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status);
 917         } else {                  /* if it isn't, we will have to create a new sequence */
 918             uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
 919             uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status);
 920         }
 921     }
 922
 923     element->cPoints = oldCP;
 924     element->cSize = oldCPSize;
 925
 926     return CE;
 927 }
 928
 929 // Note regarding surrogate handling: We are interested only in the single
 930 // or leading surrogates in a contraction. If a surrogate is somewhere else
 931 // in the contraction, it is going to be handled as a pair of code units,
 932 // as it doesn't affect the performance AND handling surrogates specially
 933 // would complicate code way too much.
 934 static uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
 935                                         UCAElements *element, UErrorCode *status)
 936 {
 937     CntTable *contractions = t->contractions;
 938     UChar32 cp;
 939     uint32_t cpsize = 0;
 940
 941     contractions->currentTag = CONTRACTION_TAG;
 942
 943     // First we need to check if contractions starts with a surrogate
 944     UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
 945
 946     if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
 947         uint32_t j = 0;
 948         for (j=1; j<element->cSize; j++) {   /* First add contraction chars to unsafe CP hash table */
 949             // Unless it is a trail surrogate, which is handled algoritmically and
 950             // shouldn't take up space in the table.
 951             if(!(UTF_IS_TRAIL(element->cPoints[j]))) {
 952                 unsafeCPSet(t->unsafeCP, element->cPoints[j]);
 953             }
 954         }
 955         // Add the last char of the contraction to the contraction-end hash table.
 956         // unless it is a trail surrogate, which is handled algorithmically and
 957         // shouldn't be in the table
 958         if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
 959             ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
 960         }
 961
 962         // If there are any Jamos in the contraction, we should turn on special
 963         // processing for Jamos
 964         if(UCOL_ISJAMO(element->cPoints[0])) {
 965             t->image->jamoSpecial = TRUE;
 966         }
 967         /* then we need to deal with it */
 968         /* we could aready have something in table - or we might not */
 969         element->cPoints+=cpsize;
 970         element->cSize-=cpsize;
 971         if(!isContraction(CE)) {
 972             /* if it wasn't contraction, we wouldn't end up here*/
 973             int32_t firstContractionOffset = 0;
 974             firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
 975             uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
 976             uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
 977             uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
 978             CE =  constructContractCE(CONTRACTION_TAG, firstContractionOffset);
 979         } else { /* we are adding to existing contraction */
 980             /* there were already some elements in the table, so we need to add a new contraction */
 981             /* Two things can happen here: either the codepoint is already in the table, or it is not */
 982             int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
 983             if(position > 0) {       /* if it is we just continue down the chain */
 984                 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
 985                 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
 986                 uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
 987             } else {                  /* if it isn't, we will have to create a new sequence */
 988                 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
 989                 uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
 990             }
 991         }
 992         element->cPoints-=cpsize;
 993         element->cSize+=cpsize;
 994         /*ucmpe32_set(t->mapping, cp, CE);*/
 995         utrie_set32(t->mapping, cp, CE);
 996     } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
 997         /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
 998         utrie_set32(t->mapping, cp, element->mapCE);
 999     } else { /* fill out the first stage of the contraction with the surrogate CE */
1000         uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
1001         uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
1002     }
1003     return CE;
1004 }
1005
1006
1007 static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
1008     int32_t firstContractionOffset = 0;
1009     //    uint32_t contractionElement = UCOL_NOT_FOUND;
1010
1011     if(U_FAILURE(*status)) {
1012         return UCOL_NOT_FOUND;
1013     }
1014
1015     /* end of recursion */
1016     if(element->cSize == 1) {
1017         if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) {
1018             uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
1019             uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
1020             return existingCE;
1021         } else {
1022             return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
1023         }
1024     }
1025
1026     /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */
1027     /* for both backward and forward cycles */
1028
1029     /* we encountered either an empty space or a non-contraction element */
1030     /* this means we are constructing a new contraction sequence */
1031     element->cPoints++;
1032     element->cSize--;
1033     if(!isCntTableElement(existingCE)) {
1034         /* if it wasn't contraction, we wouldn't end up here*/
1035         firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
1036         uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
1037         uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
1038         uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
1039         existingCE =  constructContractCE(contractions->currentTag, firstContractionOffset);
1040     } else { /* we are adding to existing contraction */
1041         /* there were already some elements in the table, so we need to add a new contraction */
1042         /* Two things can happen here: either the codepoint is already in the table, or it is not */
1043         int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
1044         if(position > 0) {       /* if it is we just continue down the chain */
1045             uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
1046             uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
1047             uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
1048         } else {                  /* if it isn't, we will have to create a new sequence */
1049             uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
1050             uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
1051         }
1052     }
1053     element->cPoints--;
1054     element->cSize++;
1055     return existingCE;
1056 }
1057
1058 static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
1059     uint32_t CE = UCOL_NOT_FOUND;
1060     // This should add a completely ignorable element to the
1061     // unsafe table, so that backward iteration will skip
1062     // over it when treating contractions.
1063     uint32_t i = 0;
1064     if(element->mapCE == 0) {
1065         for(i = 0; i < element->cSize; i++) {
1066             if(!UTF_IS_TRAIL(element->cPoints[i])) {
1067                 unsafeCPSet(t->unsafeCP, element->cPoints[i]);
1068             }
1069         }
1070     }
1071     if(element->cSize > 1) { /* we're adding a contraction */
1072         uint32_t i = 0;
1073         UChar32 cp;
1074
1075         UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
1076         /*CE = ucmpe32_get(t->mapping, cp);*/
1077         CE = utrie_get32(t->mapping, cp, NULL);
1078
1079         CE = uprv_uca_addContraction(t, CE, element, status);
1080     } else { /* easy case, */
1081         /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
1082         CE = utrie_get32(t->mapping, element->cPoints[0], NULL);
1083
1084         if( CE != UCOL_NOT_FOUND) {
1085             if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
1086                 if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop
1087                     // Only expansions and regular CEs can go here... Contractions will never happen in this place
1088                     uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status);
1089                     /* This loop has to change the CE at the end of contraction REDO!*/
1090                     uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status);
1091                 }
1092             } else {
1093                 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
1094                 utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
1095                 if ((element->prefixSize!=0) && (getCETag(CE)!=IMPLICIT_TAG)) {
1096                     UCAElements *origElem = (UCAElements *)uprv_malloc(sizeof(UCAElements));
1097                     /* test for NULL */
1098                     if (origElem== NULL) {
1099                         *status = U_MEMORY_ALLOCATION_ERROR;
1100                         return 0;
1101                     }
1102                     /* copy the original UCA value */
1103                     origElem->prefixSize = 0;
1104                     origElem->prefix = NULL;
1105                     origElem->cPoints = origElem->uchars;
1106                     origElem->cPoints[0] = element->cPoints[0];
1107                     origElem->cSize = 1;
1108                     origElem->CEs[0]=CE;
1109                     origElem->mapCE=CE;
1110                     origElem->noOfCEs=1;
1111                     uprv_uca_finalizeAddition(t, origElem, status);
1112                     uprv_free(origElem);
1113                 }
1114 #ifdef UCOL_DEBUG
1115                 fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]);
1116                 //*status = U_ILLEGAL_ARGUMENT_ERROR;
1117 #endif
1118             }
1119         } else {
1120             /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
1121             utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
1122         }
1123     }
1124     return CE;
1125 }
1126
1127 /* This adds a read element, while testing for existence */
1128 U_CAPI uint32_t  U_EXPORT2
1129 uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) {
1130     U_NAMESPACE_USE
1131
1132     ExpansionTable *expansions = t->expansions;
1133
1134     uint32_t i = 1;
1135     uint32_t expansion = 0;
1136     uint32_t CE;
1137
1138     if(U_FAILURE(*status)) {
1139         return 0xFFFF;
1140     }
1141
1142     element->mapCE = 0; // clear mapCE so that we can catch expansions
1143
1144     if(element->noOfCEs == 1) {
1145         element->mapCE = element->CEs[0];
1146     } else {
1147         /* ICU 2.1 long primaries */
1148         /* unfortunately, it looks like we have to look for a long primary here */
1149         /* since in canonical closure we are going to hit some long primaries from */
1150         /* the first phase, and they will come back as continuations/expansions */
1151         /* destroying the effect of the previous opitimization */
1152         /* A long primary is a three byte primary with starting secondaries and tertiaries */
1153         /* It can appear in long runs of only primary differences (like east Asian tailorings) */
1154         /* also, it should not be an expansion, as expansions would break with this */
1155         // This part came in from ucol_bld.cpp
1156         //if(tok->expansion == 0
1157         //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
1158         //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
1159         /* we will construct a special CE that will go unchanged to the table */
1160         if(element->noOfCEs == 2 // a two CE expansion
1161             && isContinuation(element->CEs[1]) // which  is a continuation
1162             && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
1163             && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
1164             && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
1165             )
1166         {
1167 #ifdef UCOL_DEBUG
1168             fprintf(stdout, "Long primary %04X\n", element->cPoints[0]);
1169 #endif
1170             element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
1171                 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
1172                 | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
1173         }
1174         else {
1175             expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
1176                 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1177                 & 0xFFFFF0);
1178
1179             for(i = 1; i<element->noOfCEs; i++) {
1180                 uprv_uca_addExpansion(expansions, element->CEs[i], status);
1181             }
1182             if(element->noOfCEs <= 0xF) {
1183                 expansion |= element->noOfCEs;
1184             } else {
1185                 uprv_uca_addExpansion(expansions, 0, status);
1186             }
1187             element->mapCE = expansion;
1188             uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
1189                 (uint8_t)element->noOfCEs,
1190                 t->maxExpansions,
1191                 status);
1192             if(UCOL_ISJAMO(element->cPoints[0])) {
1193                 t->image->jamoSpecial = TRUE;
1194                 uprv_uca_setMaxJamoExpansion(element->cPoints[0],
1195                     element->CEs[element->noOfCEs - 1],
1196                     (uint8_t)element->noOfCEs,
1197                     t->maxJamoExpansions,
1198                     status);
1199             }
1200             if (U_FAILURE(*status)) {
1201                 return 0;
1202             }
1203         }
1204     }
1205
1206     // We treat digits differently - they are "uber special" and should be
1207     // processed differently if numeric collation is on.
1208     UChar32 uniChar = 0;
1209     //printElement(element);
1210     if ((element->cSize == 2) && U16_IS_LEAD(element->cPoints[0])){
1211         uniChar = U16_GET_SUPPLEMENTARY(element->cPoints[0], element->cPoints[1]);
1212     } else if (element->cSize == 1){
1213         uniChar = element->cPoints[0];
1214     }
1215
1216     // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
1217     // one element to the expansion buffer. When we encounter a digit and we don't
1218     // do numeric collation, we will just pick the CE we have and break out of case
1219     // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
1220     // a special, further processing will occur. If it's a simple CE, we'll return due
1221     // to how the loop is constructed.
1222     if (uniChar != 0 && u_isdigit(uniChar)){
1223         expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element
1224         if(element->mapCE) { // if there is an expansion, we'll pick it here
1225             expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4);
1226         } else {
1227             expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4);
1228         }
1229         element->mapCE = expansion;
1230
1231         // Need to go back to the beginning of the digit string if in the middle!
1232         if(uniChar <= 0xFFFF) { // supplementaries are always unsafe. API takes UChars
1233             unsafeCPSet(t->unsafeCP, (UChar)uniChar);
1234         }
1235     }
1236
1237     // here we want to add the prefix structure.
1238     // I will try to process it as a reverse contraction, if possible.
1239     // prefix buffer is already reversed.
1240
1241     if(element->prefixSize!=0) {
1242         // We keep the seen prefix starter elements in a hashtable
1243         // we need it to be able to distinguish between the simple
1244         // codepoints and prefix starters. Also, we need to use it
1245         // for canonical closure.
1246
1247         UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
1248         /* test for NULL */
1249         if (composed == NULL) {
1250             *status = U_MEMORY_ALLOCATION_ERROR;
1251             return 0;
1252         }
1253         uprv_memcpy(composed, element, sizeof(UCAElements));
1254         composed->cPoints = composed->uchars;
1255         composed->prefix = composed->prefixChars;
1256
1257         composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status);
1258
1259
1260         if(t->prefixLookup != NULL) {
1261             UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element);
1262             if(uCE != NULL) { // there is already a set of code points here
1263                 element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status);
1264             } else { // no code points, so this spot is clean
1265                 element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status);
1266                 uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements));
1267                 /* test for NULL */
1268                 if (uCE == NULL) {
1269                     *status = U_MEMORY_ALLOCATION_ERROR;
1270                     return 0;
1271                 }
1272                 uprv_memcpy(uCE, element, sizeof(UCAElements));
1273                 uCE->cPoints = uCE->uchars;
1274                 uhash_put(t->prefixLookup, uCE, uCE, status);
1275             }
1276             if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) {
1277                 // do it!
1278                 composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status);
1279             }
1280         }
1281         uprv_free(composed);
1282     }
1283
1284     // We need to use the canonical iterator here
1285     // the way we do it is to generate the canonically equivalent strings
1286     // for the contraction and then add the sequences that pass FCD check
1287     if(element->cSize > 1 && !(element->cSize==2 && UTF16_IS_LEAD(element->cPoints[0]) && UTF16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included
1288         UnicodeString source(element->cPoints, element->cSize);
1289         CanonicalIterator it(source, *status);
1290         source = it.next();
1291         while(!source.isBogus()) {
1292             if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) {
1293                 element->cSize = source.extract(element->cPoints, 128, *status);
1294                 uprv_uca_finalizeAddition(t, element, status);
1295             }
1296             source = it.next();
1297         }
1298         CE = element->mapCE;
1299     } else {
1300         CE = uprv_uca_finalizeAddition(t, element, status);
1301     }
1302
1303     return CE;
1304 }
1305
1306
1307 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray       *mapping, */
1308 static void uprv_uca_getMaxExpansionJamo(UNewTrie       *mapping,
1309                                          MaxExpansionTable     *maxexpansion,
1310                                          MaxJamoExpansionTable *maxjamoexpansion,
1311                                          UBool                  jamospecial,
1312                                          UErrorCode            *status)
1313 {
1314     const uint32_t VBASE  = 0x1161;
1315     const uint32_t TBASE  = 0x11A8;
1316     const uint32_t VCOUNT = 21;
1317     const uint32_t TCOUNT = 28;
1318
1319     uint32_t v = VBASE + VCOUNT - 1;
1320     uint32_t t = TBASE + TCOUNT - 1;
1321     uint32_t ce;
1322
1323     while (v >= VBASE) {
1324         /*ce = ucmpe32_get(mapping, v);*/
1325         ce = utrie_get32(mapping, v, NULL);
1326         if (ce < UCOL_SPECIAL_FLAG) {
1327             uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status);
1328         }
1329         v --;
1330     }
1331
1332     while (t >= TBASE)
1333     {
1334         /*ce = ucmpe32_get(mapping, t);*/
1335         ce = utrie_get32(mapping, t, NULL);
1336         if (ce < UCOL_SPECIAL_FLAG) {
1337             uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status);
1338         }
1339         t --;
1340     }
1341     /*  According to the docs, 99% of the time, the Jamo will not be special */
1342     if (jamospecial) {
1343         /* gets the max expansion in all unicode characters */
1344         int     count    = maxjamoexpansion->position;
1345         uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize +
1346             maxjamoexpansion->maxVSize +
1347             maxjamoexpansion->maxTSize);
1348         uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize +
1349             maxjamoexpansion->maxVSize);
1350
1351         while (count > 0) {
1352             count --;
1353             if (*(maxjamoexpansion->isV + count) == TRUE) {
1354                 uprv_uca_setMaxExpansion(
1355                     *(maxjamoexpansion->endExpansionCE + count),
1356                     maxVSize, maxexpansion, status);
1357             }
1358             else {
1359                 uprv_uca_setMaxExpansion(
1360                     *(maxjamoexpansion->endExpansionCE + count),
1361                     maxTSize, maxexpansion, status);
1362             }
1363         }
1364     }
1365 }
1366
1367 U_CDECL_BEGIN
1368 static inline uint32_t U_CALLCONV
1369 getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset)
1370 {
1371     uint32_t value;
1372     uint32_t tag;
1373     UChar32 limit;
1374     UBool inBlockZero;
1375
1376     limit=start+0x400;
1377     while(start<limit) {
1378         value=utrie_get32(trie, start, &inBlockZero);
1379         tag = getCETag(value);
1380         if(inBlockZero == TRUE) {
1381             start+=UTRIE_DATA_BLOCK_LENGTH;
1382         } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) {
1383             /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the
1384             * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is
1385             * nothing in this position and that it should be skipped.
1386             */
1387 #ifdef UCOL_DEBUG
1388             static int32_t count = 1;
1389             fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value);
1390 #endif
1391             return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset);
1392         } else {
1393             ++start;
1394         }
1395     }
1396     return 0;
1397 }
1398 U_CDECL_END
1399
#ifdef UCOL_DEBUG
/*
 * Debug helper that prints one range of a trie to stdout.
 * It is used in conjunction with the code around the utrie_unserialize call.
 * Ranges starting at or above 0x10000 are printed together with the UTF-16
 * lead/trail units of both boundaries. `context` is not used.
 */
void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    if(start >= 0x10000) {
        fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value);
        return;
    }
    fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
}

/*
 * Debug helper: returns the low 24 bits of `data` when it is greater than
 * UCOL_NOT_FOUND and carries SURROGATE_TAG, and 0 otherwise.
 */
int32_t
myGetFoldingOffset(uint32_t data) {
    if(data <= UCOL_NOT_FOUND || getCETag(data) != SURROGATE_TAG) {
        return 0;
    }
    return (data&0xFFFFFF);
}
#endif
1420
1421 U_CAPI UCATableHeader* U_EXPORT2
1422 uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
1423     /*CompactEIntArray *mapping = t->mapping;*/
1424     UNewTrie *mapping = t->mapping;
1425     ExpansionTable *expansions = t->expansions;
1426     CntTable *contractions = t->contractions;
1427     MaxExpansionTable *maxexpansion = t->maxExpansions;
1428
1429     if(U_FAILURE(*status)) {
1430         return NULL;
1431     }
1432
1433     uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar));
1434
1435     int32_t contractionsSize = 0;
1436     contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
1437
1438     /* the following operation depends on the trie data. Therefore, we have to do it before */
1439     /* the trie is compacted */
1440     /* sets jamo expansions */
1441     uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions,
1442         t->image->jamoSpecial, status);
1443
1444     /*ucmpe32_compact(mapping);*/
1445     /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
1446     /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
1447     /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/
1448
1449     // After setting the jamo expansions, compact the trie and get the needed size
1450     int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status);
1451
1452     uint32_t tableOffset = 0;
1453     uint8_t *dataStart;
1454
1455     /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */
1456
1457     uint32_t toAllocate =(uint32_t)(headersize+
1458         paddedsize(expansions->position*sizeof(uint32_t))+
1459         paddedsize(mappingSize)+
1460         paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+
1461         //paddedsize(0x100*sizeof(uint32_t))  /* Latin1 is now included in the trie */
1462         /* maxexpansion array */
1463         + paddedsize(maxexpansion->position * sizeof(uint32_t)) +
1464         /* maxexpansion size array */
1465         paddedsize(maxexpansion->position * sizeof(uint8_t)) +
1466         paddedsize(UCOL_UNSAFECP_TABLE_SIZE) +   /*  Unsafe chars             */
1467         paddedsize(UCOL_UNSAFECP_TABLE_SIZE));    /*  Contraction Ending chars */
1468
1469
1470     dataStart = (uint8_t *)uprv_malloc(toAllocate);
1471     /* test for NULL */
1472     if (dataStart == NULL) {
1473         *status = U_MEMORY_ALLOCATION_ERROR;
1474         return NULL;
1475     }
1476
1477     UCATableHeader *myData = (UCATableHeader *)dataStart;
1478     // Please, do reset all the fields!
1479     uprv_memset(dataStart, 0, toAllocate);
1480     // Make sure we know this is reset
1481     myData->magic = UCOL_HEADER_MAGIC;
1482     myData->isBigEndian = U_IS_BIG_ENDIAN;
1483     myData->charSetFamily = U_CHARSET_FAMILY;
1484     myData->formatVersion[0] = UCA_FORMAT_VERSION_0;
1485     myData->formatVersion[1] = UCA_FORMAT_VERSION_1;
1486     myData->formatVersion[2] = UCA_FORMAT_VERSION_2;
1487     myData->formatVersion[3] = UCA_FORMAT_VERSION_3;
1488     myData->jamoSpecial = t->image->jamoSpecial;
1489
1490     // Don't copy stuff from UCA header!
1491     //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
1492
1493     myData->contractionSize = contractionsSize;
1494
1495     tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader)));
1496
1497     myData->options = tableOffset;
1498     uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet));
1499     tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet)));
1500
1501     /* copy expansions */
1502     /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
1503     myData->expansion = tableOffset;
1504     uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t));
1505     tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t)));
1506
1507     /* contractions block */
1508     if(contractionsSize != 0) {
1509         /* copy contraction index */
1510         /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
1511         myData->contractionIndex = tableOffset;
1512         uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar));
1513         tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar)));
1514
1515         /* copy contraction collation elements */
1516         /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
1517         myData->contractionCEs = tableOffset;
1518         uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t));
1519         tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t)));
1520     } else {
1521         myData->contractionIndex = 0;
1522         myData->contractionCEs = 0;
1523     }
1524
1525     /* copy mapping table */
1526     /*myData->mappingPosition = dataStart+tableOffset;*/
1527     /*myData->mappingPosition = tableOffset;*/
1528     /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/
1529
1530     myData->mappingPosition = tableOffset;
1531     utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status);
1532 #ifdef UCOL_DEBUG
1533     // This is debug code to dump the contents of the trie. It needs two functions defined above
1534     {
1535         UTrie UCAt = { 0 };
1536         uint32_t trieWord;
1537         utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
1538         UCAt.getFoldingOffset = myGetFoldingOffset;
1539         if(U_SUCCESS(*status)) {
1540             utrie_enum(&UCAt, NULL, enumRange, NULL);
1541         }
1542         trieWord = UTRIE_GET32_FROM_LEAD(UCAt, 0xDC01)
1543     }
1544 #endif
1545     tableOffset += paddedsize(mappingSize);
1546
1547
1548     int32_t i = 0;
1549
1550     /* copy max expansion table */
1551     myData->endExpansionCE      = tableOffset;
1552     myData->endExpansionCECount = maxexpansion->position - 1;
1553     /* not copying the first element which is a dummy */
1554     uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1,
1555         (maxexpansion->position - 1) * sizeof(uint32_t));
1556     tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint32_t)));
1557     myData->expansionCESize = tableOffset;
1558     uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1,
1559         (maxexpansion->position - 1) * sizeof(uint8_t));
1560     tableOffset += (uint32_t)(paddedsize((maxexpansion->position)* sizeof(uint8_t)));
1561
1562     /* Unsafe chars table.  Finish it off, then copy it. */
1563     uprv_uca_unsafeCPAddCCNZ(t, status);
1564     if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.    */
1565         for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
1566             t->unsafeCP[i] |= t->UCA->unsafeCP[i];
1567         }
1568     }
1569     myData->unsafeCP = tableOffset;
1570     uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
1571     tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
1572
1573
1574     /* Finish building Contraction Ending chars hash table and then copy it out.  */
1575     if (t->UCA != 0) {              /* Or in unsafebits from UCA, making a combined table.    */
1576         for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
1577             t->contrEndCP[i] |= t->UCA->contrEndCP[i];
1578         }
1579     }
1580     myData->contrEndCP = tableOffset;
1581     uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
1582     tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
1583
1584     if(tableOffset != toAllocate) {
1585 #ifdef UCOL_DEBUG
1586         fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
1587 #endif
1588         *status = U_INTERNAL_PROGRAM_ERROR;
1589         uprv_free(dataStart);
1590         return 0;
1591     }
1592
1593     myData->size = tableOffset;
1594     /* This should happen upon ressurection */
1595     /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
1596     /*uprv_mstrm_close(ms);*/
1597     return myData;
1598 }
1599
1600
1601 struct enumStruct {
1602     tempUCATable *t;
1603     UCollator *tempColl;
1604     UCollationElements* colEl;
1605     int32_t noOfClosures;
1606     UErrorCode *status;
1607 };
1608 U_CDECL_BEGIN
1609 static UBool U_CALLCONV
1610 _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1611
1612     if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later
1613         UErrorCode *status = ((enumStruct *)context)->status;
1614         tempUCATable *t = ((enumStruct *)context)->t;
1615         UCollator *tempColl = ((enumStruct *)context)->tempColl;
1616         UCollationElements* colEl = ((enumStruct *)context)->colEl;
1617         UCAElements el;
1618         UChar decomp[256] = { 0 };
1619         int32_t noOfDec = 0;
1620
1621         UChar32 u32 = 0;
1622         UChar comp[2];
1623         uint32_t len = 0;
1624
1625         for(u32 = start; u32 < limit; u32++) {
1626             noOfDec = unorm_getDecomposition(u32, FALSE, decomp, 256);
1627             //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
1628             //|| (noOfDec == 1 && *decomp != (UChar)u32))
1629             if(noOfDec > 0) // if we're positive, that means there is no decomposition
1630             {
1631                 len = 0;
1632                 UTF_APPEND_CHAR_UNSAFE(comp, len, u32);
1633                 if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
1634 #ifdef UCOL_DEBUG
1635                     fprintf(stderr, "Closure: %08X -> ", u32);
1636                     uint32_t i = 0;
1637                     for(i = 0; i<noOfDec; i++) {
1638                         fprintf(stderr, "%04X ", decomp[i]);
1639                     }
1640                     fprintf(stderr, "\n");
1641 #endif
1642                     ((enumStruct *)context)->noOfClosures++;
1643                     el.cPoints = decomp;
1644                     el.cSize = noOfDec;
1645                     el.noOfCEs = 0;
1646                     el.prefix = el.prefixChars;
1647                     el.prefixSize = 0;
1648
1649                     UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el);
1650                     el.cPoints = comp;
1651                     el.cSize = len;
1652                     el.prefix = el.prefixChars;
1653                     el.prefixSize = 0;
1654                     if(prefix == NULL) {
1655                         el.noOfCEs = 0;
1656                         ucol_setText(colEl, decomp, noOfDec, status);
1657                         while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
1658                             el.noOfCEs++;
1659                         }
1660                     } else {
1661                         el.noOfCEs = 1;
1662                         el.CEs[0] = prefix->mapCE;
1663                         // This character uses a prefix. We have to add it
1664                         // to the unsafe table, as it decomposed form is already
1665                         // in. In Japanese, this happens for \u309e & \u30fe
1666                         // Since unsafeCPSet is static in ucol_elm, we are going
1667                         // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
1668                     }
1669                     uprv_uca_addAnElement(t, &el, status);
1670                 }
1671             }
1672         }
1673     }
1674     return TRUE;
1675 }
1676 U_CDECL_END
1677
1678 static void
1679 uprv_uca_setMapCE(tempUCATable *t, UCAElements *element, UErrorCode *status) {
1680     uint32_t expansion = 0;
1681     int32_t j;
1682
1683     ExpansionTable *expansions = t->expansions;
1684     if(element->noOfCEs == 2 // a two CE expansion
1685         && isContinuation(element->CEs[1]) // which  is a continuation
1686         && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
1687         && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
1688         && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
1689         ) {
1690             element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
1691                 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
1692                 | ((element->CEs[1]>>24) & 0xFF);   // third byte of primary
1693         } else {
1694             expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
1695                 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1696                 & 0xFFFFF0);
1697
1698             for(j = 1; j<(int32_t)element->noOfCEs; j++) {
1699                 uprv_uca_addExpansion(expansions, element->CEs[j], status);
1700             }
1701             if(element->noOfCEs <= 0xF) {
1702                 expansion |= element->noOfCEs;
1703             } else {
1704                 uprv_uca_addExpansion(expansions, 0, status);
1705             }
1706             element->mapCE = expansion;
1707             uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
1708                 (uint8_t)element->noOfCEs,
1709                 t->maxExpansions,
1710                 status);
1711         }
1712 }
1713
1714 static void
1715 uprv_uca_addFCD4AccentedContractions(tempUCATable *t,
1716                                       UCollationElements* colEl,
1717                                       UChar *data,
1718                                       int32_t len,
1719                                       UCAElements *el,
1720                                       UErrorCode *status) {
1721     UChar decomp[256], comp[256];
1722     int32_t decLen, compLen;
1723
1724     decLen = unorm_normalize(data, len, UNORM_NFD, 0, decomp, 256, status);
1725     compLen = unorm_normalize(data, len, UNORM_NFC, 0, comp, 256, status);
1726     decomp[decLen] = comp[compLen] = 0;
1727
1728     el->cPoints = decomp;
1729     el->cSize = decLen;
1730     el->noOfCEs = 0;
1731     el->prefixSize = 0;
1732     el->prefix = el->prefixChars;
1733
1734     UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el);
1735     el->cPoints = comp;
1736     el->cSize = compLen;
1737     el->prefix = el->prefixChars;
1738     el->prefixSize = 0;
1739     if(prefix == NULL) {
1740         el->noOfCEs = 0;
1741         ucol_setText(colEl, decomp, decLen, status);
1742         while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
1743             el->noOfCEs++;
1744         }
1745         uprv_uca_setMapCE(t, el, status);
1746         uprv_uca_addAnElement(t, el, status);
1747     }
1748 }
1749
1750 static void
1751 uprv_uca_addMultiCMContractions(tempUCATable *t,
1752                                 UCollationElements* colEl,
1753                                 tempTailorContext *c,
1754                                 UCAElements *el,
1755                                 UErrorCode *status) {
1756     CombinClassTable *cmLookup = t->cmLookup;
1757     UChar  newDecomp[256];
1758     int32_t maxComp, newDecLen;
1759     const uint16_t  *fcdTrieData = unorm_getFCDTrie(status);
1760     int16_t curClass = (unorm_getFCD16(fcdTrieData, c->tailoringCM) & 0xff);
1761     CompData *precomp = c->precomp;
1762     int32_t  compLen = c->compLen;
1763     UChar *comp = c->comp;
1764     maxComp = c->precompLen;
1765
1766     for (int32_t j=0; j < maxComp; j++) {
1767         int32_t count=0;
1768         do {
1769             if ( count == 0 ) {  // Decompose the saved precomposed char.
1770                 UChar temp[2];
1771                 temp[0]=precomp[j].cp;
1772                 temp[1]=0;
1773                 newDecLen = unorm_normalize(temp, 1, UNORM_NFD, 0,
1774                             newDecomp, sizeof(newDecomp)/sizeof(UChar), status);
1775                 newDecomp[newDecLen++] = cmLookup->cPoints[c->cmPos];
1776             }
1777             else {  // swap 2 combining marks when they are equal.
1778                 uprv_memcpy(newDecomp, c->decomp, sizeof(UChar)*(c->decompLen));
1779                 newDecLen = c->decompLen;
1780                 newDecomp[newDecLen++] = precomp[j].cClass;
1781             }
1782             newDecomp[newDecLen] = 0;
1783             compLen = unorm_normalize(newDecomp, newDecLen, UNORM_NFC, 0,
1784                               comp, 256, status);
1785             if (compLen==1) {
1786                 comp[compLen++] = newDecomp[newDecLen++] = c->tailoringCM;
1787                 comp[compLen] = newDecomp[newDecLen] = 0;
1788                 el->cPoints = newDecomp;
1789                 el->cSize = newDecLen;
1790
1791                 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, el);
1792                 el->cPoints = c->comp;
1793                 el->cSize = compLen;
1794                 el->prefix = el->prefixChars;
1795                 el->prefixSize = 0;
1796                 if(prefix == NULL) {
1797                     el->noOfCEs = 0;
1798                     ucol_setText(colEl, newDecomp, newDecLen, status);
1799                     while((el->CEs[el->noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
1800                         el->noOfCEs++;
1801                     }
1802                     uprv_uca_setMapCE(t, el, status);
1803                     uprv_uca_finalizeAddition(t, el, status);
1804
1805                     // Save the current precomposed char and its class to find any
1806                     // other combining mark combinations.
1807                     precomp[c->precompLen].cp=comp[0];
1808                     precomp[c->precompLen].cClass = curClass;
1809                     c->precompLen++;
1810                 }
1811             }
1812         } while (++count<2 && (precomp[j].cClass == curClass));
1813     }
1814
1815 }
1816
1817 static void
1818 uprv_uca_addTailCanonicalClosures(tempUCATable *t,
1819                                   UCollationElements* colEl,
1820                                   UChar baseCh,
1821                                   UChar cMark,
1822                                   UCAElements *el,
1823                                   UErrorCode *status) {
1824     CombinClassTable *cmLookup = t->cmLookup;
1825     const uint16_t  *fcdTrieData = unorm_getFCDTrie(status);
1826     int16_t maxIndex = (unorm_getFCD16(fcdTrieData, cMark) & 0xff );
1827     UCAElements element;
1828     uint16_t *index;
1829     UChar  decomp[256];
1830     UChar  comp[256];
1831     CompData precomp[256];   // precomposed array
1832     int32_t  precompLen = 0; // count for precomp
1833     int32_t i, len, decompLen, curClass, replacedPos;
1834     tempTailorContext c;
1835
1836     if ( cmLookup == NULL ) {
1837         return;
1838     }
1839     index = cmLookup->index;
1840     int32_t cClass=(unorm_getFCD16(fcdTrieData, cMark) & 0xff);
1841     maxIndex = (int32_t)index[(unorm_getFCD16(fcdTrieData, cMark) & 0xff)-1];
1842     c.comp = comp;
1843     c.decomp = decomp;
1844     c.precomp = precomp;
1845     c.tailoringCM =  cMark;
1846
1847     if (cClass>0) {
1848         maxIndex = (int32_t)index[cClass-1];
1849     }
1850     else {
1851         maxIndex=0;
1852     }
1853     decomp[0]=baseCh;
1854     for ( i=0; i<maxIndex ; i++ ) {
1855         decomp[1] = cmLookup->cPoints[i];
1856         decomp[2]=0;
1857         decompLen=2;
1858         len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status);
1859         if (len==1) {
1860             // Save the current precomposed char and its class to find any
1861             // other combining mark combinations.
1862             precomp[precompLen].cp=comp[0];
1863             curClass = precomp[precompLen].cClass =
1864                        index[unorm_getFCD16(fcdTrieData, decomp[1]) & 0xff];
1865             precompLen++;
1866             replacedPos=0;
1867             for (decompLen=0; decompLen< (int32_t)el->cSize; decompLen++) {
1868                 decomp[decompLen] = el->cPoints[decompLen];
1869                 if (decomp[decompLen]==cMark) {
1870                     replacedPos = decompLen;  // record the position for later use
1871                 }
1872             }
1873             if ( replacedPos != 0 ) {
1874                 decomp[replacedPos]=cmLookup->cPoints[i];
1875             }
1876             decomp[decompLen] = 0;
1877             len = unorm_normalize(decomp, decompLen, UNORM_NFC, 0, comp, 256, status);
1878             comp[len++] = decomp[decompLen++] = cMark;
1879             comp[len] = decomp[decompLen] = 0;
1880             element.cPoints = decomp;
1881             element.cSize = decompLen;
1882             element.noOfCEs = 0;
1883             element.prefix = el->prefixChars;
1884             element.prefixSize = 0;
1885
1886             UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &element);
1887             element.cPoints = comp;
1888             element.cSize = len;
1889             element.prefix = el->prefixChars;
1890             element.prefixSize = 0;
1891             if(prefix == NULL) {
1892                 element.noOfCEs = 0;
1893                 ucol_setText(colEl, decomp, decompLen, status);
1894                 while((element.CEs[element.noOfCEs] = ucol_next(colEl, status)) != (uint32_t)UCOL_NULLORDER) {
1895                     element.noOfCEs++;
1896                 }
1897                 uprv_uca_setMapCE(t, &element, status);
1898                 uprv_uca_finalizeAddition(t, &element, status);
1899             }
1900
1901             // This is a fix for tailoring contractions with accented
1902             // character at the end of contraction string.
1903             if ((len>2) &&
1904                 (unorm_getFCD16(fcdTrieData, comp[len-2]) & 0xff00)==0) {
1905                 uprv_uca_addFCD4AccentedContractions(t, colEl, comp, len, &element, status);
1906             }
1907
1908             if (precompLen >1) {
1909                 c.compLen = len;
1910                 c.decompLen = decompLen;
1911                 c.precompLen = precompLen;
1912                 c.cmPos = i;
1913                 uprv_uca_addMultiCMContractions(t, colEl, &c, &element, status);
1914                 precompLen = c.precompLen;
1915             }
1916         }
1917     }
1918 }
1919
1920 U_CFUNC int32_t U_EXPORT2
1921 uprv_uca_canonicalClosure(tempUCATable *t,
1922                           UColTokenParser *src,
1923                           UErrorCode *status)
1924 {
1925     enumStruct context;
1926     context.noOfClosures = 0;
1927     UCAElements el;
1928     UColToken *tok;
1929     uint32_t i = 0, j = 0;
1930     UChar  baseChar, firstCM;
1931     const uint16_t  *fcdTrieData = unorm_getFCDTrie(status);
1932
1933     if(!U_SUCCESS(*status)) {
1934         return 0;
1935     }
1936
1937     UCollator *tempColl = NULL;
1938     tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status);
1939     // Check for null pointer
1940     if (U_FAILURE(*status)) {
1941         return 0;
1942     }
1943
1944     UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status);
1945     tempColl = ucol_initCollator(tempData, 0, t->UCA, status);
1946     if ( tempTable->cmLookup != NULL ) {
1947         t->cmLookup = tempTable->cmLookup;  // copy over to t
1948         tempTable->cmLookup = NULL;
1949     }
1950     uprv_uca_closeTempTable(tempTable);
1951
1952     if(U_SUCCESS(*status)) {
1953         tempColl->ucaRules = NULL;
1954         tempColl->actualLocale = NULL;
1955         tempColl->validLocale = NULL;
1956         tempColl->requestedLocale = NULL;
1957         tempColl->hasRealData = TRUE;
1958         tempColl->freeImageOnClose = TRUE;
1959     } else if(tempData != 0) {
1960         uprv_free(tempData);
1961     }
1962
1963     /* produce canonical closure */
1964     UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status);
1965     // Check for null pointer
1966     if (U_FAILURE(*status)) {
1967         return 0;
1968     }
1969     context.t = t;
1970     context.tempColl = tempColl;
1971     context.colEl = colEl;
1972     context.status = status;
1973     u_enumCharTypes(_enumCategoryRangeClosureCategory, &context);
1974
1975     if ( (src==NULL) || !src->buildCCTabFlag ) {
1976         ucol_closeElements(colEl);
1977         ucol_close(tempColl);
1978         return context.noOfClosures;  // no extra contraction needed to add
1979     }
1980
1981     for (i=0; i < src->resultLen; i++) {
1982         baseChar = firstCM= (UChar)0;
1983         tok = src->lh[i].first;
1984         while (tok != NULL && U_SUCCESS(*status)) {
1985             el.prefix = el.prefixChars;
1986             el.cPoints = el.uchars;
1987             if(tok->prefix != 0) {
1988                 el.prefixSize = tok->prefix>>24;
1989                 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
1990
1991                 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
1992                 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
1993             } else {
1994                 el.prefixSize = 0;
1995                 *el.prefix = 0;
1996
1997                 el.cSize = (tok->source >> 24);
1998                 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
1999             }
2000             if(src->UCA != NULL) {
2001                 for(j = 0; j<el.cSize; j++) {
2002                     int16_t fcd = unorm_getFCD16(fcdTrieData, el.cPoints[j]);
2003                     if ( (fcd & 0xff) == 0 ) {
2004                         baseChar = el.cPoints[j];  // last base character
2005                         firstCM=0;  // reset combining mark value
2006                     }
2007                     else {
2008                         if ( (baseChar!=0) && (firstCM==0) ) {
2009                             firstCM = el.cPoints[j];  // first combining mark
2010                         }
2011                     }
2012                 }
2013             }
2014             if ( (baseChar!= (UChar)0) && (firstCM != (UChar)0) ) {
2015                 // find all the canonical rules
2016                 uprv_uca_addTailCanonicalClosures(t, colEl, baseChar, firstCM, &el, status);
2017             }
2018             tok = tok->next;
2019         }
2020     }
2021     ucol_closeElements(colEl);
2022     ucol_close(tempColl);
2023
2024     return context.noOfClosures;
2025 }
2026
2027 #endif /* #if !UCONFIG_NO_COLLATION */