]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_elm.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_elm.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2001-2003, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucaelems.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
16* This program reads the Fractional UCA table and generates
17* internal format for UCA table as well as inverse UCA table.
18* It then writes binary files containing the data: ucadata.dat
19* & invuca.dat
20*
21* date name comments
22* 03/02/2001 synwee added setMaxExpansion
23* 03/07/2001 synwee merged UCA's maxexpansion and tailoring's
24*/
25
26#include "unicode/utypes.h"
27
28#if !UCONFIG_NO_COLLATION
29
30#include "unicode/uchar.h"
31#include "unicode/unistr.h"
32#include "unicode/ucoleitr.h"
33#include "unicode/normlzr.h"
34#include "ucol_elm.h"
35#include "unormimp.h"
36#include "unicode/caniter.h"
37#include "cmemory.h"
38
39U_NAMESPACE_BEGIN
40
41static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status);
42
43U_CDECL_BEGIN
44static int32_t U_EXPORT2 U_CALLCONV
45prefixLookupHash(const UHashTok e) {
46 UCAElements *element = (UCAElements *)e.pointer;
47 UChar buf[256];
48 UHashTok key;
49 key.pointer = buf;
50 uprv_memcpy(buf, element->cPoints, element->cSize*sizeof(UChar));
51 buf[element->cSize] = 0;
52 //key.pointer = element->cPoints;
53 //element->cPoints[element->cSize] = 0;
54 return uhash_hashUChars(key);
55}
56
57static int8_t U_EXPORT2 U_CALLCONV
58prefixLookupComp(const UHashTok e1, const UHashTok e2) {
59 UCAElements *element1 = (UCAElements *)e1.pointer;
60 UCAElements *element2 = (UCAElements *)e2.pointer;
61
62 UChar buf1[256];
63 UHashTok key1;
64 key1.pointer = buf1;
65 uprv_memcpy(buf1, element1->cPoints, element1->cSize*sizeof(UChar));
66 buf1[element1->cSize] = 0;
67
68 UChar buf2[256];
69 UHashTok key2;
70 key2.pointer = buf2;
71 uprv_memcpy(buf2, element2->cPoints, element2->cSize*sizeof(UChar));
72 buf2[element2->cSize] = 0;
73
74 return uhash_compareUChars(key1, key2);
75}
76U_CDECL_END
77
78static int32_t uprv_uca_addExpansion(ExpansionTable *expansions, uint32_t value, UErrorCode *status) {
79 if(U_FAILURE(*status)) {
80 return 0;
81 }
82 if(expansions->CEs == NULL) {
83 expansions->CEs = (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE*sizeof(uint32_t));
84 /* test for NULL */
85 if (expansions->CEs == NULL) {
86 *status = U_MEMORY_ALLOCATION_ERROR;
87 return 0;
88 }
89 expansions->size = INIT_EXP_TABLE_SIZE;
90 expansions->position = 0;
91 }
92
93 if(expansions->position == expansions->size) {
94 uint32_t *newData = (uint32_t *)uprv_realloc(expansions->CEs, 2*expansions->size*sizeof(uint32_t));
95 if(newData == NULL) {
96#ifdef UCOL_DEBUG
97 fprintf(stderr, "out of memory for expansions\n");
98#endif
99 *status = U_MEMORY_ALLOCATION_ERROR;
100 return -1;
101 }
102 expansions->CEs = newData;
103 expansions->size *= 2;
104 }
105
106 expansions->CEs[expansions->position] = value;
107 return(expansions->position++);
108}
109
110U_CAPI tempUCATable* U_EXPORT2
111uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts, const UCollator *UCA, UColCETags initTag, UErrorCode *status) {
112 tempUCATable *t = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
113 /* test for NULL */
114 if (t == NULL) {
115 *status = U_MEMORY_ALLOCATION_ERROR;
116 return NULL;
117 }
118 MaxExpansionTable *maxet = (MaxExpansionTable *)uprv_malloc(
119 sizeof(MaxExpansionTable));
120 /* test for NULL */
121 if (maxet == NULL) {
122 *status = U_MEMORY_ALLOCATION_ERROR;
123 uprv_free(t);
124 return NULL;
125 }
126 MaxJamoExpansionTable *maxjet = (MaxJamoExpansionTable *)uprv_malloc(
127 sizeof(MaxJamoExpansionTable));
128 /* test for NULL */
129 if (maxjet == NULL) {
130 *status = U_MEMORY_ALLOCATION_ERROR;
131 uprv_free(t);
132 uprv_free(maxet);
133 return NULL;
134 }
135 t->image = image;
136 t->options = opts;
137
138 t->UCA = UCA;
139 t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
140 /* test for NULL */
141 if (t->expansions == NULL) {
142 *status = U_MEMORY_ALLOCATION_ERROR;
143 uprv_free(t);
144 uprv_free(maxet);
145 uprv_free(maxjet);
146 return NULL;
147 }
148 uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
149 /*t->mapping = ucmpe32_open(UCOL_SPECIAL_FLAG | (initTag<<24), UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG<<24), status);*/
150 t->mapping = utrie_open(NULL, NULL, 0x100000, UCOL_SPECIAL_FLAG | (initTag<<24), TRUE); // Do your own mallocs for the structure, array and have linear Latin 1
151 t->prefixLookup = uhash_open(prefixLookupHash, prefixLookupComp, status);
152 uhash_setValueDeleter(t->prefixLookup, uhash_freeBlock);
153
154 t->contractions = uprv_cnttab_open(t->mapping, status);
155
156 /* copy UCA's maxexpansion and merge as we go along */
157 t->maxExpansions = maxet;
158 if (UCA != NULL) {
159 /* adding an extra initial value for easier manipulation */
160 maxet->size = (UCA->lastEndExpansionCE - UCA->endExpansionCE)
161 + 2;
162 maxet->position = maxet->size - 1;
163 maxet->endExpansionCE =
164 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet->size);
165 /* test for NULL */
166 if (maxet->endExpansionCE == NULL) {
167 *status = U_MEMORY_ALLOCATION_ERROR;
168 return NULL;
169 }
170 maxet->expansionCESize =
171 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet->size);
172 /* test for NULL */
173 if (maxet->expansionCESize == NULL) {
174 *status = U_MEMORY_ALLOCATION_ERROR;
175 uprv_free(maxet->endExpansionCE);
176 return NULL;
177 }
178 /* initialized value */
179 *(maxet->endExpansionCE) = 0;
180 *(maxet->expansionCESize) = 0;
181 uprv_memcpy(maxet->endExpansionCE + 1, UCA->endExpansionCE,
182 sizeof(uint32_t) * (maxet->size - 1));
183 uprv_memcpy(maxet->expansionCESize + 1, UCA->expansionCESize,
184 sizeof(uint8_t) * (maxet->size - 1));
185 }
186 else {
187 maxet->size = 0;
188 }
189 t->maxJamoExpansions = maxjet;
190 maxjet->endExpansionCE = NULL;
191 maxjet->isV = NULL;
192 maxjet->size = 0;
193 maxjet->position = 0;
194 maxjet->maxLSize = 1;
195 maxjet->maxVSize = 1;
196 maxjet->maxTSize = 1;
197
198 t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
199 /* test for NULL */
200 if (t->unsafeCP == NULL) {
201 *status = U_MEMORY_ALLOCATION_ERROR;
202 return NULL;
203 }
204 t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
205 /* test for NULL */
206 if (t->contrEndCP == NULL) {
207 *status = U_MEMORY_ALLOCATION_ERROR;
208 uprv_free(t->unsafeCP);
209 return NULL;
210 }
211 uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
212 uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
213return t;
214}
215
216U_CAPI tempUCATable* U_EXPORT2
217uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
218 if(U_FAILURE(*status)) {
219 return NULL;
220 }
221
222 tempUCATable *r = (tempUCATable *)uprv_malloc(sizeof(tempUCATable));
223 /* test for NULL */
224 if (r == NULL) {
225 *status = U_MEMORY_ALLOCATION_ERROR;
226 return NULL;
227 }
228 uprv_memset(r, 0, sizeof(tempUCATable));
229
230 /* mapping */
231 if(t->mapping != NULL) {
232 /*r->mapping = ucmpe32_clone(t->mapping, status);*/
233 r->mapping = utrie_clone(NULL, t->mapping, NULL, 0);
234 }
235
236 // a hashing clone function would be very nice. We have none currently...
237 // However, we should be good, as closing should not produce any prefixed elements.
238 r->prefixLookup = NULL; // prefixes are not used in closing
239
240 /* expansions */
241 if(t->expansions != NULL) {
242 r->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
243 /* test for NULL */
244 if (r->expansions == NULL) {
245 *status = U_MEMORY_ALLOCATION_ERROR;
246 return NULL;
247 }
248 r->expansions->position = t->expansions->position;
249 r->expansions->size = t->expansions->size;
250 if(t->expansions->CEs != NULL) {
251 r->expansions->CEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->expansions->size);
252 /* test for NULL */
253 if (r->expansions->CEs == NULL) {
254 *status = U_MEMORY_ALLOCATION_ERROR;
255 return NULL;
256 }
257 uprv_memcpy(r->expansions->CEs, t->expansions->CEs, sizeof(uint32_t)*t->expansions->size);
258 } else {
259 r->expansions->CEs = NULL;
260 }
261 }
262
263 if(t->contractions != NULL) {
264 r->contractions = uprv_cnttab_clone(t->contractions, status);
265 r->contractions->mapping = r->mapping;
266 }
267
268 if(t->maxExpansions != NULL) {
269 r->maxExpansions = (MaxExpansionTable *)uprv_malloc(sizeof(MaxExpansionTable));
270 /* test for NULL */
271 if (r->maxExpansions == NULL) {
272 *status = U_MEMORY_ALLOCATION_ERROR;
273 return NULL;
274 }
275 r->maxExpansions->size = t->maxExpansions->size;
276 r->maxExpansions->position = t->maxExpansions->position;
277 if(t->maxExpansions->endExpansionCE != NULL) {
278 r->maxExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxExpansions->size);
279 /* test for NULL */
280 if (r->maxExpansions->endExpansionCE == NULL) {
281 *status = U_MEMORY_ALLOCATION_ERROR;
282 return NULL;
283 }
284 uprv_memcpy(r->maxExpansions->endExpansionCE, t->maxExpansions->endExpansionCE, t->maxExpansions->size*sizeof(uint32_t));
285 } else {
286 r->maxExpansions->endExpansionCE = NULL;
287 }
288 if(t->maxExpansions->expansionCESize != NULL) {
289 r->maxExpansions->expansionCESize = (uint8_t *)uprv_malloc(sizeof(uint8_t)*t->maxExpansions->size);
290 /* test for NULL */
291 if (r->maxExpansions->expansionCESize == NULL) {
292 *status = U_MEMORY_ALLOCATION_ERROR;
293 return NULL;
294 }
295 uprv_memcpy(r->maxExpansions->expansionCESize, t->maxExpansions->expansionCESize, t->maxExpansions->size*sizeof(uint8_t));
296 } else {
297 r->maxExpansions->expansionCESize = NULL;
298 }
299 }
300
301 if(t->maxJamoExpansions != NULL) {
302 r->maxJamoExpansions = (MaxJamoExpansionTable *)uprv_malloc(sizeof(MaxJamoExpansionTable));
303 /* test for NULL */
304 if (r->maxJamoExpansions == NULL) {
305 *status = U_MEMORY_ALLOCATION_ERROR;
306 return NULL;
307 }
308 r->maxJamoExpansions->size = t->maxJamoExpansions->size;
309 r->maxJamoExpansions->position = t->maxJamoExpansions->position;
310 r->maxJamoExpansions->maxLSize = t->maxJamoExpansions->maxLSize;
311 r->maxJamoExpansions->maxVSize = t->maxJamoExpansions->maxVSize;
312 r->maxJamoExpansions->maxTSize = t->maxJamoExpansions->maxTSize;
313 if(t->maxJamoExpansions->size != 0) {
314 r->maxJamoExpansions->endExpansionCE = (uint32_t *)uprv_malloc(sizeof(uint32_t)*t->maxJamoExpansions->size);
315 /* test for NULL */
316 if (r->maxJamoExpansions->endExpansionCE == NULL) {
317 *status = U_MEMORY_ALLOCATION_ERROR;
318 return NULL;
319 }
320 uprv_memcpy(r->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->endExpansionCE, t->maxJamoExpansions->size*sizeof(uint32_t));
321 r->maxJamoExpansions->isV = (UBool *)uprv_malloc(sizeof(UBool)*t->maxJamoExpansions->size);
322 /* test for NULL */
323 if (r->maxJamoExpansions->isV == NULL) {
324 *status = U_MEMORY_ALLOCATION_ERROR;
325 return NULL;
326 }
327 uprv_memcpy(r->maxJamoExpansions->isV, t->maxJamoExpansions->isV, t->maxJamoExpansions->size*sizeof(UBool));
328 } else {
329 r->maxJamoExpansions->endExpansionCE = NULL;
330 r->maxJamoExpansions->isV = NULL;
331 }
332 }
333
334 if(t->unsafeCP != NULL) {
335 r->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
336 /* test for NULL */
337 if (r->unsafeCP == NULL) {
338 *status = U_MEMORY_ALLOCATION_ERROR;
339 return NULL;
340 }
341 uprv_memcpy(r->unsafeCP, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
342 }
343
344 if(t->contrEndCP != NULL) {
345 r->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
346 /* test for NULL */
347 if (r->contrEndCP == NULL) {
348 *status = U_MEMORY_ALLOCATION_ERROR;
349 return NULL;
350 }
351 uprv_memcpy(r->contrEndCP, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
352 }
353
354 r->UCA = t->UCA;
355 r->image = t->image;
356 r->options = t->options;
357
358 return r;
359}
360
361
362U_CAPI void U_EXPORT2
363uprv_uca_closeTempTable(tempUCATable *t) {
364 if(t != NULL) {
365 uprv_free(t->expansions->CEs);
366 uprv_free(t->expansions);
367 if(t->contractions != NULL) {
368 uprv_cnttab_close(t->contractions);
369 }
370 /*ucmpe32_close(t->mapping);*/
371 utrie_close(t->mapping);
372
373 if(t->prefixLookup != NULL) {
374 uhash_close(t->prefixLookup);
375 }
376
377 uprv_free(t->maxExpansions->endExpansionCE);
378 uprv_free(t->maxExpansions->expansionCESize);
379 uprv_free(t->maxExpansions);
380
381 if (t->maxJamoExpansions->size > 0) {
382 uprv_free(t->maxJamoExpansions->endExpansionCE);
383 uprv_free(t->maxJamoExpansions->isV);
384 }
385 uprv_free(t->maxJamoExpansions);
386
387 uprv_free(t->unsafeCP);
388 uprv_free(t->contrEndCP);
389
390 uprv_free(t);
391 }
392}
393
394/**
395* Looks for the maximum length of all expansion sequences ending with the same
396* collation element. The size required for maxexpansion and maxsize is
397* returned if the arrays are too small.
398* @param endexpansion the last expansion collation element to be added
399* @param expansionsize size of the expansion
400* @param maxexpansion data structure to store the maximum expansion data.
401* @param status error status
402* @returns size of the maxexpansion and maxsize used.
403*/
404int uprv_uca_setMaxExpansion(uint32_t endexpansion,
405 uint8_t expansionsize,
406 MaxExpansionTable *maxexpansion,
407 UErrorCode *status)
408{
409 if (maxexpansion->size == 0) {
410 /* we'll always make the first element 0, for easier manipulation */
411 maxexpansion->endExpansionCE =
412 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(int32_t));
413 /* test for NULL */
414 if (maxexpansion->endExpansionCE == NULL) {
415 *status = U_MEMORY_ALLOCATION_ERROR;
416 return 0;
417 }
418 *(maxexpansion->endExpansionCE) = 0;
419 maxexpansion->expansionCESize =
420 (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint8_t));
421 /* test for NULL */;
422 if (maxexpansion->expansionCESize == NULL) {
423 *status = U_MEMORY_ALLOCATION_ERROR;
424 return 0;
425 }
426 *(maxexpansion->expansionCESize) = 0;
427 maxexpansion->size = INIT_EXP_TABLE_SIZE;
428 maxexpansion->position = 0;
429 }
430
431 if (maxexpansion->position + 1 == maxexpansion->size) {
432 uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE,
433 2 * maxexpansion->size * sizeof(uint32_t));
434 uint8_t *neweces = (uint8_t *)uprv_realloc(maxexpansion->expansionCESize,
435 2 * maxexpansion->size * sizeof(uint8_t));
436 if (neweece == NULL || neweces == NULL) {
437#ifdef UCOL_DEBUG
438 fprintf(stderr, "out of memory for maxExpansions\n");
439#endif
440 *status = U_MEMORY_ALLOCATION_ERROR;
441 return -1;
442 }
443 maxexpansion->endExpansionCE = neweece;
444 maxexpansion->expansionCESize = neweces;
445 maxexpansion->size *= 2;
446 }
447
448 uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
449 uint8_t *pexpansionsize = maxexpansion->expansionCESize;
450 int pos = maxexpansion->position;
451
452 uint32_t *start = pendexpansionce;
453 uint32_t *limit = pendexpansionce + pos;
454
455 /* using binary search to determine if last expansion element is
456 already in the array */
457 uint32_t *mid;
458 int result = -1;
459 while (start < limit - 1) {
460 mid = start + ((limit - start) >> 1);
461 if (endexpansion <= *mid) {
462 limit = mid;
463 }
464 else {
465 start = mid;
466 }
467 }
468
469 if (*start == endexpansion) {
470 result = start - pendexpansionce;
471 }
472 else
473 if (*limit == endexpansion) {
474 result = limit - pendexpansionce;
475 }
476
477 if (result > -1) {
478 /* found the ce in expansion, we'll just modify the size if it is
479 smaller */
480 uint8_t *currentsize = pexpansionsize + result;
481 if (*currentsize < expansionsize) {
482 *currentsize = expansionsize;
483 }
484 }
485 else {
486 /* we'll need to squeeze the value into the array.
487 initial implementation. */
488 /* shifting the subarray down by 1 */
489 int shiftsize = (pendexpansionce + pos) - start;
490 uint32_t *shiftpos = start + 1;
491 uint8_t *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
492
493 /* okay need to rearrange the array into sorted order */
494 if (shiftsize == 0 || *(pendexpansionce + pos) < endexpansion) {
495 *(pendexpansionce + pos + 1) = endexpansion;
496 *(pexpansionsize + pos + 1) = expansionsize;
497 }
498 else {
499 uprv_memmove(shiftpos + 1, shiftpos, shiftsize * sizeof(int32_t));
500 uprv_memmove(sizeshiftpos + 1, sizeshiftpos,
501 shiftsize * sizeof(uint8_t));
502 *shiftpos = endexpansion;
503 *sizeshiftpos = expansionsize;
504 }
505 maxexpansion->position ++;
506
507#ifdef UCOL_DEBUG
508 int temp;
509 UBool found = FALSE;
510 for (temp = 0; temp < maxexpansion->position; temp ++) {
511 if (pendexpansionce[temp] >= pendexpansionce[temp + 1]) {
512 fprintf(stderr, "expansions %d\n", temp);
513 }
514 if (pendexpansionce[temp] == endexpansion) {
515 found =TRUE;
516 if (pexpansionsize[temp] < expansionsize) {
517 fprintf(stderr, "expansions size %d\n", temp);
518 }
519 }
520 }
521 if (pendexpansionce[temp] == endexpansion) {
522 found =TRUE;
523 if (pexpansionsize[temp] < expansionsize) {
524 fprintf(stderr, "expansions size %d\n", temp);
525 }
526 }
527 if (!found)
528 fprintf(stderr, "expansion not found %d\n", temp);
529#endif
530 }
531
532 return maxexpansion->position;
533}
534
535/**
536* Sets the maximum length of all jamo expansion sequences ending with the same
537* collation element. The size required for maxexpansion and maxsize is
538* returned if the arrays are too small.
539* @param ch the jamo codepoint
540* @param endexpansion the last expansion collation element to be added
541* @param expansionsize size of the expansion
542* @param maxexpansion data structure to store the maximum expansion data.
543* @param status error status
544* @returns size of the maxexpansion and maxsize used.
545*/
546int uprv_uca_setMaxJamoExpansion(UChar ch,
547 uint32_t endexpansion,
548 uint8_t expansionsize,
549 MaxJamoExpansionTable *maxexpansion,
550 UErrorCode *status)
551{
552 UBool isV = TRUE;
553 if (((uint32_t)ch - 0x1100) <= (0x1112 - 0x1100)) {
554 /* determines L for Jamo, doesn't need to store this since it is never
555 at the end of a expansion */
556 if (maxexpansion->maxLSize < expansionsize) {
557 maxexpansion->maxLSize = expansionsize;
558 }
559 return maxexpansion->position;
560 }
561
562 if (((uint32_t)ch - 0x1161) <= (0x1175 - 0x1161)) {
563 /* determines V for Jamo */
564 if (maxexpansion->maxVSize < expansionsize) {
565 maxexpansion->maxVSize = expansionsize;
566 }
567 }
568
569 if (((uint32_t)ch - 0x11A8) <= (0x11C2 - 0x11A8)) {
570 isV = FALSE;
571 /* determines T for Jamo */
572 if (maxexpansion->maxTSize < expansionsize) {
573 maxexpansion->maxTSize = expansionsize;
574 }
575 }
576
577 if (maxexpansion->size == 0) {
578 /* we'll always make the first element 0, for easier manipulation */
579 maxexpansion->endExpansionCE =
580 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(uint32_t));
581 /* test for NULL */;
582 if (maxexpansion->endExpansionCE == NULL) {
583 *status = U_MEMORY_ALLOCATION_ERROR;
584 return 0;
585 }
586 *(maxexpansion->endExpansionCE) = 0;
587 maxexpansion->isV =
588 (UBool *)uprv_malloc(INIT_EXP_TABLE_SIZE * sizeof(UBool));
589 /* test for NULL */;
590 if (maxexpansion->isV == NULL) {
591 *status = U_MEMORY_ALLOCATION_ERROR;
592 return 0;
593 }
594 *(maxexpansion->isV) = 0;
595 maxexpansion->size = INIT_EXP_TABLE_SIZE;
596 maxexpansion->position = 0;
597 }
598
599 if (maxexpansion->position + 1 == maxexpansion->size) {
600 uint32_t *neweece = (uint32_t *)uprv_realloc(maxexpansion->endExpansionCE,
601 2 * maxexpansion->size * sizeof(uint32_t));
602 UBool *newisV = (UBool *)uprv_realloc(maxexpansion->isV,
603 2 * maxexpansion->size * sizeof(UBool));
604 if (neweece == NULL || newisV == NULL) {
605#ifdef UCOL_DEBUG
606 fprintf(stderr, "out of memory for maxExpansions\n");
607#endif
608 *status = U_MEMORY_ALLOCATION_ERROR;
609 return -1;
610 }
611 maxexpansion->endExpansionCE = neweece;
612 maxexpansion->isV = newisV;
613 maxexpansion->size *= 2;
614 }
615
616 uint32_t *pendexpansionce = maxexpansion->endExpansionCE;
617 int pos = maxexpansion->position;
618
619 while (pos > 0) {
620 pos --;
621 if (*(pendexpansionce + pos) == endexpansion) {
622 return maxexpansion->position;
623 }
624 }
625
626 *(pendexpansionce + maxexpansion->position) = endexpansion;
627 *(maxexpansion->isV + maxexpansion->position) = isV;
628 maxexpansion->position ++;
629
630 return maxexpansion->position;
631}
632
633
634static void ContrEndCPSet(uint8_t *table, UChar c) {
635 uint32_t hash;
636 uint8_t *htByte;
637
638 hash = c;
639 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
640 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
641 }
642 htByte = &table[hash>>3];
643 *htByte |= (1 << (hash & 7));
644}
645
646
647static void unsafeCPSet(uint8_t *table, UChar c) {
648 uint32_t hash;
649 uint8_t *htByte;
650
651 hash = c;
652 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
653 if (hash >= 0xd800 && hash <= 0xf8ff) {
654 /* Part of a surrogate, or in private use area. */
655 /* These don't go in the table */
656 return;
657 }
658 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
659 }
660 htByte = &table[hash>>3];
661 *htByte |= (1 << (hash & 7));
662}
663
664
665/* to the UnsafeCP hash table, add all chars with combining class != 0 */
666static void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t, UErrorCode *status) {
667
668 UChar c;
669 uint16_t fcd; // Hi byte is lead combining class.
670 // lo byte is trailing combing class.
671 const uint16_t *fcdTrieData;
672
673 fcdTrieData = unorm_getFCDTrie(status);
674 if (U_FAILURE(*status)) {
675 return;
676 }
677
678 for (c=0; c<0xffff; c++) {
679 fcd = unorm_getFCD16(fcdTrieData, c);
680 if (fcd >= 0x100 || // if the leading combining class(c) > 0 ||
681 (UTF_IS_LEAD(c) && fcd != 0)) // c is a leading surrogate with some FCD data
682 unsafeCPSet(t->unsafeCP, c);
683 }
684
685 if(t->prefixLookup != NULL) {
686 int32_t i = -1;
687 const UHashElement *e = NULL;
688 UCAElements *element = NULL;
689 UChar NFCbuf[256];
690 uint32_t NFCbufLen = 0;
691 while((e = uhash_nextElement(t->prefixLookup, &i)) != NULL) {
692 element = (UCAElements *)e->value.pointer;
693 // codepoints here are in the NFD form. We need to add the
694 // first code point of the NFC form to unsafe, because
695 // strcoll needs to backup over them.
696 NFCbufLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0,
697 NFCbuf, 256, status);
698 unsafeCPSet(t->unsafeCP, NFCbuf[0]);
699 }
700 }
701}
702
703uint32_t uprv_uca_addPrefix(tempUCATable *t, uint32_t CE,
704 UCAElements *element, UErrorCode *status) {
705 // currently the longest prefix we're supporting in Japanese is two characters
706 // long. Although this table could quite easily mimic complete contraction stuff
707 // there is no good reason to make a general solution, as it would require some
708 // error prone messing.
709 CntTable *contractions = t->contractions;
710 UChar32 cp;
711 uint32_t cpsize = 0;
712 UChar *oldCP = element->cPoints;
713 uint32_t oldCPSize = element->cSize;
714
715
716 contractions->currentTag = SPEC_PROC_TAG;
717
718 // here, we will normalize & add prefix to the table.
719 uint32_t j = 0;
720#ifdef UCOL_DEBUG
721 for(j=0; j<element->cSize; j++) {
722 fprintf(stdout, "CP: %04X ", element->cPoints[j]);
723 }
724 fprintf(stdout, "El: %08X Pref: ", CE);
725 for(j=0; j<element->prefixSize; j++) {
726 fprintf(stdout, "%04X ", element->prefix[j]);
727 }
728 fprintf(stdout, "%08X ", element->mapCE);
729#endif
730
731 for (j = 1; j<element->prefixSize; j++) { /* First add NFD prefix chars to unsafe CP hash table */
732 // Unless it is a trail surrogate, which is handled algoritmically and
733 // shouldn't take up space in the table.
734 if(!(UTF_IS_TRAIL(element->prefix[j]))) {
735 unsafeCPSet(t->unsafeCP, element->prefix[j]);
736 }
737 }
738
739 UChar tempPrefix = 0;
740
741 for(j = 0; j < /*nfcSize*/element->prefixSize/2; j++) { // prefixes are going to be looked up backwards
742 // therefore, we will promptly reverse the prefix buffer...
743 tempPrefix = *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1);
744 *(/*nfcBuffer*/element->prefix+element->prefixSize-j-1) = element->prefix[j];
745 element->prefix[j] = tempPrefix;
746 }
747
748#ifdef UCOL_DEBUG
749 fprintf(stdout, "Reversed: ");
750 for(j=0; j<element->prefixSize; j++) {
751 fprintf(stdout, "%04X ", element->prefix[j]);
752 }
753 fprintf(stdout, "%08X\n", element->mapCE);
754#endif
755
756 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix
757 if(!(UTF_IS_TRAIL(element->cPoints[0]))) {
758 unsafeCPSet(t->unsafeCP, element->cPoints[0]);
759 }
760
761 // Maybe we need this... To handle prefixes completely in the forward direction...
762 //if(element->cSize == 1) {
763 // if(!(UTF_IS_TRAIL(element->cPoints[0]))) {
764 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]);
765 // }
766 //}
767
768 element->cPoints = element->prefix;
769 element->cSize = element->prefixSize;
770
771 // Add the last char of the contraction to the contraction-end hash table.
772 // unless it is a trail surrogate, which is handled algorithmically and
773 // shouldn't be in the table
774 if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
775 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
776 }
777
778 // First we need to check if contractions starts with a surrogate
779 UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
780
781 // If there are any Jamos in the contraction, we should turn on special
782 // processing for Jamos
783 if(UCOL_ISJAMO(element->prefix[0])) {
784 t->image->jamoSpecial = TRUE;
785 }
786 /* then we need to deal with it */
787 /* we could aready have something in table - or we might not */
788
789 if(!isPrefix(CE)) {
790 /* if it wasn't contraction, we wouldn't end up here*/
791 int32_t firstContractionOffset = 0;
792 int32_t contractionOffset = 0;
793 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
794 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
795 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->prefix, newCE, status);
796 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
797 CE = constructContractCE(SPEC_PROC_TAG, firstContractionOffset);
798 } else { /* we are adding to existing contraction */
799 /* there were already some elements in the table, so we need to add a new contraction */
800 /* Two things can happen here: either the codepoint is already in the table, or it is not */
801 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->prefix, status);
802 if(position > 0) { /* if it is we just continue down the chain */
803 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
804 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
805 uprv_cnttab_setContraction(contractions, CE, position, *(element->prefix), newCE, status);
806 } else { /* if it isn't, we will have to create a new sequence */
807 uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
808 uprv_cnttab_insertContraction(contractions, CE, *(element->prefix), element->mapCE, status);
809 }
810 }
811
812 element->cPoints = oldCP;
813 element->cSize = oldCPSize;
814
815 return CE;
816}
817
818// Note regarding surrogate handling: We are interested only in the single
819// or leading surrogates in a contraction. If a surrogate is somewhere else
820// in the contraction, it is going to be handled as a pair of code units,
821// as it doesn't affect the performance AND handling surrogates specially
822// would complicate code way too much.
823uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
824 UCAElements *element, UErrorCode *status) {
825 CntTable *contractions = t->contractions;
826 UChar32 cp;
827 uint32_t cpsize = 0;
828
829 contractions->currentTag = CONTRACTION_TAG;
830
831 // First we need to check if contractions starts with a surrogate
832 UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
833
834 if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
835 uint32_t j = 0;
836 for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */
837 // Unless it is a trail surrogate, which is handled algoritmically and
838 // shouldn't take up space in the table.
839 if(!(UTF_IS_TRAIL(element->cPoints[j]))) {
840 unsafeCPSet(t->unsafeCP, element->cPoints[j]);
841 }
842 }
843 // Add the last char of the contraction to the contraction-end hash table.
844 // unless it is a trail surrogate, which is handled algorithmically and
845 // shouldn't be in the table
846 if(!(UTF_IS_TRAIL(element->cPoints[element->cSize -1]))) {
847 ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
848 }
849
850 // If there are any Jamos in the contraction, we should turn on special
851 // processing for Jamos
852 if(UCOL_ISJAMO(element->cPoints[0])) {
853 t->image->jamoSpecial = TRUE;
854 }
855 /* then we need to deal with it */
856 /* we could aready have something in table - or we might not */
857 element->cPoints+=cpsize;
858 element->cSize-=cpsize;
859 if(!isContraction(CE)) {
860 /* if it wasn't contraction, we wouldn't end up here*/
861 int32_t firstContractionOffset = 0;
862 int32_t contractionOffset = 0;
863 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
864 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
865 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
866 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
867 CE = constructContractCE(CONTRACTION_TAG, firstContractionOffset);
868 } else { /* we are adding to existing contraction */
869 /* there were already some elements in the table, so we need to add a new contraction */
870 /* Two things can happen here: either the codepoint is already in the table, or it is not */
871 int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
872 if(position > 0) { /* if it is we just continue down the chain */
873 uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
874 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
875 uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
876 } else { /* if it isn't, we will have to create a new sequence */
877 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
878 uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
879 }
880 }
881 element->cPoints-=cpsize;
882 element->cSize+=cpsize;
883 /*ucmpe32_set(t->mapping, cp, CE);*/
884 utrie_set32(t->mapping, cp, CE);
885 } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
886 /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
887 utrie_set32(t->mapping, cp, element->mapCE);
888 } else { /* fill out the first stage of the contraction with the surrogate CE */
889 uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
890 uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
891 }
892 return CE;
893}
894
895
896static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
897 int32_t firstContractionOffset = 0;
898 int32_t contractionOffset = 0;
899// uint32_t contractionElement = UCOL_NOT_FOUND;
900
901 if(U_FAILURE(*status)) {
902 return UCOL_NOT_FOUND;
903 }
904
905 /* end of recursion */
906 if(element->cSize == 1) {
907 if(isCntTableElement(existingCE) && ((UColCETags)getCETag(existingCE) == contractions->currentTag)) {
908 uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
909 uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
910 return existingCE;
911 } else {
912 return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
913 }
914 }
915
916 /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */
917 /* for both backward and forward cycles */
918
919 /* we encountered either an empty space or a non-contraction element */
920 /* this means we are constructing a new contraction sequence */
921 element->cPoints++;
922 element->cSize--;
923 if(!isCntTableElement(existingCE)) {
924 /* if it wasn't contraction, we wouldn't end up here*/
925 firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
926 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
927 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
928 contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
929 existingCE = constructContractCE(contractions->currentTag, firstContractionOffset);
930 } else { /* we are adding to existing contraction */
931 /* there were already some elements in the table, so we need to add a new contraction */
932 /* Two things can happen here: either the codepoint is already in the table, or it is not */
933 int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
934 if(position > 0) { /* if it is we just continue down the chain */
935 uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
936 uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
937 uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
938 } else { /* if it isn't, we will have to create a new sequence */
939 uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
940 uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
941 }
942 }
943 element->cPoints--;
944 element->cSize++;
945 return existingCE;
946}
947
948static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
949 uint32_t CE = UCOL_NOT_FOUND;
950 // This should add a completely ignorable element to the
951 // unsafe table, so that backward iteration will skip
952 // over it when treating contractions.
953 uint32_t i = 0;
954 if(element->mapCE == 0) {
955 for(i = 0; i < element->cSize; i++) {
956 if(!UTF_IS_TRAIL(element->cPoints[i])) {
957 unsafeCPSet(t->unsafeCP, element->cPoints[i]);
958 }
959 }
960 }
961 if(element->cSize > 1) { /* we're adding a contraction */
962 uint32_t i = 0;
963 UChar32 cp;
964
965 UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
966 /*CE = ucmpe32_get(t->mapping, cp);*/
967 CE = utrie_get32(t->mapping, cp, NULL);
968
969 CE = uprv_uca_addContraction(t, CE, element, status);
970 } else { /* easy case, */
971 /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
972 CE = utrie_get32(t->mapping, element->cPoints[0], NULL);
973
974 if( CE != UCOL_NOT_FOUND) {
975 if(isCntTableElement(CE) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
976 if(!isPrefix(element->mapCE)) { // we cannot reenter prefix elements - as we are going to create a dead loop
977 // Only expansions and regular CEs can go here... Contractions will never happen in this place
978 uprv_cnttab_setContraction(t->contractions, CE, 0, 0, element->mapCE, status);
979 /* This loop has to change the CE at the end of contraction REDO!*/
980 uprv_cnttab_changeLastCE(t->contractions, CE, element->mapCE, status);
981 }
982 } else {
983 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
984 utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
985#ifdef UCOL_DEBUG
986 fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]);
987 //*status = U_ILLEGAL_ARGUMENT_ERROR;
988#endif
989 }
990 } else {
991 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
992 utrie_set32(t->mapping, element->cPoints[0], element->mapCE);
993 }
994 }
995 return CE;
996}
997
998/* This adds a read element, while testing for existence */
999U_CAPI uint32_t U_EXPORT2
1000uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) {
1001 ExpansionTable *expansions = t->expansions;
1002
1003 uint32_t i = 1;
1004 uint32_t expansion = 0;
1005 uint32_t CE;
1006
1007 if(U_FAILURE(*status)) {
1008 return 0xFFFF;
1009 }
1010 if(element->noOfCEs == 1) {
1011 if(element->isThai == FALSE) {
1012 UChar32 uniChar = 0;
1013 //printElement(element);
1014 if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
1015 uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);
1016
1017 } else if (element->cSize == 1){
1018 uniChar = element->uchars[0];
1019
1020 }
1021
1022 if (uniChar != 0 && u_isdigit(uniChar)){
1023 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT)
1024 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1025 | 0x1);
1026 element->mapCE = expansion;
1027 unsafeCPSet(t->unsafeCP, uniChar);
1028 }else
1029 element->mapCE = element->CEs[0];
1030 } else { /* add thai - totally bad here */
1031 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT)
1032 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1033 | 0x1);
1034 element->mapCE = expansion;
1035 }
1036 } else {
1037 /* ICU 2.1 long primaries */
1038 /* unfortunately, it looks like we have to look for a long primary here */
1039 /* since in canonical closure we are going to hit some long primaries from */
1040 /* the first phase, and they will come back as continuations/expansions */
1041 /* destroying the effect of the previous opitimization */
1042 /* A long primary is a three byte primary with starting secondaries and tertiaries */
1043 /* It can appear in long runs of only primary differences (like east Asian tailorings) */
1044 /* also, it should not be an expansion, as expansions would break with this */
1045 // This part came in from ucol_bld.cpp
1046 //if(tok->expansion == 0
1047 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
1048 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
1049 /* we will construct a special CE that will go unchanged to the table */
1050 if(element->noOfCEs == 2 // a two CE expansion
1051 && isContinuation(element->CEs[1]) // which is a continuation
1052 && (element->CEs[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER))) == 0 // that has only primaries in continuation,
1053 && (((element->CEs[0]>>8) & 0xFF) == UCOL_BYTE_COMMON) // a common secondary
1054 && ((element->CEs[0] & 0xFF) == UCOL_BYTE_COMMON) // and a common tertiary
1055 ) {
1056#ifdef UCOL_DEBUG
1057 fprintf(stdout, "Long primary %04X\n", element->cPoints[0]);
1058#endif
1059 element->mapCE = UCOL_SPECIAL_FLAG | (LONG_PRIMARY_TAG<<24) // a long primary special
1060 | ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
1061 | ((element->CEs[1]>>24) & 0xFF); // third byte of primary
1062 } else {
1063
1064 /* Checking here to see if we should insert the DIGIT_TAG or the EXPANSION_TAG */
1065 UChar32 uniChar = 0;
1066
1067 if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
1068 uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);
1069 } else if (element->cSize == 1){
1070 uniChar = element->uchars[0];
1071 }
1072
1073 if (uniChar != 0 && u_isdigit(uniChar)){
1074 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT)
1075 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1076 | 0x1);
1077 unsafeCPSet(t->unsafeCP, uniChar);
1078 }else{
1079 expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
1080 | ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
1081 & 0xFFFFF0);
1082 }
1083
1084 for(i = 1; i<element->noOfCEs; i++) {
1085 uprv_uca_addExpansion(expansions, element->CEs[i], status);
1086 }
1087 if(element->noOfCEs <= 0xF) {
1088 expansion |= element->noOfCEs;
1089 } else {
1090 uprv_uca_addExpansion(expansions, 0, status);
1091 }
1092 element->mapCE = expansion;
1093 uprv_uca_setMaxExpansion(element->CEs[element->noOfCEs - 1],
1094 (uint8_t)element->noOfCEs,
1095 t->maxExpansions,
1096 status);
1097 if(UCOL_ISJAMO(element->cPoints[0])) {
1098 t->image->jamoSpecial = TRUE;
1099 uprv_uca_setMaxJamoExpansion(element->cPoints[0],
1100 element->CEs[element->noOfCEs - 1],
1101 (uint8_t)element->noOfCEs,
1102 t->maxJamoExpansions,
1103 status);
1104 }
1105 }
1106 }
1107
1108 // here we want to add the prefix structure.
1109 // I will try to process it as a reverse contraction, if possible.
1110 // prefix buffer is already reversed.
1111
1112 if(element->prefixSize!=0) {
1113 // We keep the seen prefix starter elements in a hashtable
1114 // we need it to be able to distinguish between the simple
1115 // codepoints and prefix starters. Also, we need to use it
1116 // for canonical closure.
1117
1118 UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
1119 /* test for NULL */
1120 if (composed == NULL) {
1121 *status = U_MEMORY_ALLOCATION_ERROR;
1122 return 0;
1123 }
1124 uprv_memcpy(composed, element, sizeof(UCAElements));
1125 composed->cPoints = composed->uchars;
1126 composed->prefix = composed->prefixChars;
1127
1128 composed->prefixSize = unorm_normalize(element->prefix, element->prefixSize, UNORM_NFC, 0, composed->prefix, 128, status);
1129
1130
1131 if(t->prefixLookup != NULL) {
1132 UCAElements *uCE = (UCAElements *)uhash_get(t->prefixLookup, element);
1133 if(uCE != NULL) { // there is already a set of code points here
1134 element->mapCE = uprv_uca_addPrefix(t, uCE->mapCE, element, status);
1135 } else { // no code points, so this spot is clean
1136 element->mapCE = uprv_uca_addPrefix(t, UCOL_NOT_FOUND, element, status);
1137 uCE = (UCAElements *)uprv_malloc(sizeof(UCAElements));
1138 /* test for NULL */
1139 if (uCE == NULL) {
1140 *status = U_MEMORY_ALLOCATION_ERROR;
1141 return 0;
1142 }
1143 uprv_memcpy(uCE, element, sizeof(UCAElements));
1144 uCE->cPoints = uCE->uchars;
1145 uhash_put(t->prefixLookup, uCE, uCE, status);
1146 }
1147 if(composed->prefixSize != element->prefixSize || uprv_memcmp(composed->prefix, element->prefix, element->prefixSize)) {
1148 // do it!
1149 composed->mapCE = uprv_uca_addPrefix(t, element->mapCE, composed, status);
1150 }
1151 }
1152 uprv_free(composed);
1153 }
1154
1155 // We need to use the canonical iterator here
1156 // the way we do it is to generate the canonically equivalent strings
1157 // for the contraction and then add the sequences that pass FCD check
1158 if(element->cSize > 1 && !(element->cSize==2 && UTF16_IS_LEAD(element->cPoints[0]) && UTF16_IS_TRAIL(element->cPoints[1]))) { // this is a contraction, we should check whether a composed form should also be included
1159 UnicodeString source(element->cPoints, element->cSize);
1160 CanonicalIterator it(source, *status);
1161 source = it.next();
1162 while(!source.isBogus()) {
1163 if(Normalizer::quickCheck(source, UNORM_FCD, *status) != UNORM_NO) {
1164 element->cSize = source.extract(element->cPoints, 128, *status);
1165 uprv_uca_finalizeAddition(t, element, status);
1166 }
1167 source = it.next();
1168 }
1169 CE = element->mapCE;
1170 } else {
1171 CE = uprv_uca_finalizeAddition(t, element, status);
1172 }
1173
1174 return CE;
1175}
1176
1177
1178/*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */
1179void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping,
1180 MaxExpansionTable *maxexpansion,
1181 MaxJamoExpansionTable *maxjamoexpansion,
1182 UBool jamospecial,
1183 UErrorCode *status)
1184{
1185 const uint32_t VBASE = 0x1161;
1186 const uint32_t TBASE = 0x11A8;
1187 const uint32_t VCOUNT = 21;
1188 const uint32_t TCOUNT = 28;
1189
1190 uint32_t v = VBASE + VCOUNT - 1;
1191 uint32_t t = TBASE + TCOUNT - 1;
1192 uint32_t ce;
1193
1194 while (v >= VBASE) {
1195 /*ce = ucmpe32_get(mapping, v);*/
1196 ce = utrie_get32(mapping, v, NULL);
1197 if (ce < UCOL_SPECIAL_FLAG) {
1198 uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status);
1199 }
1200 v --;
1201 }
1202
1203 while (t >= TBASE)
1204 {
1205 /*ce = ucmpe32_get(mapping, t);*/
1206 ce = utrie_get32(mapping, t, NULL);
1207 if (ce < UCOL_SPECIAL_FLAG) {
1208 uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status);
1209 }
1210 t --;
1211 }
1212 /* According to the docs, 99% of the time, the Jamo will not be special */
1213 if (jamospecial) {
1214 /* gets the max expansion in all unicode characters */
1215 int count = maxjamoexpansion->position;
1216 uint8_t maxTSize = (uint8_t)(maxjamoexpansion->maxLSize +
1217 maxjamoexpansion->maxVSize +
1218 maxjamoexpansion->maxTSize);
1219 uint8_t maxVSize = (uint8_t)(maxjamoexpansion->maxLSize +
1220 maxjamoexpansion->maxVSize);
1221
1222 while (count > 0) {
1223 count --;
1224 if (*(maxjamoexpansion->isV + count) == TRUE) {
1225 uprv_uca_setMaxExpansion(
1226 *(maxjamoexpansion->endExpansionCE + count),
1227 maxVSize, maxexpansion, status);
1228 }
1229 else {
1230 uprv_uca_setMaxExpansion(
1231 *(maxjamoexpansion->endExpansionCE + count),
1232 maxTSize, maxexpansion, status);
1233 }
1234 }
1235 }
1236}
1237
1238U_CDECL_BEGIN
1239static inline uint32_t U_CALLCONV
1240getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset)
1241{
1242 uint32_t value;
1243 uint32_t tag;
1244 UChar32 limit;
1245 UBool inBlockZero;
1246
1247 limit=start+0x400;
1248 while(start<limit) {
1249 value=utrie_get32(trie, start, &inBlockZero);
1250 tag = getCETag(value);
1251 if(inBlockZero == TRUE) {
1252 start+=UTRIE_DATA_BLOCK_LENGTH;
1253 } else if(!(isSpecial(value) && (tag == IMPLICIT_TAG || tag == NOT_FOUND_TAG))) {
1254 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the
1255 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is
1256 * nothing in this position and that it should be skipped.
1257 */
1258#ifdef UCOL_DEBUG
1259 static int32_t count = 1;
1260 fprintf(stdout, "%i, Folded %08X, value %08X\n", count++, start, value);
1261#endif
1262 return (uint32_t)(UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24) | offset);
1263 } else {
1264 ++start;
1265 }
1266 }
1267 return 0;
1268}
1269U_CDECL_END
1270
1271#ifdef UCOL_DEBUG
1272// This is a debug function to print the contents of a trie.
1273// It is used in conjuction with the code around utrie_unserialize call
1274void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
1275 if(start<0x10000) {
1276 fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
1277 } else {
1278 fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value);
1279 }
1280}
1281
1282int32_t
1283myGetFoldingOffset(uint32_t data) {
1284 if(data > UCOL_NOT_FOUND && getCETag(data) == SURROGATE_TAG) {
1285 return (data&0xFFFFFF);
1286 } else {
1287 return 0;
1288 }
1289}
1290#endif
1291
1292U_CAPI UCATableHeader* U_EXPORT2
1293uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
1294 /*CompactEIntArray *mapping = t->mapping;*/
1295 UNewTrie *mapping = t->mapping;
1296 ExpansionTable *expansions = t->expansions;
1297 CntTable *contractions = t->contractions;
1298 MaxExpansionTable *maxexpansion = t->maxExpansions;
1299
1300 if(U_FAILURE(*status)) {
1301 return NULL;
1302 }
1303
1304 uint32_t beforeContractions = (uint32_t)((headersize+paddedsize(expansions->position*sizeof(uint32_t)))/sizeof(UChar));
1305
1306 int32_t contractionsSize = 0;
1307 contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
1308
1309 /* the following operation depends on the trie data. Therefore, we have to do it before */
1310 /* the trie is compacted */
1311 /* sets jamo expansions */
1312 uprv_uca_getMaxExpansionJamo(mapping, maxexpansion, t->maxJamoExpansions,
1313 t->image->jamoSpecial, status);
1314
1315 /*ucmpe32_compact(mapping);*/
1316 /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
1317 /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
1318 /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/
1319
1320 // After setting the jamo expansions, compact the trie and get the needed size
1321 int32_t mappingSize = utrie_serialize(mapping, NULL, 0, getFoldedValue /*getFoldedValue*/, FALSE, status);
1322
1323 uint32_t tableOffset = 0;
1324 uint8_t *dataStart;
1325
1326 /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */
1327
1328 uint32_t toAllocate =(uint32_t)(headersize+
1329 paddedsize(expansions->position*sizeof(uint32_t))+
1330 paddedsize(mappingSize)+
1331 paddedsize(contractionsSize*(sizeof(UChar)+sizeof(uint32_t)))+
1332 //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */
1333 /* maxexpansion array */
1334 + paddedsize(maxexpansion->position * sizeof(uint32_t)) +
1335 /* maxexpansion size array */
1336 paddedsize(maxexpansion->position * sizeof(uint8_t)) +
1337 paddedsize(UCOL_UNSAFECP_TABLE_SIZE) + /* Unsafe chars */
1338 paddedsize(UCOL_UNSAFECP_TABLE_SIZE)); /* Contraction Ending chars */
1339
1340
1341 dataStart = (uint8_t *)uprv_malloc(toAllocate);
1342 /* test for NULL */
1343 if (dataStart == NULL) {
1344 *status = U_MEMORY_ALLOCATION_ERROR;
1345 return NULL;
1346 }
1347
1348 UCATableHeader *myData = (UCATableHeader *)dataStart;
1349 uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
1350
1351 myData->contractionSize = contractionsSize;
1352
1353 tableOffset += (uint32_t)(paddedsize(sizeof(UCATableHeader)));
1354
1355 myData->options = tableOffset;
1356 uprv_memcpy(dataStart+tableOffset, t->options, sizeof(UColOptionSet));
1357 tableOffset += (uint32_t)(paddedsize(sizeof(UColOptionSet)));
1358
1359 /* copy expansions */
1360 /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
1361 myData->expansion = tableOffset;
1362 uprv_memcpy(dataStart+tableOffset, expansions->CEs, expansions->position*sizeof(uint32_t));
1363 tableOffset += (uint32_t)(paddedsize(expansions->position*sizeof(uint32_t)));
1364
1365 /* contractions block */
1366 if(contractionsSize != 0) {
1367 /* copy contraction index */
1368 /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
1369 myData->contractionIndex = tableOffset;
1370 uprv_memcpy(dataStart+tableOffset, contractions->codePoints, contractionsSize*sizeof(UChar));
1371 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(UChar)));
1372
1373 /* copy contraction collation elements */
1374 /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
1375 myData->contractionCEs = tableOffset;
1376 uprv_memcpy(dataStart+tableOffset, contractions->CEs, contractionsSize*sizeof(uint32_t));
1377 tableOffset += (uint32_t)(paddedsize(contractionsSize*sizeof(uint32_t)));
1378 } else {
1379 myData->contractionIndex = 0;
1380 myData->contractionIndex = 0;
1381 }
1382
1383 /* copy mapping table */
1384 /*myData->mappingPosition = dataStart+tableOffset;*/
1385 /*myData->mappingPosition = tableOffset;*/
1386 /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/
1387
1388 myData->mappingPosition = tableOffset;
1389 utrie_serialize(mapping, dataStart+tableOffset, toAllocate-tableOffset, getFoldedValue, FALSE, status);
1390#ifdef UCOL_DEBUG
1391 // This is debug code to dump the contents of the trie. It needs two functions defined above
1392 {
1393 UTrie UCAt = { 0 };
1394 utrie_unserialize(&UCAt, dataStart+tableOffset, 9999999, status);
1395 UCAt.getFoldingOffset = myGetFoldingOffset;
1396 if(U_SUCCESS(*status)) {
1397 utrie_enum(&UCAt, NULL, enumRange, NULL);
1398 }
1399 }
1400#endif
1401 tableOffset += paddedsize(mappingSize);
1402
1403
1404 int32_t i = 0;
1405
1406 /* copy max expansion table */
1407 myData->endExpansionCE = tableOffset;
1408 myData->endExpansionCECount = maxexpansion->position;
1409 /* not copying the first element which is a dummy */
1410 uprv_memcpy(dataStart + tableOffset, maxexpansion->endExpansionCE + 1,
1411 maxexpansion->position * sizeof(uint32_t));
1412 tableOffset += (uint32_t)(paddedsize(maxexpansion->position * sizeof(uint32_t)));
1413 myData->expansionCESize = tableOffset;
1414 uprv_memcpy(dataStart + tableOffset, maxexpansion->expansionCESize + 1,
1415 maxexpansion->position * sizeof(uint8_t));
1416 tableOffset += (uint32_t)(paddedsize(maxexpansion->position * sizeof(uint8_t)));
1417
1418 /* Unsafe chars table. Finish it off, then copy it. */
1419 uprv_uca_unsafeCPAddCCNZ(t, status);
1420 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */
1421 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
1422 t->unsafeCP[i] |= t->UCA->unsafeCP[i];
1423 }
1424 }
1425 myData->unsafeCP = tableOffset;
1426 uprv_memcpy(dataStart + tableOffset, t->unsafeCP, UCOL_UNSAFECP_TABLE_SIZE);
1427 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
1428
1429
1430 /* Finish building Contraction Ending chars hash table and then copy it out. */
1431 if (t->UCA != 0) { /* Or in unsafebits from UCA, making a combined table. */
1432 for (i=0; i<UCOL_UNSAFECP_TABLE_SIZE; i++) {
1433 t->contrEndCP[i] |= t->UCA->contrEndCP[i];
1434 }
1435 }
1436 myData->contrEndCP = tableOffset;
1437 uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
1438 tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
1439
1440 if(tableOffset != toAllocate) {
1441#ifdef UCOL_DEBUG
1442 fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
1443#endif
1444 *status = U_INTERNAL_PROGRAM_ERROR;
1445 uprv_free(dataStart);
1446 return 0;
1447 }
1448
1449 myData->size = tableOffset;
1450 /* This should happen upon ressurection */
1451 /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
1452 /*uprv_mstrm_close(ms);*/
1453 return myData;
1454}
1455
1456
1457struct enumStruct {
1458 tempUCATable *t;
1459 UCollator *tempColl;
1460 UCollationElements* colEl;
1461 int32_t noOfClosures;
1462 UErrorCode *status;
1463};
1464U_CDECL_BEGIN
1465static UBool U_CALLCONV
1466_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
1467
1468 UErrorCode *status = ((enumStruct *)context)->status;
1469 tempUCATable *t = ((enumStruct *)context)->t;
1470 UCollator *tempColl = ((enumStruct *)context)->tempColl;
1471 UCollationElements* colEl = ((enumStruct *)context)->colEl;
1472 UCAElements el;
1473 UChar decomp[256] = { 0 };
1474 int32_t noOfDec = 0;
1475
1476 UChar32 u32 = 0;
1477 UChar comp[2];
1478 uint32_t len = 0;
1479
1480 if (type != U_UNASSIGNED && type != U_PRIVATE_USE_CHAR) { // if the range is assigned - we might ommit more categories later
1481 for(u32 = start; u32 < limit; u32++) {
1482 noOfDec = unorm_getDecomposition(u32, FALSE, decomp, 256);
1483 //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
1484 //|| (noOfDec == 1 && *decomp != (UChar)u32))
1485 if(noOfDec > 0) // if we're positive, that means there is no decomposition
1486 {
1487 len = 0;
1488 UTF_APPEND_CHAR_UNSAFE(comp, len, u32);
1489 if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
1490#ifdef UCOL_DEBUG
1491 fprintf(stderr, "Closure: %08X -> ", u32);
1492 uint32_t i = 0;
1493 for(i = 0; i<noOfDec; i++) {
1494 fprintf(stderr, "%04X ", decomp[i]);
1495 }
1496 fprintf(stderr, "\n");
1497#endif
1498 ((enumStruct *)context)->noOfClosures++;
1499 el.cPoints = decomp;
1500 el.cSize = noOfDec;
1501 el.noOfCEs = 0;
1502 el.prefix = el.prefixChars;
1503 el.prefixSize = 0;
1504
1505 UCAElements *prefix=(UCAElements *)uhash_get(t->prefixLookup, &el);
1506 if(prefix == NULL) {
1507 el.cPoints = comp;
1508 el.cSize = len;
1509 el.prefix = el.prefixChars;
1510 el.prefixSize = 0;
1511 el.noOfCEs = 0;
1512 ucol_setText(colEl, decomp, noOfDec, status);
1513 while((el.CEs[el.noOfCEs] = ucol_next(colEl, status)) != UCOL_NULLORDER) {
1514 el.noOfCEs++;
1515 }
1516 } else {
1517 el.cPoints = comp;
1518 el.cSize = len;
1519 el.prefix = el.prefixChars;
1520 el.prefixSize = 0;
1521 el.noOfCEs = 1;
1522 el.CEs[0] = prefix->mapCE;
1523 // This character uses a prefix. We have to add it
1524 // to the unsafe table, as it decomposed form is already
1525 // in. In Japanese, this happens for \u309e & \u30fe
1526 // Since unsafeCPSet is static in ucol_elm, we are going
1527 // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
1528 }
1529 if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
1530 el.isThai = TRUE;
1531 } else {
1532 el.isThai = FALSE;
1533 }
1534
1535 uprv_uca_addAnElement(t, &el, status);
1536 }
1537 }
1538 }
1539 }
1540 return TRUE;
1541}
1542U_CDECL_END
1543
1544U_CAPI int32_t U_EXPORT2
1545uprv_uca_canonicalClosure(tempUCATable *t, UErrorCode *status)
1546{
1547 enumStruct context;
1548 context.noOfClosures = 0;
1549 if(U_SUCCESS(*status)) {
1550 UCollator *tempColl = NULL;
1551 tempUCATable *tempTable = uprv_uca_cloneTempTable(t, status);
1552
1553 UCATableHeader *tempData = uprv_uca_assembleTable(tempTable, status);
1554 tempColl = ucol_initCollator(tempData, 0, status);
1555 uprv_uca_closeTempTable(tempTable);
1556
1557 if(U_SUCCESS(*status)) {
1558 tempColl->rb = NULL;
1559 tempColl->elements = NULL;
1560 tempColl->validLocale = NULL;
1561 tempColl->requestedLocale = NULL;
1562 tempColl->hasRealData = TRUE;
1563 tempColl->freeImageOnClose = TRUE;
1564 } else if(tempData != 0) {
1565 uprv_free(tempData);
1566 }
1567
1568 /* produce canonical closure */
1569 UCollationElements* colEl = ucol_openElements(tempColl, NULL, 0, status);
1570
1571 context.t = t;
1572 context.tempColl = tempColl;
1573 context.colEl = colEl;
1574 context.status = status;
1575 u_enumCharTypes(_enumCategoryRangeClosureCategory, &context);
1576
1577 ucol_closeElements(colEl);
1578 ucol_close(tempColl);
1579 }
1580 return context.noOfClosures;
1581}
1582
1583U_NAMESPACE_END
1584
1585#endif /* #if !UCONFIG_NO_COLLATION */