2 *******************************************************************************
4 * Copyright (C) 2001-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucaelems.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This program reads the Fractional UCA table and generates
17 * internal format for UCA table as well as inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
22 * 03/02/2001 synwee added setMaxExpansion
23 * 03/07/2001 synwee merged UCA's maxexpansion and tailoring's
26 #include "unicode/utypes.h"
28 #if !UCONFIG_NO_COLLATION
30 #include "unicode/uchar.h"
31 #include "unicode/unistr.h"
32 #include "unicode/ucoleitr.h"
33 #include "unicode/normlzr.h"
36 #include "unicode/caniter.h"
39 static uint32_t uprv_uca_processContraction(CntTable
*contractions
, UCAElements
*element
, uint32_t existingCE
, UErrorCode
*status
);
/*
 * Hash callback for UCAElements keys; uprv_uca_initTempTable passes it to
 * uhash_open() when creating the prefixLookup table.
 *
 * The element's cPoints (cSize UChars) are copied into buf, a terminating 0 is
 * stored at buf[cSize], and the result of uhash_hashUChars(key) is returned.
 *
 * NOTE(review): the declarations of buf and key are missing from this excerpt;
 * presumably key.pointer refers to buf -- confirm. Also confirm that buf has
 * room for cSize + 1 UChars, since the copy below is not bounds-checked.
 */
42 static int32_t U_CALLCONV
43 prefixLookupHash(const UHashTok e
) {
44 UCAElements
*element
= (UCAElements
*)e
.pointer
;
48 uprv_memcpy(buf
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
49 buf
[element
->cSize
] = 0;
50 //key.pointer = element->cPoints;
51 //element->cPoints[element->cSize] = 0;
52 return uhash_hashUChars(key
);
/*
 * Key comparator for UCAElements keys; uprv_uca_initTempTable passes it to
 * uhash_open() together with prefixLookupHash.
 *
 * Each element's cPoints (cSize UChars) are copied into a scratch buffer
 * (buf1 / buf2), a terminating 0 is appended, and the result of
 * uhash_compareUChars(key1, key2) is returned.
 *
 * NOTE(review): the declarations of buf1, buf2, key1 and key2 are missing from
 * this excerpt; presumably the keys point at the two buffers -- confirm, and
 * confirm the buffers can hold cSize + 1 UChars.
 */
55 static int8_t U_CALLCONV
56 prefixLookupComp(const UHashTok e1
, const UHashTok e2
) {
57 UCAElements
*element1
= (UCAElements
*)e1
.pointer
;
58 UCAElements
*element2
= (UCAElements
*)e2
.pointer
;
63 uprv_memcpy(buf1
, element1
->cPoints
, element1
->cSize
*sizeof(UChar
));
64 buf1
[element1
->cSize
] = 0;
69 uprv_memcpy(buf2
, element2
->cPoints
, element2
->cSize
*sizeof(UChar
));
70 buf2
[element2
->cSize
] = 0;
72 return uhash_compareUChars(key1
, key2
);
/*
 * Appends one collation element to the growable array expansions->CEs.
 *
 * - Does its work only when *status is not already a failure.
 * - On first use (CEs == NULL) allocates INIT_EXP_TABLE_SIZE entries and
 *   resets size / position.
 * - When position == size, asks uprv_realloc for twice the current size; the
 *   result is held in newData before being stored back.
 * - Stores value at CEs[position] and returns that index (the position before
 *   it is incremented).
 *
 * Allocation failures set *status to U_MEMORY_ALLOCATION_ERROR. The value
 * returned on the failure paths is not visible in this excerpt.
 */
76 static int32_t uprv_uca_addExpansion(ExpansionTable
*expansions
, uint32_t value
, UErrorCode
*status
) {
77 if(U_FAILURE(*status
)) {
80 if(expansions
->CEs
== NULL
) {
81 expansions
->CEs
= (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
*sizeof(uint32_t));
83 if (expansions
->CEs
== NULL
) {
84 *status
= U_MEMORY_ALLOCATION_ERROR
;
87 expansions
->size
= INIT_EXP_TABLE_SIZE
;
88 expansions
->position
= 0;
91 if(expansions
->position
== expansions
->size
) {
92 uint32_t *newData
= (uint32_t *)uprv_realloc(expansions
->CEs
, 2*expansions
->size
*sizeof(uint32_t));
95 fprintf(stderr
, "out of memory for expansions\n");
97 *status
= U_MEMORY_ALLOCATION_ERROR
;
100 expansions
->CEs
= newData
;
101 expansions
->size
*= 2;
104 expansions
->CEs
[expansions
->position
] = value
;
105 return(expansions
->position
++);
/*
 * Allocates and initializes a tempUCATable.
 *
 * Steps visible in this excerpt:
 *  - the table and its MaxExpansionTable, MaxJamoExpansionTable and
 *    ExpansionTable are obtained from uprv_malloc and zero-filled;
 *  - t->mapping is opened with utrie_open(), passing
 *    UCOL_SPECIAL_FLAG | (initTag << 24) and
 *    UCOL_SPECIAL_FLAG | (supplementaryInitTag << 24);
 *  - t->prefixLookup is opened with prefixLookupHash / prefixLookupComp and
 *    uhash_freeBlock as value deleter;
 *  - t->contractions is opened on top of t->mapping;
 *  - the UCA collator's endExpansionCE / expansionCESize arrays are copied in
 *    starting at index 1, with index 0 set to 0;
 *  - the Jamo table starts empty with maxLSize = maxVSize = maxTSize = 1;
 *  - unsafeCP and contrEndCP (UCOL_UNSAFECP_TABLE_SIZE bytes each) are
 *    allocated and cleared.
 *
 * Allocation failures jump to allocation_failure, which closes t and sets
 * *status to U_MEMORY_ALLOCATION_ERROR.
 *
 * NOTE(review): original lines are missing from this excerpt (among them the
 * use of image and opts, part of the maxet->size expression, and the return
 * statements), so those parts are intentionally not described.
 */
108 U_CAPI tempUCATable
* U_EXPORT2
109 uprv_uca_initTempTable(UCATableHeader
*image
, UColOptionSet
*opts
, const UCollator
*UCA
, UColCETags initTag
, UColCETags supplementaryInitTag
, UErrorCode
*status
) {
110 MaxJamoExpansionTable
*maxjet
;
111 MaxExpansionTable
*maxet
;
112 tempUCATable
*t
= (tempUCATable
*)uprv_malloc(sizeof(tempUCATable
));
115 *status
= U_MEMORY_ALLOCATION_ERROR
;
118 uprv_memset(t
, 0, sizeof(tempUCATable
));
120 maxet
= (MaxExpansionTable
*)uprv_malloc(sizeof(MaxExpansionTable
));
122 goto allocation_failure
;
124 uprv_memset(maxet
, 0, sizeof(MaxExpansionTable
));
125 t
->maxExpansions
= maxet
;
127 maxjet
= (MaxJamoExpansionTable
*)uprv_malloc(sizeof(MaxJamoExpansionTable
));
128 if (maxjet
== NULL
) {
129 goto allocation_failure
;
131 uprv_memset(maxjet
, 0, sizeof(MaxJamoExpansionTable
));
132 t
->maxJamoExpansions
= maxjet
;
138 t
->expansions
= (ExpansionTable
*)uprv_malloc(sizeof(ExpansionTable
));
140 if (t
->expansions
== NULL
) {
141 goto allocation_failure
;
143 uprv_memset(t
->expansions
, 0, sizeof(ExpansionTable
));
144 /*t->mapping = ucmpe32_open(UCOL_SPECIAL_FLAG | (initTag<<24), UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), UCOL_SPECIAL_FLAG | (LEAD_SURROGATE_TAG<<24), status);*/
145 /*t->mapping = utrie_open(NULL, NULL, 0x100000, UCOL_SPECIAL_FLAG | (initTag<<24), TRUE); // Do your own mallocs for the structure, array and have linear Latin 1*/
147 t
->mapping
= utrie_open(NULL
, NULL
, 0x100000,
148 UCOL_SPECIAL_FLAG
| (initTag
<<24),
149 UCOL_SPECIAL_FLAG
| (supplementaryInitTag
<< 24),
150 TRUE
); // Do your own mallocs for the structure, array and have linear Latin 1
151 t
->prefixLookup
= uhash_open(prefixLookupHash
, prefixLookupComp
, NULL
, status
);
152 uhash_setValueDeleter(t
->prefixLookup
, uhash_freeBlock
);
154 t
->contractions
= uprv_cnttab_open(t
->mapping
, status
);
156 /* copy UCA's maxexpansion and merge as we go along */
158 /* adding an extra initial value for easier manipulation */
159 maxet
->size
= (UCA
->lastEndExpansionCE
- UCA
->endExpansionCE
)
161 maxet
->position
= maxet
->size
- 1;
162 maxet
->endExpansionCE
=
163 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet
->size
);
165 if (maxet
->endExpansionCE
== NULL
) {
166 goto allocation_failure
;
168 maxet
->expansionCESize
=
169 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet
->size
);
171 if (maxet
->expansionCESize
== NULL
) {
172 goto allocation_failure
;
174 /* initialized value */
175 *(maxet
->endExpansionCE
) = 0;
176 *(maxet
->expansionCESize
) = 0;
177 uprv_memcpy(maxet
->endExpansionCE
+ 1, UCA
->endExpansionCE
,
178 sizeof(uint32_t) * (maxet
->size
- 1));
179 uprv_memcpy(maxet
->expansionCESize
+ 1, UCA
->expansionCESize
,
180 sizeof(uint8_t) * (maxet
->size
- 1));
185 maxjet
->endExpansionCE
= NULL
;
188 maxjet
->position
= 0;
189 maxjet
->maxLSize
= 1;
190 maxjet
->maxVSize
= 1;
191 maxjet
->maxTSize
= 1;
193 t
->unsafeCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
195 if (t
->unsafeCP
== NULL
) {
196 goto allocation_failure
;
198 t
->contrEndCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
200 if (t
->contrEndCP
== NULL
) {
201 goto allocation_failure
;
203 uprv_memset(t
->unsafeCP
, 0, UCOL_UNSAFECP_TABLE_SIZE
);
204 uprv_memset(t
->contrEndCP
, 0, UCOL_UNSAFECP_TABLE_SIZE
);
/* NOTE(review): this cleanup can be reached before t->maxJamoExpansions has
   been assigned (the first goto precedes that assignment), while
   uprv_uca_closeTempTable reads t->maxJamoExpansions->size with no visible
   NULL check -- confirm that path cannot dereference NULL. */
208 uprv_uca_closeTempTable(t
);
209 *status
= U_MEMORY_ALLOCATION_ERROR
;
/*
 * Builds a copy r of the temporary table t.
 *
 * Visible behavior:
 *  - r is obtained from uprv_malloc and zero-filled;
 *  - mapping is duplicated with utrie_clone(); contractions with
 *    uprv_cnttab_clone(), after which the clone is re-pointed at r->mapping;
 *  - prefixLookup is deliberately left NULL (see the comment in the body);
 *  - expansions, maxExpansions, maxJamoExpansions, unsafeCP and contrEndCP
 *    get freshly allocated arrays: allocations use the source "size",
 *    while the copies transfer only "position" elements (the two bit tables
 *    copy UCOL_UNSAFECP_TABLE_SIZE bytes);
 *  - r->options is assigned from t->options (the pointer value is copied).
 *
 * Allocation failures set *status to U_MEMORY_ALLOCATION_ERROR. What happens
 * next on those paths (return / cleanup) is not visible in this excerpt, and
 * neither is the final return statement.
 */
213 U_CAPI tempUCATable
* U_EXPORT2
214 uprv_uca_cloneTempTable(tempUCATable
*t
, UErrorCode
*status
) {
215 if(U_FAILURE(*status
)) {
219 tempUCATable
*r
= (tempUCATable
*)uprv_malloc(sizeof(tempUCATable
));
222 *status
= U_MEMORY_ALLOCATION_ERROR
;
225 uprv_memset(r
, 0, sizeof(tempUCATable
));
228 if(t
->mapping
!= NULL
) {
229 /*r->mapping = ucmpe32_clone(t->mapping, status);*/
230 r
->mapping
= utrie_clone(NULL
, t
->mapping
, NULL
, 0);
233 // a hashing clone function would be very nice. We have none currently...
234 // However, we should be good, as closing should not produce any prefixed elements.
235 r
->prefixLookup
= NULL
; // prefixes are not used in closing
238 if(t
->expansions
!= NULL
) {
239 r
->expansions
= (ExpansionTable
*)uprv_malloc(sizeof(ExpansionTable
));
241 if (r
->expansions
== NULL
) {
242 *status
= U_MEMORY_ALLOCATION_ERROR
;
245 r
->expansions
->position
= t
->expansions
->position
;
246 r
->expansions
->size
= t
->expansions
->size
;
247 if(t
->expansions
->CEs
!= NULL
) {
248 r
->expansions
->CEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->expansions
->size
);
250 if (r
->expansions
->CEs
== NULL
) {
251 *status
= U_MEMORY_ALLOCATION_ERROR
;
254 uprv_memcpy(r
->expansions
->CEs
, t
->expansions
->CEs
, sizeof(uint32_t)*t
->expansions
->position
);
256 r
->expansions
->CEs
= NULL
;
260 if(t
->contractions
!= NULL
) {
261 r
->contractions
= uprv_cnttab_clone(t
->contractions
, status
);
/* NOTE(review): r->contractions is dereferenced below with no visible check
   that uprv_cnttab_clone succeeded -- confirm. */
262 r
->contractions
->mapping
= r
->mapping
;
265 if(t
->maxExpansions
!= NULL
) {
266 r
->maxExpansions
= (MaxExpansionTable
*)uprv_malloc(sizeof(MaxExpansionTable
));
268 if (r
->maxExpansions
== NULL
) {
269 *status
= U_MEMORY_ALLOCATION_ERROR
;
272 r
->maxExpansions
->size
= t
->maxExpansions
->size
;
273 r
->maxExpansions
->position
= t
->maxExpansions
->position
;
274 if(t
->maxExpansions
->endExpansionCE
!= NULL
) {
275 r
->maxExpansions
->endExpansionCE
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->maxExpansions
->size
);
/* NOTE(review): the 0xDB fill below writes through the freshly allocated
   pointer before the NULL check that follows it. */
276 uprv_memset(r
->maxExpansions
->endExpansionCE
, 0xDB, sizeof(uint32_t)*t
->maxExpansions
->size
);
278 if (r
->maxExpansions
->endExpansionCE
== NULL
) {
279 *status
= U_MEMORY_ALLOCATION_ERROR
;
282 uprv_memcpy(r
->maxExpansions
->endExpansionCE
, t
->maxExpansions
->endExpansionCE
, t
->maxExpansions
->position
*sizeof(uint32_t));
284 r
->maxExpansions
->endExpansionCE
= NULL
;
286 if(t
->maxExpansions
->expansionCESize
!= NULL
) {
287 r
->maxExpansions
->expansionCESize
= (uint8_t *)uprv_malloc(sizeof(uint8_t)*t
->maxExpansions
->size
);
/* NOTE(review): same ordering as above -- the fill precedes the NULL check. */
288 uprv_memset(r
->maxExpansions
->expansionCESize
, 0xDB, sizeof(uint8_t)*t
->maxExpansions
->size
);
290 if (r
->maxExpansions
->expansionCESize
== NULL
) {
291 *status
= U_MEMORY_ALLOCATION_ERROR
;
294 uprv_memcpy(r
->maxExpansions
->expansionCESize
, t
->maxExpansions
->expansionCESize
, t
->maxExpansions
->position
*sizeof(uint8_t));
296 r
->maxExpansions
->expansionCESize
= NULL
;
300 if(t
->maxJamoExpansions
!= NULL
) {
301 r
->maxJamoExpansions
= (MaxJamoExpansionTable
*)uprv_malloc(sizeof(MaxJamoExpansionTable
));
303 if (r
->maxJamoExpansions
== NULL
) {
304 *status
= U_MEMORY_ALLOCATION_ERROR
;
307 r
->maxJamoExpansions
->size
= t
->maxJamoExpansions
->size
;
308 r
->maxJamoExpansions
->position
= t
->maxJamoExpansions
->position
;
309 r
->maxJamoExpansions
->maxLSize
= t
->maxJamoExpansions
->maxLSize
;
310 r
->maxJamoExpansions
->maxVSize
= t
->maxJamoExpansions
->maxVSize
;
311 r
->maxJamoExpansions
->maxTSize
= t
->maxJamoExpansions
->maxTSize
;
312 if(t
->maxJamoExpansions
->size
!= 0) {
313 r
->maxJamoExpansions
->endExpansionCE
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->maxJamoExpansions
->size
);
315 if (r
->maxJamoExpansions
->endExpansionCE
== NULL
) {
316 *status
= U_MEMORY_ALLOCATION_ERROR
;
319 uprv_memcpy(r
->maxJamoExpansions
->endExpansionCE
, t
->maxJamoExpansions
->endExpansionCE
, t
->maxJamoExpansions
->position
*sizeof(uint32_t));
320 r
->maxJamoExpansions
->isV
= (UBool
*)uprv_malloc(sizeof(UBool
)*t
->maxJamoExpansions
->size
);
322 if (r
->maxJamoExpansions
->isV
== NULL
) {
323 *status
= U_MEMORY_ALLOCATION_ERROR
;
326 uprv_memcpy(r
->maxJamoExpansions
->isV
, t
->maxJamoExpansions
->isV
, t
->maxJamoExpansions
->position
*sizeof(UBool
));
328 r
->maxJamoExpansions
->endExpansionCE
= NULL
;
329 r
->maxJamoExpansions
->isV
= NULL
;
333 if(t
->unsafeCP
!= NULL
) {
334 r
->unsafeCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
336 if (r
->unsafeCP
== NULL
) {
337 *status
= U_MEMORY_ALLOCATION_ERROR
;
340 uprv_memcpy(r
->unsafeCP
, t
->unsafeCP
, UCOL_UNSAFECP_TABLE_SIZE
);
343 if(t
->contrEndCP
!= NULL
) {
344 r
->contrEndCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
346 if (r
->contrEndCP
== NULL
) {
347 *status
= U_MEMORY_ALLOCATION_ERROR
;
350 uprv_memcpy(r
->contrEndCP
, t
->contrEndCP
, UCOL_UNSAFECP_TABLE_SIZE
);
355 r
->options
= t
->options
;
/*
 * Releases the resources owned by a tempUCATable.
 *
 * Visible behavior: frees the expansion table and its CEs array, closes the
 * contraction table, the mapping trie and the prefixLookup hash (each only
 * when non-NULL), frees the max-expansion arrays and table, the Jamo arrays
 * (only when the Jamo table's size is > 0) and table, and finally the
 * unsafeCP and contrEndCP bit tables.
 *
 * NOTE(review): original lines are missing from this excerpt, so whether t
 * itself is NULL-checked and freed here cannot be confirmed from the visible
 * text.
 */
361 U_CAPI
void U_EXPORT2
362 uprv_uca_closeTempTable(tempUCATable
*t
) {
364 if (t
->expansions
!= NULL
) {
365 uprv_free(t
->expansions
->CEs
);
366 uprv_free(t
->expansions
);
368 if(t
->contractions
!= NULL
) {
369 uprv_cnttab_close(t
->contractions
);
371 if (t
->mapping
!= NULL
) {
372 utrie_close(t
->mapping
);
375 if(t
->prefixLookup
!= NULL
) {
376 uhash_close(t
->prefixLookup
);
379 if (t
->maxExpansions
!= NULL
) {
380 uprv_free(t
->maxExpansions
->endExpansionCE
);
381 uprv_free(t
->maxExpansions
->expansionCESize
);
382 uprv_free(t
->maxExpansions
);
/* NOTE(review): unlike the members above, t->maxJamoExpansions is
   dereferenced here without a visible NULL check. uprv_uca_initTempTable can
   call this function from its allocation_failure path before that member is
   assigned -- confirm a guard is not needed. */
385 if (t
->maxJamoExpansions
->size
> 0) {
386 uprv_free(t
->maxJamoExpansions
->endExpansionCE
);
387 uprv_free(t
->maxJamoExpansions
->isV
);
389 uprv_free(t
->maxJamoExpansions
);
391 uprv_free(t
->unsafeCP
);
392 uprv_free(t
->contrEndCP
);
399 * Looks for the maximum length of all expansion sequences ending with the same
400 * collation element. The size required for maxexpansion and maxsize is
401 * returned if the arrays are too small.
402 * @param endexpansion the last expansion collation element to be added
403 * @param expansionsize size of the expansion
404 * @param maxexpansion data structure to store the maximum expansion data.
405 * @param status error status
406 * @returns size of the maxexpansion and maxsize used.
/*
 * Records expansionsize as the maximum expansion length for the ending
 * collation element endexpansion.
 *
 * maxexpansion->endExpansionCE is searched with a binary search and
 * maxexpansion->expansionCESize is the parallel array of sizes. Visible
 * steps:
 *  - an empty table (size == 0) gets INIT_EXP_TABLE_SIZE entries, with
 *    element 0 set to 0;
 *  - when position + 1 == size both arrays are grown to twice the size;
 *  - if endexpansion is already present its stored size is raised when
 *    expansionsize is larger;
 *  - otherwise the tail is shifted with uprv_memmove and the new pair is
 *    inserted (or appended when nothing has to move), and position is
 *    incremented.
 * Returns maxexpansion->position.
 *
 * NOTE(review): original lines are missing from this excerpt (including the
 * last parameter line of the signature, the declarations of mid / result /
 * temp, and several branch bodies), so those parts are not described.
 */
408 static int uprv_uca_setMaxExpansion(uint32_t endexpansion
,
409 uint8_t expansionsize
,
410 MaxExpansionTable
*maxexpansion
,
413 if (maxexpansion
->size
== 0) {
414 /* we'll always make the first element 0, for easier manipulation */
/* NOTE(review): the element size below is written as sizeof(int32_t) although
   the array holds uint32_t; the two types have the same width. */
415 maxexpansion
->endExpansionCE
=
416 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(int32_t));
418 if (maxexpansion
->endExpansionCE
== NULL
) {
419 *status
= U_MEMORY_ALLOCATION_ERROR
;
422 *(maxexpansion
->endExpansionCE
) = 0;
423 maxexpansion
->expansionCESize
=
424 (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(uint8_t));
426 if (maxexpansion
->expansionCESize
== NULL
) {
427 *status
= U_MEMORY_ALLOCATION_ERROR
;
430 *(maxexpansion
->expansionCESize
) = 0;
431 maxexpansion
->size
= INIT_EXP_TABLE_SIZE
;
432 maxexpansion
->position
= 0;
435 if (maxexpansion
->position
+ 1 == maxexpansion
->size
) {
/* NOTE(review): both reallocations are issued before either result is
   checked. If exactly one of them fails, the other array may already have
   been moved while the struct still holds its old pointer -- confirm how the
   failure branch (not fully visible here) deals with that. */
436 uint32_t *neweece
= (uint32_t *)uprv_realloc(maxexpansion
->endExpansionCE
,
437 2 * maxexpansion
->size
* sizeof(uint32_t));
438 uint8_t *neweces
= (uint8_t *)uprv_realloc(maxexpansion
->expansionCESize
,
439 2 * maxexpansion
->size
* sizeof(uint8_t));
440 if (neweece
== NULL
|| neweces
== NULL
) {
442 fprintf(stderr
, "out of memory for maxExpansions\n");
444 *status
= U_MEMORY_ALLOCATION_ERROR
;
447 maxexpansion
->endExpansionCE
= neweece
;
448 maxexpansion
->expansionCESize
= neweces
;
449 maxexpansion
->size
*= 2;
452 uint32_t *pendexpansionce
= maxexpansion
->endExpansionCE
;
453 uint8_t *pexpansionsize
= maxexpansion
->expansionCESize
;
454 int pos
= maxexpansion
->position
;
456 uint32_t *start
= pendexpansionce
;
457 uint32_t *limit
= pendexpansionce
+ pos
;
459 /* using binary search to determine if last expansion element is
460 already in the array */
463 while (start
< limit
- 1) {
464 mid
= start
+ ((limit
- start
) >> 1);
465 if (endexpansion
<= *mid
) {
473 if (*start
== endexpansion
) {
474 result
= start
- pendexpansionce
;
477 if (*limit
== endexpansion
) {
478 result
= limit
- pendexpansionce
;
482 /* found the ce in expansion, we'll just modify the size if it is
484 uint8_t *currentsize
= pexpansionsize
+ result
;
485 if (*currentsize
< expansionsize
) {
486 *currentsize
= expansionsize
;
490 /* we'll need to squeeze the value into the array.
491 initial implementation. */
492 /* shifting the subarray down by 1 */
493 int shiftsize
= (pendexpansionce
+ pos
) - start
;
494 uint32_t *shiftpos
= start
+ 1;
495 uint8_t *sizeshiftpos
= pexpansionsize
+ (shiftpos
- pendexpansionce
);
497 /* okay need to rearrange the array into sorted order */
498 if (shiftsize
== 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
499 *(pendexpansionce
+ pos
+ 1) = endexpansion
;
500 *(pexpansionsize
+ pos
+ 1) = expansionsize
;
503 uprv_memmove(shiftpos
+ 1, shiftpos
, shiftsize
* sizeof(int32_t));
504 uprv_memmove(sizeshiftpos
+ 1, sizeshiftpos
,
505 shiftsize
* sizeof(uint8_t));
506 *shiftpos
= endexpansion
;
507 *sizeshiftpos
= expansionsize
;
509 maxexpansion
->position
++;
/* The loop below prints consistency diagnostics to stderr; presumably it is
   debug-only code whose enclosing preprocessor guard is not visible in this
   excerpt -- confirm. */
514 for (temp
= 0; temp
< maxexpansion
->position
; temp
++) {
515 if (pendexpansionce
[temp
] >= pendexpansionce
[temp
+ 1]) {
516 fprintf(stderr
, "expansions %d\n", temp
);
518 if (pendexpansionce
[temp
] == endexpansion
) {
520 if (pexpansionsize
[temp
] < expansionsize
) {
521 fprintf(stderr
, "expansions size %d\n", temp
);
525 if (pendexpansionce
[temp
] == endexpansion
) {
527 if (pexpansionsize
[temp
] < expansionsize
) {
528 fprintf(stderr
, "expansions size %d\n", temp
);
532 fprintf(stderr
, "expansion not found %d\n", temp
);
536 return maxexpansion
->position
;
540 * Sets the maximum length of all jamo expansion sequences ending with the same
541 * collation element. The size required for maxexpansion and maxsize is
542 * returned if the arrays are too small.
543 * @param ch the jamo codepoint
544 * @param endexpansion the last expansion collation element to be added
545 * @param expansionsize size of the expansion
546 * @param maxexpansion data structure to store the maximum expansion data.
547 * @param status error status
548 * @returns size of the maxexpansion and maxsize used.
/*
 * Updates the Jamo maximum-expansion bookkeeping for code unit ch.
 *
 * Visible behavior:
 *  - ch in 0x1100..0x1112 (L): raises maxLSize if expansionsize is larger and
 *    returns maxexpansion->position without storing endexpansion;
 *  - ch in 0x1161..0x1175 (V): raises maxVSize if expansionsize is larger;
 *  - ch in 0x11A8..0x11C2 (T): raises maxTSize if expansionsize is larger;
 *  - an empty table (size == 0) gets INIT_EXP_TABLE_SIZE entries for
 *    endExpansionCE and isV, with element 0 of each set to 0;
 *  - when position + 1 == size both arrays are grown to twice the size;
 *  - if endexpansion is found in the table the current position is returned,
 *    otherwise endexpansion and the isV flag are appended and the new
 *    position is returned.
 *
 * NOTE(review): original lines are missing from this excerpt (the final
 * parameter line, the declaration and assignment of isV, the loop that walks
 * pos, and several return statements), so those parts are not described.
 */
550 static int uprv_uca_setMaxJamoExpansion(UChar ch
,
551 uint32_t endexpansion
,
552 uint8_t expansionsize
,
553 MaxJamoExpansionTable
*maxexpansion
,
557 if (((uint32_t)ch
- 0x1100) <= (0x1112 - 0x1100)) {
558 /* determines L for Jamo, doesn't need to store this since it is never
559 at the end of an expansion */
560 if (maxexpansion
->maxLSize
< expansionsize
) {
561 maxexpansion
->maxLSize
= expansionsize
;
563 return maxexpansion
->position
;
566 if (((uint32_t)ch
- 0x1161) <= (0x1175 - 0x1161)) {
567 /* determines V for Jamo */
568 if (maxexpansion
->maxVSize
< expansionsize
) {
569 maxexpansion
->maxVSize
= expansionsize
;
573 if (((uint32_t)ch
- 0x11A8) <= (0x11C2 - 0x11A8)) {
575 /* determines T for Jamo */
576 if (maxexpansion
->maxTSize
< expansionsize
) {
577 maxexpansion
->maxTSize
= expansionsize
;
581 if (maxexpansion
->size
== 0) {
582 /* we'll always make the first element 0, for easier manipulation */
583 maxexpansion
->endExpansionCE
=
584 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(uint32_t));
586 if (maxexpansion
->endExpansionCE
== NULL
) {
587 *status
= U_MEMORY_ALLOCATION_ERROR
;
590 *(maxexpansion
->endExpansionCE
) = 0;
592 (UBool
*)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(UBool
));
594 if (maxexpansion
->isV
== NULL
) {
595 *status
= U_MEMORY_ALLOCATION_ERROR
;
596 uprv_free(maxexpansion
->endExpansionCE
);
597 maxexpansion
->endExpansionCE
= NULL
;
600 *(maxexpansion
->isV
) = 0;
601 maxexpansion
->size
= INIT_EXP_TABLE_SIZE
;
602 maxexpansion
->position
= 0;
605 if (maxexpansion
->position
+ 1 == maxexpansion
->size
) {
/* NOTE(review): size is doubled before the reallocations are known to have
   succeeded, and each uprv_realloc result is stored straight into the struct
   member, so the previous buffer pointer is overwritten when the call returns
   NULL. */
606 maxexpansion
->size
*= 2;
607 maxexpansion
->endExpansionCE
= (uint32_t *)uprv_realloc(maxexpansion
->endExpansionCE
,
608 maxexpansion
->size
* sizeof(uint32_t));
609 if (maxexpansion
->endExpansionCE
== NULL
) {
611 fprintf(stderr
, "out of memory for maxExpansions\n");
613 *status
= U_MEMORY_ALLOCATION_ERROR
;
616 maxexpansion
->isV
= (UBool
*)uprv_realloc(maxexpansion
->isV
,
617 maxexpansion
->size
* sizeof(UBool
));
618 if (maxexpansion
->isV
== NULL
) {
620 fprintf(stderr
, "out of memory for maxExpansions\n");
622 *status
= U_MEMORY_ALLOCATION_ERROR
;
623 uprv_free(maxexpansion
->endExpansionCE
);
624 maxexpansion
->endExpansionCE
= NULL
;
629 uint32_t *pendexpansionce
= maxexpansion
->endExpansionCE
;
630 int pos
= maxexpansion
->position
;
634 if (*(pendexpansionce
+ pos
) == endexpansion
) {
635 return maxexpansion
->position
;
639 *(pendexpansionce
+ maxexpansion
->position
) = endexpansion
;
640 *(maxexpansion
->isV
+ maxexpansion
->position
) = isV
;
641 maxexpansion
->position
++;
643 return maxexpansion
->position
;
/*
 * Marks a code unit in the contraction-end bit table.
 *
 * A hash value at or above UCOL_UNSAFECP_TABLE_SIZE*8 is folded to
 * (hash & UCOL_UNSAFECP_TABLE_MASK) + 256. Bit (hash & 7) of byte
 * table[hash >> 3] is then set.
 *
 * NOTE(review): the declaration and initialization of hash are missing from
 * this excerpt; presumably hash starts out as c -- confirm.
 */
647 static void ContrEndCPSet(uint8_t *table
, UChar c
) {
652 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
653 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
655 htByte
= &table
[hash
>>3];
656 *htByte
|= (1 << (hash
& 7));
/*
 * Marks a code unit in the unsafe-code-point bit table.
 *
 * For hash values at or above UCOL_UNSAFECP_TABLE_SIZE*8:
 *  - values in 0xd800..0xf8ff (surrogates and private use, per the comment
 *    in the body) are not entered into the table;
 *  - other values are folded to (hash & UCOL_UNSAFECP_TABLE_MASK) + 256.
 * Bit (hash & 7) of byte table[hash >> 3] is then set.
 *
 * NOTE(review): the declaration and initialization of hash, and the statement
 * that skips the 0xd800..0xf8ff range, are missing from this excerpt;
 * presumably hash starts out as c and the skip is an early return -- confirm.
 */
660 static void unsafeCPSet(uint8_t *table
, UChar c
) {
665 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
666 if (hash
>= 0xd800 && hash
<= 0xf8ff) {
667 /* Part of a surrogate, or in private use area. */
668 /* These don't go in the table */
671 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
673 htByte
= &table
[hash
>>3];
674 *htByte
|= (1 << (hash
& 7));
678 /* to the UnsafeCP hash table, add all chars with combining class != 0 */
/*
 * Populates t->unsafeCP from normalization data.
 *
 * Pass 1: for each c starting at 0 while c < 0xffff, the FCD value is fetched
 * with unorm_getFCD16(); c is marked unsafe when fcd >= 0x100 or when c is a
 * lead surrogate with a non-zero FCD value.
 *
 * Pass 2 (only when t->prefixLookup is non-NULL): every element stored in the
 * hash has its cPoints normalized to NFC into NFCbuf (capacity argument 256)
 * and the first unit of that result is marked unsafe.
 *
 * Does nothing further when unorm_getFCDTrie() reports a failure in *status
 * (the statement inside that branch is not visible in this excerpt).
 */
679 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable
*t
, UErrorCode
*status
) {
682 uint16_t fcd
; // Hi byte is lead combining class.
683 // lo byte is trailing combining class.
684 const uint16_t *fcdTrieData
;
686 fcdTrieData
= unorm_getFCDTrie(status
);
687 if (U_FAILURE(*status
)) {
/* NOTE(review): the bound c<0xffff stops before U+FFFF itself -- confirm
   that excluding it is intended. */
691 for (c
=0; c
<0xffff; c
++) {
692 fcd
= unorm_getFCD16(fcdTrieData
, c
);
693 if (fcd
>= 0x100 || // if the leading combining class(c) > 0 ||
694 (UTF_IS_LEAD(c
) && fcd
!= 0)) // c is a leading surrogate with some FCD data
695 unsafeCPSet(t
->unsafeCP
, c
);
698 if(t
->prefixLookup
!= NULL
) {
700 const UHashElement
*e
= NULL
;
701 UCAElements
*element
= NULL
;
703 uint32_t NFCbufLen
= 0;
704 while((e
= uhash_nextElement(t
->prefixLookup
, &i
)) != NULL
) {
705 element
= (UCAElements
*)e
->value
.pointer
;
706 // codepoints here are in the NFD form. We need to add the
707 // first code point of the NFC form to unsafe, because
708 // strcoll needs to backup over them.
709 NFCbufLen
= unorm_normalize(element
->cPoints
, element
->cSize
, UNORM_NFC
, 0,
710 NFCbuf
, 256, status
);
/* NOTE(review): NFCbuf[0] is read without a visible check of *status or of
   NFCbufLen (which is otherwise unused in the visible code) -- confirm. */
711 unsafeCPSet(t
->unsafeCP
, NFCbuf
[0]);
/*
 * Enters the prefix of element into the contraction table under
 * SPEC_PROC_TAG.
 *
 * Visible steps:
 *  - the element's cPoints / cSize are saved, and restored at the end;
 *  - prefix units from index 1 on are marked unsafe unless they are trail
 *    surrogates;
 *  - the prefix buffer is reversed in place, because prefixes are looked up
 *    backwards;
 *  - the element's first code unit is marked unsafe (unless a trail
 *    surrogate);
 *  - cPoints / cSize are temporarily pointed at the reversed prefix, whose
 *    last unit is entered in contrEndCP (unless a trail surrogate);
 *  - t->image->jamoSpecial is set when prefix[0] satisfies UCOL_ISJAMO;
 *  - a new contraction sequence is built for CE, or the existing one is
 *    extended through uprv_uca_processContraction.
 *
 * The fprintf(stdout, ...) calls print diagnostics; presumably they sit under
 * a debug preprocessor guard that is not visible in this excerpt -- confirm.
 *
 * NOTE(review): original lines are missing from this excerpt (local
 * declarations, the condition that selects between the "new" and "existing"
 * branches, and the return statement), so those parts are not described.
 */
716 static uint32_t uprv_uca_addPrefix(tempUCATable
*t
, uint32_t CE
,
717 UCAElements
*element
, UErrorCode
*status
) {
718 // currently the longest prefix we're supporting in Japanese is two characters
719 // long. Although this table could quite easily mimic complete contraction stuff
720 // there is no good reason to make a general solution, as it would require some
721 // error prone messing.
722 CntTable
*contractions
= t
->contractions
;
725 UChar
*oldCP
= element
->cPoints
;
726 uint32_t oldCPSize
= element
->cSize
;
729 contractions
->currentTag
= SPEC_PROC_TAG
;
731 // here, we will normalize & add prefix to the table.
734 for(j
=0; j
<element
->cSize
; j
++) {
735 fprintf(stdout
, "CP: %04X ", element
->cPoints
[j
]);
737 fprintf(stdout
, "El: %08X Pref: ", CE
);
738 for(j
=0; j
<element
->prefixSize
; j
++) {
739 fprintf(stdout
, "%04X ", element
->prefix
[j
]);
741 fprintf(stdout
, "%08X ", element
->mapCE
);
744 for (j
= 1; j
<element
->prefixSize
; j
++) { /* First add NFD prefix chars to unsafe CP hash table */
745 // Unless it is a trail surrogate, which is handled algorithmically and
746 // shouldn't take up space in the table.
747 if(!(UTF_IS_TRAIL(element
->prefix
[j
]))) {
748 unsafeCPSet(t
->unsafeCP
, element
->prefix
[j
]);
752 UChar tempPrefix
= 0;
754 for(j
= 0; j
< /*nfcSize*/element
->prefixSize
/2; j
++) { // prefixes are going to be looked up backwards
755 // therefore, we will promptly reverse the prefix buffer...
756 tempPrefix
= *(/*nfcBuffer*/element
->prefix
+element
->prefixSize
-j
-1);
757 *(/*nfcBuffer*/element
->prefix
+element
->prefixSize
-j
-1) = element
->prefix
[j
];
758 element
->prefix
[j
] = tempPrefix
;
762 fprintf(stdout
, "Reversed: ");
763 for(j
=0; j
<element
->prefixSize
; j
++) {
764 fprintf(stdout
, "%04X ", element
->prefix
[j
]);
766 fprintf(stdout
, "%08X\n", element
->mapCE
);
769 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix
770 if(!(UTF_IS_TRAIL(element
->cPoints
[0]))) {
771 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[0]);
774 // Maybe we need this... To handle prefixes completely in the forward direction...
775 //if(element->cSize == 1) {
776 // if(!(UTF_IS_TRAIL(element->cPoints[0]))) {
777 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]);
781 element
->cPoints
= element
->prefix
;
782 element
->cSize
= element
->prefixSize
;
784 // Add the last char of the contraction to the contraction-end hash table.
785 // unless it is a trail surrogate, which is handled algorithmically and
786 // shouldn't be in the table
787 if(!(UTF_IS_TRAIL(element
->cPoints
[element
->cSize
-1]))) {
788 ContrEndCPSet(t
->contrEndCP
, element
->cPoints
[element
->cSize
-1]);
791 // First we need to check if contractions starts with a surrogate
792 UTF_NEXT_CHAR(element
->cPoints
, cpsize
, element
->cSize
, cp
);
794 // If there are any Jamos in the contraction, we should turn on special
795 // processing for Jamos
796 if(UCOL_ISJAMO(element
->prefix
[0])) {
797 t
->image
->jamoSpecial
= TRUE
;
799 /* then we need to deal with it */
800 /* we could already have something in table - or we might not */
803 /* if it wasn't contraction, we wouldn't end up here*/
804 int32_t firstContractionOffset
= 0;
805 int32_t contractionOffset
= 0;
806 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, CE
, status
);
807 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
808 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->prefix
, newCE
, status
);
809 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, CE
, status
);
810 CE
= constructContractCE(SPEC_PROC_TAG
, firstContractionOffset
);
811 } else { /* we are adding to existing contraction */
812 /* there were already some elements in the table, so we need to add a new contraction */
813 /* Two things can happen here: either the codepoint is already in the table, or it is not */
814 int32_t position
= uprv_cnttab_findCP(contractions
, CE
, *element
->prefix
, status
);
815 if(position
> 0) { /* if it is we just continue down the chain */
816 uint32_t eCE
= uprv_cnttab_getCE(contractions
, CE
, position
, status
);
817 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
818 uprv_cnttab_setContraction(contractions
, CE
, position
, *(element
->prefix
), newCE
, status
);
819 } else { /* if it isn't, we will have to create a new sequence */
/* NOTE(review): here the result of uprv_uca_processContraction is discarded
   and element->mapCE is inserted instead, whereas the matching branch of
   uprv_uca_addContraction inserts the returned newCE -- confirm this
   difference is intended. */
820 uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
821 uprv_cnttab_insertContraction(contractions
, CE
, *(element
->prefix
), element
->mapCE
, status
);
825 element
->cPoints
= oldCP
;
826 element
->cSize
= oldCPSize
;
831 // Note regarding surrogate handling: We are interested only in the single
832 // or leading surrogates in a contraction. If a surrogate is somewhere else
833 // in the contraction, it is going to be handled as a pair of code units,
834 // as it doesn't affect the performance AND handling surrogates specially
835 // would complicate code way too much.
/*
 * Adds the contraction described by element to the table, starting from the
 * collation element CE currently mapped for its first code point.
 *
 * Visible steps:
 *  - the current tag is set to CONTRACTION_TAG and the first code point cp
 *    (cpsize code units) is read with UTF_NEXT_CHAR;
 *  - when more units follow the first code point: units from index 1 on are
 *    marked unsafe and the last unit is entered in contrEndCP (trail
 *    surrogates excepted in both cases), t->image->jamoSpecial is set when
 *    cPoints[0] satisfies UCOL_ISJAMO, cPoints / cSize are advanced past the
 *    first code point for the duration of the table update and moved back
 *    afterwards, and the resulting CE is stored for cp with utrie_set32();
 *  - otherwise, when CE is not a contraction, element->mapCE is stored for cp
 *    directly;
 *  - otherwise the existing contraction's entries for 0 and 0xFFFF are
 *    changed to element->mapCE.
 *
 * NOTE(review): original lines are missing from this excerpt (local
 * declarations and the return statement), so the returned value is not
 * described.
 */
836 static uint32_t uprv_uca_addContraction(tempUCATable
*t
, uint32_t CE
,
837 UCAElements
*element
, UErrorCode
*status
) {
838 CntTable
*contractions
= t
->contractions
;
842 contractions
->currentTag
= CONTRACTION_TAG
;
844 // First we need to check if contractions starts with a surrogate
845 UTF_NEXT_CHAR(element
->cPoints
, cpsize
, element
->cSize
, cp
);
847 if(cpsize
<element
->cSize
) { // This is a real contraction, if there are other characters after the first
849 for (j
=1; j
<element
->cSize
; j
++) { /* First add contraction chars to unsafe CP hash table */
850 // Unless it is a trail surrogate, which is handled algorithmically and
851 // shouldn't take up space in the table.
852 if(!(UTF_IS_TRAIL(element
->cPoints
[j
]))) {
853 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[j
]);
856 // Add the last char of the contraction to the contraction-end hash table.
857 // unless it is a trail surrogate, which is handled algorithmically and
858 // shouldn't be in the table
859 if(!(UTF_IS_TRAIL(element
->cPoints
[element
->cSize
-1]))) {
860 ContrEndCPSet(t
->contrEndCP
, element
->cPoints
[element
->cSize
-1]);
863 // If there are any Jamos in the contraction, we should turn on special
864 // processing for Jamos
865 if(UCOL_ISJAMO(element
->cPoints
[0])) {
866 t
->image
->jamoSpecial
= TRUE
;
868 /* then we need to deal with it */
869 /* we could already have something in table - or we might not */
870 element
->cPoints
+=cpsize
;
871 element
->cSize
-=cpsize
;
872 if(!isContraction(CE
)) {
873 /* if it wasn't contraction, we wouldn't end up here*/
874 int32_t firstContractionOffset
= 0;
875 int32_t contractionOffset
= 0;
876 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, CE
, status
);
877 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
878 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->cPoints
, newCE
, status
);
879 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, CE
, status
);
880 CE
= constructContractCE(CONTRACTION_TAG
, firstContractionOffset
);
881 } else { /* we are adding to existing contraction */
882 /* there were already some elements in the table, so we need to add a new contraction */
883 /* Two things can happen here: either the codepoint is already in the table, or it is not */
884 int32_t position
= uprv_cnttab_findCP(contractions
, CE
, *element
->cPoints
, status
);
885 if(position
> 0) { /* if it is we just continue down the chain */
886 uint32_t eCE
= uprv_cnttab_getCE(contractions
, CE
, position
, status
);
887 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
888 uprv_cnttab_setContraction(contractions
, CE
, position
, *(element
->cPoints
), newCE
, status
);
889 } else { /* if it isn't, we will have to create a new sequence */
890 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
891 uprv_cnttab_insertContraction(contractions
, CE
, *(element
->cPoints
), newCE
, status
);
894 element
->cPoints
-=cpsize
;
895 element
->cSize
+=cpsize
;
896 /*ucmpe32_set(t->mapping, cp, CE);*/
897 utrie_set32(t
->mapping
, cp
, CE
);
898 } else if(!isContraction(CE
)) { /* this is just a surrogate, and there is no contraction */
899 /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
900 utrie_set32(t
->mapping
, cp
, element
->mapCE
);
901 } else { /* fill out the first stage of the contraction with the surrogate CE */
902 uprv_cnttab_changeContraction(contractions
, CE
, 0, element
->mapCE
, status
);
903 uprv_cnttab_changeContraction(contractions
, CE
, 0xFFFF, element
->mapCE
, status
);
/*
 * Recursive worker that threads the remaining code units of element through
 * the contraction table.
 *
 * Visible behavior:
 *  - returns UCOL_NOT_FOUND immediately when *status already is a failure;
 *  - end of recursion (element->cSize == 1): when existingCE is a
 *    contraction-table element carrying contractions->currentTag, its entries
 *    for 0 and 0xFFFF are changed to element->mapCE; element->mapCE is
 *    returned from this branch (the exact branch structure is not fully
 *    visible);
 *  - when existingCE is not a contraction-table element, a new sequence is
 *    created: a first entry for existingCE, an entry for the current code
 *    unit holding the CE produced by the recursive call, and a 0xFFFF entry
 *    for existingCE; existingCE is then replaced by a CE built from the
 *    current tag and the new offset;
 *  - otherwise the current code unit is looked up in the existing sequence
 *    and either updated in place or inserted, again using the CE produced by
 *    the recursive call.
 *
 * NOTE(review): original lines are missing from this excerpt, including the
 * statements that advance / restore element->cPoints and element->cSize
 * around the recursive calls and the final return, so those parts are not
 * described.
 */
909 static uint32_t uprv_uca_processContraction(CntTable
*contractions
, UCAElements
*element
, uint32_t existingCE
, UErrorCode
*status
) {
910 int32_t firstContractionOffset
= 0;
911 int32_t contractionOffset
= 0;
912 // uint32_t contractionElement = UCOL_NOT_FOUND;
914 if(U_FAILURE(*status
)) {
915 return UCOL_NOT_FOUND
;
918 /* end of recursion */
919 if(element
->cSize
== 1) {
920 if(isCntTableElement(existingCE
) && ((UColCETags
)getCETag(existingCE
) == contractions
->currentTag
)) {
921 uprv_cnttab_changeContraction(contractions
, existingCE
, 0, element
->mapCE
, status
);
922 uprv_cnttab_changeContraction(contractions
, existingCE
, 0xFFFF, element
->mapCE
, status
);
925 return element
->mapCE
; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
929 /* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
930 /* for both backward and forward cycles */
932 /* we encountered either an empty space or a non-contraction element */
933 /* this means we are constructing a new contraction sequence */
936 if(!isCntTableElement(existingCE
)) {
937 /* if it wasn't contraction, we wouldn't end up here*/
938 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, existingCE
, status
);
939 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
940 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->cPoints
, newCE
, status
);
941 contractionOffset
= uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, existingCE
, status
);
942 existingCE
= constructContractCE(contractions
->currentTag
, firstContractionOffset
);
943 } else { /* we are adding to existing contraction */
944 /* there were already some elements in the table, so we need to add a new contraction */
945 /* Two things can happen here: either the codepoint is already in the table, or it is not */
946 int32_t position
= uprv_cnttab_findCP(contractions
, existingCE
, *element
->cPoints
, status
);
947 if(position
> 0) { /* if it is we just continue down the chain */
948 uint32_t eCE
= uprv_cnttab_getCE(contractions
, existingCE
, position
, status
);
949 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
950 uprv_cnttab_setContraction(contractions
, existingCE
, position
, *(element
->cPoints
), newCE
, status
);
951 } else { /* if it isn't, we will have to create a new sequence */
952 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
953 uprv_cnttab_insertContraction(contractions
, existingCE
, *(element
->cPoints
), newCE
, status
);
// Stores element->mapCE in the table for the code point(s) in element->cPoints.
//
//  - When element->mapCE is 0, every code unit of cPoints that is not a trail
//    surrogate is first flagged in t->unsafeCP.
//  - When element->cSize > 1, the first code point is read with UTF_NEXT_CHAR,
//    its current value is fetched from t->mapping and both are handed to
//    uprv_uca_addContraction; CE receives that call's result.
//  - Otherwise the current value for cPoints[0] is fetched from t->mapping and
//    either an existing contraction entry is updated or the trie slot is set
//    to element->mapCE.
//
// t       - temporary table being built (mapping trie, contractions, unsafeCP).
// element - element to store; reads cPoints, cSize, mapCE and CEs[0].
// status  - ICU error code forwarded to the contraction-table helpers.
//
// NOTE(review): the closing braces, the else keywords and the return statement
// are not present in this listing, so the exact nesting of the statements
// after the "easy case" lookup must be confirmed against the full file.
961 static uint32_t uprv_uca_finalizeAddition(tempUCATable
*t
, UCAElements
*element
, UErrorCode
*status
) {
962 uint32_t CE
= UCOL_NOT_FOUND
;
963 // This should add a completely ignorable element to the
964 // unsafe table, so that backward iteration will skip
965 // over it when treating contractions.
967 if(element
->mapCE
== 0) {
968 for(i
= 0; i
< element
->cSize
; i
++) {
// Trail surrogates are skipped; every other code unit is marked unsafe.
969 if(!UTF_IS_TRAIL(element
->cPoints
[i
])) {
970 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[i
]);
974 if(element
->cSize
> 1) { /* we're adding a contraction */
// Decode the leading code point of the sequence into cp, advancing i.
978 UTF_NEXT_CHAR(element
->cPoints
, i
, element
->cSize
, cp
);
979 /*CE = ucmpe32_get(t->mapping, cp);*/
980 CE
= utrie_get32(t
->mapping
, cp
, NULL
);
// The value currently mapped for cp is passed in as the starting CE.
982 CE
= uprv_uca_addContraction(t
, CE
, element
, status
);
983 } else { /* easy case, */
984 /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
985 CE
= utrie_get32(t
->mapping
, element
->cPoints
[0], NULL
);
987 if( CE
!= UCOL_NOT_FOUND
) {
988 if(isCntTableElement(CE
) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
989 if(!isPrefix(element
->mapCE
)) { // we cannot reenter prefix elements - as we are going to create a dead loop
990 // Only expansions and regular CEs can go here... Contractions will never happen in this place
991 uprv_cnttab_setContraction(t
->contractions
, CE
, 0, 0, element
->mapCE
, status
);
992 /* This loop has to change the CE at the end of contraction REDO!*/
993 uprv_cnttab_changeLastCE(t
->contractions
, CE
, element
->mapCE
, status
);
996 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
997 utrie_set32(t
->mapping
, element
->cPoints
[0], element
->mapCE
);
// NOTE(review): the message reports element->CEs[0] as the new value, while
// the value actually written above is element->mapCE - confirm which is meant.
999 fprintf(stderr
, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE
, element
->cPoints
[0], element
->CEs
[0]);
1000 //*status = U_ILLEGAL_ARGUMENT_ERROR;
1004 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
1005 utrie_set32(t
->mapping
, element
->cPoints
[0], element
->mapCE
);
// Adds one element to the temporary table t.
//
// Steps visible in this function:
//  1. element->mapCE is cleared, then derived from element->CEs:
//       - one CE: copied as is;
//       - two CEs forming a "long primary": packed into a LONG_PRIMARY_TAG
//         special;
//       - otherwise: the CEs are appended to t->expansions and mapCE becomes
//         an EXPANSION_TAG special; max-expansion (and, for Jamo code points,
//         max-Jamo-expansion) bookkeeping is updated.
//  2. A code point for which u_isdigit() is true is wrapped in a DIGIT_TAG
//     special and, if it is <= 0xFFFF, marked in t->unsafeCP.
//  3. If element->prefixSize != 0 the prefix is registered through
//     uprv_uca_addPrefix, using t->prefixLookup to find earlier entries for
//     the same code points; an NFC-composed variant of the prefix is
//     registered as well when it differs.
//  4. A multi-unit sequence that is not just one surrogate pair is expanded
//     with CanonicalIterator; each equivalent that does not fail the FCD
//     quick check is stored via uprv_uca_finalizeAddition. Anything else is
//     stored with a single uprv_uca_finalizeAddition call.
//
// t       - temporary table being built.
// element - element to add; mapCE, cSize and cPoints are modified here.
// status  - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when a
//           uprv_malloc in the prefix handling fails.
//
// NOTE(review): early returns, closing braces and the final return are not
// present in this listing; the return value is presumably CE - confirm.
1011 /* This adds a read element, while testing for existence */
1012 U_CAPI
uint32_t U_EXPORT2
1013 uprv_uca_addAnElement(tempUCATable
*t
, UCAElements
*element
, UErrorCode
*status
) {
1014 ExpansionTable
*expansions
= t
->expansions
;
1017 uint32_t expansion
= 0;
1020 if(U_FAILURE(*status
)) {
1024 element
->mapCE
= 0; // clear mapCE so that we can catch expansions
1026 if(element
->noOfCEs
== 1) {
1027 element
->mapCE
= element
->CEs
[0];
1029 /* ICU 2.1 long primaries */
1030 /* unfortunately, it looks like we have to look for a long primary here */
1031 /* since in canonical closure we are going to hit some long primaries from */
1032 /* the first phase, and they will come back as continuations/expansions */
1033 /* destroying the effect of the previous optimization */
1034 /* A long primary is a three byte primary with starting secondaries and tertiaries */
1035 /* It can appear in long runs of only primary differences (like east Asian tailorings) */
1036 /* also, it should not be an expansion, as expansions would break with this */
1037 // This part came in from ucol_bld.cpp
1038 //if(tok->expansion == 0
1039 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
1040 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
1041 /* we will construct a special CE that will go unchanged to the table */
1042 if(element
->noOfCEs
== 2 // a two CE expansion
1043 && isContinuation(element
->CEs
[1]) // which is a continuation
1044 && (element
->CEs
[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER
))) == 0 // that has only primaries in continuation,
1045 && (((element
->CEs
[0]>>8) & 0xFF) == UCOL_BYTE_COMMON
) // a common secondary
1046 && ((element
->CEs
[0] & 0xFF) == UCOL_BYTE_COMMON
) // and a common tertiary
1049 fprintf(stdout
, "Long primary %04X\n", element
->cPoints
[0]);
// Pack the three primary bytes (two from CEs[0], one from CEs[1]) into one special CE.
1051 element
->mapCE
= UCOL_SPECIAL_FLAG
| (LONG_PRIMARY_TAG
<<24) // a long primary special
1052 | ((element
->CEs
[0]>>8) & 0xFFFF00) // first and second byte of primary
1053 | ((element
->CEs
[1]>>24) & 0xFF); // third byte of primary
// General expansion: the payload is the index returned for the first CE,
// offset by headersize>>2 and shifted left by 4.
1055 expansion
= (uint32_t)(UCOL_SPECIAL_FLAG
| (EXPANSION_TAG
<<UCOL_TAG_SHIFT
)
1056 | ((uprv_uca_addExpansion(expansions
, element
->CEs
[0], status
)+(headersize
>>2))<<4)
// Append the remaining CEs of the expansion.
1059 for(i
= 1; i
<element
->noOfCEs
; i
++) {
1060 uprv_uca_addExpansion(expansions
, element
->CEs
[i
], status
);
// A CE count that fits in four bits is stored in the low bits of the special.
1062 if(element
->noOfCEs
<= 0xF) {
1063 expansion
|= element
->noOfCEs
;
// NOTE(review): presumably the alternative branch for longer expansions, which
// appends a zero CE; the branch keyword is not visible here - confirm.
1065 uprv_uca_addExpansion(expansions
, 0, status
);
1067 element
->mapCE
= expansion
;
// Record the expansion length keyed by the last CE of the expansion.
1068 uprv_uca_setMaxExpansion(element
->CEs
[element
->noOfCEs
- 1],
1069 (uint8_t)element
->noOfCEs
,
1072 if(UCOL_ISJAMO(element
->cPoints
[0])) {
1073 t
->image
->jamoSpecial
= TRUE
;
1074 uprv_uca_setMaxJamoExpansion(element
->cPoints
[0],
1075 element
->CEs
[element
->noOfCEs
- 1],
1076 (uint8_t)element
->noOfCEs
,
1077 t
->maxJamoExpansions
,
1079 if (U_FAILURE(*status
)) {
1086 // We treat digits differently - they are "uber special" and should be
1087 // processed differently if numeric collation is on.
1088 UChar32 uniChar
= 0;
1089 //printElement(element);
// uniChar stays 0 unless the element is exactly one code point
// (a single code unit, or a lead surrogate followed by one more unit).
1090 if ((element
->cSize
== 2) && U16_IS_LEAD(element
->cPoints
[0])){
1091 uniChar
= U16_GET_SUPPLEMENTARY(element
->cPoints
[0], element
->cPoints
[1]);
1092 } else if (element
->cSize
== 1){
1093 uniChar
= element
->cPoints
[0];
1096 // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
1097 // one element to the expansion buffer. When we encounter a digit and we don't
1098 // do numeric collation, we will just pick the CE we have and break out of case
1099 // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
1100 // a special, further processing will occur. If it's a simple CE, we'll return due
1101 // to how the loop is constructed.
1102 if (uniChar
!= 0 && u_isdigit(uniChar
)){
1103 expansion
= (uint32_t)(UCOL_SPECIAL_FLAG
| (DIGIT_TAG
<<UCOL_TAG_SHIFT
) | 1); // prepare the element
1104 if(element
->mapCE
) { // if there is an expansion, we'll pick it here
1105 expansion
|= ((uprv_uca_addExpansion(expansions
, element
->mapCE
, status
)+(headersize
>>2))<<4);
1107 expansion
|= ((uprv_uca_addExpansion(expansions
, element
->CEs
[0], status
)+(headersize
>>2))<<4);
1109 element
->mapCE
= expansion
;
1111 // Need to go back to the beginning of the digit string if in the middle!
1112 if(uniChar
<= 0xFFFF) { // supplementaries are always unsafe. API takes UChars
1113 unsafeCPSet(t
->unsafeCP
, (UChar
)uniChar
);
1117 // here we want to add the prefix structure.
1118 // I will try to process it as a reverse contraction, if possible.
1119 // prefix buffer is already reversed.
1121 if(element
->prefixSize
!=0) {
1122 // We keep the seen prefix starter elements in a hashtable
1123 // we need it to be able to distinguish between the simple
1124 // codepoints and prefix starters. Also, we need to use it
1125 // for canonical closure.
// Heap copy of the element whose prefix is replaced by its NFC form below;
// it is released by the uprv_free(composed) call further down.
1127 UCAElements
*composed
= (UCAElements
*)uprv_malloc(sizeof(UCAElements
));
1129 if (composed
== NULL
) {
1130 *status
= U_MEMORY_ALLOCATION_ERROR
;
1133 uprv_memcpy(composed
, element
, sizeof(UCAElements
));
// Re-point the copied pointers at the copy's own embedded buffers.
1134 composed
->cPoints
= composed
->uchars
;
1135 composed
->prefix
= composed
->prefixChars
;
1137 composed
->prefixSize
= unorm_normalize(element
->prefix
, element
->prefixSize
, UNORM_NFC
, 0, composed
->prefix
, 128, status
);
1140 if(t
->prefixLookup
!= NULL
) {
1141 UCAElements
*uCE
= (UCAElements
*)uhash_get(t
->prefixLookup
, element
);
1142 if(uCE
!= NULL
) { // there is already a set of code points here
1143 element
->mapCE
= uprv_uca_addPrefix(t
, uCE
->mapCE
, element
, status
);
1144 } else { // no code points, so this spot is clean
1145 element
->mapCE
= uprv_uca_addPrefix(t
, UCOL_NOT_FOUND
, element
, status
);
1146 uCE
= (UCAElements
*)uprv_malloc(sizeof(UCAElements
));
// NOTE(review): on this allocation failure `composed` is still allocated;
// the exit path is not visible here - confirm it is freed before returning.
1149 *status
= U_MEMORY_ALLOCATION_ERROR
;
1152 uprv_memcpy(uCE
, element
, sizeof(UCAElements
));
1153 uCE
->cPoints
= uCE
->uchars
;
// The copy is used as both key and value in the hashtable.
1154 uhash_put(t
->prefixLookup
, uCE
, uCE
, status
);
// When NFC changed the prefix, register the composed form as well.
// NOTE(review): the third uprv_memcmp argument is element->prefixSize, which
// is passed to unorm_normalize above as a length in UChars; if uprv_memcmp
// takes a byte count, only part of the prefix is compared - confirm.
1156 if(composed
->prefixSize
!= element
->prefixSize
|| uprv_memcmp(composed
->prefix
, element
->prefix
, element
->prefixSize
)) {
1158 composed
->mapCE
= uprv_uca_addPrefix(t
, element
->mapCE
, composed
, status
);
1161 uprv_free(composed
);
1164 // We need to use the canonical iterator here
1165 // the way we do it is to generate the canonically equivalent strings
1166 // for the contraction and then add the sequences that pass FCD check
1167 if(element
->cSize
> 1 && !(element
->cSize
==2 && UTF16_IS_LEAD(element
->cPoints
[0]) && UTF16_IS_TRAIL(element
->cPoints
[1]))) { // this is a contraction, we should check whether a composed form should also be included
1168 UnicodeString
source(element
->cPoints
, element
->cSize
);
1169 CanonicalIterator
it(source
, *status
);
// Iterate until the iterator hands back a bogus string.
1171 while(!source
.isBogus()) {
1172 if(Normalizer::quickCheck(source
, UNORM_FCD
, *status
) != UNORM_NO
) {
// The element's own buffer is overwritten with the current equivalent.
1173 element
->cSize
= source
.extract(element
->cPoints
, 128, *status
);
1174 uprv_uca_finalizeAddition(t
, element
, status
);
1178 CE
= element
->mapCE
;
1180 CE
= uprv_uca_finalizeAddition(t
, element
, status
);
// Records maximum-expansion sizes for Hangul Jamo in maxexpansion.
//
// For code points counted down from VBASE + VCOUNT - 1 and from
// TBASE + TCOUNT - 1, the value stored in the mapping trie is read and, when
// it is below UCOL_SPECIAL_FLAG, registered with uprv_uca_setMaxExpansion
// using size 2 (V range) or 3 (T range).
// The remaining part walks the entries of maxjamoexpansion and registers each
// endExpansionCE with maxLSize + maxVSize when its isV flag is TRUE, or with
// maxLSize + maxVSize + maxTSize otherwise.
//
// mapping          - trie queried with utrie_get32.
// maxexpansion     - table updated through uprv_uca_setMaxExpansion.
// maxjamoexpansion - source of position, isV, endExpansionCE and max*Size.
// status           - ICU error code forwarded to uprv_uca_setMaxExpansion.
//
// NOTE(review): the remaining parameters, the loop decrements and the second
// loop header are not present in this listing.
1187 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */
1188 static void uprv_uca_getMaxExpansionJamo(UNewTrie
*mapping
,
1189 MaxExpansionTable
*maxexpansion
,
1190 MaxJamoExpansionTable
*maxjamoexpansion
,
1194 const uint32_t VBASE
= 0x1161;
1195 const uint32_t TBASE
= 0x11A8;
1196 const uint32_t VCOUNT
= 21;
1197 const uint32_t TCOUNT
= 28;
// Both cursors start at the last code point of their range.
1199 uint32_t v
= VBASE
+ VCOUNT
- 1;
1200 uint32_t t
= TBASE
+ TCOUNT
- 1;
1203 while (v
>= VBASE
) {
1204 /*ce = ucmpe32_get(mapping, v);*/
1205 ce
= utrie_get32(mapping
, v
, NULL
);
// Only values below UCOL_SPECIAL_FLAG are recorded.
1206 if (ce
< UCOL_SPECIAL_FLAG
) {
1207 uprv_uca_setMaxExpansion(ce
, 2, maxexpansion
, status
);
1214 /*ce = ucmpe32_get(mapping, t);*/
1215 ce
= utrie_get32(mapping
, t
, NULL
);
1216 if (ce
< UCOL_SPECIAL_FLAG
) {
1217 uprv_uca_setMaxExpansion(ce
, 3, maxexpansion
, status
);
1221 /* According to the docs, 99% of the time, the Jamo will not be special */
1223 /* gets the max expansion in all unicode characters */
1224 int count
= maxjamoexpansion
->position
;
// Note: the local maxTSize is the sum L + V + T, and the local maxVSize is
// the sum L + V; they are not the individual table fields of the same name.
1225 uint8_t maxTSize
= (uint8_t)(maxjamoexpansion
->maxLSize
+
1226 maxjamoexpansion
->maxVSize
+
1227 maxjamoexpansion
->maxTSize
);
1228 uint8_t maxVSize
= (uint8_t)(maxjamoexpansion
->maxLSize
+
1229 maxjamoexpansion
->maxVSize
);
1233 if (*(maxjamoexpansion
->isV
+ count
) == TRUE
) {
1234 uprv_uca_setMaxExpansion(
1235 *(maxjamoexpansion
->endExpansionCE
+ count
),
1236 maxVSize
, maxexpansion
, status
);
1239 uprv_uca_setMaxExpansion(
1240 *(maxjamoexpansion
->endExpansionCE
+ count
),
1241 maxTSize
, maxexpansion
, status
);
// Folding callback handed to utrie_serialize (see uprv_uca_assembleTable).
//
// Scans code points from start up to limit. A position reported as lying in
// block zero advances start by UTRIE_DATA_BLOCK_LENGTH. The first value that
// is not a special carrying IMPLICIT_TAG or NOT_FOUND_TAG makes the function
// return a SURROGATE_TAG special with offset in its low bits.
//
// trie   - trie being serialized; read with utrie_get32.
// start  - first code point to inspect.
// offset - value OR-ed into the returned special.
//
// NOTE(review): the declarations of value, tag, limit and inBlockZero, the
// advance of start in the remaining case and the fall-through return are not
// present in this listing. The block comment opened below is also not closed
// within the visible lines.
1248 static inline uint32_t U_CALLCONV
1249 getFoldedValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
)
1257 while(start
<limit
) {
1258 value
=utrie_get32(trie
, start
, &inBlockZero
);
1259 tag
= getCETag(value
);
1260 if(inBlockZero
== TRUE
) {
1261 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1262 } else if(!(isSpecial(value
) && (tag
== IMPLICIT_TAG
|| tag
== NOT_FOUND_TAG
))) {
1263 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the
1264 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is
1265 * nothing in this position and that it should be skipped.
1268 static int32_t count
= 1;
1269 fprintf(stdout
, "%i, Folded %08X, value %08X\n", count
++, start
, value
);
1271 return (uint32_t)(UCOL_SPECIAL_FLAG
| (SURROGATE_TAG
<<24) | offset
);
// Range callback that prints one trie range to stdout.
// Two output forms are visible: a plain "start, limit, value" line, and a
// line that additionally shows the UTF-16 lead and trail units of start and
// limit.
// NOTE(review): the condition selecting between the two forms is not present
// in this listing; context is not used by the visible statements.
1281 // This is a debug function to print the contents of a trie.
1282 // It is used in conjunction with the code around utrie_unserialize call
1283 void enumRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
1285 fprintf(stdout
, "%08X, %08X, %08X\n", start
, limit
, value
);
1287 fprintf(stdout
, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start
, UTF16_LEAD(start
), UTF16_TRAIL(start
), limit
, UTF16_LEAD(limit
), UTF16_TRAIL(limit
), value
);
// Extracts the folding offset from a trie data word: when data is greater
// than UCOL_NOT_FOUND and carries SURROGATE_TAG, the low 24 bits are returned.
// Assigned to UCAt.getFoldingOffset by the debug code in
// uprv_uca_assembleTable.
// NOTE(review): the return-type line and the fall-through return are not
// present in this listing.
1292 myGetFoldingOffset(uint32_t data
) {
1293 if(data
> UCOL_NOT_FOUND
&& getCETag(data
) == SURROGATE_TAG
) {
1294 return (data
&0xFFFFFF);
// Flattens the temporary table t into one uprv_malloc'ed, zero-filled buffer
// that begins with a UCATableHeader.
//
// Sections are written in this order, each recorded in the header as a byte
// offset from the start of the buffer:
//   header, options, expansions, contraction index + contraction CEs (both
//   offsets are 0 when there are no contractions), serialized mapping trie,
//   end-expansion CEs, expansion sizes, unsafeCP table, contrEndCP table.
// Before the trie is serialized, Jamo max-expansion data is derived from it
// via uprv_uca_getMaxExpansionJamo. When t->UCA is non-zero, its unsafeCP and
// contrEndCP tables are OR-ed into t's tables before they are copied out.
//
// t      - temporary table to flatten; its unsafeCP and contrEndCP arrays are
//          modified.
// status - ICU error code; set to U_MEMORY_ALLOCATION_ERROR when the buffer
//          cannot be allocated, and to U_INTERNAL_PROGRAM_ERROR (buffer freed)
//          when the bytes written differ from the size computed up front.
//
// NOTE(review): the return statements are not present in this listing; the
// result is presumably myData on success - confirm.
1301 U_CAPI UCATableHeader
* U_EXPORT2
1302 uprv_uca_assembleTable(tempUCATable
*t
, UErrorCode
*status
) {
1303 /*CompactEIntArray *mapping = t->mapping;*/
1304 UNewTrie
*mapping
= t
->mapping
;
1305 ExpansionTable
*expansions
= t
->expansions
;
1306 CntTable
*contractions
= t
->contractions
;
1307 MaxExpansionTable
*maxexpansion
= t
->maxExpansions
;
1309 if(U_FAILURE(*status
)) {
// Position, in UChar units, at which the contraction data will start.
1313 uint32_t beforeContractions
= (uint32_t)((headersize
+paddedsize(expansions
->position
*sizeof(uint32_t)))/sizeof(UChar
));
1315 int32_t contractionsSize
= 0;
1316 contractionsSize
= uprv_cnttab_constructTable(contractions
, beforeContractions
, status
);
1318 /* the following operation depends on the trie data. Therefore, we have to do it before */
1319 /* the trie is compacted */
1320 /* sets jamo expansions */
1321 uprv_uca_getMaxExpansionJamo(mapping
, maxexpansion
, t
->maxJamoExpansions
,
1322 t
->image
->jamoSpecial
, status
);
1324 /*ucmpe32_compact(mapping);*/
1325 /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
1326 /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
1327 /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/
1329 // After setting the jamo expansions, compact the trie and get the needed size
// First utrie_serialize call: NULL destination with capacity 0, used for the size.
1330 int32_t mappingSize
= utrie_serialize(mapping
, NULL
, 0, getFoldedValue
/*getFoldedValue*/, FALSE
, status
);
1332 uint32_t tableOffset
= 0;
1335 /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */
// NOTE(review): this total starts from headersize, while tableOffset below
// advances by paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet));
// the final size check only passes if the two agree - confirm.
1337 uint32_t toAllocate
=(uint32_t)(headersize
+
1338 paddedsize(expansions
->position
*sizeof(uint32_t))+
1339 paddedsize(mappingSize
)+
1340 paddedsize(contractionsSize
*(sizeof(UChar
)+sizeof(uint32_t)))+
1341 //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */
1342 /* maxexpansion array */
1343 + paddedsize(maxexpansion
->position
* sizeof(uint32_t)) +
1344 /* maxexpansion size array */
1345 paddedsize(maxexpansion
->position
* sizeof(uint8_t)) +
1346 paddedsize(UCOL_UNSAFECP_TABLE_SIZE
) + /* Unsafe chars */
1347 paddedsize(UCOL_UNSAFECP_TABLE_SIZE
)); /* Contraction Ending chars */
1350 dataStart
= (uint8_t *)uprv_malloc(toAllocate
);
1352 if (dataStart
== NULL
) {
1353 *status
= U_MEMORY_ALLOCATION_ERROR
;
// The header lives at the very start of the buffer.
1357 UCATableHeader
*myData
= (UCATableHeader
*)dataStart
;
1358 // Please, do reset all the fields!
1359 uprv_memset(dataStart
, 0, toAllocate
);
1360 // Make sure we know this is reset
1361 myData
->magic
= UCOL_HEADER_MAGIC
;
1362 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
1363 myData
->charSetFamily
= U_CHARSET_FAMILY
;
1364 myData
->formatVersion
[0] = UCA_FORMAT_VERSION_0
;
1365 myData
->formatVersion
[1] = UCA_FORMAT_VERSION_1
;
1366 myData
->formatVersion
[2] = UCA_FORMAT_VERSION_2
;
1367 myData
->formatVersion
[3] = UCA_FORMAT_VERSION_3
;
1368 myData
->jamoSpecial
= t
->image
->jamoSpecial
;
1370 // Don't copy stuff from UCA header!
1371 //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
1373 myData
->contractionSize
= contractionsSize
;
1375 tableOffset
+= (uint32_t)(paddedsize(sizeof(UCATableHeader
)));
// Options block directly follows the header.
1377 myData
->options
= tableOffset
;
1378 uprv_memcpy(dataStart
+tableOffset
, t
->options
, sizeof(UColOptionSet
));
1379 tableOffset
+= (uint32_t)(paddedsize(sizeof(UColOptionSet
)));
1381 /* copy expansions */
1382 /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
1383 myData
->expansion
= tableOffset
;
1384 uprv_memcpy(dataStart
+tableOffset
, expansions
->CEs
, expansions
->position
*sizeof(uint32_t));
1385 tableOffset
+= (uint32_t)(paddedsize(expansions
->position
*sizeof(uint32_t)));
1387 /* contractions block */
1388 if(contractionsSize
!= 0) {
1389 /* copy contraction index */
1390 /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
1391 myData
->contractionIndex
= tableOffset
;
1392 uprv_memcpy(dataStart
+tableOffset
, contractions
->codePoints
, contractionsSize
*sizeof(UChar
));
1393 tableOffset
+= (uint32_t)(paddedsize(contractionsSize
*sizeof(UChar
)));
1395 /* copy contraction collation elements */
1396 /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
1397 myData
->contractionCEs
= tableOffset
;
1398 uprv_memcpy(dataStart
+tableOffset
, contractions
->CEs
, contractionsSize
*sizeof(uint32_t));
1399 tableOffset
+= (uint32_t)(paddedsize(contractionsSize
*sizeof(uint32_t)));
// With no contractions both header offsets are left at zero.
1401 myData
->contractionIndex
= 0;
1402 myData
->contractionCEs
= 0;
1405 /* copy mapping table */
1406 /*myData->mappingPosition = dataStart+tableOffset;*/
1407 /*myData->mappingPosition = tableOffset;*/
1408 /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/
1410 myData
->mappingPosition
= tableOffset
;
// Second utrie_serialize call: writes the trie into the remaining buffer space.
1411 utrie_serialize(mapping
, dataStart
+tableOffset
, toAllocate
-tableOffset
, getFoldedValue
, FALSE
, status
);
1413 // This is debug code to dump the contents of the trie. It needs two functions defined above
1417 utrie_unserialize(&UCAt
, dataStart
+tableOffset
, 9999999, status
);
1418 UCAt
.getFoldingOffset
= myGetFoldingOffset
;
1419 if(U_SUCCESS(*status
)) {
1420 utrie_enum(&UCAt
, NULL
, enumRange
, NULL
);
// NOTE(review): the statement below has no terminating semicolon in this
// listing; it only compiles if this debug section is preprocessed away.
1422 trieWord
= UTRIE_GET32_FROM_LEAD(UCAt
, 0xDC01)
1425 tableOffset
+= paddedsize(mappingSize
);
1430 /* copy max expansion table */
1431 myData
->endExpansionCE
= tableOffset
;
1432 myData
->endExpansionCECount
= maxexpansion
->position
- 1;
1433 /* not copying the first element which is a dummy */
1434 uprv_memcpy(dataStart
+ tableOffset
, maxexpansion
->endExpansionCE
+ 1,
1435 (maxexpansion
->position
- 1) * sizeof(uint32_t));
// The offset advances by the padded size of all `position` entries, matching
// the toAllocate computation, although only position - 1 entries are copied.
1436 tableOffset
+= (uint32_t)(paddedsize((maxexpansion
->position
)* sizeof(uint32_t)));
1437 myData
->expansionCESize
= tableOffset
;
1438 uprv_memcpy(dataStart
+ tableOffset
, maxexpansion
->expansionCESize
+ 1,
1439 (maxexpansion
->position
- 1) * sizeof(uint8_t));
1440 tableOffset
+= (uint32_t)(paddedsize((maxexpansion
->position
)* sizeof(uint8_t)));
1442 /* Unsafe chars table. Finish it off, then copy it. */
1443 uprv_uca_unsafeCPAddCCNZ(t
, status
);
1444 if (t
->UCA
!= 0) { /* Or in unsafebits from UCA, making a combined table. */
1445 for (i
=0; i
<UCOL_UNSAFECP_TABLE_SIZE
; i
++) {
1446 t
->unsafeCP
[i
] |= t
->UCA
->unsafeCP
[i
];
1449 myData
->unsafeCP
= tableOffset
;
1450 uprv_memcpy(dataStart
+ tableOffset
, t
->unsafeCP
, UCOL_UNSAFECP_TABLE_SIZE
);
1451 tableOffset
+= paddedsize(UCOL_UNSAFECP_TABLE_SIZE
);
1454 /* Finish building Contraction Ending chars hash table and then copy it out. */
1455 if (t
->UCA
!= 0) { /* Or in unsafebits from UCA, making a combined table. */
1456 for (i
=0; i
<UCOL_UNSAFECP_TABLE_SIZE
; i
++) {
1457 t
->contrEndCP
[i
] |= t
->UCA
->contrEndCP
[i
];
1460 myData
->contrEndCP
= tableOffset
;
1461 uprv_memcpy(dataStart
+ tableOffset
, t
->contrEndCP
, UCOL_UNSAFECP_TABLE_SIZE
);
1462 tableOffset
+= paddedsize(UCOL_UNSAFECP_TABLE_SIZE
);
// Sanity check: bytes written must equal the size computed before allocation.
1464 if(tableOffset
!= toAllocate
) {
// NOTE(review): both arguments are uint32_t but the format uses %i - confirm
// the intended conversion specifier.
1466 fprintf(stderr
, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate
, tableOffset
);
1468 *status
= U_INTERNAL_PROGRAM_ERROR
;
1469 uprv_free(dataStart
);
1473 myData
->size
= tableOffset
;
1474 /* This should happen upon resurrection */
1475 /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
1476 /*uprv_mstrm_close(ms);*/
// Members of the context passed to _enumCategoryRangeClosureCategory.
// NOTE(review): the struct header and the remaining members (t and status are
// read by the callback) are not present in this listing.
// Collator the callback passes to ucol_strcoll.
1483 UCollator
*tempColl
;
// Element iterator the callback drives with ucol_setText / ucol_next.
1484 UCollationElements
* colEl
;
// Counter the callback increments; returned by uprv_uca_canonicalClosure.
1485 int32_t noOfClosures
;
// Callback for u_enumCharTypes (registered in uprv_uca_canonicalClosure).
//
// Ranges whose category is U_UNASSIGNED or U_PRIVATE_USE_CHAR are ignored.
// For every other code point in [start, limit) the decomposition is fetched
// with unorm_getDecomposition. When the temporary collator does not compare
// the code point equal to that decomposition, the closure counter in the
// context is incremented and an element el is filled in and handed to
// uprv_uca_addAnElement. Its CEs come either from iterating the decomposition
// with colEl, or - when t->prefixLookup has an entry for el - from that
// entry's mapCE.
//
// context - pointer to an enumStruct (t, tempColl, colEl, status, noOfClosures).
// start   - first code point of the range.
// limit   - end of the range, exclusive.
// type    - general category of the range.
//
// NOTE(review): the declarations of u32, comp, len, el and i, most of the el
// field setup and the return statement are not present in this listing.
1489 static UBool U_CALLCONV
1490 _enumCategoryRangeClosureCategory(const void *context
, UChar32 start
, UChar32 limit
, UCharCategory type
) {
1492 if (type
!= U_UNASSIGNED
&& type
!= U_PRIVATE_USE_CHAR
) { // if the range is assigned - we might omit more categories later
1493 UErrorCode
*status
= ((enumStruct
*)context
)->status
;
1494 tempUCATable
*t
= ((enumStruct
*)context
)->t
;
1495 UCollator
*tempColl
= ((enumStruct
*)context
)->tempColl
;
1496 UCollationElements
* colEl
= ((enumStruct
*)context
)->colEl
;
1498 UChar decomp
[256] = { 0 };
1499 int32_t noOfDec
= 0;
1505 for(u32
= start
; u32
< limit
; u32
++) {
1506 noOfDec
= unorm_getDecomposition(u32
, FALSE
, decomp
, 256);
1507 //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
1508 //|| (noOfDec == 1 && *decomp != (UChar)u32))
1509 if(noOfDec
> 0) // a positive count is used below as the length of decomp; NOTE(review): the earlier wording here said positive means "no decomposition" - confirm against unorm_getDecomposition
// comp receives the UTF-16 form of u32; len is advanced by the macro.
1512 UTF_APPEND_CHAR_UNSAFE(comp
, len
, u32
);
1513 if(ucol_strcoll(tempColl
, comp
, len
, decomp
, noOfDec
) != UCOL_EQUAL
) {
1515 fprintf(stderr
, "Closure: %08X -> ", u32
);
1517 for(i
= 0; i
<noOfDec
; i
++) {
1518 fprintf(stderr
, "%04X ", decomp
[i
]);
1520 fprintf(stderr
, "\n");
1522 ((enumStruct
*)context
)->noOfClosures
++;
// el initially points at the decomposition so it can serve as the lookup key.
1523 el
.cPoints
= decomp
;
1526 el
.prefix
= el
.prefixChars
;
1529 UCAElements
*prefix
=(UCAElements
*)uhash_get(t
->prefixLookup
, &el
);
1532 el
.prefix
= el
.prefixChars
;
1534 if(prefix
== NULL
) {
1536 ucol_setText(colEl
, decomp
, noOfDec
, status
);
// NOTE(review): el.noOfCEs indexes el.CEs with no bound check visible in this
// listing - confirm the array cannot be overrun by a long CE sequence.
1537 while((el
.CEs
[el
.noOfCEs
] = ucol_next(colEl
, status
)) != (uint32_t)UCOL_NULLORDER
) {
1542 el
.CEs
[0] = prefix
->mapCE
;
1543 // This character uses a prefix. We have to add it
1544 // to the unsafe table, as its decomposed form is already
1545 // in. In Japanese, this happens for \u309e & \u30fe
1546 // Since unsafeCPSet is static in ucol_elm, we are going
1547 // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
1549 uprv_uca_addAnElement(t
, &el
, status
);
// Adds canonical-closure elements to t and returns how many were found.
//
// When status indicates success on entry:
//   1. t is cloned, the clone is flattened with uprv_uca_assembleTable, a
//      temporary collator is initialized over that image (with t->UCA), and
//      the clone is closed.
//   2. On success the collator's rb, elements, validLocale and
//      requestedLocale are cleared and hasRealData / freeImageOnClose are set
//      to TRUE; on failure a non-null image is freed.
//   3. A collation element iterator is opened, the context is filled in and
//      u_enumCharTypes runs _enumCategoryRangeClosureCategory over all
//      character ranges.
//   4. The iterator and the temporary collator are closed.
//
// t      - temporary table that receives the closure elements.
// status - ICU error code, also handed to the callback through the context.
// Returns context.noOfClosures (initialized to 0 here).
//
// NOTE(review): the declaration of context, the context.t assignment and the
// closing braces are not present in this listing.
1558 U_CAPI
int32_t U_EXPORT2
1559 uprv_uca_canonicalClosure(tempUCATable
*t
, UErrorCode
*status
)
1562 context
.noOfClosures
= 0;
1563 if(U_SUCCESS(*status
)) {
1564 UCollator
*tempColl
= NULL
;
1565 tempUCATable
*tempTable
= uprv_uca_cloneTempTable(t
, status
);
1567 UCATableHeader
*tempData
= uprv_uca_assembleTable(tempTable
, status
);
1568 tempColl
= ucol_initCollator(tempData
, 0, t
->UCA
, status
);
1569 uprv_uca_closeTempTable(tempTable
);
1571 if(U_SUCCESS(*status
)) {
1572 tempColl
->rb
= NULL
;
1573 tempColl
->elements
= NULL
;
1574 tempColl
->validLocale
= NULL
;
1575 tempColl
->requestedLocale
= NULL
;
1576 tempColl
->hasRealData
= TRUE
;
// With this flag set, closing the collator is expected to release tempData.
// NOTE(review): inferred from the field name - confirm in ucol_close.
1577 tempColl
->freeImageOnClose
= TRUE
;
1578 } else if(tempData
!= 0) {
1579 uprv_free(tempData
);
// NOTE(review): after the failure branch above, confirm the statements below
// are not reached with a collator that failed to initialize.
1582 /* produce canonical closure */
1583 UCollationElements
* colEl
= ucol_openElements(tempColl
, NULL
, 0, status
);
1586 context
.tempColl
= tempColl
;
1587 context
.colEl
= colEl
;
1588 context
.status
= status
;
1589 u_enumCharTypes(_enumCategoryRangeClosureCategory
, &context
);
1591 ucol_closeElements(colEl
);
1592 ucol_close(tempColl
);
1594 return context
.noOfClosures
;
1597 #endif /* #if !UCONFIG_NO_COLLATION */