2 *******************************************************************************
4 * Copyright (C) 2001-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucaelems.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This program reads the Fractional UCA table and generates
17 * internal format for UCA table as well as inverse UCA table.
18 * It then writes binary files containing the data: ucadata.dat
22 * 03/02/2001 synwee added setMaxExpansion
23 * 03/07/2001 synwee merged UCA's maxexpansion and tailoring's
26 #include "unicode/utypes.h"
28 #if !UCONFIG_NO_COLLATION
30 #include "unicode/uchar.h"
31 #include "unicode/unistr.h"
32 #include "unicode/ucoleitr.h"
33 #include "unicode/normlzr.h"
34 #include "unicode/utf16.h"
35 #include "normalizer2impl.h"
39 #include "unicode/caniter.h"
45 static uint32_t uprv_uca_processContraction(CntTable
*contractions
, UCAElements
*element
, uint32_t existingCE
, UErrorCode
*status
);
48 static int32_t U_CALLCONV
49 prefixLookupHash(const UHashTok e
) {
50 UCAElements
*element
= (UCAElements
*)e
.pointer
;
54 uprv_memcpy(buf
, element
->cPoints
, element
->cSize
*sizeof(UChar
));
55 buf
[element
->cSize
] = 0;
56 //key.pointer = element->cPoints;
57 //element->cPoints[element->cSize] = 0;
58 return uhash_hashUChars(key
);
61 static int8_t U_CALLCONV
62 prefixLookupComp(const UHashTok e1
, const UHashTok e2
) {
63 UCAElements
*element1
= (UCAElements
*)e1
.pointer
;
64 UCAElements
*element2
= (UCAElements
*)e2
.pointer
;
69 uprv_memcpy(buf1
, element1
->cPoints
, element1
->cSize
*sizeof(UChar
));
70 buf1
[element1
->cSize
] = 0;
75 uprv_memcpy(buf2
, element2
->cPoints
, element2
->cSize
*sizeof(UChar
));
76 buf2
[element2
->cSize
] = 0;
78 return uhash_compareUChars(key1
, key2
);
82 static int32_t uprv_uca_addExpansion(ExpansionTable
*expansions
, uint32_t value
, UErrorCode
*status
) {
83 if(U_FAILURE(*status
)) {
86 if(expansions
->CEs
== NULL
) {
87 expansions
->CEs
= (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
*sizeof(uint32_t));
89 if (expansions
->CEs
== NULL
) {
90 *status
= U_MEMORY_ALLOCATION_ERROR
;
93 expansions
->size
= INIT_EXP_TABLE_SIZE
;
94 expansions
->position
= 0;
97 if(expansions
->position
== expansions
->size
) {
98 uint32_t *newData
= (uint32_t *)uprv_realloc(expansions
->CEs
, 2*expansions
->size
*sizeof(uint32_t));
101 fprintf(stderr
, "out of memory for expansions\n");
103 *status
= U_MEMORY_ALLOCATION_ERROR
;
106 expansions
->CEs
= newData
;
107 expansions
->size
*= 2;
110 expansions
->CEs
[expansions
->position
] = value
;
111 return(expansions
->position
++);
114 U_CAPI tempUCATable
* U_EXPORT2
115 uprv_uca_initTempTable(UCATableHeader
*image
, UColOptionSet
*opts
, const UCollator
*UCA
, UColCETags initTag
, UColCETags supplementaryInitTag
, UErrorCode
*status
) {
116 MaxJamoExpansionTable
*maxjet
;
117 MaxExpansionTable
*maxet
;
118 tempUCATable
*t
= (tempUCATable
*)uprv_malloc(sizeof(tempUCATable
));
121 *status
= U_MEMORY_ALLOCATION_ERROR
;
124 uprv_memset(t
, 0, sizeof(tempUCATable
));
126 maxet
= (MaxExpansionTable
*)uprv_malloc(sizeof(MaxExpansionTable
));
128 goto allocation_failure
;
130 uprv_memset(maxet
, 0, sizeof(MaxExpansionTable
));
131 t
->maxExpansions
= maxet
;
133 maxjet
= (MaxJamoExpansionTable
*)uprv_malloc(sizeof(MaxJamoExpansionTable
));
134 if (maxjet
== NULL
) {
135 goto allocation_failure
;
137 uprv_memset(maxjet
, 0, sizeof(MaxJamoExpansionTable
));
138 t
->maxJamoExpansions
= maxjet
;
144 t
->expansions
= (ExpansionTable
*)uprv_malloc(sizeof(ExpansionTable
));
146 if (t
->expansions
== NULL
) {
147 goto allocation_failure
;
149 uprv_memset(t
->expansions
, 0, sizeof(ExpansionTable
));
151 t
->mapping
= utrie_open(NULL
, NULL
, UCOL_ELM_TRIE_CAPACITY
,
152 UCOL_SPECIAL_FLAG
| (initTag
<<24),
153 UCOL_SPECIAL_FLAG
| (supplementaryInitTag
<< 24),
154 TRUE
); // Do your own mallocs for the structure, array and have linear Latin 1
155 if (U_FAILURE(*status
)) {
156 goto allocation_failure
;
158 t
->prefixLookup
= uhash_open(prefixLookupHash
, prefixLookupComp
, NULL
, status
);
159 if (U_FAILURE(*status
)) {
160 goto allocation_failure
;
162 uhash_setValueDeleter(t
->prefixLookup
, uprv_free
);
164 t
->contractions
= uprv_cnttab_open(t
->mapping
, status
);
165 if (U_FAILURE(*status
)) {
169 /* copy UCA's maxexpansion and merge as we go along */
171 /* adding an extra initial value for easier manipulation */
172 maxet
->size
= (int32_t)(UCA
->lastEndExpansionCE
- UCA
->endExpansionCE
) + 2;
173 maxet
->position
= maxet
->size
- 1;
174 maxet
->endExpansionCE
=
175 (uint32_t *)uprv_malloc(sizeof(uint32_t) * maxet
->size
);
177 if (maxet
->endExpansionCE
== NULL
) {
178 goto allocation_failure
;
180 maxet
->expansionCESize
=
181 (uint8_t *)uprv_malloc(sizeof(uint8_t) * maxet
->size
);
183 if (maxet
->expansionCESize
== NULL
) {
184 goto allocation_failure
;
186 /* initialized value */
187 *(maxet
->endExpansionCE
) = 0;
188 *(maxet
->expansionCESize
) = 0;
189 uprv_memcpy(maxet
->endExpansionCE
+ 1, UCA
->endExpansionCE
,
190 sizeof(uint32_t) * (maxet
->size
- 1));
191 uprv_memcpy(maxet
->expansionCESize
+ 1, UCA
->expansionCESize
,
192 sizeof(uint8_t) * (maxet
->size
- 1));
197 maxjet
->endExpansionCE
= NULL
;
200 maxjet
->position
= 0;
201 maxjet
->maxLSize
= 1;
202 maxjet
->maxVSize
= 1;
203 maxjet
->maxTSize
= 1;
205 t
->unsafeCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
207 if (t
->unsafeCP
== NULL
) {
208 goto allocation_failure
;
210 t
->contrEndCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
212 if (t
->contrEndCP
== NULL
) {
213 goto allocation_failure
;
215 uprv_memset(t
->unsafeCP
, 0, UCOL_UNSAFECP_TABLE_SIZE
);
216 uprv_memset(t
->contrEndCP
, 0, UCOL_UNSAFECP_TABLE_SIZE
);
221 *status
= U_MEMORY_ALLOCATION_ERROR
;
223 uprv_uca_closeTempTable(t
);
227 static tempUCATable
* U_EXPORT2
228 uprv_uca_cloneTempTable(tempUCATable
*t
, UErrorCode
*status
) {
229 if(U_FAILURE(*status
)) {
233 tempUCATable
*r
= (tempUCATable
*)uprv_malloc(sizeof(tempUCATable
));
236 *status
= U_MEMORY_ALLOCATION_ERROR
;
239 uprv_memset(r
, 0, sizeof(tempUCATable
));
242 if(t
->mapping
!= NULL
) {
243 /*r->mapping = ucmpe32_clone(t->mapping, status);*/
244 r
->mapping
= utrie_clone(NULL
, t
->mapping
, NULL
, 0);
247 // a hashing clone function would be very nice. We have none currently...
248 // However, we should be good, as closing should not produce any prefixed elements.
249 r
->prefixLookup
= NULL
; // prefixes are not used in closing
252 if(t
->expansions
!= NULL
) {
253 r
->expansions
= (ExpansionTable
*)uprv_malloc(sizeof(ExpansionTable
));
255 if (r
->expansions
== NULL
) {
256 *status
= U_MEMORY_ALLOCATION_ERROR
;
259 r
->expansions
->position
= t
->expansions
->position
;
260 r
->expansions
->size
= t
->expansions
->size
;
261 if(t
->expansions
->CEs
!= NULL
) {
262 r
->expansions
->CEs
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->expansions
->size
);
264 if (r
->expansions
->CEs
== NULL
) {
265 *status
= U_MEMORY_ALLOCATION_ERROR
;
268 uprv_memcpy(r
->expansions
->CEs
, t
->expansions
->CEs
, sizeof(uint32_t)*t
->expansions
->position
);
270 r
->expansions
->CEs
= NULL
;
274 if(t
->contractions
!= NULL
) {
275 r
->contractions
= uprv_cnttab_clone(t
->contractions
, status
);
276 // Check for cloning failure.
277 if (r
->contractions
== NULL
) {
278 *status
= U_MEMORY_ALLOCATION_ERROR
;
281 r
->contractions
->mapping
= r
->mapping
;
284 if(t
->maxExpansions
!= NULL
) {
285 r
->maxExpansions
= (MaxExpansionTable
*)uprv_malloc(sizeof(MaxExpansionTable
));
287 if (r
->maxExpansions
== NULL
) {
288 *status
= U_MEMORY_ALLOCATION_ERROR
;
291 r
->maxExpansions
->size
= t
->maxExpansions
->size
;
292 r
->maxExpansions
->position
= t
->maxExpansions
->position
;
293 if(t
->maxExpansions
->endExpansionCE
!= NULL
) {
294 r
->maxExpansions
->endExpansionCE
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->maxExpansions
->size
);
296 if (r
->maxExpansions
->endExpansionCE
== NULL
) {
297 *status
= U_MEMORY_ALLOCATION_ERROR
;
300 uprv_memset(r
->maxExpansions
->endExpansionCE
, 0xDB, sizeof(uint32_t)*t
->maxExpansions
->size
);
301 uprv_memcpy(r
->maxExpansions
->endExpansionCE
, t
->maxExpansions
->endExpansionCE
, t
->maxExpansions
->position
*sizeof(uint32_t));
303 r
->maxExpansions
->endExpansionCE
= NULL
;
305 if(t
->maxExpansions
->expansionCESize
!= NULL
) {
306 r
->maxExpansions
->expansionCESize
= (uint8_t *)uprv_malloc(sizeof(uint8_t)*t
->maxExpansions
->size
);
308 if (r
->maxExpansions
->expansionCESize
== NULL
) {
309 *status
= U_MEMORY_ALLOCATION_ERROR
;
312 uprv_memset(r
->maxExpansions
->expansionCESize
, 0xDB, sizeof(uint8_t)*t
->maxExpansions
->size
);
313 uprv_memcpy(r
->maxExpansions
->expansionCESize
, t
->maxExpansions
->expansionCESize
, t
->maxExpansions
->position
*sizeof(uint8_t));
315 r
->maxExpansions
->expansionCESize
= NULL
;
319 if(t
->maxJamoExpansions
!= NULL
) {
320 r
->maxJamoExpansions
= (MaxJamoExpansionTable
*)uprv_malloc(sizeof(MaxJamoExpansionTable
));
322 if (r
->maxJamoExpansions
== NULL
) {
323 *status
= U_MEMORY_ALLOCATION_ERROR
;
326 r
->maxJamoExpansions
->size
= t
->maxJamoExpansions
->size
;
327 r
->maxJamoExpansions
->position
= t
->maxJamoExpansions
->position
;
328 r
->maxJamoExpansions
->maxLSize
= t
->maxJamoExpansions
->maxLSize
;
329 r
->maxJamoExpansions
->maxVSize
= t
->maxJamoExpansions
->maxVSize
;
330 r
->maxJamoExpansions
->maxTSize
= t
->maxJamoExpansions
->maxTSize
;
331 if(t
->maxJamoExpansions
->size
!= 0) {
332 r
->maxJamoExpansions
->endExpansionCE
= (uint32_t *)uprv_malloc(sizeof(uint32_t)*t
->maxJamoExpansions
->size
);
334 if (r
->maxJamoExpansions
->endExpansionCE
== NULL
) {
335 *status
= U_MEMORY_ALLOCATION_ERROR
;
338 uprv_memcpy(r
->maxJamoExpansions
->endExpansionCE
, t
->maxJamoExpansions
->endExpansionCE
, t
->maxJamoExpansions
->position
*sizeof(uint32_t));
339 r
->maxJamoExpansions
->isV
= (UBool
*)uprv_malloc(sizeof(UBool
)*t
->maxJamoExpansions
->size
);
341 if (r
->maxJamoExpansions
->isV
== NULL
) {
342 *status
= U_MEMORY_ALLOCATION_ERROR
;
345 uprv_memcpy(r
->maxJamoExpansions
->isV
, t
->maxJamoExpansions
->isV
, t
->maxJamoExpansions
->position
*sizeof(UBool
));
347 r
->maxJamoExpansions
->endExpansionCE
= NULL
;
348 r
->maxJamoExpansions
->isV
= NULL
;
352 if(t
->unsafeCP
!= NULL
) {
353 r
->unsafeCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
355 if (r
->unsafeCP
== NULL
) {
356 *status
= U_MEMORY_ALLOCATION_ERROR
;
359 uprv_memcpy(r
->unsafeCP
, t
->unsafeCP
, UCOL_UNSAFECP_TABLE_SIZE
);
362 if(t
->contrEndCP
!= NULL
) {
363 r
->contrEndCP
= (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE
);
365 if (r
->contrEndCP
== NULL
) {
366 *status
= U_MEMORY_ALLOCATION_ERROR
;
369 uprv_memcpy(r
->contrEndCP
, t
->contrEndCP
, UCOL_UNSAFECP_TABLE_SIZE
);
374 r
->options
= t
->options
;
378 uprv_uca_closeTempTable(t
);
383 U_CAPI
void U_EXPORT2
384 uprv_uca_closeTempTable(tempUCATable
*t
) {
386 if (t
->expansions
!= NULL
) {
387 uprv_free(t
->expansions
->CEs
);
388 uprv_free(t
->expansions
);
390 if(t
->contractions
!= NULL
) {
391 uprv_cnttab_close(t
->contractions
);
393 if (t
->mapping
!= NULL
) {
394 utrie_close(t
->mapping
);
397 if(t
->prefixLookup
!= NULL
) {
398 uhash_close(t
->prefixLookup
);
401 if (t
->maxExpansions
!= NULL
) {
402 uprv_free(t
->maxExpansions
->endExpansionCE
);
403 uprv_free(t
->maxExpansions
->expansionCESize
);
404 uprv_free(t
->maxExpansions
);
407 if (t
->maxJamoExpansions
->size
> 0) {
408 uprv_free(t
->maxJamoExpansions
->endExpansionCE
);
409 uprv_free(t
->maxJamoExpansions
->isV
);
411 uprv_free(t
->maxJamoExpansions
);
413 uprv_free(t
->unsafeCP
);
414 uprv_free(t
->contrEndCP
);
416 if (t
->cmLookup
!= NULL
) {
417 uprv_free(t
->cmLookup
->cPoints
);
418 uprv_free(t
->cmLookup
);
426 * Looks for the maximum length of all expansion sequences ending with the same
427 * collation element. The size required for maxexpansion and maxsize is
428 * returned if the arrays are too small.
429 * @param endexpansion the last expansion collation element to be added
430 * @param expansionsize size of the expansion
431 * @param maxexpansion data structure to store the maximum expansion data.
432 * @param status error status
433 * @returns size of the maxexpansion and maxsize used.
435 static int uprv_uca_setMaxExpansion(uint32_t endexpansion
,
436 uint8_t expansionsize
,
437 MaxExpansionTable
*maxexpansion
,
440 if (maxexpansion
->size
== 0) {
441 /* we'll always make the first element 0, for easier manipulation */
442 maxexpansion
->endExpansionCE
=
443 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(int32_t));
445 if (maxexpansion
->endExpansionCE
== NULL
) {
446 *status
= U_MEMORY_ALLOCATION_ERROR
;
449 *(maxexpansion
->endExpansionCE
) = 0;
450 maxexpansion
->expansionCESize
=
451 (uint8_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(uint8_t));
453 if (maxexpansion
->expansionCESize
== NULL
) {
454 *status
= U_MEMORY_ALLOCATION_ERROR
;
457 *(maxexpansion
->expansionCESize
) = 0;
458 maxexpansion
->size
= INIT_EXP_TABLE_SIZE
;
459 maxexpansion
->position
= 0;
462 if (maxexpansion
->position
+ 1 == maxexpansion
->size
) {
463 uint32_t *neweece
= (uint32_t *)uprv_realloc(maxexpansion
->endExpansionCE
,
464 2 * maxexpansion
->size
* sizeof(uint32_t));
465 if (neweece
== NULL
) {
466 *status
= U_MEMORY_ALLOCATION_ERROR
;
469 maxexpansion
->endExpansionCE
= neweece
;
471 uint8_t *neweces
= (uint8_t *)uprv_realloc(maxexpansion
->expansionCESize
,
472 2 * maxexpansion
->size
* sizeof(uint8_t));
473 if (neweces
== NULL
) {
474 *status
= U_MEMORY_ALLOCATION_ERROR
;
477 maxexpansion
->expansionCESize
= neweces
;
478 maxexpansion
->size
*= 2;
481 uint32_t *pendexpansionce
= maxexpansion
->endExpansionCE
;
482 uint8_t *pexpansionsize
= maxexpansion
->expansionCESize
;
483 int pos
= maxexpansion
->position
;
485 uint32_t *start
= pendexpansionce
;
486 uint32_t *limit
= pendexpansionce
+ pos
;
488 /* using binary search to determine if last expansion element is
489 already in the array */
492 while (start
< limit
- 1) {
493 mid
= start
+ ((limit
- start
) >> 1);
494 if (endexpansion
<= *mid
) {
502 if (*start
== endexpansion
) {
503 result
= (int)(start
- pendexpansionce
);
505 else if (*limit
== endexpansion
) {
506 result
= (int)(limit
- pendexpansionce
);
510 /* found the ce in expansion, we'll just modify the size if it is
512 uint8_t *currentsize
= pexpansionsize
+ result
;
513 if (*currentsize
< expansionsize
) {
514 *currentsize
= expansionsize
;
518 /* we'll need to squeeze the value into the array.
519 initial implementation. */
520 /* shifting the subarray down by 1 */
521 int shiftsize
= (int)((pendexpansionce
+ pos
) - start
);
522 uint32_t *shiftpos
= start
+ 1;
523 uint8_t *sizeshiftpos
= pexpansionsize
+ (shiftpos
- pendexpansionce
);
525 /* okay need to rearrange the array into sorted order */
526 if (shiftsize
== 0 /*|| *(pendexpansionce + pos) < endexpansion*/) { /* the commented part is actually both redundant and dangerous */
527 *(pendexpansionce
+ pos
+ 1) = endexpansion
;
528 *(pexpansionsize
+ pos
+ 1) = expansionsize
;
531 uprv_memmove(shiftpos
+ 1, shiftpos
, shiftsize
* sizeof(int32_t));
532 uprv_memmove(sizeshiftpos
+ 1, sizeshiftpos
,
533 shiftsize
* sizeof(uint8_t));
534 *shiftpos
= endexpansion
;
535 *sizeshiftpos
= expansionsize
;
537 maxexpansion
->position
++;
542 for (temp
= 0; temp
< maxexpansion
->position
; temp
++) {
543 if (pendexpansionce
[temp
] >= pendexpansionce
[temp
+ 1]) {
544 fprintf(stderr
, "expansions %d\n", temp
);
546 if (pendexpansionce
[temp
] == endexpansion
) {
548 if (pexpansionsize
[temp
] < expansionsize
) {
549 fprintf(stderr
, "expansions size %d\n", temp
);
553 if (pendexpansionce
[temp
] == endexpansion
) {
555 if (pexpansionsize
[temp
] < expansionsize
) {
556 fprintf(stderr
, "expansions size %d\n", temp
);
560 fprintf(stderr
, "expansion not found %d\n", temp
);
564 return maxexpansion
->position
;
568 * Sets the maximum length of all jamo expansion sequences ending with the same
569 * collation element. The size required for maxexpansion and maxsize is
570 * returned if the arrays are too small.
571 * @param ch the jamo codepoint
572 * @param endexpansion the last expansion collation element to be added
573 * @param expansionsize size of the expansion
574 * @param maxexpansion data structure to store the maximum expansion data.
575 * @param status error status
576 * @returns size of the maxexpansion and maxsize used.
578 static int uprv_uca_setMaxJamoExpansion(UChar ch
,
579 uint32_t endexpansion
,
580 uint8_t expansionsize
,
581 MaxJamoExpansionTable
*maxexpansion
,
585 if (((uint32_t)ch
- 0x1100) <= (0x1112 - 0x1100)) {
586 /* determines L for Jamo, doesn't need to store this since it is never
587 at the end of an expansion */
588 if (maxexpansion
->maxLSize
< expansionsize
) {
589 maxexpansion
->maxLSize
= expansionsize
;
591 return maxexpansion
->position
;
594 if (((uint32_t)ch
- 0x1161) <= (0x1175 - 0x1161)) {
595 /* determines V for Jamo */
596 if (maxexpansion
->maxVSize
< expansionsize
) {
597 maxexpansion
->maxVSize
= expansionsize
;
601 if (((uint32_t)ch
- 0x11A8) <= (0x11C2 - 0x11A8)) {
603 /* determines T for Jamo */
604 if (maxexpansion
->maxTSize
< expansionsize
) {
605 maxexpansion
->maxTSize
= expansionsize
;
609 if (maxexpansion
->size
== 0) {
610 /* we'll always make the first element 0, for easier manipulation */
611 maxexpansion
->endExpansionCE
=
612 (uint32_t *)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(uint32_t));
614 if (maxexpansion
->endExpansionCE
== NULL
) {
615 *status
= U_MEMORY_ALLOCATION_ERROR
;
618 *(maxexpansion
->endExpansionCE
) = 0;
620 (UBool
*)uprv_malloc(INIT_EXP_TABLE_SIZE
* sizeof(UBool
));
622 if (maxexpansion
->isV
== NULL
) {
623 *status
= U_MEMORY_ALLOCATION_ERROR
;
624 uprv_free(maxexpansion
->endExpansionCE
);
625 maxexpansion
->endExpansionCE
= NULL
;
628 *(maxexpansion
->isV
) = 0;
629 maxexpansion
->size
= INIT_EXP_TABLE_SIZE
;
630 maxexpansion
->position
= 0;
633 if (maxexpansion
->position
+ 1 == maxexpansion
->size
) {
634 maxexpansion
->size
*= 2;
635 maxexpansion
->endExpansionCE
= (uint32_t *)uprv_realloc(maxexpansion
->endExpansionCE
,
636 maxexpansion
->size
* sizeof(uint32_t));
637 if (maxexpansion
->endExpansionCE
== NULL
) {
639 fprintf(stderr
, "out of memory for maxExpansions\n");
641 *status
= U_MEMORY_ALLOCATION_ERROR
;
644 maxexpansion
->isV
= (UBool
*)uprv_realloc(maxexpansion
->isV
,
645 maxexpansion
->size
* sizeof(UBool
));
646 if (maxexpansion
->isV
== NULL
) {
648 fprintf(stderr
, "out of memory for maxExpansions\n");
650 *status
= U_MEMORY_ALLOCATION_ERROR
;
651 uprv_free(maxexpansion
->endExpansionCE
);
652 maxexpansion
->endExpansionCE
= NULL
;
657 uint32_t *pendexpansionce
= maxexpansion
->endExpansionCE
;
658 int pos
= maxexpansion
->position
;
662 if (*(pendexpansionce
+ pos
) == endexpansion
) {
663 return maxexpansion
->position
;
667 *(pendexpansionce
+ maxexpansion
->position
) = endexpansion
;
668 *(maxexpansion
->isV
+ maxexpansion
->position
) = isV
;
669 maxexpansion
->position
++;
671 return maxexpansion
->position
;
675 static void ContrEndCPSet(uint8_t *table
, UChar c
) {
680 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
681 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
683 htByte
= &table
[hash
>>3];
684 *htByte
|= (1 << (hash
& 7));
688 static void unsafeCPSet(uint8_t *table
, UChar c
) {
693 if (hash
>= UCOL_UNSAFECP_TABLE_SIZE
*8) {
694 if (hash
>= 0xd800 && hash
<= 0xf8ff) {
695 /* Part of a surrogate, or in private use area. */
696 /* These don't go in the table */
699 hash
= (hash
& UCOL_UNSAFECP_TABLE_MASK
) + 256;
701 htByte
= &table
[hash
>>3];
702 *htByte
|= (1 << (hash
& 7));
706 uprv_uca_createCMTable(tempUCATable
*t
, int32_t noOfCM
, UErrorCode
*status
) {
707 t
->cmLookup
= (CombinClassTable
*)uprv_malloc(sizeof(CombinClassTable
));
708 if (t
->cmLookup
==NULL
) {
709 *status
= U_MEMORY_ALLOCATION_ERROR
;
712 t
->cmLookup
->cPoints
=(UChar
*)uprv_malloc(noOfCM
*sizeof(UChar
));
713 if (t
->cmLookup
->cPoints
==NULL
) {
714 uprv_free(t
->cmLookup
);
716 *status
= U_MEMORY_ALLOCATION_ERROR
;
720 t
->cmLookup
->size
=noOfCM
;
721 uprv_memset(t
->cmLookup
->index
, 0, sizeof(t
->cmLookup
->index
));
727 uprv_uca_copyCMTable(tempUCATable
*t
, UChar
*cm
, uint16_t *index
) {
730 for (int32_t i
=0; i
<256; ++i
) {
732 // cPoints is ordered by combining class value.
733 uprv_memcpy(t
->cmLookup
->cPoints
+count
, cm
+(i
<<8), index
[i
]*sizeof(UChar
));
736 t
->cmLookup
->index
[i
]=count
;
741 /* 1. to the UnsafeCP hash table, add all chars with combining class != 0 */
742 /* 2. build combining marks table for all chars with combining class != 0 */
743 static void uprv_uca_unsafeCPAddCCNZ(tempUCATable
*t
, UErrorCode
*status
) {
746 uint16_t fcd
; // Hi byte is lead combining class. lo byte is trailing combing class.
747 UBool buildCMTable
= (t
->cmLookup
==NULL
); // flag for building combining class table
751 const Normalizer2Impl
*nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
752 if (U_FAILURE(*status
)) {
758 cm
= (UChar
*)uprv_malloc(sizeof(UChar
)*UCOL_MAX_CM_TAB
);
760 *status
= U_MEMORY_ALLOCATION_ERROR
;
764 uprv_memset(index
, 0, sizeof(index
));
766 for (c
=0; c
<0xffff; c
++) {
767 if (U16_IS_LEAD(c
)) {
769 if (nfcImpl
->singleLeadMightHaveNonZeroFCD16(c
)) {
770 UChar32 supp
= U16_GET_SUPPLEMENTARY(c
, 0xdc00);
771 UChar32 suppLimit
= supp
+ 0x400;
772 while (supp
< suppLimit
) {
773 fcd
|= nfcImpl
->getFCD16FromNormData(supp
++);
777 fcd
= nfcImpl
->getFCD16(c
);
779 if (fcd
>= 0x100 || // if the leading combining class(c) > 0 ||
780 (U16_IS_LEAD(c
) && fcd
!= 0)) {// c is a leading surrogate with some FCD data
782 uint32_t cClass
= fcd
& 0xff;
783 //uint32_t temp=(cClass<<8)+index[cClass];
784 cm
[(cClass
<<8)+index
[cClass
]] = c
; //
788 unsafeCPSet(t
->unsafeCP
, c
);
794 uprv_uca_createCMTable(t
, count
, status
);
795 if(U_FAILURE(*status
)) {
801 uprv_uca_copyCMTable(t
, cm
, index
);
804 if(t
->prefixLookup
!= NULL
) {
806 const UHashElement
*e
= NULL
;
807 UCAElements
*element
= NULL
;
809 while((e
= uhash_nextElement(t
->prefixLookup
, &i
)) != NULL
) {
810 element
= (UCAElements
*)e
->value
.pointer
;
811 // codepoints here are in the NFD form. We need to add the
812 // first code point of the NFC form to unsafe, because
813 // strcoll needs to backup over them.
814 unorm_normalize(element
->cPoints
, element
->cSize
, UNORM_NFC
, 0,
815 NFCbuf
, 256, status
);
816 unsafeCPSet(t
->unsafeCP
, NFCbuf
[0]);
825 static uint32_t uprv_uca_addPrefix(tempUCATable
*t
, uint32_t CE
,
826 UCAElements
*element
, UErrorCode
*status
)
828 // currently the longest prefix we're supporting in Japanese is two characters
829 // long. Although this table could quite easily mimic complete contraction stuff
830 // there is no good reason to make a general solution, as it would require some
831 // error prone messing.
832 CntTable
*contractions
= t
->contractions
;
835 UChar
*oldCP
= element
->cPoints
;
836 uint32_t oldCPSize
= element
->cSize
;
839 contractions
->currentTag
= SPEC_PROC_TAG
;
841 // here, we will normalize & add prefix to the table.
844 for(j
=0; j
<element
->cSize
; j
++) {
845 fprintf(stdout
, "CP: %04X ", element
->cPoints
[j
]);
847 fprintf(stdout
, "El: %08X Pref: ", CE
);
848 for(j
=0; j
<element
->prefixSize
; j
++) {
849 fprintf(stdout
, "%04X ", element
->prefix
[j
]);
851 fprintf(stdout
, "%08X ", element
->mapCE
);
854 for (j
= 1; j
<element
->prefixSize
; j
++) { /* First add NFD prefix chars to unsafe CP hash table */
855 // Unless it is a trail surrogate, which is handled algorithmically and
856 // shouldn't take up space in the table.
857 if(!(U16_IS_TRAIL(element
->prefix
[j
]))) {
858 unsafeCPSet(t
->unsafeCP
, element
->prefix
[j
]);
862 UChar tempPrefix
= 0;
864 for(j
= 0; j
< /*nfcSize*/element
->prefixSize
/2; j
++) { // prefixes are going to be looked up backwards
865 // therefore, we will promptly reverse the prefix buffer...
866 tempPrefix
= *(/*nfcBuffer*/element
->prefix
+element
->prefixSize
-j
-1);
867 *(/*nfcBuffer*/element
->prefix
+element
->prefixSize
-j
-1) = element
->prefix
[j
];
868 element
->prefix
[j
] = tempPrefix
;
872 fprintf(stdout
, "Reversed: ");
873 for(j
=0; j
<element
->prefixSize
; j
++) {
874 fprintf(stdout
, "%04X ", element
->prefix
[j
]);
876 fprintf(stdout
, "%08X\n", element
->mapCE
);
879 // the first codepoint is also unsafe, as it forms a 'contraction' with the prefix
880 if(!(U16_IS_TRAIL(element
->cPoints
[0]))) {
881 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[0]);
884 // Maybe we need this... To handle prefixes completely in the forward direction...
885 //if(element->cSize == 1) {
886 // if(!(U16_IS_TRAIL(element->cPoints[0]))) {
887 // ContrEndCPSet(t->contrEndCP, element->cPoints[0]);
891 element
->cPoints
= element
->prefix
;
892 element
->cSize
= element
->prefixSize
;
894 // Add the last char of the contraction to the contraction-end hash table.
895 // unless it is a trail surrogate, which is handled algorithmically and
896 // shouldn't be in the table
897 if(!(U16_IS_TRAIL(element
->cPoints
[element
->cSize
-1]))) {
898 ContrEndCPSet(t
->contrEndCP
, element
->cPoints
[element
->cSize
-1]);
901 // First we need to check if contractions starts with a surrogate
902 U16_NEXT(element
->cPoints
, cpsize
, element
->cSize
, cp
);
904 // If there are any Jamos in the contraction, we should turn on special
905 // processing for Jamos
906 if(UCOL_ISJAMO(element
->prefix
[0])) {
907 t
->image
->jamoSpecial
= TRUE
;
909 /* then we need to deal with it */
910 /* we could already have something in table - or we might not */
913 /* if it wasn't contraction, we wouldn't end up here*/
914 int32_t firstContractionOffset
= 0;
915 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, CE
, status
);
916 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
917 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->prefix
, newCE
, status
);
918 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, CE
, status
);
919 CE
= constructContractCE(SPEC_PROC_TAG
, firstContractionOffset
);
920 } else { /* we are adding to existing contraction */
921 /* there were already some elements in the table, so we need to add a new contraction */
922 /* Two things can happen here: either the codepoint is already in the table, or it is not */
923 int32_t position
= uprv_cnttab_findCP(contractions
, CE
, *element
->prefix
, status
);
924 if(position
> 0) { /* if it is we just continue down the chain */
925 uint32_t eCE
= uprv_cnttab_getCE(contractions
, CE
, position
, status
);
926 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
927 uprv_cnttab_setContraction(contractions
, CE
, position
, *(element
->prefix
), newCE
, status
);
928 } else { /* if it isn't, we will have to create a new sequence */
929 uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
930 uprv_cnttab_insertContraction(contractions
, CE
, *(element
->prefix
), element
->mapCE
, status
);
934 element
->cPoints
= oldCP
;
935 element
->cSize
= oldCPSize
;
940 // Note regarding surrogate handling: We are interested only in the single
941 // or leading surrogates in a contraction. If a surrogate is somewhere else
942 // in the contraction, it is going to be handled as a pair of code units,
943 // as it doesn't affect the performance AND handling surrogates specially
944 // would complicate code way too much.
945 static uint32_t uprv_uca_addContraction(tempUCATable
*t
, uint32_t CE
,
946 UCAElements
*element
, UErrorCode
*status
)
948 CntTable
*contractions
= t
->contractions
;
952 contractions
->currentTag
= CONTRACTION_TAG
;
954 // First we need to check if contractions starts with a surrogate
955 U16_NEXT(element
->cPoints
, cpsize
, element
->cSize
, cp
);
957 if(cpsize
<element
->cSize
) { // This is a real contraction, if there are other characters after the first
959 for (j
=1; j
<element
->cSize
; j
++) { /* First add contraction chars to unsafe CP hash table */
960 // Unless it is a trail surrogate, which is handled algorithmically and
961 // shouldn't take up space in the table.
962 if(!(U16_IS_TRAIL(element
->cPoints
[j
]))) {
963 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[j
]);
966 // Add the last char of the contraction to the contraction-end hash table.
967 // unless it is a trail surrogate, which is handled algorithmically and
968 // shouldn't be in the table
969 if(!(U16_IS_TRAIL(element
->cPoints
[element
->cSize
-1]))) {
970 ContrEndCPSet(t
->contrEndCP
, element
->cPoints
[element
->cSize
-1]);
973 // If there are any Jamos in the contraction, we should turn on special
974 // processing for Jamos
975 if(UCOL_ISJAMO(element
->cPoints
[0])) {
976 t
->image
->jamoSpecial
= TRUE
;
978 /* then we need to deal with it */
979 /* we could already have something in table - or we might not */
980 element
->cPoints
+=cpsize
;
981 element
->cSize
-=cpsize
;
982 if(!isContraction(CE
)) {
983 /* if it wasn't contraction, we wouldn't end up here*/
984 int32_t firstContractionOffset
= 0;
985 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, CE
, status
);
986 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
987 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->cPoints
, newCE
, status
);
988 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, CE
, status
);
989 CE
= constructContractCE(CONTRACTION_TAG
, firstContractionOffset
);
990 } else { /* we are adding to existing contraction */
991 /* there were already some elements in the table, so we need to add a new contraction */
992 /* Two things can happen here: either the codepoint is already in the table, or it is not */
993 int32_t position
= uprv_cnttab_findCP(contractions
, CE
, *element
->cPoints
, status
);
994 if(position
> 0) { /* if it is we just continue down the chain */
995 uint32_t eCE
= uprv_cnttab_getCE(contractions
, CE
, position
, status
);
996 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
997 uprv_cnttab_setContraction(contractions
, CE
, position
, *(element
->cPoints
), newCE
, status
);
998 } else { /* if it isn't, we will have to create a new sequence */
999 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
1000 uprv_cnttab_insertContraction(contractions
, CE
, *(element
->cPoints
), newCE
, status
);
1003 element
->cPoints
-=cpsize
;
1004 element
->cSize
+=cpsize
;
1005 /*ucmpe32_set(t->mapping, cp, CE);*/
1006 utrie_set32(t
->mapping
, cp
, CE
);
1007 } else if(!isContraction(CE
)) { /* this is just a surrogate, and there is no contraction */
1008 /*ucmpe32_set(t->mapping, cp, element->mapCE);*/
1009 utrie_set32(t
->mapping
, cp
, element
->mapCE
);
1010 } else { /* fill out the first stage of the contraction with the surrogate CE */
1011 uprv_cnttab_changeContraction(contractions
, CE
, 0, element
->mapCE
, status
);
1012 uprv_cnttab_changeContraction(contractions
, CE
, 0xFFFF, element
->mapCE
, status
);
1018 static uint32_t uprv_uca_processContraction(CntTable
*contractions
, UCAElements
*element
, uint32_t existingCE
, UErrorCode
*status
) {
1019 int32_t firstContractionOffset
= 0;
1020 // uint32_t contractionElement = UCOL_NOT_FOUND;
1022 if(U_FAILURE(*status
)) {
1023 return UCOL_NOT_FOUND
;
1026 /* end of recursion */
1027 if(element
->cSize
== 1) {
1028 if(isCntTableElement(existingCE
) && ((UColCETags
)getCETag(existingCE
) == contractions
->currentTag
)) {
1029 uprv_cnttab_changeContraction(contractions
, existingCE
, 0, element
->mapCE
, status
);
1030 uprv_cnttab_changeContraction(contractions
, existingCE
, 0xFFFF, element
->mapCE
, status
);
1033 return element
->mapCE
; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
1037 /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */
1038 /* for both backward and forward cycles */
1040 /* we encountered either an empty space or a non-contraction element */
1041 /* this means we are constructing a new contraction sequence */
1044 if(!isCntTableElement(existingCE
)) {
1045 /* if it wasn't contraction, we wouldn't end up here*/
1046 firstContractionOffset
= uprv_cnttab_addContraction(contractions
, UPRV_CNTTAB_NEWELEMENT
, 0, existingCE
, status
);
1047 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
1048 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, *element
->cPoints
, newCE
, status
);
1049 uprv_cnttab_addContraction(contractions
, firstContractionOffset
, 0xFFFF, existingCE
, status
);
1050 existingCE
= constructContractCE(contractions
->currentTag
, firstContractionOffset
);
1051 } else { /* we are adding to existing contraction */
1052 /* there were already some elements in the table, so we need to add a new contraction */
1053 /* Two things can happen here: either the codepoint is already in the table, or it is not */
1054 int32_t position
= uprv_cnttab_findCP(contractions
, existingCE
, *element
->cPoints
, status
);
1055 if(position
> 0) { /* if it is we just continue down the chain */
1056 uint32_t eCE
= uprv_cnttab_getCE(contractions
, existingCE
, position
, status
);
1057 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, eCE
, status
);
1058 uprv_cnttab_setContraction(contractions
, existingCE
, position
, *(element
->cPoints
), newCE
, status
);
1059 } else { /* if it isn't, we will have to create a new sequence */
1060 uint32_t newCE
= uprv_uca_processContraction(contractions
, element
, UCOL_NOT_FOUND
, status
);
1061 uprv_cnttab_insertContraction(contractions
, existingCE
, *(element
->cPoints
), newCE
, status
);
1069 static uint32_t uprv_uca_finalizeAddition(tempUCATable
*t
, UCAElements
*element
, UErrorCode
*status
) {
1070 uint32_t CE
= UCOL_NOT_FOUND
;
1071 // This should add a completely ignorable element to the
1072 // unsafe table, so that backward iteration will skip
1073 // over it when treating contractions.
1075 if(element
->mapCE
== 0) {
1076 for(i
= 0; i
< element
->cSize
; i
++) {
1077 if(!U16_IS_TRAIL(element
->cPoints
[i
])) {
1078 unsafeCPSet(t
->unsafeCP
, element
->cPoints
[i
]);
1082 if(element
->cSize
> 1) { /* we're adding a contraction */
1086 U16_NEXT(element
->cPoints
, i
, element
->cSize
, cp
);
1087 /*CE = ucmpe32_get(t->mapping, cp);*/
1088 CE
= utrie_get32(t
->mapping
, cp
, NULL
);
1090 CE
= uprv_uca_addContraction(t
, CE
, element
, status
);
1091 } else { /* easy case, */
1092 /*CE = ucmpe32_get(t->mapping, element->cPoints[0]);*/
1093 CE
= utrie_get32(t
->mapping
, element
->cPoints
[0], NULL
);
1095 if( CE
!= UCOL_NOT_FOUND
) {
1096 if(isCntTableElement(CE
) /*isContraction(CE)*/) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
1097 if(!isPrefix(element
->mapCE
)) { // we cannot reenter prefix elements - as we are going to create a dead loop
1098 // Only expansions and regular CEs can go here... Contractions will never happen in this place
1099 uprv_cnttab_setContraction(t
->contractions
, CE
, 0, 0, element
->mapCE
, status
);
1100 /* This loop has to change the CE at the end of contraction REDO!*/
1101 uprv_cnttab_changeLastCE(t
->contractions
, CE
, element
->mapCE
, status
);
1104 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
1105 utrie_set32(t
->mapping
, element
->cPoints
[0], element
->mapCE
);
1106 if ((element
->prefixSize
!=0) && (!isSpecial(CE
) || (getCETag(CE
)!=IMPLICIT_TAG
))) {
1107 UCAElements
*origElem
= (UCAElements
*)uprv_malloc(sizeof(UCAElements
));
1109 if (origElem
== NULL
) {
1110 *status
= U_MEMORY_ALLOCATION_ERROR
;
1113 /* copy the original UCA value */
1114 origElem
->prefixSize
= 0;
1115 origElem
->prefix
= NULL
;
1116 origElem
->cPoints
= origElem
->uchars
;
1117 origElem
->cPoints
[0] = element
->cPoints
[0];
1118 origElem
->cSize
= 1;
1119 origElem
->CEs
[0]=CE
;
1121 origElem
->noOfCEs
=1;
1122 uprv_uca_finalizeAddition(t
, origElem
, status
);
1123 uprv_free(origElem
);
1126 fprintf(stderr
, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE
, element
->cPoints
[0], element
->CEs
[0]);
1127 //*status = U_ILLEGAL_ARGUMENT_ERROR;
1131 /*ucmpe32_set(t->mapping, element->cPoints[0], element->mapCE);*/
1132 utrie_set32(t
->mapping
, element
->cPoints
[0], element
->mapCE
);
1138 /* This adds a read element, while testing for existence */
1139 U_CAPI
uint32_t U_EXPORT2
1140 uprv_uca_addAnElement(tempUCATable
*t
, UCAElements
*element
, UErrorCode
*status
) {
1143 ExpansionTable
*expansions
= t
->expansions
;
1146 uint32_t expansion
= 0;
1149 if(U_FAILURE(*status
)) {
1153 element
->mapCE
= 0; // clear mapCE so that we can catch expansions
1155 if(element
->noOfCEs
== 1) {
1156 element
->mapCE
= element
->CEs
[0];
1158 /* ICU 2.1 long primaries */
1159 /* unfortunately, it looks like we have to look for a long primary here */
1160 /* since in canonical closure we are going to hit some long primaries from */
1161 /* the first phase, and they will come back as continuations/expansions */
1162 /* destroying the effect of the previous opitimization */
1163 /* A long primary is a three byte primary with starting secondaries and tertiaries */
1164 /* It can appear in long runs of only primary differences (like east Asian tailorings) */
1165 /* also, it should not be an expansion, as expansions would break with this */
1166 // This part came in from ucol_bld.cpp
1167 //if(tok->expansion == 0
1168 //&& noOfBytes[0] == 3 && noOfBytes[1] == 1 && noOfBytes[2] == 1
1169 //&& CEparts[1] == (UCOL_BYTE_COMMON << 24) && CEparts[2] == (UCOL_BYTE_COMMON << 24)) {
1170 /* we will construct a special CE that will go unchanged to the table */
1171 if(element
->noOfCEs
== 2 // a two CE expansion
1172 && isContinuation(element
->CEs
[1]) // which is a continuation
1173 && (element
->CEs
[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER
))) == 0 // that has only primaries in continuation,
1174 && (((element
->CEs
[0]>>8) & 0xFF) == UCOL_BYTE_COMMON
) // a common secondary
1175 && ((element
->CEs
[0] & 0xFF) == UCOL_BYTE_COMMON
) // and a common tertiary
1179 fprintf(stdout
, "Long primary %04X\n", element
->cPoints
[0]);
1181 element
->mapCE
= UCOL_SPECIAL_FLAG
| (LONG_PRIMARY_TAG
<<24) // a long primary special
1182 | ((element
->CEs
[0]>>8) & 0xFFFF00) // first and second byte of primary
1183 | ((element
->CEs
[1]>>24) & 0xFF); // third byte of primary
1186 expansion
= (uint32_t)(UCOL_SPECIAL_FLAG
| (EXPANSION_TAG
<<UCOL_TAG_SHIFT
)
1187 | (((uprv_uca_addExpansion(expansions
, element
->CEs
[0], status
)+(headersize
>>2))<<4)
1190 for(i
= 1; i
<element
->noOfCEs
; i
++) {
1191 uprv_uca_addExpansion(expansions
, element
->CEs
[i
], status
);
1193 if(element
->noOfCEs
<= 0xF) {
1194 expansion
|= element
->noOfCEs
;
1196 uprv_uca_addExpansion(expansions
, 0, status
);
1198 element
->mapCE
= expansion
;
1199 uprv_uca_setMaxExpansion(element
->CEs
[element
->noOfCEs
- 1],
1200 (uint8_t)element
->noOfCEs
,
1203 if(UCOL_ISJAMO(element
->cPoints
[0])) {
1204 t
->image
->jamoSpecial
= TRUE
;
1205 uprv_uca_setMaxJamoExpansion(element
->cPoints
[0],
1206 element
->CEs
[element
->noOfCEs
- 1],
1207 (uint8_t)element
->noOfCEs
,
1208 t
->maxJamoExpansions
,
1211 if (U_FAILURE(*status
)) {
1217 // We treat digits differently - they are "uber special" and should be
1218 // processed differently if numeric collation is on.
1219 UChar32 uniChar
= 0;
1220 //printElement(element);
1221 if ((element
->cSize
== 2) && U16_IS_LEAD(element
->cPoints
[0])){
1222 uniChar
= U16_GET_SUPPLEMENTARY(element
->cPoints
[0], element
->cPoints
[1]);
1223 } else if (element
->cSize
== 1){
1224 uniChar
= element
->cPoints
[0];
1227 // Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
1228 // one element to the expansion buffer. When we encounter a digit and we don't
1229 // do numeric collation, we will just pick the CE we have and break out of case
1230 // (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
1231 // a special, further processing will occur. If it's a simple CE, we'll return due
1232 // to how the loop is constructed.
1233 if (uniChar
!= 0 && u_isdigit(uniChar
)){
1234 expansion
= (uint32_t)(UCOL_SPECIAL_FLAG
| (DIGIT_TAG
<<UCOL_TAG_SHIFT
) | 1); // prepare the element
1235 if(element
->mapCE
) { // if there is an expansion, we'll pick it here
1236 expansion
|= ((uprv_uca_addExpansion(expansions
, element
->mapCE
, status
)+(headersize
>>2))<<4);
1238 expansion
|= ((uprv_uca_addExpansion(expansions
, element
->CEs
[0], status
)+(headersize
>>2))<<4);
1240 element
->mapCE
= expansion
;
1242 // Need to go back to the beginning of the digit string if in the middle!
1243 if(uniChar
<= 0xFFFF) { // supplementaries are always unsafe. API takes UChars
1244 unsafeCPSet(t
->unsafeCP
, (UChar
)uniChar
);
1248 // here we want to add the prefix structure.
1249 // I will try to process it as a reverse contraction, if possible.
1250 // prefix buffer is already reversed.
1252 if(element
->prefixSize
!=0) {
1253 // We keep the seen prefix starter elements in a hashtable
1254 // we need it to be able to distinguish between the simple
1255 // codepoints and prefix starters. Also, we need to use it
1256 // for canonical closure.
1258 UCAElements
*composed
= (UCAElements
*)uprv_malloc(sizeof(UCAElements
));
1260 if (composed
== NULL
) {
1261 *status
= U_MEMORY_ALLOCATION_ERROR
;
1264 uprv_memcpy(composed
, element
, sizeof(UCAElements
));
1265 composed
->cPoints
= composed
->uchars
;
1266 composed
->prefix
= composed
->prefixChars
;
1268 composed
->prefixSize
= unorm_normalize(element
->prefix
, element
->prefixSize
, UNORM_NFC
, 0, composed
->prefix
, 128, status
);
1271 if(t
->prefixLookup
!= NULL
) {
1272 UCAElements
*uCE
= (UCAElements
*)uhash_get(t
->prefixLookup
, element
);
1273 if(uCE
!= NULL
) { // there is already a set of code points here
1274 element
->mapCE
= uprv_uca_addPrefix(t
, uCE
->mapCE
, element
, status
);
1275 } else { // no code points, so this spot is clean
1276 element
->mapCE
= uprv_uca_addPrefix(t
, UCOL_NOT_FOUND
, element
, status
);
1277 uCE
= (UCAElements
*)uprv_malloc(sizeof(UCAElements
));
1280 *status
= U_MEMORY_ALLOCATION_ERROR
;
1283 uprv_memcpy(uCE
, element
, sizeof(UCAElements
));
1284 uCE
->cPoints
= uCE
->uchars
;
1285 uhash_put(t
->prefixLookup
, uCE
, uCE
, status
);
1287 if(composed
->prefixSize
!= element
->prefixSize
|| uprv_memcmp(composed
->prefix
, element
->prefix
, element
->prefixSize
)) {
1289 composed
->mapCE
= uprv_uca_addPrefix(t
, element
->mapCE
, composed
, status
);
1292 uprv_free(composed
);
1295 // We need to use the canonical iterator here
1296 // the way we do it is to generate the canonically equivalent strings
1297 // for the contraction and then add the sequences that pass FCD check
1298 if(element
->cSize
> 1 && !(element
->cSize
==2 && U16_IS_LEAD(element
->cPoints
[0]) && U16_IS_TRAIL(element
->cPoints
[1]))) { // this is a contraction, we should check whether a composed form should also be included
1299 UnicodeString
source(element
->cPoints
, element
->cSize
);
1300 CanonicalIterator
it(source
, *status
);
1302 while(!source
.isBogus()) {
1303 if(Normalizer::quickCheck(source
, UNORM_FCD
, *status
) != UNORM_NO
) {
1304 element
->cSize
= source
.extract(element
->cPoints
, 128, *status
);
1305 uprv_uca_finalizeAddition(t
, element
, status
);
1309 CE
= element
->mapCE
;
1311 CE
= uprv_uca_finalizeAddition(t
, element
, status
);
1318 /*void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, */
1319 static void uprv_uca_getMaxExpansionJamo(UNewTrie
*mapping
,
1320 MaxExpansionTable
*maxexpansion
,
1321 MaxJamoExpansionTable
*maxjamoexpansion
,
1325 const uint32_t VBASE
= 0x1161;
1326 const uint32_t TBASE
= 0x11A8;
1327 const uint32_t VCOUNT
= 21;
1328 const uint32_t TCOUNT
= 28;
1330 uint32_t v
= VBASE
+ VCOUNT
- 1;
1331 uint32_t t
= TBASE
+ TCOUNT
- 1;
1334 while (v
>= VBASE
) {
1335 /*ce = ucmpe32_get(mapping, v);*/
1336 ce
= utrie_get32(mapping
, v
, NULL
);
1337 if (ce
< UCOL_SPECIAL_FLAG
) {
1338 uprv_uca_setMaxExpansion(ce
, 2, maxexpansion
, status
);
1345 /*ce = ucmpe32_get(mapping, t);*/
1346 ce
= utrie_get32(mapping
, t
, NULL
);
1347 if (ce
< UCOL_SPECIAL_FLAG
) {
1348 uprv_uca_setMaxExpansion(ce
, 3, maxexpansion
, status
);
1352 /* According to the docs, 99% of the time, the Jamo will not be special */
1354 /* gets the max expansion in all unicode characters */
1355 int count
= maxjamoexpansion
->position
;
1356 uint8_t maxTSize
= (uint8_t)(maxjamoexpansion
->maxLSize
+
1357 maxjamoexpansion
->maxVSize
+
1358 maxjamoexpansion
->maxTSize
);
1359 uint8_t maxVSize
= (uint8_t)(maxjamoexpansion
->maxLSize
+
1360 maxjamoexpansion
->maxVSize
);
1364 if (*(maxjamoexpansion
->isV
+ count
) == TRUE
) {
1365 uprv_uca_setMaxExpansion(
1366 *(maxjamoexpansion
->endExpansionCE
+ count
),
1367 maxVSize
, maxexpansion
, status
);
1370 uprv_uca_setMaxExpansion(
1371 *(maxjamoexpansion
->endExpansionCE
+ count
),
1372 maxTSize
, maxexpansion
, status
);
1379 static inline uint32_t U_CALLCONV
1380 getFoldedValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
)
1388 while(start
<limit
) {
1389 value
=utrie_get32(trie
, start
, &inBlockZero
);
1390 tag
= getCETag(value
);
1391 if(inBlockZero
== TRUE
) {
1392 start
+=UTRIE_DATA_BLOCK_LENGTH
;
1393 } else if(!(isSpecial(value
) && (tag
== IMPLICIT_TAG
|| tag
== NOT_FOUND_TAG
))) {
1394 /* These are values that are starting in either UCA (IMPLICIT_TAG) or in the
1395 * tailorings (NOT_FOUND_TAG). Presence of these tags means that there is
1396 * nothing in this position and that it should be skipped.
1399 static int32_t count
= 1;
1400 fprintf(stdout
, "%i, Folded %08X, value %08X\n", count
++, start
, value
);
1402 return (uint32_t)(UCOL_SPECIAL_FLAG
| (SURROGATE_TAG
<<24) | offset
);
1412 // This is a debug function to print the contents of a trie.
1413 // It is used in conjuction with the code around utrie_unserialize call
1414 UBool
enumRange(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
1416 fprintf(stdout
, "%08X, %08X, %08X\n", start
, limit
, value
);
1418 fprintf(stdout
, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start
, U16_LEAD(start
), U16_TRAIL(start
), limit
, U16_LEAD(limit
), U16_TRAIL(limit
), value
);
1424 myGetFoldingOffset(uint32_t data
) {
1425 if(data
> UCOL_NOT_FOUND
&& getCETag(data
) == SURROGATE_TAG
) {
1426 return (data
&0xFFFFFF);
1433 U_CAPI UCATableHeader
* U_EXPORT2
1434 uprv_uca_assembleTable(tempUCATable
*t
, UErrorCode
*status
) {
1435 /*CompactEIntArray *mapping = t->mapping;*/
1436 UNewTrie
*mapping
= t
->mapping
;
1437 ExpansionTable
*expansions
= t
->expansions
;
1438 CntTable
*contractions
= t
->contractions
;
1439 MaxExpansionTable
*maxexpansion
= t
->maxExpansions
;
1441 if(U_FAILURE(*status
)) {
1445 uint32_t beforeContractions
= (uint32_t)((headersize
+paddedsize(expansions
->position
*sizeof(uint32_t)))/sizeof(UChar
));
1447 int32_t contractionsSize
= 0;
1448 contractionsSize
= uprv_cnttab_constructTable(contractions
, beforeContractions
, status
);
1450 /* the following operation depends on the trie data. Therefore, we have to do it before */
1451 /* the trie is compacted */
1452 /* sets jamo expansions */
1453 uprv_uca_getMaxExpansionJamo(mapping
, maxexpansion
, t
->maxJamoExpansions
,
1454 t
->image
->jamoSpecial
, status
);
1456 /*ucmpe32_compact(mapping);*/
1457 /*UMemoryStream *ms = uprv_mstrm_openNew(8192);*/
1458 /*int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);*/
1459 /*const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);*/
1461 // After setting the jamo expansions, compact the trie and get the needed size
1462 int32_t mappingSize
= utrie_serialize(mapping
, NULL
, 0, getFoldedValue
/*getFoldedValue*/, FALSE
, status
);
1464 uint32_t tableOffset
= 0;
1467 /* TODO: LATIN1 array is now in the utrie - it should be removed from the calculation */
1469 uint32_t toAllocate
=(uint32_t)(headersize
+
1470 paddedsize(expansions
->position
*sizeof(uint32_t))+
1471 paddedsize(mappingSize
)+
1472 paddedsize(contractionsSize
*(sizeof(UChar
)+sizeof(uint32_t)))+
1473 //paddedsize(0x100*sizeof(uint32_t)) /* Latin1 is now included in the trie */
1474 /* maxexpansion array */
1475 + paddedsize(maxexpansion
->position
* sizeof(uint32_t)) +
1476 /* maxexpansion size array */
1477 paddedsize(maxexpansion
->position
* sizeof(uint8_t)) +
1478 paddedsize(UCOL_UNSAFECP_TABLE_SIZE
) + /* Unsafe chars */
1479 paddedsize(UCOL_UNSAFECP_TABLE_SIZE
)); /* Contraction Ending chars */
1482 dataStart
= (uint8_t *)uprv_malloc(toAllocate
);
1484 if (dataStart
== NULL
) {
1485 *status
= U_MEMORY_ALLOCATION_ERROR
;
1489 UCATableHeader
*myData
= (UCATableHeader
*)dataStart
;
1490 // Please, do reset all the fields!
1491 uprv_memset(dataStart
, 0, toAllocate
);
1492 // Make sure we know this is reset
1493 myData
->magic
= UCOL_HEADER_MAGIC
;
1494 myData
->isBigEndian
= U_IS_BIG_ENDIAN
;
1495 myData
->charSetFamily
= U_CHARSET_FAMILY
;
1496 myData
->formatVersion
[0] = UCA_FORMAT_VERSION_0
;
1497 myData
->formatVersion
[1] = UCA_FORMAT_VERSION_1
;
1498 myData
->formatVersion
[2] = UCA_FORMAT_VERSION_2
;
1499 myData
->formatVersion
[3] = UCA_FORMAT_VERSION_3
;
1500 myData
->jamoSpecial
= t
->image
->jamoSpecial
;
1502 // Don't copy stuff from UCA header!
1503 //uprv_memcpy(myData, t->image, sizeof(UCATableHeader));
1505 myData
->contractionSize
= contractionsSize
;
1507 tableOffset
+= (uint32_t)(paddedsize(sizeof(UCATableHeader
)));
1509 myData
->options
= tableOffset
;
1510 uprv_memcpy(dataStart
+tableOffset
, t
->options
, sizeof(UColOptionSet
));
1511 tableOffset
+= (uint32_t)(paddedsize(sizeof(UColOptionSet
)));
1513 /* copy expansions */
1514 /*myData->expansion = (uint32_t *)dataStart+tableOffset;*/
1515 myData
->expansion
= tableOffset
;
1516 uprv_memcpy(dataStart
+tableOffset
, expansions
->CEs
, expansions
->position
*sizeof(uint32_t));
1517 tableOffset
+= (uint32_t)(paddedsize(expansions
->position
*sizeof(uint32_t)));
1519 /* contractions block */
1520 if(contractionsSize
!= 0) {
1521 /* copy contraction index */
1522 /*myData->contractionIndex = (UChar *)(dataStart+tableOffset);*/
1523 myData
->contractionIndex
= tableOffset
;
1524 uprv_memcpy(dataStart
+tableOffset
, contractions
->codePoints
, contractionsSize
*sizeof(UChar
));
1525 tableOffset
+= (uint32_t)(paddedsize(contractionsSize
*sizeof(UChar
)));
1527 /* copy contraction collation elements */
1528 /*myData->contractionCEs = (uint32_t *)(dataStart+tableOffset);*/
1529 myData
->contractionCEs
= tableOffset
;
1530 uprv_memcpy(dataStart
+tableOffset
, contractions
->CEs
, contractionsSize
*sizeof(uint32_t));
1531 tableOffset
+= (uint32_t)(paddedsize(contractionsSize
*sizeof(uint32_t)));
1533 myData
->contractionIndex
= 0;
1534 myData
->contractionCEs
= 0;
1537 /* copy mapping table */
1538 /*myData->mappingPosition = dataStart+tableOffset;*/
1539 /*myData->mappingPosition = tableOffset;*/
1540 /*uprv_memcpy(dataStart+tableOffset, flattened, mappingSize);*/
1542 myData
->mappingPosition
= tableOffset
;
1543 utrie_serialize(mapping
, dataStart
+tableOffset
, toAllocate
-tableOffset
, getFoldedValue
, FALSE
, status
);
1545 // This is debug code to dump the contents of the trie. It needs two functions defined above
1549 utrie_unserialize(&UCAt
, dataStart
+tableOffset
, 9999999, status
);
1550 UCAt
.getFoldingOffset
= myGetFoldingOffset
;
1551 if(U_SUCCESS(*status
)) {
1552 utrie_enum(&UCAt
, NULL
, enumRange
, NULL
);
1554 trieWord
= UTRIE_GET32_FROM_LEAD(&UCAt
, 0xDC01);
1557 tableOffset
+= paddedsize(mappingSize
);
1562 /* copy max expansion table */
1563 myData
->endExpansionCE
= tableOffset
;
1564 myData
->endExpansionCECount
= maxexpansion
->position
- 1;
1565 /* not copying the first element which is a dummy */
1566 uprv_memcpy(dataStart
+ tableOffset
, maxexpansion
->endExpansionCE
+ 1,
1567 (maxexpansion
->position
- 1) * sizeof(uint32_t));
1568 tableOffset
+= (uint32_t)(paddedsize((maxexpansion
->position
)* sizeof(uint32_t)));
1569 myData
->expansionCESize
= tableOffset
;
1570 uprv_memcpy(dataStart
+ tableOffset
, maxexpansion
->expansionCESize
+ 1,
1571 (maxexpansion
->position
- 1) * sizeof(uint8_t));
1572 tableOffset
+= (uint32_t)(paddedsize((maxexpansion
->position
)* sizeof(uint8_t)));
1574 /* Unsafe chars table. Finish it off, then copy it. */
1575 uprv_uca_unsafeCPAddCCNZ(t
, status
);
1576 if (t
->UCA
!= 0) { /* Or in unsafebits from UCA, making a combined table. */
1577 for (i
=0; i
<UCOL_UNSAFECP_TABLE_SIZE
; i
++) {
1578 t
->unsafeCP
[i
] |= t
->UCA
->unsafeCP
[i
];
1581 myData
->unsafeCP
= tableOffset
;
1582 uprv_memcpy(dataStart
+ tableOffset
, t
->unsafeCP
, UCOL_UNSAFECP_TABLE_SIZE
);
1583 tableOffset
+= paddedsize(UCOL_UNSAFECP_TABLE_SIZE
);
1586 /* Finish building Contraction Ending chars hash table and then copy it out. */
1587 if (t
->UCA
!= 0) { /* Or in unsafebits from UCA, making a combined table. */
1588 for (i
=0; i
<UCOL_UNSAFECP_TABLE_SIZE
; i
++) {
1589 t
->contrEndCP
[i
] |= t
->UCA
->contrEndCP
[i
];
1592 myData
->contrEndCP
= tableOffset
;
1593 uprv_memcpy(dataStart
+ tableOffset
, t
->contrEndCP
, UCOL_UNSAFECP_TABLE_SIZE
);
1594 tableOffset
+= paddedsize(UCOL_UNSAFECP_TABLE_SIZE
);
1596 if(tableOffset
!= toAllocate
) {
1598 fprintf(stderr
, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate
, tableOffset
);
1600 *status
= U_INTERNAL_PROGRAM_ERROR
;
1601 uprv_free(dataStart
);
1605 myData
->size
= tableOffset
;
1606 /* This should happen upon ressurection */
1607 /*const uint8_t *mapPosition = (uint8_t*)myData+myData->mappingPosition;*/
1608 /*uprv_mstrm_close(ms);*/
1615 UCollator
*tempColl
;
1616 UCollationElements
* colEl
;
1617 const Normalizer2Impl
*nfcImpl
;
1619 int32_t noOfClosures
;
1623 static UBool U_CALLCONV
1624 _enumCategoryRangeClosureCategory(const void *context
, UChar32 start
, UChar32 limit
, UCharCategory type
) {
1626 if (type
!= U_UNASSIGNED
&& type
!= U_PRIVATE_USE_CHAR
) { // if the range is assigned - we might ommit more categories later
1627 UErrorCode
*status
= ((enumStruct
*)context
)->status
;
1628 tempUCATable
*t
= ((enumStruct
*)context
)->t
;
1629 UCollator
*tempColl
= ((enumStruct
*)context
)->tempColl
;
1630 UCollationElements
* colEl
= ((enumStruct
*)context
)->colEl
;
1632 UChar decompBuffer
[4];
1633 const UChar
*decomp
;
1634 int32_t noOfDec
= 0;
1640 for(u32
= start
; u32
< limit
; u32
++) {
1641 decomp
= ((enumStruct
*)context
)->nfcImpl
->
1642 getDecomposition(u32
, decompBuffer
, noOfDec
);
1643 //if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
1644 //|| (noOfDec == 1 && *decomp != (UChar)u32))
1648 U16_APPEND_UNSAFE(comp
, len
, u32
);
1649 if(ucol_strcoll(tempColl
, comp
, len
, decomp
, noOfDec
) != UCOL_EQUAL
) {
1651 fprintf(stderr
, "Closure: U+%04X -> ", u32
);
1654 while(i
< noOfDec
) {
1655 U16_NEXT(decomp
, i
, noOfDec
, c
);
1656 fprintf(stderr
, "%04X ", c
);
1658 fprintf(stderr
, "\n");
1659 // print CEs for code point vs. decomposition
1660 fprintf(stderr
, "U+%04X CEs: ", u32
);
1661 UCollationElements
*iter
= ucol_openElements(tempColl
, comp
, len
, status
);
1663 while((ce
= ucol_next(iter
, status
)) != UCOL_NULLORDER
) {
1664 fprintf(stderr
, "%08X ", ce
);
1666 fprintf(stderr
, "\nDecomp CEs: ");
1667 ucol_setText(iter
, decomp
, noOfDec
, status
);
1668 while((ce
= ucol_next(iter
, status
)) != UCOL_NULLORDER
) {
1669 fprintf(stderr
, "%08X ", ce
);
1671 fprintf(stderr
, "\n");
1672 ucol_closeElements(iter
);
1674 if(((enumStruct
*)context
)->closed
!= NULL
) {
1675 ((enumStruct
*)context
)->closed
->add(u32
);
1677 ((enumStruct
*)context
)->noOfClosures
++;
1678 el
.cPoints
= (UChar
*)decomp
;
1681 el
.prefix
= el
.prefixChars
;
1684 UCAElements
*prefix
=(UCAElements
*)uhash_get(t
->prefixLookup
, &el
);
1687 el
.prefix
= el
.prefixChars
;
1689 if(prefix
== NULL
) {
1691 ucol_setText(colEl
, decomp
, noOfDec
, status
);
1692 while((el
.CEs
[el
.noOfCEs
] = ucol_next(colEl
, status
)) != (uint32_t)UCOL_NULLORDER
) {
1697 el
.CEs
[0] = prefix
->mapCE
;
1698 // This character uses a prefix. We have to add it
1699 // to the unsafe table, as it decomposed form is already
1700 // in. In Japanese, this happens for \u309e & \u30fe
1701 // Since unsafeCPSet is static in ucol_elm, we are going
1702 // to wrap it up in the uprv_uca_unsafeCPAddCCNZ function
1704 uprv_uca_addAnElement(t
, &el
, status
);
1714 uprv_uca_setMapCE(tempUCATable
*t
, UCAElements
*element
, UErrorCode
*status
) {
1715 uint32_t expansion
= 0;
1718 ExpansionTable
*expansions
= t
->expansions
;
1719 if(element
->noOfCEs
== 2 // a two CE expansion
1720 && isContinuation(element
->CEs
[1]) // which is a continuation
1721 && (element
->CEs
[1] & (~(0xFF << 24 | UCOL_CONTINUATION_MARKER
))) == 0 // that has only primaries in continuation,
1722 && (((element
->CEs
[0]>>8) & 0xFF) == UCOL_BYTE_COMMON
) // a common secondary
1723 && ((element
->CEs
[0] & 0xFF) == UCOL_BYTE_COMMON
) // and a common tertiary
1725 element
->mapCE
= UCOL_SPECIAL_FLAG
| (LONG_PRIMARY_TAG
<<24) // a long primary special
1726 | ((element
->CEs
[0]>>8) & 0xFFFF00) // first and second byte of primary
1727 | ((element
->CEs
[1]>>24) & 0xFF); // third byte of primary
1729 expansion
= (uint32_t)(UCOL_SPECIAL_FLAG
| (EXPANSION_TAG
<<UCOL_TAG_SHIFT
)
1730 | (((uprv_uca_addExpansion(expansions
, element
->CEs
[0], status
)+(headersize
>>2))<<4)
1733 for(j
= 1; j
<(int32_t)element
->noOfCEs
; j
++) {
1734 uprv_uca_addExpansion(expansions
, element
->CEs
[j
], status
);
1736 if(element
->noOfCEs
<= 0xF) {
1737 expansion
|= element
->noOfCEs
;
1739 uprv_uca_addExpansion(expansions
, 0, status
);
1741 element
->mapCE
= expansion
;
1742 uprv_uca_setMaxExpansion(element
->CEs
[element
->noOfCEs
- 1],
1743 (uint8_t)element
->noOfCEs
,
1750 uprv_uca_addFCD4AccentedContractions(tempUCATable
*t
,
1751 UCollationElements
* colEl
,
1755 UErrorCode
*status
) {
1756 UChar decomp
[256], comp
[256];
1757 int32_t decLen
, compLen
;
1759 decLen
= unorm_normalize(data
, len
, UNORM_NFD
, 0, decomp
, 256, status
);
1760 compLen
= unorm_normalize(data
, len
, UNORM_NFC
, 0, comp
, 256, status
);
1761 decomp
[decLen
] = comp
[compLen
] = 0;
1763 el
->cPoints
= decomp
;
1767 el
->prefix
= el
->prefixChars
;
1769 UCAElements
*prefix
=(UCAElements
*)uhash_get(t
->prefixLookup
, el
);
1771 el
->cSize
= compLen
;
1772 el
->prefix
= el
->prefixChars
;
1774 if(prefix
== NULL
) {
1776 ucol_setText(colEl
, decomp
, decLen
, status
);
1777 while((el
->CEs
[el
->noOfCEs
] = ucol_next(colEl
, status
)) != (uint32_t)UCOL_NULLORDER
) {
1780 uprv_uca_setMapCE(t
, el
, status
);
1781 uprv_uca_addAnElement(t
, el
, status
);
1783 el
->cPoints
=NULL
; /* don't leak reference to stack */
1787 uprv_uca_addMultiCMContractions(tempUCATable
*t
,
1788 UCollationElements
* colEl
,
1789 tempTailorContext
*c
,
1791 UErrorCode
*status
) {
1792 CombinClassTable
*cmLookup
= t
->cmLookup
;
1793 UChar newDecomp
[256];
1794 int32_t maxComp
, newDecLen
;
1795 const Normalizer2Impl
*nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
1796 if (U_FAILURE(*status
)) {
1799 int16_t curClass
= nfcImpl
->getFCD16(c
->tailoringCM
) & 0xff;
1800 CompData
*precomp
= c
->precomp
;
1801 int32_t compLen
= c
->compLen
;
1802 UChar
*comp
= c
->comp
;
1803 maxComp
= c
->precompLen
;
1805 for (int32_t j
=0; j
< maxComp
; j
++) {
1808 if ( count
== 0 ) { // Decompose the saved precomposed char.
1810 temp
[0]=precomp
[j
].cp
;
1812 newDecLen
= unorm_normalize(temp
, 1, UNORM_NFD
, 0,
1813 newDecomp
, sizeof(newDecomp
)/sizeof(UChar
), status
);
1814 newDecomp
[newDecLen
++] = cmLookup
->cPoints
[c
->cmPos
];
1816 else { // swap 2 combining marks when they are equal.
1817 uprv_memcpy(newDecomp
, c
->decomp
, sizeof(UChar
)*(c
->decompLen
));
1818 newDecLen
= c
->decompLen
;
1819 newDecomp
[newDecLen
++] = precomp
[j
].cClass
;
1821 newDecomp
[newDecLen
] = 0;
1822 compLen
= unorm_normalize(newDecomp
, newDecLen
, UNORM_NFC
, 0,
1825 comp
[compLen
++] = newDecomp
[newDecLen
++] = c
->tailoringCM
;
1826 comp
[compLen
] = newDecomp
[newDecLen
] = 0;
1827 el
->cPoints
= newDecomp
;
1828 el
->cSize
= newDecLen
;
1830 UCAElements
*prefix
=(UCAElements
*)uhash_get(t
->prefixLookup
, el
);
1831 el
->cPoints
= c
->comp
;
1832 el
->cSize
= compLen
;
1833 el
->prefix
= el
->prefixChars
;
1835 if(prefix
== NULL
) {
1837 ucol_setText(colEl
, newDecomp
, newDecLen
, status
);
1838 while((el
->CEs
[el
->noOfCEs
] = ucol_next(colEl
, status
)) != (uint32_t)UCOL_NULLORDER
) {
1841 uprv_uca_setMapCE(t
, el
, status
);
1842 uprv_uca_finalizeAddition(t
, el
, status
);
1844 // Save the current precomposed char and its class to find any
1845 // other combining mark combinations.
1846 precomp
[c
->precompLen
].cp
=comp
[0];
1847 precomp
[c
->precompLen
].cClass
= curClass
;
1851 } while (++count
<2 && (precomp
[j
].cClass
== curClass
));
1857 uprv_uca_addTailCanonicalClosures(tempUCATable
*t
,
1858 UCollationElements
* colEl
,
1862 UErrorCode
*status
) {
1863 CombinClassTable
*cmLookup
= t
->cmLookup
;
1864 const Normalizer2Impl
*nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
1865 if (U_FAILURE(*status
)) {
1868 int16_t maxIndex
= nfcImpl
->getFCD16(cMark
) & 0xff;
1869 UCAElements element
;
1873 CompData precomp
[256]; // precomposed array
1874 int32_t precompLen
= 0; // count for precomp
1875 int32_t i
, len
, decompLen
, replacedPos
;
1876 tempTailorContext c
;
1878 if ( cmLookup
== NULL
) {
1881 index
= cmLookup
->index
;
1882 int32_t cClass
=nfcImpl
->getFCD16(cMark
) & 0xff;
1883 maxIndex
= (int32_t)index
[(nfcImpl
->getFCD16(cMark
) & 0xff)-1];
1886 c
.precomp
= precomp
;
1887 c
.tailoringCM
= cMark
;
1890 maxIndex
= (int32_t)index
[cClass
-1];
1896 for ( i
=0; i
<maxIndex
; i
++ ) {
1897 decomp
[1] = cmLookup
->cPoints
[i
];
1900 len
= unorm_normalize(decomp
, decompLen
, UNORM_NFC
, 0, comp
, 256, status
);
1902 // Save the current precomposed char and its class to find any
1903 // other combining mark combinations.
1904 precomp
[precompLen
].cp
=comp
[0];
1905 precomp
[precompLen
].cClass
=
1906 index
[nfcImpl
->getFCD16(decomp
[1]) & 0xff];
1909 for (decompLen
=0; decompLen
< (int32_t)el
->cSize
; decompLen
++) {
1910 decomp
[decompLen
] = el
->cPoints
[decompLen
];
1911 if (decomp
[decompLen
]==cMark
) {
1912 replacedPos
= decompLen
; // record the position for later use
1915 if ( replacedPos
!= 0 ) {
1916 decomp
[replacedPos
]=cmLookup
->cPoints
[i
];
1918 decomp
[decompLen
] = 0;
1919 len
= unorm_normalize(decomp
, decompLen
, UNORM_NFC
, 0, comp
, 256, status
);
1920 comp
[len
++] = decomp
[decompLen
++] = cMark
;
1921 comp
[len
] = decomp
[decompLen
] = 0;
1922 element
.cPoints
= decomp
;
1923 element
.cSize
= decompLen
;
1924 element
.noOfCEs
= 0;
1925 element
.prefix
= el
->prefixChars
;
1926 element
.prefixSize
= 0;
1928 UCAElements
*prefix
=(UCAElements
*)uhash_get(t
->prefixLookup
, &element
);
1929 element
.cPoints
= comp
;
1930 element
.cSize
= len
;
1931 element
.prefix
= el
->prefixChars
;
1932 element
.prefixSize
= 0;
1933 if(prefix
== NULL
) {
1934 element
.noOfCEs
= 0;
1935 ucol_setText(colEl
, decomp
, decompLen
, status
);
1936 while((element
.CEs
[element
.noOfCEs
] = ucol_next(colEl
, status
)) != (uint32_t)UCOL_NULLORDER
) {
1939 uprv_uca_setMapCE(t
, &element
, status
);
1940 uprv_uca_finalizeAddition(t
, &element
, status
);
1943 // This is a fix for tailoring contractions with accented
1944 // character at the end of contraction string.
1946 (nfcImpl
->getFCD16(comp
[len
-2]) & 0xff00)==0) {
1947 uprv_uca_addFCD4AccentedContractions(t
, colEl
, comp
, len
, &element
, status
);
1950 if (precompLen
>1) {
1952 c
.decompLen
= decompLen
;
1953 c
.precompLen
= precompLen
;
1955 uprv_uca_addMultiCMContractions(t
, colEl
, &c
, &element
, status
);
1956 precompLen
= c
.precompLen
;
// uprv_uca_canonicalClosure
//
// Builds a temporary collator from a clone of the temp table `t`, enumerates
// character types through _enumCategoryRangeClosureCategory with a shared
// `context`, and then, when `src` asks for it (buildCCTabFlag), walks the
// parsed rule tokens and calls uprv_uca_addTailCanonicalClosures for every
// token that contains a base character followed by a combining mark.
//
// Returns context.noOfClosures on the paths visible here.
//
// NOTE(review): this listing does not show every original line (the remaining
// parameters, the bodies of the early-exit checks, the non-prefix `else`
// header and the token advance are not visible). Comments below describe only
// what is visible; confirm the rest against the full file.
1962 U_CFUNC
int32_t U_EXPORT2
1963 uprv_uca_canonicalClosure(tempUCATable
*t
,
1964 UColTokenParser
*src
,
// Seed the enumeration context: the caller-supplied `closed` object and a
// zeroed closure counter (the counter is what this function returns).
1969 context
.closed
= closed
;
1970 context
.noOfClosures
= 0;
1973 uint32_t i
= 0, j
= 0;
1974 UChar baseChar
, firstCM
;
// NFC implementation used below for getFCD16() lookups.
1975 context
.nfcImpl
=Normalizer2Factory::getNFCImpl(*status
);
1976 if(U_FAILURE(*status
)) {
1980 UCollator
*tempColl
= NULL
;
// Work on a clone so that assembling a collator does not consume `t` itself.
// NOTE(review): presumed intent; confirm against uprv_uca_assembleTable.
1981 tempUCATable
*tempTable
= uprv_uca_cloneTempTable(t
, status
);
1982 // Check the status after cloning (the pointer itself is not tested here)
1983 if (U_FAILURE(*status
)) {
// Assemble the cloned table into collator data and open a temporary collator
// on it, using t->UCA as the third argument.
1987 UCATableHeader
*tempData
= uprv_uca_assembleTable(tempTable
, status
);
1988 tempColl
= ucol_initCollator(tempData
, 0, t
->UCA
, status
);
// Move cmLookup from the clone to `t` and clear it in the clone before the
// clone is closed.
// NOTE(review): presumably so closing the clone does not release it; confirm
// in uprv_uca_closeTempTable.
1989 if ( tempTable
->cmLookup
!= NULL
) {
1990 t
->cmLookup
= tempTable
->cmLookup
; // copy over to t
1991 tempTable
->cmLookup
= NULL
;
1993 uprv_uca_closeTempTable(tempTable
);
// On success, reset the temporary collator's rule/locale pointers and flag it
// as having real data and owning its image; on failure, free the assembled
// data if there is any.
1995 if(U_SUCCESS(*status
)) {
1996 tempColl
->ucaRules
= NULL
;
1997 tempColl
->actualLocale
= NULL
;
1998 tempColl
->validLocale
= NULL
;
1999 tempColl
->requestedLocale
= NULL
;
2000 tempColl
->hasRealData
= TRUE
;
2001 tempColl
->freeImageOnClose
= TRUE
;
2002 } else if(tempData
!= 0) {
2003 uprv_free(tempData
);
2006 /* produce canonical closure */
2007 UCollationElements
* colEl
= ucol_openElements(tempColl
, NULL
, 0, status
);
2008 // Check the status after opening the iterator
// NOTE(review): the body of this check is not visible here; confirm that
// tempColl is released on this exit path.
2009 if (U_FAILURE(*status
)) {
// Hand the collator, iterator and status pointer to the enumeration callback.
2013 context
.tempColl
= tempColl
;
2014 context
.colEl
= colEl
;
2015 context
.status
= status
;
2016 u_enumCharTypes(_enumCategoryRangeClosureCategory
, &context
);
// Without a parser, or when the parser did not request the combining-class
// table, the enumeration above is all that is done.
2018 if ( (src
==NULL
) || !src
->buildCCTabFlag
) {
2019 ucol_closeElements(colEl
);
2020 ucol_close(tempColl
);
2021 return context
.noOfClosures
; // no extra contraction needed to add
// Walk every token list in src->lh[0 .. resultLen-1].
2024 for (i
=0; i
< src
->resultLen
; i
++) {
2025 baseChar
= firstCM
= (UChar
)0;
2026 tok
= src
->lh
[i
].first
;
2027 while (tok
!= NULL
&& U_SUCCESS(*status
)) {
// Point the element at its own embedded buffers before filling them.
2028 el
.prefix
= el
.prefixChars
;
2029 el
.cPoints
= el
.uchars
;
// tok->prefix and tok->source are used as packed values: the bits above bit 23
// are a length and the low 24 bits are an offset into src->source.
2030 if(tok
->prefix
!= 0) {
2031 el
.prefixSize
= tok
->prefix
>>24;
2032 uprv_memcpy(el
.prefix
, src
->source
+ (tok
->prefix
& 0x00FFFFFF), el
.prefixSize
*sizeof(UChar
));
// With a prefix, the source length is reduced by the prefix length and the
// copy starts that many UChars past the source offset.
2034 el
.cSize
= (tok
->source
>> 24)-(tok
->prefix
>>24);
2035 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF)+(tok
->prefix
>>24) + src
->source
, el
.cSize
*sizeof(UChar
));
// Copy of the whole source span, with no prefix length subtracted.
// NOTE(review): presumably the no-prefix branch; its `else` header is not
// visible here.
2040 el
.cSize
= (tok
->source
>> 24);
2041 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF) + src
->source
, el
.cSize
*sizeof(UChar
));
// Scan the token's code units, tracking the most recent unit whose FCD16 low
// byte is zero (treated as a base character) and the first non-zero unit that
// follows it (treated as the first combining mark).
2043 if(src
->UCA
!= NULL
) {
2044 for(j
= 0; j
<el
.cSize
; j
++) {
2045 int16_t fcd
= context
.nfcImpl
->getFCD16(el
.cPoints
[j
]);
2046 if ( (fcd
& 0xff) == 0 ) {
2047 baseChar
= el
.cPoints
[j
]; // last base character
2048 firstCM
=0; // reset combining mark value
2051 if ( (baseChar
!=0) && (firstCM
==0) ) {
2052 firstCM
= el
.cPoints
[j
]; // first combining mark
// Only a base character followed by a combining mark triggers tail closure.
2057 if ( (baseChar
!= (UChar
)0) && (firstCM
!= (UChar
)0) ) {
2058 // find all the canonical rules
2059 uprv_uca_addTailCanonicalClosures(t
, colEl
, baseChar
, firstCM
, &el
, status
);
// Release the temporary iterator and collator, then report the closure count.
2064 ucol_closeElements(colEl
);
2065 ucol_close(tempColl
);
2067 return context
.noOfClosures
;
2070 #endif /* #if !UCONFIG_NO_COLLATION */