2 *******************************************************************************
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_bld.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This module builds a collator based on the rule set.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "normalizer2impl.h"
/* File-scope slots for the inverse UCA data: a pointer to the table header
 * and the UDataMemory handle. Both start out NULL.
 * NOTE(review): presumably filled in lazily by a loader that is not part of
 * this listing - confirm who sets and releases them. */
39 static const InverseUCATableHeader
* _staticInvUCA
= NULL
;
40 static UDataMemory
* invUCA_DATA_MEM
= NULL
;
43 static UBool U_CALLCONV
44 isAcceptableInvUCA(void * /*context*/,
45 const char * /*type*/, const char * /*name*/,
46 const UDataInfo
*pInfo
)
48 /* context, type & name are intentionally not used */
49 if( pInfo
->size
>=20 &&
50 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
51 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
52 pInfo
->dataFormat
[0]==INVUCA_DATA_FORMAT_0
&& /* dataFormat="InvC" */
53 pInfo
->dataFormat
[1]==INVUCA_DATA_FORMAT_1
&&
54 pInfo
->dataFormat
[2]==INVUCA_DATA_FORMAT_2
&&
55 pInfo
->dataFormat
[3]==INVUCA_DATA_FORMAT_3
&&
56 pInfo
->formatVersion
[0]==INVUCA_FORMAT_VERSION_0
&&
57 pInfo
->formatVersion
[1]>=INVUCA_FORMAT_VERSION_1
//&&
58 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
59 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
60 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
63 UVersionInfo UCDVersion
;
64 u_getUnicodeVersion(UCDVersion
);
65 return (pInfo
->dataVersion
[0]==UCDVersion
[0] &&
66 pInfo
->dataVersion
[1]==UCDVersion
[1]);
67 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
68 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
69 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
77 * Takes two CEs (lead and continuation) and
78 * compares them as CEs should be compared:
79 * primary vs. primary, secondary vs. secondary
80 * tertiary vs. tertiary
82 static int32_t compareCEs(uint32_t source0
, uint32_t source1
, uint32_t target0
, uint32_t target1
) {
83 uint32_t s1
= source0
, s2
, t1
= target0
, t2
;
84 if(isContinuation(source1
)) {
89 if(isContinuation(target1
)) {
95 uint32_t s
= 0, t
= 0;
96 if(s1
== t1
&& s2
== t2
) {
99 s
= (s1
& 0xFFFF0000)|((s2
& 0xFFFF0000)>>16);
100 t
= (t1
& 0xFFFF0000)|((t2
& 0xFFFF0000)>>16);
106 s
= (s1
& 0x0000FF00) | (s2
& 0x0000FF00)>>8;
107 t
= (t1
& 0x0000FF00) | (t2
& 0x0000FF00)>>8;
113 s
= (s1
& 0x000000FF)<<8 | (s2
& 0x000000FF);
114 t
= (t1
& 0x000000FF)<<8 | (t2
& 0x000000FF);
/*
 * Searches the inverse UCA table for the row matching (CE, SecondCE).
 *
 * The table is addressed as rows of three uint32_t words located
 * src->invUCA->table bytes past the start of the header. Word 0 and word 1
 * of row i are compared against the target pair with compareCEs(). The
 * search window starts as bottom = 0, top = src->invUCA->tableSize and the
 * loop runs while bottom < top-1.
 *
 * NOTE(review): the embedded listing numbers jump (126->128, 129->132,
 * 136->147, 152->155), so the declarations of i and res, the steps that pick
 * i and narrow the window, and the return statement are not shown. The
 * comment kept below says a failed exact match is deliberately not reported;
 * confirm the actual return value against the full source.
 */
125 int32_t ucol_inv_findCE(const UColTokenParser
*src
, uint32_t CE
, uint32_t SecondCE
) {
126 uint32_t bottom
= 0, top
= src
->invUCA
->tableSize
;
128 uint32_t first
= 0, second
= 0;
129 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
132 while(bottom
< top
-1) {
/* load lead CE and continuation CE of row i and compare with the target */
134 first
= *(CETable
+3*i
);
135 second
= *(CETable
+3*i
+1);
136 res
= compareCEs(first
, second
, CE
, SecondCE
);
147 /* in searching for elements, I have removed the failure */
148 /* The reason for this is that the builder does not rely */
149 /* on search mechanism telling it that it didn't find an */
150 /* element. However, indirect positioning relies on being */
151 /* able to find the elements around any CE, even if it is */
152 /* not defined in the UCA. */
155 if((first == CE && second == SecondCE)) {
/* Per-strength bit masks, indexed by strength value. Callers in this file
 * AND a CE and its continuation with strengthMask[strength] before comparing
 * them. NOTE(review): the initializer values are not part of this listing. */
163 static const uint32_t strengthMask
[UCOL_CE_STRENGTH_LIMIT
] = {
/*
 * Finds the inverse-UCA row that follows (CE, contCE) at a given strength.
 *
 * Looks up the starting row with ucol_inv_findCE(), masks CE and contCE with
 * strengthMask[strength], then steps forward one row at a time (++iCE),
 * loading word 0 of the row into *nextCE and word 1 into *nextContCE, until
 * the masked row no longer equals the masked input.
 *
 * src                 parser state; supplies src->invUCA (header + table offset)
 * CE, contCE          lead CE and continuation CE to start from
 * nextCE, nextContCE  outputs; *nextCE is set to UCOL_NOT_FOUND on one path
 *                     (NOTE(review): its guarding condition is not shown in
 *                     this listing - presumably a negative lookup result)
 *
 * NOTE(review): the strength parameter, the initial assignment of *nextCE
 * before the loop and the return statement are on lines missing from this
 * listing. Unlike ucol_inv_getPrevCE, the visible loop condition puts no
 * bound on iCE, so it depends on the table ending with a row that differs
 * from every searched CE - confirm.
 */
169 U_CAPI
int32_t U_EXPORT2
ucol_inv_getNextCE(const UColTokenParser
*src
,
170 uint32_t CE
, uint32_t contCE
,
171 uint32_t *nextCE
, uint32_t *nextContCE
,
174 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
177 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
180 *nextCE
= UCOL_NOT_FOUND
;
/* from here on compare only the levels selected by strength */
184 CE
&= strengthMask
[strength
];
185 contCE
&= strengthMask
[strength
];
188 *nextContCE
= contCE
;
190 while((*nextCE
& strengthMask
[strength
]) == CE
191 && (*nextContCE
& strengthMask
[strength
]) == contCE
)
193 *nextCE
= (*(CETable
+3*(++iCE
)));
194 *nextContCE
= (*(CETable
+3*(iCE
)+1));
/*
 * Finds the inverse-UCA row that precedes (CE, contCE) at a given strength.
 *
 * Mirror image of ucol_inv_getNextCE: after the ucol_inv_findCE() lookup and
 * masking with strengthMask[strength], it steps backwards (--iCE), loading
 * word 0 of the row into *prevCE and word 1 into *prevContCE, until the
 * masked row differs from the masked input or iCE reaches 0.
 *
 * src                 parser state; supplies src->invUCA (header + table offset)
 * CE, contCE          lead CE and continuation CE to start from
 * prevCE, prevContCE  outputs; *prevCE is set to UCOL_NOT_FOUND on one path
 *                     (NOTE(review): its guarding condition is not shown in
 *                     this listing - presumably a negative lookup result)
 *
 * NOTE(review): the strength parameter, the initial assignment of *prevCE
 * before the loop and the return statement are on lines missing from this
 * listing.
 */
200 U_CFUNC
int32_t U_EXPORT2
ucol_inv_getPrevCE(const UColTokenParser
*src
,
201 uint32_t CE
, uint32_t contCE
,
202 uint32_t *prevCE
, uint32_t *prevContCE
,
205 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
208 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
211 *prevCE
= UCOL_NOT_FOUND
;
/* from here on compare only the levels selected by strength */
215 CE
&= strengthMask
[strength
];
216 contCE
&= strengthMask
[strength
];
219 *prevContCE
= contCE
;
221 while((*prevCE
& strengthMask
[strength
]) == CE
222 && (*prevContCE
& strengthMask
[strength
])== contCE
223 && iCE
> 0) /* this condition should prevent falling off the edge of the world */
225 /* here, we end up in a singularity - zero */
226 *prevCE
= (*(CETable
+3*(--iCE
)));
227 *prevContCE
= (*(CETable
+3*(iCE
)+1));
233 U_CFUNC
uint32_t U_EXPORT2
ucol_getCEStrengthDifference(uint32_t CE
, uint32_t contCE
,
234 uint32_t prevCE
, uint32_t prevContCE
)
236 if(prevCE
== CE
&& prevContCE
== contCE
) {
237 return UCOL_IDENTICAL
;
239 if((prevCE
& strengthMask
[UCOL_PRIMARY
]) != (CE
& strengthMask
[UCOL_PRIMARY
])
240 || (prevContCE
& strengthMask
[UCOL_PRIMARY
]) != (contCE
& strengthMask
[UCOL_PRIMARY
]))
244 if((prevCE
& strengthMask
[UCOL_SECONDARY
]) != (CE
& strengthMask
[UCOL_SECONDARY
])
245 || (prevContCE
& strengthMask
[UCOL_SECONDARY
]) != (contCE
& strengthMask
[UCOL_SECONDARY
]))
247 return UCOL_SECONDARY
;
249 return UCOL_TERTIARY
;
/*
 * Token-list variant of the "previous CE" lookup: starts from lh->baseCE /
 * lh->baseContCE, finds the row with ucol_inv_findCE(), walks backwards
 * through the inverse UCA table (--iCE) while the row still equals the base
 * CE pair under strengthMask[strength], and stores the row it stops on in
 * lh->previousCE / lh->previousContCE.
 *
 * NOTE(review): lines are missing from this listing (the declaration of iCE,
 * any handling of the lookup result, the return statement). As shown,
 * previousCE is read by the loop condition before any visible assignment and
 * the loop has no lower bound on iCE - confirm both against the full source,
 * and confirm whether this function is live code at all.
 */
254 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
256 uint32_t CE = lh->baseCE;
257 uint32_t SecondCE = lh->baseContCE;
259 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
260 uint32_t previousCE, previousContCE;
263 iCE = ucol_inv_findCE(src, CE, SecondCE);
269 CE &= strengthMask[strength];
270 SecondCE &= strengthMask[strength];
273 previousContCE = SecondCE;
275 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
276 previousCE = (*(CETable+3*(--iCE)));
277 previousContCE = (*(CETable+3*(iCE)+1));
279 lh->previousCE = previousCE;
280 lh->previousContCE = previousContCE;
/*
 * Token-list variant of the "next CE" lookup: starts from lh->baseCE /
 * lh->baseContCE, finds the row with ucol_inv_findCE(), walks forward
 * through the inverse UCA table (++iCE) while the row still equals the base
 * CE pair under strengthMask[strength], and stores the continuation word of
 * the row it stops on in lh->nextContCE.
 *
 * The int32_t result is stored by ucol_inv_getGapPositions() in
 * lh->pos[strength], which treats a value >= 0 as success.
 *
 * NOTE(review): lines are missing from this listing (the declaration of iCE,
 * handling of the lookup result, the initial value of nextCE, the store to
 * lh->nextCE and the return statement). The visible loop has no upper bound
 * on iCE - confirm against the full source.
 */
286 inline int32_t ucol_inv_getNext(UColTokenParser
*src
, UColTokListHeader
*lh
, uint32_t strength
) {
287 uint32_t CE
= lh
->baseCE
;
288 uint32_t SecondCE
= lh
->baseContCE
;
290 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
291 uint32_t nextCE
, nextContCE
;
294 iCE
= ucol_inv_findCE(src
, CE
, SecondCE
);
/* from here on compare only the levels selected by strength */
300 CE
&= strengthMask
[strength
];
301 SecondCE
&= strengthMask
[strength
];
304 nextContCE
= SecondCE
;
306 while((nextCE
& strengthMask
[strength
]) == CE
307 && (nextContCE
& strengthMask
[strength
]) == SecondCE
)
309 nextCE
= (*(CETable
+3*(++iCE
)));
310 nextContCE
= (*(CETable
+3*(iCE
)+1));
314 lh
->nextContCE
= nextContCE
;
/*
 * Computes, for one token list, the lower and upper weight bounds ("gaps")
 * between which new weights may be allocated at each of the three levels.
 *
 * lh->gapsLo and lh->gapsHi hold three words per strength st, at indexes
 * 3*st, 3*st+1 and 3*st+2. Each word is built from a lead CE t1 and a
 * continuation CE t2:
 *   primary   = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16
 *   secondary = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8
 *   tertiary  = tertiary bits of t1 << 24 | tertiary bits of t2 << 16
 *
 * Three cases are visible:
 *  1. The lead byte of lh->baseCE lies in the implicit range taken from the
 *     UCA constants: the low bound is the base CE and the high bound is the
 *     next implicit primary (raw value + 1) with 0x0505 in the low 16 bits.
 *  2. lh->indirect is TRUE and lh->nextCE is non-zero: the low bound comes
 *     from the base CE, the high bound from t1 and lh->nextContCE.
 *  3. Otherwise: for every run of tokens of one strength the position of the
 *     next CE is looked up with ucol_inv_getNext() and recorded in lh->pos[],
 *     together with the first/last token of the run (lh->fStrToken[] /
 *     lh->lStrToken[]); an interval equal to the next weaker one is merged
 *     into it. Then, for each strength with pos >= 0, the high bound is read
 *     from table row pos and the low bound is filled in.
 *
 * *status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns
 * a negative position.
 *
 * NOTE(review): many lines are missing from this listing (the listing
 * numbers jump), including local declarations, the t1 assignments of cases 1
 * and 2, the reset of lh->pos[] and the loop that advances tok - confirm
 * details against the full source.
 */
319 static void ucol_inv_getGapPositions(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
320 /* reset all the gaps */
322 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
327 UColToken
*tok
= lh
->first
;
328 uint32_t tokStrength
= tok
->strength
;
330 for(i
= 0; i
<3; i
++) {
332 lh
->gapsHi
[3*i
+1] = 0;
333 lh
->gapsHi
[3*i
+2] = 0;
335 lh
->gapsLo
[3*i
+1] = 0;
336 lh
->gapsLo
[3*i
+2] = 0;
338 lh
->fStrToken
[i
] = NULL
;
339 lh
->lStrToken
[i
] = NULL
;
343 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
/* case 1: the base CE is an implicit primary */
345 if((lh
->baseCE
& 0xFF000000)>= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) && (lh
->baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
346 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
349 t2
= lh
->baseContCE
& UCOL_REMOVE_CONTINUATION
;
350 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
351 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
352 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
/* the upper bound is the implicit primary that follows the base one */
353 uint32_t primaryCE
= (t1
& UCOL_PRIMARYMASK
) | ((t2
& UCOL_PRIMARYMASK
) >> 16);
354 primaryCE
= uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE
)+1);
356 t1
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
357 t2
= (primaryCE
<< 16) & UCOL_PRIMARYMASK
; // | UCOL_CONTINUATION_MARKER;
359 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
360 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
361 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
/* case 2: indirect positioning with a known next CE */
362 } else if(lh
->indirect
== TRUE
&& lh
->nextCE
!= 0) {
363 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
366 t2
= lh
->baseContCE
&UCOL_REMOVE_CONTINUATION
;
367 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
368 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
369 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
371 t2
= lh
->nextContCE
&UCOL_REMOVE_CONTINUATION
;
372 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
373 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
374 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
/* case 3: look the positions up in the inverse UCA table, one strength run at a time */
377 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
378 if((lh
->pos
[tokStrength
] = ucol_inv_getNext(src
, lh
, tokStrength
)) >= 0) {
379 lh
->fStrToken
[tokStrength
] = tok
;
380 } else { /* The CE must be implicit, since it's not in the table */
382 *status
= U_INTERNAL_PROGRAM_ERROR
;
386 while(tok
!= NULL
&& tok
->strength
>= tokStrength
) {
387 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
388 lh
->lStrToken
[tokStrength
] = tok
;
392 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
-1) {
393 /* check if previous interval is the same and merge the intervals if it is so */
394 if(lh
->pos
[tokStrength
] == lh
->pos
[tokStrength
+1]) {
395 lh
->fStrToken
[tokStrength
] = lh
->fStrToken
[tokStrength
+1];
396 lh
->fStrToken
[tokStrength
+1] = NULL
;
397 lh
->lStrToken
[tokStrength
+1] = NULL
;
398 lh
->pos
[tokStrength
+1] = -1;
402 tokStrength
= tok
->strength
;
/* turn every recorded table position into a high/low bound */
407 for(st
= 0; st
< 3; st
++) {
408 if((pos
= lh
->pos
[st
]) >= 0) {
409 t1
= *(CETable
+3*(pos
));
410 t2
= *(CETable
+3*(pos
)+1);
411 lh
->gapsHi
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
412 lh
->gapsHi
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
413 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
414 lh
->gapsHi
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
416 //t1 = *(CETable+3*(pos));
417 //t2 = *(CETable+3*(pos)+1);
420 lh
->gapsLo
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
421 lh
->gapsLo
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
422 lh
->gapsLo
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
/*
 * ucol_countBytes(value, noOfBytes)
 *
 * Stores in noOfBytes how many bytes of the left-aligned 32-bit weight
 * `value` are in use: the 1-based position, counted from the most
 * significant byte, of its lowest non-zero byte (0 when value is 0).
 * Examples: 0x05000000 -> 1, 0x12340000 -> 2, 0x00000001 -> 4.
 *
 * `value` is evaluated several times and `noOfBytes` must be an lvalue.
 * The do { } while (0) wrapper makes the macro behave as one statement, and
 * the local uses a name that cannot collide with a caller's variable.
 *
 * NOTE(review): only three lines of this macro survived in the extracted
 * listing. The loop and the right shift of the mask were restored from the
 * way ucol_doCE() consumes the count - confirm against the upstream file.
 */
#define ucol_countBytes(value, noOfBytes)                   \
    do {                                                    \
        uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;          \
        (noOfBytes) = 0;                                    \
        while(ucolCountBytesMask_ != 0) {                   \
            if(((value) & ucolCountBytesMask_) != 0) {      \
                (noOfBytes)++;                              \
            }                                               \
            ucolCountBytesMask_ >>= 8;                      \
        }                                                   \
    } while(0)
441 static uint32_t ucol_getNextGenerated(ucolCEGenerator
*g
, UErrorCode
*status
) {
442 if(U_SUCCESS(*status
)) {
443 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/*
 * Sets up a weight generator for a level that starts from the "common"
 * weight rather than from a gap taken from the UCA.
 *
 * Visible behaviour:
 *  - maxByte is 0x3F for UCOL_TERTIARY and 0xFF otherwise;
 *  - for UCOL_SECONDARY, low is UCOL_COMMON_TOP2<<24 and the default count is
 *    0xFF - UCOL_COMMON_TOP2; otherwise low is UCOL_BYTE_COMMON<<24 and the
 *    default count is 0x40 - UCOL_BYTE_COMMON;
 *  - when the next token has the same strength, count becomes that token's
 *    toInsert;
 *  - the ranges are allocated with ucol_allocWeights() and g->current is set
 *    to UCOL_BYTE_COMMON<<24;
 *  - *status becomes U_INTERNAL_PROGRAM_ERROR when no range was allocated.
 *
 * g        generator to initialise
 * tok      token whose successor is inspected for the count
 * strength level being generated
 * status   ICU error code
 *
 * NOTE(review): the assignments to high and the return statement are on
 * lines missing from this listing.
 */
448 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator
*g
, UColToken
*tok
, uint32_t strength
, UErrorCode
*status
) {
449 /* TODO: rename to enum names */
450 uint32_t high
, low
, count
=1;
451 uint32_t maxByte
= (strength
== UCOL_TERTIARY
)?0x3F:0xFF;
453 if(strength
== UCOL_SECONDARY
) {
454 low
= UCOL_COMMON_TOP2
<<24;
456 count
= 0xFF - UCOL_COMMON_TOP2
;
458 low
= UCOL_BYTE_COMMON
<< 24; //0x05000000;
460 count
= 0x40 - UCOL_BYTE_COMMON
;
/* a following token of the same strength dictates how many weights are needed */
463 if(tok
->next
!= NULL
&& tok
->next
->strength
== strength
) {
464 count
= tok
->next
->toInsert
;
467 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
468 g
->current
= UCOL_BYTE_COMMON
<<24;
470 if(g
->noOfRanges
== 0) {
471 *status
= U_INTERNAL_PROGRAM_ERROR
;
/*
 * Sets up a weight generator for tok's strength from the gap bounds
 * computed earlier.
 *
 * low and high are read from lows[] / highs[] at index fStrength*3+strength
 * and count from tok->toInsert. Visible adjustments before allocation:
 *  - when low >= high at a non-primary strength, other entries of the same
 *    fStrength row are inspected (index fStrength*3+s); where lows and highs
 *    differ there, low is raised to at least UCOL_COMMON_TOP2<<24
 *    (secondary) or UCOL_COMMON_BOT3<<24 (other branch);
 *    U_INTERNAL_PROGRAM_ERROR is set on one path of this block;
 *  - low below 0x02000000 is special-cased, because weight byte 02 must not
 *    be used;
 *  - at secondary strength, low and high lying inside the common range
 *    (UCOL_COMMON_BOT2..UCOL_COMMON_TOP2) are moved to UCOL_COMMON_TOP2<<24,
 *    and for low below UCOL_COMMON_BOT2<<24 the ranges are allocated
 *    starting at UCOL_BYTE_UNSHIFTED_MIN<<24.
 * The ranges are allocated with ucol_allocWeights(); zero ranges sets
 * U_INTERNAL_PROGRAM_ERROR. g->current is loaded with ucol_nextWeight().
 *
 * NOTE(review): many lines are missing from this listing (the maxByte
 * assignments, the loop over s, several branch bodies and the return
 * statement), so the exact nesting of the blocks above is not shown.
 */
476 static uint32_t ucol_getCEGenerator(ucolCEGenerator
*g
, uint32_t* lows
, uint32_t* highs
, UColToken
*tok
, uint32_t fStrength
, UErrorCode
*status
) {
477 uint32_t strength
= tok
->strength
;
478 uint32_t low
= lows
[fStrength
*3+strength
];
479 uint32_t high
= highs
[fStrength
*3+strength
];
480 uint32_t maxByte
= 0;
481 if(strength
== UCOL_TERTIARY
) {
483 } else if(strength
== UCOL_PRIMARY
) {
489 uint32_t count
= tok
->toInsert
;
/* empty gap at a non-primary level */
491 if(low
>= high
&& strength
> UCOL_PRIMARY
) {
492 int32_t s
= strength
;
495 if(lows
[fStrength
*3+s
] != highs
[fStrength
*3+s
]) {
496 if(strength
== UCOL_SECONDARY
) {
497 if (low
< UCOL_COMMON_TOP2
<<24 ) {
498 // Override if low range is less than UCOL_COMMON_TOP2.
499 low
= UCOL_COMMON_TOP2
<<24;
503 // Override if low range is less than UCOL_COMMON_BOT3.
504 if ( low
< UCOL_COMMON_BOT3
<<24 ) {
505 low
= UCOL_COMMON_BOT3
<<24;
512 *status
= U_INTERNAL_PROGRAM_ERROR
;
518 if(low
< 0x02000000) {
519 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
520 // See http://site.icu-project.org/design/collation/bytes
524 if(strength
== UCOL_SECONDARY
) { /* similar as simple */
525 if(low
>= (UCOL_COMMON_BOT2
<<24) && low
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
526 low
= UCOL_COMMON_TOP2
<<24;
528 if(high
> (UCOL_COMMON_BOT2
<<24) && high
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
529 high
= UCOL_COMMON_TOP2
<<24;
531 if(low
< (UCOL_COMMON_BOT2
<<24)) {
532 g
->noOfRanges
= ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN
<<24, high
, count
, maxByte
, g
->ranges
);
533 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
534 //g->current = UCOL_COMMON_BOT2<<24;
/* general case: allocate between low and high */
539 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
540 if(g
->noOfRanges
== 0) {
541 *status
= U_INTERNAL_PROGRAM_ERROR
;
543 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/*
 * Processes sourceLen UChars from source, with resBuf/resLen as the output
 * buffer; going by the name and the case labels, small kana are mapped to
 * their large forms.
 *
 * Visible behaviour: nothing is done when *status is already a failure; when
 * sourceLen > resLen, *status is set to U_MEMORY_ALLOCATION_ERROR; only
 * characters in U+3041..U+30FA are examined, and the listed case labels
 * (0x41, 0x43, ... 0xEE) select the characters that get special treatment.
 *
 * NOTE(review): the switch expression, the case bodies, the store into
 * resBuf and the return statements are on lines missing from this listing.
 * NOTE(review): the visible labels do not include 0x87 or 0xE7 (small YO,
 * U+3087 / U+30E7), although the neighbouring small YA/YU (0x83/0x85,
 * 0xE3/0xE5) are listed - confirm whether that is handled on the missing
 * lines or is an omission.
 * NOTE(review): a too-small output buffer is reported as
 * U_MEMORY_ALLOCATION_ERROR rather than a buffer-overflow code.
 */
548 uint32_t u_toLargeKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
552 if(U_FAILURE(*status
)) {
556 if(sourceLen
> resLen
) {
557 *status
= U_MEMORY_ALLOCATION_ERROR
;
561 for(i
= 0; i
< sourceLen
; i
++) {
563 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
565 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
566 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
/*
 * Processes sourceLen UChars from source, with resBuf/resLen as the output
 * buffer; going by the name and the case labels, large kana that have a
 * small form are mapped to it.
 *
 * Visible behaviour: nothing is done when *status is already a failure; when
 * sourceLen > resLen, *status is set to U_MEMORY_ALLOCATION_ERROR; only
 * characters in U+3041..U+30FA are examined, and the listed case labels
 * (0x42, 0x44, ... 0xEF) select the characters that get special treatment.
 *
 * NOTE(review): the switch expression, the case bodies, the store into
 * resBuf and the return statements are on lines missing from this listing.
 * NOTE(review): the visible labels do not include 0x88 or 0xE8 (large YO,
 * U+3088 / U+30E8), although the neighbouring large YA/YU (0x84/0x86,
 * 0xE4/0xE6) are listed - confirm whether that is handled on the missing
 * lines or is an omission.
 * NOTE(review): a too-small output buffer is reported as
 * U_MEMORY_ALLOCATION_ERROR rather than a buffer-overflow code.
 */
583 uint32_t u_toSmallKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
587 if(U_FAILURE(*status
)) {
591 if(sourceLen
> resLen
) {
592 *status
= U_MEMORY_ALLOCATION_ERROR
;
596 for(i
= 0; i
< sourceLen
; i
++) {
598 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
600 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
601 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
/*
 * Derives the case bits for a tailored string from the UCA.
 *
 * The string is normalised to NFKD into the local buffer n (capacity 128).
 * For each resulting code unit the first CE is fetched from the UCA
 * collator: a continuation CE there is an error (U_INTERNAL_PROGRAM_ERROR);
 * a CE whose case bits equal UCOL_UPPER_CASE is classified one way, a unit
 * for which u_islower() is true another way, and the remaining units are
 * run through u_toSmallKana()/u_toLargeKana() and tested for being
 * unchanged by the small mapping but changed by the large one.
 *
 * Returns UCOL_MIXED_CASE when both uCount and lCount are non-zero,
 * UCOL_UPPER_CASE when only uCount is, UCOL_LOWER_CASE otherwise (also on
 * entry with a failed *status and on the continuation error).
 *
 * NOTE(review): the statements that increment uCount / lCount and several
 * local declarations (n, nLen, s, order, sk, lk, i) are on lines missing
 * from this listing; which branch feeds which counter is presumably
 * upper -> uCount, lower and small kana -> lCount - confirm.
 */
620 uint8_t ucol_uprv_getCaseBits(const UCollator
*UCA
, const UChar
*src
, uint32_t len
, UErrorCode
*status
) {
624 uint32_t uCount
= 0, lCount
= 0;
629 if(U_FAILURE(*status
)) {
630 return UCOL_LOWER_CASE
;
633 nLen
= unorm_normalize(src
, len
, UNORM_NFKD
, 0, n
, 128, status
);
634 if(U_SUCCESS(*status
)) {
635 for(i
= 0; i
< nLen
; i
++) {
/* look at the first UCA CE of this single code unit */
636 uprv_init_collIterate(UCA
, &n
[i
], 1, &s
, status
);
637 order
= ucol_getNextCE(UCA
, &s
, status
);
638 if(isContinuation(order
)) {
639 *status
= U_INTERNAL_PROGRAM_ERROR
;
640 return UCOL_LOWER_CASE
;
642 if((order
&UCOL_CASE_BIT_MASK
)== UCOL_UPPER_CASE
) {
645 if(u_islower(n
[i
])) {
647 } else if(U_SUCCESS(*status
)) {
649 u_toSmallKana(&n
[i
], 1, sk
, 1, status
);
650 u_toLargeKana(&n
[i
], 1, lk
, 1, status
);
651 if(sk
[0] == n
[i
] && lk
[0] != n
[i
]) {
659 if(uCount
!= 0 && lCount
!= 0) {
660 return UCOL_MIXED_CASE
;
661 } else if(uCount
!= 0) {
662 return UCOL_UPPER_CASE
;
664 return UCOL_LOWER_CASE
;
/*
 * Packs the three left-aligned weight words in CEparts[] (primary,
 * secondary, tertiary) into the token's CE array tok->CEs[].
 *
 * ucol_countBytes() gives the number of bytes used by each part. Each
 * output CE number CEi takes 16 bits of primary (while 2*CEi is below the
 * primary byte count), 8 bits of secondary and 6 bits (mask 0x3F) of
 * tertiary; UCOL_CONTINUATION_MARKER is used as a starting value on one
 * path. CEi == 0 after the loop means the token is totally ignorable.
 *
 * Case bits: when tok->CEs[0] is non-zero, bits 0xC0 are cleared and then
 * filled either from ucol_uprv_getCaseBits() on the token's source string
 * or from the first UCA CE of its first code unit (caseCE & 0xC0). The
 * source string is located through tok->source: length in the top 8 bits,
 * offset into src->source in the low 24 bits.
 *
 * The fprintf calls dump the token and its CEs to stderr.
 *
 * NOTE(review): lines are missing from this listing - the declarations of
 * CEi, value and i, the condition choosing between the two case-bit
 * sources, the assignments after the loop and the preprocessor guards that
 * presumably surround the debug output.
 */
669 U_CFUNC
void ucol_doCE(UColTokenParser
*src
, uint32_t *CEparts
, UColToken
*tok
, UErrorCode
*status
) {
670 /* this one makes the table and stuff */
671 uint32_t noOfBytes
[3];
674 for(i
= 0; i
<3; i
++) {
675 ucol_countBytes(CEparts
[i
], noOfBytes
[i
]);
678 /* Here we have to pack CEs from parts */
683 while(2*CEi
<noOfBytes
[0] || CEi
<noOfBytes
[1] || CEi
<noOfBytes
[2]) {
685 value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
690 if(2*CEi
<noOfBytes
[0]) {
691 value
|= ((CEparts
[0]>>(32-16*(CEi
+1))) & 0xFFFF) << 16;
693 if(CEi
<noOfBytes
[1]) {
694 value
|= ((CEparts
[1]>>(32-8*(CEi
+1))) & 0xFF) << 8;
696 if(CEi
<noOfBytes
[2]) {
697 value
|= ((CEparts
[2]>>(32-8*(CEi
+1))) & 0x3F);
699 tok
->CEs
[CEi
] = value
;
702 if(CEi
== 0) { /* totally ignorable */
705 } else { /* there is at least something */
710 // we want to set case bits here and now, not later.
711 // Case bits handling
712 if(tok
->CEs
[0] != 0) { // case bits should be set only for non-ignorables
713 tok
->CEs
[0] &= 0xFFFFFF3F; // Clean the case bits field
/* tok->source packs the length (top 8 bits) and the offset into the rules (low 24 bits) */
714 int32_t cSize
= (tok
->source
& 0xFF000000) >> 24;
715 UChar
*cPoints
= (tok
->source
& 0x00FFFFFF) + src
->source
;
719 tok
->CEs
[0] |= ucol_uprv_getCaseBits(src
->UCA
, cPoints
, cSize
, status
);
721 // Copy it from the UCA
722 uint32_t caseCE
= ucol_getFirstCE(src
->UCA
, cPoints
[0], status
);
723 tok
->CEs
[0] |= (caseCE
& 0xC0);
728 fprintf(stderr
, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok
->debugSource
, tok
->strength
, CEparts
[0] >> (32-8*noOfBytes
[0]), CEparts
[1] >> (32-8*noOfBytes
[1]), CEparts
[2]>> (32-8*noOfBytes
[2]));
729 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
730 fprintf(stderr
, "%08X ", tok
->CEs
[i
]);
732 fprintf(stderr
, "\n");
/*
 * Assigns collation elements to every token of one token list.
 *
 * Phase 1 walks the list backwards from lh->last and fills tok->toInsert
 * from the per-strength counters t[]: moving to a stronger predecessor
 * resets the counter of the current strength and increments the
 * predecessor's, moving to a weaker predecessor sets that strength's
 * counter to 1.
 *
 * Phase 2 calls ucol_inv_getGapPositions() to obtain the weight gaps.
 *
 * Phase 3 seeds CEparts[] from lh->baseCE / lh->baseContCE and walks the
 * tokens forward while *status is a success code. When a token is stronger
 * than anything seen so far (fStrength < initStrength), the generators are
 * (re)initialised from the gap of the nearest strength that has a valid
 * lh->pos[] entry (U_INTERNAL_PROGRAM_ERROR if there is none): the levels
 * above initStrength are copied from lh->gapsLo, the initStrength level
 * uses ucol_getCEGenerator(), the levels below use
 * ucol_getSimpleCEGenerator(). Otherwise the generator of the token's own
 * strength is advanced with ucol_getNextGenerated() and the weaker levels
 * are restarted with ucol_getSimpleCEGenerator(). Each token is then packed
 * by ucol_doCE().
 *
 * The fprintf calls dump the base CE, the gaps and the token strengths /
 * toInsert counts to stderr.
 *
 * NOTE(review): many lines are missing from this listing - the statements
 * that step tok, the else that separates the two generator paths, local
 * declarations and the preprocessor guards that presumably surround the
 * debug output.
 */
736 U_CFUNC
void ucol_initBuffers(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
737 ucolCEGenerator Gens
[UCOL_CE_STRENGTH_LIMIT
];
738 uint32_t CEparts
[UCOL_CE_STRENGTH_LIMIT
];
740 UColToken
*tok
= lh
->last
;
741 uint32_t t
[UCOL_STRENGTH_LIMIT
];
743 uprv_memset(t
, 0, UCOL_STRENGTH_LIMIT
*sizeof(uint32_t));
746 t
[tok
->strength
] = 1;
/* phase 1: count, back to front, how many weights each token needs */
748 while(tok
->previous
!= NULL
) {
749 if(tok
->previous
->strength
< tok
->strength
) { /* going up */
750 t
[tok
->strength
] = 0;
751 t
[tok
->previous
->strength
]++;
752 } else if(tok
->previous
->strength
> tok
->strength
) { /* going down */
753 t
[tok
->previous
->strength
] = 1;
758 tok
->toInsert
= t
[tok
->strength
];
761 tok
->toInsert
= t
[tok
->strength
];
/* phase 2: find the gaps the new weights have to fit into */
762 ucol_inv_getGapPositions(src
, lh
, status
);
765 fprintf(stderr
, "BaseCE: %08X %08X\n", lh
->baseCE
, lh
->baseContCE
);
767 for(j
= 2; j
>= 0; j
--) {
768 fprintf(stderr
, "gapsLo[%i] [%08X %08X %08X]\n", j
, lh
->gapsLo
[j
*3], lh
->gapsLo
[j
*3+1], lh
->gapsLo
[j
*3+2]);
769 fprintf(stderr
, "gapsHi[%i] [%08X %08X %08X]\n", j
, lh
->gapsHi
[j
*3], lh
->gapsHi
[j
*3+1], lh
->gapsHi
[j
*3+2]);
771 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
774 fprintf(stderr
,"%i", tok
->strength
);
776 } while(tok
!= NULL
);
777 fprintf(stderr
, "\n");
779 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
782 fprintf(stderr
,"%i", tok
->toInsert
);
784 } while(tok
!= NULL
);
/* phase 3: generate the weights, front to back */
788 uint32_t fStrength
= UCOL_IDENTICAL
;
789 uint32_t initStrength
= UCOL_IDENTICAL
;
792 CEparts
[UCOL_PRIMARY
] = (lh
->baseCE
& UCOL_PRIMARYMASK
) | (lh
->baseContCE
& UCOL_PRIMARYMASK
) >> 16;
793 CEparts
[UCOL_SECONDARY
] = (lh
->baseCE
& UCOL_SECONDARYMASK
) << 16 | (lh
->baseContCE
& UCOL_SECONDARYMASK
) << 8;
794 CEparts
[UCOL_TERTIARY
] = (UCOL_TERTIARYORDER(lh
->baseCE
)) << 24 | (UCOL_TERTIARYORDER(lh
->baseContCE
)) << 16;
796 while (tok
!= NULL
&& U_SUCCESS(*status
)) {
797 fStrength
= tok
->strength
;
798 if(fStrength
< initStrength
) {
799 initStrength
= fStrength
;
800 if(lh
->pos
[fStrength
] == -1) {
801 while(lh
->pos
[fStrength
] == -1 && fStrength
> 0) {
804 if(lh
->pos
[fStrength
] == -1) {
805 *status
= U_INTERNAL_PROGRAM_ERROR
;
809 if(initStrength
== UCOL_TERTIARY
) { /* starting with tertiary */
810 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
811 CEparts
[UCOL_SECONDARY
] = lh
->gapsLo
[fStrength
*3+1];
812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
813 CEparts
[UCOL_TERTIARY
] = ucol_getCEGenerator(&Gens
[UCOL_TERTIARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
814 } else if(initStrength
== UCOL_SECONDARY
) { /* secondaries */
815 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
817 CEparts
[UCOL_SECONDARY
] = ucol_getCEGenerator(&Gens
[UCOL_SECONDARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
818 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
819 } else { /* primaries */
820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
821 CEparts
[UCOL_PRIMARY
] = ucol_getCEGenerator(&Gens
[UCOL_PRIMARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
822 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
823 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
/* advance the generator of the token's own strength, restart the weaker ones */
826 if(tok
->strength
== UCOL_TERTIARY
) {
827 CEparts
[UCOL_TERTIARY
] = ucol_getNextGenerated(&Gens
[UCOL_TERTIARY
], status
);
828 } else if(tok
->strength
== UCOL_SECONDARY
) {
829 CEparts
[UCOL_SECONDARY
] = ucol_getNextGenerated(&Gens
[UCOL_SECONDARY
], status
);
830 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
831 } else if(tok
->strength
== UCOL_PRIMARY
) {
832 CEparts
[UCOL_PRIMARY
] = ucol_getNextGenerated(&Gens
[UCOL_PRIMARY
], status
);
833 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
834 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
837 ucol_doCE(src
, CEparts
, tok
, status
);
/*
 * Turns every token of one token list into a UCAElements record and adds it
 * to the temporary table t.
 *
 * For each token, while *status is a success code:
 *  1. Expansion (tok->expansion != 0; length in the top 8 bits, offset into
 *     the rules in the low 24 bits): the expansion text is consumed piece
 *     by piece. The longest prefix of the remaining text that is itself
 *     tailored (found in src->tailored and not a reset) contributes its CEs;
 *     when no tailored piece is found, CEs for one code unit are taken from
 *     the UCA until UCOL_NO_MORE_CES. Everything is appended to tok->expCEs.
 *  2. The element's CEs are tok->CEs followed by tok->expCEs.
 *  3. Prefix and source text are copied out of src->source. With a prefix,
 *     the source length and start are reduced by the prefix length
 *     (tok->prefix >> 24).
 *  4. When src->UCA is set: a Jamo in the source text sets
 *     t->image->jamoSpecial, and a non-zero trailing combining class (low
 *     byte of the FCD value of the last character) sets src->buildCCTabFlag.
 *  5. The element is added with uprv_uca_addAnElement(). Under
 *     UCOL_DEBUG_DUPLICATES a non-zero status is reported to stderr and
 *     reset.
 *
 * NOTE(review): lines are missing from this listing - the declarations of
 * el, exp, s and order, the loop that drains the UCA iterator, the else
 * branch between steps 1's two sources, the advance to the next token and
 * some preprocessor guards.
 */
842 U_CFUNC
void ucol_createElements(UColTokenParser
*src
, tempUCATable
*t
, UColTokListHeader
*lh
, UErrorCode
*status
) {
844 UColToken
*tok
= lh
->first
;
845 UColToken
*expt
= NULL
;
846 uint32_t i
= 0, j
= 0;
847 UChar32 fcdHighStart
;
848 const uint16_t *fcdTrieIndex
= unorm_getFCDTrieIndex(fcdHighStart
, status
);
850 while(tok
!= NULL
&& U_SUCCESS(*status
)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing */
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok
->expansion
!= 0) {
855 uint32_t len
= tok
->expansion
>> 24;
856 uint32_t currentSequenceLen
= len
;
857 uint32_t expOffset
= tok
->expansion
& 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
860 exp
.source
= currentSequenceLen
| expOffset
;
861 exp
.rulesToParseHdl
= &(src
->source
);
864 currentSequenceLen
= len
;
/* try the longest remaining piece first, then shorter ones */
865 while(currentSequenceLen
> 0) {
866 exp
.source
= (currentSequenceLen
<< 24) | expOffset
;
867 if((expt
= (UColToken
*)uhash_get(src
->tailored
, &exp
)) != NULL
&& expt
->strength
!= UCOL_TOK_RESET
) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy
= expt
->noOfCEs
;
869 for(j
= 0; j
<noOfCEsToCopy
; j
++) {
870 tok
->expCEs
[tok
->noOfExpCEs
+ j
] = expt
->CEs
[j
];
872 tok
->noOfExpCEs
+= noOfCEsToCopy
;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset
+= currentSequenceLen
; //noOfCEsToCopy;
876 len
-= currentSequenceLen
; //noOfCEsToCopy;
879 currentSequenceLen
--;
882 if(currentSequenceLen
== 0) { /* couldn't find any tailored subsequence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them into expansion */
888 uprv_init_collIterate(src
->UCA
, expOffset
+ src
->source
, 1, &s
, status
);
891 order
= ucol_getNextCE(src
->UCA
, &s
, status
);
892 if(order
== UCOL_NO_MORE_CES
) {
895 tok
->expCEs
[tok
->noOfExpCEs
++] = order
;
905 /* set the ucaelement with obtained values */
906 el
.noOfCEs
= tok
->noOfCEs
+ tok
->noOfExpCEs
;
908 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
909 el
.CEs
[i
] = tok
->CEs
[i
];
911 for(i
= 0; i
<tok
->noOfExpCEs
; i
++) {
912 el
.CEs
[i
+tok
->noOfCEs
] = tok
->expCEs
[i
];
916 // We kept prefix and source kind of together, as it is a kind of a contraction.
917 // However, now we have to slice the prefix off the main thing -
918 el
.prefix
= el
.prefixChars
;
919 el
.cPoints
= el
.uchars
;
920 if(tok
->prefix
!= 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsafe table.
923 el
.prefixSize
= tok
->prefix
>>24;
924 uprv_memcpy(el
.prefix
, src
->source
+ (tok
->prefix
& 0x00FFFFFF), el
.prefixSize
*sizeof(UChar
));
926 el
.cSize
= (tok
->source
>> 24)-(tok
->prefix
>>24);
927 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF)+(tok
->prefix
>>24) + src
->source
, el
.cSize
*sizeof(UChar
));
932 el
.cSize
= (tok
->source
>> 24);
933 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF) + src
->source
, el
.cSize
*sizeof(UChar
));
935 if(src
->UCA
!= NULL
) {
936 for(i
= 0; i
<el
.cSize
; i
++) {
937 if(UCOL_ISJAMO(el
.cPoints
[i
])) {
938 t
->image
->jamoSpecial
= TRUE
;
941 if (!src
->buildCCTabFlag
&& el
.cSize
> 0) {
942 // Check the trailing canonical combining class (tccc) of the last character.
943 const UChar
*s
= el
.cPoints
+ el
.cSize
;
944 uint16_t fcd
= unorm_prevFCD16(fcdTrieIndex
, fcdHighStart
, el
.cPoints
, s
);
945 if ((fcd
& 0xff) != 0) {
946 src
->buildCCTabFlag
= TRUE
;
951 /* and then, add it */
953 fprintf(stderr
, "Adding: %04X with %08X\n", el
.cPoints
[0], el
.CEs
[0]);
955 uprv_uca_addAnElement(t
, &el
, status
);
957 #if UCOL_DEBUG_DUPLICATES
958 if(*status
!= U_ZERO_ERROR
) {
959 fprintf(stderr
, "replaced CE for %04X with CE for %04X\n", el
.cPoints
[0], tok
->debugSource
);
960 *status
= U_ZERO_ERROR
;
/*
 * Range callback that receives a code point range [start, limit) together
 * with a value, with context pointing at the tempUCATable being built.
 *
 * For every code point in the range whose entry in t->mapping is
 * UCOL_NOT_FOUND, an element without prefix is prepared (the code point is
 * appended to el.uchars with UTF_APPEND_CHAR, capacity 1024) and added with
 * uprv_uca_addAnElement() using a local status.
 *
 * NOTE(review): lines are missing from this listing - the test on value
 * that presumably restricts this to completely ignorable ranges, the
 * declaration and CE setup of el, the loop increment, the body of the
 * U_FAILURE check and the return value.
 */
969 static UBool U_CALLCONV
970 _processUCACompleteIgnorables(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
971 UErrorCode status
= U_ZERO_ERROR
;
972 tempUCATable
*t
= (tempUCATable
*)context
;
974 while(start
< limit
) {
975 uint32_t CE
= utrie_get32(t
->mapping
, start
, NULL
);
/* only code points that were not tailored are added */
976 if(CE
== UCOL_NOT_FOUND
) {
980 el
.prefixChars
[0] = 0;
981 el
.prefix
= el
.prefixChars
;
982 el
.cPoints
= el
.uchars
;
985 UTF_APPEND_CHAR(el
.uchars
, el
.cSize
, 1024, start
);
989 uprv_uca_addAnElement(t
, &el
, &status
);
995 if(U_FAILURE(status
)) {
/*
 * Copies the UCA collation elements of the code points start..end
 * (inclusive) into the temporary table t, for those code points that the
 * tailoring has not defined.
 *
 * A code point u is copied when its entry in t->mapping is UCOL_NOT_FOUND,
 * or when that entry is a contraction-table element whose first CE is
 * UCOL_NOT_FOUND (a contraction without its starting element). For such a
 * code point, u is appended to el.uchars, its CEs are read from src->UCA
 * with a collation iterator until UCOL_NO_MORE_CES, and the element is
 * added with uprv_uca_addAnElement().
 *
 * Nothing is done when *status is already a failure.
 *
 * NOTE(review): lines are missing from this listing - the return type, the
 * status parameter, the declarations of el, colIt, u and tag, and the resets
 * of el.cSize / el.noOfCEs. The lines numbered 1042-1047 special-case
 * Hangul/CJK/implicit/surrogate tags, while the comment that follows them
 * says keeping implicits unresolved does not pay off; presumably that
 * special case is disabled in the full source - confirm.
 */
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser
*src
, tempUCATable
*t
,
1005 UChar32 start
, UChar32 end
,
1008 //UChar decomp[256];
1009 uint32_t CE
= UCOL_NOT_FOUND
;
1014 el
.prefixChars
[0] = 0;
1017 if(U_SUCCESS(*status
)) {
1018 for(u
= start
; u
<=end
; u
++) {
1019 if((CE
= utrie_get32(t
->mapping
, u
, NULL
)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE
)) &&
1022 (uprv_cnttab_getCE(t
->contractions
, CE
, 0, status
) == UCOL_NOT_FOUND
))
1026 U16_APPEND_UNSAFE(el
.uchars
, el
.cSize
, u
);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el
.cPoints
= el
.uchars
;
1032 el
.prefix
= el
.prefixChars
;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
/* collect every CE the UCA produces for this code point */
1055 uprv_init_collIterate(src
->UCA
, el
.uchars
, el
.cSize
, &colIt
, status
);
1056 while(CE
!= UCOL_NO_MORE_CES
) {
1057 CE
= ucol_getNextCE(src
->UCA
, &colIt
, status
);
1058 if(CE
!= UCOL_NO_MORE_CES
) {
1059 el
.CEs
[el
.noOfCEs
++] = CE
;
1063 uprv_uca_addAnElement(t
, &el
, status
);
1071 U_CFUNC UCATableHeader
*
1072 ucol_assembleTailoringTable(UColTokenParser
*src
, UErrorCode
*status
) {
1076 if(U_FAILURE(*status
)) {
1080 2. Eliminate the negative lists by doing the following for each non-null negative list:
1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082 create new ListHeader X
1083 o reverse the list, add to the end of X's positive list. Reset the strength of the
1084 first item you add, based on the stronger strength levels of the two lists.
1087 3. For each ListHeader with a non-null positive list:
1090 o Find all character strings with CEs between the baseCE and the
1091 next/previous CE, at the strength of the first token. Add these to the
1093 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1094 tailoring has & x < z...
1095 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1097 /* It is possible that this part should be done even while constructing list */
1098 /* The problem is that it is unknown what is going to be the strongest weight */
1099 /* So we might as well do it here */
1102 o Allocate CEs for each token in the list, based on the total number N of the
1103 largest level difference, and the gap G between baseCE and nextCE at that
1104 level. The relation * between the last item and nextCE is the same as the
1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1108 Then fit b and c into the secondary gap between a and d, then fit q
1109 into the tertiary gap between b and c.
1111 o Example: baseCE << b <<< q << c * nextCE(X,2)
1112 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1113 Then fit q into the tertiary gap between b and c.
1114 o When incrementing primary values, we will not cross high byte
1115 boundaries except where there is only a single-byte primary. That is to
1116 ensure that the script reordering will continue to work.
1118 UCATableHeader
*image
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
1120 if (image
== NULL
) {
1121 *status
= U_MEMORY_ALLOCATION_ERROR
;
1124 uprv_memcpy(image
, src
->UCA
->image
, sizeof(UCATableHeader
));
1126 for(i
= 0; i
<src
->resultLen
; i
++) {
1127 /* now we need to generate the CEs */
1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129 /* According to strength */
1130 if(U_SUCCESS(*status
)) {
1131 if(src
->lh
[i
].first
) { // if there are any elements
1132 // due to the way parser works, subsequent tailorings
1133 // may remove all the elements from a sequence, therefore
1134 // leaving an empty tailoring sequence.
1135 ucol_initBuffers(src
, &src
->lh
[i
], status
);
1138 if(U_FAILURE(*status
)) {
1144 if(src
->varTop
!= NULL
) { /* stuff the variable top value */
1145 src
->opts
->variableTopValue
= (*(src
->varTop
->CEs
))>>16;
1146 /* remove it from the list */
1147 if(src
->varTop
->listHeader
->first
== src
->varTop
) { /* first in list */
1148 src
->varTop
->listHeader
->first
= src
->varTop
->next
;
1150 if(src
->varTop
->listHeader
->last
== src
->varTop
) { /* first in list */
1151 src
->varTop
->listHeader
->last
= src
->varTop
->previous
;
1153 if(src
->varTop
->next
!= NULL
) {
1154 src
->varTop
->next
->previous
= src
->varTop
->previous
;
1156 if(src
->varTop
->previous
!= NULL
) {
1157 src
->varTop
->previous
->next
= src
->varTop
->next
;
1162 tempUCATable
*t
= uprv_uca_initTempTable(image
, src
->opts
, src
->UCA
, NOT_FOUND_TAG
, NOT_FOUND_TAG
, status
);
1163 if(U_FAILURE(*status
)) {
1169 /* After this, we have assigned CE values to all regular CEs */
1170 /* now we will go through list once more and resolve expansions, */
1171 /* make UCAElements structs and add them to table */
1172 for(i
= 0; i
<src
->resultLen
; i
++) {
1173 /* now we need to generate the CEs */
1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175 /* According to strength */
1176 if(U_SUCCESS(*status
)) {
1177 ucol_createElements(src
, t
, &src
->lh
[i
], status
);
1184 el
.prefixChars
[0] = 0;
1186 /* add latin-1 stuff */
1187 ucol_uprv_bld_copyRangeFromUCA(src
, t
, 0, 0xFF, status
);
1189 /* add stuff for copying */
1190 if(src
->copySet
!= NULL
) {
1192 UnicodeSet
*set
= (UnicodeSet
*)src
->copySet
;
1193 for(i
= 0; i
< set
->getRangeCount(); i
++) {
1194 ucol_uprv_bld_copyRangeFromUCA(src
, t
, set
->getRangeStart(i
), set
->getRangeEnd(i
), status
);
1198 if(U_SUCCESS(*status
)) {
1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1201 uint32_t tailoredCE
= UCOL_NOT_FOUND
;
1202 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1203 UChar
*conts
= (UChar
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->contractionUCACombos
);
1204 UCollationElements
*ucaEl
= ucol_openElements(src
->UCA
, NULL
, 0, status
);
1205 // Check for null pointer
1206 if (ucaEl
== NULL
) {
1207 *status
= U_MEMORY_ALLOCATION_ERROR
;
1210 while(*conts
!= 0) {
1211 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1212 tailoredCE
= utrie_get32(t
->mapping
, *conts
, NULL
);
1213 if(tailoredCE
!= UCOL_NOT_FOUND
) {
1214 UBool needToAdd
= TRUE
;
1215 if(isCntTableElement(tailoredCE
)) {
1216 if(uprv_cnttab_isTailored(t
->contractions
, tailoredCE
, conts
+1, status
) == TRUE
) {
1220 if (!needToAdd
&& isPrefix(tailoredCE
) && *(conts
+1)==0) {
1222 elm
.cPoints
= el
.uchars
;
1224 elm
.uchars
[0] = *conts
;
1227 elm
.prefixChars
[0] = *(conts
+2);
1229 elm
.prefix
= elm
.prefixChars
;
1231 UCAElements
*prefixEnt
=(UCAElements
*)uhash_get(t
->prefixLookup
, &elm
);
1232 if ((prefixEnt
==NULL
) || *(prefixEnt
->prefix
)!=*(conts
+2)) {
1236 if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, *conts
)) {
1240 if(needToAdd
== TRUE
) { // we need to add if this contraction is not tailored.
1241 if (*(conts
+1) != 0) { // contractions
1242 el
.prefix
= el
.prefixChars
;
1244 el
.cPoints
= el
.uchars
;
1246 el
.uchars
[0] = *conts
;
1247 el
.uchars
[1] = *(conts
+1);
1249 el
.uchars
[2] = *(conts
+2);
1254 ucol_setText(ucaEl
, el
.uchars
, el
.cSize
, status
);
1256 else { // pre-context character
1257 UChar str
[4] = { 0 };
1259 int32_t preKeyLen
=0;
1261 el
.cPoints
= el
.uchars
;
1263 el
.uchars
[0] = *conts
;
1266 el
.prefixChars
[0] = *(conts
+2);
1267 el
.prefix
= el
.prefixChars
;
1269 if (el
.prefixChars
[0]!=0) {
1270 // get CE of prefix character first
1271 str
[0]=el
.prefixChars
[0];
1273 ucol_setText(ucaEl
, str
, 1, status
);
1274 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
))
1275 != UCOL_NULLORDER
) {
1276 preKeyLen
++; // count number of keys for prefix character
1278 str
[len
++] = el
.prefixChars
[0];
1281 str
[len
++] = el
.uchars
[0];
1283 ucol_setText(ucaEl
, str
, len
, status
);
1284 // Skip the keys for prefix character, then copy the rest to el.
1285 while ((preKeyLen
-->0) &&
1286 (int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1291 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1294 uprv_uca_addAnElement(t
, &el
, status
);
1297 } else if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, *conts
)) {
1298 ucol_uprv_bld_copyRangeFromUCA(src
, t
, *conts
, *conts
, status
);
1302 ucol_closeElements(ucaEl
);
1305 // Add completely ignorable elements
1306 utrie_enum(&t
->UCA
->mapping
, NULL
, _processUCACompleteIgnorables
, t
);
1308 // add tailoring characters related canonical closures
1309 uprv_uca_canonicalClosure(t
, src
, NULL
, status
);
1311 /* still need to produce compatibility closure */
1313 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
1315 uprv_uca_closeTempTable(t
);
/*
 * Cleanup hook for this module's cached inverse UCA data.
 *
 * Closes the UDataMemory held in invUCA_DATA_MEM and resets both file-level
 * statics (invUCA_DATA_MEM and _staticInvUCA) to NULL.  ucol_initInverseUCA()
 * registers this function through ucln_i18n_registerCleanup().
 *
 * NOTE(review): the return statement is not visible in this excerpt; the
 * declared return type is UBool.
 */
1322 static UBool U_CALLCONV
1323 ucol_bld_cleanup(void)
// Release the data mapping first, then drop the pointers that referred to it.
1325 udata_close(invUCA_DATA_MEM
);
1326 invUCA_DATA_MEM
= NULL
;
1327 _staticInvUCA
= NULL
;
/*
 * Returns the cached inverse UCA table (_staticInvUCA), attempting to load it
 * through udata_openChoice() when the cache is still empty.
 *
 * @param status  in/out error code.  If it already indicates failure on entry
 *                the function returns NULL without doing any work.  It is set
 *                to U_INVALID_FORMAT_ERROR when the UCAVersion of the loaded
 *                inverse table differs from the UCAVersion of the UCA image.
 * @return the current value of _staticInvUCA (which is still NULL if the data
 *         could not be opened).
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (the declaration and test of needsInit, any locking around the
 * publication step, and the exit taken after a version mismatch).  The
 * comments below describe only what the visible statements do.
 */
1332 U_CAPI
const InverseUCATableHeader
* U_EXPORT2
1333 ucol_initInverseUCA(UErrorCode
*status
)
// Nothing to do if the caller already carries an error.
1335 if(U_FAILURE(*status
)) return NULL
;
// UMTX_CHECK receives the "cache is empty" test and needsInit; presumably the
// test result is stored in needsInit -- the macro is not visible here, confirm.
1338 UMTX_CHECK(NULL
, (_staticInvUCA
== NULL
), needsInit
);
1341 InverseUCATableHeader
*newInvUCA
= NULL
;
// Open the inverse UCA data item; isAcceptableInvUCA is passed as the
// acceptance callback with a NULL context.
1342 UDataMemory
*result
= udata_openChoice(U_ICUDATA_COLL
, INVC_DATA_TYPE
, INVC_DATA_NAME
, isAcceptableInvUCA
, NULL
, status
);
// On failure, release whatever udata_openChoice handed back.
1344 if(U_FAILURE(*status
)) {
1346 udata_close(result
);
1348 // This is not needed, as we are talking about
1349 // memory we got from UData
1350 //uprv_free(newInvUCA);
1353 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
// Interpret the mapped data as the inverse UCA header.
1354 newInvUCA
= (InverseUCATableHeader
*)udata_getMemory(result
);
// NOTE(review): UCA is dereferenced in the comparison below without a visible
// NULL/status check -- confirm ucol_initUCA() cannot return NULL on this path.
1355 UCollator
*UCA
= ucol_initUCA(status
);
1356 // UCA versions of UCA and inverse UCA should match
1357 if(uprv_memcmp(newInvUCA
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0) {
// Version mismatch: report it and release the freshly opened data.
1358 *status
= U_INVALID_FORMAT_ERROR
;
1359 udata_close(result
);
// Publish the new table only if the cache is still empty at this point.
// NOTE(review): no lock/unlock call is visible around this check -- confirm.
1364 if(_staticInvUCA
== NULL
) {
1365 invUCA_DATA_MEM
= result
;
1366 _staticInvUCA
= newInvUCA
;
// NOTE(review): as visible here newInvUCA is never reset after being published;
// presumably it is cleared on that path so that only an unpublished duplicate
// is closed below -- confirm against the full file.
1372 if(newInvUCA
!= NULL
) {
1373 udata_close(result
);
1374 // This is not needed, as we are talking about
1375 // memory we got from UData
1376 //uprv_free(newInvUCA);
// Arrange for ucol_bld_cleanup() to be run for the UCLN_I18N_UCOL_BLD slot.
1379 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD
, ucol_bld_cleanup
);
1383 return _staticInvUCA
;
1386 /* Names of the non-script reordering codes. The entries _must_ be kept in the
1387 * order in which they are applied as defaults, and in sync with the UColReorderCode enum.
* ucol_findReorderingEntry() scans this table up to its first NULL entry and maps
* a matching index to a code by adding UCOL_REORDER_CODE_FIRST.
1389 static const char* ReorderingTokenNames
[] = {
/**
 * Copies an upper-cased version of a NUL-terminated string into a
 * caller-supplied buffer, truncating when the buffer is too small.
 *
 * At most length - 1 characters are converted, and dst is always
 * NUL-terminated when length is non-zero.  Case conversion uses toupper(),
 * so it follows the current C locale.
 *
 * @param src    NUL-terminated input string; NULL is treated as "".
 * @param dst    Output buffer with room for at least length chars.
 * @param length Capacity of dst in chars, including the terminator.
 *               When it is 0 (or dst is NULL) nothing is written.
 */
static void toUpper(const char* src, char* dst, uint32_t length) {
    /* length is unsigned: "length - 1" would wrap to UINT32_MAX for 0 and
     * turn the copy into an unbounded write, so reject that case up front. */
    if (dst == NULL || length == 0) {
        return;
    }

    uint32_t i = 0;
    if (src != NULL) {
        /* "i + 1 < length" keeps one slot free for the terminator without
         * any unsigned subtraction. */
        for (; src[i] != '\0' && i + 1 < length; ++i) {
            /* toupper() is only defined for EOF and unsigned char values;
             * a plain char may be negative for bytes >= 0x80. */
            dst[i] = (char)toupper((unsigned char)src[i]);
        }
    }
    dst[i] = '\0';
}
/*
 * Maps a reordering token name to its reorder code.
 *
 * The name is upper-cased into `buffer` by toUpper() with a capacity argument
 * of 32 (so longer names are truncated before comparison), then compared with
 * uprv_strcmp() against each entry of ReorderingTokenNames until a NULL entry
 * is reached.
 *
 * @param name  token name to look up; passed straight to toUpper().
 * @return the index of the matching entry plus UCOL_REORDER_CODE_FIRST, or
 *         USCRIPT_INVALID_CODE when no entry matches.
 *
 * NOTE(review): the declaration of `buffer` is not visible in this excerpt;
 * presumably it is a local char array of 32 elements -- confirm.
 */
1405 U_INTERNAL
int32_t U_EXPORT2
1406 ucol_findReorderingEntry(const char* name
) {
// Normalise the case of the requested name before the exact-match comparison.
1408 toUpper(name
, buffer
, 32);
// Linear scan; the table is terminated by a NULL entry.
1409 for (uint32_t entry
= 0; ReorderingTokenNames
[entry
] != NULL
; entry
++) {
1410 if (uprv_strcmp(buffer
, ReorderingTokenNames
[entry
]) == 0) {
// Table position is converted to a reorder code by offsetting from the first code.
1411 return entry
+ UCOL_REORDER_CODE_FIRST
;
// No table entry matched.
1414 return USCRIPT_INVALID_CODE
;
1417 #endif /* #if !UCONFIG_NO_COLLATION */