2 *******************************************************************************
4 * Copyright (C) 2001-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_bld.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This module builds a collator based on the rule set.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
/*
 * Two file-scope pointers, both initialised to NULL: a pointer to an
 * InverseUCATableHeader and a pointer to a UDataMemory.
 * NOTE(review): the names suggest a cached inverse-UCA table and the data
 * handle it came from; the code that fills them is not in this listing - confirm.
 */
40 static const InverseUCATableHeader
* _staticInvUCA
= NULL
;
41 static UDataMemory
* invUCA_DATA_MEM
= NULL
;
/*
 * Acceptance test for an inverse-UCA data item described by pInfo.
 *
 * The visible condition requires: size >= 20, matching endianness and charset
 * family, the four dataFormat bytes equal to INVUCA_DATA_FORMAT_0..3,
 * formatVersion[0] equal to INVUCA_FORMAT_VERSION_0 and formatVersion[1] at
 * least INVUCA_FORMAT_VERSION_1. When that holds, the result is whether
 * dataVersion[0] and dataVersion[1] equal the first two fields of the version
 * reported by u_getUnicodeVersion().
 *
 * The first three parameters (context, type, name) are unused.
 * NOTE(review): the closing of the if-condition and the result returned when
 * the header check fails are not present in this listing - confirm.
 */
44 static UBool U_CALLCONV
45 isAcceptableInvUCA(void * /*context*/,
46 const char * /*type*/, const char * /*name*/,
47 const UDataInfo
*pInfo
)
49 /* context, type & name are intentionally not used */
50 if( pInfo
->size
>=20 &&
51 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
52 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
53 pInfo
->dataFormat
[0]==INVUCA_DATA_FORMAT_0
&& /* dataFormat="InvC" */
54 pInfo
->dataFormat
[1]==INVUCA_DATA_FORMAT_1
&&
55 pInfo
->dataFormat
[2]==INVUCA_DATA_FORMAT_2
&&
56 pInfo
->dataFormat
[3]==INVUCA_DATA_FORMAT_3
&&
57 pInfo
->formatVersion
[0]==INVUCA_FORMAT_VERSION_0
&&
58 pInfo
->formatVersion
[1]>=INVUCA_FORMAT_VERSION_1
//&&
59 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
60 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
61 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
64 UVersionInfo UCDVersion
;
65 u_getUnicodeVersion(UCDVersion
);
66 return (pInfo
->dataVersion
[0]==UCDVersion
[0] &&
67 pInfo
->dataVersion
[1]==UCDVersion
[1]);
68 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
69 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
70 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
78 * Takes two CEs (lead and continuation) and
79 * compares them as CEs should be compared:
80 * primary vs. primary, secondary vs. secondary
81 * tertiary vs. tertiary
/*
 * Compares the pair (source0, source1) with the pair (target0, target1).
 *
 * source1 / target1 are tested with isContinuation(). The visible code then
 * builds, level by level, one combined value per side:
 *   - bits 16..31 of the first word, with bits 16..31 of the second word
 *     shifted into the low half;
 *   - bits 8..15 of the first word OR-ed with bits 8..15 of the second
 *     shifted down by 8;
 *   - bits 0..7 of the first word shifted up by 8, OR-ed with bits 0..7 of
 *     the second.
 * When both words are equal on both sides, the first branch is taken.
 *
 * NOTE(review): the assignments to s2/t2, the comparisons between the packing
 * steps and every return statement are missing from this listing; presumably
 * the result is a negative/zero/positive ordering value - confirm.
 */
83 static int32_t compareCEs(uint32_t source0
, uint32_t source1
, uint32_t target0
, uint32_t target1
) {
84 uint32_t s1
= source0
, s2
, t1
= target0
, t2
;
85 if(isContinuation(source1
)) {
90 if(isContinuation(target1
)) {
96 uint32_t s
= 0, t
= 0;
97 if(s1
== t1
&& s2
== t2
) {
100 s
= (s1
& 0xFFFF0000)|((s2
& 0xFFFF0000)>>16);
101 t
= (t1
& 0xFFFF0000)|((t2
& 0xFFFF0000)>>16);
107 s
= (s1
& 0x0000FF00) | (s2
& 0x0000FF00)>>8;
108 t
= (t1
& 0x0000FF00) | (t2
& 0x0000FF00)>>8;
114 s
= (s1
& 0x000000FF)<<8 | (s2
& 0x000000FF);
115 t
= (t1
& 0x000000FF)<<8 | (t2
& 0x000000FF);
/*
 * Looks up the pair (CE, SecondCE) in the inverse UCA table of src.
 *
 * The table starts src->invUCA->table bytes after src->invUCA and is read as
 * uint32_t values, three per entry; words 0 and 1 of entry i are loaded as
 * first/second and compared with the requested pair through compareCEs().
 * The loop runs while bottom < top-1, with top starting at
 * src->invUCA->tableSize.
 *
 * NOTE(review): the computation of i, the narrowing of bottom/top and the
 * return statements are missing from this listing. Per the in-code comment, a
 * "not found" failure result was deliberately removed so that callers can
 * locate the neighbours of CEs that are not in the UCA - confirm the exact
 * return contract against the full source.
 */
126 int32_t ucol_inv_findCE(const UColTokenParser
*src
, uint32_t CE
, uint32_t SecondCE
) {
127 uint32_t bottom
= 0, top
= src
->invUCA
->tableSize
;
129 uint32_t first
= 0, second
= 0;
130 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
133 while(bottom
< top
-1) {
135 first
= *(CETable
+3*i
);
136 second
= *(CETable
+3*i
+1);
137 res
= compareCEs(first
, second
, CE
, SecondCE
);
148 /* in searching for elements, I have removed the failure */
149 /* The reason for this is that the builder does not rely */
150 /* on search mechanism telling it that it didn't find an */
151 /* element. However, indirect positioning relies on being */
152 /* able to find the elements around any CE, even if it is */
153 /* not defined in the UCA. */
156 if((first == CE && second == SecondCE)) {
/*
 * Table of UCOL_CE_STRENGTH_LIMIT bit masks indexed by strength. Other
 * functions in this file AND a CE with strengthMask[strength] before comparing.
 * NOTE(review): the initializer values are not present in this listing.
 */
164 static const uint32_t strengthMask
[UCOL_CE_STRENGTH_LIMIT
] = {
/*
 * Finds the entry that follows (CE, contCE) in the inverse UCA table at the
 * given strength and stores it through nextCE / nextContCE.
 *
 * The pair is located with ucol_inv_findCE(). CE and contCE are then masked
 * with strengthMask[strength], and the index is advanced (pre-increment) for
 * as long as the masked table words still equal the masked inputs. Each step
 * loads words 0 and 1 of the new entry into *nextCE and *nextContCE.
 * *nextCE is set to UCOL_NOT_FOUND on one path.
 *
 * NOTE(review): the strength parameter declaration, the condition guarding
 * the UCOL_NOT_FOUND assignment, the initial value given to *nextCE before the
 * loop and the return statement are not present in this listing. Unlike
 * ucol_inv_getPrevCE, the visible loop condition has no bound on iCE - confirm
 * that the table is guaranteed to end with a differing entry.
 */
170 U_CAPI
int32_t U_EXPORT2
ucol_inv_getNextCE(const UColTokenParser
*src
,
171 uint32_t CE
, uint32_t contCE
,
172 uint32_t *nextCE
, uint32_t *nextContCE
,
175 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
178 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
181 *nextCE
= UCOL_NOT_FOUND
;
185 CE
&= strengthMask
[strength
];
186 contCE
&= strengthMask
[strength
];
189 *nextContCE
= contCE
;
191 while((*nextCE
& strengthMask
[strength
]) == CE
192 && (*nextContCE
& strengthMask
[strength
]) == contCE
)
194 *nextCE
= (*(CETable
+3*(++iCE
)));
195 *nextContCE
= (*(CETable
+3*(iCE
)+1));
/*
 * Finds the entry that precedes (CE, contCE) in the inverse UCA table at the
 * given strength and stores it through prevCE / prevContCE.
 *
 * Mirrors ucol_inv_getNextCE: the pair is located with ucol_inv_findCE(), CE
 * and contCE are masked with strengthMask[strength], and the index is stepped
 * backwards (pre-decrement) while the masked table words still equal the
 * masked inputs. The loop also stops once iCE reaches 0, so it cannot read
 * before the first entry. *prevCE is set to UCOL_NOT_FOUND on one path.
 *
 * NOTE(review): the strength parameter declaration, the condition guarding
 * the UCOL_NOT_FOUND assignment, the initial value given to *prevCE before the
 * loop and the return statement are not present in this listing.
 */
201 U_CFUNC
int32_t U_EXPORT2
ucol_inv_getPrevCE(const UColTokenParser
*src
,
202 uint32_t CE
, uint32_t contCE
,
203 uint32_t *prevCE
, uint32_t *prevContCE
,
206 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
209 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
212 *prevCE
= UCOL_NOT_FOUND
;
216 CE
&= strengthMask
[strength
];
217 contCE
&= strengthMask
[strength
];
220 *prevContCE
= contCE
;
222 while((*prevCE
& strengthMask
[strength
]) == CE
223 && (*prevContCE
& strengthMask
[strength
])== contCE
224 && iCE
> 0) /* this condition should prevent falling off the edge of the world */
226 /* here, we end up in a singularity - zero */
227 *prevCE
= (*(CETable
+3*(--iCE
)));
228 *prevContCE
= (*(CETable
+3*(iCE
)+1));
/*
 * Classifies the difference between (CE, contCE) and (prevCE, prevContCE).
 *
 * Returns UCOL_IDENTICAL when both words are equal. Otherwise the pairs are
 * compared under strengthMask[UCOL_PRIMARY], then under
 * strengthMask[UCOL_SECONDARY] (a difference there yields UCOL_SECONDARY);
 * if neither masked comparison differs the result is UCOL_TERTIARY.
 *
 * NOTE(review): the statement executed when the primary-masked values differ
 * is not present in this listing; presumably it returns UCOL_PRIMARY - confirm.
 */
234 U_CFUNC
uint32_t U_EXPORT2
ucol_getCEStrengthDifference(uint32_t CE
, uint32_t contCE
,
235 uint32_t prevCE
, uint32_t prevContCE
)
237 if(prevCE
== CE
&& prevContCE
== contCE
) {
238 return UCOL_IDENTICAL
;
240 if((prevCE
& strengthMask
[UCOL_PRIMARY
]) != (CE
& strengthMask
[UCOL_PRIMARY
])
241 || (prevContCE
& strengthMask
[UCOL_PRIMARY
]) != (contCE
& strengthMask
[UCOL_PRIMARY
]))
245 if((prevCE
& strengthMask
[UCOL_SECONDARY
]) != (CE
& strengthMask
[UCOL_SECONDARY
])
246 || (prevContCE
& strengthMask
[UCOL_SECONDARY
]) != (contCE
& strengthMask
[UCOL_SECONDARY
]))
248 return UCOL_SECONDARY
;
250 return UCOL_TERTIARY
;
/*
 * List-header variant of the "previous CE" lookup: starts from lh->baseCE /
 * lh->baseContCE, locates that pair with ucol_inv_findCE(), masks it with
 * strengthMask[strength], steps backwards through the inverse table while the
 * masked entries still match, and stores the result in lh->previousCE /
 * lh->previousContCE.
 *
 * NOTE(review): unlike the neighbouring functions, these lines are not
 * token-split in this listing, which suggests the text sat inside a comment
 * (i.e. was disabled) in the original file - confirm before relying on it.
 * The initial assignment of previousCE and the return statement are not
 * visible, and the visible loop has no lower bound on iCE.
 */
255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
257 uint32_t CE = lh->baseCE;
258 uint32_t SecondCE = lh->baseContCE;
260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
261 uint32_t previousCE, previousContCE;
264 iCE = ucol_inv_findCE(src, CE, SecondCE);
270 CE &= strengthMask[strength];
271 SecondCE &= strengthMask[strength];
274 previousContCE = SecondCE;
276 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
277 previousCE = (*(CETable+3*(--iCE)));
278 previousContCE = (*(CETable+3*(iCE)+1));
280 lh->previousCE = previousCE;
281 lh->previousContCE = previousContCE;
/*
 * List-header variant of the "next CE" lookup.
 *
 * Starts from lh->baseCE / lh->baseContCE, locates that pair in the inverse
 * UCA table with ucol_inv_findCE(), masks it with strengthMask[strength] and
 * advances the index (pre-increment) while the masked table words still equal
 * the masked base pair. The continuation word found is stored in
 * lh->nextContCE.
 *
 * The int32_t result is stored by ucol_inv_getGapPositions() in lh->pos[...]
 * and treated there as a failure when negative.
 *
 * NOTE(review): the initial assignment of nextCE, the store of lh->nextCE and
 * the return statement are not present in this listing; the visible loop has
 * no upper bound on iCE - confirm.
 */
287 inline int32_t ucol_inv_getNext(UColTokenParser
*src
, UColTokListHeader
*lh
, uint32_t strength
) {
288 uint32_t CE
= lh
->baseCE
;
289 uint32_t SecondCE
= lh
->baseContCE
;
291 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
292 uint32_t nextCE
, nextContCE
;
295 iCE
= ucol_inv_findCE(src
, CE
, SecondCE
);
301 CE
&= strengthMask
[strength
];
302 SecondCE
&= strengthMask
[strength
];
305 nextContCE
= SecondCE
;
307 while((nextCE
& strengthMask
[strength
]) == CE
308 && (nextContCE
& strengthMask
[strength
]) == SecondCE
)
310 nextCE
= (*(CETable
+3*(++iCE
)));
311 nextContCE
= (*(CETable
+3*(iCE
)+1));
315 lh
->nextContCE
= nextContCE
;
/*
 * Computes, for the token list lh, the low and high weight boundaries
 * (lh->gapsLo / lh->gapsHi, three uint32_t per strength level) between which
 * new CEs may be allocated.
 *
 * Visible steps:
 *  1. Reset: for i in 0..2 the gapsHi/gapsLo slots and lh->fStrToken[i] /
 *     lh->lStrToken[i] are cleared.
 *  2. Implicit base: when the top byte of lh->baseCE lies between
 *     UCA_PRIMARY_IMPLICIT_MIN and UCA_PRIMARY_IMPLICIT_MAX (read from the UCA
 *     constants), gapsLo[0..2] is built from the base CE pair and
 *     gapsHi[0..2] from the next implicit primary, obtained via
 *     uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1).
 *  3. Indirect base: when lh->indirect is TRUE and lh->nextCE != 0, gapsLo is
 *     built from the base pair and gapsHi from a pair whose continuation word
 *     is lh->nextContCE.
 *  4. Otherwise: for the strength of the first token, ucol_inv_getNext()
 *     supplies lh->pos[strength]; a negative result sets *status to
 *     U_INTERNAL_PROGRAM_ERROR. The token list is then walked, recording
 *     fStrToken/lStrToken per strength and merging an interval into the
 *     stronger one when both have the same pos. Finally, for each level with
 *     pos >= 0, gapsHi is filled from inverse-table entry pos and gapsLo from
 *     a t1/t2 pair.
 *
 * Packing used throughout: slot 0 = primary bits of t1 | primary bits of t2
 * >> 16; slot 1 = secondary bits shifted to the top; slot 2 = tertiary bits
 * shifted to the top (the final loop masks tertiaries with 0x3f).
 *
 * NOTE(review): several short lines, including every assignment to t1 and the
 * reset of slot 0 and of lh->pos, are missing from this listing; the summary
 * above covers only what is visible.
 */
320 static void ucol_inv_getGapPositions(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
321 /* reset all the gaps */
323 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
328 UColToken
*tok
= lh
->first
;
329 uint32_t tokStrength
= tok
->strength
;
331 for(i
= 0; i
<3; i
++) {
333 lh
->gapsHi
[3*i
+1] = 0;
334 lh
->gapsHi
[3*i
+2] = 0;
336 lh
->gapsLo
[3*i
+1] = 0;
337 lh
->gapsLo
[3*i
+2] = 0;
339 lh
->fStrToken
[i
] = NULL
;
340 lh
->lStrToken
[i
] = NULL
;
344 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
346 if((lh
->baseCE
& 0xFF000000)>= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) && (lh
->baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
347 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
350 t2
= lh
->baseContCE
& UCOL_REMOVE_CONTINUATION
;
351 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
352 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
353 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
354 uint32_t primaryCE
= (t1
& UCOL_PRIMARYMASK
) | ((t2
& UCOL_PRIMARYMASK
) >> 16);
355 primaryCE
= uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE
)+1);
357 t1
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
358 t2
= (primaryCE
<< 16) & UCOL_PRIMARYMASK
; // | UCOL_CONTINUATION_MARKER;
360 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
361 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
362 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
363 } else if(lh
->indirect
== TRUE
&& lh
->nextCE
!= 0) {
364 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
367 t2
= lh
->baseContCE
&UCOL_REMOVE_CONTINUATION
;
368 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
369 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
370 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
372 t2
= lh
->nextContCE
&UCOL_REMOVE_CONTINUATION
;
373 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
374 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
375 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
378 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
379 if((lh
->pos
[tokStrength
] = ucol_inv_getNext(src
, lh
, tokStrength
)) >= 0) {
380 lh
->fStrToken
[tokStrength
] = tok
;
381 } else { /* The CE must be implicit, since it's not in the table */
383 *status
= U_INTERNAL_PROGRAM_ERROR
;
387 while(tok
!= NULL
&& tok
->strength
>= tokStrength
) {
388 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
389 lh
->lStrToken
[tokStrength
] = tok
;
393 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
-1) {
394 /* check if previous interval is the same and merge the intervals if it is so */
395 if(lh
->pos
[tokStrength
] == lh
->pos
[tokStrength
+1]) {
396 lh
->fStrToken
[tokStrength
] = lh
->fStrToken
[tokStrength
+1];
397 lh
->fStrToken
[tokStrength
+1] = NULL
;
398 lh
->lStrToken
[tokStrength
+1] = NULL
;
399 lh
->pos
[tokStrength
+1] = -1;
403 tokStrength
= tok
->strength
;
408 for(st
= 0; st
< 3; st
++) {
409 if((pos
= lh
->pos
[st
]) >= 0) {
410 t1
= *(CETable
+3*(pos
));
411 t2
= *(CETable
+3*(pos
)+1);
412 lh
->gapsHi
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
413 lh
->gapsHi
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
414 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
415 lh
->gapsHi
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
417 //t1 = *(CETable+3*(pos));
418 //t2 = *(CETable+3*(pos)+1);
421 lh
->gapsLo
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
422 lh
->gapsLo
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
423 lh
->gapsLo
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
/*
 * ucol_countBytes(value, noOfBytes): statement macro used by ucol_doCE() on
 * each CE part. The visible body declares a local mask starting at 0xFFFFFFFF
 * and tests whether (value & mask) is non-zero.
 * NOTE(review): the rest of the macro body is missing from this listing;
 * presumably it stores in noOfBytes how many leading bytes of value are
 * significant - confirm. Because it declares a local named `mask`, it must be
 * expanded where that name does not clash.
 */
430 #define ucol_countBytes(value, noOfBytes) \
432 uint32_t mask = 0xFFFFFFFF; \
435 if(((value) & mask) != 0) { \
/*
 * Advances the weight generator g to its next weight.
 *
 * Only when *status indicates success, g->current is replaced by
 * ucol_nextWeight(g->ranges, &g->noOfRanges); on a failed status the
 * generator is left untouched.
 * NOTE(review): the return statement is not present in this listing;
 * presumably g->current is returned - confirm.
 */
442 static uint32_t ucol_getNextGenerated(ucolCEGenerator
*g
, UErrorCode
*status
) {
443 if(U_SUCCESS(*status
)) {
444 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/*
 * Sets up generator g for a secondary or tertiary level that starts from the
 * "common" weight rather than from a gap in the UCA.
 *
 * Visible behaviour:
 *  - maxByte is 0x3F for UCOL_TERTIARY and 0xFF otherwise.
 *  - For UCOL_SECONDARY, low is UCOL_COMMON_TOP2<<24 and the default count is
 *    0xFF - UCOL_COMMON_TOP2; otherwise low is UCOL_BYTE_COMMON<<24 and the
 *    default count is 0x40 - UCOL_BYTE_COMMON.
 *  - If the token following tok has the requested strength, count is taken
 *    from tok->next->toInsert instead.
 *  - Ranges are allocated with ucol_allocWeights(low, high, count, maxByte,
 *    g->ranges) and g->current is set to UCOL_BYTE_COMMON<<24.
 *  - Zero allocated ranges sets *status to U_INTERNAL_PROGRAM_ERROR.
 *
 * NOTE(review): the assignments to `high` and the return statement are not
 * present in this listing; presumably g->current is returned - confirm.
 */
449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator
*g
, UColToken
*tok
, uint32_t strength
, UErrorCode
*status
) {
450 /* TODO: rename to enum names */
451 uint32_t high
, low
, count
=1;
452 uint32_t maxByte
= (strength
== UCOL_TERTIARY
)?0x3F:0xFF;
454 if(strength
== UCOL_SECONDARY
) {
455 low
= UCOL_COMMON_TOP2
<<24;
457 count
= 0xFF - UCOL_COMMON_TOP2
;
459 low
= UCOL_BYTE_COMMON
<< 24; //0x05000000;
461 count
= 0x40 - UCOL_BYTE_COMMON
;
464 if(tok
->next
!= NULL
&& tok
->next
->strength
== strength
) {
465 count
= tok
->next
->toInsert
;
468 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
469 g
->current
= UCOL_BYTE_COMMON
<<24;
471 if(g
->noOfRanges
== 0) {
472 *status
= U_INTERNAL_PROGRAM_ERROR
;
/*
 * Sets up generator g to hand out tok->toInsert weights at tok's strength,
 * inside the gap described by lows / highs (indexed as fStrength*3+strength).
 *
 * Visible behaviour:
 *  - When low >= high on a non-primary level, a level s is examined for
 *    lows[fStrength*3+s] != highs[fStrength*3+s]; on the secondary level low
 *    is raised to at least UCOL_COMMON_TOP2<<24, otherwise to at least
 *    UCOL_COMMON_BOT3<<24. One path sets *status to U_INTERNAL_PROGRAM_ERROR.
 *  - low values below 0x02000000 are handled specially because weight byte 02
 *    must not be used (see the in-code comment and link).
 *  - On the secondary level, low and high values that fall inside the common
 *    range [UCOL_COMMON_BOT2<<24, UCOL_COMMON_TOP2<<24) are moved to
 *    UCOL_COMMON_TOP2<<24; if low is below UCOL_COMMON_BOT2<<24, weights are
 *    allocated from UCOL_BYTE_UNSHIFTED_MIN<<24 up to high.
 *  - Otherwise ranges come from ucol_allocWeights(low, high, count, maxByte,
 *    g->ranges); zero ranges sets *status to U_INTERNAL_PROGRAM_ERROR.
 *  - g->current is then taken from ucol_nextWeight().
 *
 * NOTE(review): the per-strength maxByte assignments, the loop that changes s,
 * the statement under the 0x02000000 test and the return statements are not
 * present in this listing - confirm against the full source.
 */
477 static uint32_t ucol_getCEGenerator(ucolCEGenerator
*g
, uint32_t* lows
, uint32_t* highs
, UColToken
*tok
, uint32_t fStrength
, UErrorCode
*status
) {
478 uint32_t strength
= tok
->strength
;
479 uint32_t low
= lows
[fStrength
*3+strength
];
480 uint32_t high
= highs
[fStrength
*3+strength
];
481 uint32_t maxByte
= 0;
482 if(strength
== UCOL_TERTIARY
) {
484 } else if(strength
== UCOL_PRIMARY
) {
490 uint32_t count
= tok
->toInsert
;
492 if(low
>= high
&& strength
> UCOL_PRIMARY
) {
493 int32_t s
= strength
;
496 if(lows
[fStrength
*3+s
] != highs
[fStrength
*3+s
]) {
497 if(strength
== UCOL_SECONDARY
) {
498 if (low
< UCOL_COMMON_TOP2
<<24 ) {
499 // Override if low range is less than UCOL_COMMON_TOP2.
500 low
= UCOL_COMMON_TOP2
<<24;
504 // Override if low range is less than UCOL_COMMON_BOT3.
505 if ( low
< UCOL_COMMON_BOT3
<<24 ) {
506 low
= UCOL_COMMON_BOT3
<<24;
513 *status
= U_INTERNAL_PROGRAM_ERROR
;
519 if(low
< 0x02000000) {
520 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
521 // See http://site.icu-project.org/design/collation/bytes
525 if(strength
== UCOL_SECONDARY
) { /* similar as simple */
526 if(low
>= (UCOL_COMMON_BOT2
<<24) && low
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
527 low
= UCOL_COMMON_TOP2
<<24;
529 if(high
> (UCOL_COMMON_BOT2
<<24) && high
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
530 high
= UCOL_COMMON_TOP2
<<24;
532 if(low
< (UCOL_COMMON_BOT2
<<24)) {
533 g
->noOfRanges
= ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN
<<24, high
, count
, maxByte
, g
->ranges
);
534 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
535 //g->current = UCOL_COMMON_BOT2<<24;
540 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
541 if(g
->noOfRanges
== 0) {
542 *status
= U_INTERNAL_PROGRAM_ERROR
;
544 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/*
 * Converts the sourceLen UChars at source into resBuf, mapping small kana to
 * their large forms (per the function name).
 *
 * Visible behaviour: returns early when *status is already a failure; when
 * sourceLen > resLen, *status is set to U_MEMORY_ALLOCATION_ERROR. Each
 * character c in the range 0x3041..0x30FA is dispatched through a switch whose
 * visible case labels are 0x41 0x43 0x45 0x47 0x49 0x63 0x83 0x85 0x8E and
 * 0xA1 0xA3 0xA5 0xA7 0xA9 0xC3 0xE3 0xE5 0xEE.
 *
 * NOTE(review): the switch expression, the action taken for those cases, the
 * store into resBuf and the return value are not present in this listing.
 * Presumably the switch is on the low byte of c and the listed values are the
 * small-kana code points - confirm. A too-small output buffer is reported as a
 * memory-allocation error rather than a buffer-overflow error.
 */
549 uint32_t u_toLargeKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
553 if(U_FAILURE(*status
)) {
557 if(sourceLen
> resLen
) {
558 *status
= U_MEMORY_ALLOCATION_ERROR
;
562 for(i
= 0; i
< sourceLen
; i
++) {
564 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
566 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
567 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
/*
 * Converts the sourceLen UChars at source into resBuf, mapping large kana to
 * their small forms (per the function name); counterpart of u_toLargeKana.
 *
 * Visible behaviour: returns early when *status is already a failure; when
 * sourceLen > resLen, *status is set to U_MEMORY_ALLOCATION_ERROR. Each
 * character c in the range 0x3041..0x30FA is dispatched through a switch whose
 * visible case labels are 0x42 0x44 0x46 0x48 0x4A 0x64 0x84 0x86 0x8F and
 * 0xA2 0xA4 0xA6 0xA8 0xAA 0xC4 0xE4 0xE6 0xEF - each one greater than the
 * corresponding label in u_toLargeKana.
 *
 * NOTE(review): the switch expression, the action taken for those cases, the
 * store into resBuf and the return value are not present in this listing -
 * confirm against the full source.
 */
584 uint32_t u_toSmallKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
588 if(U_FAILURE(*status
)) {
592 if(sourceLen
> resLen
) {
593 *status
= U_MEMORY_ALLOCATION_ERROR
;
597 for(i
= 0; i
< sourceLen
; i
++) {
599 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
601 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
602 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
/*
 * Derives the case bits (UCOL_LOWER_CASE, UCOL_UPPER_CASE or UCOL_MIXED_CASE)
 * for the len UChars at src, using the collator UCA.
 *
 * Visible behaviour:
 *  - A failed *status on entry yields UCOL_LOWER_CASE.
 *  - The text is normalized to NFKD into the local buffer n (capacity 128).
 *  - Each normalized code unit is fed on its own through a collIterate and
 *    its first CE fetched with ucol_getNextCE(). If that CE is a
 *    continuation, *status becomes U_INTERNAL_PROGRAM_ERROR and the result is
 *    UCOL_LOWER_CASE.
 *  - The unit is then classified: CE case bits equal to UCOL_UPPER_CASE,
 *    else u_islower(), else a kana test that is true when u_toSmallKana
 *    leaves the unit unchanged while u_toLargeKana changes it.
 *  - Result: mixed when both uCount and lCount are non-zero, upper when only
 *    uCount is non-zero, lower otherwise.
 *
 * NOTE(review): the statements that increment uCount / lCount are not present
 * in this listing, so which branch feeds which counter must be confirmed.
 * Code units are examined one at a time, so surrogate pairs are not treated
 * as single code points here.
 */
621 uint8_t ucol_uprv_getCaseBits(const UCollator
*UCA
, const UChar
*src
, uint32_t len
, UErrorCode
*status
) {
625 uint32_t uCount
= 0, lCount
= 0;
630 if(U_FAILURE(*status
)) {
631 return UCOL_LOWER_CASE
;
634 nLen
= unorm_normalize(src
, len
, UNORM_NFKD
, 0, n
, 128, status
);
635 if(U_SUCCESS(*status
)) {
636 for(i
= 0; i
< nLen
; i
++) {
637 uprv_init_collIterate(UCA
, &n
[i
], 1, &s
, status
);
638 order
= ucol_getNextCE(UCA
, &s
, status
);
639 if(isContinuation(order
)) {
640 *status
= U_INTERNAL_PROGRAM_ERROR
;
641 return UCOL_LOWER_CASE
;
643 if((order
&UCOL_CASE_BIT_MASK
)== UCOL_UPPER_CASE
) {
646 if(u_islower(n
[i
])) {
648 } else if(U_SUCCESS(*status
)) {
650 u_toSmallKana(&n
[i
], 1, sk
, 1, status
);
651 u_toLargeKana(&n
[i
], 1, lk
, 1, status
);
652 if(sk
[0] == n
[i
] && lk
[0] != n
[i
]) {
660 if(uCount
!= 0 && lCount
!= 0) {
661 return UCOL_MIXED_CASE
;
662 } else if(uCount
!= 0) {
663 return UCOL_UPPER_CASE
;
665 return UCOL_LOWER_CASE
;
/*
 * Packs the three per-level weights in CEparts (primary, secondary, tertiary)
 * into the 32-bit collation elements of tok (tok->CEs) and sets the case bits
 * of the first CE.
 *
 * Visible behaviour:
 *  - ucol_countBytes() determines noOfBytes[i] for each part.
 *  - CEs are produced while any level still has bytes left: each CE takes 16
 *    bits of primary (hence the 2*CEi test), 8 bits of secondary and the low
 *    6 bits (mask 0x3F) of a tertiary byte; a value may start from
 *    UCOL_CONTINUATION_MARKER.
 *  - When CEi ends at 0 the token is "totally ignorable".
 *  - If tok->CEs[0] is non-zero, bits 0xC0 are cleared (mask 0xFFFFFF3F) and
 *    then filled either from ucol_uprv_getCaseBits() over the token's source
 *    characters or from the first UCA CE of the first source character
 *    (& 0xC0). The source characters start at src->source plus the low 24 bits
 *    of tok->source, with the length in the top 8 bits.
 *  - The fprintf calls dump the token and its CEs to stderr.
 *
 * NOTE(review): the conditions selecting the continuation marker and the
 * case-bit source, the stores to tok->noOfCEs and any preprocessor guard
 * around the fprintf block are not present in this listing - confirm.
 */
670 U_CFUNC
void ucol_doCE(UColTokenParser
*src
, uint32_t *CEparts
, UColToken
*tok
, UErrorCode
*status
) {
671 /* this one makes the table and stuff */
672 uint32_t noOfBytes
[3];
675 for(i
= 0; i
<3; i
++) {
676 ucol_countBytes(CEparts
[i
], noOfBytes
[i
]);
679 /* Here we have to pack CEs from parts */
684 while(2*CEi
<noOfBytes
[0] || CEi
<noOfBytes
[1] || CEi
<noOfBytes
[2]) {
686 value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
691 if(2*CEi
<noOfBytes
[0]) {
692 value
|= ((CEparts
[0]>>(32-16*(CEi
+1))) & 0xFFFF) << 16;
694 if(CEi
<noOfBytes
[1]) {
695 value
|= ((CEparts
[1]>>(32-8*(CEi
+1))) & 0xFF) << 8;
697 if(CEi
<noOfBytes
[2]) {
698 value
|= ((CEparts
[2]>>(32-8*(CEi
+1))) & 0x3F);
700 tok
->CEs
[CEi
] = value
;
703 if(CEi
== 0) { /* totally ignorable */
706 } else { /* there is at least something */
711 // we want to set case bits here and now, not later.
712 // Case bits handling
713 if(tok
->CEs
[0] != 0) { // case bits should be set only for non-ignorables
714 tok
->CEs
[0] &= 0xFFFFFF3F; // Clean the case bits field
715 int32_t cSize
= (tok
->source
& 0xFF000000) >> 24;
716 UChar
*cPoints
= (tok
->source
& 0x00FFFFFF) + src
->source
;
720 tok
->CEs
[0] |= ucol_uprv_getCaseBits(src
->UCA
, cPoints
, cSize
, status
);
722 // Copy it from the UCA
723 uint32_t caseCE
= ucol_getFirstCE(src
->UCA
, cPoints
[0], status
);
724 tok
->CEs
[0] |= (caseCE
& 0xC0);
729 fprintf(stderr
, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok
->debugSource
, tok
->strength
, CEparts
[0] >> (32-8*noOfBytes
[0]), CEparts
[1] >> (32-8*noOfBytes
[1]), CEparts
[2]>> (32-8*noOfBytes
[2]));
730 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
731 fprintf(stderr
, "%08X ", tok
->CEs
[i
]);
733 fprintf(stderr
, "\n");
/*
 * Assigns collation elements to every token in the list lh.
 *
 * Visible phases:
 *  1. Counting: starting at lh->last and following tok->previous, the
 *     per-strength counters in t[] are maintained ("going up" resets the
 *     weaker counter and increments the stronger one, "going down" restarts
 *     the weaker counter at 1) and stored in tok->toInsert.
 *  2. Gap discovery: ucol_inv_getGapPositions() fills lh->gapsLo / gapsHi /
 *     pos.
 *  3. Debug output: the fprintf calls dump the base CE, the gaps and the
 *     strength / toInsert of each token to stderr.
 *  4. Generation: CEparts[] starts from lh->baseCE / lh->baseContCE. For each
 *     token, while *status is successful:
 *       - if the token is stronger than anything seen so far (fStrength <
 *         initStrength), a usable gap level is chosen (fStrength is adjusted
 *         while lh->pos[fStrength] == -1; no usable level sets
 *         U_INTERNAL_PROGRAM_ERROR) and the generators are re-seeded from the
 *         gap via ucol_getCEGenerator(), with weaker levels restarted through
 *         ucol_getSimpleCEGenerator();
 *       - otherwise the generator of the token's own strength is advanced with
 *         ucol_getNextGenerated() and weaker levels are restarted;
 *       - ucol_doCE() then packs CEparts into the token.
 *
 * NOTE(review): the statements that advance tok in both loops, the adjustment
 * of fStrength inside the search loop and any preprocessor guard around the
 * debug output are not present in this listing - confirm.
 */
737 U_CFUNC
void ucol_initBuffers(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
738 ucolCEGenerator Gens
[UCOL_CE_STRENGTH_LIMIT
];
739 uint32_t CEparts
[UCOL_CE_STRENGTH_LIMIT
];
741 UColToken
*tok
= lh
->last
;
742 uint32_t t
[UCOL_STRENGTH_LIMIT
];
744 uprv_memset(t
, 0, UCOL_STRENGTH_LIMIT
*sizeof(uint32_t));
747 t
[tok
->strength
] = 1;
749 while(tok
->previous
!= NULL
) {
750 if(tok
->previous
->strength
< tok
->strength
) { /* going up */
751 t
[tok
->strength
] = 0;
752 t
[tok
->previous
->strength
]++;
753 } else if(tok
->previous
->strength
> tok
->strength
) { /* going down */
754 t
[tok
->previous
->strength
] = 1;
759 tok
->toInsert
= t
[tok
->strength
];
762 tok
->toInsert
= t
[tok
->strength
];
763 ucol_inv_getGapPositions(src
, lh
, status
);
766 fprintf(stderr
, "BaseCE: %08X %08X\n", lh
->baseCE
, lh
->baseContCE
);
768 for(j
= 2; j
>= 0; j
--) {
769 fprintf(stderr
, "gapsLo[%i] [%08X %08X %08X]\n", j
, lh
->gapsLo
[j
*3], lh
->gapsLo
[j
*3+1], lh
->gapsLo
[j
*3+2]);
770 fprintf(stderr
, "gapsHi[%i] [%08X %08X %08X]\n", j
, lh
->gapsHi
[j
*3], lh
->gapsHi
[j
*3+1], lh
->gapsHi
[j
*3+2]);
772 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
775 fprintf(stderr
,"%i", tok
->strength
);
777 } while(tok
!= NULL
);
778 fprintf(stderr
, "\n");
780 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
783 fprintf(stderr
,"%i", tok
->toInsert
);
785 } while(tok
!= NULL
);
789 uint32_t fStrength
= UCOL_IDENTICAL
;
790 uint32_t initStrength
= UCOL_IDENTICAL
;
793 CEparts
[UCOL_PRIMARY
] = (lh
->baseCE
& UCOL_PRIMARYMASK
) | (lh
->baseContCE
& UCOL_PRIMARYMASK
) >> 16;
794 CEparts
[UCOL_SECONDARY
] = (lh
->baseCE
& UCOL_SECONDARYMASK
) << 16 | (lh
->baseContCE
& UCOL_SECONDARYMASK
) << 8;
795 CEparts
[UCOL_TERTIARY
] = (UCOL_TERTIARYORDER(lh
->baseCE
)) << 24 | (UCOL_TERTIARYORDER(lh
->baseContCE
)) << 16;
797 while (tok
!= NULL
&& U_SUCCESS(*status
)) {
798 fStrength
= tok
->strength
;
799 if(fStrength
< initStrength
) {
800 initStrength
= fStrength
;
801 if(lh
->pos
[fStrength
] == -1) {
802 while(lh
->pos
[fStrength
] == -1 && fStrength
> 0) {
805 if(lh
->pos
[fStrength
] == -1) {
806 *status
= U_INTERNAL_PROGRAM_ERROR
;
810 if(initStrength
== UCOL_TERTIARY
) { /* starting with tertiary */
811 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
812 CEparts
[UCOL_SECONDARY
] = lh
->gapsLo
[fStrength
*3+1];
813 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
814 CEparts
[UCOL_TERTIARY
] = ucol_getCEGenerator(&Gens
[UCOL_TERTIARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
815 } else if(initStrength
== UCOL_SECONDARY
) { /* secondaries */
816 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
817 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
818 CEparts
[UCOL_SECONDARY
] = ucol_getCEGenerator(&Gens
[UCOL_SECONDARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
819 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
820 } else { /* primaries */
821 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
822 CEparts
[UCOL_PRIMARY
] = ucol_getCEGenerator(&Gens
[UCOL_PRIMARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
823 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
824 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
827 if(tok
->strength
== UCOL_TERTIARY
) {
828 CEparts
[UCOL_TERTIARY
] = ucol_getNextGenerated(&Gens
[UCOL_TERTIARY
], status
);
829 } else if(tok
->strength
== UCOL_SECONDARY
) {
830 CEparts
[UCOL_SECONDARY
] = ucol_getNextGenerated(&Gens
[UCOL_SECONDARY
], status
);
831 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
832 } else if(tok
->strength
== UCOL_PRIMARY
) {
833 CEparts
[UCOL_PRIMARY
] = ucol_getNextGenerated(&Gens
[UCOL_PRIMARY
], status
);
834 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
835 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
838 ucol_doCE(src
, CEparts
, tok
, status
);
/*
 * Turns every token of the list lh into a UCAElements record and adds it to
 * the temporary table t.
 *
 * Visible steps per token (while *status is successful):
 *  1. Expansions: tok->expansion holds a length in its top 8 bits and an
 *     offset into src->source in the low 24. The longest prefix of the
 *     remaining expansion text that is itself tailored (looked up in
 *     src->tailored and not a reset token) contributes its CEs to
 *     tok->expCEs. When no tailored subsequence is found, CEs are taken from
 *     the UCA through a collIterate starting at that offset, until
 *     UCOL_NO_MORE_CES.
 *  2. Element assembly: el.CEs receives tok->CEs followed by tok->expCEs.
 *     If tok->prefix is non-zero the prefix characters are copied into
 *     el.prefix and the remaining source characters into el.uchars; otherwise
 *     the whole source is copied into el.uchars.
 *  3. Flags: with a UCA present, a Jamo among the code units sets
 *     t->image->jamoSpecial. If src->buildCCTabFlag is not yet set and the
 *     element is non-empty, it is set when the low byte of previousFCD16()
 *     for the last character is non-zero.
 *  4. The element is added with uprv_uca_addAnElement().
 *  5. Under UCOL_DEBUG_DUPLICATES, a non-zero status is reported to stderr
 *     and reset to U_ZERO_ERROR.
 *
 * NOTE(review): the declarations of el, exp, s and order, the loop exits, the
 * advance to the next token and the matching #endif lines are not present in
 * this listing - confirm. The visible copies into tok->expCEs / el.CEs show no
 * capacity checks.
 */
843 U_CFUNC
void ucol_createElements(UColTokenParser
*src
, tempUCATable
*t
, UColTokListHeader
*lh
, UErrorCode
*status
) {
845 UColToken
*tok
= lh
->first
;
846 UColToken
*expt
= NULL
;
847 uint32_t i
= 0, j
= 0;
848 const Normalizer2Impl
*nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
850 while(tok
!= NULL
&& U_SUCCESS(*status
)) {
851 /* first, check if there are any expansions */
852 /* if there are expansions, we need to do a little bit more processing */
853 /* since parts of expansion can be tailored, while others are not */
854 if(tok
->expansion
!= 0) {
855 uint32_t len
= tok
->expansion
>> 24;
856 uint32_t currentSequenceLen
= len
;
857 uint32_t expOffset
= tok
->expansion
& 0x00FFFFFF;
858 //uint32_t exp = currentSequenceLen | expOffset;
860 exp
.source
= currentSequenceLen
| expOffset
;
861 exp
.rulesToParseHdl
= &(src
->source
);
864 currentSequenceLen
= len
;
865 while(currentSequenceLen
> 0) {
866 exp
.source
= (currentSequenceLen
<< 24) | expOffset
;
867 if((expt
= (UColToken
*)uhash_get(src
->tailored
, &exp
)) != NULL
&& expt
->strength
!= UCOL_TOK_RESET
) { /* expansion is tailored */
868 uint32_t noOfCEsToCopy
= expt
->noOfCEs
;
869 for(j
= 0; j
<noOfCEsToCopy
; j
++) {
870 tok
->expCEs
[tok
->noOfExpCEs
+ j
] = expt
->CEs
[j
];
872 tok
->noOfExpCEs
+= noOfCEsToCopy
;
873 // Smart people never try to add codepoints and CEs.
874 // For some odd reason, it won't work.
875 expOffset
+= currentSequenceLen
; //noOfCEsToCopy;
876 len
-= currentSequenceLen
; //noOfCEsToCopy;
879 currentSequenceLen
--;
882 if(currentSequenceLen
== 0) { /* couldn't find any tailored subsequence */
883 /* will have to get one from UCA */
884 /* first, get the UChars from the rules */
885 /* then pick CEs out until there is no more and stuff them into expansion */
888 uprv_init_collIterate(src
->UCA
, expOffset
+ src
->source
, 1, &s
, status
);
891 order
= ucol_getNextCE(src
->UCA
, &s
, status
);
892 if(order
== UCOL_NO_MORE_CES
) {
895 tok
->expCEs
[tok
->noOfExpCEs
++] = order
;
905 /* set the ucaelement with obtained values */
906 el
.noOfCEs
= tok
->noOfCEs
+ tok
->noOfExpCEs
;
908 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
909 el
.CEs
[i
] = tok
->CEs
[i
];
911 for(i
= 0; i
<tok
->noOfExpCEs
; i
++) {
912 el
.CEs
[i
+tok
->noOfCEs
] = tok
->expCEs
[i
];
916 // We kept prefix and source kind of together, as it is a kind of a contraction.
917 // However, now we have to slice the prefix off the main thing -
918 el
.prefix
= el
.prefixChars
;
919 el
.cPoints
= el
.uchars
;
920 if(tok
->prefix
!= 0) { // we will just copy the prefix here, and adjust accordingly in the
921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
922 // decomposed elements to the unsaf table.
923 el
.prefixSize
= tok
->prefix
>>24;
924 uprv_memcpy(el
.prefix
, src
->source
+ (tok
->prefix
& 0x00FFFFFF), el
.prefixSize
*sizeof(UChar
));
926 el
.cSize
= (tok
->source
>> 24)-(tok
->prefix
>>24);
927 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF)+(tok
->prefix
>>24) + src
->source
, el
.cSize
*sizeof(UChar
));
932 el
.cSize
= (tok
->source
>> 24);
933 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF) + src
->source
, el
.cSize
*sizeof(UChar
));
935 if(src
->UCA
!= NULL
) {
936 for(i
= 0; i
<el
.cSize
; i
++) {
937 if(UCOL_ISJAMO(el
.cPoints
[i
])) {
938 t
->image
->jamoSpecial
= TRUE
;
941 if (!src
->buildCCTabFlag
&& el
.cSize
> 0) {
942 // Check the trailing canonical combining class (tccc) of the last character.
943 const UChar
*s
= el
.cPoints
+ el
.cSize
;
944 uint16_t fcd
= nfcImpl
->previousFCD16(el
.cPoints
, s
);
945 if ((fcd
& 0xff) != 0) {
946 src
->buildCCTabFlag
= TRUE
;
951 /* and then, add it */
953 fprintf(stderr
, "Adding: %04X with %08X\n", el
.cPoints
[0], el
.CEs
[0]);
955 uprv_uca_addAnElement(t
, &el
, status
);
957 #if UCOL_DEBUG_DUPLICATES
958 if(*status
!= U_ZERO_ERROR
) {
959 fprintf(stderr
, "replaced CE for %04X with CE for %04X\n", el
.cPoints
[0], tok
->debugSource
);
960 *status
= U_ZERO_ERROR
;
/*
 * Range callback operating on the tempUCATable passed as context.
 *
 * For each code point in [start, limit), the current mapping is read from
 * t->mapping; where it is UCOL_NOT_FOUND an element is prepared with an empty
 * prefix, the code point is appended to el.uchars with U16_APPEND_UNSAFE, and
 * the element is added through uprv_uca_addAnElement() using a local
 * UErrorCode. A failure of that local status is checked afterwards.
 *
 * NOTE(review): the test on `value`, the CEs given to the element, the loop
 * increment and the returned UBool are not present in this listing;
 * presumably only ranges whose UCA value is completely ignorable are copied
 * (per the function name) - confirm.
 */
969 static UBool U_CALLCONV
970 _processUCACompleteIgnorables(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
971 UErrorCode status
= U_ZERO_ERROR
;
972 tempUCATable
*t
= (tempUCATable
*)context
;
974 while(start
< limit
) {
975 uint32_t CE
= utrie_get32(t
->mapping
, start
, NULL
);
976 if(CE
== UCOL_NOT_FOUND
) {
980 el
.prefixChars
[0] = 0;
981 el
.prefix
= el
.prefixChars
;
982 el
.cPoints
= el
.uchars
;
985 U16_APPEND_UNSAFE(el
.uchars
, el
.cSize
, start
);
989 uprv_uca_addAnElement(t
, &el
, &status
);
995 if(U_FAILURE(status
)) {
/*
 * Copies the UCA collation elements for the code points start..end
 * (inclusive) into the temporary table t, for those not already tailored.
 *
 * A code point u is copied when its entry in t->mapping is UCOL_NOT_FOUND, or
 * when that entry is a contraction-table element whose CE at offset 0 is
 * UCOL_NOT_FOUND (a contraction missing its starting element). For such a
 * code point an element with an empty prefix is built, u is appended with
 * U16_APPEND_UNSAFE, the UCA CEs are pulled through a collIterate with
 * ucol_getNextCE() until UCOL_NO_MORE_CES, and the element is added with
 * uprv_uca_addAnElement(). Nothing is done when *status is already a failure.
 *
 * The un-split lines numbered 1042-1047 appear to be disabled code that kept
 * implicit / Hangul / surrogate specials unresolved; the surrounding comments
 * explain that resolving them was found to be just as cheap.
 *
 * NOTE(review): the return type line, the status parameter declaration, the
 * resets of el.cSize / el.noOfCEs per iteration and the closing braces are
 * not present in this listing - confirm. The visible append into el.CEs shows
 * no capacity check.
 */
1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser
*src
, tempUCATable
*t
,
1005 UChar32 start
, UChar32 end
,
1008 //UChar decomp[256];
1009 uint32_t CE
= UCOL_NOT_FOUND
;
1014 el
.prefixChars
[0] = 0;
1017 if(U_SUCCESS(*status
)) {
1018 for(u
= start
; u
<=end
; u
++) {
1019 if((CE
= utrie_get32(t
->mapping
, u
, NULL
)) == UCOL_NOT_FOUND
1020 /* this test is for contractions that are missing the starting element. */
1021 || ((isCntTableElement(CE
)) &&
1022 (uprv_cnttab_getCE(t
->contractions
, CE
, 0, status
) == UCOL_NOT_FOUND
))
1026 U16_APPEND_UNSAFE(el
.uchars
, el
.cSize
, u
);
1027 //decomp[0] = (UChar)u;
1028 //el.uchars[0] = (UChar)u;
1029 el
.cPoints
= el
.uchars
;
1032 el
.prefix
= el
.prefixChars
;
1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1035 // We actually want to check whether this element is a special
1036 // If it is an implicit element (hangul, CJK - we want to copy the
1037 // special, not the resolved CEs) - for hangul, copying resolved
1038 // would just make things the same (there is an expansion and it
1039 // takes approximately the same amount of time to resolve as
1040 // falling back to the UCA).
1042 UTRIE_GET32(src->UCA->mapping, u, CE);
1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1046 || tag == LEAD_SURROGATE_TAG) {
1047 el.CEs[el.noOfCEs++] = CE;
1050 // It turns out that it does not make sense to keep implicits
1051 // unresolved. The cost of resolving them is big enough so that
1052 // it doesn't make any difference whether we have to go to the UCA
1055 uprv_init_collIterate(src
->UCA
, el
.uchars
, el
.cSize
, &colIt
, status
);
1056 while(CE
!= UCOL_NO_MORE_CES
) {
1057 CE
= ucol_getNextCE(src
->UCA
, &colIt
, status
);
1058 if(CE
!= UCOL_NO_MORE_CES
) {
1059 el
.CEs
[el
.noOfCEs
++] = CE
;
1063 uprv_uca_addAnElement(t
, &el
, status
);
1071 U_CFUNC UCATableHeader
*
1072 ucol_assembleTailoringTable(UColTokenParser
*src
, UErrorCode
*status
) {
1076 if(U_FAILURE(*status
)) {
1080 2. Eliminate the negative lists by doing the following for each non-null negative list:
1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1082 create new ListHeader X
1083 o reverse the list, add to the end of X's positive list. Reset the strength of the
1084 first item you add, based on the stronger strength levels of the two lists.
1087 3. For each ListHeader with a non-null positive list:
1090 o Find all character strings with CEs between the baseCE and the
1091 next/previous CE, at the strength of the first token. Add these to the
1093 - That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1094 tailoring has & x < z...
1095 - Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1097 /* It is possible that this part should be done even while constructing list */
1098 /* The problem is that it is unknown what is going to be the strongest weight */
1099 /* So we might as well do it here */
1102 o Allocate CEs for each token in the list, based on the total number N of the
1103 largest level difference, and the gap G between baseCE and nextCE at that
1104 level. The relation * between the last item and nextCE is the same as the
1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1107 - There are 3 primary items: a, d, e. Fit them into the primary gap.
1108 Then fit b and c into the secondary gap between a and d, then fit q
1109 into the tertiary gap between b and c.
1111 o Example: baseCE << b <<< q << c * nextCE(X,2)
1112 - There are 2 secondary items: b, c. Fit them into the secondary gap.
1113 Then fit q into the tertiary gap between b and c.
1114 o When incrementing primary values, we will not cross high byte
1115 boundaries except where there is only a single-byte primary. That is to
1116 ensure that the script reordering will continue to work.
1118 UCATableHeader
*image
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
1120 if (image
== NULL
) {
1121 *status
= U_MEMORY_ALLOCATION_ERROR
;
1124 uprv_memcpy(image
, src
->UCA
->image
, sizeof(UCATableHeader
));
1126 for(i
= 0; i
<src
->resultLen
; i
++) {
1127 /* now we need to generate the CEs */
1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1129 /* According to strength */
1130 if(U_SUCCESS(*status
)) {
1131 if(src
->lh
[i
].first
) { // if there are any elements
1132 // due to the way parser works, subsequent tailorings
1133 // may remove all the elements from a sequence, therefore
1134 // leaving an empty tailoring sequence.
1135 ucol_initBuffers(src
, &src
->lh
[i
], status
);
1138 if(U_FAILURE(*status
)) {
1144 if(src
->varTop
!= NULL
) { /* stuff the variable top value */
1145 src
->opts
->variableTopValue
= (*(src
->varTop
->CEs
))>>16;
1146 /* remove it from the list */
1147 if(src
->varTop
->listHeader
->first
== src
->varTop
) { /* first in list */
1148 src
->varTop
->listHeader
->first
= src
->varTop
->next
;
1150 if(src
->varTop
->listHeader
->last
== src
->varTop
) { /* last in list */
1151 src
->varTop
->listHeader
->last
= src
->varTop
->previous
;
1153 if(src
->varTop
->next
!= NULL
) {
1154 src
->varTop
->next
->previous
= src
->varTop
->previous
;
1156 if(src
->varTop
->previous
!= NULL
) {
1157 src
->varTop
->previous
->next
= src
->varTop
->next
;
1162 tempUCATable
*t
= uprv_uca_initTempTable(image
, src
->opts
, src
->UCA
, NOT_FOUND_TAG
, NOT_FOUND_TAG
, status
);
1163 if(U_FAILURE(*status
)) {
1169 /* After this, we have assigned CE values to all regular CEs */
1170 /* now we will go through list once more and resolve expansions, */
1171 /* make UCAElements structs and add them to table */
1172 for(i
= 0; i
<src
->resultLen
; i
++) {
1173 /* now we need to generate the CEs */
1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1175 /* According to strength */
1176 if(U_SUCCESS(*status
)) {
1177 ucol_createElements(src
, t
, &src
->lh
[i
], status
);
1184 el
.prefixChars
[0] = 0;
1186 /* add latin-1 stuff */
1187 ucol_uprv_bld_copyRangeFromUCA(src
, t
, 0, 0xFF, status
);
1189 /* add stuff for copying */
1190 if(src
->copySet
!= NULL
) {
1192 UnicodeSet
*set
= (UnicodeSet
*)src
->copySet
;
1193 for(i
= 0; i
< set
->getRangeCount(); i
++) {
1194 ucol_uprv_bld_copyRangeFromUCA(src
, t
, set
->getRangeStart(i
), set
->getRangeEnd(i
), status
);
1198 if(U_SUCCESS(*status
)) {
1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1201 uint32_t tailoredCE
= UCOL_NOT_FOUND
;
1202 UChar
*conts
= (UChar
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->contractionUCACombos
);
1203 int32_t maxUCAContractionLength
= src
->UCA
->image
->contractionUCACombosWidth
;
1204 UCollationElements
*ucaEl
= ucol_openElements(src
->UCA
, NULL
, 0, status
);
1205 // Check for null pointer
1206 if (ucaEl
== NULL
) {
1207 *status
= U_MEMORY_ALLOCATION_ERROR
;
1210 while(*conts
!= 0) {
1211 // A contraction is NUL-terminated and NUL-padded
1212 // except if it has the maximum length.
1213 int32_t contractionLength
= maxUCAContractionLength
;
1214 while(contractionLength
> 0 && conts
[contractionLength
- 1] == 0) {
1215 --contractionLength
;
1218 int32_t firstLength
= 0;
1219 U16_NEXT(conts
, firstLength
, contractionLength
, first
);
1220 tailoredCE
= utrie_get32(t
->mapping
, first
, NULL
);
1221 if(tailoredCE
!= UCOL_NOT_FOUND
) {
1222 UBool needToAdd
= TRUE
;
1223 if(isCntTableElement(tailoredCE
)) {
1224 if(uprv_cnttab_isTailored(t
->contractions
, tailoredCE
, conts
+firstLength
, status
) == TRUE
) {
1228 if (!needToAdd
&& isPrefix(tailoredCE
) && *(conts
+1)==0) {
1230 elm
.cPoints
= el
.uchars
;
1232 elm
.uchars
[0] = *conts
;
1235 elm
.prefixChars
[0] = *(conts
+2);
1237 elm
.prefix
= elm
.prefixChars
;
1239 UCAElements
*prefixEnt
=(UCAElements
*)uhash_get(t
->prefixLookup
, &elm
);
1240 if ((prefixEnt
==NULL
) || *(prefixEnt
->prefix
)!=*(conts
+2)) {
1244 if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, first
)) {
1248 if(needToAdd
== TRUE
) { // we need to add if this contraction is not tailored.
1249 if (*(conts
+1) != 0) { // contractions
1250 el
.prefix
= el
.prefixChars
;
1252 el
.cPoints
= el
.uchars
;
1254 u_memcpy(el
.uchars
, conts
, contractionLength
);
1255 el
.cSize
= contractionLength
;
1256 ucol_setText(ucaEl
, el
.uchars
, el
.cSize
, status
);
1258 else { // pre-context character
1259 UChar str
[4] = { 0 };
1261 int32_t preKeyLen
=0;
1263 el
.cPoints
= el
.uchars
;
1265 el
.uchars
[0] = *conts
;
1268 el
.prefixChars
[0] = *(conts
+2);
1269 el
.prefix
= el
.prefixChars
;
1271 if (el
.prefixChars
[0]!=0) {
1272 // get CE of prefix character first
1273 str
[0]=el
.prefixChars
[0];
1275 ucol_setText(ucaEl
, str
, 1, status
);
1276 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
))
1277 != UCOL_NULLORDER
) {
1278 preKeyLen
++; // count number of keys for prefix character
1280 str
[len
++] = el
.prefixChars
[0];
1283 str
[len
++] = el
.uchars
[0];
1285 ucol_setText(ucaEl
, str
, len
, status
);
1286 // Skip the keys for prefix character, then copy the rest to el.
1287 while ((preKeyLen
-->0) &&
1288 (int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1293 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1296 uprv_uca_addAnElement(t
, &el
, status
);
1299 } else if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, first
)) {
1300 ucol_uprv_bld_copyRangeFromUCA(src
, t
, first
, first
, status
);
1302 conts
+=maxUCAContractionLength
;
1304 ucol_closeElements(ucaEl
);
1307 // Add completely ignorable elements
1308 utrie_enum(&t
->UCA
->mapping
, NULL
, _processUCACompleteIgnorables
, t
);
1310 // add tailoring characters related canonical closures
1311 uprv_uca_canonicalClosure(t
, src
, NULL
, status
);
1313 /* still need to produce compatibility closure */
1315 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
1317 uprv_uca_closeTempTable(t
);
1324 static UBool U_CALLCONV
1325 ucol_bld_cleanup(void)
1327 udata_close(invUCA_DATA_MEM
);
1328 invUCA_DATA_MEM
= NULL
;
1329 _staticInvUCA
= NULL
;
1334 U_CAPI
const InverseUCATableHeader
* U_EXPORT2
1335 ucol_initInverseUCA(UErrorCode
*status
)
1337 if(U_FAILURE(*status
)) return NULL
;
1340 UMTX_CHECK(NULL
, (_staticInvUCA
== NULL
), needsInit
);
1343 InverseUCATableHeader
*newInvUCA
= NULL
;
1344 UDataMemory
*result
= udata_openChoice(U_ICUDATA_COLL
, INVC_DATA_TYPE
, INVC_DATA_NAME
, isAcceptableInvUCA
, NULL
, status
);
1346 if(U_FAILURE(*status
)) {
1348 udata_close(result
);
1350 // This is not needed, as we are talking about
1351 // memory we got from UData
1352 //uprv_free(newInvUCA);
1355 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
1356 newInvUCA
= (InverseUCATableHeader
*)udata_getMemory(result
);
1357 UCollator
*UCA
= ucol_initUCA(status
);
1358 // UCA versions of UCA and inverse UCA should match
1359 if(uprv_memcmp(newInvUCA
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0) {
1360 *status
= U_INVALID_FORMAT_ERROR
;
1361 udata_close(result
);
1366 if(_staticInvUCA
== NULL
) {
1367 invUCA_DATA_MEM
= result
;
1368 _staticInvUCA
= newInvUCA
;
1374 if(newInvUCA
!= NULL
) {
1375 udata_close(result
);
1376 // This is not needed, as we are talking about
1377 // memory we got from UData
1378 //uprv_free(newInvUCA);
1381 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD
, ucol_bld_cleanup
);
1385 return _staticInvUCA
;
1388 /* This is the data that is used for non-script reordering codes. These _must_ be kept
1389 * in the order in which they are to be applied as defaults, and in sync with the UColReorderCode enum.
1391 static const char* ReorderingTokenNames
[] = {
    /* ucol_findReorderingEntry() scans this table until it reaches a NULL entry and
     * returns the index of the matching name plus UCOL_REORDER_CODE_FIRST, so the
     * table has to end with NULL and the position of each name is significant. */
1400 static void toUpper(const char* src
, char* dst
, uint32_t length
) {
1401 for (uint32_t i
= 0; *src
!= '\0' && i
< length
- 1; ++src
, ++dst
, ++i
) {
1402 *dst
= uprv_toupper(*src
);
1407 U_INTERNAL
int32_t U_EXPORT2
1408 ucol_findReorderingEntry(const char* name
) {
1410 toUpper(name
, buffer
, 32);
1411 for (uint32_t entry
= 0; ReorderingTokenNames
[entry
] != NULL
; entry
++) {
1412 if (uprv_strcmp(buffer
, ReorderingTokenNames
[entry
]) == 0) {
1413 return entry
+ UCOL_REORDER_CODE_FIRST
;
1416 return USCRIPT_INVALID_CODE
;
1419 #endif /* #if !UCONFIG_NO_COLLATION */