2 *******************************************************************************
4 * Copyright (C) 2001-2012, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucol_bld.cpp
10 * tab size: 8 (not used)
14 * created by: Vladimir Weinstein
16 * This module builds a collator based on the rule set.
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_COLLATION
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "unicode/uscript.h"
29 #include "unicode/ustring.h"
30 #include "unicode/utf16.h"
31 #include "normalizer2impl.h"
/* Number of elements in a true array (not a pointer), converted to int32_t.
 * The whole expansion is parenthesized so that it stays a single operand
 * next to any neighbouring operator, including sizeof and postfix operators. */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
/* Two file-scope pointers, both initialised to NULL.
 * NOTE(review): judging by the names they hold the loaded inverse-UCA table
 * header and the UDataMemory backing it; the code that assigns them is not
 * part of this listing - confirm. */
42 static const InverseUCATableHeader
* _staticInvUCA
= NULL
;
43 static UDataMemory
* invUCA_DATA_MEM
= NULL
;
/* Acceptance check for a data item described by pInfo.
 * The visible condition requires: size >= 20, endianness equal to
 * U_IS_BIG_ENDIAN, charset family equal to U_CHARSET_FAMILY, the four
 * dataFormat bytes equal to INVUCA_DATA_FORMAT_0..3, formatVersion[0] equal
 * to INVUCA_FORMAT_VERSION_0 and formatVersion[1] at least
 * INVUCA_FORMAT_VERSION_1.
 * When that holds, the result is whether the first two dataVersion fields
 * equal the first two fields reported by u_getUnicodeVersion().
 * The context, type and name parameters are unused.
 * NOTE(review): the braces and the rejecting branch are on lines missing from
 * this listing; presumably that branch returns FALSE - confirm. */
46 static UBool U_CALLCONV
47 isAcceptableInvUCA(void * /*context*/,
48 const char * /*type*/, const char * /*name*/,
49 const UDataInfo
*pInfo
)
51 /* context, type & name are intentionally not used */
52 if( pInfo
->size
>=20 &&
53 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
54 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
55 pInfo
->dataFormat
[0]==INVUCA_DATA_FORMAT_0
&& /* dataFormat="InvC" */
56 pInfo
->dataFormat
[1]==INVUCA_DATA_FORMAT_1
&&
57 pInfo
->dataFormat
[2]==INVUCA_DATA_FORMAT_2
&&
58 pInfo
->dataFormat
[3]==INVUCA_DATA_FORMAT_3
&&
59 pInfo
->formatVersion
[0]==INVUCA_FORMAT_VERSION_0
&&
60 pInfo
->formatVersion
[1]>=INVUCA_FORMAT_VERSION_1
//&&
61 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
62 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
63 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
/* Only dataVersion[0] and dataVersion[1] are compared with the Unicode
 * version; the stricter field checks further down are commented out. */
66 UVersionInfo UCDVersion
;
67 u_getUnicodeVersion(UCDVersion
);
68 return (pInfo
->dataVersion
[0]==UCDVersion
[0] &&
69 pInfo
->dataVersion
[1]==UCDVersion
[1]);
70 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
71 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
72 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
80 * Takes two CEs (lead and continuation) and
81 * compares them as CEs should be compared:
82 * primary vs. primary, secondary vs. secondary
83 * tertiary vs. tertiary
/* Compares the CE pair (source0, source1) with the pair (target0, target1).
 * source1 and target1 are each tested with isContinuation(); the assignments
 * to s2 and t2 that depend on those tests are on lines missing from this
 * listing.
 * Comparison keys are built in this order:
 *   1. top 16 bits of the first word followed by top 16 bits of the second
 *   2. bits 8..15 of both words
 *   3. low 8 bits of both words
 * NOTE(review): the return statements are not visible; presumably the result
 * is negative, zero or positive in comparator fashion - confirm. */
85 static int32_t compareCEs(uint32_t source0
, uint32_t source1
, uint32_t target0
, uint32_t target1
) {
86 uint32_t s1
= source0
, s2
, t1
= target0
, t2
;
87 if(isContinuation(source1
)) {
92 if(isContinuation(target1
)) {
98 uint32_t s
= 0, t
= 0;
99 if(s1
== t1
&& s2
== t2
) {
/* First key: high halves of the lead word and of the second word. */
102 s
= (s1
& 0xFFFF0000)|((s2
& 0xFFFF0000)>>16);
103 t
= (t1
& 0xFFFF0000)|((t2
& 0xFFFF0000)>>16);
/* Second key: >> binds tighter than |, so only the s2 / t2 term is shifted
 * down by 8 while the s1 / t1 term stays in bits 8..15. */
109 s
= (s1
& 0x0000FF00) | (s2
& 0x0000FF00)>>8;
110 t
= (t1
& 0x0000FF00) | (t2
& 0x0000FF00)>>8;
/* Third key: low byte of the lead word above the low byte of the second. */
116 s
= (s1
& 0x000000FF)<<8 | (s2
& 0x000000FF);
117 t
= (t1
& 0x000000FF)<<8 | (t2
& 0x000000FF);
/* Searches the inverse table of src->invUCA for the pair (CE, SecondCE).
 * The table starts src->invUCA->table bytes after the header and is read as
 * 32-bit words, three words per entry; words 0 and 1 of entry i are compared
 * with the target through compareCEs() while bottom < top-1.
 * NOTE(review): the declarations of i and res, the updates of bottom and top
 * and the return statement are on lines missing from this listing.
 * NOTE(review): top is unsigned, so a tableSize of 0 would make top-1 wrap
 * around to the largest uint32_t value. */
128 int32_t ucol_inv_findCE(const UColTokenParser
*src
, uint32_t CE
, uint32_t SecondCE
) {
129 uint32_t bottom
= 0, top
= src
->invUCA
->tableSize
;
131 uint32_t first
= 0, second
= 0;
132 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
135 while(bottom
< top
-1) {
137 first
= *(CETable
+3*i
);
138 second
= *(CETable
+3*i
+1);
139 res
= compareCEs(first
, second
, CE
, SecondCE
);
150 /* in searching for elements, I have removed the failure */
151 /* The reason for this is that the builder does not rely */
152 /* on search mechanism telling it that it didn't find an */
153 /* element. However, indirect positioning relies on being */
154 /* able to find the elements around any CE, even if it is */
155 /* not defined in the UCA. */
/* NOTE(review): given the remark above, the exact-match test below may be
 * disabled code whose comment delimiters were lost in this listing. */
158 if((first == CE && second == SecondCE)) {
/* Table of UCOL_CE_STRENGTH_LIMIT uint32_t masks, indexed by strength level
 * and ANDed with CE words elsewhere in this file.
 * NOTE(review): the initializer values are not visible in this listing. */
166 static const uint32_t strengthMask
[UCOL_CE_STRENGTH_LIMIT
] = {
/* Finds the entry after (CE, contCE) in the inverse table that differs from
 * it under strengthMask[strength].
 * The start index comes from ucol_inv_findCE(); CE and contCE are then
 * masked, and the table (three 32-bit words per entry) is walked forward,
 * storing words 0 and 1 of each entry in *nextCE and *nextContCE, until the
 * masked pair no longer equals the masked input.
 * *nextCE is set to UCOL_NOT_FOUND on a path whose condition is not visible
 * here (presumably a failed lookup - confirm).
 * NOTE(review): the strength parameter declaration, the declaration of iCE,
 * the initial store to *nextCE and the return are on lines missing from this
 * listing.
 * NOTE(review): the forward walk shows no upper bound on iCE. */
172 U_CAPI
int32_t U_EXPORT2
ucol_inv_getNextCE(const UColTokenParser
*src
,
173 uint32_t CE
, uint32_t contCE
,
174 uint32_t *nextCE
, uint32_t *nextContCE
,
177 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
180 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
183 *nextCE
= UCOL_NOT_FOUND
;
/* From here on CE and contCE hold only the bits relevant at this strength. */
187 CE
&= strengthMask
[strength
];
188 contCE
&= strengthMask
[strength
];
191 *nextContCE
= contCE
;
193 while((*nextCE
& strengthMask
[strength
]) == CE
194 && (*nextContCE
& strengthMask
[strength
]) == contCE
)
196 *nextCE
= (*(CETable
+3*(++iCE
)));
197 *nextContCE
= (*(CETable
+3*(iCE
)+1));
/* Finds the entry before (CE, contCE) in the inverse table that differs from
 * it under strengthMask[strength].
 * The start index comes from ucol_inv_findCE(); CE and contCE are then
 * masked, and the table (three 32-bit words per entry) is walked backward,
 * storing words 0 and 1 of each entry in *prevCE and *prevContCE, until the
 * masked pair no longer equals the masked input or iCE reaches 0.
 * *prevCE is set to UCOL_NOT_FOUND on a path whose condition is not visible
 * here (presumably a failed lookup - confirm).
 * NOTE(review): the strength parameter declaration, the declaration of iCE,
 * the initial store to *prevCE and the return are on lines missing from this
 * listing. */
203 U_CFUNC
int32_t U_EXPORT2
ucol_inv_getPrevCE(const UColTokenParser
*src
,
204 uint32_t CE
, uint32_t contCE
,
205 uint32_t *prevCE
, uint32_t *prevContCE
,
208 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
211 iCE
= ucol_inv_findCE(src
, CE
, contCE
);
214 *prevCE
= UCOL_NOT_FOUND
;
/* From here on CE and contCE hold only the bits relevant at this strength. */
218 CE
&= strengthMask
[strength
];
219 contCE
&= strengthMask
[strength
];
222 *prevContCE
= contCE
;
224 while((*prevCE
& strengthMask
[strength
]) == CE
225 && (*prevContCE
& strengthMask
[strength
])== contCE
226 && iCE
> 0) /* this condition should prevent falling off the edge of the world */
228 /* here, we end up in a singularity - zero */
229 *prevCE
= (*(CETable
+3*(--iCE
)));
230 *prevContCE
= (*(CETable
+3*(iCE
)+1));
/* Reports the strongest level at which (CE, contCE) and (prevCE, prevContCE)
 * differ.
 * Identical pairs give UCOL_IDENTICAL. The pairs are then compared under the
 * primary mask, then under the secondary mask (a difference there gives
 * UCOL_SECONDARY); when neither differs the result is UCOL_TERTIARY.
 * NOTE(review): the body of the primary-mask branch is on a line missing from
 * this listing; presumably it returns UCOL_PRIMARY - confirm. */
236 U_CFUNC
uint32_t U_EXPORT2
ucol_getCEStrengthDifference(uint32_t CE
, uint32_t contCE
,
237 uint32_t prevCE
, uint32_t prevContCE
)
239 if(prevCE
== CE
&& prevContCE
== contCE
) {
240 return UCOL_IDENTICAL
;
242 if((prevCE
& strengthMask
[UCOL_PRIMARY
]) != (CE
& strengthMask
[UCOL_PRIMARY
])
243 || (prevContCE
& strengthMask
[UCOL_PRIMARY
]) != (contCE
& strengthMask
[UCOL_PRIMARY
]))
247 if((prevCE
& strengthMask
[UCOL_SECONDARY
]) != (CE
& strengthMask
[UCOL_SECONDARY
])
248 || (prevContCE
& strengthMask
[UCOL_SECONDARY
]) != (contCE
& strengthMask
[UCOL_SECONDARY
]))
250 return UCOL_SECONDARY
;
252 return UCOL_TERTIARY
;
/* List-header form of the backward walk: starts from lh->baseCE and
 * lh->baseContCE, locates them with ucol_inv_findCE(), masks them with
 * strengthMask[strength] and steps back through the inverse table while the
 * masked pair still matches. The entry reached is stored in lh->previousCE
 * and lh->previousContCE.
 * NOTE(review): the declaration of iCE, the initial value of previousCE and
 * the return are on lines missing from this listing.
 * NOTE(review): the loop as shown has no lower bound on iCE. */
257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
259 uint32_t CE = lh->baseCE;
260 uint32_t SecondCE = lh->baseContCE;
262 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
263 uint32_t previousCE, previousContCE;
266 iCE = ucol_inv_findCE(src, CE, SecondCE);
272 CE &= strengthMask[strength];
273 SecondCE &= strengthMask[strength];
276 previousContCE = SecondCE;
278 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
279 previousCE = (*(CETable+3*(--iCE)));
280 previousContCE = (*(CETable+3*(iCE)+1));
282 lh->previousCE = previousCE;
283 lh->previousContCE = previousContCE;
/* List-header form of the forward walk: starts from lh->baseCE and
 * lh->baseContCE, locates them with ucol_inv_findCE(), masks them with
 * strengthMask[strength] and steps forward through the inverse table (three
 * 32-bit words per entry) while the masked pair still matches. The second
 * word of the entry reached is stored in lh->nextContCE.
 * NOTE(review): the declaration of iCE, the initial value of nextCE, the
 * store of the first word (presumably into lh->nextCE) and the return are on
 * lines missing from this listing.
 * NOTE(review): the loop as shown has no upper bound on iCE. */
289 inline int32_t ucol_inv_getNext(UColTokenParser
*src
, UColTokListHeader
*lh
, uint32_t strength
) {
290 uint32_t CE
= lh
->baseCE
;
291 uint32_t SecondCE
= lh
->baseContCE
;
293 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
294 uint32_t nextCE
, nextContCE
;
297 iCE
= ucol_inv_findCE(src
, CE
, SecondCE
);
303 CE
&= strengthMask
[strength
];
304 SecondCE
&= strengthMask
[strength
];
307 nextContCE
= SecondCE
;
309 while((nextCE
& strengthMask
[strength
]) == CE
310 && (nextContCE
& strengthMask
[strength
]) == SecondCE
)
312 nextCE
= (*(CETable
+3*(++iCE
)));
313 nextContCE
= (*(CETable
+3*(iCE
)+1));
317 lh
->nextContCE
= nextContCE
;
/* Fills lh->gapsLo and lh->gapsHi (three words per strength level: primary,
 * secondary, tertiary weights) together with lh->fStrToken, lh->lStrToken and
 * lh->pos for the token list of lh.
 * After resetting those arrays, three cases are visible:
 *  1. the top byte of lh->baseCE lies inside the implicit primary range taken
 *     from the UCA constants: the low gap is the base CE and the high gap is
 *     built from the next implicit primary (raw value plus one);
 *  2. lh->indirect is TRUE and lh->nextCE is non-zero: the low gap is the
 *     base CE and the high gap uses lh->nextContCE;
 *  3. otherwise: for each run of tokens, ucol_inv_getNext() supplies the
 *     table position per strength, adjacent levels with the same position are
 *     merged, and the gaps are read from the inverse table at lh->pos[st].
 * *status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns
 * a negative position.
 * NOTE(review): declarations of i, st, pos, t1 and t2, several assignments to
 * t1, the advance of tok and most braces are on lines missing from this
 * listing. */
322 static void ucol_inv_getGapPositions(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
323 /* reset all the gaps */
325 uint32_t *CETable
= (uint32_t *)((uint8_t *)src
->invUCA
+src
->invUCA
->table
);
330 UColToken
*tok
= lh
->first
;
331 uint32_t tokStrength
= tok
->strength
;
333 for(i
= 0; i
<3; i
++) {
335 lh
->gapsHi
[3*i
+1] = 0;
336 lh
->gapsHi
[3*i
+2] = 0;
338 lh
->gapsLo
[3*i
+1] = 0;
339 lh
->gapsLo
[3*i
+2] = 0;
341 lh
->fStrToken
[i
] = NULL
;
342 lh
->lStrToken
[i
] = NULL
;
/* The UCA constants block sits UCAConsts bytes after the UCA image header. */
346 UCAConstants
*consts
= (UCAConstants
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->UCAConsts
);
348 if((lh
->baseCE
& 0xFF000000)>= (consts
->UCA_PRIMARY_IMPLICIT_MIN
<<24) && (lh
->baseCE
& 0xFF000000) <= (consts
->UCA_PRIMARY_IMPLICIT_MAX
<<24) ) { /* implicits - */
349 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
352 t2
= lh
->baseContCE
& UCOL_REMOVE_CONTINUATION
;
353 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
354 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
355 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
/* High bound for implicits: convert the primary to its raw value, add one,
 * and convert back to an implicit primary. */
356 uint32_t primaryCE
= (t1
& UCOL_PRIMARYMASK
) | ((t2
& UCOL_PRIMARYMASK
) >> 16);
357 primaryCE
= uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE
)+1);
359 t1
= (primaryCE
& UCOL_PRIMARYMASK
) | 0x0505;
360 t2
= (primaryCE
<< 16) & UCOL_PRIMARYMASK
; // | UCOL_CONTINUATION_MARKER;
362 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
363 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
364 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
365 } else if(lh
->indirect
== TRUE
&& lh
->nextCE
!= 0) {
366 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
369 t2
= lh
->baseContCE
&UCOL_REMOVE_CONTINUATION
;
370 lh
->gapsLo
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
371 lh
->gapsLo
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
372 lh
->gapsLo
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
374 t2
= lh
->nextContCE
&UCOL_REMOVE_CONTINUATION
;
375 lh
->gapsHi
[0] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
376 lh
->gapsHi
[1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
377 lh
->gapsHi
[2] = (UCOL_TERTIARYORDER(t1
)) << 24 | (UCOL_TERTIARYORDER(t2
)) << 16;
/* General case: record, per strength, the table position after the base CE
 * and the first and last token of that strength. */
380 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
381 if((lh
->pos
[tokStrength
] = ucol_inv_getNext(src
, lh
, tokStrength
)) >= 0) {
382 lh
->fStrToken
[tokStrength
] = tok
;
383 } else { /* The CE must be implicit, since it's not in the table */
385 *status
= U_INTERNAL_PROGRAM_ERROR
;
389 while(tok
!= NULL
&& tok
->strength
>= tokStrength
) {
390 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
) {
391 lh
->lStrToken
[tokStrength
] = tok
;
395 if(tokStrength
< UCOL_CE_STRENGTH_LIMIT
-1) {
396 /* check if previous interval is the same and merge the intervals if it is so */
397 if(lh
->pos
[tokStrength
] == lh
->pos
[tokStrength
+1]) {
398 lh
->fStrToken
[tokStrength
] = lh
->fStrToken
[tokStrength
+1];
399 lh
->fStrToken
[tokStrength
+1] = NULL
;
400 lh
->lStrToken
[tokStrength
+1] = NULL
;
401 lh
->pos
[tokStrength
+1] = -1;
405 tokStrength
= tok
->strength
;
/* For every strength with a valid position, take the high gap from the
 * inverse table entry at that position. */
410 for(st
= 0; st
< 3; st
++) {
411 if((pos
= lh
->pos
[st
]) >= 0) {
412 t1
= *(CETable
+3*(pos
));
413 t2
= *(CETable
+3*(pos
)+1);
414 lh
->gapsHi
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
415 lh
->gapsHi
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
416 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
417 lh
->gapsHi
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
419 //t1 = *(CETable+3*(pos));
420 //t2 = *(CETable+3*(pos)+1);
/* NOTE(review): the lines that reload t1 and t2 before the low gap is
 * written are missing from this listing. */
423 lh
->gapsLo
[3*st
] = (t1
& UCOL_PRIMARYMASK
) | (t2
& UCOL_PRIMARYMASK
) >> 16;
424 lh
->gapsLo
[3*st
+1] = (t1
& UCOL_SECONDARYMASK
) << 16 | (t2
& UCOL_SECONDARYMASK
) << 8;
425 lh
->gapsLo
[3*st
+2] = (t1
&0x3f) << 24 | (t2
&0x3f) << 16;
/* Statement macro over (value, noOfBytes). It declares a local uint32_t mask
 * starting at 0xFFFFFFFF and tests value against it.
 * NOTE(review): the loop, the update of noOfBytes and the change of mask are
 * on continuation lines missing from this listing. From its use in ucol_doCE
 * it presumably counts the bytes of value up to the last non-zero one -
 * confirm. */
432 #define ucol_countBytes(value, noOfBytes) \
434 uint32_t mask = 0xFFFFFFFF; \
437 if(((value) & mask) != 0) { \
/* Advances generator g: when *status holds a success code, g->current is
 * replaced by the next weight from ucol_nextWeight(g->ranges, &g->noOfRanges).
 * NOTE(review): the return statement is on a line missing from this listing;
 * presumably g->current is returned - confirm. */
444 static uint32_t ucol_getNextGenerated(ucolCEGenerator
*g
, UErrorCode
*status
) {
445 if(U_SUCCESS(*status
)) {
446 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/* Seeds generator g for a secondary or tertiary level that starts from the
 * common weight.
 * maxByte is 0x3F for UCOL_TERTIARY and 0xFF otherwise. For UCOL_SECONDARY
 * the low bound is UCOL_COMMON_TOP2<<24 with 0xFF - UCOL_COMMON_TOP2 weights;
 * otherwise the low bound is UCOL_BYTE_COMMON<<24 with
 * 0x40 - UCOL_BYTE_COMMON weights. When the next token has the same strength,
 * count is replaced by tok->next->toInsert.
 * The ranges come from ucol_allocWeights(); g->current is set to
 * UCOL_BYTE_COMMON<<24, and zero ranges sets U_INTERNAL_PROGRAM_ERROR.
 * NOTE(review): the assignments to high and the return are on lines missing
 * from this listing. */
451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator
*g
, UColToken
*tok
, uint32_t strength
, UErrorCode
*status
) {
452 /* TODO: rename to enum names */
453 uint32_t high
, low
, count
=1;
454 uint32_t maxByte
= (strength
== UCOL_TERTIARY
)?0x3F:0xFF;
456 if(strength
== UCOL_SECONDARY
) {
457 low
= UCOL_COMMON_TOP2
<<24;
459 count
= 0xFF - UCOL_COMMON_TOP2
;
461 low
= UCOL_BYTE_COMMON
<< 24; //0x05000000;
463 count
= 0x40 - UCOL_BYTE_COMMON
;
466 if(tok
->next
!= NULL
&& tok
->next
->strength
== strength
) {
467 count
= tok
->next
->toInsert
;
470 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
471 g
->current
= UCOL_BYTE_COMMON
<<24;
473 if(g
->noOfRanges
== 0) {
474 *status
= U_INTERNAL_PROGRAM_ERROR
;
/* Seeds generator g for the strength of tok from the gap bounds found at
 * index fStrength*3+strength of lows and highs, requesting tok->toInsert
 * weights from ucol_allocWeights().
 * Adjustments visible before the allocation:
 *  - when low >= high at a strength weaker than primary, other slots
 *    lows[fStrength*3+s] and highs[fStrength*3+s] are examined and low may be
 *    raised to UCOL_COMMON_TOP2<<24 (secondary) or UCOL_COMMON_BOT3<<24;
 *    U_INTERNAL_PROGRAM_ERROR is set on a path not fully visible here;
 *  - a low bound under 0x02000000 is handled specially, since weight byte 02
 *    must not be used;
 *  - for secondaries, a low or high bound inside the range from
 *    UCOL_COMMON_BOT2<<24 up to UCOL_COMMON_TOP2<<24 is moved to
 *    UCOL_COMMON_TOP2<<24, and a low bound under UCOL_COMMON_BOT2<<24 makes
 *    the allocation start at UCOL_BYTE_UNSHIFTED_MIN<<24.
 * g->current is loaded from ucol_nextWeight(); zero ranges sets
 * U_INTERNAL_PROGRAM_ERROR.
 * NOTE(review): the maxByte assignments, the loop over s, several braces and
 * the return are on lines missing from this listing. */
479 static uint32_t ucol_getCEGenerator(ucolCEGenerator
*g
, uint32_t* lows
, uint32_t* highs
, UColToken
*tok
, uint32_t fStrength
, UErrorCode
*status
) {
480 uint32_t strength
= tok
->strength
;
481 uint32_t low
= lows
[fStrength
*3+strength
];
482 uint32_t high
= highs
[fStrength
*3+strength
];
483 uint32_t maxByte
= 0;
484 if(strength
== UCOL_TERTIARY
) {
486 } else if(strength
== UCOL_PRIMARY
) {
492 uint32_t count
= tok
->toInsert
;
494 if(low
>= high
&& strength
> UCOL_PRIMARY
) {
495 int32_t s
= strength
;
498 if(lows
[fStrength
*3+s
] != highs
[fStrength
*3+s
]) {
499 if(strength
== UCOL_SECONDARY
) {
500 if (low
< UCOL_COMMON_TOP2
<<24 ) {
501 // Override if low range is less than UCOL_COMMON_TOP2.
502 low
= UCOL_COMMON_TOP2
<<24;
506 // Override if low range is less than UCOL_COMMON_BOT3.
507 if ( low
< UCOL_COMMON_BOT3
<<24 ) {
508 low
= UCOL_COMMON_BOT3
<<24;
515 *status
= U_INTERNAL_PROGRAM_ERROR
;
521 if(low
< 0x02000000) {
522 // We must not use CE weight byte 02, so we set it as the minimum lower bound.
523 // See http://site.icu-project.org/design/collation/bytes
527 if(strength
== UCOL_SECONDARY
) { /* similar as simple */
528 if(low
>= (UCOL_COMMON_BOT2
<<24) && low
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
529 low
= UCOL_COMMON_TOP2
<<24;
531 if(high
> (UCOL_COMMON_BOT2
<<24) && high
< (uint32_t)(UCOL_COMMON_TOP2
<<24)) {
532 high
= UCOL_COMMON_TOP2
<<24;
534 if(low
< (UCOL_COMMON_BOT2
<<24)) {
535 g
->noOfRanges
= ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN
<<24, high
, count
, maxByte
, g
->ranges
);
536 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
537 //g->current = UCOL_COMMON_BOT2<<24;
542 g
->noOfRanges
= ucol_allocWeights(low
, high
, count
, maxByte
, g
->ranges
);
543 if(g
->noOfRanges
== 0) {
544 *status
= U_INTERNAL_PROGRAM_ERROR
;
546 g
->current
= ucol_nextWeight(g
->ranges
, &g
->noOfRanges
);
/* Processes sourceLen UChars from source with resBuf (capacity resLen) as the
 * output buffer. Code units in the range 0x3041..0x30FA are dispatched
 * through a switch whose visible case labels are the low-byte values listed
 * below.
 * A failing *status on entry leads to an early exit, and sourceLen > resLen
 * sets *status to U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): going by the function name and the case labels these are the
 * small kana forms, each mapped to its large counterpart; the mapping
 * statements, the writes to resBuf and the return are on lines missing from
 * this listing - confirm.
 * NOTE(review): an output buffer that is too short is reported as a memory
 * allocation error; a buffer overflow code may describe it better. */
551 uint32_t u_toLargeKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
555 if(U_FAILURE(*status
)) {
559 if(sourceLen
> resLen
) {
560 *status
= U_MEMORY_ALLOCATION_ERROR
;
564 for(i
= 0; i
< sourceLen
; i
++) {
566 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
568 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
569 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
/* Processes sourceLen UChars from source with resBuf (capacity resLen) as the
 * output buffer. Code units in the range 0x3041..0x30FA are dispatched
 * through a switch whose visible case labels are the low-byte values listed
 * below.
 * A failing *status on entry leads to an early exit, and sourceLen > resLen
 * sets *status to U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): going by the function name and the case labels these are the
 * large kana forms that have a small counterpart; the mapping statements, the
 * writes to resBuf and the return are on lines missing from this listing -
 * confirm.
 * NOTE(review): an output buffer that is too short is reported as a memory
 * allocation error; a buffer overflow code may describe it better. */
586 uint32_t u_toSmallKana(const UChar
*source
, const uint32_t sourceLen
, UChar
*resBuf
, const uint32_t resLen
, UErrorCode
*status
) {
590 if(U_FAILURE(*status
)) {
594 if(sourceLen
> resLen
) {
595 *status
= U_MEMORY_ALLOCATION_ERROR
;
599 for(i
= 0; i
< sourceLen
; i
++) {
601 if(0x3041 <= c
&& c
<= 0x30FA) { /* Kana range */
603 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
604 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
/* Derives the case bits for the string src of length len.
 * The string is normalized to NFKD into the buffer n (capacity 128) with
 * unorm_normalize(). For each normalized code unit the first CE is fetched
 * from UCA; a continuation CE there sets U_INTERNAL_PROGRAM_ERROR and returns
 * UCOL_LOWER_CASE. Otherwise the case bits of the CE, u_islower() and the
 * small/large kana conversions decide which of uCount and lCount is bumped.
 * Result: UCOL_MIXED_CASE when both counters are non-zero, UCOL_UPPER_CASE
 * when only uCount is, UCOL_LOWER_CASE otherwise (also on a failing *status
 * at entry).
 * NOTE(review): the counter increments and the declarations of n, nLen, s,
 * order, sk and lk are on lines missing from this listing.
 * NOTE(review): input whose NFKD form exceeds 128 units makes
 * unorm_normalize report an error, and the loop is then skipped. */
623 uint8_t ucol_uprv_getCaseBits(const UCollator
*UCA
, const UChar
*src
, uint32_t len
, UErrorCode
*status
) {
627 uint32_t uCount
= 0, lCount
= 0;
632 if(U_FAILURE(*status
)) {
633 return UCOL_LOWER_CASE
;
636 nLen
= unorm_normalize(src
, len
, UNORM_NFKD
, 0, n
, 128, status
);
637 if(U_SUCCESS(*status
)) {
638 for(i
= 0; i
< nLen
; i
++) {
/* Each normalized code unit is looked up on its own (length 1). */
639 uprv_init_collIterate(UCA
, &n
[i
], 1, &s
, status
);
640 order
= ucol_getNextCE(UCA
, &s
, status
);
641 if(isContinuation(order
)) {
642 *status
= U_INTERNAL_PROGRAM_ERROR
;
643 return UCOL_LOWER_CASE
;
645 if((order
&UCOL_CASE_BIT_MASK
)== UCOL_UPPER_CASE
) {
648 if(u_islower(n
[i
])) {
650 } else if(U_SUCCESS(*status
)) {
652 u_toSmallKana(&n
[i
], 1, sk
, 1, status
);
653 u_toLargeKana(&n
[i
], 1, lk
, 1, status
);
/* True when the unit is unchanged by the small conversion but changed by the
 * large conversion. */
654 if(sk
[0] == n
[i
] && lk
[0] != n
[i
]) {
662 if(uCount
!= 0 && lCount
!= 0) {
663 return UCOL_MIXED_CASE
;
664 } else if(uCount
!= 0) {
665 return UCOL_UPPER_CASE
;
667 return UCOL_LOWER_CASE
;
/* Packs the three weight strings in CEparts (index 0 primary, 1 secondary,
 * 2 tertiary) into 32-bit CEs stored in tok->CEs.
 * ucol_countBytes() is applied to each part. CE number CEi then receives 16
 * bits of the primary, 8 bits of the secondary and 6 bits (mask 0x3F) of the
 * tertiary, and the loop runs while any part still has bytes left.
 * UCOL_CONTINUATION_MARKER is used as the starting value on a path whose
 * condition is not visible here.
 * For a non-zero first CE the case-bit field is cleared with 0xFFFFFF3F and
 * then filled either from ucol_uprv_getCaseBits() or from bits 0xC0 of the
 * first UCA CE of the first code unit; the condition choosing between the two
 * is not visible in this listing.
 * The fprintf calls write diagnostics to stderr.
 * NOTE(review): the declarations of i, CEi and value, the increment of CEi,
 * the stores to tok->noOfCEs and any preprocessor guard around the fprintf
 * calls are on lines missing from this listing. */
672 U_CFUNC
void ucol_doCE(UColTokenParser
*src
, uint32_t *CEparts
, UColToken
*tok
, UErrorCode
*status
) {
673 /* this one makes the table and stuff */
674 uint32_t noOfBytes
[3];
677 for(i
= 0; i
<3; i
++) {
678 ucol_countBytes(CEparts
[i
], noOfBytes
[i
]);
681 /* Here we have to pack CEs from parts */
/* The primary advances two bytes per CE, hence the 2*CEi in its test. */
686 while(2*CEi
<noOfBytes
[0] || CEi
<noOfBytes
[1] || CEi
<noOfBytes
[2]) {
688 value
= UCOL_CONTINUATION_MARKER
; /* Continuation marker */
693 if(2*CEi
<noOfBytes
[0]) {
694 value
|= ((CEparts
[0]>>(32-16*(CEi
+1))) & 0xFFFF) << 16;
696 if(CEi
<noOfBytes
[1]) {
697 value
|= ((CEparts
[1]>>(32-8*(CEi
+1))) & 0xFF) << 8;
699 if(CEi
<noOfBytes
[2]) {
700 value
|= ((CEparts
[2]>>(32-8*(CEi
+1))) & 0x3F);
702 tok
->CEs
[CEi
] = value
;
705 if(CEi
== 0) { /* totally ignorable */
708 } else { /* there is at least something */
713 // we want to set case bits here and now, not later.
714 // Case bits handling
715 if(tok
->CEs
[0] != 0) { // case bits should be set only for non-ignorables
716 tok
->CEs
[0] &= 0xFFFFFF3F; // Clean the case bits field
/* tok->source packs the length in its top 8 bits and the offset into
 * src->source in its low 24 bits. */
717 int32_t cSize
= (tok
->source
& 0xFF000000) >> 24;
718 UChar
*cPoints
= (tok
->source
& 0x00FFFFFF) + src
->source
;
722 tok
->CEs
[0] |= ucol_uprv_getCaseBits(src
->UCA
, cPoints
, cSize
, status
);
724 // Copy it from the UCA
725 uint32_t caseCE
= ucol_getFirstCE(src
->UCA
, cPoints
[0], status
);
726 tok
->CEs
[0] |= (caseCE
& 0xC0);
731 fprintf(stderr
, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok
->debugSource
, tok
->strength
, CEparts
[0] >> (32-8*noOfBytes
[0]), CEparts
[1] >> (32-8*noOfBytes
[1]), CEparts
[2]>> (32-8*noOfBytes
[2]));
732 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
733 fprintf(stderr
, "%08X ", tok
->CEs
[i
]);
735 fprintf(stderr
, "\n");
/* Assigns CEs to the tokens of list header lh.
 * Pass 1 starts at lh->last and follows the previous links, using the counter
 * array t (one slot per strength) to fill tok->toInsert.
 * ucol_inv_getGapPositions() then computes the gap bounds. Pass 2 runs while
 * tok is non-NULL and *status is a success code: a token stronger than any
 * seen so far (fStrength < initStrength) re-seeds the generators from
 * lh->gapsLo and lh->gapsHi; otherwise the generator of the token strength is
 * advanced and the weaker levels are restarted with
 * ucol_getSimpleCEGenerator(). Every token is finished with ucol_doCE().
 * U_INTERNAL_PROGRAM_ERROR is set when no valid lh->pos entry is found for
 * the strength.
 * The fprintf calls write diagnostics to stderr.
 * NOTE(review): the advance of tok in both passes, the start value of tok for
 * pass 2, any preprocessor guard around the diagnostics and several braces
 * are on lines missing from this listing. */
739 U_CFUNC
void ucol_initBuffers(UColTokenParser
*src
, UColTokListHeader
*lh
, UErrorCode
*status
) {
740 ucolCEGenerator Gens
[UCOL_CE_STRENGTH_LIMIT
];
741 uint32_t CEparts
[UCOL_CE_STRENGTH_LIMIT
];
743 UColToken
*tok
= lh
->last
;
744 uint32_t t
[UCOL_STRENGTH_LIMIT
];
746 uprv_memset(t
, 0, UCOL_STRENGTH_LIMIT
*sizeof(uint32_t));
748 /* must initialize ranges to avoid memory check warnings */
749 for (int i
= 0; i
< UCOL_CE_STRENGTH_LIMIT
; i
++) {
750 uprv_memset(Gens
[i
].ranges
, 0, sizeof(Gens
[i
].ranges
));
754 t
[tok
->strength
] = 1;
/* Pass 1: count, per strength, how many weights each token has to reserve. */
756 while(tok
->previous
!= NULL
) {
757 if(tok
->previous
->strength
< tok
->strength
) { /* going up */
758 t
[tok
->strength
] = 0;
759 t
[tok
->previous
->strength
]++;
760 } else if(tok
->previous
->strength
> tok
->strength
) { /* going down */
761 t
[tok
->previous
->strength
] = 1;
766 tok
->toInsert
= t
[tok
->strength
];
769 tok
->toInsert
= t
[tok
->strength
];
770 ucol_inv_getGapPositions(src
, lh
, status
);
773 fprintf(stderr
, "BaseCE: %08X %08X\n", lh
->baseCE
, lh
->baseContCE
);
775 for(j
= 2; j
>= 0; j
--) {
776 fprintf(stderr
, "gapsLo[%i] [%08X %08X %08X]\n", j
, lh
->gapsLo
[j
*3], lh
->gapsLo
[j
*3+1], lh
->gapsLo
[j
*3+2]);
777 fprintf(stderr
, "gapsHi[%i] [%08X %08X %08X]\n", j
, lh
->gapsHi
[j
*3], lh
->gapsHi
[j
*3+1], lh
->gapsHi
[j
*3+2]);
779 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
782 fprintf(stderr
,"%i", tok
->strength
);
784 } while(tok
!= NULL
);
785 fprintf(stderr
, "\n");
787 tok
=&lh
->first
[UCOL_TOK_POLARITY_POSITIVE
];
790 fprintf(stderr
,"%i", tok
->toInsert
);
792 } while(tok
!= NULL
);
796 uint32_t fStrength
= UCOL_IDENTICAL
;
797 uint32_t initStrength
= UCOL_IDENTICAL
;
/* Start from the weights of the base CE, split into the three levels. */
800 CEparts
[UCOL_PRIMARY
] = (lh
->baseCE
& UCOL_PRIMARYMASK
) | (lh
->baseContCE
& UCOL_PRIMARYMASK
) >> 16;
801 CEparts
[UCOL_SECONDARY
] = (lh
->baseCE
& UCOL_SECONDARYMASK
) << 16 | (lh
->baseContCE
& UCOL_SECONDARYMASK
) << 8;
802 CEparts
[UCOL_TERTIARY
] = (UCOL_TERTIARYORDER(lh
->baseCE
)) << 24 | (UCOL_TERTIARYORDER(lh
->baseContCE
)) << 16;
/* Pass 2: generate the weights token by token. */
804 while (tok
!= NULL
&& U_SUCCESS(*status
)) {
805 fStrength
= tok
->strength
;
806 if(fStrength
< initStrength
) {
807 initStrength
= fStrength
;
808 if(lh
->pos
[fStrength
] == -1) {
809 while(lh
->pos
[fStrength
] == -1 && fStrength
> 0) {
812 if(lh
->pos
[fStrength
] == -1) {
813 *status
= U_INTERNAL_PROGRAM_ERROR
;
817 if(initStrength
== UCOL_TERTIARY
) { /* starting with tertiary */
818 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
819 CEparts
[UCOL_SECONDARY
] = lh
->gapsLo
[fStrength
*3+1];
820 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
821 CEparts
[UCOL_TERTIARY
] = ucol_getCEGenerator(&Gens
[UCOL_TERTIARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
822 } else if(initStrength
== UCOL_SECONDARY
) { /* secondaries */
823 CEparts
[UCOL_PRIMARY
] = lh
->gapsLo
[fStrength
*3];
824 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
825 CEparts
[UCOL_SECONDARY
] = ucol_getCEGenerator(&Gens
[UCOL_SECONDARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
826 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
827 } else { /* primaries */
828 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
829 CEparts
[UCOL_PRIMARY
] = ucol_getCEGenerator(&Gens
[UCOL_PRIMARY
], lh
->gapsLo
, lh
->gapsHi
, tok
, fStrength
, status
);
830 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
831 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
/* Token not stronger than the strongest seen: advance its own level and
 * restart every weaker level. */
834 if(tok
->strength
== UCOL_TERTIARY
) {
835 CEparts
[UCOL_TERTIARY
] = ucol_getNextGenerated(&Gens
[UCOL_TERTIARY
], status
);
836 } else if(tok
->strength
== UCOL_SECONDARY
) {
837 CEparts
[UCOL_SECONDARY
] = ucol_getNextGenerated(&Gens
[UCOL_SECONDARY
], status
);
838 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
839 } else if(tok
->strength
== UCOL_PRIMARY
) {
840 CEparts
[UCOL_PRIMARY
] = ucol_getNextGenerated(&Gens
[UCOL_PRIMARY
], status
);
841 CEparts
[UCOL_SECONDARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_SECONDARY
], tok
, UCOL_SECONDARY
, status
);
842 CEparts
[UCOL_TERTIARY
] = ucol_getSimpleCEGenerator(&Gens
[UCOL_TERTIARY
], tok
, UCOL_TERTIARY
, status
);
845 ucol_doCE(src
, CEparts
, tok
, status
);
/* Turns each token of list header lh into an element record el and adds it to
 * the temporary table t with uprv_uca_addAnElement(), while *status is a
 * success code.
 * Expansions: tok->expansion packs a length (top 8 bits) and an offset into
 * src->source (low 24 bits). Progressively shorter sequences are looked up in
 * src->tailored; a hit that is not a reset token contributes its CEs to
 * tok->expCEs, otherwise CEs are fetched from src->UCA for one code unit.
 * The element then receives the token CEs followed by the expansion CEs, the
 * prefix (when tok->prefix is non-zero) and the remaining code units.
 * Side effects visible here: t->image->jamoSpecial is set when a code unit
 * satisfies UCOL_ISJAMO, and src->buildCCTabFlag is set when the FCD value of
 * the last character has a non-zero low byte.
 * Under UCOL_DEBUG_DUPLICATES a status other than U_ZERO_ERROR after the add
 * is reported on stderr and reset.
 * NOTE(review): the declarations of el, exp, s and order, the advance of tok
 * and many braces are on lines missing from this listing. */
850 U_CFUNC
void ucol_createElements(UColTokenParser
*src
, tempUCATable
*t
, UColTokListHeader
*lh
, UErrorCode
*status
) {
852 UColToken
*tok
= lh
->first
;
853 UColToken
*expt
= NULL
;
854 uint32_t i
= 0, j
= 0;
855 const Normalizer2Impl
*nfcImpl
= Normalizer2Factory::getNFCImpl(*status
);
857 while(tok
!= NULL
&& U_SUCCESS(*status
)) {
858 /* first, check if there are any expansions */
859 /* if there are expansions, we need to do a little bit more processing */
860 /* since parts of expansion can be tailored, while others are not */
861 if(tok
->expansion
!= 0) {
862 uint32_t len
= tok
->expansion
>> 24;
863 uint32_t currentSequenceLen
= len
;
864 uint32_t expOffset
= tok
->expansion
& 0x00FFFFFF;
865 //uint32_t exp = currentSequenceLen | expOffset;
867 exp
.source
= currentSequenceLen
| expOffset
;
868 exp
.rulesToParseHdl
= &(src
->source
);
871 currentSequenceLen
= len
;
/* Try the longest remaining sequence first, shrinking by one unit each time. */
872 while(currentSequenceLen
> 0) {
873 exp
.source
= (currentSequenceLen
<< 24) | expOffset
;
874 if((expt
= (UColToken
*)uhash_get(src
->tailored
, &exp
)) != NULL
&& expt
->strength
!= UCOL_TOK_RESET
) { /* expansion is tailored */
875 uint32_t noOfCEsToCopy
= expt
->noOfCEs
;
876 for(j
= 0; j
<noOfCEsToCopy
; j
++) {
877 tok
->expCEs
[tok
->noOfExpCEs
+ j
] = expt
->CEs
[j
];
879 tok
->noOfExpCEs
+= noOfCEsToCopy
;
880 // Smart people never try to add codepoints and CEs.
881 // For some odd reason, it won't work.
882 expOffset
+= currentSequenceLen
; //noOfCEsToCopy;
883 len
-= currentSequenceLen
; //noOfCEsToCopy;
886 currentSequenceLen
--;
889 if(currentSequenceLen
== 0) { /* couldn't find any tailored subsequence */
890 /* will have to get one from UCA */
891 /* first, get the UChars from the rules */
892 /* then pick CEs out until there is no more and stuff them into expansion */
895 uprv_init_collIterate(src
->UCA
, expOffset
+ src
->source
, 1, &s
, status
);
898 order
= ucol_getNextCE(src
->UCA
, &s
, status
);
899 if(order
== UCOL_NO_MORE_CES
) {
902 tok
->expCEs
[tok
->noOfExpCEs
++] = order
;
912 /* set the ucaelement with obtained values */
913 el
.noOfCEs
= tok
->noOfCEs
+ tok
->noOfExpCEs
;
915 for(i
= 0; i
<tok
->noOfCEs
; i
++) {
916 el
.CEs
[i
] = tok
->CEs
[i
];
918 for(i
= 0; i
<tok
->noOfExpCEs
; i
++) {
919 el
.CEs
[i
+tok
->noOfCEs
] = tok
->expCEs
[i
];
923 // We kept prefix and source kind of together, as it is a kind of a contraction.
924 // However, now we have to slice the prefix off the main thing -
925 el
.prefix
= el
.prefixChars
;
926 el
.cPoints
= el
.uchars
;
927 if(tok
->prefix
!= 0) { // we will just copy the prefix here, and adjust accordingly in the
928 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
929 // decomposed elements to the unsaf table.
930 el
.prefixSize
= tok
->prefix
>>24;
931 uprv_memcpy(el
.prefix
, src
->source
+ (tok
->prefix
& 0x00FFFFFF), el
.prefixSize
*sizeof(UChar
));
/* The code units after the prefix form the element proper. */
933 el
.cSize
= (tok
->source
>> 24)-(tok
->prefix
>>24);
934 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF)+(tok
->prefix
>>24) + src
->source
, el
.cSize
*sizeof(UChar
));
939 el
.cSize
= (tok
->source
>> 24);
940 uprv_memcpy(el
.uchars
, (tok
->source
& 0x00FFFFFF) + src
->source
, el
.cSize
*sizeof(UChar
));
942 if(src
->UCA
!= NULL
) {
943 for(i
= 0; i
<el
.cSize
; i
++) {
944 if(UCOL_ISJAMO(el
.cPoints
[i
])) {
945 t
->image
->jamoSpecial
= TRUE
;
948 if (!src
->buildCCTabFlag
&& el
.cSize
> 0) {
949 // Check the trailing canonical combining class (tccc) of the last character.
950 const UChar
*s
= el
.cPoints
+ el
.cSize
;
951 uint16_t fcd
= nfcImpl
->previousFCD16(el
.cPoints
, s
);
952 if ((fcd
& 0xff) != 0) {
953 src
->buildCCTabFlag
= TRUE
;
958 /* and then, add it */
960 fprintf(stderr
, "Adding: %04X with %08X\n", el
.cPoints
[0], el
.CEs
[0]);
962 uprv_uca_addAnElement(t
, &el
, status
);
964 #if UCOL_DEBUG_DUPLICATES
965 if(*status
!= U_ZERO_ERROR
) {
966 fprintf(stderr
, "replaced CE for %04X with CE for %04X\n", el
.cPoints
[0], tok
->debugSource
);
967 *status
= U_ZERO_ERROR
;
/* Range callback: context is cast to a tempUCATable, and every code point
 * from start up to but excluding limit is looked up in t->mapping. For a code
 * point whose value is UCOL_NOT_FOUND an element with an empty prefix is
 * built (the code point is appended to el.uchars as UTF-16) and added with
 * uprv_uca_addAnElement(), using a local status that starts at U_ZERO_ERROR.
 * NOTE(review): the declaration and reset of el, the CEs given to the added
 * element, the increment of start, any use of the value parameter and the
 * return values are on lines missing from this listing. */
976 static UBool U_CALLCONV
977 _processUCACompleteIgnorables(const void *context
, UChar32 start
, UChar32 limit
, uint32_t value
) {
978 UErrorCode status
= U_ZERO_ERROR
;
979 tempUCATable
*t
= (tempUCATable
*)context
;
981 while(start
< limit
) {
982 uint32_t CE
= utrie_get32(t
->mapping
, start
, NULL
);
983 if(CE
== UCOL_NOT_FOUND
) {
987 el
.prefixChars
[0] = 0;
988 el
.prefix
= el
.prefixChars
;
989 el
.cPoints
= el
.uchars
;
992 U16_APPEND_UNSAFE(el
.uchars
, el
.cSize
, start
);
996 uprv_uca_addAnElement(t
, &el
, &status
);
1002 if(U_FAILURE(status
)) {
/* Copies the UCA mapping of every code point from start to end (inclusive)
 * into the temporary table t, but only for code points that t does not yet
 * map: either utrie_get32() gives UCOL_NOT_FOUND, or the value is a
 * contraction-table element whose CE at index 0 is UCOL_NOT_FOUND.
 * For such a code point the UTF-16 form is placed in el.uchars, the CEs are
 * read from src->UCA through a collation iterator until UCOL_NO_MORE_CES, and
 * the element is added with uprv_uca_addAnElement().
 * Nothing is done when *status already holds a failure.
 * NOTE(review): the return type line, the last parameter, the declarations of
 * el, u and colIt, the resets of el.cSize and el.noOfCEs and several braces
 * are on lines missing from this listing.
 * NOTE(review): the UTRIE_GET32 and tag lines below may belong to a disabled
 * region whose delimiters were lost; tag is not declared in the visible
 * code. */
1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser
*src
, tempUCATable
*t
,
1012 UChar32 start
, UChar32 end
,
1015 //UChar decomp[256];
1016 uint32_t CE
= UCOL_NOT_FOUND
;
1021 el
.prefixChars
[0] = 0;
1024 if(U_SUCCESS(*status
)) {
1025 for(u
= start
; u
<=end
; u
++) {
1026 if((CE
= utrie_get32(t
->mapping
, u
, NULL
)) == UCOL_NOT_FOUND
1027 /* this test is for contractions that are missing the starting element. */
1028 || ((isCntTableElement(CE
)) &&
1029 (uprv_cnttab_getCE(t
->contractions
, CE
, 0, status
) == UCOL_NOT_FOUND
))
1033 U16_APPEND_UNSAFE(el
.uchars
, el
.cSize
, u
);
1034 //decomp[0] = (UChar)u;
1035 //el.uchars[0] = (UChar)u;
1036 el
.cPoints
= el
.uchars
;
1039 el
.prefix
= el
.prefixChars
;
1041 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1042 // We actually want to check whether this element is a special
1043 // If it is an implicit element (hangul, CJK - we want to copy the
1044 // special, not the resolved CEs) - for hangul, copying resolved
1045 // would just make things the same (there is an expansion and it
1046 // takes approximately the same amount of time to resolve as
1047 // falling back to the UCA).
1049 UTRIE_GET32(src->UCA->mapping, u, CE);
1051 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1052 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1053 || tag == LEAD_SURROGATE_TAG) {
1054 el.CEs[el.noOfCEs++] = CE;
1057 // It turns out that it does not make sense to keep implicits
1058 // unresolved. The cost of resolving them is big enough so that
1059 // it doesn't make any difference whether we have to go to the UCA
/* Collect every CE the UCA produces for this code point. */
1062 uprv_init_collIterate(src
->UCA
, el
.uchars
, el
.cSize
, &colIt
, status
);
1063 while(CE
!= UCOL_NO_MORE_CES
) {
1064 CE
= ucol_getNextCE(src
->UCA
, &colIt
, status
);
1065 if(CE
!= UCOL_NO_MORE_CES
) {
1066 el
.CEs
[el
.noOfCEs
++] = CE
;
1070 uprv_uca_addAnElement(t
, &el
, status
);
1078 U_CFUNC UCATableHeader
*
1079 ucol_assembleTailoringTable(UColTokenParser
*src
, UErrorCode
*status
) {
1083 if(U_FAILURE(*status
)) {
1087 2. Eliminate the negative lists by doing the following for each non-null negative list:
1088 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1089 create new ListHeader X
1090 o reverse the list, add to the end of X's positive list. Reset the strength of the
1091 first item you add, based on the stronger strength levels of the two lists.
1094 3. For each ListHeader with a non-null positive list:
1097 o Find all character strings with CEs between the baseCE and the
1098 next/previous CE, at the strength of the first token. Add these to the
1100 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1101 tailoring has & x < z...
1102 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1104 /* It is possible that this part should be done even while constructing list */
1105 /* The problem is that it is unknown what is going to be the strongest weight */
1106 /* So we might as well do it here */
1109 o Allocate CEs for each token in the list, based on the total number N of the
1110 largest level difference, and the gap G between baseCE and nextCE at that
1111 level. The relation * between the last item and nextCE is the same as the
1113 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1114 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1115 Then fit b and c into the secondary gap between a and d, then fit q
1116 into the tertiary gap between b and c.
1118 o Example: baseCE << b <<< q << c * nextCE(X,2)
1119 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1120 Then fit q into the tertiary gap between b and c.
1121 o When incrementing primary values, we will not cross high byte
1122 boundaries except where there is only a single-byte primary. That is to
1123 ensure that the script reordering will continue to work.
1125 UCATableHeader
*image
= (UCATableHeader
*)uprv_malloc(sizeof(UCATableHeader
));
1127 if (image
== NULL
) {
1128 *status
= U_MEMORY_ALLOCATION_ERROR
;
1131 uprv_memcpy(image
, src
->UCA
->image
, sizeof(UCATableHeader
));
1133 for(i
= 0; i
<src
->resultLen
; i
++) {
1134 /* now we need to generate the CEs */
1135 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1136 /* According to strength */
1137 if(U_SUCCESS(*status
)) {
1138 if(src
->lh
[i
].first
) { // if there are any elements
1139 // due to the way parser works, subsequent tailorings
1140 // may remove all the elements from a sequence, therefore
1141 // leaving an empty tailoring sequence.
1142 ucol_initBuffers(src
, &src
->lh
[i
], status
);
1145 if(U_FAILURE(*status
)) {
1151 if(src
->varTop
!= NULL
) { /* stuff the variable top value */
1152 src
->opts
->variableTopValue
= (*(src
->varTop
->CEs
))>>16;
1153 /* remove it from the list */
1154 if(src
->varTop
->listHeader
->first
== src
->varTop
) { /* first in list */
1155 src
->varTop
->listHeader
->first
= src
->varTop
->next
;
1157 if(src
->varTop
->listHeader
->last
== src
->varTop
) { /* last in list */
1158 src
->varTop
->listHeader
->last
= src
->varTop
->previous
;
1160 if(src
->varTop
->next
!= NULL
) {
1161 src
->varTop
->next
->previous
= src
->varTop
->previous
;
1163 if(src
->varTop
->previous
!= NULL
) {
1164 src
->varTop
->previous
->next
= src
->varTop
->next
;
1169 tempUCATable
*t
= uprv_uca_initTempTable(image
, src
->opts
, src
->UCA
, NOT_FOUND_TAG
, NOT_FOUND_TAG
, status
);
1170 if(U_FAILURE(*status
)) {
1176 /* After this, we have assigned CE values to all regular CEs */
1177 /* now we will go through list once more and resolve expansions, */
1178 /* make UCAElements structs and add them to table */
1179 for(i
= 0; i
<src
->resultLen
; i
++) {
1180 /* now we need to generate the CEs */
1181 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1182 /* According to strength */
1183 if(U_SUCCESS(*status
)) {
1184 ucol_createElements(src
, t
, &src
->lh
[i
], status
);
1191 el
.prefixChars
[0] = 0;
1193 /* add latin-1 stuff */
1194 ucol_uprv_bld_copyRangeFromUCA(src
, t
, 0, 0xFF, status
);
1196 /* add stuff for copying */
1197 if(src
->copySet
!= NULL
) {
1199 UnicodeSet
*set
= (UnicodeSet
*)src
->copySet
;
1200 for(i
= 0; i
< set
->getRangeCount(); i
++) {
1201 ucol_uprv_bld_copyRangeFromUCA(src
, t
, set
->getRangeStart(i
), set
->getRangeEnd(i
), status
);
1205 if(U_SUCCESS(*status
)) {
1206 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1208 uint32_t tailoredCE
= UCOL_NOT_FOUND
;
1209 UChar
*conts
= (UChar
*)((uint8_t *)src
->UCA
->image
+ src
->UCA
->image
->contractionUCACombos
);
1210 int32_t maxUCAContractionLength
= src
->UCA
->image
->contractionUCACombosWidth
;
1211 UCollationElements
*ucaEl
= ucol_openElements(src
->UCA
, NULL
, 0, status
);
1212 // Check for null pointer
1213 if (ucaEl
== NULL
) {
1214 *status
= U_MEMORY_ALLOCATION_ERROR
;
1217 while(*conts
!= 0) {
// A contraction is NUL-terminated and NUL-padded
// except if it has the maximum length.
1220 int32_t contractionLength
= maxUCAContractionLength
;
1221 while(contractionLength
> 0 && conts
[contractionLength
- 1] == 0) {
1222 --contractionLength
;
1225 int32_t firstLength
= 0;
1226 U16_NEXT(conts
, firstLength
, contractionLength
, first
);
1227 tailoredCE
= utrie_get32(t
->mapping
, first
, NULL
);
1228 if(tailoredCE
!= UCOL_NOT_FOUND
) {
1229 UBool needToAdd
= TRUE
;
1230 if(isCntTableElement(tailoredCE
)) {
1231 if(uprv_cnttab_isTailored(t
->contractions
, tailoredCE
, conts
+firstLength
, status
) == TRUE
) {
1235 if (!needToAdd
&& isPrefix(tailoredCE
) && *(conts
+1)==0) {
1237 elm
.cPoints
= el
.uchars
;
1239 elm
.uchars
[0] = *conts
;
1242 elm
.prefixChars
[0] = *(conts
+2);
1244 elm
.prefix
= elm
.prefixChars
;
1246 UCAElements
*prefixEnt
=(UCAElements
*)uhash_get(t
->prefixLookup
, &elm
);
1247 if ((prefixEnt
==NULL
) || *(prefixEnt
->prefix
)!=*(conts
+2)) {
1251 if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, first
)) {
1255 if(needToAdd
== TRUE
) { // we need to add if this contraction is not tailored.
1256 if (*(conts
+1) != 0) { // contractions
1257 el
.prefix
= el
.prefixChars
;
1259 el
.cPoints
= el
.uchars
;
1261 u_memcpy(el
.uchars
, conts
, contractionLength
);
1262 el
.cSize
= contractionLength
;
1263 ucol_setText(ucaEl
, el
.uchars
, el
.cSize
, status
);
1265 else { // pre-context character
1266 UChar str
[4] = { 0 };
1268 int32_t preKeyLen
=0;
1270 el
.cPoints
= el
.uchars
;
1272 el
.uchars
[0] = *conts
;
1275 el
.prefixChars
[0] = *(conts
+2);
1276 el
.prefix
= el
.prefixChars
;
1278 if (el
.prefixChars
[0]!=0) {
1279 // get CE of prefix character first
1280 str
[0]=el
.prefixChars
[0];
1282 ucol_setText(ucaEl
, str
, 1, status
);
1283 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
))
1284 != UCOL_NULLORDER
) {
1285 preKeyLen
++; // count number of keys for prefix character
1287 str
[len
++] = el
.prefixChars
[0];
1290 str
[len
++] = el
.uchars
[0];
1292 ucol_setText(ucaEl
, str
, len
, status
);
1293 // Skip the keys for prefix character, then copy the rest to el.
1294 while ((preKeyLen
-->0) &&
1295 (int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1300 while ((int32_t)(el
.CEs
[el
.noOfCEs
] = ucol_next(ucaEl
, status
)) != UCOL_NULLORDER
) {
1303 uprv_uca_addAnElement(t
, &el
, status
);
1306 } else if(src
->removeSet
!= NULL
&& uset_contains(src
->removeSet
, first
)) {
1307 ucol_uprv_bld_copyRangeFromUCA(src
, t
, first
, first
, status
);
1309 conts
+=maxUCAContractionLength
;
1311 ucol_closeElements(ucaEl
);
1314 // Add completely ignorable elements
1315 utrie_enum(&t
->UCA
->mapping
, NULL
, _processUCACompleteIgnorables
, t
);
1317 // add tailoring characters related canonical closures
1318 uprv_uca_canonicalClosure(t
, src
, NULL
, status
);
1320 /* still need to produce compatibility closure */
1322 UCATableHeader
*myData
= uprv_uca_assembleTable(t
, status
);
1324 uprv_uca_closeTempTable(t
);
1331 static UBool U_CALLCONV
1332 ucol_bld_cleanup(void)
1334 udata_close(invUCA_DATA_MEM
);
1335 invUCA_DATA_MEM
= NULL
;
1336 _staticInvUCA
= NULL
;
1341 U_CAPI
const InverseUCATableHeader
* U_EXPORT2
1342 ucol_initInverseUCA(UErrorCode
*status
)
1344 if(U_FAILURE(*status
)) return NULL
;
1347 UMTX_CHECK(NULL
, (_staticInvUCA
== NULL
), needsInit
);
1350 InverseUCATableHeader
*newInvUCA
= NULL
;
1351 UDataMemory
*result
= udata_openChoice(U_ICUDATA_COLL
, INVC_DATA_TYPE
, INVC_DATA_NAME
, isAcceptableInvUCA
, NULL
, status
);
1353 if(U_FAILURE(*status
)) {
1355 udata_close(result
);
1357 // This is not needed, as we are talking about
1358 // memory we got from UData
1359 //uprv_free(newInvUCA);
1362 if(result
!= NULL
) { /* It looks like sometimes we can fail to find the data file */
1363 newInvUCA
= (InverseUCATableHeader
*)udata_getMemory(result
);
1364 UCollator
*UCA
= ucol_initUCA(status
);
1365 // UCA versions of UCA and inverse UCA should match
1366 if(uprv_memcmp(newInvUCA
->UCAVersion
, UCA
->image
->UCAVersion
, sizeof(UVersionInfo
)) != 0) {
1367 *status
= U_INVALID_FORMAT_ERROR
;
1368 udata_close(result
);
1373 if(_staticInvUCA
== NULL
) {
1374 invUCA_DATA_MEM
= result
;
1375 _staticInvUCA
= newInvUCA
;
1381 if(newInvUCA
!= NULL
) {
1382 udata_close(result
);
1383 // This is not needed, as we are talking about
1384 // memory we got from UData
1385 //uprv_free(newInvUCA);
1388 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD
, ucol_bld_cleanup
);
1392 return _staticInvUCA
;
/* This is the data that is used for non-script reordering codes. These _must_ be kept
 * in the order in which they are to be applied as defaults, and in sync with the UColReorderCode enum.
 */
1398 static const char * const ReorderingTokenNames
[] = {
1406 static void toUpper(const char* src
, char* dst
, uint32_t length
) {
1407 for (uint32_t i
= 0; *src
!= '\0' && i
< length
- 1; ++src
, ++dst
, ++i
) {
1408 *dst
= uprv_toupper(*src
);
1413 U_INTERNAL
int32_t U_EXPORT2
1414 ucol_findReorderingEntry(const char* name
) {
1416 toUpper(name
, buffer
, 32);
1417 for (uint32_t entry
= 0; entry
< LENGTHOF(ReorderingTokenNames
); entry
++) {
1418 if (uprv_strcmp(buffer
, ReorderingTokenNames
[entry
]) == 0) {
1419 return entry
+ UCOL_REORDER_CODE_FIRST
;
1422 return USCRIPT_INVALID_CODE
;
1425 #endif /* #if !UCONFIG_NO_COLLATION */