]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_bld.cpp
ICU-400.40.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/udata.h"
26 #include "unicode/uchar.h"
27 #include "unicode/uniset.h"
28 #include "ucol_bld.h"
29 #include "ucol_elm.h"
30 #include "ucol_cnt.h"
31 #include "ucln_in.h"
32 #include "umutex.h"
33 #include "unormimp.h"
34 #include "cmemory.h"
35
/* File-scope pointer to the inverse UCA table header; starts out NULL.
 * NOTE(review): presumably filled in lazily by loader code outside this excerpt - confirm. */
static const InverseUCATableHeader* _staticInvUCA = NULL;
/* File-scope udata handle, also NULL initially.
 * NOTE(review): presumably the data item backing _staticInvUCA - confirm against the loader. */
static UDataMemory* invUCA_DATA_MEM = NULL;
38
39 U_CDECL_BEGIN
40 static UBool U_CALLCONV
41 isAcceptableInvUCA(void * /*context*/,
42 const char * /*type*/, const char * /*name*/,
43 const UDataInfo *pInfo)
44 {
45 /* context, type & name are intentionally not used */
46 if( pInfo->size>=20 &&
47 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
48 pInfo->charsetFamily==U_CHARSET_FAMILY &&
49 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
50 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
51 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
52 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
53 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
54 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
55 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
56 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
57 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
58 )
59 {
60 UVersionInfo UCDVersion;
61 u_getUnicodeVersion(UCDVersion);
62 return (pInfo->dataVersion[0]==UCDVersion[0] &&
63 pInfo->dataVersion[1]==UCDVersion[1]);
64 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
65 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
66 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
67 } else {
68 return FALSE;
69 }
70 }
71 U_CDECL_END
72
73 /*
74 * Takes two CEs (lead and continuation) and
75 * compares them as CEs should be compared:
76 * primary vs. primary, secondary vs. secondary
77 * tertiary vs. tertiary
78 */
79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
80 uint32_t s1 = source0, s2, t1 = target0, t2;
81 if(isContinuation(source1)) {
82 s2 = source1;
83 } else {
84 s2 = 0;
85 }
86 if(isContinuation(target1)) {
87 t2 = target1;
88 } else {
89 t2 = 0;
90 }
91
92 uint32_t s = 0, t = 0;
93 if(s1 == t1 && s2 == t2) {
94 return 0;
95 }
96 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
97 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
98 if(s < t) {
99 return -1;
100 } else if(s > t) {
101 return 1;
102 } else {
103 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
104 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
105 if(s < t) {
106 return -1;
107 } else if(s > t) {
108 return 1;
109 } else {
110 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
111 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
112 if(s < t) {
113 return -1;
114 } else {
115 return 1;
116 }
117 }
118 }
119 }
120
121 static
122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
123 uint32_t bottom = 0, top = src->invUCA->tableSize;
124 uint32_t i = 0;
125 uint32_t first = 0, second = 0;
126 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
127 int32_t res = 0;
128
129 while(bottom < top-1) {
130 i = (top+bottom)/2;
131 first = *(CETable+3*i);
132 second = *(CETable+3*i+1);
133 res = compareCEs(first, second, CE, SecondCE);
134 if(res > 0) {
135 top = i;
136 } else if(res < 0) {
137 bottom = i;
138 } else {
139 break;
140 }
141 }
142
143 /* weiv: */
144 /* in searching for elements, I have removed the failure */
145 /* The reason for this is that the builder does not rely */
146 /* on search mechanism telling it that it didn't find an */
147 /* element. However, indirect positioning relies on being */
148 /* able to find the elements around any CE, even if it is */
149 /* not defined in the UCA. */
150 return i;
151 /*
152 if((first == CE && second == SecondCE)) {
153 return i;
154 } else {
155 return -1;
156 }
157 */
158 }
159
/* Bit masks applied to a CE when comparing at a given strength; indexed by
 * strength value (first entry keeps the top 16 bits, second the top 24 bits,
 * third all 32 bits). */
static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    0xFFFF0000,
    0xFFFFFF00,
    0xFFFFFFFF
};
165
166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
167 uint32_t CE, uint32_t contCE,
168 uint32_t *nextCE, uint32_t *nextContCE,
169 uint32_t strength)
170 {
171 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
172 int32_t iCE;
173
174 iCE = ucol_inv_findCE(src, CE, contCE);
175
176 if(iCE<0) {
177 *nextCE = UCOL_NOT_FOUND;
178 return -1;
179 }
180
181 CE &= strengthMask[strength];
182 contCE &= strengthMask[strength];
183
184 *nextCE = CE;
185 *nextContCE = contCE;
186
187 while((*nextCE & strengthMask[strength]) == CE
188 && (*nextContCE & strengthMask[strength]) == contCE)
189 {
190 *nextCE = (*(CETable+3*(++iCE)));
191 *nextContCE = (*(CETable+3*(iCE)+1));
192 }
193
194 return iCE;
195 }
196
197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
198 uint32_t CE, uint32_t contCE,
199 uint32_t *prevCE, uint32_t *prevContCE,
200 uint32_t strength)
201 {
202 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
203 int32_t iCE;
204
205 iCE = ucol_inv_findCE(src, CE, contCE);
206
207 if(iCE<0) {
208 *prevCE = UCOL_NOT_FOUND;
209 return -1;
210 }
211
212 CE &= strengthMask[strength];
213 contCE &= strengthMask[strength];
214
215 *prevCE = CE;
216 *prevContCE = contCE;
217
218 while((*prevCE & strengthMask[strength]) == CE
219 && (*prevContCE & strengthMask[strength])== contCE
220 && iCE > 0) /* this condition should prevent falling off the edge of the world */
221 {
222 /* here, we end up in a singularity - zero */
223 *prevCE = (*(CETable+3*(--iCE)));
224 *prevContCE = (*(CETable+3*(iCE)+1));
225 }
226
227 return iCE;
228 }
229
230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
231 uint32_t prevCE, uint32_t prevContCE)
232 {
233 if(prevCE == CE && prevContCE == contCE) {
234 return UCOL_IDENTICAL;
235 }
236 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
237 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
238 {
239 return UCOL_PRIMARY;
240 }
241 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
242 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
243 {
244 return UCOL_SECONDARY;
245 }
246 return UCOL_TERTIARY;
247 }
248
249
250 /*static
251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
252
253 uint32_t CE = lh->baseCE;
254 uint32_t SecondCE = lh->baseContCE;
255
256 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
257 uint32_t previousCE, previousContCE;
258 int32_t iCE;
259
260 iCE = ucol_inv_findCE(src, CE, SecondCE);
261
262 if(iCE<0) {
263 return -1;
264 }
265
266 CE &= strengthMask[strength];
267 SecondCE &= strengthMask[strength];
268
269 previousCE = CE;
270 previousContCE = SecondCE;
271
272 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
273 previousCE = (*(CETable+3*(--iCE)));
274 previousContCE = (*(CETable+3*(iCE)+1));
275 }
276 lh->previousCE = previousCE;
277 lh->previousContCE = previousContCE;
278
279 return iCE;
280 }*/
281
282 static
283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
284 uint32_t CE = lh->baseCE;
285 uint32_t SecondCE = lh->baseContCE;
286
287 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
288 uint32_t nextCE, nextContCE;
289 int32_t iCE;
290
291 iCE = ucol_inv_findCE(src, CE, SecondCE);
292
293 if(iCE<0) {
294 return -1;
295 }
296
297 CE &= strengthMask[strength];
298 SecondCE &= strengthMask[strength];
299
300 nextCE = CE;
301 nextContCE = SecondCE;
302
303 while((nextCE & strengthMask[strength]) == CE
304 && (nextContCE & strengthMask[strength]) == SecondCE)
305 {
306 nextCE = (*(CETable+3*(++iCE)));
307 nextContCE = (*(CETable+3*(iCE)+1));
308 }
309
310 lh->nextCE = nextCE;
311 lh->nextContCE = nextContCE;
312
313 return iCE;
314 }
315
316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
317 /* reset all the gaps */
318 int32_t i = 0;
319 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
320 uint32_t st = 0;
321 uint32_t t1, t2;
322 int32_t pos;
323
324 UColToken *tok = lh->first;
325 uint32_t tokStrength = tok->strength;
326
327 for(i = 0; i<3; i++) {
328 lh->gapsHi[3*i] = 0;
329 lh->gapsHi[3*i+1] = 0;
330 lh->gapsHi[3*i+2] = 0;
331 lh->gapsLo[3*i] = 0;
332 lh->gapsLo[3*i+1] = 0;
333 lh->gapsLo[3*i+2] = 0;
334 lh->numStr[i] = 0;
335 lh->fStrToken[i] = NULL;
336 lh->lStrToken[i] = NULL;
337 lh->pos[i] = -1;
338 }
339
340 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
341
342 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
343 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
344 lh->pos[0] = 0;
345 t1 = lh->baseCE;
346 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
347 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
348 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
349 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
350 uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
351 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
352
353 t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
354 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
355
356 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
357 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
358 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
359 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
360 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
361 lh->pos[0] = 0;
362 t1 = lh->baseCE;
363 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
364 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
365 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
366 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
367 t1 = lh->nextCE;
368 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
369 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
370 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
371 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
372 } else {
373 for(;;) {
374 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
375 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
376 lh->fStrToken[tokStrength] = tok;
377 } else { /* The CE must be implicit, since it's not in the table */
378 /* Error */
379 *status = U_INTERNAL_PROGRAM_ERROR;
380 }
381 }
382
383 while(tok != NULL && tok->strength >= tokStrength) {
384 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
385 lh->lStrToken[tokStrength] = tok;
386 }
387 tok = tok->next;
388 }
389 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
390 /* check if previous interval is the same and merge the intervals if it is so */
391 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
392 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
393 lh->fStrToken[tokStrength+1] = NULL;
394 lh->lStrToken[tokStrength+1] = NULL;
395 lh->pos[tokStrength+1] = -1;
396 }
397 }
398 if(tok != NULL) {
399 tokStrength = tok->strength;
400 } else {
401 break;
402 }
403 }
404 for(st = 0; st < 3; st++) {
405 if((pos = lh->pos[st]) >= 0) {
406 t1 = *(CETable+3*(pos));
407 t2 = *(CETable+3*(pos)+1);
408 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
409 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
410 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
411 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
412 //pos--;
413 //t1 = *(CETable+3*(pos));
414 //t2 = *(CETable+3*(pos)+1);
415 t1 = lh->baseCE;
416 t2 = lh->baseContCE;
417 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
418 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
419 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
420 }
421 }
422 }
423 }
424
425
/*
 * Stores in noOfBytes how many bytes of the 32-bit value are needed, counting
 * from the most significant byte down to the lowest non-zero byte
 * (0 -> 0, 0x05000000 -> 1, 0x05050000 -> 2, 0x00000001 -> 4).
 *
 * Wrapped in do { } while(0) so that "ucol_countBytes(v, n);" behaves as a
 * single statement, including inside an unbraced if/else. The scratch
 * variable has a deliberately unusual name so that an argument expression
 * mentioning a caller variable called "mask" is not captured.
 * Both arguments are evaluated more than once; pass expressions without side
 * effects.
 */
#define ucol_countBytes(value, noOfBytes)                   \
do {                                                        \
    uint32_t ucolCountBytesMask_ = 0xFFFFFFFF;              \
    (noOfBytes) = 0;                                        \
    while(ucolCountBytesMask_ != 0) {                       \
        if(((value) & ucolCountBytesMask_) != 0) {          \
            (noOfBytes)++;                                  \
        }                                                   \
        ucolCountBytesMask_ >>= 8;                          \
    }                                                       \
} while(0)
437
438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
439 if(U_SUCCESS(*status)) {
440 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
441 }
442 return g->current;
443 }
444
445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
446 /* TODO: rename to enum names */
447 uint32_t high, low, count=1;
448 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
449
450 if(strength == UCOL_SECONDARY) {
451 low = UCOL_COMMON_TOP2<<24;
452 high = 0xFFFFFFFF;
453 count = 0xFF - UCOL_COMMON_TOP2;
454 } else {
455 low = UCOL_BYTE_COMMON << 24; //0x05000000;
456 high = 0x40000000;
457 count = 0x40 - UCOL_BYTE_COMMON;
458 }
459
460 if(tok->next != NULL && tok->next->strength == strength) {
461 count = tok->next->toInsert;
462 }
463
464 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
465 g->current = UCOL_BYTE_COMMON<<24;
466
467 if(g->noOfRanges == 0) {
468 *status = U_INTERNAL_PROGRAM_ERROR;
469 }
470 return g->current;
471 }
472
473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
474 uint32_t strength = tok->strength;
475 uint32_t low = lows[fStrength*3+strength];
476 uint32_t high = highs[fStrength*3+strength];
477 uint32_t maxByte = 0;
478 if(strength == UCOL_TERTIARY) {
479 maxByte = 0x3F;
480 } else if(strength == UCOL_PRIMARY) {
481 maxByte = 0xFE;
482 } else {
483 maxByte = 0xFF;
484 }
485
486 uint32_t count = tok->toInsert;
487
488 if(low >= high && strength > UCOL_PRIMARY) {
489 int32_t s = strength;
490 for(;;) {
491 s--;
492 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
493 if(strength == UCOL_SECONDARY) {
494 if (low < UCOL_COMMON_TOP2<<24 ) {
495 // Override if low range is less than UCOL_COMMON_TOP2.
496 low = UCOL_COMMON_TOP2<<24;
497 }
498 high = 0xFFFFFFFF;
499 } else {
500 // Override if low range is less than UCOL_COMMON_BOT3.
501 if ( low < UCOL_COMMON_BOT3<<24 ) {
502 low = UCOL_COMMON_BOT3<<24;
503 }
504 high = 0x40000000;
505 }
506 break;
507 }
508 if(s<0) {
509 *status = U_INTERNAL_PROGRAM_ERROR;
510 return 0;
511 }
512 }
513 }
514
515 if(low == 0) {
516 low = 0x01000000;
517 }
518
519 if(strength == UCOL_SECONDARY) { /* similar as simple */
520 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
521 low = UCOL_COMMON_TOP2<<24;
522 }
523 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
524 high = UCOL_COMMON_TOP2<<24;
525 }
526 if(low < (UCOL_COMMON_BOT2<<24)) {
527 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
528 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
529 //g->current = UCOL_COMMON_BOT2<<24;
530 return g->current;
531 }
532 }
533
534 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
535 if(g->noOfRanges == 0) {
536 *status = U_INTERNAL_PROGRAM_ERROR;
537 }
538 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
539 return g->current;
540 }
541
542 static
543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
544 uint32_t i = 0;
545 UChar c;
546
547 if(U_FAILURE(*status)) {
548 return 0;
549 }
550
551 if(sourceLen > resLen) {
552 *status = U_MEMORY_ALLOCATION_ERROR;
553 return 0;
554 }
555
556 for(i = 0; i < sourceLen; i++) {
557 c = source[i];
558 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
559 switch(c - 0x3000) {
560 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
561 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
562 c++;
563 break;
564 case 0xF5:
565 c = 0x30AB;
566 break;
567 case 0xF6:
568 c = 0x30B1;
569 break;
570 }
571 }
572 resBuf[i] = c;
573 }
574 return sourceLen;
575 }
576
577 static
578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
579 uint32_t i = 0;
580 UChar c;
581
582 if(U_FAILURE(*status)) {
583 return 0;
584 }
585
586 if(sourceLen > resLen) {
587 *status = U_MEMORY_ALLOCATION_ERROR;
588 return 0;
589 }
590
591 for(i = 0; i < sourceLen; i++) {
592 c = source[i];
593 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
594 switch(c - 0x3000) {
595 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
596 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
597 c--;
598 break;
599 case 0xAB:
600 c = 0x30F5;
601 break;
602 case 0xB1:
603 c = 0x30F6;
604 break;
605 }
606 }
607 resBuf[i] = c;
608 }
609 return sourceLen;
610 }
611
612 static
613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
614 uint32_t i = 0;
615 UChar n[128];
616 uint32_t nLen = 0;
617 uint32_t uCount = 0, lCount = 0;
618
619 collIterate s;
620 uint32_t order = 0;
621
622 if(U_FAILURE(*status)) {
623 return UCOL_LOWER_CASE;
624 }
625
626 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
627 if(U_SUCCESS(*status)) {
628 for(i = 0; i < nLen; i++) {
629 uprv_init_collIterate(UCA, &n[i], 1, &s);
630 order = ucol_getNextCE(UCA, &s, status);
631 if(isContinuation(order)) {
632 *status = U_INTERNAL_PROGRAM_ERROR;
633 return UCOL_LOWER_CASE;
634 }
635 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
636 uCount++;
637 } else {
638 if(u_islower(n[i])) {
639 lCount++;
640 } else {
641 UChar sk[1], lk[1];
642 u_toSmallKana(&n[i], 1, sk, 1, status);
643 u_toLargeKana(&n[i], 1, lk, 1, status);
644 if(sk[0] == n[i] && lk[0] != n[i]) {
645 lCount++;
646 }
647 }
648 }
649 }
650 }
651
652 if(uCount != 0 && lCount != 0) {
653 return UCOL_MIXED_CASE;
654 } else if(uCount != 0) {
655 return UCOL_UPPER_CASE;
656 } else {
657 return UCOL_LOWER_CASE;
658 }
659 }
660
661
662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
663 /* this one makes the table and stuff */
664 uint32_t noOfBytes[3];
665 uint32_t i;
666
667 for(i = 0; i<3; i++) {
668 ucol_countBytes(CEparts[i], noOfBytes[i]);
669 }
670
671 /* Here we have to pack CEs from parts */
672
673 uint32_t CEi = 0;
674 uint32_t value = 0;
675
676 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
677 if(CEi > 0) {
678 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
679 } else {
680 value = 0;
681 }
682
683 if(2*CEi<noOfBytes[0]) {
684 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
685 }
686 if(CEi<noOfBytes[1]) {
687 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
688 }
689 if(CEi<noOfBytes[2]) {
690 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
691 }
692 tok->CEs[CEi] = value;
693 CEi++;
694 }
695 if(CEi == 0) { /* totally ignorable */
696 tok->noOfCEs = 1;
697 tok->CEs[0] = 0;
698 } else { /* there is at least something */
699 tok->noOfCEs = CEi;
700 }
701
702
703 // we want to set case bits here and now, not later.
704 // Case bits handling
705 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
706 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
707 int32_t cSize = (tok->source & 0xFF000000) >> 24;
708 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
709
710 if(cSize > 1) {
711 // Do it manually
712 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
713 } else {
714 // Copy it from the UCA
715 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
716 tok->CEs[0] |= (caseCE & 0xC0);
717 }
718 }
719
720 #if UCOL_DEBUG==2
721 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
722 for(i = 0; i<tok->noOfCEs; i++) {
723 fprintf(stderr, "%08X ", tok->CEs[i]);
724 }
725 fprintf(stderr, "\n");
726 #endif
727 }
728
729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
730 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
731 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
732
733 UColToken *tok = lh->last;
734 uint32_t t[UCOL_STRENGTH_LIMIT];
735
736 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
737
738 tok->toInsert = 1;
739 t[tok->strength] = 1;
740
741 while(tok->previous != NULL) {
742 if(tok->previous->strength < tok->strength) { /* going up */
743 t[tok->strength] = 0;
744 t[tok->previous->strength]++;
745 } else if(tok->previous->strength > tok->strength) { /* going down */
746 t[tok->previous->strength] = 1;
747 } else {
748 t[tok->strength]++;
749 }
750 tok=tok->previous;
751 tok->toInsert = t[tok->strength];
752 }
753
754 tok->toInsert = t[tok->strength];
755 ucol_inv_getGapPositions(src, lh, status);
756
757 #if UCOL_DEBUG
758 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
759 int32_t j = 2;
760 for(j = 2; j >= 0; j--) {
761 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
762 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
763 }
764 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
765
766 do {
767 fprintf(stderr,"%i", tok->strength);
768 tok = tok->next;
769 } while(tok != NULL);
770 fprintf(stderr, "\n");
771
772 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
773
774 do {
775 fprintf(stderr,"%i", tok->toInsert);
776 tok = tok->next;
777 } while(tok != NULL);
778 #endif
779
780 tok = lh->first;
781 uint32_t fStrength = UCOL_IDENTICAL;
782 uint32_t initStrength = UCOL_IDENTICAL;
783
784
785 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
786 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
787 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
788
789 while (tok != NULL && U_SUCCESS(*status)) {
790 fStrength = tok->strength;
791 if(fStrength < initStrength) {
792 initStrength = fStrength;
793 if(lh->pos[fStrength] == -1) {
794 while(lh->pos[fStrength] == -1 && fStrength > 0) {
795 fStrength--;
796 }
797 if(lh->pos[fStrength] == -1) {
798 *status = U_INTERNAL_PROGRAM_ERROR;
799 return;
800 }
801 }
802 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
803 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
804 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
805 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
806 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
807 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
808 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
809 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
810 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
811 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
812 } else { /* primaries */
813 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
814 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
815 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
816 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
817 }
818 } else {
819 if(tok->strength == UCOL_TERTIARY) {
820 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
821 } else if(tok->strength == UCOL_SECONDARY) {
822 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
824 } else if(tok->strength == UCOL_PRIMARY) {
825 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
826 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
827 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
828 }
829 }
830 ucol_doCE(src, CEparts, tok, status);
831 tok = tok->next;
832 }
833 }
834
835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
836 UCAElements el;
837 UColToken *tok = lh->first;
838 UColToken *expt = NULL;
839 uint32_t i = 0, j = 0;
840 const uint16_t *fcdTrieData = unorm_getFCDTrie(status);
841
842 while(tok != NULL && U_SUCCESS(*status)) {
843 /* first, check if there are any expansions */
844 /* if there are expansions, we need to do a little bit more processing */
845 /* since parts of expansion can be tailored, while others are not */
846 if(tok->expansion != 0) {
847 uint32_t len = tok->expansion >> 24;
848 uint32_t currentSequenceLen = len;
849 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
850 //uint32_t exp = currentSequenceLen | expOffset;
851 UColToken exp;
852 exp.source = currentSequenceLen | expOffset;
853 exp.rulesToParse = src->source;
854
855 while(len > 0) {
856 currentSequenceLen = len;
857 while(currentSequenceLen > 0) {
858 exp.source = (currentSequenceLen << 24) | expOffset;
859 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
860 uint32_t noOfCEsToCopy = expt->noOfCEs;
861 for(j = 0; j<noOfCEsToCopy; j++) {
862 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
863 }
864 tok->noOfExpCEs += noOfCEsToCopy;
865 // Smart people never try to add codepoints and CEs.
866 // For some odd reason, it won't work.
867 expOffset += currentSequenceLen; //noOfCEsToCopy;
868 len -= currentSequenceLen; //noOfCEsToCopy;
869 break;
870 } else {
871 currentSequenceLen--;
872 }
873 }
874 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
875 /* will have to get one from UCA */
876 /* first, get the UChars from the rules */
877 /* then pick CEs out until there is no more and stuff them into expansion */
878 collIterate s;
879 uint32_t order = 0;
880 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
881
882 for(;;) {
883 order = ucol_getNextCE(src->UCA, &s, status);
884 if(order == UCOL_NO_MORE_CES) {
885 break;
886 }
887 tok->expCEs[tok->noOfExpCEs++] = order;
888 }
889 expOffset++;
890 len--;
891 }
892 }
893 } else {
894 tok->noOfExpCEs = 0;
895 }
896
897 /* set the ucaelement with obtained values */
898 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
899 /* copy CEs */
900 for(i = 0; i<tok->noOfCEs; i++) {
901 el.CEs[i] = tok->CEs[i];
902 }
903 for(i = 0; i<tok->noOfExpCEs; i++) {
904 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
905 }
906
907 /* copy UChars */
908 // We kept prefix and source kind of together, as it is a kind of a contraction.
909 // However, now we have to slice the prefix off the main thing -
910 el.prefix = el.prefixChars;
911 el.cPoints = el.uchars;
912 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
913 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
914 // decomposed elements to the unsaf table.
915 el.prefixSize = tok->prefix>>24;
916 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
917
918 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
919 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
920 } else {
921 el.prefixSize = 0;
922 *el.prefix = 0;
923
924 el.cSize = (tok->source >> 24);
925 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
926 }
927 if(src->UCA != NULL) {
928 UBool containCombinMarks = FALSE;
929 for(i = 0; i<el.cSize; i++) {
930 if(UCOL_ISJAMO(el.cPoints[i])) {
931 t->image->jamoSpecial = TRUE;
932 }
933 if ( !src->buildCCTabFlag ) {
934 // check combining class
935 int16_t fcd = unorm_getFCD16(fcdTrieData, el.cPoints[i]);
936 if ( (fcd && 0xff) == 0 ) {
937 // reset flag when current char is not combining mark.
938 containCombinMarks = FALSE;
939 }
940 else {
941 containCombinMarks = TRUE;
942 }
943 }
944 }
945 if ( !src->buildCCTabFlag && containCombinMarks ) {
946 src->buildCCTabFlag = TRUE;
947 }
948 }
949
950 /* and then, add it */
951 #if UCOL_DEBUG==2
952 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
953 #endif
954 uprv_uca_addAnElement(t, &el, status);
955
956 #if UCOL_DEBUG_DUPLICATES
957 if(*status != U_ZERO_ERROR) {
958 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
959 *status = U_ZERO_ERROR;
960 }
961 #endif
962
963 tok = tok->next;
964 }
965 }
966
967 U_CDECL_BEGIN
968 static UBool U_CALLCONV
969 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
970 UErrorCode status = U_ZERO_ERROR;
971 tempUCATable *t = (tempUCATable *)context;
972 if(value == 0) {
973 while(start < limit) {
974 uint32_t CE = utrie_get32(t->mapping, start, NULL);
975 if(CE == UCOL_NOT_FOUND) {
976 UCAElements el;
977 el.isThai = FALSE;
978 el.prefixSize = 0;
979 el.prefixChars[0] = 0;
980 el.prefix = el.prefixChars;
981 el.cPoints = el.uchars;
982
983 el.cSize = 0;
984 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
985
986 el.noOfCEs = 1;
987 el.CEs[0] = 0;
988 uprv_uca_addAnElement(t, &el, &status);
989
990 }
991 start++;
992 }
993 }
994 if(U_FAILURE(status)) {
995 return FALSE;
996 } else {
997 return TRUE;
998 }
999 }
1000 U_CDECL_END
1001
1002 static void
1003 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
1004 UChar32 start, UChar32 end,
1005 UErrorCode *status)
1006 {
1007 //UChar decomp[256];
1008 uint32_t CE = UCOL_NOT_FOUND;
1009 UChar32 u = 0;
1010 UCAElements el;
1011 el.isThai = FALSE;
1012 el.prefixSize = 0;
1013 el.prefixChars[0] = 0;
1014 collIterate colIt;
1015
1016 if(U_SUCCESS(*status)) {
1017 for(u = start; u<=end; u++) {
1018 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
1019 /* this test is for contractions that are missing the starting element. */
1020 || ((isCntTableElement(CE)) &&
1021 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
1022 )
1023 {
1024 el.cSize = 0;
1025 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
1026 //decomp[0] = (UChar)u;
1027 //el.uchars[0] = (UChar)u;
1028 el.cPoints = el.uchars;
1029 //el.cSize = 1;
1030 el.noOfCEs = 0;
1031 el.prefix = el.prefixChars;
1032 el.prefixSize = 0;
1033 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1034 // We actually want to check whether this element is a special
1035 // If it is an implicit element (hangul, CJK - we want to copy the
1036 // special, not the resolved CEs) - for hangul, copying resolved
1037 // would just make things the same (there is an expansion and it
1038 // takes approximately the same amount of time to resolve as
1039 // falling back to the UCA).
1040 /*
1041 UTRIE_GET32(src->UCA->mapping, u, CE);
1042 tag = getCETag(CE);
1043 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1044 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1045 || tag == LEAD_SURROGATE_TAG) {
1046 el.CEs[el.noOfCEs++] = CE;
1047 } else {
1048 */
1049 // It turns out that it does not make sense to keep implicits
1050 // unresolved. The cost of resolving them is big enough so that
1051 // it doesn't make any difference whether we have to go to the UCA
1052 // or not.
1053 {
1054 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1055 while(CE != UCOL_NO_MORE_CES) {
1056 CE = ucol_getNextCE(src->UCA, &colIt, status);
1057 if(CE != UCOL_NO_MORE_CES) {
1058 el.CEs[el.noOfCEs++] = CE;
1059 }
1060 }
1061 }
1062 uprv_uca_addAnElement(t, &el, status);
1063 }
1064 }
1065 }
1066 }
1067
1068 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1069 U_NAMESPACE_USE
1070
1071 uint32_t i = 0;
1072 if(U_FAILURE(*status)) {
1073 return NULL;
1074 }
1075 /*
1076 2. Eliminate the negative lists by doing the following for each non-null negative list:
1077 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1078 create new ListHeader X
1079 o reverse the list, add to the end of X's positive list. Reset the strength of the
1080 first item you add, based on the stronger strength levels of the two lists.
1081 */
1082 /*
1083 3. For each ListHeader with a non-null positive list:
1084 */
1085 /*
1086 o Find all character strings with CEs between the baseCE and the
1087 next/previous CE, at the strength of the first token. Add these to the
1088 tailoring.
1089 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1090 tailoring has & x < z...
1091 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1092 */
1093 /* It is possible that this part should be done even while constructing list */
1094 /* The problem is that it is unknown what is going to be the strongest weight */
1095 /* So we might as well do it here */
1096
1097 /*
1098 o Allocate CEs for each token in the list, based on the total number N of the
1099 largest level difference, and the gap G between baseCE and nextCE at that
1100 level. The relation * between the last item and nextCE is the same as the
1101 strongest strength.
1102 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1103 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1104 Then fit b and c into the secondary gap between a and d, then fit q
1105 into the tertiary gap between b and c.
1106
1107 o Example: baseCE << b <<< q << c * nextCE(X,2)
1108 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1109 Then fit q into the tertiary gap between b and c.
1110 o When incrementing primary values, we will not cross high byte
1111 boundaries except where there is only a single-byte primary. That is to
1112 ensure that the script reordering will continue to work.
1113 */
1114 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1115 /* test for NULL */
1116 if (image == NULL) {
1117 *status = U_MEMORY_ALLOCATION_ERROR;
1118 return NULL;
1119 }
1120 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1121
1122 for(i = 0; i<src->resultLen; i++) {
1123 /* now we need to generate the CEs */
1124 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1125 /* According to strength */
1126 if(U_SUCCESS(*status)) {
1127 if(src->lh[i].first) { // if there are any elements
1128 // due to the way parser works, subsequent tailorings
1129 // may remove all the elements from a sequence, therefore
1130 // leaving an empty tailoring sequence.
1131 ucol_initBuffers(src, &src->lh[i], status);
1132 }
1133 }
1134 if(U_FAILURE(*status)) {
1135 uprv_free(image);
1136 return NULL;
1137 }
1138 }
1139
1140 if(src->varTop != NULL) { /* stuff the variable top value */
1141 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1142 /* remove it from the list */
1143 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1144 src->varTop->listHeader->first = src->varTop->next;
1145 }
1146 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1147 src->varTop->listHeader->last = src->varTop->previous;
1148 }
1149 if(src->varTop->next != NULL) {
1150 src->varTop->next->previous = src->varTop->previous;
1151 }
1152 if(src->varTop->previous != NULL) {
1153 src->varTop->previous->next = src->varTop->next;
1154 }
1155 }
1156
1157
1158 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1159 if(U_FAILURE(*status)) {
1160 uprv_free(image);
1161 return NULL;
1162 }
1163
1164
1165 /* After this, we have assigned CE values to all regular CEs */
1166 /* now we will go through list once more and resolve expansions, */
1167 /* make UCAElements structs and add them to table */
1168 for(i = 0; i<src->resultLen; i++) {
1169 /* now we need to generate the CEs */
1170 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1171 /* According to strength */
1172 if(U_SUCCESS(*status)) {
1173 ucol_createElements(src, t, &src->lh[i], status);
1174 }
1175 }
1176
1177 UCAElements el;
1178 el.isThai = FALSE;
1179 el.prefixSize = 0;
1180 el.prefixChars[0] = 0;
1181
1182 /* add latin-1 stuff */
1183 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1184
1185 /* add stuff for copying */
1186 if(src->copySet != NULL) {
1187 int32_t i = 0;
1188 UnicodeSet *set = (UnicodeSet *)src->copySet;
1189 for(i = 0; i < set->getRangeCount(); i++) {
1190 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1191 }
1192 }
1193
1194 if(U_SUCCESS(*status)) {
1195 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1196
1197 uint32_t tailoredCE = UCOL_NOT_FOUND;
1198 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1199 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1200 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1201 // Check for null pointer
1202 if (ucaEl == NULL) {
1203 *status = U_MEMORY_ALLOCATION_ERROR;
1204 return NULL;
1205 }
1206 while(*conts != 0) {
1207 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1208 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1209 if(tailoredCE != UCOL_NOT_FOUND) {
1210 UBool needToAdd = TRUE;
1211 if(isCntTableElement(tailoredCE)) {
1212 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1213 needToAdd = FALSE;
1214 }
1215 }
1216 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
1217 UCAElements elm;
1218 elm.cPoints = el.uchars;
1219 elm.noOfCEs = 0;
1220 elm.uchars[0] = *conts;
1221 elm.uchars[1] = 0;
1222 elm.cSize = 1;
1223 elm.prefixChars[0] = *(conts+2);
1224 elm.isThai = FALSE;
1225 elm.prefix = elm.prefixChars;
1226 elm.prefixSize = 1;
1227 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
1228 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
1229 needToAdd = TRUE;
1230 }
1231 }
1232 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1233 needToAdd = FALSE;
1234 }
1235
1236 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1237 if (*(conts+1) != 0) { // contractions
1238 el.prefix = el.prefixChars;
1239 el.prefixSize = 0;
1240 el.cPoints = el.uchars;
1241 el.noOfCEs = 0;
1242 el.uchars[0] = *conts;
1243 el.uchars[1] = *(conts+1);
1244 if(*(conts+2)!=0) {
1245 el.uchars[2] = *(conts+2);
1246 el.cSize = 3;
1247 } else {
1248 el.cSize = 2;
1249 }
1250 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1251 }
1252 else { // pre-context character
1253 UChar str[4] = { 0 };
1254 int32_t len=0;
1255 int32_t preKeyLen=0;
1256
1257 el.cPoints = el.uchars;
1258 el.noOfCEs = 0;
1259 el.uchars[0] = *conts;
1260 el.uchars[1] = 0;
1261 el.cSize = 1;
1262 el.prefixChars[0] = *(conts+2);
1263 el.prefix = el.prefixChars;
1264 el.prefixSize = 1;
1265 if (el.prefixChars[0]!=0) {
1266 // get CE of prefix character first
1267 str[0]=el.prefixChars[0];
1268 str[1]=0;
1269 ucol_setText(ucaEl, str, 1, status);
1270 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
1271 != UCOL_NULLORDER) {
1272 preKeyLen++; // count number of keys for prefix character
1273 }
1274 str[len++] = el.prefixChars[0];
1275 }
1276
1277 str[len++] = el.uchars[0];
1278 str[len]=0;
1279 ucol_setText(ucaEl, str, len, status);
1280 // Skip the keys for prefix character, then copy the rest to el.
1281 while ((preKeyLen-->0) &&
1282 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1283 continue;
1284 }
1285
1286 }
1287 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1288 el.noOfCEs++;
1289 }
1290 uprv_uca_addAnElement(t, &el, status);
1291 }
1292
1293 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1294 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1295 }
1296 conts+=3;
1297 }
1298 ucol_closeElements(ucaEl);
1299 }
1300
1301 // Add completely ignorable elements
1302 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1303
1304 // add tailoring characters related canonical closures
1305 uprv_uca_canonicalClosure(t, src, status);
1306
1307 /* still need to produce compatibility closure */
1308
1309 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1310
1311 uprv_uca_closeTempTable(t);
1312 uprv_free(image);
1313
1314 return myData;
1315 }
1316
1317 U_CDECL_BEGIN
1318 static UBool U_CALLCONV
1319 ucol_bld_cleanup(void)
1320 {
1321 udata_close(invUCA_DATA_MEM);
1322 invUCA_DATA_MEM = NULL;
1323 _staticInvUCA = NULL;
1324 return TRUE;
1325 }
1326 U_CDECL_END
1327
1328 U_CAPI const InverseUCATableHeader * U_EXPORT2
1329 ucol_initInverseUCA(UErrorCode *status)
1330 {
1331 if(U_FAILURE(*status)) return NULL;
1332
1333 UBool needsInit;
1334 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
1335
1336 if(needsInit) {
1337 InverseUCATableHeader *newInvUCA = NULL;
1338 UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1339
1340 if(U_FAILURE(*status)) {
1341 if (result) {
1342 udata_close(result);
1343 }
1344 // This is not needed, as we are talking about
1345 // memory we got from UData
1346 //uprv_free(newInvUCA);
1347 }
1348
1349 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1350 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1351 UCollator *UCA = ucol_initUCA(status);
1352 // UCA versions of UCA and inverse UCA should match
1353 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1354 *status = U_INVALID_FORMAT_ERROR;
1355 udata_close(result);
1356 return NULL;
1357 }
1358
1359 umtx_lock(NULL);
1360 if(_staticInvUCA == NULL) {
1361 _staticInvUCA = newInvUCA;
1362 invUCA_DATA_MEM = result;
1363 result = NULL;
1364 newInvUCA = NULL;
1365 }
1366 umtx_unlock(NULL);
1367
1368 if(newInvUCA != NULL) {
1369 udata_close(result);
1370 // This is not needed, as we are talking about
1371 // memory we got from UData
1372 //uprv_free(newInvUCA);
1373 }
1374 else {
1375 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1376 }
1377 }
1378 }
1379 return _staticInvUCA;
1380 }
1381
1382 #endif /* #if !UCONFIG_NO_COLLATION */