]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/ucol_bld.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2001-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucol_bld.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created 02/22/2001
14 * created by: Vladimir Weinstein
15 *
16 * This module builds a collator based on the rule set.
17 *
18 */
19
20 #include "unicode/utypes.h"
21
22 #if !UCONFIG_NO_COLLATION
23
24 #include "unicode/ucoleitr.h"
25 #include "unicode/uchar.h"
26 #include "ucol_bld.h"
27 #include "ucln_in.h"
28 #include "umutex.h"
29 #include "unicode/uniset.h"
30
/* File-scope state for the inverse UCA table; both pointers start out NULL. */
/* NOTE(review): presumably a lazily loaded cache of the inverse UCA data and */
/* the UDataMemory that backs it - confirm against the loader/cleanup code.   */
31 static const InverseUCATableHeader* _staticInvUCA = NULL;
32 static UDataMemory* invUCA_DATA_MEM = NULL;
33
34 U_CDECL_BEGIN
35 static UBool U_CALLCONV
36 isAcceptableInvUCA(void * /*context*/,
37 const char * /*type*/, const char * /*name*/,
38 const UDataInfo *pInfo){
39 /* context, type & name are intentionally not used */
40 if( pInfo->size>=20 &&
41 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
42 pInfo->charsetFamily==U_CHARSET_FAMILY &&
43 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
44 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
45 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
46 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
47 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
48 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
49 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
50 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
51 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
52 ) {
53 UVersionInfo UCDVersion;
54 u_getUnicodeVersion(UCDVersion);
55 if(pInfo->dataVersion[0]==UCDVersion[0] &&
56 pInfo->dataVersion[1]==UCDVersion[1]) {
57 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
58 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
59 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
60 return TRUE;
61 } else {
62 return FALSE;
63 }
64 } else {
65 return FALSE;
66 }
67 }
68 U_CDECL_END
69
70 /*
71 * Takes two CEs (lead and continuation) and
72 * compares them as CEs should be compared:
73 * primary vs. primary, secondary vs. secondary
74 * tertiary vs. tertiary
75 */
76 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
77 uint32_t s1 = source0, s2, t1 = target0, t2;
78 if(isContinuation(source1)) {
79 s2 = source1;
80 } else {
81 s2 = 0;
82 }
83 if(isContinuation(target1)) {
84 t2 = target1;
85 } else {
86 t2 = 0;
87 }
88
89 uint32_t s = 0, t = 0;
90 if(s1 == t1 && s2 == t2) {
91 return 0;
92 }
93 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
94 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
95 if(s < t) {
96 return -1;
97 } else if(s > t) {
98 return 1;
99 } else {
100 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
101 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
102 if(s < t) {
103 return -1;
104 } else if(s > t) {
105 return 1;
106 } else {
107 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
108 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
109 if(s < t) {
110 return -1;
111 } else {
112 return 1;
113 }
114 }
115 }
116 }
117
118 static
119 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
120 uint32_t bottom = 0, top = src->invUCA->tableSize;
121 uint32_t i = 0;
122 uint32_t first = 0, second = 0;
123 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
124 int32_t res = 0;
125
126 while(bottom < top-1) {
127 i = (top+bottom)/2;
128 first = *(CETable+3*i);
129 second = *(CETable+3*i+1);
130 res = compareCEs(first, second, CE, SecondCE);
131 if(res > 0) {
132 top = i;
133 } else if(res < 0) {
134 bottom = i;
135 } else {
136 break;
137 }
138 }
139
140 /* weiv: */
141 /* in searching for elements, I have removed the failure */
142 /* The reason for this is that the builder does not rely */
143 /* on search mechanism telling it that it didn't find an */
144 /* element. However, indirect positioning relies on being */
145 /* able to find the elements around any CE, even if it is */
146 /* not defined in the UCA. */
147 return i;
148 /*
149 if((first == CE && second == SecondCE)) {
150 return i;
151 } else {
152 return -1;
153 }
154 */
155 }
156
/* Bit masks applied to a CE, indexed by strength (this file indexes it with */
/* UCOL_PRIMARY and UCOL_SECONDARY, among others): the first entry keeps the */
/* upper 16 bits, the second the upper 24 bits, the third all 32 bits.       */
157 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
158 0xFFFF0000,
159 0xFFFFFF00,
160 0xFFFFFFFF
161 };
162
163 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
164 uint32_t CE, uint32_t contCE,
165 uint32_t *nextCE, uint32_t *nextContCE,
166 uint32_t strength) {
167 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
168 int32_t iCE;
169
170 iCE = ucol_inv_findCE(src, CE, contCE);
171
172 if(iCE<0) {
173 *nextCE = UCOL_NOT_FOUND;
174 return -1;
175 }
176
177 CE &= strengthMask[strength];
178 contCE &= strengthMask[strength];
179
180 *nextCE = CE;
181 *nextContCE = contCE;
182
183 while((*nextCE & strengthMask[strength]) == CE
184 && (*nextContCE & strengthMask[strength]) == contCE) {
185 *nextCE = (*(CETable+3*(++iCE)));
186 *nextContCE = (*(CETable+3*(iCE)+1));
187 }
188
189 return iCE;
190 }
191
192 U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
193 uint32_t CE, uint32_t contCE,
194 uint32_t *prevCE, uint32_t *prevContCE,
195 uint32_t strength) {
196 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
197 int32_t iCE;
198
199 iCE = ucol_inv_findCE(src, CE, contCE);
200
201 if(iCE<0) {
202 *prevCE = UCOL_NOT_FOUND;
203 return -1;
204 }
205
206 CE &= strengthMask[strength];
207 contCE &= strengthMask[strength];
208
209 *prevCE = CE;
210 *prevContCE = contCE;
211
212 while((*prevCE & strengthMask[strength]) == CE
213 && (*prevContCE & strengthMask[strength])== contCE
214 && iCE > 0) { /* this condition should prevent falling off the edge of the world */
215 /* here, we end up in a singularity - zero */
216 *prevCE = (*(CETable+3*(--iCE)));
217 *prevContCE = (*(CETable+3*(iCE)+1));
218 }
219
220 return iCE;
221 }
222
223 U_CAPI uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
224 uint32_t prevCE, uint32_t prevContCE)
225 {
226 if(prevCE == CE && prevContCE == contCE) {
227 return UCOL_IDENTICAL;
228 }
229 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
230 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) {
231 return UCOL_PRIMARY;
232 }
233 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
234 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) {
235 return UCOL_SECONDARY;
236 }
237 return UCOL_TERTIARY;
238 }
239
240
241 static
242 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
243
244 uint32_t CE = lh->baseCE;
245 uint32_t SecondCE = lh->baseContCE;
246
247 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
248 uint32_t previousCE, previousContCE;
249 int32_t iCE;
250
251 iCE = ucol_inv_findCE(src, CE, SecondCE);
252
253 if(iCE<0) {
254 return -1;
255 }
256
257 CE &= strengthMask[strength];
258 SecondCE &= strengthMask[strength];
259
260 previousCE = CE;
261 previousContCE = SecondCE;
262
263 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
264 previousCE = (*(CETable+3*(--iCE)));
265 previousContCE = (*(CETable+3*(iCE)+1));
266 }
267 lh->previousCE = previousCE;
268 lh->previousContCE = previousContCE;
269
270 return iCE;
271 }
272
273 static
274 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
275 uint32_t CE = lh->baseCE;
276 uint32_t SecondCE = lh->baseContCE;
277
278 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
279 uint32_t nextCE, nextContCE;
280 int32_t iCE;
281
282 iCE = ucol_inv_findCE(src, CE, SecondCE);
283
284 if(iCE<0) {
285 return -1;
286 }
287
288 CE &= strengthMask[strength];
289 SecondCE &= strengthMask[strength];
290
291 nextCE = CE;
292 nextContCE = SecondCE;
293
294 while((nextCE & strengthMask[strength]) == CE
295 && (nextContCE & strengthMask[strength]) == SecondCE) {
296 nextCE = (*(CETable+3*(++iCE)));
297 nextContCE = (*(CETable+3*(iCE)+1));
298 }
299
300 lh->nextCE = nextCE;
301 lh->nextContCE = nextContCE;
302
303 return iCE;
304 }
305
/*
 * Computes, for the token list lh, the low and high boundaries of the weight
 * gaps (lh->gapsLo / lh->gapsHi, three words per strength) into which the
 * tailored tokens are later placed, plus the per-strength bookkeeping
 * (lh->pos, lh->fStrToken, lh->lStrToken, lh->numStr).
 *
 * Three cases are handled:
 *  - the top byte of lh->baseCE lies in the implicit primary range taken from
 *    the UCA constants: the gap runs from the base CE to the next implicit
 *    primary;
 *  - lh->indirect is set and lh->nextCE is non-zero: the gap runs from the
 *    base CE to lh->nextCE;
 *  - otherwise the token list is walked and the upper boundaries are looked
 *    up in the inverse UCA table through ucol_inv_getNext().
 *
 * *status is set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() reports
 * a negative position.
 */
306 U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
307 /* reset all the gaps */
308 int32_t i = 0;
309 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
310 uint32_t st = 0;
311 uint32_t t1, t2;
312 int32_t pos;
313
314 UColToken *tok = lh->first;
315 uint32_t tokStrength = tok->strength;
316
317 for(i = 0; i<3; i++) {
318 lh->gapsHi[3*i] = 0;
319 lh->gapsHi[3*i+1] = 0;
320 lh->gapsHi[3*i+2] = 0;
321 lh->gapsLo[3*i] = 0;
322 lh->gapsLo[3*i+1] = 0;
323 lh->gapsLo[3*i+2] = 0;
324 lh->numStr[i] = 0;
325 lh->fStrToken[i] = NULL;
326 lh->lStrToken[i] = NULL;
327 lh->pos[i] = -1;
328 }
329
330 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
331
332 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
333 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
334 lh->pos[0] = 0;
335 t1 = lh->baseCE;
336 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
/* low boundary: the base CE split into primary / secondary / tertiary words */
337 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
338 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
339 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
/* high boundary: the implicit primary whose raw value is one above the base's */
340 uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
341 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
342
343 t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
344 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
345
346 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
347 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
348 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
349 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
350 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
/* indirect positioning: the gap lies between the base CE and lh->nextCE */
351 lh->pos[0] = 0;
352 t1 = lh->baseCE;
353 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
354 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
355 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
356 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
357 t1 = lh->nextCE;
358 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
359 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
360 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
361 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
362 } else {
/* walk the token list one strength run at a time */
363 for(;;) {
364 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
365 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
366 lh->fStrToken[tokStrength] = tok;
367 } else { /* The CE must be implicit, since it's not in the table */
368 /* Error */
369 *status = U_INTERNAL_PROGRAM_ERROR;
370 }
371 }
372
/* skip tokens that are not stronger than the current run, remembering the last one */
373 while(tok != NULL && tok->strength >= tokStrength) {
374 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
375 lh->lStrToken[tokStrength] = tok;
376 }
377 tok = tok->next;
378 }
379 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
380 /* check if previous interval is the same and merge the intervals if it is so */
381 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
382 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
383 lh->fStrToken[tokStrength+1] = NULL;
384 lh->lStrToken[tokStrength+1] = NULL;
385 lh->pos[tokStrength+1] = -1;
386 }
387 }
388 if(tok != NULL) {
389 tokStrength = tok->strength;
390 } else {
391 break;
392 }
393 }
/* for every strength with a position: high boundary from the table entry, low boundary from the base CE */
394 for(st = 0; st < 3; st++) {
395 if((pos = lh->pos[st]) >= 0) {
396 t1 = *(CETable+3*(pos));
397 t2 = *(CETable+3*(pos)+1);
398 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
399 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
400 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
401 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
402 //pos--;
403 //t1 = *(CETable+3*(pos));
404 //t2 = *(CETable+3*(pos)+1);
405 t1 = lh->baseCE;
406 t2 = lh->baseContCE;
407 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
408 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
409 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
410 }
411 }
412 }
413 }
414
415
/*
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF still select a non-zero part of value. This is
 * the number of bytes, counted from the most significant one, up to and
 * including the last non-zero byte (0 for a zero value).
 * The expansion is a plain brace block and evaluates value several times.
 */
416 #define ucol_countBytes(value, noOfBytes) \
417 { \
418 uint32_t mask = 0xFFFFFFFF; \
419 (noOfBytes) = 0; \
420 while(mask != 0) { \
421 if(((value) & mask) != 0) { \
422 (noOfBytes)++; \
423 } \
424 mask >>= 8; \
425 } \
426 }
427
428 U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
429 if(U_SUCCESS(*status)) {
430 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
431 }
432 return g->current;
433 }
434
435 U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
436 /* TODO: rename to enum names */
437 uint32_t high, low, count=1;
438 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
439
440 if(strength == UCOL_SECONDARY) {
441 low = UCOL_COMMON_TOP2<<24;
442 high = 0xFFFFFFFF;
443 count = 0xFF - UCOL_COMMON_TOP2;
444 } else {
445 low = UCOL_BYTE_COMMON << 24; //0x05000000;
446 high = 0x40000000;
447 count = 0x40 - UCOL_BYTE_COMMON;
448 }
449
450 if(tok->next != NULL && tok->next->strength == strength) {
451 count = tok->next->toInsert;
452 }
453
454 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
455 g->current = UCOL_BYTE_COMMON<<24;
456
457 if(g->noOfRanges == 0) {
458 *status = U_INTERNAL_PROGRAM_ERROR;
459 }
460 return g->current;
461 }
462
463 U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
464 uint32_t strength = tok->strength;
465 uint32_t low = lows[fStrength*3+strength];
466 uint32_t high = highs[fStrength*3+strength];
467 uint32_t maxByte = 0;
468 if(strength == UCOL_TERTIARY) {
469 maxByte = 0x3F;
470 } else if(strength == UCOL_PRIMARY) {
471 maxByte = 0xFE;
472 } else {
473 maxByte = 0xFF;
474 }
475
476 uint32_t count = tok->toInsert;
477
478 if(low >= high && strength > UCOL_PRIMARY) {
479 int32_t s = strength;
480 for(;;) {
481 s--;
482 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
483 if(strength == UCOL_SECONDARY) {
484 low = UCOL_COMMON_TOP2<<24;
485 high = 0xFFFFFFFF;
486 } else {
487 //low = 0x02000000; // This needs to be checked - what if low is
488 // not good...
489 high = 0x40000000;
490 }
491 break;
492 }
493 if(s<0) {
494 *status = U_INTERNAL_PROGRAM_ERROR;
495 return 0;
496 }
497 }
498 }
499
500 if(low == 0) {
501 low = 0x01000000;
502 }
503
504 if(strength == UCOL_SECONDARY) { /* similar as simple */
505 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
506 low = UCOL_COMMON_TOP2<<24;
507 }
508 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
509 high = UCOL_COMMON_TOP2<<24;
510 }
511 if(low < (UCOL_COMMON_BOT2<<24)) {
512 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
513 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
514 //g->current = UCOL_COMMON_BOT2<<24;
515 return g->current;
516 }
517 }
518
519 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
520 if(g->noOfRanges == 0) {
521 *status = U_INTERNAL_PROGRAM_ERROR;
522 }
523 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
524 return g->current;
525 }
526
527 static
528 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
529 uint32_t i = 0;
530 UChar c;
531
532 if(U_FAILURE(*status)) {
533 return 0;
534 }
535
536 if(sourceLen > resLen) {
537 *status = U_MEMORY_ALLOCATION_ERROR;
538 return 0;
539 }
540
541 for(i = 0; i < sourceLen; i++) {
542 c = source[i];
543 if(0x3042 < c && c < 0x30ef) { /* Kana range */
544 switch(c - 0x3000) {
545 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
546 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
547 c++;
548 break;
549 case 0xF5:
550 c = 0x30AB;
551 break;
552 case 0xF6:
553 c = 0x30B1;
554 break;
555 }
556 }
557 resBuf[i] = c;
558 }
559 return sourceLen;
560 }
561
562 static
563 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
564 uint32_t i = 0;
565 UChar c;
566
567 if(U_FAILURE(*status)) {
568 return 0;
569 }
570
571 if(sourceLen > resLen) {
572 *status = U_MEMORY_ALLOCATION_ERROR;
573 return 0;
574 }
575
576 for(i = 0; i < sourceLen; i++) {
577 c = source[i];
578 if(0x3042 < c && c < 0x30ef) { /* Kana range */
579 switch(c - 0x3000) {
580 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
581 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
582 c--;
583 break;
584 case 0xAB:
585 c = 0x30F5;
586 break;
587 case 0xB1:
588 c = 0x30F6;
589 break;
590 }
591 }
592 resBuf[i] = c;
593 }
594 return sourceLen;
595 }
596
597 static
598 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
599 uint32_t i = 0;
600 UChar n[128];
601 uint32_t nLen = 0;
602 uint32_t uCount = 0, lCount = 0;
603
604 collIterate s;
605 uint32_t order = 0;
606
607 if(U_FAILURE(*status)) {
608 return UCOL_LOWER_CASE;
609 }
610
611 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
612 if(U_SUCCESS(*status)) {
613 for(i = 0; i < nLen; i++) {
614 uprv_init_collIterate(UCA, &n[i], 1, &s);
615 order = ucol_getNextCE(UCA, &s, status);
616 if(isContinuation(order)) {
617 *status = U_INTERNAL_PROGRAM_ERROR;
618 return UCOL_LOWER_CASE;
619 }
620 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
621 uCount++;
622 } else {
623 if(u_islower(n[i])) {
624 lCount++;
625 } else {
626 UChar sk[1], lk[1];
627 u_toSmallKana(&n[i], 1, sk, 1, status);
628 u_toLargeKana(&n[i], 1, lk, 1, status);
629 if(sk[0] == n[i] && lk[0] != n[i]) {
630 lCount++;
631 }
632 }
633 }
634 }
635 }
636
637 if(uCount != 0 && lCount != 0) {
638 return UCOL_MIXED_CASE;
639 } else if(uCount != 0) {
640 return UCOL_UPPER_CASE;
641 } else {
642 return UCOL_LOWER_CASE;
643 }
644 }
645
646
/*
 * Packs the three weight words in CEparts (primary, secondary, tertiary, each
 * left-aligned in 32 bits) into tok->CEs and sets tok->noOfCEs.
 *
 * Each produced CE takes 16 bits of primary, 8 bits of secondary and the low
 * 6 bits (0x3F) of a tertiary byte; every CE after the first carries
 * UCOL_CONTINUATION_MARKER. If no weight has any byte, a single zero CE is
 * stored.
 *
 * For a non-zero first CE the case bits (0xC0) are then replaced: computed by
 * ucol_uprv_getCaseBits() when the token's source is longer than one unit,
 * otherwise copied from the UCA's first CE for the single source unit.
 * tok->source holds the length in its top 8 bits and an offset into
 * src->source in its low 24 bits.
 */
647 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
648 /* this one makes the table and stuff */
649 uint32_t noOfBytes[3];
650 uint32_t i;
651
652 for(i = 0; i<3; i++) {
653 ucol_countBytes(CEparts[i], noOfBytes[i]);
654 }
655
656 /* Here we have to pack CEs from parts */
657
658 uint32_t CEi = 0;
659 uint32_t value = 0;
660
/* one CE per iteration: two primary bytes, one secondary byte, one tertiary byte */
661 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
662 if(CEi > 0) {
663 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
664 } else {
665 value = 0;
666 }
667
668 if(2*CEi<noOfBytes[0]) {
669 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
670 }
671 if(CEi<noOfBytes[1]) {
672 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
673 }
674 if(CEi<noOfBytes[2]) {
675 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
676 }
677 tok->CEs[CEi] = value;
678 CEi++;
679 }
680 if(CEi == 0) { /* totally ignorable */
681 tok->noOfCEs = 1;
682 tok->CEs[0] = 0;
683 } else { /* there is at least something */
684 tok->noOfCEs = CEi;
685 }
686
687
688 // we want to set case bits here and now, not later.
689 // Case bits handling
690 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
691 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
/* source length lives in the top byte, the offset into the rules in the low 24 bits */
692 int32_t cSize = (tok->source & 0xFF000000) >> 24;
693 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
694
695 if(cSize > 1) {
696 // Do it manually
697 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
698 } else {
699 // Copy it from the UCA
700 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
701 tok->CEs[0] |= (caseCE & 0xC0);
702 }
703 }
704
705 #if UCOL_DEBUG==2
706 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
707 for(i = 0; i<tok->noOfCEs; i++) {
708 fprintf(stderr, "%08X ", tok->CEs[i]);
709 }
710 fprintf(stderr, "\n");
711 #endif
712 }
713
/*
 * Assigns collation elements to every token of the list lh.
 *
 * First the list is walked backwards from lh->last to compute each token's
 * toInsert count (a running count per strength kept in t[]). Then
 * ucol_inv_getGapPositions() fills in the gap boundaries, and the list is
 * walked forwards from lh->first: whenever a token is stronger than anything
 * seen so far, new weight generators are set up from the gaps; otherwise the
 * next weight is taken from the existing generator of the token's strength
 * and the weaker levels are restarted with simple generators. Each token's
 * CEs are finally packed by ucol_doCE().
 *
 * The forward walk stops as soon as *status indicates a failure. *status is
 * set to U_INTERNAL_PROGRAM_ERROR when no gap position exists at or above the
 * required strength.
 */
714 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
715 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
716 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
717
718 UColToken *tok = lh->last;
719 uint32_t t[UCOL_STRENGTH_LIMIT];
720
721 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
722
723 tok->toInsert = 1;
724 t[tok->strength] = 1;
725
/* backward pass: compute toInsert for every token from the per-strength counters */
726 while(tok->previous != NULL) {
727 if(tok->previous->strength < tok->strength) { /* going up */
728 t[tok->strength] = 0;
729 t[tok->previous->strength]++;
730 } else if(tok->previous->strength > tok->strength) { /* going down */
731 t[tok->previous->strength] = 1;
732 } else {
733 t[tok->strength]++;
734 }
735 tok=tok->previous;
736 tok->toInsert = t[tok->strength];
737 }
738
739 tok->toInsert = t[tok->strength];
740 ucol_inv_getGapPositions(src, lh, status);
741
742 #if UCOL_DEBUG
743 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
744 int32_t j = 2;
745 for(j = 2; j >= 0; j--) {
746 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
747 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
748 }
749 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
750
751 do {
752 fprintf(stderr,"%i", tok->strength);
753 tok = tok->next;
754 } while(tok != NULL);
755 fprintf(stderr, "\n");
756
757 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
758
759 do {
760 fprintf(stderr,"%i", tok->toInsert);
761 tok = tok->next;
762 } while(tok != NULL);
763 #endif
764
765 tok = lh->first;
766 uint32_t fStrength = UCOL_IDENTICAL;
767 uint32_t initStrength = UCOL_IDENTICAL;
768
769
/* start from the base CE split into left-aligned primary / secondary / tertiary words */
770 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
771 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
772 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
773
/* forward pass: generate weights token by token */
774 while (tok != NULL && U_SUCCESS(*status)) {
775 fStrength = tok->strength;
776 if(fStrength < initStrength) {
777 initStrength = fStrength;
/* no gap recorded at this strength: fall back to the nearest stronger one that has a position */
778 if(lh->pos[fStrength] == -1) {
779 while(lh->pos[fStrength] == -1 && fStrength > 0) {
780 fStrength--;
781 }
782 if(lh->pos[fStrength] == -1) {
783 *status = U_INTERNAL_PROGRAM_ERROR;
784 return;
785 }
786 }
787 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
788 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
789 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
790 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
791 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
792 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
793 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
794 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
795 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
796 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
797 } else { /* primaries */
798 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
799 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
800 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
801 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
802 }
803 } else {
/* not stronger than before: take the next weight at this strength, restart the weaker levels */
804 if(tok->strength == UCOL_TERTIARY) {
805 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
806 } else if(tok->strength == UCOL_SECONDARY) {
807 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
808 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
809 } else if(tok->strength == UCOL_PRIMARY) {
810 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
811 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
812 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
813 }
814 }
815 ucol_doCE(src, CEparts, tok, status);
816 tok = tok->next;
817 }
818 }
819
/*
 * Turns every token of the list lh into a UCAElements record and adds it to
 * the temporary table t through uprv_uca_addAnElement().
 *
 * For a token with an expansion (tok->expansion: length in the top 8 bits,
 * offset into src->source in the low 24 bits) the expansion CEs are collected
 * first: the longest prefix of the remaining expansion that is found in
 * src->tailored (and is not a reset token) contributes its CEs; when no
 * prefix is tailored, the CEs of the single next code unit are taken from the
 * UCA instead.
 *
 * The element then receives the token's own CEs followed by the expansion
 * CEs, the prefix (if any) and the source code units with the prefix sliced
 * off. If src->UCA is set and any code unit satisfies UCOL_ISJAMO,
 * t->image->jamoSpecial is set to TRUE.
 *
 * The walk stops as soon as *status indicates a failure.
 */
820 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
821 UCAElements el;
822 UColToken *tok = lh->first;
823 UColToken *expt = NULL;
824 uint32_t i = 0, j = 0;
825
826 while(tok != NULL && U_SUCCESS(*status)) {
827 /* first, check if there are any expansions */
828 /* if there are expansions, we need to do a little bit more processing */
829 /* since parts of expansion can be tailored, while others are not */
830 if(tok->expansion != 0) {
831 uint32_t len = tok->expansion >> 24;
832 uint32_t currentSequenceLen = len;
833 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
834 //uint32_t exp = currentSequenceLen | expOffset;
/* exp is a lookup key for src->tailored; its source field is rewritten before every lookup */
835 UColToken exp;
836 exp.source = currentSequenceLen | expOffset;
837 exp.rulesToParse = src->source;
838
839 while(len > 0) {
840 currentSequenceLen = len;
/* try the longest remaining subsequence first, then shorter ones */
841 while(currentSequenceLen > 0) {
842 exp.source = (currentSequenceLen << 24) | expOffset;
843 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
844 uint32_t noOfCEsToCopy = expt->noOfCEs;
845 for(j = 0; j<noOfCEsToCopy; j++) {
846 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
847 }
848 tok->noOfExpCEs += noOfCEsToCopy;
849 // Smart people never try to add codepoints and CEs.
850 // For some odd reason, it won't work.
851 expOffset += currentSequenceLen; //noOfCEsToCopy;
852 len -= currentSequenceLen; //noOfCEsToCopy;
853 break;
854 } else {
855 currentSequenceLen--;
856 }
857 }
858 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
859 /* will have to get one from UCA */
860 /* first, get the UChars from the rules */
861 /* then pick CEs out until there is no more and stuff them into expansion */
862 collIterate s;
863 uint32_t order = 0;
864 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
865
866 for(;;) {
867 order = ucol_getNextCE(src->UCA, &s, status);
868 if(order == UCOL_NO_MORE_CES) {
869 break;
870 }
871 tok->expCEs[tok->noOfExpCEs++] = order;
872 }
873 expOffset++;
874 len--;
875 }
876 }
877 } else {
878 tok->noOfExpCEs = 0;
879 }
880
881 /* set the ucaelement with obtained values */
882 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
883 /* copy CEs */
884 for(i = 0; i<tok->noOfCEs; i++) {
885 el.CEs[i] = tok->CEs[i];
886 }
887 for(i = 0; i<tok->noOfExpCEs; i++) {
888 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
889 }
890
891 /* copy UChars */
892 // We kept prefix and source kind of together, as it is a kind of a contraction.
893 // However, now we have to slice the prefix off the main thing -
894 el.prefix = el.prefixChars;
895 el.cPoints = el.uchars;
896 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
897 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
898 // decomposed elements to the unsaf table.
899 el.prefixSize = tok->prefix>>24;
900 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
901
/* the source length includes the prefix, so the prefix length is subtracted and skipped */
902 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
903 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
904 } else {
905 el.prefixSize = 0;
906 *el.prefix = 0;
907
908 el.cSize = (tok->source >> 24);
909 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
910 }
911 if(src->UCA != NULL) {
912 for(i = 0; i<el.cSize; i++) {
913 if(UCOL_ISJAMO(el.cPoints[i])) {
914 t->image->jamoSpecial = TRUE;
915 }
916 }
917 }
918
919 /* and then, add it */
920 #if UCOL_DEBUG==2
921 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
922 #endif
923 uprv_uca_addAnElement(t, &el, status);
924
925 #if UCOL_DEBUG_DUPLICATES
926 if(*status != U_ZERO_ERROR) {
927 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
928 *status = U_ZERO_ERROR;
929 }
930 #endif
931
932 tok = tok->next;
933 }
934 }
935
936 U_CDECL_BEGIN
937 static UBool U_CALLCONV
938 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
939 UErrorCode status = U_ZERO_ERROR;
940 tempUCATable *t = (tempUCATable *)context;
941 if(value == 0) {
942 while(start < limit) {
943 uint32_t CE = utrie_get32(t->mapping, start, NULL);
944 if(CE == UCOL_NOT_FOUND) {
945 UCAElements el;
946 el.isThai = FALSE;
947 el.prefixSize = 0;
948 el.prefixChars[0] = 0;
949 el.prefix = el.prefixChars;
950 el.cPoints = el.uchars;
951
952 el.cSize = 0;
953 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
954
955 el.noOfCEs = 1;
956 el.CEs[0] = 0;
957 uprv_uca_addAnElement(t, &el, &status);
958
959 }
960 start++;
961 }
962 }
963 if(U_FAILURE(status)) {
964 return FALSE;
965 } else {
966 return TRUE;
967 }
968 }
969 U_CDECL_END
970
971 static void
972 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
973 UChar32 start, UChar32 end,
974 UErrorCode *status) {
975 //UChar decomp[256];
976 uint32_t CE = UCOL_NOT_FOUND;
977 UChar32 u = 0;
978 UCAElements el;
979 el.isThai = FALSE;
980 el.prefixSize = 0;
981 el.prefixChars[0] = 0;
982 collIterate colIt;
983
984 if(U_SUCCESS(*status)) {
985 for(u = start; u<=end; u++) {
986 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
987 /* this test is for contractions that are missing the starting element. */
988 || ((isCntTableElement(CE)) &&
989 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
990 ) {
991 el.cSize = 0;
992 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
993 //decomp[0] = (UChar)u;
994 //el.uchars[0] = (UChar)u;
995 el.cPoints = el.uchars;
996 //el.cSize = 1;
997 el.noOfCEs = 0;
998 el.prefix = el.prefixChars;
999 el.prefixSize = 0;
1000 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
1001 // We actually want to check whether this element is a special
1002 // If it is an implicit element (hangul, CJK - we want to copy the
1003 // special, not the resolved CEs) - for hangul, copying resolved
1004 // would just make things the same (there is an expansion and it
1005 // takes approximately the same amount of time to resolve as
1006 // falling back to the UCA).
1007 /*
1008 UTRIE_GET32(src->UCA->mapping, u, CE);
1009 tag = getCETag(CE);
1010 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
1011 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1012 || tag == LEAD_SURROGATE_TAG) {
1013 el.CEs[el.noOfCEs++] = CE;
1014 } else {
1015 */
1016 // It turns out that it does not make sense to keep implicits
1017 // unresolved. The cost of resolving them is big enough so that
1018 // it doesn't make any difference whether we have to go to the UCA
1019 // or not.
1020 {
1021 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1022 while(CE != UCOL_NO_MORE_CES) {
1023 CE = ucol_getNextCE(src->UCA, &colIt, status);
1024 if(CE != UCOL_NO_MORE_CES) {
1025 el.CEs[el.noOfCEs++] = CE;
1026 }
1027 }
1028 }
1029 uprv_uca_addAnElement(t, &el, status);
1030 }
1031 }
1032 }
1033 }
1034
1035 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1036 uint32_t i = 0;
1037 if(U_FAILURE(*status)) {
1038 return NULL;
1039 }
1040 /*
1041 2. Eliminate the negative lists by doing the following for each non-null negative list:
1042 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1043 create new ListHeader X
1044 o reverse the list, add to the end of X's positive list. Reset the strength of the
1045 first item you add, based on the stronger strength levels of the two lists.
1046 */
1047 /*
1048 3. For each ListHeader with a non-null positive list:
1049 */
1050 /*
1051 o Find all character strings with CEs between the baseCE and the
1052 next/previous CE, at the strength of the first token. Add these to the
1053 tailoring.
1054 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1055 tailoring has & x < z...
1056 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1057 */
1058 /* It is possible that this part should be done even while constructing list */
1059 /* The problem is that it is unknown what is going to be the strongest weight */
1060 /* So we might as well do it here */
1061
1062 /*
1063 o Allocate CEs for each token in the list, based on the total number N of the
1064 largest level difference, and the gap G between baseCE and nextCE at that
1065 level. The relation * between the last item and nextCE is the same as the
1066 strongest strength.
1067 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1068 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1069 Then fit b and c into the secondary gap between a and d, then fit q
1070 into the tertiary gap between b and c.
1071
1072 o Example: baseCE << b <<< q << c * nextCE(X,2)
1073 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1074 Then fit q into the tertiary gap between b and c.
1075 o When incrementing primary values, we will not cross high byte
1076 boundaries except where there is only a single-byte primary. That is to
1077 ensure that the script reordering will continue to work.
1078 */
1079 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1080 /* test for NULL */
1081 if (image == NULL) {
1082 *status = U_MEMORY_ALLOCATION_ERROR;
1083 return NULL;
1084 }
1085 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1086
1087 for(i = 0; i<src->resultLen; i++) {
1088 /* now we need to generate the CEs */
1089 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1090 /* According to strength */
1091 if(U_SUCCESS(*status)) {
1092 if(src->lh[i].first) { // if there are any elements
1093 // due to the way parser works, subsequent tailorings
1094 // may remove all the elements from a sequence, therefore
1095 // leaving an empty tailoring sequence.
1096 ucol_initBuffers(src, &src->lh[i], status);
1097 }
1098 }
1099 if(U_FAILURE(*status)) {
1100 return NULL;
1101 }
1102
1103 }
1104
1105 if(src->varTop != NULL) { /* stuff the variable top value */
1106 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1107 /* remove it from the list */
1108 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1109 src->varTop->listHeader->first = src->varTop->next;
1110 }
1111 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1112 src->varTop->listHeader->last = src->varTop->previous;
1113 }
1114 if(src->varTop->next != NULL) {
1115 src->varTop->next->previous = src->varTop->previous;
1116 }
1117 if(src->varTop->previous != NULL) {
1118 src->varTop->previous->next = src->varTop->next;
1119 }
1120 }
1121
1122
1123 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
1124
1125
1126 /* After this, we have assigned CE values to all regular CEs */
1127 /* now we will go through list once more and resolve expansions, */
1128 /* make UCAElements structs and add them to table */
1129 for(i = 0; i<src->resultLen; i++) {
1130 /* now we need to generate the CEs */
1131 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1132 /* According to strength */
1133 if(U_SUCCESS(*status)) {
1134 ucol_createElements(src, t, &src->lh[i], status);
1135 }
1136 }
1137
1138 UCAElements el;
1139 el.isThai = FALSE;
1140 el.prefixSize = 0;
1141 el.prefixChars[0] = 0;
1142
1143 /* add latin-1 stuff */
1144 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1145
1146 /* add stuff for copying */
1147 if(src->copySet != NULL) {
1148 int32_t i = 0;
1149 UnicodeSet *set = (UnicodeSet *)src->copySet;
1150 for(i = 0; i < set->getRangeCount(); i++) {
1151 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1152 }
1153 }
1154
1155 if(U_SUCCESS(*status)) {
1156 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1157
1158 uint32_t tailoredCE = UCOL_NOT_FOUND;
1159 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1160 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1161 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1162 while(*conts != 0) {
1163 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1164 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1165 if(tailoredCE != UCOL_NOT_FOUND) {
1166 UBool needToAdd = TRUE;
1167 if(isCntTableElement(tailoredCE)) {
1168 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1169 needToAdd = FALSE;
1170 }
1171 }
1172 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1173 needToAdd = FALSE;
1174 }
1175
1176 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1177 el.prefix = el.prefixChars;
1178 el.prefixSize = 0;
1179 el.cPoints = el.uchars;
1180 el.noOfCEs = 0;
1181 el.uchars[0] = *conts;
1182 el.uchars[1] = *(conts+1);
1183 if(*(conts+2)!=0) {
1184 el.uchars[2] = *(conts+2);
1185 el.cSize = 3;
1186 } else {
1187 el.cSize = 2;
1188 }
1189 ucol_setText(ucaEl, el.uchars, el.cSize, status);
1190 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
1191 el.noOfCEs++;
1192 }
1193 uprv_uca_addAnElement(t, &el, status);
1194 }
1195
1196 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1197 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1198 }
1199 conts+=3;
1200 }
1201 ucol_closeElements(ucaEl);
1202 }
1203
1204 // Add completely ignorable elements
1205 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1206
1207
1208 // canonical closure
1209 uprv_uca_canonicalClosure(t, status);
1210
1211
1212 /* still need to produce compatibility closure */
1213
1214 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1215
1216 uprv_uca_closeTempTable(t);
1217 uprv_free(image);
1218
1219 return myData;
1220 }
1221
1222 U_CDECL_BEGIN
1223 static UBool U_CALLCONV
1224 ucol_bld_cleanup(void)
1225 {
1226 udata_close(invUCA_DATA_MEM);
1227 invUCA_DATA_MEM = NULL;
1228 _staticInvUCA = NULL;
1229 return TRUE;
1230 }
1231 U_CDECL_END
1232
1233 U_CAPI const InverseUCATableHeader * U_EXPORT2
1234 ucol_initInverseUCA(UErrorCode *status)
1235 {
1236 if(U_FAILURE(*status)) return NULL;
1237
1238 umtx_lock(NULL);
1239 UBool f = (_staticInvUCA == NULL);
1240 umtx_unlock(NULL);
1241
1242 if(f) {
1243 InverseUCATableHeader *newInvUCA = NULL;
1244 UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1245
1246 if(U_FAILURE(*status)) {
1247 if (result) {
1248 udata_close(result);
1249 }
1250 // This is not needed, as we are talking about
1251 // memory we got from UData
1252 //uprv_free(newInvUCA);
1253 }
1254
1255 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1256 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1257 UCollator *UCA = ucol_initUCA(status);
1258 // UCA versions of UCA and inverse UCA should match
1259 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1260 *status = U_INVALID_FORMAT_ERROR;
1261 udata_close(result);
1262 return NULL;
1263 }
1264
1265 umtx_lock(NULL);
1266 if(_staticInvUCA == NULL) {
1267 _staticInvUCA = newInvUCA;
1268 invUCA_DATA_MEM = result;
1269 result = NULL;
1270 newInvUCA = NULL;
1271 }
1272 umtx_unlock(NULL);
1273
1274 if(newInvUCA != NULL) {
1275 udata_close(result);
1276 // This is not needed, as we are talking about
1277 // memory we got from UData
1278 //uprv_free(newInvUCA);
1279 }
1280 else {
1281 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
1282 }
1283 }
1284 }
1285 return _staticInvUCA;
1286 }
1287
1288 #endif /* #if !UCONFIG_NO_COLLATION */