]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/ucol_bld.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / i18n / ucol_bld.cpp
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
374ca955 4* Copyright (C) 2001-2004, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucol_bld.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created 02/22/2001
14* created by: Vladimir Weinstein
15*
16* This module builds a collator based on the rule set.
17*
18*/
19
20#include "unicode/utypes.h"
21
22#if !UCONFIG_NO_COLLATION
23
24#include "unicode/ucoleitr.h"
25#include "unicode/uchar.h"
26#include "ucol_bld.h"
27#include "ucln_in.h"
28#include "umutex.h"
29#include "unicode/uniset.h"
30
374ca955 31static const InverseUCATableHeader* _staticInvUCA = NULL;
b75a7d8f
A
32static UDataMemory* invUCA_DATA_MEM = NULL;
33
34U_CDECL_BEGIN
35static UBool U_CALLCONV
36isAcceptableInvUCA(void * /*context*/,
37 const char * /*type*/, const char * /*name*/,
38 const UDataInfo *pInfo){
39 /* context, type & name are intentionally not used */
40 if( pInfo->size>=20 &&
41 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
42 pInfo->charsetFamily==U_CHARSET_FAMILY &&
374ca955
A
43 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */
44 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
45 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
46 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
47 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
48 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
49 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
50 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
51 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
b75a7d8f
A
52 ) {
53 UVersionInfo UCDVersion;
54 u_getUnicodeVersion(UCDVersion);
55 if(pInfo->dataVersion[0]==UCDVersion[0] &&
56 pInfo->dataVersion[1]==UCDVersion[1]) {
57 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
58 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
59 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
60 return TRUE;
61 } else {
62 return FALSE;
63 }
64 } else {
65 return FALSE;
66 }
67}
68U_CDECL_END
69
70static
374ca955
A
71int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
72 uint32_t bottom = 0, top = src->invUCA->tableSize;
b75a7d8f
A
73 uint32_t i = 0;
74 uint32_t first = 0, second = 0;
374ca955 75 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
76
77 while(bottom < top-1) {
78 i = (top+bottom)/2;
79 first = *(CETable+3*i);
80 second = *(CETable+3*i+1);
81 if(first > CE) {
82 top = i;
83 } else if(first < CE) {
84 bottom = i;
85 } else {
86 if(second > SecondCE) {
87 top = i;
88 } else if(second < SecondCE) {
89 bottom = i;
90 } else {
91 break;
92 }
93 }
94 }
95
96 /* weiv: */
97 /* in searching for elements, I have removed the failure */
98 /* The reason for this is that the builder does not rely */
99 /* on search mechanism telling it that it didn't find an */
100 /* element. However, indirect positioning relies on being */
101 /* able to find the elements around any CE, even if it is */
102 /* not defined in the UCA. */
103 return i;
104/*
105 if((first == CE && second == SecondCE)) {
106 return i;
107 } else {
108 return -1;
109 }
110*/
111}
112
113static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
114 0xFFFF0000,
115 0xFFFFFF00,
116 0xFFFFFFFF
117};
118
374ca955
A
119U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
120 uint32_t CE, uint32_t contCE,
b75a7d8f
A
121 uint32_t *nextCE, uint32_t *nextContCE,
122 uint32_t strength) {
374ca955 123 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
124 int32_t iCE;
125
374ca955 126 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f
A
127
128 if(iCE<0) {
129 *nextCE = UCOL_NOT_FOUND;
130 return -1;
131 }
132
133 CE &= strengthMask[strength];
134 contCE &= strengthMask[strength];
135
136 *nextCE = CE;
137 *nextContCE = contCE;
138
139 while((*nextCE & strengthMask[strength]) == CE
140 && (*nextContCE & strengthMask[strength]) == contCE) {
141 *nextCE = (*(CETable+3*(++iCE)));
142 *nextContCE = (*(CETable+3*(iCE)+1));
143 }
144
145 return iCE;
146}
147
374ca955
A
148U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
149 uint32_t CE, uint32_t contCE,
b75a7d8f
A
150 uint32_t *prevCE, uint32_t *prevContCE,
151 uint32_t strength) {
374ca955 152 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
153 int32_t iCE;
154
374ca955 155 iCE = ucol_inv_findCE(src, CE, contCE);
b75a7d8f
A
156
157 if(iCE<0) {
158 *prevCE = UCOL_NOT_FOUND;
159 return -1;
160 }
161
162 CE &= strengthMask[strength];
163 contCE &= strengthMask[strength];
164
165 *prevCE = CE;
166 *prevContCE = contCE;
167
168 while((*prevCE & strengthMask[strength]) == CE
169 && (*prevContCE & strengthMask[strength])== contCE
170 && iCE > 0) { /* this condition should prevent falling off the edge of the world */
171 /* here, we end up in a singularity - zero */
172 *prevCE = (*(CETable+3*(--iCE)));
173 *prevContCE = (*(CETable+3*(iCE)+1));
174 }
175
176 return iCE;
177}
178
374ca955
A
179U_CAPI uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
180 uint32_t prevCE, uint32_t prevContCE) {
181 uint32_t strength = UCOL_TERTIARY;
182 while(((prevCE & strengthMask[strength]) != (CE & strengthMask[strength])
183 || (prevContCE & strengthMask[strength]) != (contCE & strengthMask[strength]))
184 && strength) {
185 strength--;
186 }
187 return strength;
188
189}
190
191
b75a7d8f 192static
374ca955 193inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
b75a7d8f
A
194
195 uint32_t CE = lh->baseCE;
196 uint32_t SecondCE = lh->baseContCE;
197
374ca955 198 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
199 uint32_t previousCE, previousContCE;
200 int32_t iCE;
201
374ca955 202 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f
A
203
204 if(iCE<0) {
205 return -1;
206 }
207
208 CE &= strengthMask[strength];
209 SecondCE &= strengthMask[strength];
210
211 previousCE = CE;
212 previousContCE = SecondCE;
213
214 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) {
215 previousCE = (*(CETable+3*(--iCE)));
216 previousContCE = (*(CETable+3*(iCE)+1));
217 }
218 lh->previousCE = previousCE;
219 lh->previousContCE = previousContCE;
220
221 return iCE;
222}
223
224static
374ca955 225inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
b75a7d8f
A
226 uint32_t CE = lh->baseCE;
227 uint32_t SecondCE = lh->baseContCE;
228
374ca955 229 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
230 uint32_t nextCE, nextContCE;
231 int32_t iCE;
232
374ca955 233 iCE = ucol_inv_findCE(src, CE, SecondCE);
b75a7d8f
A
234
235 if(iCE<0) {
236 return -1;
237 }
238
239 CE &= strengthMask[strength];
240 SecondCE &= strengthMask[strength];
241
242 nextCE = CE;
243 nextContCE = SecondCE;
244
245 while((nextCE & strengthMask[strength]) == CE
246 && (nextContCE & strengthMask[strength]) == SecondCE) {
247 nextCE = (*(CETable+3*(++iCE)));
248 nextContCE = (*(CETable+3*(iCE)+1));
249 }
250
251 lh->nextCE = nextCE;
252 lh->nextContCE = nextContCE;
253
254 return iCE;
255}
256
/*
 * ucol_inv_getGapPositions: fills in the gap boundaries of a token list header.
 *
 * First clears lh->gapsLo, lh->gapsHi, lh->numStr, lh->fStrToken and
 * lh->lStrToken and sets every lh->pos[] entry to -1. Then one of three
 * cases applies:
 *  - the top byte of lh->baseCE lies within [UCA_PRIMARY_IMPLICIT_MIN,
 *    UCA_PRIMARY_IMPLICIT_MAX] (read from the UCA constants): the low
 *    boundary is built from baseCE/baseContCE and the high boundary from the
 *    implicit primary obtained by converting to raw, adding 1 and converting
 *    back;
 *  - lh->indirect is TRUE and lh->nextCE is non-zero: the low boundary comes
 *    from baseCE/baseContCE, the high boundary from nextCE/nextContCE;
 *  - otherwise: for each token strength encountered while walking the list,
 *    ucol_inv_getNext() supplies a position in the inverse table; the high
 *    boundary is read from that table entry and the low boundary from
 *    baseCE/baseContCE.
 * In the first two cases only lh->pos[0] is set (to 0).
 *
 * src    - parser state; supplies invUCA (inverse table) and UCA (constants).
 * lh     - list header to update; lh->first is dereferenced unconditionally.
 * status - set to U_INTERNAL_PROGRAM_ERROR when ucol_inv_getNext() returns a
 *          negative value.
 */
257U_CFUNC void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
258 /* reset all the gaps */
259 int32_t i = 0;
374ca955 260 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
b75a7d8f
A
261 uint32_t st = 0;
262 uint32_t t1, t2;
263 int32_t pos;
264
265 UColToken *tok = lh->first;
266 uint32_t tokStrength = tok->strength;
267
268 for(i = 0; i<3; i++) {
269 lh->gapsHi[3*i] = 0;
270 lh->gapsHi[3*i+1] = 0;
271 lh->gapsHi[3*i+2] = 0;
272 lh->gapsLo[3*i] = 0;
273 lh->gapsLo[3*i+1] = 0;
274 lh->gapsLo[3*i+2] = 0;
275 lh->numStr[i] = 0;
276 lh->fStrToken[i] = NULL;
277 lh->lStrToken[i] = NULL;
278 lh->pos[i] = -1;
279 }
280
281 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
282
374ca955 283 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
b75a7d8f
A
284 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
285 lh->pos[0] = 0;
286 t1 = lh->baseCE;
287 t2 = lh->baseContCE;
/* low boundary: primary, secondary and tertiary weights of base CE + continuation, left-justified */
288 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
289 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
290 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
374ca955
A
/* high boundary: the implicit primary that follows the base one (raw value + 1) */
291 uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
292 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
293
294 t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
295 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
296
b75a7d8f
A
297 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
298 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
299 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
300 } else if(lh->indirect == TRUE && lh->nextCE != 0) {
301 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
302 lh->pos[0] = 0;
303 t1 = lh->baseCE;
304 t2 = lh->baseContCE;
305 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
306 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
307 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
308 t1 = lh->nextCE;
309 t2 = lh->nextContCE;
310 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
311 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
312 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
313 } else {
/* walk the token list; each pass handles the run of tokens starting at tok with strength tokStrength */
314 for(;;) {
315 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
374ca955 316 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
b75a7d8f
A
317 lh->fStrToken[tokStrength] = tok;
318 } else { /* The CE must be implicit, since it's not in the table */
319 /* Error */
320 *status = U_INTERNAL_PROGRAM_ERROR;
321 }
322 }
323
/* skip forward over tokens whose strength value is >= tokStrength, remembering the last one seen */
324 while(tok != NULL && tok->strength >= tokStrength) {
325 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
326 lh->lStrToken[tokStrength] = tok;
327 }
328 tok = tok->next;
329 }
330 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
331 /* check if previous interval is the same and merge the intervals if it is so */
332 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
333 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
334 lh->fStrToken[tokStrength+1] = NULL;
335 lh->lStrToken[tokStrength+1] = NULL;
336 lh->pos[tokStrength+1] = -1;
337 }
338 }
339 if(tok != NULL) {
340 tokStrength = tok->strength;
341 } else {
342 break;
343 }
344 }
/* for every strength that got a position, read the high boundary from the inverse table
   and take the low boundary from the base CE */
345 for(st = 0; st < 3; st++) {
346 if((pos = lh->pos[st]) >= 0) {
347 t1 = *(CETable+3*(pos));
348 t2 = *(CETable+3*(pos)+1);
349 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
350 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
351 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
352 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
374ca955
A
353 //pos--;
354 //t1 = *(CETable+3*(pos));
355 //t2 = *(CETable+3*(pos)+1);
356 t1 = lh->baseCE;
357 t2 = lh->baseContCE;
b75a7d8f
A
358 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
359 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
360 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
361 }
362 }
363 }
364}
365
366
/*
 * ucol_countBytes(value, noOfBytes)
 *
 * Stores in noOfBytes how many of the masks 0xFFFFFFFF, 0x00FFFFFF,
 * 0x0000FFFF and 0x000000FF select a non-zero part of the 32-bit value.
 * For a left-justified weight this is the number of bytes up to and
 * including the last non-zero byte (0 for a zero value).
 *
 * Written as do { } while(0) so that the macro behaves as a single statement
 * (safe in un-braced if/else). The scratch variable has a prefixed name so
 * that it cannot capture an argument named `mask`.
 * Note: `value` is evaluated several times; pass an expression without side effects.
 */
#define ucol_countBytes(value, noOfBytes) \
do { \
    uint32_t ucol_countBytes_mask_ = 0xFFFFFFFF; \
    (noOfBytes) = 0; \
    while(ucol_countBytes_mask_ != 0) { \
        if(((value) & ucol_countBytes_mask_) != 0) { \
            (noOfBytes)++; \
        } \
        ucol_countBytes_mask_ >>= 8; \
    } \
} while(0)
378
379U_CFUNC uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
380 if(U_SUCCESS(*status)) {
381 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
382 }
383 return g->current;
384}
385
386U_CFUNC uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
387/* TODO: rename to enum names */
388 uint32_t high, low, count=1;
389 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
390
391 if(strength == UCOL_SECONDARY) {
392 low = UCOL_COMMON_TOP2<<24;
393 high = 0xFFFFFFFF;
394 count = 0xFF - UCOL_COMMON_TOP2;
395 } else {
396 low = UCOL_BYTE_COMMON << 24; //0x05000000;
397 high = 0x40000000;
398 count = 0x40 - UCOL_BYTE_COMMON;
399 }
400
401 if(tok->next != NULL && tok->next->strength == strength) {
402 count = tok->next->toInsert;
403 }
404
405 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
406 g->current = UCOL_BYTE_COMMON<<24;
407
408 if(g->noOfRanges == 0) {
409 *status = U_INTERNAL_PROGRAM_ERROR;
410 }
411 return g->current;
412}
413
414U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
415 uint32_t strength = tok->strength;
416 uint32_t low = lows[fStrength*3+strength];
417 uint32_t high = highs[fStrength*3+strength];
374ca955
A
418 uint32_t maxByte = 0;
419 if(strength == UCOL_TERTIARY) {
420 maxByte = 0x3F;
421 } else if(strength == UCOL_PRIMARY) {
422 maxByte = 0xFE;
423 } else {
424 maxByte = 0xFF;
425 }
b75a7d8f
A
426
427 uint32_t count = tok->toInsert;
428
429 if(low >= high && strength > UCOL_PRIMARY) {
430 int32_t s = strength;
431 for(;;) {
432 s--;
433 if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
434 if(strength == UCOL_SECONDARY) {
435 low = UCOL_COMMON_TOP2<<24;
436 high = 0xFFFFFFFF;
437 } else {
438 //low = 0x02000000; // This needs to be checked - what if low is
439 // not good...
440 high = 0x40000000;
441 }
442 break;
443 }
444 if(s<0) {
445 *status = U_INTERNAL_PROGRAM_ERROR;
446 return 0;
447 }
448 }
449 }
450
451 if(low == 0) {
452 low = 0x01000000;
453 }
454
455 if(strength == UCOL_SECONDARY) { /* similar as simple */
456 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
457 low = UCOL_COMMON_TOP2<<24;
458 }
459 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
460 high = UCOL_COMMON_TOP2<<24;
461 }
374ca955
A
462 if(low < (UCOL_COMMON_BOT2<<24)) {
463 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
464 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
465 //g->current = UCOL_COMMON_BOT2<<24;
b75a7d8f
A
466 return g->current;
467 }
468 }
469
470 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
471 if(g->noOfRanges == 0) {
472 *status = U_INTERNAL_PROGRAM_ERROR;
473 }
474 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
475 return g->current;
476}
477
374ca955
A
478static
479uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
480 uint32_t i = 0;
481 UChar c;
482
483 if(U_FAILURE(*status)) {
484 return 0;
485 }
486
487 if(sourceLen > resLen) {
488 *status = U_MEMORY_ALLOCATION_ERROR;
489 return 0;
490 }
491
492 for(i = 0; i < sourceLen; i++) {
493 c = source[i];
494 if(0x3042 < c && c < 0x30ef) { /* Kana range */
495 switch(c - 0x3000) {
496 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
497 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
498 c++;
499 break;
500 case 0xF5:
501 c = 0x30AB;
502 break;
503 case 0xF6:
504 c = 0x30B1;
505 break;
506 }
507 }
508 resBuf[i] = c;
509 }
510 return sourceLen;
511}
512
513static
514uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
515 uint32_t i = 0;
516 UChar c;
517
518 if(U_FAILURE(*status)) {
519 return 0;
520 }
521
522 if(sourceLen > resLen) {
523 *status = U_MEMORY_ALLOCATION_ERROR;
524 return 0;
525 }
526
527 for(i = 0; i < sourceLen; i++) {
528 c = source[i];
529 if(0x3042 < c && c < 0x30ef) { /* Kana range */
530 switch(c - 0x3000) {
531 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
532 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
533 c--;
534 break;
535 case 0xAB:
536 c = 0x30F5;
537 break;
538 case 0xB1:
539 c = 0x30F6;
540 break;
541 }
542 }
543 resBuf[i] = c;
544 }
545 return sourceLen;
546}
547
548static
549uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
550 uint32_t i = 0;
551 UChar n[128];
552 uint32_t nLen = 0;
553 uint32_t uCount = 0, lCount = 0;
554
555 collIterate s;
556 uint32_t order = 0;
557
558 if(U_FAILURE(*status)) {
559 return UCOL_LOWER_CASE;
560 }
561
562 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
563 if(U_SUCCESS(*status)) {
564 for(i = 0; i < nLen; i++) {
565 uprv_init_collIterate(UCA, &n[i], 1, &s);
566 order = ucol_getNextCE(UCA, &s, status);
567 if(isContinuation(order)) {
568 *status = U_INTERNAL_PROGRAM_ERROR;
569 return UCOL_LOWER_CASE;
570 }
571 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
572 uCount++;
573 } else {
574 if(u_islower(n[i])) {
575 lCount++;
576 } else {
577 UChar sk[1], lk[1];
578 u_toSmallKana(&n[i], 1, sk, 1, status);
579 u_toLargeKana(&n[i], 1, lk, 1, status);
580 if(sk[0] == n[i] && lk[0] != n[i]) {
581 lCount++;
582 }
583 }
584 }
585 }
586 }
587
588 if(uCount != 0 && lCount != 0) {
589 return UCOL_MIXED_CASE;
590 } else if(uCount != 0) {
591 return UCOL_UPPER_CASE;
592 } else {
593 return UCOL_LOWER_CASE;
594 }
595}
596
597
598U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
b75a7d8f
A
599 /* this one makes the table and stuff */
600 uint32_t noOfBytes[3];
601 uint32_t i;
602
603 for(i = 0; i<3; i++) {
604 ucol_countBytes(CEparts[i], noOfBytes[i]);
605 }
606
607 /* Here we have to pack CEs from parts */
608
609 uint32_t CEi = 0;
610 uint32_t value = 0;
611
612 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
613 if(CEi > 0) {
614 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
615 } else {
616 value = 0;
617 }
618
619 if(2*CEi<noOfBytes[0]) {
620 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
621 }
622 if(CEi<noOfBytes[1]) {
623 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
624 }
625 if(CEi<noOfBytes[2]) {
626 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
627 }
628 tok->CEs[CEi] = value;
629 CEi++;
630 }
631 if(CEi == 0) { /* totally ignorable */
632 tok->noOfCEs = 1;
633 tok->CEs[0] = 0;
634 } else { /* there is at least something */
635 tok->noOfCEs = CEi;
636 }
637
374ca955
A
638
639 // we want to set case bits here and now, not later.
640 // Case bits handling
641 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
642 int32_t cSize = (tok->source & 0xFF000000) >> 24;
643 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
644
645 if(cSize > 1) {
646 // Do it manually
647 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
648 } else {
649 // Copy it from the UCA
650 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
651 tok->CEs[0] |= (caseCE & 0xC0);
652 }
653
b75a7d8f
A
654#if UCOL_DEBUG==2
655 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
656 for(i = 0; i<tok->noOfCEs; i++) {
657 fprintf(stderr, "%08X ", tok->CEs[i]);
658 }
659 fprintf(stderr, "\n");
660#endif
661}
662
/*
 * ucol_initBuffers: assigns collation elements to every token of list `lh`.
 *
 * Pass 1 walks backwards from lh->last along tok->previous, maintaining a
 * per-strength counter array t[] and storing the current counter into each
 * token's toInsert field.
 * Then ucol_inv_getGapPositions() computes the gap boundaries for the list.
 * Pass 2 walks forward from lh->first while *status is a success code:
 *  - when a token's strength value is lower than any seen so far in this
 *    pass, weight generators are (re)created from the gap boundaries with
 *    ucol_getCEGenerator() / ucol_getSimpleCEGenerator();
 *  - otherwise the next weight is taken from the generator of the token's
 *    strength and the generators of the weaker levels are restarted.
 * Each token is then packed by ucol_doCE().
 *
 * src    - parser state, passed through to the helpers.
 * lh     - list header; lh->last is dereferenced unconditionally.
 * status - set to U_INTERNAL_PROGRAM_ERROR (and the function returns) if
 *          lh->pos[] is -1 at the token's strength and at every lower index.
 */
663U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
664 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
665 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
666
667 UColToken *tok = lh->last;
668 uint32_t t[UCOL_STRENGTH_LIMIT];
669
670 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
671
672 tok->toInsert = 1;
673 t[tok->strength] = 1;
674
/* pass 1: walk from the last token to the first, updating the per-strength counters */
675 while(tok->previous != NULL) {
676 if(tok->previous->strength < tok->strength) { /* going up */
677 t[tok->strength] = 0;
678 t[tok->previous->strength]++;
679 } else if(tok->previous->strength > tok->strength) { /* going down */
680 t[tok->previous->strength] = 1;
681 } else {
682 t[tok->strength]++;
683 }
684 tok=tok->previous;
685 tok->toInsert = t[tok->strength];
686 }
687
688 tok->toInsert = t[tok->strength];
689 ucol_inv_getGapPositions(src, lh, status);
690
691#if UCOL_DEBUG
692 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
693 int32_t j = 2;
694 for(j = 2; j >= 0; j--) {
695 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
696 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
697 }
698 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
699
700 do {
701 fprintf(stderr,"%i", tok->strength);
702 tok = tok->next;
703 } while(tok != NULL);
704 fprintf(stderr, "\n");
705
706 tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
707
708 do {
709 fprintf(stderr,"%i", tok->toInsert);
710 tok = tok->next;
711 } while(tok != NULL);
712#endif
713
/* pass 2: walk forward from the first token, generating weights */
714 tok = lh->first;
715 uint32_t fStrength = UCOL_IDENTICAL;
716 uint32_t initStrength = UCOL_IDENTICAL;
717
718
/* start from the weights of the base CE and its continuation, left-justified per level */
719 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
720 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
721 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
722
723 while (tok != NULL && U_SUCCESS(*status)) {
724 fStrength = tok->strength;
725 if(fStrength < initStrength) {
726 initStrength = fStrength;
/* no gap recorded for this strength: fall back to the nearest lower index that has one */
727 if(lh->pos[fStrength] == -1) {
728 while(lh->pos[fStrength] == -1 && fStrength > 0) {
729 fStrength--;
730 }
731 if(lh->pos[fStrength] == -1) {
732 *status = U_INTERNAL_PROGRAM_ERROR;
733 return;
734 }
735 }
736 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
737 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
738 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
739 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
740 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
741 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
742 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
743 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
744 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
745 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
746 } else { /* primaries */
747 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
748 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
749 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
750 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
751 }
752 } else {
/* continue with the existing generator of the token's strength; restart the weaker levels */
753 if(tok->strength == UCOL_TERTIARY) {
754 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
755 } else if(tok->strength == UCOL_SECONDARY) {
756 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
757 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
758 } else if(tok->strength == UCOL_PRIMARY) {
759 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
760 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
761 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
762 }
763 }
374ca955 764 ucol_doCE(src, CEparts, tok, status);
b75a7d8f
A
765 tok = tok->next;
766 }
767}
768
b75a7d8f
A
/*
 * ucol_createElements: converts every token of list `lh` into a UCAElements
 * record and adds it to the temporary table `t` with uprv_uca_addAnElement().
 *
 * For a token with an expansion (tok->expansion != 0; length in the top byte,
 * offset into src->source in the low 24 bits) the expansion CEs are collected
 * piecewise: at each offset the longest subsequence found in the
 * src->tailored hash (and not a reset token) contributes its CEs; if none is
 * found, the CEs of the single code unit at that offset are taken from the
 * UCA collator. Tokens without an expansion get noOfExpCEs = 0.
 *
 * The element's CEs are the token's own CEs followed by the expansion CEs.
 * If the token has a prefix, the prefix code units are copied to el.prefix
 * and the remaining source code units to el.uchars; otherwise the whole
 * source is copied to el.uchars. el.isThai reflects UCOL_ISTHAIPREVOWEL on
 * the first code unit; t->image->jamoSpecial is set when any code unit
 * satisfies UCOL_ISJAMO (only checked when src->UCA is non-NULL).
 *
 * The loop stops at the end of the list or as soon as *status is a failure.
 * NOTE(review): tok->noOfExpCEs is accumulated in the expansion branch without
 * being reset here - presumably it is zero on entry; confirm against the parser.
 */
769U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
770 UCAElements el;
771 UColToken *tok = lh->first;
772 UColToken *expt = NULL;
773 uint32_t i = 0, j = 0;
774
775 while(tok != NULL && U_SUCCESS(*status)) {
776 /* first, check if there are any expansions */
777 /* if there are expansions, we need to do a little bit more processing */
778 /* since parts of expansion can be tailored, while others are not */
779 if(tok->expansion != 0) {
780 uint32_t len = tok->expansion >> 24;
781 uint32_t currentSequenceLen = len;
782 uint32_t expOffset = tok->expansion & 0x00FFFFFF;
783 //uint32_t exp = currentSequenceLen | expOffset;
/* exp is a scratch token used only as a lookup key for the src->tailored hash */
784 UColToken exp;
785 exp.source = currentSequenceLen | expOffset;
786 exp.rulesToParse = src->source;
787
788 while(len > 0) {
789 currentSequenceLen = len;
/* try the longest remaining subsequence first, then shorter ones */
790 while(currentSequenceLen > 0) {
791 exp.source = (currentSequenceLen << 24) | expOffset;
792 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
793 uint32_t noOfCEsToCopy = expt->noOfCEs;
794 for(j = 0; j<noOfCEsToCopy; j++) {
795 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
796 }
797 tok->noOfExpCEs += noOfCEsToCopy;
798 // Smart people never try to add codepoints and CEs.
799 // For some odd reason, it won't work.
800 expOffset += currentSequenceLen; //noOfCEsToCopy;
801 len -= currentSequenceLen; //noOfCEsToCopy;
802 break;
803 } else {
804 currentSequenceLen--;
805 }
806 }
807 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
808 /* will have to get one from UCA */
809 /* first, get the UChars from the rules */
810 /* then pick CEs out until there is no more and stuff them into expansion */
811 collIterate s;
812 uint32_t order = 0;
813 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
814
815 for(;;) {
816 order = ucol_getNextCE(src->UCA, &s, status);
817 if(order == UCOL_NO_MORE_CES) {
818 break;
819 }
820 tok->expCEs[tok->noOfExpCEs++] = order;
821 }
822 expOffset++;
823 len--;
824 }
825 }
826 } else {
827 tok->noOfExpCEs = 0;
828 }
829
830 /* set the ucaelement with obtained values */
831 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
832 /* copy CEs */
833 for(i = 0; i<tok->noOfCEs; i++) {
834 el.CEs[i] = tok->CEs[i];
835 }
836 for(i = 0; i<tok->noOfExpCEs; i++) {
837 el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
838 }
839
840 /* copy UChars */
841 // We kept prefix and source kind of together, as it is a kind of a contraction.
842 // However, now we have to slice the prefix off the main thing -
843 el.prefix = el.prefixChars;
844 el.cPoints = el.uchars;
845 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
846 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
847 // decomposed elements to the unsaf table.
848 el.prefixSize = tok->prefix>>24;
849 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
850
/* the source span includes the prefix: skip the prefix length to reach the element's own code units */
851 el.cSize = (tok->source >> 24)-(tok->prefix>>24);
852 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
853 } else {
854 el.prefixSize = 0;
855 *el.prefix = 0;
856
857 el.cSize = (tok->source >> 24);
858 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
859 }
860
861 if(UCOL_ISTHAIPREVOWEL(el.cPoints[0])) {
862 el.isThai = TRUE;
863 } else {
864 el.isThai = FALSE;
865 }
866
867 if(src->UCA != NULL) {
868 for(i = 0; i<el.cSize; i++) {
869 if(UCOL_ISJAMO(el.cPoints[i])) {
870 t->image->jamoSpecial = TRUE;
871 }
872 }
873 }
874
374ca955
A
875#if 0
876 // we do case bits in doCE now, since we will mess up expansions otherwise.
b75a7d8f
A
877 // Case bits handling
878 el.CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
879 if(el.cSize > 1) {
880 // Do it manually
881 el.CEs[0] |= ucol_uprv_getCaseBits(src->UCA, el.cPoints, el.cSize, status);
882 } else {
883 // Copy it from the UCA
884 uint32_t caseCE = ucol_getFirstCE(src->UCA, el.cPoints[0], status);
885 el.CEs[0] |= (caseCE & 0xC0);
886 }
374ca955 887#endif
b75a7d8f
A
888
889 /* and then, add it */
890#if UCOL_DEBUG==2
891 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
892#endif
893 uprv_uca_addAnElement(t, &el, status);
894
895#if 0
896 if(el.cSize > 1) { // this is a contraction, we should check whether a composed form should also be included
897 UChar composed[256];
898 uint32_t compLen = unorm_normalize(el.cPoints, el.cSize, UNORM_NFC, 0, composed, 256, status);;
899
900 if(compLen != el.cSize || uprv_memcmp(composed, el.cPoints, el.cSize*sizeof(UChar))) {
901 // composed form of a contraction is different than the decomposed form!
902 // do it!
903#ifdef UCOL_DEBUG
904 fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed);
905#endif
906 el.cSize = compLen;
907 uprv_memcpy(el.cPoints, composed, el.cSize*sizeof(UChar));
908 uprv_uca_addAnElement(t, &el, status);
909 }
910 }
911#endif
912
913#if UCOL_DEBUG_DUPLICATES
914 if(*status != U_ZERO_ERROR) {
915 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
916 *status = U_ZERO_ERROR;
917 }
918#endif
919
920 tok = tok->next;
921 }
922}
923
924U_CDECL_BEGIN
925static UBool U_CALLCONV
926_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
927 UErrorCode status = U_ZERO_ERROR;
928 tempUCATable *t = (tempUCATable *)context;
929 if(value == 0) {
930 while(start < limit) {
931 uint32_t CE = utrie_get32(t->mapping, start, NULL);
932 if(CE == UCOL_NOT_FOUND) {
933 UCAElements el;
934 el.isThai = FALSE;
935 el.prefixSize = 0;
936 el.prefixChars[0] = 0;
937 el.prefix = el.prefixChars;
938 el.cPoints = el.uchars;
939
940 el.cSize = 0;
941 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
942
943 el.noOfCEs = 1;
944 el.CEs[0] = 0;
945 uprv_uca_addAnElement(t, &el, &status);
946
947 }
948 start++;
949 }
950 }
951 if(U_FAILURE(status)) {
952 return FALSE;
953 } else {
954 return TRUE;
955 }
956}
957U_CDECL_END
958
959static void
960ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
961 UChar32 start, UChar32 end,
962 UErrorCode *status) {
963 //UChar decomp[256];
964 uint32_t CE = UCOL_NOT_FOUND;
965 UChar32 u = 0;
966 UCAElements el;
967 el.isThai = FALSE;
968 el.prefixSize = 0;
969 el.prefixChars[0] = 0;
970 collIterate colIt;
971
972 if(U_SUCCESS(*status)) {
973 for(u = start; u<=end; u++) {
974 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
975 /* this test is for contractions that are missing the starting element. */
976 || ((isCntTableElement(CE)) &&
977 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
978 ) {
979 el.cSize = 0;
980 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
981 //decomp[0] = (UChar)u;
982 //el.uchars[0] = (UChar)u;
983 el.cPoints = el.uchars;
984 //el.cSize = 1;
985 el.noOfCEs = 0;
986 el.prefix = el.prefixChars;
987 el.prefixSize = 0;
988 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
989 // We actually want to check whether this element is a special
990 // If it is an implicit element (hangul, CJK - we want to copy the
991 // special, not the resolved CEs) - for hangul, copying resolved
992 // would just make things the same (there is an expansion and it
993 // takes approximately the same amount of time to resolve as
994 // falling back to the UCA).
995 /*
996 UTRIE_GET32(src->UCA->mapping, u, CE);
997 tag = getCETag(CE);
998 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
999 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
1000 || tag == LEAD_SURROGATE_TAG) {
1001 el.CEs[el.noOfCEs++] = CE;
1002 } else {
1003 */
1004 // It turns out that it does not make sense to keep implicits
1005 // unresolved. The cost of resolving them is big enough so that
1006 // it doesn't make any difference whether we have to go to the UCA
1007 // or not.
1008 {
1009 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
1010 while(CE != UCOL_NO_MORE_CES) {
1011 CE = ucol_getNextCE(src->UCA, &colIt, status);
1012 if(CE != UCOL_NO_MORE_CES) {
1013 el.CEs[el.noOfCEs++] = CE;
1014 }
1015 }
1016 }
1017 uprv_uca_addAnElement(t, &el, status);
1018 }
1019 }
1020 }
1021}
1022
1023UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
1024 uint32_t i = 0;
1025 if(U_FAILURE(*status)) {
1026 return NULL;
1027 }
1028/*
10292. Eliminate the negative lists by doing the following for each non-null negative list:
1030 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
1031 create new ListHeader X
1032 o reverse the list, add to the end of X's positive list. Reset the strength of the
1033 first item you add, based on the stronger strength levels of the two lists.
1034*/
1035/*
10363. For each ListHeader with a non-null positive list:
1037*/
1038/*
1039 o Find all character strings with CEs between the baseCE and the
1040 next/previous CE, at the strength of the first token. Add these to the
1041 tailoring.
1042 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the
1043 tailoring has & x < z...
1044 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ...
1045*/
1046 /* It is possible that this part should be done even while constructing list */
1047 /* The problem is that it is unknown what is going to be the strongest weight */
1048 /* So we might as well do it here */
1049
1050/*
1051 o Allocate CEs for each token in the list, based on the total number N of the
1052 largest level difference, and the gap G between baseCE and nextCE at that
1053 level. The relation * between the last item and nextCE is the same as the
1054 strongest strength.
1055 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
1056 ? There are 3 primary items: a, d, e. Fit them into the primary gap.
1057 Then fit b and c into the secondary gap between a and d, then fit q
1058 into the tertiary gap between b and c.
1059
1060 o Example: baseCE << b <<< q << c * nextCE(X,2)
1061 ? There are 2 secondary items: b, c. Fit them into the secondary gap.
1062 Then fit q into the tertiary gap between b and c.
1063 o When incrementing primary values, we will not cross high byte
1064 boundaries except where there is only a single-byte primary. That is to
1065 ensure that the script reordering will continue to work.
1066*/
1067 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
1068 /* test for NULL */
1069 if (image == NULL) {
1070 *status = U_MEMORY_ALLOCATION_ERROR;
1071 return NULL;
1072 }
1073 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
1074
1075 for(i = 0; i<src->resultLen; i++) {
1076 /* now we need to generate the CEs */
1077 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1078 /* According to strength */
1079 if(U_SUCCESS(*status)) {
374ca955
A
1080 if(src->lh[i].first) { // if there are any elements
1081 // due to the way parser works, subsequent tailorings
1082 // may remove all the elements from a sequence, therefore
1083 // leaving an empty tailoring sequence.
1084 ucol_initBuffers(src, &src->lh[i], status);
1085 }
b75a7d8f
A
1086 }
1087 if(U_FAILURE(*status)) {
1088 return NULL;
1089 }
1090
1091 }
1092
1093 if(src->varTop != NULL) { /* stuff the variable top value */
1094 src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
1095 /* remove it from the list */
1096 if(src->varTop->listHeader->first == src->varTop) { /* first in list */
1097 src->varTop->listHeader->first = src->varTop->next;
1098 }
1099 if(src->varTop->listHeader->last == src->varTop) { /* first in list */
1100 src->varTop->listHeader->last = src->varTop->previous;
1101 }
1102 if(src->varTop->next != NULL) {
1103 src->varTop->next->previous = src->varTop->previous;
1104 }
1105 if(src->varTop->previous != NULL) {
1106 src->varTop->previous->next = src->varTop->next;
1107 }
1108 }
1109
1110
374ca955 1111 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
b75a7d8f
A
1112
1113
1114 /* After this, we have assigned CE values to all regular CEs */
1115 /* now we will go through list once more and resolve expansions, */
1116 /* make UCAElements structs and add them to table */
1117 for(i = 0; i<src->resultLen; i++) {
1118 /* now we need to generate the CEs */
1119 /* We stuff the initial value in the buffers, and increase the appropriate buffer */
1120 /* According to strength */
1121 if(U_SUCCESS(*status)) {
1122 ucol_createElements(src, t, &src->lh[i], status);
1123 }
1124 }
1125
1126 UCAElements el;
1127 el.isThai = FALSE;
1128 el.prefixSize = 0;
1129 el.prefixChars[0] = 0;
1130
1131 /* add latin-1 stuff */
1132 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
1133
1134 /* add stuff for copying */
1135 if(src->copySet != NULL) {
1136 int32_t i = 0;
1137 UnicodeSet *set = (UnicodeSet *)src->copySet;
1138 for(i = 0; i < set->getRangeCount(); i++) {
1139 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
1140 }
1141 }
1142
1143 if(U_SUCCESS(*status)) {
1144 /* copy contractions from the UCA - this is felt mostly for cyrillic*/
1145
1146 uint32_t tailoredCE = UCOL_NOT_FOUND;
1147 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
1148 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
1149 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
1150 while(*conts != 0) {
1151 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
1152 tailoredCE = utrie_get32(t->mapping, *conts, NULL);
1153 if(tailoredCE != UCOL_NOT_FOUND) {
1154 UBool needToAdd = TRUE;
1155 if(isCntTableElement(tailoredCE)) {
1156 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
1157 needToAdd = FALSE;
1158 }
1159 }
1160 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1161 needToAdd = FALSE;
1162 }
1163
1164 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
1165 el.prefix = el.prefixChars;
1166 el.prefixSize = 0;
1167 el.cPoints = el.uchars;
1168 el.noOfCEs = 0;
1169 el.uchars[0] = *conts;
1170 el.uchars[1] = *(conts+1);
1171 if(*(conts+2)!=0) {
1172 el.uchars[2] = *(conts+2);
1173 el.cSize = 3;
1174 } else {
1175 el.cSize = 2;
1176 }
1177 ucol_setText(ucaEl, el.uchars, el.cSize, status);
374ca955 1178 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
b75a7d8f
A
1179 el.noOfCEs++;
1180 }
1181 uprv_uca_addAnElement(t, &el, status);
1182 }
1183
1184 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
1185 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
1186 }
1187 conts+=3;
1188 }
1189 ucol_closeElements(ucaEl);
1190 }
1191
1192 // Add completely ignorable elements
1193 utrie_enum(t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
1194
1195
1196 // canonical closure
1197 uprv_uca_canonicalClosure(t, status);
1198
1199
1200 /* still need to produce compatibility closure */
1201
1202 UCATableHeader *myData = uprv_uca_assembleTable(t, status);
1203
1204 uprv_uca_closeTempTable(t);
1205 uprv_free(image);
1206
1207 return myData;
1208}
1209
374ca955
A
1210U_CDECL_BEGIN
1211static UBool U_CALLCONV
b75a7d8f
A
1212ucol_bld_cleanup(void)
1213{
1214 udata_close(invUCA_DATA_MEM);
1215 invUCA_DATA_MEM = NULL;
374ca955 1216 _staticInvUCA = NULL;
b75a7d8f
A
1217 return TRUE;
1218}
374ca955 1219U_CDECL_END
b75a7d8f
A
1220
1221U_CAPI const InverseUCATableHeader * U_EXPORT2
1222ucol_initInverseUCA(UErrorCode *status)
1223{
1224 if(U_FAILURE(*status)) return NULL;
1225
1226 umtx_lock(NULL);
374ca955 1227 UBool f = (_staticInvUCA == NULL);
b75a7d8f
A
1228 umtx_unlock(NULL);
1229
1230 if(f) {
1231 InverseUCATableHeader *newInvUCA = NULL;
1232 UDataMemory *result = udata_openChoice(NULL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
1233
1234 if(U_FAILURE(*status)) {
1235 if (result) {
1236 udata_close(result);
1237 }
1238 // This is not needed, as we are talking about
1239 // memory we got from UData
1240 //uprv_free(newInvUCA);
1241 }
1242
1243 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
1244 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
1245 UCollator *UCA = ucol_initUCA(status);
1246 // UCA versions of UCA and inverse UCA should match
1247 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
1248 *status = U_INVALID_FORMAT_ERROR;
1249 udata_close(result);
1250 return NULL;
1251 }
1252
1253 umtx_lock(NULL);
374ca955
A
1254 if(_staticInvUCA == NULL) {
1255 _staticInvUCA = newInvUCA;
b75a7d8f
A
1256 invUCA_DATA_MEM = result;
1257 result = NULL;
1258 newInvUCA = NULL;
1259 }
1260 umtx_unlock(NULL);
1261
1262 if(newInvUCA != NULL) {
1263 udata_close(result);
1264 // This is not needed, as we are talking about
1265 // memory we got from UData
1266 //uprv_free(newInvUCA);
1267 }
1268 else {
374ca955 1269 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
b75a7d8f
A
1270 }
1271 }
1272 }
374ca955 1273 return _staticInvUCA;
b75a7d8f
A
1274}
1275
1276#endif /* #if !UCONFIG_NO_COLLATION */